mariadb/strings/ctype-mb.ic

260 lines
6.6 KiB
Text

/*
Copyright (c) 2015, MariaDB Foundation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
  MY_FUNCTION_NAME is used below as a function-like macro that builds the
  name of every function defined in this file, e.g. MY_FUNCTION_NAME(charlen).
  It therefore has to be defined before this point.
*/
#ifndef MY_FUNCTION_NAME
#error MY_FUNCTION_NAME is not defined
#endif
/*
  The IS_MBn_CHAR macros must be defined cumulatively: the routines below
  use IS_MB2_CHAR unconditionally and only then try IS_MB3_CHAR and
  IS_MB4_CHAR, so a longer form without the shorter one is rejected.
*/
#if defined(IS_MB3_CHAR) && !defined(IS_MB2_CHAR)
#error IS_MB3_CHAR is defined, while IS_MB2_CHAR is not!
#endif
#if defined(IS_MB4_CHAR) && !defined(IS_MB3_CHAR)
#error IS_MB4_CHAR is defined, while IS_MB3_CHAR is not!
#endif
/*
  DEFINE_ASIAN_ROUTINES is a shortcut: it switches on the charlen,
  well_formed_len and IS_MB?_CHAR based well_formed_char_length
  routines that follow.
*/
#ifdef DEFINE_ASIAN_ROUTINES
#define DEFINE_WELL_FORMED_LEN
#define DEFINE_WELL_FORMED_CHAR_LENGTH
#define DEFINE_CHARLEN
#endif
#ifdef DEFINE_CHARLEN
/**
  Returns length of the left-most character of a string.

  The number of available bytes is tested as "e - b < N" rather than
  "b + N > e", so that no pointer beyond one-past-the-end of the
  buffer is ever formed (that would be undefined behaviour).

  @param cs - charset with mbminlen==1 and mbmaxlen<=4
  @param b  - the beginning of the string
  @param e  - the end of the string

  @return MY_CS_ILSEQ        if a bad byte sequence was found
  @return MY_CS_TOOSMALL     if the string is empty (b >= e)
  @return MY_CS_TOOSMALLN(N) if fewer than N bytes are available and
                             no shorter character form matched
  @return >0                 if a valid character was found
*/
static int
MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)),
                          const uchar *b, const uchar *e)
{
  DBUG_ASSERT(cs->mbminlen == 1);
  DBUG_ASSERT(cs->mbmaxlen <= 4);

  if (b >= e)
    return MY_CS_TOOSMALL;
  if ((uchar) b[0] < 128)
    return 1; /* Single byte ASCII character */

#ifdef IS_8BIT_CHAR
  if (IS_8BIT_CHAR(b[0]))
  {
    /* Single byte non-ASCII character, e.g. half width kana in sjis */
    return 1;
  }
#endif

  /* From here on b < e holds, so e - b is at least 1 */
  if (e - b < 2)
    return MY_CS_TOOSMALLN(2);
  if (IS_MB2_CHAR(b[0], b[1]))
    return 2; /* Double byte character */

#ifdef IS_MB3_CHAR
  if (e - b < 3)
    return MY_CS_TOOSMALLN(3);
  if (IS_MB3_CHAR(b[0], b[1], b[2]))
    return 3; /* Three-byte character */
#endif

#ifdef IS_MB4_CHAR
  if (e - b < 4)
    return MY_CS_TOOSMALLN(4);
  if (IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
    return 4; /* Four-byte character */
#endif

  /* Wrong byte sequence */
  return MY_CS_ILSEQ;
}
#endif /* DEFINE_CHARLEN */
#ifdef DEFINE_WELL_FORMED_LEN
/**
  Returns well formed length of a character string with
  variable character length for character sets with:
  - mbminlen == 1
  - mbmaxlen == 2, 3, or 4

  Scanning stops at the end of the string, after nchars characters,
  or at the first byte that does not start a complete valid character
  (this includes a multi-byte character cut off by the end of the string).

  @param cs      - charset (referenced only by the debug assertions)
  @param b       - the beginning of the string
  @param e       - the end of the string
  @param nchars  - the maximum number of characters to scan
  @param error   - [out] set to 1 if a wrong byte sequence stopped the
                   scan, to 0 otherwise

  @return the length in bytes of the scanned well formed prefix
*/
static size_t
MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)),
                                  const char *b, const char *e,
                                  size_t nchars, int *error)
{
  const char *b0= b;

  DBUG_ASSERT(cs->mbminlen == 1);
  DBUG_ASSERT(cs->mbmaxlen <= 4);

  for (*error= 0 ; b < e && nchars-- ; )
  {
    /*
      Number of bytes left. The loop condition guarantees b < e, so the
      difference is positive. Comparing this count (instead of computing
      b + N) avoids forming a pointer past one-past-the-end of the buffer.
    */
    const size_t avail= (size_t) (e - b);

    if ((uchar) b[0] < 128)
    {
      b++; /* Single byte ASCII character */
      continue;
    }

    if (avail >= 2 && IS_MB2_CHAR(b[0], b[1]))
    {
      b+= 2; /* Double byte character */
      continue;
    }

#ifdef IS_MB3_CHAR
    if (avail >= 3 && IS_MB3_CHAR(b[0], b[1], b[2]))
    {
      b+= 3; /* Three-byte character */
      continue;
    }
#endif

#ifdef IS_MB4_CHAR
    if (avail >= 4 && IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
    {
      b+= 4; /* Four-byte character */
      continue;
    }
#endif

#ifdef IS_8BIT_CHAR
    if (IS_8BIT_CHAR(b[0]))
    {
      b++; /* Single byte non-ASCII character, e.g. half width kana in sjis */
      continue;
    }
#endif

    /* Wrong byte sequence */
    *error= 1;
    break;
  }
  return (size_t) (b - b0);
}
#endif /* DEFINE_WELL_FORMED_LEN */
#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH
/**
  Returns well formed length of a string
  measured in characters (rather than in bytes).
  Version for character sets that define IS_MB?_CHAR(), e.g. big5.

  Scanning stops at the end of the string, after nchars characters,
  or at the first byte that does not start a complete valid character
  (this includes a multi-byte character cut off by the end of the string).

  @param cs      - charset (not referenced by this version)
  @param b       - the beginning of the string
  @param e       - the end of the string
  @param nchars  - the maximum number of characters to scan
  @param status  - [out] m_source_end_pos is set to the position where
                   scanning stopped; m_well_formed_error_pos is set to
                   the position of the wrong byte sequence, or to NULL
                   if none was found

  @return the number of well formed characters scanned
*/
static size_t
MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
                                          const char *b, const char *e,
                                          size_t nchars,
                                          MY_STRCOPY_STATUS *status)
{
  size_t nchars0= nchars;

  for ( ; b < e && nchars ; nchars--)
  {
    /*
      Number of bytes left. The loop condition guarantees b < e, so the
      difference is positive. Comparing this count (instead of computing
      b + N) avoids forming a pointer past one-past-the-end of the buffer.
    */
    const size_t avail= (size_t) (e - b);

    if ((uchar) b[0] < 128)
    {
      b++; /* Single byte ASCII character */
      continue;
    }

    if (avail >= 2 && IS_MB2_CHAR(b[0], b[1]))
    {
      b+= 2; /* Double byte character */
      continue;
    }

#ifdef IS_MB3_CHAR
    if (avail >= 3 && IS_MB3_CHAR(b[0], b[1], b[2]))
    {
      b+= 3; /* Three-byte character */
      continue;
    }
#endif

#ifdef IS_MB4_CHAR
    if (avail >= 4 && IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
    {
      b+= 4; /* Four-byte character */
      continue;
    }
#endif

#ifdef IS_8BIT_CHAR
    if (IS_8BIT_CHAR(b[0]))
    {
      b++; /* Single byte non-ASCII character, e.g. half width kana in sjis */
      continue;
    }
#endif

    /* Wrong byte sequence: report its position and stop */
    status->m_source_end_pos= status->m_well_formed_error_pos= b;
    return nchars0 - nchars;
  }

  /* End of string or character limit reached without an error */
  status->m_source_end_pos= b;
  status->m_well_formed_error_pos= NULL;
  return nchars0 - nchars;
}
#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH */
#ifdef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
#ifndef CHARLEN
#error CHARLEN is not defined
#endif
/**
  Counts the well formed characters at the beginning of a string
  (the result is in characters, not in bytes).
  Version for character sets that define CHARLEN(), e.g. utf8.

  CHARLEN(cs,b,e) must follow the same return code convention as mb_wc():
  - a positive number in the range [1-mbmaxlen] if a valid
    single-byte or multi-byte character was found
  - MY_CS_ILSEQ (0) on a bad byte sequence
  - MY_CS_TOOSMALLxx if the incoming sequence is incomplete

  @param cs      - charset, passed through to CHARLEN()
  @param b       - the beginning of the string
  @param e       - the end of the string
  @param nchars  - the maximum number of characters to scan
  @param status  - [out] m_source_end_pos is set to the position where
                   scanning stopped; m_well_formed_error_pos is set to
                   that same position if CHARLEN() failed before the end
                   of the string, or to NULL otherwise

  @return the number of well formed characters scanned
*/
static size_t
MY_FUNCTION_NAME(well_formed_char_length)(CHARSET_INFO *cs __attribute__((unused)),
                                          const char *b, const char *e,
                                          size_t nchars,
                                          MY_STRCOPY_STATUS *status)
{
  size_t remaining;

  for (remaining= nchars; remaining != 0; remaining--)
  {
    int len= CHARLEN(cs, (uchar*) b, (uchar*) e);

    if (len <= 0)
    {
      /*
        CHARLEN() found no character here. It is an error only when
        there still were bytes to read; otherwise the string just ended.
      */
      status->m_source_end_pos= b;
      status->m_well_formed_error_pos= (b < e) ? b : NULL;
      return nchars - remaining;
    }
    b+= len;
  }

  /* The character limit was reached without an error */
  status->m_source_end_pos= b;
  status->m_well_formed_error_pos= NULL;
  return nchars - remaining;
}
#endif /* DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN */
/*
  MY_FUNCTION_NAME has to be supplied by the includer (see the check at
  the top of this file); drop the definition here so that it does not
  stay defined after this file.
*/
#undef MY_FUNCTION_NAME