MDEV-17474 Change Unicode collation implementation from "handler" to "inline" style

This commit is contained in:
Alexander Barkov 2018-10-16 19:10:57 +04:00
parent fee24b1281
commit 6eae037c4c
9 changed files with 1323 additions and 1268 deletions

View file

@ -362,7 +362,6 @@ extern MY_COLLATION_HANDLER my_collation_8bit_bin_handler;
extern MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler;
extern MY_COLLATION_HANDLER my_collation_8bit_nopad_bin_handler;
extern MY_COLLATION_HANDLER my_collation_8bit_simple_nopad_ci_handler;
extern MY_COLLATION_HANDLER my_collation_ucs2_uca_handler;
/* Some typedef to make it easy for C++ to make function pointers */
typedef int (*my_charset_conv_mb_wc)(CHARSET_INFO *, my_wc_t *,

File diff suppressed because it is too large Load diff

763
strings/ctype-uca.ic Normal file
View file

@ -0,0 +1,763 @@
/*
Copyright (c) 2018 MariaDB Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef MY_FUNCTION_NAME
#error MY_FUNCTION_NAME is not defined
#endif
#ifndef MY_MB_WC
#error MY_MB_WC is not defined
#endif
#ifndef MY_LIKE_RANGE
#error MY_LIKE_RANGE is not defined
#endif
/*
Return the next weight from the string attached to "scanner".

RETURN
-1      - no more bytes left in the string
0xFFFF  - a bad or incomplete byte sequence was consumed
0xFFFD  - the character is greater than level->maxchar
otherwise the next weight, taken from a contraction, from the weight
page of the character, or from my_uca_scanner_next_implicit() when the
character has no weight page.
*/
static inline int
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
{
/*
Check if the weights for the previous character have been
already fully scanned. If yes, then get the next character and
point wbeg to its weight string.
*/
if (scanner->wbeg[0]) /* More weights left from the previous step: */
return *scanner->wbeg++; /* return the next weight from expansion */
do
{
const uint16 *wpage;
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
int mblen;
/* Get next character */
if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
scanner->send)) <= 0))
{
if (scanner->sbeg >= scanner->send)
return -1; /* No more bytes, end of line reached */
/*
There are some more bytes left. Non-positive mblen means that
we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
*/
if ((scanner->sbeg+= scanner->cs->mbminlen) > scanner->send)
{
/* For safety purposes don't go beyond the string range. */
scanner->sbeg= scanner->send;
}
/*
Treat every complete or incomplete mbminlen unit as a weight which is
greater than weight for any possible normal character.
0xFFFF is greater than any possible weight in the UCA weight table.
*/
return 0xFFFF;
}
scanner->sbeg+= mblen;
if (wc[0] > scanner->level->maxchar)
{
/*
Return 0xFFFD as weight for all characters above level->maxchar.
wbeg is reset to "nochar" (presumably an empty weight string - confirm
at its definition), which also disables the previous-context check
below for the following character.
*/
scanner->wbeg= nochar;
return 0xFFFD;
}
if (my_uca_have_contractions_quick(scanner->level))
{
uint16 *cweight;
/*
If we have scanned a character which can have previous context,
and there were some more characters already before,
then reconstruct codepoint of the previous character
from "page" and "code" into wc[1], and verify that {wc[1], wc[0]}
together form a real previous context pair.
Note, we support only 2-character long sequences with previous
context at the moment. CLDR does not have longer sequences.
*/
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
wc[0]) &&
scanner->wbeg != nochar && /* if not the very first character */
my_uca_can_be_previous_context_head(&scanner->level->contractions,
(wc[1]= ((scanner->page << 8) +
scanner->code))) &&
(cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
{
scanner->page= scanner->code= 0; /* Clear for the next character */
return *cweight;
}
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
wc[0]))
{
/* Check if wc[0] starts a contraction */
if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
return *cweight;
}
}
/* Process single character: remember it as {page, code} */
scanner->page= wc[0] >> 8;
scanner->code= wc[0] & 0xFF;
/* If weight page for wc[0] does not exist, then calculate algorithmically */
if (!(wpage= scanner->level->weights[scanner->page]))
return my_uca_scanner_next_implicit(scanner);
/* Calculate pointer to wc[0]'s weight, using page and offset */
scanner->wbeg= wpage +
scanner->code * scanner->level->lengths[scanner->page];
} while (!scanner->wbeg[0]); /* Skip ignorable characters */
return *scanner->wbeg++;
}
/*
  Compare two strings on a single weight level.

  SYNOPSIS:
    strnncoll_onelevel()
    cs           Character set information
    level        UCA weight level to compare on
    s            First string
    slen         First string length, in bytes
    t            Second string
    tlen         Second string length, in bytes
    t_is_prefix  If true, "t" running out first counts as equality

  NOTES:
    Two weight scanners are initialized and advanced in lock step.
    The loop stops on the first pair of weights that differ, or when
    a scanner returns a non-positive value. The scanner returns -1
    only at the end of the string; bad byte sequences are reported as
    the positive weight 0xFFFF and therefore take part in the
    comparison like any other weight.

    After the loop:
    1. both weights positive and different - a difference was found
    2. s weight positive, t weight negative - "t" ended first, "s" is
       bigger (or equal, when t_is_prefix is set)
    3. s weight negative, t weight positive - "s" ended first, "t" is
       bigger
    4. both negative - both strings ended together, they are equal

  RETURN
    0               - strings are equal
    negative number - the first string is smaller
    positive number - the first string is bigger
*/
static int
MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs,
                                     const MY_UCA_WEIGHT_LEVEL *level,
                                     const uchar *s, size_t slen,
                                     const uchar *t, size_t tlen,
                                     my_bool t_is_prefix)
{
  my_uca_scanner left, right;
  int lw, rw;

  my_uca_scanner_init_any(&left, cs, level, s, slen);
  my_uca_scanner_init_any(&right, cs, level, t, tlen);

  for ( ; ; )
  {
    lw= MY_FUNCTION_NAME(scanner_next)(&left);
    rw= MY_FUNCTION_NAME(scanner_next)(&right);
    if (lw != rw || lw <= 0)
      break;
  }

  if (t_is_prefix && rw < 0)
    return 0;                    /* "t" is exhausted: prefix match */
  return lw - rw;
}
/*
  One-level comparison: only the first weight level (level[0]) is used.
  No space padding is applied here; see strnncoll_onelevel().
*/
static int
MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs,
                            const uchar *s, size_t slen,
                            const uchar *t, size_t tlen,
                            my_bool t_is_prefix)
{
  const MY_UCA_WEIGHT_LEVEL *primary= &cs->uca->level[0];

  return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, primary,
                                              s, slen, t, tlen,
                                              t_is_prefix);
}
/*
  Multi-level comparison: compare level by level, for the first
  cs->levels_for_order levels, and stop at the first level that
  reports a difference.

  RETURN
    The first non-zero per-level result, or 0 if all levels are equal.
*/
static int
MY_FUNCTION_NAME(strnncoll_multilevel)(CHARSET_INFO *cs,
                                       const uchar *s, size_t slen,
                                       const uchar *t, size_t tlen,
                                       my_bool t_is_prefix)
{
  const uint nlevels= cs->levels_for_order;
  uint lvl= 0;
  int diff= 0;

  while (lvl < nlevels && diff == 0)
  {
    diff= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[lvl],
                                               s, slen, t, tlen,
                                               t_is_prefix);
    lvl++;
  }
  return diff;
}
/*
  Compare two strings on a single weight level, treating the shorter
  string as if it were padded with spaces (PAD SPACE comparison).

  SYNOPSIS:
    strnncollsp_onelevel()
    cs     Character set information
    level  UCA weight level to compare on
    s      First string
    slen   First string length, in bytes
    t      Second string
    tlen   Second string length, in bytes

  NOTES:
    The common part is compared exactly as in strnncoll_onelevel().
    When one scanner reaches the end of its string (negative result)
    while the other still returns positive weights, the remaining
    weights of the longer string are compared against the weight of
    the SPACE character, one by one, until a difference is found or
    the longer string ends as well.

  RETURN
    0               - strings are equal
    negative number - the first string is smaller
    positive number - the first string is bigger
*/
static int
MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
                                       const MY_UCA_WEIGHT_LEVEL *level,
                                       const uchar *s, size_t slen,
                                       const uchar *t, size_t tlen)
{
  my_uca_scanner left, right;
  int lw, rw;

  my_uca_scanner_init_any(&left, cs, level, s, slen);
  my_uca_scanner_init_any(&right, cs, level, t, tlen);

  /* Walk the common part */
  for ( ; ; )
  {
    lw= MY_FUNCTION_NAME(scanner_next)(&left);
    rw= MY_FUNCTION_NAME(scanner_next)(&right);
    if (lw != rw || lw <= 0)
      break;
  }

  if (lw > 0 && rw < 0)
  {
    /* "t" ended: compare the rest of "s" to spaces */
    const int space= my_space_weight(level);
    for ( ; lw > 0; lw= MY_FUNCTION_NAME(scanner_next)(&left))
    {
      if (lw != space)
        return lw - space;
    }
    return 0;
  }

  if (lw < 0 && rw > 0)
  {
    /* "s" ended: compare the rest of "t" to spaces */
    const int space= my_space_weight(level);
    for ( ; rw > 0; rw= MY_FUNCTION_NAME(scanner_next)(&right))
    {
      if (space != rw)
        return space - rw;
    }
    return 0;
  }

  return lw - rw;
}
/*
  One-level, PAD SPACE: space-padded comparison on level[0] only.
*/
static int
MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs,
                              const uchar *s, size_t slen,
                              const uchar *t, size_t tlen)
{
  const MY_UCA_WEIGHT_LEVEL *primary= &cs->uca->level[0];

  return MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, primary,
                                                s, slen, t, tlen);
}
/*
  One-level, NO PAD: trailing spaces are significant, so this is the
  plain level[0] comparison with the prefix mode switched off.
*/
static int
MY_FUNCTION_NAME(strnncollsp_nopad)(CHARSET_INFO *cs,
                                    const uchar *s, size_t slen,
                                    const uchar *t, size_t tlen)
{
  const MY_UCA_WEIGHT_LEVEL *primary= &cs->uca->level[0];

  return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, primary,
                                              s, slen, t, tlen,
                                              FALSE);
}
/*
  Multi-level, PAD SPACE: run the space-padded comparison on each of
  the first cs->levels_for_order levels and return the first non-zero
  result, or 0 when every level compares equal.
*/
static int
MY_FUNCTION_NAME(strnncollsp_multilevel)(CHARSET_INFO *cs,
                                         const uchar *s, size_t slen,
                                         const uchar *t, size_t tlen)
{
  const uint nlevels= cs->levels_for_order;
  uint lvl;
  int diff= 0;

  for (lvl= 0; diff == 0 && lvl < nlevels; lvl++)
  {
    diff= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[lvl],
                                                 s, slen, t, tlen);
  }
  return diff;
}
/*
  Multi-level, NO PAD: run the plain (non-prefix, non-padded)
  comparison on each of the first cs->levels_for_order levels and
  return the first non-zero result, or 0 when every level is equal.
*/
static int
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel)(CHARSET_INFO *cs,
                                               const uchar *s, size_t slen,
                                               const uchar *t, size_t tlen)
{
  const uint nlevels= cs->levels_for_order;
  uint lvl= 0;

  while (lvl < nlevels)
  {
    const int diff=
      MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[lvl],
                                           s, slen, t, tlen, FALSE);
    if (diff != 0)
      return diff;
    lvl++;
  }
  return 0;
}
/*
  Calculate a hash value for the given string according to the
  collation, ignoring trailing spaces.

  SYNOPSIS:
    hash_sort()
    cs    Character set information
    s     String
    slen  String length, in bytes
    nr1   First hash parameter (in/out)
    nr2   Second hash parameter (in/out)

  NOTES:
    Weights are scanned one by one and folded into nr1 and nr2.
    In a case insensitive collation, upper and lower case of the same
    letter produce the same weight sequence and thus the same hash.

    This function is used for one-level and for multi-level collations.
    Only the primary level is hashed, intentionally: it makes
    PARTITION BY KEY put primarily equal records into the same
    partition. E.g. in utf8_thai_520_ci records that differ only in
    tone marks go into the same partition.

    Every weight is hashed as two separate bytes, high byte first.
    MY_HASH_ADD_16() cannot be used instead: the original code added
    the 16-bit value in the opposite order, and changing that now would
    break existing partitioned tables.

  RETURN
    N/A
*/
static void
MY_FUNCTION_NAME(hash_sort)(CHARSET_INFO *cs,
                            const uchar *s, size_t slen,
                            ulong *nr1, ulong *nr2)
{
  my_uca_scanner scanner;
  const int space_weight= my_space_weight(&cs->uca->level[0]);
  ulong h1= *nr1, h2= *nr2;

  my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen);

  for ( ; ; )
  {
    uint pending_spaces= 0;
    int w= MY_FUNCTION_NAME(scanner_next)(&scanner);

    /* Collect a run of spaces without hashing it yet */
    while (w > 0 && w == space_weight)
    {
      pending_spaces++;
      w= MY_FUNCTION_NAME(scanner_next)(&scanner);
    }

    /* End of string: a pending run was trailing spaces, drop it */
    if (w <= 0)
      break;

    /* The run was followed by a real weight: hash the spaces after all */
    for ( ; pending_spaces != 0; pending_spaces--)
    {
      MY_HASH_ADD(h1, h2, space_weight >> 8);
      MY_HASH_ADD(h1, h2, space_weight & 0xFF);
    }

    MY_HASH_ADD(h1, h2, w >> 8);
    MY_HASH_ADD(h1, h2, w & 0xFF);
  }

  *nr1= h1;
  *nr2= h2;
}
/*
  Calculate a hash value for the given string according to the
  collation. Unlike hash_sort(), trailing spaces are hashed too.

  Only the primary level (level[0]) is scanned. Every weight is hashed
  as two separate bytes, high byte first; this byte order is kept for
  compatibility and is the reason MY_HASH_ADD_16() is not used.
*/
static void
MY_FUNCTION_NAME(hash_sort_nopad)(CHARSET_INFO *cs,
                                  const uchar *s, size_t slen,
                                  ulong *nr1, ulong *nr2)
{
  my_uca_scanner scanner;
  ulong h1= *nr1, h2= *nr2;
  int w;

  my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen);

  for (w= MY_FUNCTION_NAME(scanner_next)(&scanner);
       w > 0;
       w= MY_FUNCTION_NAME(scanner_next)(&scanner))
  {
    MY_HASH_ADD(h1, h2, w >> 8);
    MY_HASH_ADD(h1, h2, w & 0xFF);
  }

  *nr1= h1;
  *nr2= h2;
}
/*
  Write the weights of one level of the source string into a binary
  image suitable for memcmp()-style comparison.

  SYNOPSIS:
    strnxfrm_onelevel_internal()
    cs        Character set information
    level     UCA weight level to scan
    dst       Where to start writing the image
    de        End of the destination buffer (one past the last byte)
    nweights  In/out: maximum number of weights to write; decremented
              once per weight written
    src       The source string
    srclen    Length of the source string, in bytes

  NOTES:
    Every weight is stored as two bytes, high byte first. If only one
    byte of room is left, only the high byte is stored. Scanning stops
    when the buffer is full, when *nweights reaches zero, or when the
    scanner returns a non-positive value.

    The original string cannot be restored from its binary image.
    Binary images are used for bulk comparison, e.g. in ORDER BY,
    where building the image once is cheaper than running a weight
    scanner over the original strings for every comparison.

  RETURN
    Pointer to the byte following the last byte written.
*/
static uchar *
MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(CHARSET_INFO *cs,
                                             MY_UCA_WEIGHT_LEVEL *level,
                                             uchar *dst, uchar *de,
                                             uint *nweights,
                                             const uchar *src, size_t srclen)
{
  my_uca_scanner scanner;

  DBUG_ASSERT(src || !srclen);
  my_uca_scanner_init_any(&scanner, cs, level, src, srclen);

  while (dst < de && *nweights)
  {
    int weight= MY_FUNCTION_NAME(scanner_next)(&scanner);
    if (weight <= 0)
      break;
    *dst++= weight >> 8;
    if (dst < de)
      *dst++= weight & 0xFF;
    (*nweights)--;
  }
  return dst;
}
/*
  Build the binary image of one level for a PAD SPACE collation.

  Writes the weights of "src", then, if MY_STRXFRM_PAD_WITH_SPACE is
  set and both room and weight budget remain, pads with the weight of
  the SPACE character. Finally hands the written range and "flags" to
  my_strxfrm_desc_and_reverse().

  RETURN
    Pointer to the byte following the last byte written.
*/
static uchar *
MY_FUNCTION_NAME(strnxfrm_onelevel)(CHARSET_INFO *cs,
                                    MY_UCA_WEIGHT_LEVEL *level,
                                    uchar *dst, uchar *de, uint nweights,
                                    const uchar *src, size_t srclen, uint flags)
{
  uchar *const start= dst;
  uchar *end= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
                                                           dst, de, &nweights,
                                                           src, srclen);
  DBUG_ASSERT(end <= de);

  if ((flags & MY_STRXFRM_PAD_WITH_SPACE) && nweights && end < de)
    end= my_strnxfrm_uca_padn(end, de, nweights, my_space_weight(level));
  DBUG_ASSERT(end <= de);

  my_strxfrm_desc_and_reverse(start, end, flags, 0);
  return end;
}
/*
  Build the binary image of one level for a NO PAD collation.

  Same as strnxfrm_onelevel(), except that the padding weight is the
  minimum possible weight on this level rather than the SPACE weight.

  RETURN
    Pointer to the byte following the last byte written.
*/
static uchar *
MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(CHARSET_INFO *cs,
                                          MY_UCA_WEIGHT_LEVEL *level,
                                          uchar *dst, uchar *de, uint nweights,
                                          const uchar *src, size_t srclen,
                                          uint flags)
{
  uchar *const start= dst;
  uchar *end= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
                                                           dst, de, &nweights,
                                                           src, srclen);
  DBUG_ASSERT(end <= de);

  if ((flags & MY_STRXFRM_PAD_WITH_SPACE) && nweights && end < de)
    end= my_strnxfrm_uca_padn(end, de, nweights, min_weight_on_level(level));
  DBUG_ASSERT(end <= de);

  my_strxfrm_desc_and_reverse(start, end, flags, 0);
  return end;
}
/*
  One-level, PAD SPACE: create the binary image of "src" in "dst".

  Only level[0] is used. With MY_STRXFRM_PAD_TO_MAXLEN the rest of the
  buffer is filled with the SPACE weight. (This could probably be a
  memset(dst, 0, de - dst), like strnxfrm_multilevel() does.)

  RETURN
    Number of bytes written into the binary image.
*/
static size_t
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
                           uchar *dst, size_t dstlen, uint nweights,
                           const uchar *src, size_t srclen, uint flags)
{
  uchar *const begin= dst;
  uchar *const limit= dst + dstlen;
  uchar *pos= MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, &cs->uca->level[0],
                                                  dst, limit, nweights,
                                                  src, srclen, flags);

  if (pos < limit && (flags & MY_STRXFRM_PAD_TO_MAXLEN))
    pos= my_strnxfrm_uca_pad(pos, limit, my_space_weight(&cs->uca->level[0]));

  return pos - begin;
}
/*
  One-level, NO PAD: create the binary image of "src" in "dst".

  Only level[0] is used. With MY_STRXFRM_PAD_TO_MAXLEN the rest of the
  buffer is filled with zero bytes.

  RETURN
    Number of bytes written into the binary image.
*/
static size_t
MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
                                 uchar *dst, size_t dstlen,
                                 uint nweights,
                                 const uchar *src, size_t srclen,
                                 uint flags)
{
  uchar *const begin= dst;
  uchar *const limit= dst + dstlen;
  uchar *pos= MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs,
                                                        &cs->uca->level[0],
                                                        dst, limit, nweights,
                                                        src, srclen, flags);

  if (pos < limit && (flags & MY_STRXFRM_PAD_TO_MAXLEN))
  {
    memset(pos, 0, limit - pos);
    pos= limit;
  }
  return pos - begin;
}
/*
  Multi-level (PAD SPACE and NO PAD): create the binary image of "src"
  in "dst", one level after another.

  A level is written when no level flags are set in "flags" at all
  (MY_STRXFRM_LEVEL_ALL mask is clear), or when the flag of that
  particular level is set. Every level gets the same "nweights" budget.
  The per-level writer is chosen by MY_CS_NOPAD in cs->state.
  With MY_STRXFRM_PAD_TO_MAXLEN the rest of the buffer is zero-filled.

  RETURN
    Number of bytes written into the binary image.
*/
static size_t
MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs,
                                      uchar *dst, size_t dstlen,
                                      uint nweights,
                                      const uchar *src, size_t srclen,
                                      uint flags)
{
  const uint nlevels= cs->levels_for_order;
  const int nopad= (cs->state & MY_CS_NOPAD) != 0;
  uchar *const begin= dst;
  uchar *const limit= dst + dstlen;
  uint lvl;

  for (lvl= 0; lvl < nlevels; lvl++)
  {
    /* Skip levels that were explicitly not requested */
    if ((flags & MY_STRXFRM_LEVEL_ALL) &&
        !(flags & (MY_STRXFRM_LEVEL1 << lvl)))
      continue;

    if (nopad)
      dst= MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs,
                                                     &cs->uca->level[lvl],
                                                     dst, limit, nweights,
                                                     src, srclen, flags);
    else
      dst= MY_FUNCTION_NAME(strnxfrm_onelevel)(cs,
                                               &cs->uca->level[lvl],
                                               dst, limit, nweights,
                                               src, srclen, flags);
  }

  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < limit)
  {
    memset(dst, 0, limit - dst);
    dst= limit;
  }
  return dst - begin;
}
/*
One-level, PAD SPACE

Collation handler table for the current MY_FUNCTION_NAME instantiation.
The initializer is positional: entries must stay in the order of the
members of MY_COLLATION_HANDLER.
NOTE(review): the per-entry remarks below are inferred from the names of
the functions; confirm against the MY_COLLATION_HANDLER declaration.
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
{
my_coll_init_uca,
MY_FUNCTION_NAME(strnncoll), /* comparison on level[0] */
MY_FUNCTION_NAME(strnncollsp), /* space-padded comparison on level[0] */
MY_FUNCTION_NAME(strnxfrm), /* binary image, padded with SPACE weight */
my_strnxfrmlen_any_uca,
MY_LIKE_RANGE, /* supplied by the including file */
my_wildcmp_uca,
NULL, /* strcasecmp() */
my_instr_mb,
MY_FUNCTION_NAME(hash_sort), /* hash that ignores trailing spaces */
my_propagate_complex
};
/*
One-level, NO PAD
For character sets with mbminlen==1 use MY_LIKE_RANGE=my_like_range_mb
For character sets with mbminlen>=2 use MY_LIKE_RANGE=my_like_range_generic

The initializer is positional: entries must stay in the order of the
members of MY_COLLATION_HANDLER.
NOTE(review): the per-entry remarks below are inferred from the names of
the functions; confirm against the MY_COLLATION_HANDLER declaration.
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
{
my_coll_init_uca,
MY_FUNCTION_NAME(strnncoll), /* same function as in the PAD SPACE handler */
MY_FUNCTION_NAME(strnncollsp_nopad), /* level[0] comparison, no padding */
MY_FUNCTION_NAME(strnxfrm_nopad), /* binary image, zero-filled to maxlen */
my_strnxfrmlen_any_uca,
MY_LIKE_RANGE, /* my_like_range_mb or my_like_range_generic */
my_wildcmp_uca,
NULL, /* strcasecmp() */
my_instr_mb,
MY_FUNCTION_NAME(hash_sort_nopad), /* hash that includes trailing spaces */
my_propagate_complex
};
/*
Multi-level, PAD SPACE

The initializer is positional: entries must stay in the order of the
members of MY_COLLATION_HANDLER.
NOTE(review): the per-entry remarks below are inferred from the names of
the functions; confirm against the MY_COLLATION_HANDLER declaration.
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
{
my_coll_init_uca,
MY_FUNCTION_NAME(strnncoll_multilevel), /* level-by-level comparison */
MY_FUNCTION_NAME(strnncollsp_multilevel), /* level-by-level, space-padded */
MY_FUNCTION_NAME(strnxfrm_multilevel), /* binary image of all levels */
my_strnxfrmlen_any_uca_multilevel,
MY_LIKE_RANGE, /* supplied by the including file */
my_wildcmp_uca,
NULL, /* strcasecmp() */
my_instr_mb,
MY_FUNCTION_NAME(hash_sort), /* hashes level[0] only, ignores end spaces */
my_propagate_complex
};
/*
Multi-level, NO PAD

The initializer is positional: entries must stay in the order of the
members of MY_COLLATION_HANDLER.
NOTE(review): the per-entry remarks below are inferred from the names of
the functions; confirm against the MY_COLLATION_HANDLER declaration.
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
{
my_coll_init_uca,
MY_FUNCTION_NAME(strnncoll_multilevel), /* level-by-level comparison */
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel), /* level-by-level, no pad */
MY_FUNCTION_NAME(strnxfrm_multilevel), /* picks the nopad writer itself */
my_strnxfrmlen_any_uca_multilevel,
MY_LIKE_RANGE, /* supplied by the including file */
my_wildcmp_uca,
NULL, /* strcasecmp() */
my_instr_mb,
/*
NOTE(review): this NO PAD handler uses hash_sort(), which skips trailing
spaces, while the one-level NO PAD handler uses hash_sort_nopad().
Strings that compare equal still hash equal, so this looks safe, but
confirm that the difference is intentional.
*/
MY_FUNCTION_NAME(hash_sort),
my_propagate_complex
};
#undef MY_FUNCTION_NAME
#undef MY_MB_WC
#undef MY_LIKE_RANGE

View file

@ -1184,35 +1184,7 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
but the JSON functions needed my_utf16_uni()
so the #ifdef was moved lower.
*/
/*
D800..DB7F - Non-provate surrogate high (896 pages)
DB80..DBFF - Private surrogate high (128 pages)
DC00..DFFF - Surrogate low (1024 codes in a page)
*/
#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
#define MY_UTF16_SURROGATE_HIGH_LAST 0xDBFF
#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00
#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF
#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8)
#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC)
/* Test if a byte is a leading byte of a high or low surrogate head: */
#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
/* Test if a Unicode code point is a high or low surrogate head */
#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800)
#define MY_UTF16_WC2(a, b) ((a << 8) + b)
/*
a= 110110?? (<< 18)
b= ???????? (<< 10)
c= 110111?? (<< 8)
d= ???????? (<< 0)
*/
#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \
((c & 3) << 8) + d + 0x10000)
#include "ctype-utf16.h"
#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b0))
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
@ -1261,32 +1233,7 @@ static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e)
{
if (s + 2 > e)
return MY_CS_TOOSMALL2;
/*
High bytes: 0xD[89AB] = B'110110??'
Low bytes: 0xD[CDEF] = B'110111??'
Surrogate mask: 0xFC = B'11111100'
*/
if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
{
if (s + 4 > e)
return MY_CS_TOOSMALL4;
if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */
return MY_CS_ILSEQ;
*pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
return 4;
}
if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
return MY_CS_ILSEQ;
*pwc= MY_UTF16_WC2(s[0], s[1]);
return 2;
return my_mb_wc_utf16_quick(pwc, s, e);
}
@ -2109,6 +2056,8 @@ struct charset_info_st my_charset_utf16le_nopad_bin=
#ifdef HAVE_CHARSET_utf32
#include "ctype-utf32.h"
/*
Check is b0 and b1 start a valid UTF32 four-byte sequence.
Don't accept characters greater than U+10FFFF.
@ -2117,8 +2066,6 @@ struct charset_info_st my_charset_utf16le_nopad_bin=
#define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1))
#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t)b0) << 24) + (b1 << 16) + \
(b2 << 8) + (b3))
static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
uchar b2, uchar b3)
@ -2161,10 +2108,7 @@ static int
my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e)
{
if (s + 4 > e)
return MY_CS_TOOSMALL4;
*pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
return my_mb_wc_utf32_quick(pwc, s, e);
}
@ -2928,6 +2872,8 @@ struct charset_info_st my_charset_utf32_nopad_bin=
#ifdef HAVE_CHARSET_ucs2
#include "ctype-ucs2.h"
static const uchar ctype_ucs2[] = {
0,
32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
@ -3037,11 +2983,7 @@ my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
if (s+2 > e) /* Need 2 characters */
return MY_CS_TOOSMALL2;
*pwc= ((uchar)s[0]) * 256 + ((uchar)s[1]);
return 2;
return my_mb_wc_ucs2_quick(pwc, s, e);
}
static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,

32
strings/ctype-ucs2.h Normal file
View file

@ -0,0 +1,32 @@
/*
Copyright (c) 2018 MariaDB Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _CTYPE_UCS2_H
#define _CTYPE_UCS2_H
/*
  Decode one UCS2 (big-endian, two-byte) character.

  @param pwc  [out] the decoded code point
  @param s    start of the byte sequence
  @param e    end of the input (one past the last byte)

  @return 2 on success, or MY_CS_TOOSMALL2 when fewer than two bytes
          are available.
*/
static inline int
my_mb_wc_ucs2_quick(my_wc_t * pwc, const uchar *s, const uchar *e)
{
  /*
    Compare the remaining length instead of computing "s + 2":
    forming a pointer beyond one-past-the-end is undefined behaviour.
  */
  if (e - s < 2) /* Need 2 bytes */
    return MY_CS_TOOSMALL2;
  *pwc= ((uchar)s[0]) * 256 + ((uchar)s[1]);
  return 2;
}
#endif /* _CTYPE_UCS2_H */

80
strings/ctype-utf16.h Normal file
View file

@ -0,0 +1,80 @@
/*
Copyright (c) 2018 MariaDB Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _CTYPE_UTF16_H
#define _CTYPE_UTF16_H
/*
  UTF-16 surrogate ranges:
  D800..DB7F - Non-private surrogate high (896 pages)
  DB80..DBFF - Private surrogate high     (128 pages)
  DC00..DFFF - Surrogate low              (1024 codes in a page)
*/
#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
#define MY_UTF16_SURROGATE_HIGH_LAST  0xDBFF
#define MY_UTF16_SURROGATE_LOW_FIRST  0xDC00
#define MY_UTF16_SURROGATE_LOW_LAST   0xDFFF

/* Test if a byte is the leading byte of a high / low surrogate */
#define MY_UTF16_HIGH_HEAD(x)      ((((uchar) (x)) & 0xFC) == 0xD8)
#define MY_UTF16_LOW_HEAD(x)       ((((uchar) (x)) & 0xFC) == 0xDC)
/* Test if a byte is a leading byte of a high or low surrogate head: */
#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
/* Test if a Unicode code point is a high or low surrogate head */
#define MY_UTF16_SURROGATE(x)      (((x) & 0xF800) == 0xD800)

/*
  Combine two bytes into a BMP code point.
  Arguments are parenthesized so that expression arguments expand safely.
*/
#define MY_UTF16_WC2(a, b)         (((a) << 8) + (b))

/*
  Combine the four bytes of a surrogate pair into a code point:
  a= 110110??  (<< 18)
  b= ????????  (<< 10)
  c= 110111??  (<<  8)
  d= ????????  (<<  0)
*/
#define MY_UTF16_WC4(a, b, c, d) ((((a) & 3) << 18) + ((b) << 10) + \
                                  (((c) & 3) << 8) + (d) + 0x10000)
/*
  Decode one UTF-16 (big-endian) character.

  @param pwc  [out] the decoded code point
  @param s    start of the byte sequence
  @param e    end of the input (one past the last byte)

  @return 2 or 4 (bytes consumed) on success,
          MY_CS_TOOSMALL2 / MY_CS_TOOSMALL4 when the input is too short,
          MY_CS_ILSEQ on a broken surrogate pair or a lone low surrogate.

  The remaining length is tested as "e - s" rather than by computing
  "s + N": forming a pointer beyond one-past-the-end is undefined
  behaviour.
*/
static inline int
my_mb_wc_utf16_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
{
  if (e - s < 2)
    return MY_CS_TOOSMALL2;

  /*
    High bytes: 0xD[89AB] = B'110110??'
    Low bytes:  0xD[CDEF] = B'110111??'
    Surrogate mask:  0xFC = B'11111100'
  */

  if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
  {
    if (e - s < 4)
      return MY_CS_TOOSMALL4;

    if (!MY_UTF16_LOW_HEAD(s[2]))  /* Broken surrogate pair */
      return MY_CS_ILSEQ;

    *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
    return 4;
  }

  if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
    return MY_CS_ILSEQ;

  *pwc= MY_UTF16_WC2(s[0], s[1]);
  return 2;
}
#endif /* _CTYPE_UTF16_H */

33
strings/ctype-utf32.h Normal file
View file

@ -0,0 +1,33 @@
/*
Copyright (c) 2018 MariaDB Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _CTYPE_UTF32_H
#define _CTYPE_UTF32_H
/*
  Combine four bytes (most significant first) into a code point.
  The first byte is widened to my_wc_t before shifting it by 24 bits.
  Arguments are parenthesized so that expression arguments expand safely.
*/
#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t) (b0)) << 24) + ((b1) << 16) + \
                                   ((b2) << 8) + (b3))
/*
  Decode one UTF-32 (big-endian, four-byte) character.

  @param pwc  [out] the decoded value; it is stored even when the
              value is rejected as greater than U+10FFFF
  @param s    start of the byte sequence
  @param e    end of the input (one past the last byte)

  @return 4 on success, MY_CS_TOOSMALL4 when fewer than four bytes are
          available, MY_CS_ILSEQ when the value is above 0x10FFFF.
*/
static inline int
my_mb_wc_utf32_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
{
  /*
    Compare the remaining length instead of computing "s + 4":
    forming a pointer beyond one-past-the-end is undefined behaviour.
  */
  if (e - s < 4)
    return MY_CS_TOOSMALL4;
  *pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
  return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
}
#endif /* _CTYPE_UTF32_H */

View file

@ -26,78 +26,9 @@
#define EILSEQ ENOENT
#endif
/* Detect special bytes and sequences */
#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40)
/*
Check MB2 character assuming that b0 is alredy known to be >= 0xC2.
Use this macro if the caller already checked b0 for:
- an MB1 character
- an unused gap between MB1 and MB2HEAD
*/
#define IS_UTF8MB2_STEP2(b0,b1) (((uchar) (b0) < 0xE0) && \
IS_CONTINUATION_BYTE((uchar) b1))
#include "ctype-utf8.h"
/*
Check MB3 character assuming that b0 is already known to be
in the valid MB3HEAD range [0xE0..0xEF].
*/
#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \
IS_CONTINUATION_BYTE(b2) && \
((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0))
/*
Check MB3 character assuming that b0 is already known to be >= 0xE0,
but is not checked for the high end 0xF0 yet.
Use this macro if the caller already checked b0 for:
- an MB1 character
- an unused gap between MB1 and MB2HEAD
- an MB2HEAD
*/
#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
IS_UTF8MB3_STEP2(b0,b1,b2))
/*
UTF-8 quick four-byte mask:
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
Encoding allows to encode U+00010000..U+001FFFFF
The maximum character defined in the Unicode standard is U+0010FFFF.
Higher characters U+00110000..U+001FFFFF are not used.
11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
Valid codes:
[F0][90..BF][80..BF][80..BF]
[F1][80..BF][80..BF][80..BF]
[F2][80..BF][80..BF][80..BF]
[F3][80..BF][80..BF][80..BF]
[F4][80..8F][80..BF][80..BF]
*/
/*
Check MB4 character assuming that b0 is already
known to be in the range [0xF0..0xF4]
*/
#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
IS_CONTINUATION_BYTE(b2) && \
IS_CONTINUATION_BYTE(b3) && \
(b0 >= 0xf1 || b1 >= 0x90) && \
(b0 <= 0xf3 || b1 <= 0x8F))
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
IS_UTF8MB4_STEP2(b0,b1,b2,b3))
/* Convert individual bytes to Unicode code points */
#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\
((my_wc_t) ((uchar) b1 ^ 0x80)))
#define UTF8MB3_CODE(b0,b1,b2) (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\
((my_wc_t) ((uchar) b1 ^ 0x80) << 6) |\
((my_wc_t) ((uchar) b2 ^ 0x80)))
#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\
((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\
((my_wc_t) ((uchar) b2 ^ 0x80) << 6) |\
(my_wc_t) ((uchar) b3 ^ 0x80))
/* Definitions for strcoll.ic */
#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
@ -4981,42 +4912,7 @@ static const uchar to_upper_utf8[] = {
static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
uchar c;
if (s >= e)
return MY_CS_TOOSMALL;
c= s[0];
if (c < 0x80)
{
*pwc = c;
return 1;
}
else if (c < 0xc2)
return MY_CS_ILSEQ;
else if (c < 0xe0)
{
if (s+2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc= UTF8MB2_CODE(c, s[1]);
return 2;
}
else if (c < 0xf0)
{
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
return MY_CS_ILSEQ;
*pwc= UTF8MB3_CODE(c, s[1], s[2]);
return 3;
}
return MY_CS_ILSEQ;
return my_mb_wc_utf8mb3_quick(pwc, s, e);
}
@ -7379,52 +7275,7 @@ static int
my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t * pwc, const uchar *s, const uchar *e)
{
uchar c;
if (s >= e)
return MY_CS_TOOSMALL;
c= s[0];
if (c < 0x80)
{
*pwc= c;
return 1;
}
else if (c < 0xc2)
return MY_CS_ILSEQ;
else if (c < 0xe0)
{
if (s + 2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc= UTF8MB2_CODE(c, s[1]);
return 2;
}
else if (c < 0xf0)
{
if (s + 3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
return MY_CS_ILSEQ;
*pwc= UTF8MB3_CODE(c, s[1], s[2]);
return 3;
}
else if (c < 0xf5)
{
if (s + 4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
return MY_CS_ILSEQ;
*pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
return 4;
}
return MY_CS_ILSEQ;
return my_mb_wc_utf8mb4_quick(pwc, s, e);
}

190
strings/ctype-utf8.h Normal file
View file

@ -0,0 +1,190 @@
/*
Copyright (c) 2018 MariaDB Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _CTYPE_UTF8_H
#define _CTYPE_UTF8_H
/* Detect special bytes and sequences */

/* True for a byte in the range [0x80..0xBF], i.e. of the form 10xxxxxx */
#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40)

/*
  Check MB2 character assuming that b0 is already known to be >= 0xC2.
  Use this macro if the caller already checked b0 for:
  - an MB1 character
  - an unused gap between MB1 and MB2HEAD
*/
#define IS_UTF8MB2_STEP2(b0,b1) (((uchar) (b0) < 0xE0) && \
                                 IS_CONTINUATION_BYTE(b1))

/*
  Check MB3 character assuming that b0 is already known to be
  in the valid MB3HEAD range [0xE0..0xEF].
  The last condition rejects a 0xE0 lead byte followed by a second
  byte below 0xA0.
  Arguments are parenthesized before the cast so that expression
  arguments are converted as a whole.
*/
#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \
                                    IS_CONTINUATION_BYTE(b2) && \
                                    ((uchar) (b0) >= 0xe1 || \
                                     (uchar) (b1) >= 0xa0))

/*
  Check MB3 character assuming that b0 is already known to be >= 0xE0,
  but is not checked for the high end 0xF0 yet.
  Use this macro if the caller already checked b0 for:
  - an MB1 character
  - an unused gap between MB1 and MB2HEAD
  - an MB2HEAD
*/
#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
                                    IS_UTF8MB3_STEP2(b0,b1,b2))
/*
  UTF-8 quick four-byte mask:
  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  This form can encode U+00010000..U+001FFFFF.
  The maximum character defined in the Unicode standard is U+0010FFFF.
  Higher characters U+00110000..U+001FFFFF are not used.
  11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
  11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
  Valid codes:
  [F0][90..BF][80..BF][80..BF]
  [F1][80..BF][80..BF][80..BF]
  [F2][80..BF][80..BF][80..BF]
  [F3][80..BF][80..BF][80..BF]
  [F4][80..8F][80..BF][80..BF]
*/

/*
  Check MB4 character assuming that b0 is already
  known to be in the range [0xF0..0xF4].
  b0 and b1 are cast to uchar before the range comparisons, as in
  IS_UTF8MB3_STEP2, so the result does not depend on whether the
  caller passes signed or unsigned bytes.
*/
#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
                                       IS_CONTINUATION_BYTE(b2) && \
                                       IS_CONTINUATION_BYTE(b3) && \
                                       ((uchar) (b0) >= 0xf1 || \
                                        (uchar) (b1) >= 0x90) && \
                                       ((uchar) (b0) <= 0xf3 || \
                                        (uchar) (b1) <= 0x8F))

/*
  Check MB4 character when b0 has not been checked against
  the upper limit 0xF5 yet.
*/
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
                                       IS_UTF8MB4_STEP2(b0,b1,b2,b3))
/*
  Convert individual bytes to Unicode code points.
  The macros do no validation themselves. The lead byte is masked down
  to its payload bits and each trailing byte has its 0x80 bit flipped.
  All arguments are parenthesized so that expression arguments are
  cast and masked as a whole.
*/
#define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) (b0) & 0x1f) << 6)  |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80)))
#define UTF8MB3_CODE(b0,b1,b2)    (((my_wc_t) ((uchar) (b0) & 0x0f) << 12) |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 6)  |\
                                   ((my_wc_t) ((uchar) (b2) ^ 0x80)))
#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) (b0) & 0x07) << 18) |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 12) |\
                                   ((my_wc_t) ((uchar) (b2) ^ 0x80) << 6)  |\
                                    (my_wc_t) ((uchar) (b3) ^ 0x80))
/*
  Decode one UTF-8 character of at most three bytes from [s, e).

  @param[out] pwc  receives the decoded code point on success
  @param      s    first byte of the input
  @param      e    end of the input (one past the last readable byte)

  @return  1, 2 or 3: the number of bytes consumed
  @return  MY_CS_TOOSMALL   if the input is empty
  @return  MY_CS_TOOSMALL2  if a 2-byte sequence is cut short by e
  @return  MY_CS_TOOSMALL3  if a 3-byte sequence is cut short by e
  @return  MY_CS_ILSEQ      if the bytes do not form an accepted sequence
                            (this includes every lead byte >= 0xF0)
*/
static inline int
my_mb_wc_utf8mb3_quick(my_wc_t * pwc, const uchar *s, const uchar *e)
{
  uchar c;

  if (s >= e)
    return MY_CS_TOOSMALL;

  c= s[0];
  if (c < 0x80)                         /* One byte, stored as is */
  {
    *pwc= c;
    return 1;
  }

  if (c < 0xc2)                         /* [0x80..0xC1] cannot start a char */
    return MY_CS_ILSEQ;

  /*
    The length checks below compare the number of remaining bytes (e - s)
    instead of computing s+N, so that no pointer beyond one-past-the-end
    of the input is ever formed.
  */
  if (c < 0xe0)
  {
    if (e - s < 2)                      /* We need 2 characters */
      return MY_CS_TOOSMALL2;
    if (!(IS_CONTINUATION_BYTE(s[1])))
      return MY_CS_ILSEQ;
    *pwc= UTF8MB2_CODE(c, s[1]);
    return 2;
  }

  if (c < 0xf0)
  {
    if (e - s < 3)                      /* We need 3 characters */
      return MY_CS_TOOSMALL3;
    if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
      return MY_CS_ILSEQ;
    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
    return 3;
  }

  return MY_CS_ILSEQ;
}
#ifdef HAVE_CHARSET_utf8mb4
/*
  Decode one UTF-8 character of at most four bytes from [s, e).

  @param[out] pwc  receives the decoded code point on success
  @param      s    first byte of the input
  @param      e    end of the input (one past the last readable byte)

  @return  1, 2, 3 or 4: the number of bytes consumed
  @return  MY_CS_TOOSMALL   if the input is empty
  @return  MY_CS_TOOSMALL2  if a 2-byte sequence is cut short by e
  @return  MY_CS_TOOSMALL3  if a 3-byte sequence is cut short by e
  @return  MY_CS_TOOSMALL4  if a 4-byte sequence is cut short by e
  @return  MY_CS_ILSEQ      if the bytes do not form an accepted sequence
                            (this includes every lead byte >= 0xF5)
*/
static inline int
my_mb_wc_utf8mb4_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
{
  uchar c;

  if (s >= e)
    return MY_CS_TOOSMALL;

  c= s[0];
  if (c < 0x80)                         /* One byte, stored as is */
  {
    *pwc= c;
    return 1;
  }

  if (c < 0xc2)                         /* [0x80..0xC1] cannot start a char */
    return MY_CS_ILSEQ;

  /*
    The length checks below compare the number of remaining bytes (e - s)
    instead of computing s+N, so that no pointer beyond one-past-the-end
    of the input is ever formed.
  */
  if (c < 0xe0)
  {
    if (e - s < 2)                      /* We need 2 characters */
      return MY_CS_TOOSMALL2;
    if (!(IS_CONTINUATION_BYTE(s[1])))
      return MY_CS_ILSEQ;
    *pwc= UTF8MB2_CODE(c, s[1]);
    return 2;
  }

  if (c < 0xf0)
  {
    if (e - s < 3)                      /* We need 3 characters */
      return MY_CS_TOOSMALL3;
    if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
      return MY_CS_ILSEQ;
    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
    return 3;
  }

  if (c < 0xf5)
  {
    if (e - s < 4)                      /* We need 4 characters */
      return MY_CS_TOOSMALL4;
    if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
      return MY_CS_ILSEQ;
    *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
    return 4;
  }

  return MY_CS_ILSEQ;
}
#endif /* HAVE_CHARSET_utf8mb4*/
#endif /* _CTYPE_UTF8_H */