mirror of
https://github.com/MariaDB/server.git
synced 2025-01-15 19:42:28 +01:00
MDEV-17474 Change Unicode collation implementation from "handler" to "inline" style
This commit is contained in:
parent
fee24b1281
commit
6eae037c4c
9 changed files with 1323 additions and 1268 deletions
|
@ -362,7 +362,6 @@ extern MY_COLLATION_HANDLER my_collation_8bit_bin_handler;
|
|||
extern MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler;
|
||||
extern MY_COLLATION_HANDLER my_collation_8bit_nopad_bin_handler;
|
||||
extern MY_COLLATION_HANDLER my_collation_8bit_simple_nopad_ci_handler;
|
||||
extern MY_COLLATION_HANDLER my_collation_ucs2_uca_handler;
|
||||
|
||||
/* Some typedef to make it easy for C++ to make function pointers */
|
||||
typedef int (*my_charset_conv_mb_wc)(CHARSET_INFO *, my_wc_t *,
|
||||
|
|
1263
strings/ctype-uca.c
1263
strings/ctype-uca.c
File diff suppressed because it is too large
Load diff
763
strings/ctype-uca.ic
Normal file
763
strings/ctype-uca.ic
Normal file
|
@ -0,0 +1,763 @@
|
|||
/*
|
||||
Copyright (c) 2018 MariaDB Corporation
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; version 2 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
|
||||
/*
  The code below is written in terms of three macros that must already
  be defined when this point is reached. Stop the build early with a
  clear message if any of them is missing.
*/
#if !defined(MY_FUNCTION_NAME)
#error MY_FUNCTION_NAME is not defined
#endif
#if !defined(MY_MB_WC)
#error MY_MB_WC is not defined
#endif
#if !defined(MY_LIKE_RANGE)
#error MY_LIKE_RANGE is not defined
#endif
|
||||
|
||||
|
||||
static inline int
|
||||
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
|
||||
{
|
||||
/*
|
||||
Check if the weights for the previous character have been
|
||||
already fully scanned. If yes, then get the next character and
|
||||
initialize wbeg and wlength to its weight string.
|
||||
*/
|
||||
|
||||
if (scanner->wbeg[0]) /* More weights left from the previous step: */
|
||||
return *scanner->wbeg++; /* return the next weight from expansion */
|
||||
|
||||
do
|
||||
{
|
||||
const uint16 *wpage;
|
||||
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
|
||||
int mblen;
|
||||
|
||||
/* Get next character */
|
||||
if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
|
||||
scanner->send)) <= 0))
|
||||
{
|
||||
if (scanner->sbeg >= scanner->send)
|
||||
return -1; /* No more bytes, end of line reached */
|
||||
/*
|
||||
There are some more bytes left. Non-positive mb_len means that
|
||||
we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
|
||||
*/
|
||||
if ((scanner->sbeg+= scanner->cs->mbminlen) > scanner->send)
|
||||
{
|
||||
/* For safety purposes don't go beyond the string range. */
|
||||
scanner->sbeg= scanner->send;
|
||||
}
|
||||
/*
|
||||
Treat every complete or incomplete mbminlen unit as a weight which is
|
||||
greater than weight for any possible normal character.
|
||||
0xFFFF is greater than any possible weight in the UCA weight table.
|
||||
*/
|
||||
return 0xFFFF;
|
||||
}
|
||||
|
||||
scanner->sbeg+= mblen;
|
||||
if (wc[0] > scanner->level->maxchar)
|
||||
{
|
||||
/* Return 0xFFFD as weight for all characters outside BMP */
|
||||
scanner->wbeg= nochar;
|
||||
return 0xFFFD;
|
||||
}
|
||||
|
||||
if (my_uca_have_contractions_quick(scanner->level))
|
||||
{
|
||||
uint16 *cweight;
|
||||
/*
|
||||
If we have scanned a character which can have previous context,
|
||||
and there were some more characters already before,
|
||||
then reconstruct codepoint of the previous character
|
||||
from "page" and "code" into w[1], and verify that {wc[1], wc[0]}
|
||||
together form a real previous context pair.
|
||||
Note, we support only 2-character long sequences with previous
|
||||
context at the moment. CLDR does not have longer sequences.
|
||||
*/
|
||||
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
|
||||
wc[0]) &&
|
||||
scanner->wbeg != nochar && /* if not the very first character */
|
||||
my_uca_can_be_previous_context_head(&scanner->level->contractions,
|
||||
(wc[1]= ((scanner->page << 8) +
|
||||
scanner->code))) &&
|
||||
(cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
|
||||
{
|
||||
scanner->page= scanner->code= 0; /* Clear for the next character */
|
||||
return *cweight;
|
||||
}
|
||||
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
|
||||
wc[0]))
|
||||
{
|
||||
/* Check if w[0] starts a contraction */
|
||||
if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
|
||||
return *cweight;
|
||||
}
|
||||
}
|
||||
|
||||
/* Process single character */
|
||||
scanner->page= wc[0] >> 8;
|
||||
scanner->code= wc[0] & 0xFF;
|
||||
|
||||
/* If weight page for w[0] does not exist, then calculate algoritmically */
|
||||
if (!(wpage= scanner->level->weights[scanner->page]))
|
||||
return my_uca_scanner_next_implicit(scanner);
|
||||
|
||||
/* Calculate pointer to w[0]'s weight, using page and offset */
|
||||
scanner->wbeg= wpage +
|
||||
scanner->code * scanner->level->lengths[scanner->page];
|
||||
} while (!scanner->wbeg[0]); /* Skip ignorable characters */
|
||||
|
||||
return *scanner->wbeg++;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Compares two strings according to the collation
|
||||
|
||||
SYNOPSIS:
|
||||
strnncoll_onelevel()
|
||||
cs Character set information
|
||||
level Weight level (0 primary, 1 secondary, 2 tertiary, etc)
|
||||
s First string
|
||||
slen First string length
|
||||
t Second string
|
||||
tlen Seconf string length
|
||||
level DUCETweight level
|
||||
|
||||
NOTES:
|
||||
Initializes two weight scanners and gets weights
|
||||
corresponding to two strings in a loop. If weights are not
|
||||
the same at some step then returns their difference.
|
||||
|
||||
In the while() comparison these situations are possible:
|
||||
1. (s_res>0) and (t_res>0) and (s_res == t_res)
|
||||
Weights are the same so far, continue comparison
|
||||
2. (s_res>0) and (t_res>0) and (s_res!=t_res)
|
||||
A difference has been found, return.
|
||||
3. (s_res>0) and (t_res<0)
|
||||
We have reached the end of the second string, or found
|
||||
an illegal multibyte sequence in the second string.
|
||||
Return a positive number, i.e. the first string is bigger.
|
||||
4. (s_res<0) and (t_res>0)
|
||||
We have reached the end of the first string, or found
|
||||
an illegal multibyte sequence in the first string.
|
||||
Return a negative number, i.e. the second string is bigger.
|
||||
5. (s_res<0) and (t_res<0)
|
||||
Both scanners returned -1. It means we have riched
|
||||
the end-of-string of illegal-sequence in both strings
|
||||
at the same time. Return 0, strings are equal.
|
||||
|
||||
RETURN
|
||||
Difference between two strings, according to the collation:
|
||||
0 - means strings are equal
|
||||
negative number - means the first string is smaller
|
||||
positive number - means the first string is bigger
|
||||
*/
|
||||
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs,
|
||||
const MY_UCA_WEIGHT_LEVEL *level,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool t_is_prefix)
|
||||
{
|
||||
my_uca_scanner sscanner;
|
||||
my_uca_scanner tscanner;
|
||||
int s_res;
|
||||
int t_res;
|
||||
|
||||
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
|
||||
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
|
||||
|
||||
do
|
||||
{
|
||||
s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner);
|
||||
t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner);
|
||||
} while ( s_res == t_res && s_res >0);
|
||||
|
||||
return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
One-level, PAD SPACE.
|
||||
*/
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool t_is_prefix)
|
||||
{
|
||||
return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0],
|
||||
s, slen, t, tlen, t_is_prefix);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Multi-level, PAD SPACE.
|
||||
*/
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncoll_multilevel)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool t_is_prefix)
|
||||
{
|
||||
uint i, num_level= cs->levels_for_order;
|
||||
for (i= 0; i != num_level; i++)
|
||||
{
|
||||
int ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i],
|
||||
s, slen, t, tlen,
|
||||
t_is_prefix);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Compares two strings according to the collation,
|
||||
ignoring trailing spaces.
|
||||
|
||||
SYNOPSIS:
|
||||
strnncollsp_onelevel()
|
||||
cs Character set information
|
||||
level UCA weight level
|
||||
s First string
|
||||
slen First string length
|
||||
t Second string
|
||||
tlen Seconf string length
|
||||
level DUCETweight level
|
||||
|
||||
NOTES:
|
||||
Works exactly the same with my_strnncoll_uca(),
|
||||
but ignores trailing spaces.
|
||||
|
||||
In the while() comparison these situations are possible:
|
||||
1. (s_res>0) and (t_res>0) and (s_res == t_res)
|
||||
Weights are the same so far, continue comparison
|
||||
2. (s_res>0) and (t_res>0) and (s_res!=t_res)
|
||||
A difference has been found, return.
|
||||
3. (s_res>0) and (t_res<0)
|
||||
We have reached the end of the second string, or found
|
||||
an illegal multibyte sequence in the second string.
|
||||
Compare the first string to an infinite array of
|
||||
space characters until difference is found, or until
|
||||
the end of the first string.
|
||||
4. (s_res<0) and (t_res>0)
|
||||
We have reached the end of the first string, or found
|
||||
an illegal multibyte sequence in the first string.
|
||||
Compare the second string to an infinite array of
|
||||
space characters until difference is found or until
|
||||
the end of the second steing.
|
||||
5. (s_res<0) and (t_res<0)
|
||||
Both scanners returned -1. It means we have riched
|
||||
the end-of-string of illegal-sequence in both strings
|
||||
at the same time. Return 0, strings are equal.
|
||||
|
||||
RETURN
|
||||
Difference between two strings, according to the collation:
|
||||
0 - means strings are equal
|
||||
negative number - means the first string is smaller
|
||||
positive number - means the first string is bigger
|
||||
*/
|
||||
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
|
||||
const MY_UCA_WEIGHT_LEVEL *level,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen)
|
||||
{
|
||||
my_uca_scanner sscanner, tscanner;
|
||||
int s_res, t_res;
|
||||
|
||||
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
|
||||
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
|
||||
|
||||
do
|
||||
{
|
||||
s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner);
|
||||
t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner);
|
||||
} while ( s_res == t_res && s_res >0);
|
||||
|
||||
if (s_res > 0 && t_res < 0)
|
||||
{
|
||||
/* Calculate weight for SPACE character */
|
||||
t_res= my_space_weight(level);
|
||||
|
||||
/* compare the first string to spaces */
|
||||
do
|
||||
{
|
||||
if (s_res != t_res)
|
||||
return (s_res - t_res);
|
||||
s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner);
|
||||
} while (s_res > 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (s_res < 0 && t_res > 0)
|
||||
{
|
||||
/* Calculate weight for SPACE character */
|
||||
s_res= my_space_weight(level);
|
||||
|
||||
/* compare the second string to spaces */
|
||||
do
|
||||
{
|
||||
if (s_res != t_res)
|
||||
return (s_res - t_res);
|
||||
t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner);
|
||||
} while (t_res > 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return ( s_res - t_res );
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
One-level, PAD SPACE
|
||||
*/
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen)
|
||||
{
|
||||
return MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[0],
|
||||
s, slen, t, tlen);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
One-level, NO PAD
|
||||
*/
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncollsp_nopad)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen)
|
||||
{
|
||||
return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0],
|
||||
s, slen, t, tlen, FALSE);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Multi-level, PAD SPACE
|
||||
*/
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncollsp_multilevel)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen)
|
||||
{
|
||||
|
||||
uint i, num_level= cs->levels_for_order;
|
||||
for (i= 0; i != num_level; i++)
|
||||
{
|
||||
int ret= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[i],
|
||||
s, slen, t, tlen);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Multi-level, NO PAD
|
||||
*/
|
||||
static int
|
||||
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen)
|
||||
{
|
||||
uint num_level= cs->levels_for_order;
|
||||
uint i;
|
||||
for (i= 0; i != num_level; i++)
|
||||
{
|
||||
int ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i],
|
||||
s, slen, t, tlen, FALSE);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Calculates hash value for the given string,
|
||||
according to the collation, and ignoring trailing spaces.
|
||||
|
||||
SYNOPSIS:
|
||||
hash_sort()
|
||||
cs Character set information
|
||||
s String
|
||||
slen String's length
|
||||
n1 First hash parameter
|
||||
n2 Second hash parameter
|
||||
|
||||
NOTES:
|
||||
Scans consequently weights and updates
|
||||
hash parameters n1 and n2. In a case insensitive collation,
|
||||
upper and lower case of the same letter will return the same
|
||||
weight sequence, and thus will produce the same hash values
|
||||
in n1 and n2.
|
||||
|
||||
This functions is used for one-level and for multi-level collations.
|
||||
We intentionally use only primary level in multi-level collations.
|
||||
This helps to have PARTITION BY KEY put primarily equal records
|
||||
into the same partition. E.g. in utf8_thai_520_ci records that differ
|
||||
only in tone marks go into the same partition.
|
||||
|
||||
RETURN
|
||||
N/A
|
||||
*/
|
||||
|
||||
static void
|
||||
MY_FUNCTION_NAME(hash_sort)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
ulong *nr1, ulong *nr2)
|
||||
{
|
||||
int s_res;
|
||||
my_uca_scanner scanner;
|
||||
int space_weight= my_space_weight(&cs->uca->level[0]);
|
||||
register ulong m1= *nr1, m2= *nr2;
|
||||
|
||||
my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen);
|
||||
|
||||
while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) >0)
|
||||
{
|
||||
if (s_res == space_weight)
|
||||
{
|
||||
/* Combine all spaces to be able to skip end spaces */
|
||||
uint count= 0;
|
||||
do
|
||||
{
|
||||
count++;
|
||||
if ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) <= 0)
|
||||
{
|
||||
/* Skip strings at end of string */
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
while (s_res == space_weight);
|
||||
|
||||
/* Add back that has for the space characters */
|
||||
do
|
||||
{
|
||||
/*
|
||||
We can't use MY_HASH_ADD_16() here as we, because of a misstake
|
||||
in the original code, where we added the 16 byte variable the
|
||||
opposite way. Changing this would cause old partitioned tables
|
||||
to fail.
|
||||
*/
|
||||
MY_HASH_ADD(m1, m2, space_weight >> 8);
|
||||
MY_HASH_ADD(m1, m2, space_weight & 0xFF);
|
||||
}
|
||||
while (--count != 0);
|
||||
|
||||
}
|
||||
/* See comment above why we can't use MY_HASH_ADD_16() */
|
||||
MY_HASH_ADD(m1, m2, s_res >> 8);
|
||||
MY_HASH_ADD(m1, m2, s_res & 0xFF);
|
||||
}
|
||||
end:
|
||||
*nr1= m1;
|
||||
*nr2= m2;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
MY_FUNCTION_NAME(hash_sort_nopad)(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
ulong *nr1, ulong *nr2)
|
||||
{
|
||||
int s_res;
|
||||
my_uca_scanner scanner;
|
||||
register ulong m1= *nr1, m2= *nr2;
|
||||
|
||||
my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen);
|
||||
|
||||
while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) >0)
|
||||
{
|
||||
/* See comment above why we can't use MY_HASH_ADD_16() */
|
||||
MY_HASH_ADD(m1, m2, s_res >> 8);
|
||||
MY_HASH_ADD(m1, m2, s_res & 0xFF);
|
||||
}
|
||||
*nr1= m1;
|
||||
*nr2= m2;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
For the given string creates its "binary image", suitable
|
||||
to be used in binary comparison, i.e. in memcmp().
|
||||
|
||||
SYNOPSIS:
|
||||
my_strnxfrm_uca()
|
||||
cs Character set information
|
||||
dst Where to write the image
|
||||
dstlen Space available for the image, in bytes
|
||||
src The source string
|
||||
srclen Length of the source string, in bytes
|
||||
|
||||
NOTES:
|
||||
In a loop, scans weights from the source string and writes
|
||||
them into the binary image. In a case insensitive collation,
|
||||
upper and lower cases of the same letter will produce the
|
||||
same image subsequences. When we have reached the end-of-string
|
||||
or found an illegal multibyte sequence, the loop stops.
|
||||
|
||||
It is impossible to restore the original string using its
|
||||
binary image.
|
||||
|
||||
Binary images are used for bulk comparison purposes,
|
||||
e.g. in ORDER BY, when it is more efficient to create
|
||||
a binary image and use it instead of weight scanner
|
||||
for the original strings for every comparison.
|
||||
|
||||
RETURN
|
||||
Number of bytes that have been written into the binary image.
|
||||
*/
|
||||
|
||||
static uchar *
|
||||
MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(CHARSET_INFO *cs,
|
||||
MY_UCA_WEIGHT_LEVEL *level,
|
||||
uchar *dst, uchar *de,
|
||||
uint *nweights,
|
||||
const uchar *src, size_t srclen)
|
||||
{
|
||||
my_uca_scanner scanner;
|
||||
int s_res;
|
||||
|
||||
DBUG_ASSERT(src || !srclen);
|
||||
|
||||
my_uca_scanner_init_any(&scanner, cs, level, src, srclen);
|
||||
for (; dst < de && *nweights &&
|
||||
(s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) > 0 ; (*nweights)--)
|
||||
{
|
||||
*dst++= s_res >> 8;
|
||||
if (dst < de)
|
||||
*dst++= s_res & 0xFF;
|
||||
}
|
||||
return dst;
|
||||
}
|
||||
|
||||
|
||||
static uchar *
|
||||
MY_FUNCTION_NAME(strnxfrm_onelevel)(CHARSET_INFO *cs,
|
||||
MY_UCA_WEIGHT_LEVEL *level,
|
||||
uchar *dst, uchar *de, uint nweights,
|
||||
const uchar *src, size_t srclen, uint flags)
|
||||
{
|
||||
uchar *d0= dst;
|
||||
dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
|
||||
dst, de, &nweights,
|
||||
src, srclen);
|
||||
DBUG_ASSERT(dst <= de);
|
||||
if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
|
||||
dst= my_strnxfrm_uca_padn(dst, de, nweights, my_space_weight(level));
|
||||
DBUG_ASSERT(dst <= de);
|
||||
my_strxfrm_desc_and_reverse(d0, dst, flags, 0);
|
||||
return dst;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static uchar *
|
||||
MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(CHARSET_INFO *cs,
|
||||
MY_UCA_WEIGHT_LEVEL *level,
|
||||
uchar *dst, uchar *de, uint nweights,
|
||||
const uchar *src, size_t srclen,
|
||||
uint flags)
|
||||
{
|
||||
uchar *d0= dst;
|
||||
dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
|
||||
dst, de, &nweights,
|
||||
src, srclen);
|
||||
DBUG_ASSERT(dst <= de);
|
||||
/* Pad with the minimum possible weight on this level */
|
||||
if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
|
||||
dst= my_strnxfrm_uca_padn(dst, de, nweights, min_weight_on_level(level));
|
||||
DBUG_ASSERT(dst <= de);
|
||||
my_strxfrm_desc_and_reverse(d0, dst, flags, 0);
|
||||
return dst;
|
||||
}
|
||||
|
||||
|
||||
static size_t
|
||||
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
|
||||
uchar *dst, size_t dstlen, uint nweights,
|
||||
const uchar *src, size_t srclen, uint flags)
|
||||
{
|
||||
uchar *d0= dst;
|
||||
uchar *de= dst + dstlen;
|
||||
|
||||
dst= MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, &cs->uca->level[0],
|
||||
dst, de, nweights,
|
||||
src, srclen, flags);
|
||||
/*
|
||||
This can probably be changed to memset(dst, 0, de - dst),
|
||||
like my_strnxfrm_uca_multilevel() does.
|
||||
*/
|
||||
if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
|
||||
dst= my_strnxfrm_uca_pad(dst, de, my_space_weight(&cs->uca->level[0]));
|
||||
return dst - d0;
|
||||
}
|
||||
|
||||
|
||||
static size_t
|
||||
MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
|
||||
uchar *dst, size_t dstlen,
|
||||
uint nweights,
|
||||
const uchar *src, size_t srclen,
|
||||
uint flags)
|
||||
{
|
||||
uchar *d0= dst;
|
||||
uchar *de= dst + dstlen;
|
||||
|
||||
dst= MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs, &cs->uca->level[0],
|
||||
dst, de, nweights,
|
||||
src, srclen, flags);
|
||||
if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
|
||||
{
|
||||
memset(dst, 0, de - dst);
|
||||
dst= de;
|
||||
}
|
||||
return dst - d0;
|
||||
}
|
||||
|
||||
|
||||
static size_t
|
||||
MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs,
|
||||
uchar *dst, size_t dstlen,
|
||||
uint nweights,
|
||||
const uchar *src, size_t srclen,
|
||||
uint flags)
|
||||
{
|
||||
uint num_level= cs->levels_for_order;
|
||||
uchar *d0= dst;
|
||||
uchar *de= dst + dstlen;
|
||||
uint current_level;
|
||||
|
||||
for (current_level= 0; current_level != num_level; current_level++)
|
||||
{
|
||||
if (!(flags & MY_STRXFRM_LEVEL_ALL) ||
|
||||
(flags & (MY_STRXFRM_LEVEL1 << current_level)))
|
||||
dst= cs->state & MY_CS_NOPAD ?
|
||||
MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs,
|
||||
&cs->uca->level[current_level],
|
||||
dst, de, nweights,
|
||||
src, srclen, flags) :
|
||||
MY_FUNCTION_NAME(strnxfrm_onelevel)(cs,
|
||||
&cs->uca->level[current_level],
|
||||
dst, de, nweights,
|
||||
src, srclen, flags);
|
||||
}
|
||||
|
||||
if (dst < de && (flags & MY_STRXFRM_PAD_TO_MAXLEN))
|
||||
{
|
||||
memset(dst, 0, de - dst);
|
||||
dst= de;
|
||||
}
|
||||
|
||||
return dst - d0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
One-level, PAD SPACE
|
||||
*/
|
||||
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
|
||||
{
|
||||
my_coll_init_uca,
|
||||
MY_FUNCTION_NAME(strnncoll),
|
||||
MY_FUNCTION_NAME(strnncollsp),
|
||||
MY_FUNCTION_NAME(strnxfrm),
|
||||
my_strnxfrmlen_any_uca,
|
||||
MY_LIKE_RANGE,
|
||||
my_wildcmp_uca,
|
||||
NULL, /* strcasecmp() */
|
||||
my_instr_mb,
|
||||
MY_FUNCTION_NAME(hash_sort),
|
||||
my_propagate_complex
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
One-level, NO PAD
|
||||
For character sets with mbminlen==1 use MY_LIKE_RANGE=my_like_range_mb
|
||||
For character sets with mbminlen>=2 use MY_LIKE_RANGE=my_like_range_generic
|
||||
*/
|
||||
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
|
||||
{
|
||||
my_coll_init_uca,
|
||||
MY_FUNCTION_NAME(strnncoll),
|
||||
MY_FUNCTION_NAME(strnncollsp_nopad),
|
||||
MY_FUNCTION_NAME(strnxfrm_nopad),
|
||||
my_strnxfrmlen_any_uca,
|
||||
MY_LIKE_RANGE, /* my_like_range_mb or my_like_range_generic */
|
||||
my_wildcmp_uca,
|
||||
NULL, /* strcasecmp() */
|
||||
my_instr_mb,
|
||||
MY_FUNCTION_NAME(hash_sort_nopad),
|
||||
my_propagate_complex
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
Multi-level, PAD SPACE
|
||||
*/
|
||||
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
|
||||
{
|
||||
my_coll_init_uca,
|
||||
MY_FUNCTION_NAME(strnncoll_multilevel),
|
||||
MY_FUNCTION_NAME(strnncollsp_multilevel),
|
||||
MY_FUNCTION_NAME(strnxfrm_multilevel),
|
||||
my_strnxfrmlen_any_uca_multilevel,
|
||||
MY_LIKE_RANGE,
|
||||
my_wildcmp_uca,
|
||||
NULL, /* strcasecmp() */
|
||||
my_instr_mb,
|
||||
MY_FUNCTION_NAME(hash_sort),
|
||||
my_propagate_complex
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
Multi-level, NO PAD
|
||||
*/
|
||||
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
|
||||
{
|
||||
my_coll_init_uca,
|
||||
MY_FUNCTION_NAME(strnncoll_multilevel),
|
||||
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel),
|
||||
MY_FUNCTION_NAME(strnxfrm_multilevel),
|
||||
my_strnxfrmlen_any_uca_multilevel,
|
||||
MY_LIKE_RANGE,
|
||||
my_wildcmp_uca,
|
||||
NULL, /* strcasecmp() */
|
||||
my_instr_mb,
|
||||
MY_FUNCTION_NAME(hash_sort),
|
||||
my_propagate_complex
|
||||
};
|
||||
|
||||
|
||||
/* Release the parameter macros that were required at the top of this file */
#undef MY_LIKE_RANGE
#undef MY_MB_WC
#undef MY_FUNCTION_NAME
|
|
@ -1184,35 +1184,7 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
|
|||
but the JSON functions needed my_utf16_uni()
|
||||
so the #ifdef was moved lower.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
D800..DB7F - Non-provate surrogate high (896 pages)
|
||||
DB80..DBFF - Private surrogate high (128 pages)
|
||||
DC00..DFFF - Surrogate low (1024 codes in a page)
|
||||
*/
|
||||
#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
|
||||
#define MY_UTF16_SURROGATE_HIGH_LAST 0xDBFF
|
||||
#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00
|
||||
#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF
|
||||
|
||||
#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8)
|
||||
#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC)
|
||||
/* Test if a byte is a leading byte of a high or low surrogate head: */
|
||||
#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
|
||||
/* Test if a Unicode code point is a high or low surrogate head */
|
||||
#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800)
|
||||
|
||||
#define MY_UTF16_WC2(a, b) ((a << 8) + b)
|
||||
|
||||
/*
|
||||
a= 110110?? (<< 18)
|
||||
b= ???????? (<< 10)
|
||||
c= 110111?? (<< 8)
|
||||
d= ???????? (<< 0)
|
||||
*/
|
||||
#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \
|
||||
((c & 3) << 8) + d + 0x10000)
|
||||
#include "ctype-utf16.h"
|
||||
|
||||
#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b0))
|
||||
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
|
||||
|
@ -1261,32 +1233,7 @@ static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
|
|||
my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
|
||||
my_wc_t *pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
if (s + 2 > e)
|
||||
return MY_CS_TOOSMALL2;
|
||||
|
||||
/*
|
||||
High bytes: 0xD[89AB] = B'110110??'
|
||||
Low bytes: 0xD[CDEF] = B'110111??'
|
||||
Surrogate mask: 0xFC = B'11111100'
|
||||
*/
|
||||
|
||||
if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
|
||||
{
|
||||
if (s + 4 > e)
|
||||
return MY_CS_TOOSMALL4;
|
||||
|
||||
if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
|
||||
return 4;
|
||||
}
|
||||
|
||||
if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= MY_UTF16_WC2(s[0], s[1]);
|
||||
return 2;
|
||||
return my_mb_wc_utf16_quick(pwc, s, e);
|
||||
}
|
||||
|
||||
|
||||
|
@ -2109,6 +2056,8 @@ struct charset_info_st my_charset_utf16le_nopad_bin=
|
|||
|
||||
#ifdef HAVE_CHARSET_utf32
|
||||
|
||||
#include "ctype-utf32.h"
|
||||
|
||||
/*
|
||||
Check is b0 and b1 start a valid UTF32 four-byte sequence.
|
||||
Don't accept characters greater than U+10FFFF.
|
||||
|
@ -2117,8 +2066,6 @@ struct charset_info_st my_charset_utf16le_nopad_bin=
|
|||
|
||||
#define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1))
|
||||
|
||||
#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t)b0) << 24) + (b1 << 16) + \
|
||||
(b2 << 8) + (b3))
|
||||
|
||||
static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
|
||||
uchar b2, uchar b3)
|
||||
|
@ -2161,10 +2108,7 @@ static int
|
|||
my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
|
||||
my_wc_t *pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
if (s + 4 > e)
|
||||
return MY_CS_TOOSMALL4;
|
||||
*pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
|
||||
return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
|
||||
return my_mb_wc_utf32_quick(pwc, s, e);
|
||||
}
|
||||
|
||||
|
||||
|
@ -2928,6 +2872,8 @@ struct charset_info_st my_charset_utf32_nopad_bin=
|
|||
|
||||
#ifdef HAVE_CHARSET_ucs2
|
||||
|
||||
#include "ctype-ucs2.h"
|
||||
|
||||
static const uchar ctype_ucs2[] = {
|
||||
0,
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
|
||||
|
@ -3037,11 +2983,7 @@ my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
|
|||
static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
|
||||
my_wc_t * pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
if (s+2 > e) /* Need 2 characters */
|
||||
return MY_CS_TOOSMALL2;
|
||||
|
||||
*pwc= ((uchar)s[0]) * 256 + ((uchar)s[1]);
|
||||
return 2;
|
||||
return my_mb_wc_ucs2_quick(pwc, s, e);
|
||||
}
|
||||
|
||||
static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
|
||||
|
|
32
strings/ctype-ucs2.h
Normal file
32
strings/ctype-ucs2.h
Normal file
|
@ -0,0 +1,32 @@
|
|||
/*
|
||||
Copyright (c) 2018 MariaDB Corporation
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; version 2 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef _CTYPE_UCS2_H
|
||||
#define _CTYPE_UCS2_H
|
||||
|
||||
|
||||
static inline int
|
||||
my_mb_wc_ucs2_quick(my_wc_t * pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
if (s+2 > e) /* Need 2 characters */
|
||||
return MY_CS_TOOSMALL2;
|
||||
*pwc= ((uchar)s[0]) * 256 + ((uchar)s[1]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
|
||||
#endif /* _CTYPE_UCS2_H */
|
80
strings/ctype-utf16.h
Normal file
80
strings/ctype-utf16.h
Normal file
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
Copyright (c) 2018 MariaDB Corporation
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; version 2 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef _CTYPE_UTF16_H
|
||||
#define _CTYPE_UTF16_H
|
||||
|
||||
/*
|
||||
D800..DB7F - Non-provate surrogate high (896 pages)
|
||||
DB80..DBFF - Private surrogate high (128 pages)
|
||||
DC00..DFFF - Surrogate low (1024 codes in a page)
|
||||
*/
|
||||
#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
|
||||
#define MY_UTF16_SURROGATE_HIGH_LAST 0xDBFF
|
||||
#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00
|
||||
#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF
|
||||
|
||||
#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8)
|
||||
#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC)
|
||||
/* Test if a byte is a leading byte of a high or low surrogate head: */
|
||||
#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
|
||||
/* Test if a Unicode code point is a high or low surrogate head */
|
||||
#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800)
|
||||
|
||||
/*
  Combine the two bytes of a single UTF-16 code unit into a code point:
  a is the most significant byte, b the least significant one.
  Arguments are parenthesized so that expressions can be passed safely.
*/
#define MY_UTF16_WC2(a, b)       (((a) << 8) + (b))
|
||||
|
||||
/*
  Combine the four bytes of a UTF-16 surrogate pair into a code point.
  Only the two low bits of the surrogate head bytes carry payload:

    a= 110110??  (<< 18)
    b= ????????  (<< 10)
    c= 110111??  (<<  8)
    d= ????????  (<<  0)

  0x10000 is then added, because surrogate pairs encode code points
  starting at U+10000.
  Arguments are parenthesized so that expressions can be passed safely.
*/
#define MY_UTF16_WC4(a, b, c, d) ((((a) & 3) << 18) + ((b) << 10) + \
                                  (((c) & 3) << 8) + (d) + 0x10000)
|
||||
|
||||
static inline int
|
||||
my_mb_wc_utf16_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
if (s + 2 > e)
|
||||
return MY_CS_TOOSMALL2;
|
||||
|
||||
/*
|
||||
High bytes: 0xD[89AB] = B'110110??'
|
||||
Low bytes: 0xD[CDEF] = B'110111??'
|
||||
Surrogate mask: 0xFC = B'11111100'
|
||||
*/
|
||||
|
||||
if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
|
||||
{
|
||||
if (s + 4 > e)
|
||||
return MY_CS_TOOSMALL4;
|
||||
|
||||
if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
|
||||
return 4;
|
||||
}
|
||||
|
||||
if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= MY_UTF16_WC2(s[0], s[1]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
#endif /* _CTYPE_UTF16_H */
|
33
strings/ctype-utf32.h
Normal file
33
strings/ctype-utf32.h
Normal file
|
@ -0,0 +1,33 @@
|
|||
/*
|
||||
Copyright (c) 2018 MariaDB Corporation
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; version 2 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef _CTYPE_UTF32_H
|
||||
#define _CTYPE_UTF32_H
|
||||
|
||||
/*
  Combine four bytes (most significant first) into a UTF-32 code point.
  b0 is widened to my_wc_t before shifting by 24 bits.
  Arguments are parenthesized so that expressions can be passed safely
  and the cast applies to the whole b0 argument.
*/
#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t) (b0)) << 24) + ((b1) << 16) + \
                                   ((b2) << 8) + (b3))
|
||||
|
||||
static inline int
|
||||
my_mb_wc_utf32_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
if (s + 4 > e)
|
||||
return MY_CS_TOOSMALL4;
|
||||
*pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
|
||||
return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
|
||||
}
|
||||
|
||||
#endif /* _CTYPE_UTF32_H */
|
|
@ -26,78 +26,9 @@
|
|||
#define EILSEQ ENOENT
|
||||
#endif
|
||||
|
||||
/* Detect special bytes and sequences */
|
||||
#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40)
|
||||
|
||||
/*
|
||||
Check MB2 character assuming that b0 is alredy known to be >= 0xC2.
|
||||
Use this macro if the caller already checked b0 for:
|
||||
- an MB1 character
|
||||
- an unused gap between MB1 and MB2HEAD
|
||||
*/
|
||||
#define IS_UTF8MB2_STEP2(b0,b1) (((uchar) (b0) < 0xE0) && \
|
||||
IS_CONTINUATION_BYTE((uchar) b1))
|
||||
#include "ctype-utf8.h"
|
||||
|
||||
/*
|
||||
Check MB3 character assuming that b0 is already known to be
|
||||
in the valid MB3HEAD range [0xE0..0xEF].
|
||||
*/
|
||||
#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \
|
||||
IS_CONTINUATION_BYTE(b2) && \
|
||||
((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0))
|
||||
|
||||
/*
|
||||
Check MB3 character assuming that b0 is already known to be >= 0xE0,
|
||||
but is not checked for the high end 0xF0 yet.
|
||||
Use this macro if the caller already checked b0 for:
|
||||
- an MB1 character
|
||||
- an unused gap between MB1 and MB2HEAD
|
||||
- an MB2HEAD
|
||||
*/
|
||||
#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
|
||||
IS_UTF8MB3_STEP2(b0,b1,b2))
|
||||
|
||||
/*
|
||||
UTF-8 quick four-byte mask:
|
||||
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
Encoding allows to encode U+00010000..U+001FFFFF
|
||||
|
||||
The maximum character defined in the Unicode standard is U+0010FFFF.
|
||||
Higher characters U+00110000..U+001FFFFF are not used.
|
||||
|
||||
11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
|
||||
11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
|
||||
|
||||
Valid codes:
|
||||
[F0][90..BF][80..BF][80..BF]
|
||||
[F1][80..BF][80..BF][80..BF]
|
||||
[F2][80..BF][80..BF][80..BF]
|
||||
[F3][80..BF][80..BF][80..BF]
|
||||
[F4][80..8F][80..BF][80..BF]
|
||||
*/
|
||||
|
||||
/*
|
||||
Check MB4 character assuming that b0 is already
|
||||
known to be in the range [0xF0..0xF4]
|
||||
*/
|
||||
#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
|
||||
IS_CONTINUATION_BYTE(b2) && \
|
||||
IS_CONTINUATION_BYTE(b3) && \
|
||||
(b0 >= 0xf1 || b1 >= 0x90) && \
|
||||
(b0 <= 0xf3 || b1 <= 0x8F))
|
||||
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
|
||||
IS_UTF8MB4_STEP2(b0,b1,b2,b3))
|
||||
|
||||
/* Convert individual bytes to Unicode code points */
|
||||
#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\
|
||||
((my_wc_t) ((uchar) b1 ^ 0x80)))
|
||||
#define UTF8MB3_CODE(b0,b1,b2) (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\
|
||||
((my_wc_t) ((uchar) b1 ^ 0x80) << 6) |\
|
||||
((my_wc_t) ((uchar) b2 ^ 0x80)))
|
||||
#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\
|
||||
((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\
|
||||
((my_wc_t) ((uchar) b2 ^ 0x80) << 6) |\
|
||||
(my_wc_t) ((uchar) b3 ^ 0x80))
|
||||
|
||||
/* Definitions for strcoll.ic */
|
||||
#define IS_MB1_CHAR(x) ((uchar) (x) < 0x80)
|
||||
|
@ -4981,42 +4912,7 @@ static const uchar to_upper_utf8[] = {
|
|||
static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
|
||||
my_wc_t * pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
uchar c;
|
||||
|
||||
if (s >= e)
|
||||
return MY_CS_TOOSMALL;
|
||||
|
||||
c= s[0];
|
||||
if (c < 0x80)
|
||||
{
|
||||
*pwc = c;
|
||||
return 1;
|
||||
}
|
||||
else if (c < 0xc2)
|
||||
return MY_CS_ILSEQ;
|
||||
else if (c < 0xe0)
|
||||
{
|
||||
if (s+2 > e) /* We need 2 characters */
|
||||
return MY_CS_TOOSMALL2;
|
||||
|
||||
if (!(IS_CONTINUATION_BYTE(s[1])))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB2_CODE(c, s[1]);
|
||||
return 2;
|
||||
}
|
||||
else if (c < 0xf0)
|
||||
{
|
||||
if (s+3 > e) /* We need 3 characters */
|
||||
return MY_CS_TOOSMALL3;
|
||||
|
||||
if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB3_CODE(c, s[1], s[2]);
|
||||
return 3;
|
||||
}
|
||||
return MY_CS_ILSEQ;
|
||||
return my_mb_wc_utf8mb3_quick(pwc, s, e);
|
||||
}
|
||||
|
||||
|
||||
|
@ -7379,52 +7275,7 @@ static int
|
|||
my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
|
||||
my_wc_t * pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
uchar c;
|
||||
|
||||
if (s >= e)
|
||||
return MY_CS_TOOSMALL;
|
||||
|
||||
c= s[0];
|
||||
if (c < 0x80)
|
||||
{
|
||||
*pwc= c;
|
||||
return 1;
|
||||
}
|
||||
else if (c < 0xc2)
|
||||
return MY_CS_ILSEQ;
|
||||
else if (c < 0xe0)
|
||||
{
|
||||
if (s + 2 > e) /* We need 2 characters */
|
||||
return MY_CS_TOOSMALL2;
|
||||
|
||||
if (!(IS_CONTINUATION_BYTE(s[1])))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB2_CODE(c, s[1]);
|
||||
return 2;
|
||||
}
|
||||
else if (c < 0xf0)
|
||||
{
|
||||
if (s + 3 > e) /* We need 3 characters */
|
||||
return MY_CS_TOOSMALL3;
|
||||
|
||||
if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB3_CODE(c, s[1], s[2]);
|
||||
return 3;
|
||||
}
|
||||
else if (c < 0xf5)
|
||||
{
|
||||
if (s + 4 > e) /* We need 4 characters */
|
||||
return MY_CS_TOOSMALL4;
|
||||
|
||||
if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
|
||||
return MY_CS_ILSEQ;
|
||||
*pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
|
||||
return 4;
|
||||
}
|
||||
return MY_CS_ILSEQ;
|
||||
return my_mb_wc_utf8mb4_quick(pwc, s, e);
|
||||
}
|
||||
|
||||
|
||||
|
|
190
strings/ctype-utf8.h
Normal file
190
strings/ctype-utf8.h
Normal file
|
@ -0,0 +1,190 @@
|
|||
/*
|
||||
Copyright (c) 2018 MariaDB Corporation
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; version 2 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef _CTYPE_UTF8_H
|
||||
#define _CTYPE_UTF8_H
|
||||
|
||||
/* Detect special bytes and sequences */
|
||||
#define IS_CONTINUATION_BYTE(c) (((uchar) (c) ^ 0x80) < 0x40)
|
||||
|
||||
/*
  Check MB2 character assuming that b0 is already known to be >= 0xC2.
  Use this macro if the caller already checked b0 for:
  - an MB1 character
  - an unused gap between MB1 and MB2HEAD

  IS_CONTINUATION_BYTE() casts its argument to uchar itself,
  so b1 is passed through unchanged.
*/
#define IS_UTF8MB2_STEP2(b0,b1)     (((uchar) (b0) < 0xE0) && \
                                     IS_CONTINUATION_BYTE(b1))
|
||||
|
||||
/*
  Check MB3 character assuming that b0 is already known to be
  in the valid MB3HEAD range [0xE0..0xEF].

  Both trailing bytes must be continuation bytes. In addition, when
  b0 is 0xE0 the second byte must be >= 0xA0: this is what the last
  clause tests (b0 >= 0xE1, or else b1 >= 0xA0).
  Arguments are parenthesized so that the uchar casts apply to the
  whole argument expression.
*/
#define IS_UTF8MB3_STEP2(b0,b1,b2)  (IS_CONTINUATION_BYTE(b1) && \
                                     IS_CONTINUATION_BYTE(b2) && \
                                     ((uchar) (b0) >= 0xe1 || \
                                      (uchar) (b1) >= 0xa0))
|
||||
|
||||
/*
|
||||
Check MB3 character assuming that b0 is already known to be >= 0xE0,
|
||||
but is not checked for the high end 0xF0 yet.
|
||||
Use this macro if the caller already checked b0 for:
|
||||
- an MB1 character
|
||||
- an unused gap between MB1 and MB2HEAD
|
||||
- an MB2HEAD
|
||||
*/
|
||||
#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
|
||||
IS_UTF8MB3_STEP2(b0,b1,b2))
|
||||
|
||||
/*
|
||||
UTF-8 quick four-byte mask:
|
||||
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
Encoding allows to encode U+00010000..U+001FFFFF
|
||||
|
||||
The maximum character defined in the Unicode standard is U+0010FFFF.
|
||||
Higher characters U+00110000..U+001FFFFF are not used.
|
||||
|
||||
11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
|
||||
11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
|
||||
|
||||
Valid codes:
|
||||
[F0][90..BF][80..BF][80..BF]
|
||||
[F1][80..BF][80..BF][80..BF]
|
||||
[F2][80..BF][80..BF][80..BF]
|
||||
[F3][80..BF][80..BF][80..BF]
|
||||
[F4][80..8F][80..BF][80..BF]
|
||||
*/
|
||||
|
||||
/*
  Check MB4 character assuming that b0 is already
  known to be in the range [0xF0..0xF4]

  All three trailing bytes must be continuation bytes. The last two
  clauses narrow the second byte at the edges of the range:
  - b0 == 0xF0 requires b1 >= 0x90
  - b0 == 0xF4 requires b1 <= 0x8F

  b0 and b1 are cast to uchar before the range comparisons, the same way
  IS_UTF8MB3_STEP2 does, so the macro also gives the right answer when
  it is passed plain (possibly signed) char values.
*/
#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
                                       IS_CONTINUATION_BYTE(b2) && \
                                       IS_CONTINUATION_BYTE(b3) && \
                                       ((uchar) (b0) >= 0xf1 || \
                                        (uchar) (b1) >= 0x90) && \
                                       ((uchar) (b0) <= 0xf3 || \
                                        (uchar) (b1) <= 0x8F))
|
||||
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
|
||||
IS_UTF8MB4_STEP2(b0,b1,b2,b3))
|
||||
|
||||
/*
  Convert individual bytes to Unicode code points.

  The head byte is masked down to its payload bits (5, 4 or 3 bits for
  two-, three- and four-byte sequences); every trailing byte has its
  0x80 marker bit flipped off with XOR, leaving its low six bits.
  These macros do not validate anything: callers are expected to have
  checked the sequence first.
  Arguments are parenthesized so that the uchar casts apply to the
  whole argument expression.
*/
#define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) (b0) & 0x1f) << 6)  |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80)))
#define UTF8MB3_CODE(b0,b1,b2)    (((my_wc_t) ((uchar) (b0) & 0x0f) << 12) |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 6)  |\
                                   ((my_wc_t) ((uchar) (b2) ^ 0x80)))
#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) (b0) & 0x07) << 18) |\
                                   ((my_wc_t) ((uchar) (b1) ^ 0x80) << 12) |\
                                   ((my_wc_t) ((uchar) (b2) ^ 0x80) << 6)  |\
                                    (my_wc_t) ((uchar) (b3) ^ 0x80))
|
||||
|
||||
static inline int
|
||||
my_mb_wc_utf8mb3_quick(my_wc_t * pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
uchar c;
|
||||
|
||||
if (s >= e)
|
||||
return MY_CS_TOOSMALL;
|
||||
|
||||
c= s[0];
|
||||
if (c < 0x80)
|
||||
{
|
||||
*pwc = c;
|
||||
return 1;
|
||||
}
|
||||
else if (c < 0xc2)
|
||||
return MY_CS_ILSEQ;
|
||||
else if (c < 0xe0)
|
||||
{
|
||||
if (s+2 > e) /* We need 2 characters */
|
||||
return MY_CS_TOOSMALL2;
|
||||
|
||||
if (!(IS_CONTINUATION_BYTE(s[1])))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB2_CODE(c, s[1]);
|
||||
return 2;
|
||||
}
|
||||
else if (c < 0xf0)
|
||||
{
|
||||
if (s+3 > e) /* We need 3 characters */
|
||||
return MY_CS_TOOSMALL3;
|
||||
|
||||
if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB3_CODE(c, s[1], s[2]);
|
||||
return 3;
|
||||
}
|
||||
return MY_CS_ILSEQ;
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_CHARSET_utf8mb4
|
||||
static inline int
|
||||
my_mb_wc_utf8mb4_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
|
||||
{
|
||||
uchar c;
|
||||
|
||||
if (s >= e)
|
||||
return MY_CS_TOOSMALL;
|
||||
|
||||
c= s[0];
|
||||
if (c < 0x80)
|
||||
{
|
||||
*pwc= c;
|
||||
return 1;
|
||||
}
|
||||
else if (c < 0xc2)
|
||||
return MY_CS_ILSEQ;
|
||||
else if (c < 0xe0)
|
||||
{
|
||||
if (s + 2 > e) /* We need 2 characters */
|
||||
return MY_CS_TOOSMALL2;
|
||||
|
||||
if (!(IS_CONTINUATION_BYTE(s[1])))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB2_CODE(c, s[1]);
|
||||
return 2;
|
||||
}
|
||||
else if (c < 0xf0)
|
||||
{
|
||||
if (s + 3 > e) /* We need 3 characters */
|
||||
return MY_CS_TOOSMALL3;
|
||||
|
||||
if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
|
||||
return MY_CS_ILSEQ;
|
||||
|
||||
*pwc= UTF8MB3_CODE(c, s[1], s[2]);
|
||||
return 3;
|
||||
}
|
||||
else if (c < 0xf5)
|
||||
{
|
||||
if (s + 4 > e) /* We need 4 characters */
|
||||
return MY_CS_TOOSMALL4;
|
||||
|
||||
if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
|
||||
return MY_CS_ILSEQ;
|
||||
*pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
|
||||
return 4;
|
||||
}
|
||||
return MY_CS_ILSEQ;
|
||||
}
|
||||
#endif /* HAVE_CHARSET_utf8mb4*/
|
||||
|
||||
|
||||
#endif /* _CTYPE_UTF8_H */
|
Loading…
Reference in a new issue