From a8efe7ab1f28e2219df5ae9aa88fa63c40ad1066 Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Fri, 19 Oct 2018 14:20:31 +0400 Subject: [PATCH] MDEV-17502 MDEV-17474 Change Unicode xxx_general_ci and xxx_bin collation implementation to "inline" style --- include/m_ctype.h | 8 -- strings/ctype-ucs2.c | 71 ++++++++--- strings/ctype-unidata.h | 31 +++++ strings/ctype-utf8.c | 209 ++++++++++++------------------- strings/strcoll.ic | 267 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 430 insertions(+), 156 deletions(-) create mode 100644 strings/ctype-unidata.h diff --git a/include/m_ctype.h b/include/m_ctype.h index 51c7954d575..c6273590bbe 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -871,14 +871,6 @@ size_t my_strnxfrm_mb_nopad(CHARSET_INFO *, uchar *dst, size_t dstlen, uint nweights, const uchar *src, size_t srclen, uint flags); -size_t my_strnxfrm_unicode(CHARSET_INFO *, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags); - -size_t my_strnxfrm_unicode_nopad(CHARSET_INFO *, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags); - size_t my_strnxfrmlen_unicode(CHARSET_INFO *, size_t); size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *, diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index f34b2a841e6..28e7def3ddf 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -23,6 +23,8 @@ #include #include +#include "ctype-unidata.h" + #if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2) #define HAVE_CHARSET_mb2 @@ -1192,10 +1194,17 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1) { my_wc_t wc= MY_UTF16_WC2(b0, b1); - MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? page[wc & 0xFF].sort : wc); } #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf16_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1) #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER @@ -1493,7 +1502,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler = NULL, /* init */ my_strnncoll_utf16_general_ci, my_strnncollsp_utf16_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf16_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ -1525,7 +1534,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_utf16_general_ci, my_strnncollsp_utf16_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf16_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ -1722,6 +1731,13 @@ struct charset_info_st my_charset_utf16_nopad_bin= #define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3)) #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) (cs->cset->mb_wc(cs, pwc, s, e)) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0) #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER @@ -1826,7 +1842,7 @@ static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler = NULL, /* init */ my_strnncoll_utf16le_general_ci, my_strnncollsp_utf16le_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf16le_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ -1858,7 +1874,7 @@ static MY_COLLATION_HANDLER my_collation_utf16le_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_utf16le_general_ci, my_strnncollsp_utf16le_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf16le_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf16_ci, @@ -2073,12 +2089,19 @@ static inline int my_weight_utf32_general_ci(uchar b0, uchar b1, my_wc_t wc= MY_UTF32_WC4(b0, b1, b2, b3); if (wc <= 0xFFFF) { - MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? page[wc & 0xFF].sort : wc); } return MY_CS_REPLACEMENT_CHARACTER; } #define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf32_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3) #include "strcoll.ic" @@ -2642,7 +2665,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler = NULL, /* init */ my_strnncoll_utf32_general_ci, my_strnncollsp_utf32_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf32_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf32_ci, @@ -2674,7 +2697,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_utf32_general_ci, my_strnncollsp_utf32_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf32_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_utf32_ci, @@ -2941,20 +2964,30 @@ static const uchar to_upper_ucs2[] = { static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1) { my_wc_t wc= UCS2_CODE(b0, b1); - MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? page[wc & 0xFF].sort : wc); } -#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1) +#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1) #include "strcoll.ic" -#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1) +#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin +#define DEFINE_STRNXFRM_UNICODE_BIN2 +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1) #include "strcoll.ic" @@ -3222,7 +3255,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = NULL, /* init */ my_strnncoll_ucs2_general_ci, my_strnncollsp_ucs2_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_ucs2_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_ci, @@ -3238,7 +3271,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler = NULL, /* init */ my_strnncoll_ucs2_bin, my_strnncollsp_ucs2_bin, - my_strnxfrm_unicode, + my_strnxfrm_ucs2_bin, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_bin, @@ -3254,7 +3287,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_ucs2_general_ci, my_strnncollsp_ucs2_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_ucs2_general_ci, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_ci, @@ -3270,7 +3303,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_nopad_bin_handler = NULL, /* init */ my_strnncoll_ucs2_bin, my_strnncollsp_ucs2_nopad_bin, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_ucs2_bin, my_strnxfrmlen_unicode, my_like_range_generic, my_wildcmp_ucs2_bin, diff --git a/strings/ctype-unidata.h b/strings/ctype-unidata.h new file mode 100644 index 00000000000..6712f5e1d79 --- /dev/null +++ b/strings/ctype-unidata.h @@ -0,0 +1,31 @@ +#ifndef CTYPE_UNIDATA_H_INCLUDED +#define CTYPE_UNIDATA_H_INCLUDED +/* + Copyright (c) 2018 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#define MY_UNICASE_INFO_DEFAULT_MAXCHAR 0xFFFF +extern MY_UNICASE_CHARACTER my_unicase_default_page00[256]; +extern MY_UNICASE_CHARACTER *my_unicase_default_pages[256]; + +size_t my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights); +size_t my_strxfrm_pad_unicode(uchar *str, uchar *strend); + + +#define PUT_WC_BE2_HAVE_1BYTE(dst, de, wc) \ + do { *dst++= (uchar) (wc >> 8); if (dst < de) *dst++= (uchar) (wc & 0xFF); } while(0) + +#endif /* CTYPE_UNIDATA_H_INCLUDED */ diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 44544e38d4f..20a6530953a 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -28,6 +28,7 @@ #include "ctype-utf8.h" +#include "ctype-unidata.h" /* Definitions for strcoll.ic */ @@ -111,7 +112,7 @@ int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e) #include "my_uctype.h" -static MY_UNICASE_CHARACTER plane00[]={ +MY_UNICASE_CHARACTER my_unicase_default_page00[]={ {0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001}, {0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003}, {0x0004,0x0004,0x0004}, {0x0005,0x0005,0x0005}, @@ -244,7 +245,7 @@ static MY_UNICASE_CHARACTER plane00[]={ /* - Almost similar to plane00, but maps sorting order + Almost similar to my_unicase_default_page00, but maps sorting order for U+00DF to 0x00DF instead of 0x0053. */ static MY_UNICASE_CHARACTER plane00_mysql500[]={ @@ -1690,9 +1691,10 @@ static MY_UNICASE_CHARACTER planeFF[]={ }; -static MY_UNICASE_CHARACTER *my_unicase_pages_default[256]= +MY_UNICASE_CHARACTER *my_unicase_default_pages[256]= { - plane00, plane01, plane02, plane03, plane04, plane05, NULL, NULL, + my_unicase_default_page00, + plane01, plane02, plane03, plane04, plane05, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F, @@ -1729,8 +1731,8 @@ static MY_UNICASE_CHARACTER *my_unicase_pages_default[256]= MY_UNICASE_INFO my_unicase_default= { - 0xFFFF, - my_unicase_pages_default + MY_UNICASE_INFO_DEFAULT_MAXCHAR, + my_unicase_default_pages }; @@ -4581,7 +4583,7 @@ my_wildcmp_unicode(CHARSET_INFO *cs, @return Result length */ -static size_t +size_t my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights) { uchar *str0; @@ -4610,7 +4612,7 @@ my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights) @return Result length */ -static size_t +size_t my_strxfrm_pad_unicode(uchar *str, uchar *strend) { uchar *str0= str; @@ -4625,95 +4627,6 @@ my_strxfrm_pad_unicode(uchar *str, uchar *strend) } -size_t my_strnxfrm_unicode_internal(CHARSET_INFO *cs, - uchar *dst, uchar *de, uint *nweights, - const uchar *src, const uchar *se) -{ - my_wc_t UNINIT_VAR(wc); - int res; - uchar *dst0= dst; - MY_UNICASE_INFO *uni_plane= (cs->state & MY_CS_BINSORT) ? - NULL : cs->caseinfo; - - DBUG_ASSERT(src || !se); - - for (; dst < de && *nweights; (*nweights)--) - { - if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0) - break; - src+= res; - - if (uni_plane) - my_tosort_unicode(uni_plane, &wc, cs->state); - - *dst++= (uchar) (wc >> 8); - if (dst < de) - *dst++= (uchar) (wc & 0xFF); - } - return dst - dst0; -} - - -/* - Store sorting weights using 2 bytes per character. - - This function is shared between - - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin - which support BMP only (U+0000..U+FFFF). - - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, - which map all supplementary characters to weight 0xFFFD. -*/ -size_t -my_strnxfrm_unicode(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *dst0= dst; - uchar *de= dst + dstlen; - dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights, - src, src + srclen); - DBUG_ASSERT(dst <= de); /* Safety */ - - if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) - dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); - - my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); - - if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) - dst+= my_strxfrm_pad_unicode(dst, de); - return dst - dst0; -} - - -size_t -my_strnxfrm_unicode_nopad(CHARSET_INFO *cs, - uchar *dst, size_t dstlen, uint nweights, - const uchar *src, size_t srclen, uint flags) -{ - uchar *dst0= dst; - uchar *de= dst + dstlen; - dst+= my_strnxfrm_unicode_internal(cs, dst, de, &nweights, - src, src + srclen); - DBUG_ASSERT(dst <= de); /* Safety */ - - if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) - { - size_t len= de - dst; - set_if_smaller(len, nweights * 2); - memset(dst, 0x00, len); - dst+= len; - } - - my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); - - if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) - { - memset(dst, 0x00, de - dst); - dst= de; - } - return dst - dst0; -} - /* For BMP-only collations that use 2 bytes per weight. */ @@ -5208,7 +5121,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) It represents a single byte character. Convert it into weight according to collation. */ - s_wc= plane00[(uchar) s[0]].tolower; + s_wc= my_unicase_default_page00[(uchar) s[0]].tolower; s++; } else @@ -5250,7 +5163,7 @@ int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) if ((uchar) t[0] < 128) { /* Convert single byte character into weight */ - t_wc= plane00[(uchar) t[0]].tolower; + t_wc= my_unicase_default_page00[(uchar) t[0]].tolower; t++; } else @@ -5313,14 +5226,14 @@ int my_charlen_utf8(CHARSET_INFO *cs __attribute__((unused)), static inline int my_weight_mb1_utf8_general_ci(uchar b) { - return (int) plane00[b & 0xFF].sort; + return (int) my_unicase_default_page00[b & 0xFF].sort; } static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1) { my_wc_t wc= UTF8MB2_CODE(b0, b1); - MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? page[wc & 0xFF].sort : wc); } @@ -5328,16 +5241,23 @@ static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1) static inline int my_weight_mb3_utf8_general_ci(uchar b0, uchar b1, uchar b2) { my_wc_t wc= UTF8MB3_CODE(b0, b1, b2); - MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8]; + MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; return (int) (page ? page[wc & 0xFF].sort : wc); } -#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) -#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) -#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) #include "strcoll.ic" @@ -5373,19 +5293,28 @@ my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2) } -#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x) -#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y) -#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z) +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_general_mysql500_ci +#define DEFINE_STRNXFRM_UNICODE +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 plane00_mysql500 +#define UNICASE_PAGES my_unicase_pages_mysql500 +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_mysql500_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_mysql500_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_mysql500_ci(x,y,z) #include "strcoll.ic" -#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin -#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) -#define WEIGHT_MB1(x) ((int) (uchar) (x)) -#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) -#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8_bin +#define DEFINE_STRNXFRM_UNICODE_BIN2 +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb3_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) ((int) (uchar) (x)) +#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) +#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) #include "strcoll.ic" @@ -5434,7 +5363,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler = NULL, /* init */ my_strnncoll_utf8_general_ci, my_strnncollsp_utf8_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf8_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -5450,7 +5379,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_mysql500_ci_handler = NULL, /* init */ my_strnncoll_utf8_general_mysql500_ci, my_strnncollsp_utf8_general_mysql500_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf8_general_mysql500_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -5466,7 +5395,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_bin_handler = NULL, /* init */ my_strnncoll_utf8_bin, my_strnncollsp_utf8_bin, - my_strnxfrm_unicode, + my_strnxfrm_utf8_bin, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_mb_bin, @@ -5482,7 +5411,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_nopad_ci_handler = NULL, /* init */ my_strnncoll_utf8_general_ci, my_strnncollsp_utf8_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf8_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -5498,7 +5427,7 @@ static MY_COLLATION_HANDLER my_collation_utf8_nopad_bin_handler = NULL, /* init */ my_strnncoll_utf8_bin, my_strnncollsp_utf8_nopad_bin, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf8_bin, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_mb_bin, @@ -5827,7 +5756,7 @@ static MY_COLLATION_HANDLER my_collation_cs_handler = NULL, /* init */ my_strnncoll_utf8_cs, my_strnncollsp_utf8_cs, - my_strnxfrm_unicode, + my_strnxfrm_utf8_general_ci, my_strnxfrmlen_unicode, my_like_range_simple, my_wildcmp_mb, @@ -7112,13 +7041,30 @@ my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end) #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN /* my_well_formed_char_length_filename */ +#define MY_FUNCTION_NAME(x) my_ ## x ## _filename +#define DEFINE_STRNNCOLL 0 +#define DEFINE_STRNXFRM_UNICODE +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_filename(cs, pwc, s, e) +#define OPTIMIZE_ASCII 0 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages + +/* +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB1(x) my_weight_mb1_utf8_general_ci(x) +#define WEIGHT_MB2(x,y) my_weight_mb2_utf8_general_ci(x,y) +#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8_general_ci(x,y,z) +*/ +#include "strcoll.ic" + static MY_COLLATION_HANDLER my_collation_filename_handler = { NULL, /* init */ my_strnncoll_simple, my_strnncollsp_simple, - my_strnxfrm_unicode, + my_strnxfrm_filename, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8, @@ -7607,7 +7553,7 @@ my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t) It represents a single byte character. Convert it into weight according to collation. */ - s_wc= plane00[(uchar) s[0]].tolower; + s_wc= my_unicase_default_page00[(uchar) s[0]].tolower; s++; } else @@ -7631,7 +7577,7 @@ my_strcasecmp_utf8mb4(CHARSET_INFO *cs, const char *s, const char *t) if ((uchar) t[0] < 128) { /* Convert single byte character into weight */ - t_wc= plane00[(uchar) t[0]].tolower; + t_wc= my_unicase_default_page00[(uchar) t[0]].tolower; t++; } else @@ -7702,6 +7648,13 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci +#define DEFINE_STRNXFRM_UNICODE +#define DEFINE_STRNXFRM_UNICODE_NOPAD +#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf8mb4_quick(pwc, s, e) +#define OPTIMIZE_ASCII 1 +#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR +#define UNICASE_PAGE0 my_unicase_default_page00 +#define UNICASE_PAGES my_unicase_default_pages #define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3) #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) #define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0) @@ -7752,7 +7705,7 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= NULL, /* init */ my_strnncoll_utf8mb4_general_ci, my_strnncollsp_utf8mb4_general_ci, - my_strnxfrm_unicode, + my_strnxfrm_utf8mb4_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8mb4, @@ -7784,7 +7737,7 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_nopad_ci_handler= NULL, /* init */ my_strnncoll_utf8mb4_general_ci, my_strnncollsp_utf8mb4_general_nopad_ci, - my_strnxfrm_unicode_nopad, + my_strnxfrm_nopad_utf8mb4_general_ci, my_strnxfrmlen_unicode, my_like_range_mb, my_wildcmp_utf8mb4, diff --git a/strings/strcoll.ic b/strings/strcoll.ic index c647a5ef57e..9dfccb9018c 100644 --- a/strings/strcoll.ic +++ b/strings/strcoll.ic @@ -15,11 +15,18 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - #ifndef MY_FUNCTION_NAME #error MY_FUNCTION_NAME is not defined #endif +/* + Define strnncoll() and strnncollsp() by default, + unless "#define DEFINE_STRNNCOLL 0" is specified. +*/ +#ifndef DEFINE_STRNNCOLL +#define DEFINE_STRNNCOLL 1 +#endif + /* The weight for automatically padded spaces when comparing strings with @@ -54,6 +61,8 @@ #endif +#if DEFINE_STRNNCOLL + /** Scan a valid character, or a bad byte, or an auto-padded space from a string and calculate the weight of the scanned sequence. @@ -278,6 +287,8 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), } #endif +#endif /* DEFINE_STRNNCOLL */ + #ifdef DEFINE_STRNXFRM #ifndef WEIGHT_MB2_FRM @@ -322,11 +333,261 @@ MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, #endif /* DEFINE_STRNXFRM */ +#if defined(DEFINE_STRNXFRM_UNICODE) || defined(DEFINE_STRNXFRM_UNICODE_NOPAD) + +/* + Store sorting weights using 2 bytes per character. + + This function is shared between + - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin + which support BMP only (U+0000..U+FFFF). + - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, + which map all supplementary characters to weight 0xFFFD. +*/ + +#ifndef MY_MB_WC +#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef OPTIMIZE_ASCII +#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef UNICASE_MAXCHAR +#error UNICASE_MAXCHAR must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef UNICASE_PAGE0 +#error UNICASE_PAGE0 must be defined for DEFINE_STRNXFRM_UNICODE +#endif + +#ifndef UNICASE_PAGES +#error UNICASE_PAGES must be defined for DEFINE_STRNXFRM_UNICODE +#endif + + +static size_t +MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs, + uchar *dst, uchar *de, + uint *nweights, + const uchar *src, const uchar *se) +{ + my_wc_t UNINIT_VAR(wc); + uchar *dst0= dst; + + DBUG_ASSERT(src || !se); + DBUG_ASSERT((cs->state & MY_CS_LOWER_SORT) == 0); + DBUG_ASSERT(0x7F <= UNICASE_MAXCHAR); + + for (; dst < de && *nweights; (*nweights)--) + { + int res; +#if OPTIMIZE_ASCII + if (src >= se) + break; + if (src[0] <= 0x7F) + { + wc= UNICASE_PAGE0[*src++].sort; + PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + continue; + } +#endif + if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0) + break; + src+= res; + if (wc <= UNICASE_MAXCHAR) + { + MY_UNICASE_CHARACTER *page; + if ((page= UNICASE_PAGES[wc >> 8])) + wc= page[wc & 0xFF].sort; + } + else + wc= MY_CS_REPLACEMENT_CHARACTER; + PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + } + return dst - dst0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + dst+= my_strxfrm_pad_unicode(dst, de); + return dst - dst0; +} + + +#ifdef DEFINE_STRNXFRM_UNICODE_NOPAD +static size_t +MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + { + size_t len= de - dst; + set_if_smaller(len, nweights * 2); + memset(dst, 0x00, len); + dst+= len; + } + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + { + memset(dst, 0x00, de - dst); + dst= de; + } + return dst - dst0; +} +#endif + +#endif /* DEFINE_STRNXFRM_UNICODE || DEFINE_STRNXFRM_UNICODE_NOPAD */ + + + +#ifdef DEFINE_STRNXFRM_UNICODE_BIN2 + +/* + Store sorting weights using 2 bytes per character. + + These functions are shared between + - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin + which support BMP only (U+0000..U+FFFF). + - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, + which map all supplementary characters to weight 0xFFFD. +*/ + +#ifndef MY_MB_WC +#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE_BIN2 +#endif + +#ifndef OPTIMIZE_ASCII +#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE_BIN2 +#endif + + +static size_t +MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs, + uchar *dst, uchar *de, + uint *nweights, + const uchar *src, + const uchar *se) +{ + my_wc_t UNINIT_VAR(wc); + uchar *dst0= dst; + + DBUG_ASSERT(src || !se); + + for (; dst < de && *nweights; (*nweights)--) + { + int res; +#if OPTIMIZE_ASCII + if (src >= se) + break; + if (src[0] <= 0x7F) + { + wc= *src++; + PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + continue; + } +#endif + if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0) + break; + src+= res; + if (wc > 0xFFFF) + wc= MY_CS_REPLACEMENT_CHARACTER; + PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); + } + return dst - dst0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + dst+= my_strxfrm_pad_unicode(dst, de); + return dst - dst0; +} + + +static size_t +MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, uint nweights, + const uchar *src, size_t srclen, uint flags) +{ + uchar *dst0= dst; + uchar *de= dst + dstlen; + dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, + src, src + srclen); + DBUG_ASSERT(dst <= de); /* Safety */ + + if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) + { + size_t len= de - dst; + set_if_smaller(len, nweights * 2); + memset(dst, 0x00, len); + dst+= len; + } + + my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); + + if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) + { + memset(dst, 0x00, de - dst); + dst= de; + } + return dst - dst0; +} + +#endif /* DEFINE_STRNXFRM_UNICODE_BIN2 */ + + /* We usually include this file at least two times from the same source file, for the _ci and the _bin collations. Prepare for the second inclusion. */ #undef MY_FUNCTION_NAME +#undef MY_MB_WC +#undef OPTIMIZE_ASCII +#undef UNICASE_MAXCHAR +#undef UNICASE_PAGE0 +#undef UNICASE_PAGES #undef WEIGHT_ILSEQ #undef WEIGHT_MB1 #undef WEIGHT_MB2 @@ -335,4 +596,8 @@ MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, #undef WEIGHT_PAD_SPACE #undef WEIGHT_MB2_FRM #undef DEFINE_STRNXFRM +#undef DEFINE_STRNXFRM_UNICODE +#undef DEFINE_STRNXFRM_UNICODE_NOPAD +#undef DEFINE_STRNXFRM_UNICODE_BIN2 +#undef DEFINE_STRNNCOLL #undef DEFINE_STRNNCOLLSP_NOPAD