diff --git a/include/m_ctype.h b/include/m_ctype.h index 06cbfd779c8..42e8f88cc0e 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -539,6 +539,11 @@ size_t my_strnxfrm_unicode(CHARSET_INFO *, uchar *dst, size_t dstlen, const uchar *src, size_t srclen); +size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *, + uchar *dst, size_t dstlen, + const uchar *src, size_t srclen); +size_t my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *, size_t); + int my_wildcmp_unicode(CHARSET_INFO *cs, const char *str, const char *str_end, const char *wildstr, const char *wildend, diff --git a/mysql-test/include/ctype_filesort2.inc b/mysql-test/include/ctype_filesort2.inc new file mode 100644 index 00000000000..7b09eb482a5 --- /dev/null +++ b/mysql-test/include/ctype_filesort2.inc @@ -0,0 +1,16 @@ +# +# Testing filesort for full Unicode character sets +# with supplementary characters. +# + +--echo # +--echo # Bug#55980 Character sets: supplementary character _bin ordering is wrong +--echo # +CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0; +SHOW CREATE TABLE t1; +INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84); +INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +ALTER TABLE t1 ADD KEY(a); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +DROP TABLE IF EXISTS t1; diff --git a/mysql-test/r/ctype_utf16.result b/mysql-test/r/ctype_utf16.result index c5fd7ef1439..7cee15aecef 100644 --- a/mysql-test/r/ctype_utf16.result +++ b/mysql-test/r/ctype_utf16.result @@ -611,6 +611,31 @@ utf16_bin 00610009 utf16_bin 0061 utf16_bin 00610020 drop table t1; +# +# Bug#55980 Character sets: supplementary character _bin ordering is wrong +# +CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(1) CHARACTER SET utf16 COLLATE utf16_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84); +INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +HEX(a) HEX(CONVERT(a USING utf8mb4)) +0385 CE85 +D800DF84 F0908E84 +DBC0DC00 F4808080 +FF9D EFBE9D +ALTER TABLE t1 ADD KEY(a); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +HEX(a) HEX(CONVERT(a USING utf8mb4)) +0385 CE85 +D800DF84 F0908E84 +DBC0DC00 F4808080 +FF9D EFBE9D +DROP TABLE IF EXISTS t1; select @@collation_connection; @@collation_connection utf16_bin diff --git a/mysql-test/r/ctype_utf32.result b/mysql-test/r/ctype_utf32.result index 79e714eab47..94a73093154 100644 --- a/mysql-test/r/ctype_utf32.result +++ b/mysql-test/r/ctype_utf32.result @@ -610,6 +610,31 @@ utf32_bin 0000006100000009 utf32_bin 00000061 utf32_bin 0000006100000020 drop table t1; +# +# Bug#55980 Character sets: supplementary character _bin ordering is wrong +# +CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(1) CHARACTER SET utf32 COLLATE utf32_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84); +INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +HEX(a) HEX(CONVERT(a USING utf8mb4)) +00000385 CE85 +0000FF9D EFBE9D +00010384 F0908E84 +00100000 F4808080 +ALTER TABLE t1 ADD KEY(a); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +HEX(a) HEX(CONVERT(a USING utf8mb4)) +00000385 CE85 +0000FF9D EFBE9D +00010384 F0908E84 +00100000 F4808080 +DROP TABLE IF EXISTS t1; select @@collation_connection; @@collation_connection utf32_bin diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result index 454c9d4bfbb..3b9abbc5412 100644 --- a/mysql-test/r/ctype_utf8mb4.result +++ b/mysql-test/r/ctype_utf8mb4.result @@ -987,6 +987,31 @@ utf8mb4_bin 6109 utf8mb4_bin 61 utf8mb4_bin 6120 drop table t1; +# +# Bug#55980 Character sets: supplementary character _bin ordering is wrong +# +CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84); +INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +HEX(a) HEX(CONVERT(a USING utf8mb4)) +CE85 CE85 +EFBE9D EFBE9D +F0908E84 F0908E84 +F4808080 F4808080 +ALTER TABLE t1 ADD KEY(a); +SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a; +HEX(a) HEX(CONVERT(a USING utf8mb4)) +CE85 CE85 +EFBE9D EFBE9D +F0908E84 F0908E84 +F4808080 F4808080 +DROP TABLE IF EXISTS t1; select @@collation_connection; @@collation_connection utf8mb4_bin diff --git a/mysql-test/t/ctype_utf16.test b/mysql-test/t/ctype_utf16.test index e9c7e569250..ef705474eee 100644 --- a/mysql-test/t/ctype_utf16.test +++ b/mysql-test/t/ctype_utf16.test @@ -326,6 +326,7 @@ SET collation_connection='utf16_general_ci'; SET NAMES latin1; SET collation_connection='utf16_bin'; -- source include/ctype_filesort.inc +-- source include/ctype_filesort2.inc -- source include/ctype_like_escape.inc # diff --git a/mysql-test/t/ctype_utf32.test b/mysql-test/t/ctype_utf32.test index 668b3b033bd..b113403944a 100644 --- a/mysql-test/t/ctype_utf32.test +++ b/mysql-test/t/ctype_utf32.test @@ -328,6 +328,7 @@ SET collation_connection='utf32_general_ci'; SET NAMES latin1; SET collation_connection='utf32_bin'; -- source include/ctype_filesort.inc +-- source include/ctype_filesort2.inc -- source include/ctype_like_escape.inc # diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test index 8fcba92ff47..03696f385b5 100644 --- a/mysql-test/t/ctype_utf8mb4.test +++ b/mysql-test/t/ctype_utf8mb4.test @@ -733,6 +733,7 @@ SET collation_connection='utf8mb4_general_ci'; -- source include/ctype_german.inc SET collation_connection='utf8mb4_bin'; -- source include/ctype_filesort.inc +-- source include/ctype_filesort2.inc -- source include/ctype_like_escape.inc # diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index d3b0b93a939..ecfac3170d1 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs, } if (s_wc != t_wc) { - return s_wc > t_wc ? 1 : -1; + return my_bincmp(s, s + s_res, t, t + t_res); } s+= s_res; @@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs, if (s_wc != t_wc) { - return s_wc > t_wc ? 1 : -1; + return my_bincmp(s, s + s_res, t, t + t_res); } s+= s_res; @@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler = NULL, /* init */ my_strnncoll_utf16_bin, my_strnncollsp_utf16_bin, - my_strnxfrm_unicode, - my_strnxfrmlen_simple, + my_strnxfrm_unicode_full_bin, + my_strnxfrmlen_unicode_full_bin, my_like_range_utf16, my_wildcmp_utf16_bin, my_strcasecmp_mb2_or_mb4, @@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler = NULL, /* init */ my_strnncoll_utf32_bin, my_strnncollsp_utf32_bin, - my_strnxfrm_unicode, - my_strnxfrmlen_utf32, + my_strnxfrm_unicode_full_bin, + my_strnxfrmlen_unicode_full_bin, my_like_range_utf32, my_wildcmp_utf32_bin, my_strcasecmp_mb2_or_mb4, diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index ace39130c12..76fff72290b 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs, /* - This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32 + Store sorting weights using 2 bytes per character. + + This function is shared between + - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin + which support BMP only (U+0000..U+FFFF). + - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, + which map all supplementary characters to weight 0xFFFD. */ size_t my_strnxfrm_unicode(CHARSET_INFO *cs, @@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs, } +/* + Store sorting weights using 3 bytes per character. + This function is shared between utf8mb4_bin, utf16_bin, utf32_bin. +*/ +size_t +my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs, + uchar *dst, size_t dstlen, + const uchar *src, size_t srclen) +{ + my_wc_t wc; + uchar *de= dst + dstlen; + uchar *de_beg= de - 2; /* The beginning of the last chunk */ + const uchar *se = src + srclen; + + LINT_INIT(wc); + DBUG_ASSERT(src); + DBUG_ASSERT(cs->state & MY_CS_BINSORT); + + while (dst < de_beg) + { + int res; + if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0) + break; + src+= res; + if (cs->mbminlen == 2) /* utf16_bin */ + { + /* + Reorder code points to weights as follows: + U+0000..U+D7FF -> [00][00][00]..[00][D7][FF] BMP part #1 + U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary + U+E000..U+FFFF -> [20][E0][00]..[20][FF][FF] BMP part #2 + */ + if (wc >= 0xE000 && wc <= 0xFFFF) + wc+= 0x200000; + } + *dst++= (uchar) (wc >> 16); + *dst++= (uchar) ((wc >> 8) & 0xFF); + *dst++= (uchar) (wc & 0xFF); + } + + while (dst < de_beg) /* Fill the tail with keys for space character */ + { + *dst++= 0x00; + *dst++= 0x00; + *dst++= 0x20; + } + + /* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */ + if (dst < de) + { + *dst++= 0x00; + if (dst < de) + *dst= 0x00; + } + + return dstlen; +} + + +size_t +my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len) +{ + return ((len + 3) / cs->mbmaxlen) * 3; +} #endif /* HAVE_UNIDATA */ @@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = NULL, /* init */ my_strnncoll_mb_bin, my_strnncollsp_mb_bin, - my_strnxfrm_unicode, - my_strnxfrmlen_utf8mb4, + my_strnxfrm_unicode_full_bin, + my_strnxfrmlen_unicode_full_bin, my_like_range_mb, my_wildcmp_mb_bin, my_strcasecmp_mb_bin,