mirror of
https://github.com/MariaDB/server.git
synced 2026-04-18 06:15:32 +02:00
Cherry-picking patch for Bug#55980.
Original changeset: ------------------------------------------------------------ revno: 3197 revision-id: alik@sun.com-20100831135426-h5a4s2w6ih1d8q2x parent: magnus.blaudd@sun.com-20100830120632-u3xzy002mdwueli8 committer: Alexander Nozdrin <alik@sun.com> branch nick: mysql-5.5-bugfixing timestamp: Tue 2010-08-31 17:54:26 +0400 message: Bug#55980 Character sets: supplementary character _bin ordering is wrong Problem: - ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned results in the wrong order, because old functions (supporting only the BMP range) were used to handle these collations. - Additionally, utf16_bin did not sort supplementary characters between U+D7FF and U+E000, as the WL#1213 specification requires. ------------------------------------------------------------
This commit is contained in:
parent
4283a70458
commit
316e9d6339
10 changed files with 178 additions and 9 deletions
|
|
@ -539,6 +539,11 @@ size_t my_strnxfrm_unicode(CHARSET_INFO *,
|
|||
uchar *dst, size_t dstlen,
|
||||
const uchar *src, size_t srclen);
|
||||
|
||||
size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *,
|
||||
uchar *dst, size_t dstlen,
|
||||
const uchar *src, size_t srclen);
|
||||
size_t my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *, size_t);
|
||||
|
||||
int my_wildcmp_unicode(CHARSET_INFO *cs,
|
||||
const char *str, const char *str_end,
|
||||
const char *wildstr, const char *wildend,
|
||||
|
|
|
|||
16
mysql-test/include/ctype_filesort2.inc
Normal file
16
mysql-test/include/ctype_filesort2.inc
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#
|
||||
# Testing filesort for full Unicode character sets
|
||||
# with supplementary characters.
|
||||
#
|
||||
|
||||
--echo #
|
||||
--echo # Bug#55980 Character sets: supplementary character _bin ordering is wrong
|
||||
--echo #
|
||||
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
|
||||
SHOW CREATE TABLE t1;
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
ALTER TABLE t1 ADD KEY(a);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
DROP TABLE IF EXISTS t1;
|
||||
|
|
@ -611,6 +611,31 @@ utf16_bin 00610009
|
|||
utf16_bin 0061
|
||||
utf16_bin 00610020
|
||||
drop table t1;
|
||||
#
|
||||
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
|
||||
#
|
||||
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
|
||||
SHOW CREATE TABLE t1;
|
||||
Table Create Table
|
||||
t1 CREATE TABLE `t1` (
|
||||
`a` varchar(1) CHARACTER SET utf16 COLLATE utf16_bin NOT NULL DEFAULT ''
|
||||
) ENGINE=MyISAM DEFAULT CHARSET=latin1
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
HEX(a) HEX(CONVERT(a USING utf8mb4))
|
||||
0385 CE85
|
||||
D800DF84 F0908E84
|
||||
DBC0DC00 F4808080
|
||||
FF9D EFBE9D
|
||||
ALTER TABLE t1 ADD KEY(a);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
HEX(a) HEX(CONVERT(a USING utf8mb4))
|
||||
0385 CE85
|
||||
D800DF84 F0908E84
|
||||
DBC0DC00 F4808080
|
||||
FF9D EFBE9D
|
||||
DROP TABLE IF EXISTS t1;
|
||||
select @@collation_connection;
|
||||
@@collation_connection
|
||||
utf16_bin
|
||||
|
|
|
|||
|
|
@ -610,6 +610,31 @@ utf32_bin 0000006100000009
|
|||
utf32_bin 00000061
|
||||
utf32_bin 0000006100000020
|
||||
drop table t1;
|
||||
#
|
||||
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
|
||||
#
|
||||
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
|
||||
SHOW CREATE TABLE t1;
|
||||
Table Create Table
|
||||
t1 CREATE TABLE `t1` (
|
||||
`a` varchar(1) CHARACTER SET utf32 COLLATE utf32_bin NOT NULL DEFAULT ''
|
||||
) ENGINE=MyISAM DEFAULT CHARSET=latin1
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
HEX(a) HEX(CONVERT(a USING utf8mb4))
|
||||
00000385 CE85
|
||||
0000FF9D EFBE9D
|
||||
00010384 F0908E84
|
||||
00100000 F4808080
|
||||
ALTER TABLE t1 ADD KEY(a);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
HEX(a) HEX(CONVERT(a USING utf8mb4))
|
||||
00000385 CE85
|
||||
0000FF9D EFBE9D
|
||||
00010384 F0908E84
|
||||
00100000 F4808080
|
||||
DROP TABLE IF EXISTS t1;
|
||||
select @@collation_connection;
|
||||
@@collation_connection
|
||||
utf32_bin
|
||||
|
|
|
|||
|
|
@ -987,6 +987,31 @@ utf8mb4_bin 6109
|
|||
utf8mb4_bin 61
|
||||
utf8mb4_bin 6120
|
||||
drop table t1;
|
||||
#
|
||||
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
|
||||
#
|
||||
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
|
||||
SHOW CREATE TABLE t1;
|
||||
Table Create Table
|
||||
t1 CREATE TABLE `t1` (
|
||||
`a` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT ''
|
||||
) ENGINE=MyISAM DEFAULT CHARSET=latin1
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
|
||||
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
HEX(a) HEX(CONVERT(a USING utf8mb4))
|
||||
CE85 CE85
|
||||
EFBE9D EFBE9D
|
||||
F0908E84 F0908E84
|
||||
F4808080 F4808080
|
||||
ALTER TABLE t1 ADD KEY(a);
|
||||
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
|
||||
HEX(a) HEX(CONVERT(a USING utf8mb4))
|
||||
CE85 CE85
|
||||
EFBE9D EFBE9D
|
||||
F0908E84 F0908E84
|
||||
F4808080 F4808080
|
||||
DROP TABLE IF EXISTS t1;
|
||||
select @@collation_connection;
|
||||
@@collation_connection
|
||||
utf8mb4_bin
|
||||
|
|
|
|||
|
|
@ -326,6 +326,7 @@ SET collation_connection='utf16_general_ci';
|
|||
SET NAMES latin1;
|
||||
SET collation_connection='utf16_bin';
|
||||
-- source include/ctype_filesort.inc
|
||||
-- source include/ctype_filesort2.inc
|
||||
-- source include/ctype_like_escape.inc
|
||||
|
||||
#
|
||||
|
|
|
|||
|
|
@ -328,6 +328,7 @@ SET collation_connection='utf32_general_ci';
|
|||
SET NAMES latin1;
|
||||
SET collation_connection='utf32_bin';
|
||||
-- source include/ctype_filesort.inc
|
||||
-- source include/ctype_filesort2.inc
|
||||
-- source include/ctype_like_escape.inc
|
||||
|
||||
#
|
||||
|
|
|
|||
|
|
@ -733,6 +733,7 @@ SET collation_connection='utf8mb4_general_ci';
|
|||
-- source include/ctype_german.inc
|
||||
SET collation_connection='utf8mb4_bin';
|
||||
-- source include/ctype_filesort.inc
|
||||
-- source include/ctype_filesort2.inc
|
||||
-- source include/ctype_like_escape.inc
|
||||
|
||||
#
|
||||
|
|
|
|||
|
|
@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
|
|||
}
|
||||
if (s_wc != t_wc)
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
return my_bincmp(s, s + s_res, t, t + t_res);
|
||||
}
|
||||
|
||||
s+= s_res;
|
||||
|
|
@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
|
|||
|
||||
if (s_wc != t_wc)
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
return my_bincmp(s, s + s_res, t, t + t_res);
|
||||
}
|
||||
|
||||
s+= s_res;
|
||||
|
|
@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
|
|||
NULL, /* init */
|
||||
my_strnncoll_utf16_bin,
|
||||
my_strnncollsp_utf16_bin,
|
||||
my_strnxfrm_unicode,
|
||||
my_strnxfrmlen_simple,
|
||||
my_strnxfrm_unicode_full_bin,
|
||||
my_strnxfrmlen_unicode_full_bin,
|
||||
my_like_range_utf16,
|
||||
my_wildcmp_utf16_bin,
|
||||
my_strcasecmp_mb2_or_mb4,
|
||||
|
|
@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
|
|||
NULL, /* init */
|
||||
my_strnncoll_utf32_bin,
|
||||
my_strnncollsp_utf32_bin,
|
||||
my_strnxfrm_unicode,
|
||||
my_strnxfrmlen_utf32,
|
||||
my_strnxfrm_unicode_full_bin,
|
||||
my_strnxfrmlen_unicode_full_bin,
|
||||
my_like_range_utf32,
|
||||
my_wildcmp_utf32_bin,
|
||||
my_strcasecmp_mb2_or_mb4,
|
||||
|
|
|
|||
|
|
@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
|
|||
|
||||
|
||||
/*
|
||||
This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32
|
||||
Store sorting weights using 2 bytes per character.
|
||||
|
||||
This function is shared between
|
||||
- utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
|
||||
which support BMP only (U+0000..U+FFFF).
|
||||
- utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
|
||||
which map all supplementary characters to weight 0xFFFD.
|
||||
*/
|
||||
size_t
|
||||
my_strnxfrm_unicode(CHARSET_INFO *cs,
|
||||
|
|
@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
Store sorting weights using 3 bytes per character.
|
||||
This function is shared between utf8mb4_bin, utf16_bin, utf32_bin.
|
||||
*/
|
||||
size_t
|
||||
my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
|
||||
uchar *dst, size_t dstlen,
|
||||
const uchar *src, size_t srclen)
|
||||
{
|
||||
my_wc_t wc;
|
||||
uchar *de= dst + dstlen;
|
||||
uchar *de_beg= de - 2; /* The beginning of the last chunk */
|
||||
const uchar *se = src + srclen;
|
||||
|
||||
LINT_INIT(wc);
|
||||
DBUG_ASSERT(src);
|
||||
DBUG_ASSERT(cs->state & MY_CS_BINSORT);
|
||||
|
||||
while (dst < de_beg)
|
||||
{
|
||||
int res;
|
||||
if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
|
||||
break;
|
||||
src+= res;
|
||||
if (cs->mbminlen == 2) /* utf16_bin */
|
||||
{
|
||||
/*
|
||||
Reorder code points to weights as follows:
|
||||
U+0000..U+D7FF -> [00][00][00]..[00][D7][FF] BMP part #1
|
||||
U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary
|
||||
U+E000..U+FFFF -> [20][E0][00]..[20][FF][FF] BMP part #2
|
||||
*/
|
||||
if (wc >= 0xE000 && wc <= 0xFFFF)
|
||||
wc+= 0x200000;
|
||||
}
|
||||
*dst++= (uchar) (wc >> 16);
|
||||
*dst++= (uchar) ((wc >> 8) & 0xFF);
|
||||
*dst++= (uchar) (wc & 0xFF);
|
||||
}
|
||||
|
||||
while (dst < de_beg) /* Fill the tail with keys for space character */
|
||||
{
|
||||
*dst++= 0x00;
|
||||
*dst++= 0x00;
|
||||
*dst++= 0x20;
|
||||
}
|
||||
|
||||
/* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */
|
||||
if (dst < de)
|
||||
{
|
||||
*dst++= 0x00;
|
||||
if (dst < de)
|
||||
*dst= 0x00;
|
||||
}
|
||||
|
||||
return dstlen;
|
||||
}
|
||||
|
||||
|
||||
size_t
|
||||
my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
|
||||
{
|
||||
return ((len + 3) / cs->mbmaxlen) * 3;
|
||||
}
|
||||
#endif /* HAVE_UNIDATA */
|
||||
|
||||
|
||||
|
|
@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
|
|||
NULL, /* init */
|
||||
my_strnncoll_mb_bin,
|
||||
my_strnncollsp_mb_bin,
|
||||
my_strnxfrm_unicode,
|
||||
my_strnxfrmlen_utf8mb4,
|
||||
my_strnxfrm_unicode_full_bin,
|
||||
my_strnxfrmlen_unicode_full_bin,
|
||||
my_like_range_mb,
|
||||
my_wildcmp_mb_bin,
|
||||
my_strcasecmp_mb_bin,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue