diff --git a/include/m_ctype.h b/include/m_ctype.h index 0228b359111..4a9415f43f9 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -206,8 +206,9 @@ typedef struct charset_info_st uchar state_map[256]; uchar ident_map[256]; uint strxfrm_multiply; + uint mbminlen; uint mbmaxlen; - char max_sort_char; /* For LIKE optimization */ + char max_sort_char; /* For LIKE optimization */ MY_CHARSET_HANDLER *cset; MY_COLLATION_HANDLER *coll; diff --git a/mysql-test/r/ctype_ucs.result b/mysql-test/r/ctype_ucs.result index 58761526150..d6e9cc690a2 100644 --- a/mysql-test/r/ctype_ucs.result +++ b/mysql-test/r/ctype_ucs.result @@ -276,3 +276,51 @@ aardvara aardvark aardvarz DROP TABLE t1; +SELECT HEX(_ucs2 0x0); +HEX(_ucs2 0x0) +0000 +SELECT HEX(_ucs2 0x01); +HEX(_ucs2 0x01) +0001 +SELECT HEX(_ucs2 0x012); +HEX(_ucs2 0x012) +0012 +SELECT HEX(_ucs2 0x0123); +HEX(_ucs2 0x0123) +0123 +SELECT HEX(_ucs2 0x01234); +HEX(_ucs2 0x01234) +00001234 +SELECT HEX(_ucs2 0x012345); +HEX(_ucs2 0x012345) +00012345 +SELECT HEX(_ucs2 0x0123456); +HEX(_ucs2 0x0123456) +00123456 +SELECT HEX(_ucs2 0x01234567); +HEX(_ucs2 0x01234567) +01234567 +SELECT HEX(_ucs2 0x012345678); +HEX(_ucs2 0x012345678) +000012345678 +SELECT HEX(_ucs2 0x0123456789); +HEX(_ucs2 0x0123456789) +000123456789 +SELECT HEX(_ucs2 0x0123456789A); +HEX(_ucs2 0x0123456789A) +00123456789A +SELECT HEX(_ucs2 0x0123456789AB); +HEX(_ucs2 0x0123456789AB) +0123456789AB +SELECT HEX(_ucs2 0x0123456789ABC); +HEX(_ucs2 0x0123456789ABC) +0000123456789ABC +SELECT HEX(_ucs2 0x0123456789ABCD); +HEX(_ucs2 0x0123456789ABCD) +000123456789ABCD +SELECT HEX(_ucs2 0x0123456789ABCDE); +HEX(_ucs2 0x0123456789ABCDE) +00123456789ABCDE +SELECT HEX(_ucs2 0x0123456789ABCDEF); +HEX(_ucs2 0x0123456789ABCDEF) +0123456789ABCDEF diff --git a/mysql-test/t/ctype_ucs.test b/mysql-test/t/ctype_ucs.test index 7eec58563b3..fd2a1b1cd7d 100644 --- a/mysql-test/t/ctype_ucs.test +++ b/mysql-test/t/ctype_ucs.test @@ -197,3 +197,24 @@ DROP TABLE t1; # END OF Bug 1264 test # 
######################################################## + + +# Bug #2390 +# Check alignment +# +SELECT HEX(_ucs2 0x0); +SELECT HEX(_ucs2 0x01); +SELECT HEX(_ucs2 0x012); +SELECT HEX(_ucs2 0x0123); +SELECT HEX(_ucs2 0x01234); +SELECT HEX(_ucs2 0x012345); +SELECT HEX(_ucs2 0x0123456); +SELECT HEX(_ucs2 0x01234567); +SELECT HEX(_ucs2 0x012345678); +SELECT HEX(_ucs2 0x0123456789); +SELECT HEX(_ucs2 0x0123456789A); +SELECT HEX(_ucs2 0x0123456789AB); +SELECT HEX(_ucs2 0x0123456789ABC); +SELECT HEX(_ucs2 0x0123456789ABCD); +SELECT HEX(_ucs2 0x0123456789ABCDE); +SELECT HEX(_ucs2 0x0123456789ABCDEF); diff --git a/mysys/charset.c b/mysys/charset.c index 5e9e3c3fcaa..40a026f161f 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -131,7 +131,8 @@ static void simple_cs_init_functions(CHARSET_INFO *cs) cs->coll= &my_collation_8bit_simple_ci_handler; cs->cset= &my_charset_8bit_handler; - cs->mbmaxlen = 1; + cs->mbminlen= 1; + cs->mbmaxlen= 1; } @@ -273,6 +274,7 @@ static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from) if (create_fromuni(to)) goto err; } + to->mbminlen= 1; to->mbmaxlen= 1; return 0; diff --git a/sql/item.h b/sql/item.h index 5def1e2b710..e6ed8109534 100644 --- a/sql/item.h +++ b/sql/item.h @@ -477,7 +477,7 @@ public: CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE) { collation.set(cs, dv); - str_value.set(str,length,cs); + str_value.set_or_copy_aligned(str,length,cs); /* We have to have a different max_length than 'length' here to ensure that we get the right length if we do use the item @@ -493,7 +493,7 @@ public: CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE) { collation.set(cs, dv); - str_value.set(str,length,cs); + str_value.set_or_copy_aligned(str,length,cs); max_length= str_value.numchars()*cs->mbmaxlen; set_name(name_par,0,cs); decimals=NOT_FIXED_DEC; diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 89f48607969..9534c5605fe 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -228,6 +228,52 @@ bool 
String::copy(const char *str,uint32 arg_length, CHARSET_INFO *cs)
   return FALSE;
 }
 
+/*
+  Set the string to str, left-padding it with zero bytes when arg_length
+  is not a multiple of cs->mbminlen (as can happen with UCS-2), so that
+    SELECT _ucs2 0xAA
+  is automatically converted into
+    SELECT _ucs2 0x00AA
+  Returns FALSE on success, TRUE when alloc() of the padded buffer fails.
+*/
+
+bool String::set_or_copy_aligned(const char *str,uint32 arg_length,
+                                 CHARSET_INFO *cs)
+{
+  /* How many bytes are in the incomplete leading character */
+  uint32 offs= (arg_length % cs->mbminlen);
+
+  if (!offs) /* All characters are complete, just hand str to set() */
+  {
+    set(str, arg_length, cs);
+    return FALSE;
+  }
+
+  /* Zeros to prepend: pad to the next multiple of mbminlen, not mbmaxlen */
+  offs= cs->mbminlen - offs;
+  uint32 aligned_length= arg_length + offs;
+  if (alloc(aligned_length))
+    return TRUE;
+
+  /*
+    aligned_length cannot be 0 here because offs is non-zero;
+    the condition is kept as it also assigns str_length.
+  */
+  if ((str_length= aligned_length))
+  {
+    /*
+      Note, prepending zero bytes is only right for big-endian UCS-2.
+      If we add little-endian UCS-2 sometimes, this code
+      will be more complicated. But it's OK for now.
+    */
+    bzero((char*)Ptr, offs);
+    memcpy(Ptr + offs, str, arg_length);
+  }
+  Ptr[aligned_length]=0;
+  str_charset=cs;
+  return FALSE;
+}
+
 /* Copy with charset convertion */
 
 bool String::copy(const char *str, uint32 arg_length,
diff --git a/sql/sql_string.h b/sql/sql_string.h
index 325611737ca..8817aa8eab8 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -183,6 +183,7 @@ public:
   bool copy();                                  // Alloc string if not alloced
   bool copy(const String &s);                   // Allocate new string
   bool copy(const char *s,uint32 arg_length, CHARSET_INFO *cs); // Allocate new string
+  bool set_or_copy_aligned(const char *s, uint32 arg_length, CHARSET_INFO *cs);
   bool copy(const char*s,uint32 arg_length, CHARSET_INFO *csfrom,
             CHARSET_INFO *csto);
   bool append(const String &s);
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index c5ddc167d0d..8d4081fb2aa 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -6281,6 +6281,7 @@ CHARSET_INFO my_charset_big5_chinese_ci=
     "",
     "",
     1,          /* strxfrm_multiply */
+    1,          /* mbminlen */
     2,          /* mbmaxlen */
     0,
     &my_charset_big5_handler,
@@ -6304,6 +6305,7 @@ CHARSET_INFO my_charset_big5_bin=
     "",
     "",
     1,          /* strxfrm_multiply */
+    1,          /* mbminlen */
     2,          /* mbmaxlen */
     0,
     &my_charset_big5_handler,
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index 6f28c43b2c6..67435b7df6c 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -381,6 +381,7 @@ CHARSET_INFO my_charset_bin =
     NULL,                       /* tab_from_uni */
     "","",
     1,                          /* strxfrm_multiply */
+    1,                          /* mbminlen */
     1,                          /* mbmaxlen */
     (char) 255,                 /* max_sort_char */
     &my_charset_handler,
diff --git a/strings/ctype-czech.c b/strings/ctype-czech.c
index b2e4f1886ed..1a07a5eba7e 100644
--- a/strings/ctype-czech.c
+++ b/strings/ctype-czech.c
@@ -631,6 +631,7 @@ CHARSET_INFO my_charset_latin2_czech_ci =
     idx_uni_8859_2,     /* tab_from_uni */
     "","",
     4,                  /* strxfrm_multiply */
+    1,                  /* mbminlen */
     1,                  /* mbmaxlen */
     0,
     &my_charset_8bit_handler,
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index
addd7803680..366a5d500ed 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -8689,6 +8689,7 @@ CHARSET_INFO my_charset_euckr_korean_ci= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, @@ -8712,6 +8713,7 @@ CHARSET_INFO my_charset_euckr_bin= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-extra.c b/strings/ctype-extra.c index 55bfa09ea5f..0085d264416 100644 --- a/strings/ctype-extra.c +++ b/strings/ctype-extra.c @@ -34,6 +34,7 @@ CHARSET_INFO compiled_charsets[] = { 0, 0, 0, + 0, NULL, NULL } diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index b84ddc9081b..44a58b2b906 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -5740,6 +5740,7 @@ CHARSET_INFO my_charset_gb2312_chinese_ci= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, @@ -5762,6 +5763,7 @@ CHARSET_INFO my_charset_gb2312_bin= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index 585dc66be4c..5475c3bd363 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -9936,6 +9936,7 @@ CHARSET_INFO my_charset_gbk_chinese_ci= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, @@ -9958,6 +9959,7 @@ CHARSET_INFO my_charset_gbk_bin= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 15798abb85b..c00ded21575 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -215,6 +215,7 @@ CHARSET_INFO my_charset_latin1= NULL, /* tab_from_uni */ "","", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 1, /* mbmaxlen */ 0, &my_charset_handler, @@ -410,6 +411,7 @@ CHARSET_INFO my_charset_latin1_german2_ci= NULL, /* tab_from_uni */ "","", 2, /* 
strxfrm_multiply */ + 1, /* mbminlen */ 1, /* mbmaxlen */ 0, &my_charset_handler, @@ -433,6 +435,7 @@ CHARSET_INFO my_charset_latin1_bin= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 1, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index a84fbd16e5d..42f32fe739b 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -4525,6 +4525,7 @@ CHARSET_INFO my_charset_sjis_japanese_ci= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, @@ -4547,6 +4548,7 @@ CHARSET_INFO my_charset_sjis_bin= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 92b2eeb25e0..09552a0dc23 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -956,6 +956,7 @@ CHARSET_INFO my_charset_tis620_thai_ci= "", "", 4, /* strxfrm_multiply */ + 1, /* mbminlen */ 1, /* mbmaxlen */ 0, &my_charset_handler, @@ -978,6 +979,7 @@ CHARSET_INFO my_charset_tis620_bin= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 1, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index beb803a69f2..a7a59fc50f7 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1322,6 +1322,7 @@ CHARSET_INFO my_charset_ucs2_general_ci= "", "", 1, /* strxfrm_multiply */ + 2, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_ucs2_handler, @@ -1345,6 +1346,7 @@ CHARSET_INFO my_charset_ucs2_bin= "", "", 1, /* strxfrm_multiply */ + 2, /* mbminlen */ 2, /* mbmaxlen */ 0, &my_charset_ucs2_handler, diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 2815b70351b..f6928e9426e 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -8480,6 +8480,7 @@ CHARSET_INFO my_charset_ujis_japanese_ci= NULL, /* tab_from_uni */ "","", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 3, /* mbmaxlen */ 0, &my_charset_handler, @@ -8502,6 +8503,7 @@ CHARSET_INFO 
my_charset_ujis_bin= NULL, /* tab_from_uni */ "","", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 3, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index b5716c53ea2..8004fba75b7 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -2006,6 +2006,7 @@ CHARSET_INFO my_charset_utf8_general_ci= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 3, /* mbmaxlen */ 0, &my_charset_handler, @@ -2029,6 +2030,7 @@ CHARSET_INFO my_charset_utf8_bin= "", "", 1, /* strxfrm_multiply */ + 1, /* mbminlen */ 3, /* mbmaxlen */ 0, &my_charset_handler, diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c index 60a5737009f..d3b5c9d1796 100644 --- a/strings/ctype-win1250ch.c +++ b/strings/ctype-win1250ch.c @@ -671,6 +671,7 @@ CHARSET_INFO my_charset_cp1250_czech_ci = idx_uni_cp1250, /* tab_from_uni */ "","", 2, /* strxfrm_multiply */ + 1, /* mbminlen */ 1, /* mbmaxlen */ 0, &my_charset_8bit_handler,