From 9bb464487d6c1c0359dba9aa79e88f927cd24c93 Mon Sep 17 00:00:00 2001 From: unknown <bar@mysql.com> Date: Wed, 26 Jan 2005 16:34:09 +0400 Subject: [PATCH] CSC#4385: slow sorting for UTF8 large table: my_strnxfrm_utf8 now requires 2 bytes per character in filesort key, instead of 3 bytes per character. Shorter filesort keys make sorting faster. --- include/m_ctype.h | 4 +++- sql/filesort.cc | 4 ++-- strings/ctype-big5.c | 1 + strings/ctype-bin.c | 2 ++ strings/ctype-cp932.c | 1 + strings/ctype-czech.c | 1 + strings/ctype-euc_kr.c | 1 + strings/ctype-eucjpms.c | 1 + strings/ctype-gb2312.c | 1 + strings/ctype-gbk.c | 1 + strings/ctype-latin1.c | 1 + strings/ctype-mb.c | 1 + strings/ctype-simple.c | 10 ++++++++++ strings/ctype-sjis.c | 1 + strings/ctype-tis620.c | 1 + strings/ctype-uca.c | 2 ++ strings/ctype-ucs2.c | 2 ++ strings/ctype-ujis.c | 1 + strings/ctype-utf8.c | 37 ++++++++++++++++++++++++------------- strings/ctype-win1250ch.c | 1 + 20 files changed, 58 insertions(+), 16 deletions(-) diff --git a/include/m_ctype.h b/include/m_ctype.h index c2354c7feff..c41c7385b3d 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -110,6 +110,7 @@ typedef struct my_collation_handler_st my_bool diff_if_only_endspace_difference); int (*strnxfrm)(struct charset_info_st *, uchar *, uint, const uchar *, uint); + uint (*strnxfrmlen)(struct charset_info_st *, uint); my_bool (*like_range)(struct charset_info_st *, const char *s, uint s_length, pchar w_prefix, pchar w_one, pchar w_many, @@ -259,7 +260,8 @@ extern CHARSET_INFO my_charset_cp1250_czech_ci; /* declarations for simple charsets */ extern int my_strnxfrm_simple(CHARSET_INFO *, uchar *, uint, const uchar *, - uint); + uint); +uint my_strnxfrmlen_simple(CHARSET_INFO *, uint); extern int my_strnncoll_simple(CHARSET_INFO *, const uchar *, uint, const uchar *, uint, my_bool); diff --git a/sql/filesort.cc b/sql/filesort.cc index 0e9fa8c79ed..1665358dbf0 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -1187,7 +1187,7 
@@ sortlength(SORT_FIELD *sortorder, uint s_length, bool *multi_byte_charset) { sortorder->need_strxnfrm= 1; *multi_byte_charset= 1; - sortorder->length= sortorder->length*cs->strxfrm_multiply; + sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length); } } if (sortorder->field->maybe_null()) @@ -1200,7 +1200,7 @@ sortlength(SORT_FIELD *sortorder, uint s_length, bool *multi_byte_charset) sortorder->length=sortorder->item->max_length; if (use_strnxfrm((cs=sortorder->item->collation.collation))) { - sortorder->length= sortorder->length*cs->strxfrm_multiply; + sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length); sortorder->need_strxnfrm= 1; *multi_byte_charset= 1; } diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index a2db7de244e..70c5ec633be 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -6293,6 +6293,7 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = my_strnncoll_big5, my_strnncollsp_big5, my_strnxfrm_big5, + my_strnxfrmlen_simple, my_like_range_big5, my_wildcmp_mb, my_strcasecmp_mb, diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 401605a462f..50c66a63e97 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -447,6 +447,7 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler = my_strnncoll_8bit_bin, my_strnncollsp_8bit_bin, my_strnxfrm_8bit_bin, + my_strnxfrmlen_simple, my_like_range_simple, my_wildcmp_bin, my_strcasecmp_bin, @@ -461,6 +462,7 @@ static MY_COLLATION_HANDLER my_collation_binary_handler = my_strnncoll_binary, my_strnncollsp_binary, my_strnxfrm_bin, + my_strnxfrmlen_simple, my_like_range_simple, my_wildcmp_bin, my_strcasecmp_bin, diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 804f87b2a5b..c47f2c2d8ce 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -5454,6 +5454,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_cp932, my_strnncollsp_cp932, my_strnxfrm_cp932, + my_strnxfrmlen_simple, my_like_range_cp932, 
my_wildcmp_mb, /* wildcmp */ my_strcasecmp_8bit, diff --git a/strings/ctype-czech.c b/strings/ctype-czech.c index 2834dbb28ff..f5a410afc50 100644 --- a/strings/ctype-czech.c +++ b/strings/ctype-czech.c @@ -593,6 +593,7 @@ static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler = my_strnncoll_czech, my_strnncollsp_czech, my_strnxfrm_czech, + my_strnxfrmlen_simple, my_like_range_czech, my_wildcmp_8bit, my_strcasecmp_8bit, diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index ee792d9c3e4..289b7309ea0 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -8641,6 +8641,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_simple, /* strnncoll */ my_strnncollsp_simple, my_strnxfrm_simple, /* strnxfrm */ + my_strnxfrmlen_simple, my_like_range_simple, /* like_range */ my_wildcmp_mb, /* wildcmp */ my_strcasecmp_mb, diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 5b108d24f4b..8c8d237cf48 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -8636,6 +8636,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_simple,/* strnncoll */ my_strnncollsp_simple, my_strnxfrm_simple, /* strnxfrm */ + my_strnxfrmlen_simple, my_like_range_simple,/* like_range */ my_wildcmp_mb, /* wildcmp */ my_strcasecmp_mb, diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index f17cc94723f..73e4132dd7f 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -5692,6 +5692,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_simple, /* strnncoll */ my_strnncollsp_simple, my_strnxfrm_simple, /* strnxfrm */ + my_strnxfrmlen_simple, my_like_range_simple, /* like_range */ my_wildcmp_mb, /* wildcmp */ my_strcasecmp_mb, /* instr */ diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index dc4aea60096..6b47b537fb9 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -9939,6 +9939,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = 
my_strnncoll_gbk, my_strnncollsp_gbk, my_strnxfrm_gbk, + my_strnxfrmlen_simple, my_like_range_gbk, my_wildcmp_mb, my_strcasecmp_mb, diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index b5da99a7452..043645684cf 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -693,6 +693,7 @@ static MY_COLLATION_HANDLER my_collation_german2_ci_handler= my_strnncoll_latin1_de, my_strnncollsp_latin1_de, my_strnxfrm_latin1_de, + my_strnxfrmlen_simple, my_like_range_simple, my_wildcmp_8bit, my_strcasecmp_8bit, diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c index 4be21599fef..e902730d65a 100644 --- a/strings/ctype-mb.c +++ b/strings/ctype-mb.c @@ -912,6 +912,7 @@ MY_COLLATION_HANDLER my_collation_mb_bin_handler = my_strnncoll_mb_bin, my_strnncollsp_mb_bin, my_strnxfrm_mb_bin, + my_strnxfrmlen_simple, my_like_range_simple, my_wildcmp_mb_bin, my_strcasecmp_mb_bin, diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index bb623ef66f1..e436d5f8702 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -21,6 +21,15 @@ #include "stdarg.h" +/* + Returns the number of bytes to reserve for the strnxfrm() result of a "len"-byte string: len * cs->strxfrm_multiply, treating a zero multiplier as 1. +*/ +uint my_strnxfrmlen_simple(CHARSET_INFO *cs, uint len) +{ + return len * (cs->strxfrm_multiply ? cs->strxfrm_multiply : 1); +} + + /* Converts a string into its sort key. 
@@ -1365,6 +1374,7 @@ MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler = my_strnncoll_simple, my_strnncollsp_simple, my_strnxfrm_simple, + my_strnxfrmlen_simple, my_like_range_simple, my_wildcmp_8bit, my_strcasecmp_8bit, diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index c1e41dc2d94..22cc8d9818d 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -4627,6 +4627,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_sjis, my_strnncollsp_sjis, my_strnxfrm_sjis, + my_strnxfrmlen_simple, my_like_range_sjis, my_wildcmp_mb, /* wildcmp */ my_strcasecmp_8bit, diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index c6bdd106ad4..9ba35e1c8ec 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -927,6 +927,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_tis620, my_strnncollsp_tis620, my_strnxfrm_tis620, + my_strnxfrmlen_simple, my_like_range_tis620, my_wildcmp_8bit, /* wildcmp */ my_strcasecmp_8bit, diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index 2353c9397a2..8345d0474f2 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -8024,6 +8024,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler = my_strnncoll_ucs2_uca, my_strnncollsp_ucs2_uca, my_strnxfrm_ucs2_uca, + my_strnxfrmlen_simple, my_like_range_ucs2, my_wildcmp_uca, NULL, @@ -8504,6 +8505,7 @@ MY_COLLATION_HANDLER my_collation_any_uca_handler = my_strnncoll_any_uca, my_strnncollsp_any_uca, my_strnxfrm_any_uca, + my_strnxfrmlen_simple, my_like_range_mb, my_wildcmp_uca, NULL, diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index adfd4794e36..0d45cceb64d 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1499,6 +1499,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = my_strnncoll_ucs2, my_strnncollsp_ucs2, my_strnxfrm_ucs2, + my_strnxfrmlen_simple, my_like_range_ucs2, my_wildcmp_ucs2_ci, my_strcasecmp_ucs2, @@ -1513,6 +1514,7 @@ static MY_COLLATION_HANDLER 
my_collation_ucs2_bin_handler = my_strnncoll_ucs2_bin, my_strnncollsp_ucs2_bin, my_strnxfrm_ucs2_bin, + my_strnxfrmlen_simple, my_like_range_simple, my_wildcmp_ucs2_bin, my_strcasecmp_ucs2_bin, diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index fc1496df280..deaddcc76f6 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -8501,6 +8501,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_simple,/* strnncoll */ my_strnncollsp_simple, my_strnxfrm_simple, /* strnxfrm */ + my_strnxfrmlen_simple, my_like_range_simple,/* like_range */ my_wildcmp_mb, /* wildcmp */ my_strcasecmp_mb, diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 1f9f158a73d..e17e7587e85 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -2238,6 +2238,12 @@ int my_wildcmp_utf8(CHARSET_INFO *cs, } +static +uint my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), uint len) +{ + return (len * 2 + 2) / 3; +} + static int my_strnxfrm_utf8(CHARSET_INFO *cs, uchar *dst, uint dstlen, const uchar *src, uint srclen) @@ -2245,29 +2251,33 @@ static int my_strnxfrm_utf8(CHARSET_INFO *cs, my_wc_t wc; int res; int plane; - uchar *de = dst + dstlen; + uchar *de= dst + dstlen; + uchar *de_beg= de - 1; const uchar *se = src + srclen; - while( src < se && dst < de ) + while (dst < de_beg) { - if ((res=my_utf8_uni(cs,&wc, src, se))<0) - { + if ((res=my_utf8_uni(cs,&wc, src, se)) <= 0) break; - } src+=res; - srclen-=res; plane=(wc>>8) & 0xFF; wc = uni_plane[plane] ? 
uni_plane[plane][wc & 0xFF].sort : wc; - if ((res=my_uni_utf8(cs,wc,dst,de)) <0) - { - break; - } - dst+=res; + *dst++= wc >> 8; + *dst++= wc & 0xFF; + } - if (dst < de) - bfill(dst, de - dst, ' '); + + while (dst < de_beg) /* Fill the tail with keys for space character */ + { + *dst++= 0x00; + *dst++= 0x20; + } + + if (dst < de) /* Clear the last byte (dst == de - 1), if "dstlen" was an odd number */ + *dst= 0x00; + return dstlen; } @@ -2306,6 +2316,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncoll_utf8, my_strnncollsp_utf8, my_strnxfrm_utf8, + my_strnxfrmlen_utf8, my_like_range_mb, my_wildcmp_utf8, my_strcasecmp_utf8, diff --git a/strings/ctype-win1250ch.c b/strings/ctype-win1250ch.c index b58a8f0f1e5..37611a5bd20 100644 --- a/strings/ctype-win1250ch.c +++ b/strings/ctype-win1250ch.c @@ -626,6 +626,7 @@ static MY_COLLATION_HANDLER my_collation_czech_ci_handler = my_strnncoll_win1250ch, my_strnncollsp_win1250ch, my_strnxfrm_win1250ch, + my_strnxfrmlen_simple, my_like_range_win1250ch, my_wildcmp_8bit, my_strcasecmp_8bit,