CSC#4385: slow sorting of large UTF8 tables:

my_strnxfrm_utf8 now requires 2 bytes per character
in filesort key, instead of 3 bytes per character.
Shorter filesort keys make sorting faster.
This commit is contained in:
unknown 2005-01-26 16:34:09 +04:00
parent e8ae338356
commit 9bb464487d
20 changed files with 58 additions and 16 deletions

View file

@ -110,6 +110,7 @@ typedef struct my_collation_handler_st
my_bool diff_if_only_endspace_difference);
int (*strnxfrm)(struct charset_info_st *,
uchar *, uint, const uchar *, uint);
uint (*strnxfrmlen)(struct charset_info_st *, uint);
my_bool (*like_range)(struct charset_info_st *,
const char *s, uint s_length,
pchar w_prefix, pchar w_one, pchar w_many,
@ -259,7 +260,8 @@ extern CHARSET_INFO my_charset_cp1250_czech_ci;
/* declarations for simple charsets */
extern int my_strnxfrm_simple(CHARSET_INFO *, uchar *, uint, const uchar *,
uint);
uint);
uint my_strnxfrmlen_simple(CHARSET_INFO *, uint);
extern int my_strnncoll_simple(CHARSET_INFO *, const uchar *, uint,
const uchar *, uint, my_bool);

View file

@ -1187,7 +1187,7 @@ sortlength(SORT_FIELD *sortorder, uint s_length, bool *multi_byte_charset)
{
sortorder->need_strxnfrm= 1;
*multi_byte_charset= 1;
sortorder->length= sortorder->length*cs->strxfrm_multiply;
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
}
}
if (sortorder->field->maybe_null())
@ -1200,7 +1200,7 @@ sortlength(SORT_FIELD *sortorder, uint s_length, bool *multi_byte_charset)
sortorder->length=sortorder->item->max_length;
if (use_strnxfrm((cs=sortorder->item->collation.collation)))
{
sortorder->length= sortorder->length*cs->strxfrm_multiply;
sortorder->length= cs->coll->strnxfrmlen(cs, sortorder->length);
sortorder->need_strxnfrm= 1;
*multi_byte_charset= 1;
}

View file

@ -6293,6 +6293,7 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
my_strnncoll_big5,
my_strnncollsp_big5,
my_strnxfrm_big5,
my_strnxfrmlen_simple,
my_like_range_big5,
my_wildcmp_mb,
my_strcasecmp_mb,

View file

@ -447,6 +447,7 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler =
my_strnncoll_8bit_bin,
my_strnncollsp_8bit_bin,
my_strnxfrm_8bit_bin,
my_strnxfrmlen_simple,
my_like_range_simple,
my_wildcmp_bin,
my_strcasecmp_bin,
@ -461,6 +462,7 @@ static MY_COLLATION_HANDLER my_collation_binary_handler =
my_strnncoll_binary,
my_strnncollsp_binary,
my_strnxfrm_bin,
my_strnxfrmlen_simple,
my_like_range_simple,
my_wildcmp_bin,
my_strcasecmp_bin,

View file

@ -5454,6 +5454,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_cp932,
my_strnncollsp_cp932,
my_strnxfrm_cp932,
my_strnxfrmlen_simple,
my_like_range_cp932,
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_8bit,

View file

@ -593,6 +593,7 @@ static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler =
my_strnncoll_czech,
my_strnncollsp_czech,
my_strnxfrm_czech,
my_strnxfrmlen_simple,
my_like_range_czech,
my_wildcmp_8bit,
my_strcasecmp_8bit,

View file

@ -8641,6 +8641,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_simple, /* strnncoll */
my_strnncollsp_simple,
my_strnxfrm_simple, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_simple, /* like_range */
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_mb,

View file

@ -8636,6 +8636,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_simple,/* strnncoll */
my_strnncollsp_simple,
my_strnxfrm_simple, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_simple,/* like_range */
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_mb,

View file

@ -5692,6 +5692,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_simple, /* strnncoll */
my_strnncollsp_simple,
my_strnxfrm_simple, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_simple, /* like_range */
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_mb, /* instr */

View file

@ -9939,6 +9939,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_gbk,
my_strnncollsp_gbk,
my_strnxfrm_gbk,
my_strnxfrmlen_simple,
my_like_range_gbk,
my_wildcmp_mb,
my_strcasecmp_mb,

View file

@ -693,6 +693,7 @@ static MY_COLLATION_HANDLER my_collation_german2_ci_handler=
my_strnncoll_latin1_de,
my_strnncollsp_latin1_de,
my_strnxfrm_latin1_de,
my_strnxfrmlen_simple,
my_like_range_simple,
my_wildcmp_8bit,
my_strcasecmp_8bit,

View file

@ -912,6 +912,7 @@ MY_COLLATION_HANDLER my_collation_mb_bin_handler =
my_strnncoll_mb_bin,
my_strnncollsp_mb_bin,
my_strnxfrm_mb_bin,
my_strnxfrmlen_simple,
my_like_range_simple,
my_wildcmp_mb_bin,
my_strcasecmp_mb_bin,

View file

@ -21,6 +21,15 @@
#include "stdarg.h"
/*
  Compute the size, in bytes, of the buffer that strnxfrm() needs.

  cs   - character set whose strxfrm_multiply field scales the length
  len  - length to be scaled

  Returns len multiplied by cs->strxfrm_multiply. A zero multiplier is
  treated as 1, so len is returned unchanged in that case.
*/
uint my_strnxfrmlen_simple(CHARSET_INFO *cs, uint len)
{
  if (cs->strxfrm_multiply)
    return len * cs->strxfrm_multiply;
  return len;
}
/*
Converts a string into its sort key.
@ -1365,6 +1374,7 @@ MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler =
my_strnncoll_simple,
my_strnncollsp_simple,
my_strnxfrm_simple,
my_strnxfrmlen_simple,
my_like_range_simple,
my_wildcmp_8bit,
my_strcasecmp_8bit,

View file

@ -4627,6 +4627,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_sjis,
my_strnncollsp_sjis,
my_strnxfrm_sjis,
my_strnxfrmlen_simple,
my_like_range_sjis,
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_8bit,

View file

@ -927,6 +927,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_tis620,
my_strnncollsp_tis620,
my_strnxfrm_tis620,
my_strnxfrmlen_simple,
my_like_range_tis620,
my_wildcmp_8bit, /* wildcmp */
my_strcasecmp_8bit,

View file

@ -8024,6 +8024,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
my_strnncoll_ucs2_uca,
my_strnncollsp_ucs2_uca,
my_strnxfrm_ucs2_uca,
my_strnxfrmlen_simple,
my_like_range_ucs2,
my_wildcmp_uca,
NULL,
@ -8504,6 +8505,7 @@ MY_COLLATION_HANDLER my_collation_any_uca_handler =
my_strnncoll_any_uca,
my_strnncollsp_any_uca,
my_strnxfrm_any_uca,
my_strnxfrmlen_simple,
my_like_range_mb,
my_wildcmp_uca,
NULL,

View file

@ -1499,6 +1499,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
my_strnncoll_ucs2,
my_strnncollsp_ucs2,
my_strnxfrm_ucs2,
my_strnxfrmlen_simple,
my_like_range_ucs2,
my_wildcmp_ucs2_ci,
my_strcasecmp_ucs2,
@ -1513,6 +1514,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
my_strnncoll_ucs2_bin,
my_strnncollsp_ucs2_bin,
my_strnxfrm_ucs2_bin,
my_strnxfrmlen_simple,
my_like_range_simple,
my_wildcmp_ucs2_bin,
my_strcasecmp_ucs2_bin,

View file

@ -8501,6 +8501,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_simple,/* strnncoll */
my_strnncollsp_simple,
my_strnxfrm_simple, /* strnxfrm */
my_strnxfrmlen_simple,
my_like_range_simple,/* like_range */
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_mb,

View file

@ -2238,6 +2238,12 @@ int my_wildcmp_utf8(CHARSET_INFO *cs,
}
/*
  Returns the number of sort-key bytes needed for a source length of
  "len": two key bytes for every three source bytes, rounded up,
  i.e. ceil(len * 2 / 3).

  The value is computed as "len - len / 3". This equals
  (len * 2 + 2) / 3 for every len, but it cannot wrap around: the
  multiplied form overflows once len exceeds half of the uint range and
  then yields a far too small key size (0 for the largest uint value).
  The character set argument is not used.
*/
static
uint my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)), uint len)
{
  return len - len / 3;
}
static int my_strnxfrm_utf8(CHARSET_INFO *cs,
uchar *dst, uint dstlen,
const uchar *src, uint srclen)
@ -2245,29 +2251,33 @@ static int my_strnxfrm_utf8(CHARSET_INFO *cs,
my_wc_t wc;
int res;
int plane;
uchar *de = dst + dstlen;
uchar *de= dst + dstlen;
uchar *de_beg= de - 1;
const uchar *se = src + srclen;
while( src < se && dst < de )
while (dst < de_beg)
{
if ((res=my_utf8_uni(cs,&wc, src, se))<0)
{
if ((res=my_utf8_uni(cs,&wc, src, se)) <= 0)
break;
}
src+=res;
srclen-=res;
plane=(wc>>8) & 0xFF;
wc = uni_plane[plane] ? uni_plane[plane][wc & 0xFF].sort : wc;
if ((res=my_uni_utf8(cs,wc,dst,de)) <0)
{
break;
}
dst+=res;
*dst++= wc >> 8;
*dst++= wc & 0xFF;
}
if (dst < de)
bfill(dst, de - dst, ' ');
while (dst < de_beg) /* Fill the tail with keys for space character */
{
  /* Each padding entry is two bytes: 0x00 followed by 0x20 */
  *dst++= 0x00;
  *dst++= 0x20;
}
/*
  Clear the last byte, if "dstlen" was an odd number. In that case
  exactly one byte is left over and "dst" points at it. The store must
  go through "dst": "de" is dst + dstlen, one past the end of the
  buffer, so writing through "de" would be out of bounds and would
  leave the real last byte unset.
*/
if (dst < de)
  *dst= 0x00;
return dstlen;
}
@ -2306,6 +2316,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncoll_utf8,
my_strnncollsp_utf8,
my_strnxfrm_utf8,
my_strnxfrmlen_utf8,
my_like_range_mb,
my_wildcmp_utf8,
my_strcasecmp_utf8,

View file

@ -626,6 +626,7 @@ static MY_COLLATION_HANDLER my_collation_czech_ci_handler =
my_strnncoll_win1250ch,
my_strnncollsp_win1250ch,
my_strnxfrm_win1250ch,
my_strnxfrmlen_simple,
my_like_range_win1250ch,
my_wildcmp_8bit,
my_strcasecmp_8bit,