MDEV-31069 Reuse duplicate char-to-weight conversion code in ctype-utf8.c and ctype-ucs2.c

Removing similar functions from ctype-utf8.c and ctype-ucs2.c:

- my_tosort_utf16()
- my_tosort_utf32()
- my_tosort_ucs2()
- my_tosort_unicode()

Adding new shared functions into ctype-unidata.h:

- my_tosort_unicode_bmp()  - reused for utf8mb3, ucs2
- my_tosort_unicode()      - reused for utf8mb4, utf16, utf32

For simplicity, the new version of my_tosort_unicode*()
does not include the code handling the MY_CS_LOWER_SORT flag because:
- it affects performance negatively
- we don't have any collations with this flag yet anyway
(This code was most likely merged by mistake from
MySQL's utf8_tolower_ci at some earlier point.)
This commit is contained in:
Alexander Barkov 2023-04-18 09:40:41 +04:00
parent 30b4bb4204
commit 2ad287caad
3 changed files with 39 additions and 70 deletions

View file

@ -1284,22 +1284,6 @@ my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
const char charset_name_utf16le[]= "utf16le";
#define charset_name_utf16le_length (sizeof(charset_name_utf16le)-1)
/*
  Convert a character code to its sort weight, in place.

  @param uni_plane  Case/sort information; its "maxchar" member is the
                    largest code it covers and "page" is an array of
                    256-entry pages indexed by (code >> 8).
  @param wc         IN: the character code. OUT: the weight.

  Codes above uni_plane->maxchar are replaced with
  MY_CS_REPLACEMENT_CHARACTER. Codes whose page pointer is NULL
  are left unchanged.
*/
static inline void
my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{
  const my_wc_t code= *wc;
  MY_UNICASE_CHARACTER *row;

  /* Out of the covered range: there is no page to look at */
  if (code > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  row= uni_plane->page[code >> 8];
  if (row)
    *wc= row[code & 0xFF].sort;
}
static size_t
my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
@ -1341,7 +1325,7 @@ my_hash_sort_utf16_nopad(CHARSET_INFO *cs,
while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
{
my_tosort_utf16(uni_plane, &wc);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, wc);
s+= res;
}
@ -2178,22 +2162,6 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  Convert a character code to its sort weight, in place.

  @param uni_plane  Case/sort information; its "maxchar" member is the
                    largest code it covers and "page" is an array of
                    256-entry pages indexed by (code >> 8).
  @param wc         IN: the character code. OUT: the weight.

  Codes above uni_plane->maxchar are replaced with
  MY_CS_REPLACEMENT_CHARACTER. Codes whose page pointer is NULL
  are left unchanged.
*/
static inline void
my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{
  const my_wc_t code= *wc;
  MY_UNICASE_CHARACTER *row;

  /* Out of the covered range: there is no page to look at */
  if (code > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  row= uni_plane->page[code >> 8];
  if (row)
    *wc= row[code & 0xFF].sort;
}
static size_t
my_lengthsp_utf32(CHARSET_INFO *cs __attribute__((unused)),
const char *ptr, size_t length)
@ -2242,7 +2210,7 @@ my_hash_sort_utf32_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
{
my_tosort_utf32(uni_plane, &wc);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
MY_HASH_ADD(m1, m2, (uint) (wc >> 8) & 0xFF);
@ -3082,14 +3050,6 @@ static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
}
/*
  Convert a character code to its sort weight, in place.

  @param uni_plane  Case/sort information; "page" is an array of
                    256-entry pages.
  @param wc         IN: the character code. OUT: the weight.

  The page index is (*wc >> 8) masked with 0xFF, so no range check
  against uni_plane->maxchar is done here. Codes whose page pointer
  is NULL are left unchanged.
*/
static inline void
my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{
  const my_wc_t code= *wc;
  MY_UNICASE_CHARACTER *row= uni_plane->page[(code >> 8) & 0xFF];

  if (row)
    *wc= row[code & 0xFF].sort;
}
static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
@ -3125,7 +3085,7 @@ my_hash_sort_ucs2_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
{
my_tosort_ucs2(uni_plane, &wc);
my_tosort_unicode_bmp(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, wc);
s+=res;
}

View file

@ -36,6 +36,32 @@ static inline my_wc_t my_u300_toupper_7bit(uchar ch)
}
/*
  Convert a character code to its sort weight, in place.

  @param uni_plane  Case/sort information; "page" is an array of
                    256-entry pages indexed by (code >> 8).
  @param wc         IN: the character code. OUT: the weight.

  The caller must pass a code that does not exceed uni_plane->maxchar:
  this is only verified by DBUG_ASSERT, there is no run-time fallback.
  Codes whose page pointer is NULL are left unchanged.
*/
static inline void my_tosort_unicode_bmp(MY_UNICASE_INFO *uni_plane,
                                         my_wc_t *wc)
{
  const my_wc_t code= *wc;
  const MY_UNICASE_CHARACTER *row;

  DBUG_ASSERT(code <= uni_plane->maxchar);

  row= uni_plane->page[code >> 8];
  if (row)
    *wc= row[code & 0xFF].sort;
}
/*
  Convert a character code to its sort weight, in place.

  @param uni_plane  Case/sort information; its "maxchar" member is the
                    largest code it covers and "page" is an array of
                    256-entry pages indexed by (code >> 8).
  @param wc         IN: the character code. OUT: the weight.

  Codes above uni_plane->maxchar are replaced with
  MY_CS_REPLACEMENT_CHARACTER. Codes whose page pointer is NULL
  are left unchanged.
*/
static inline void my_tosort_unicode(MY_UNICASE_INFO *uni_plane,
                                     my_wc_t *wc)
{
  const my_wc_t code= *wc;
  const MY_UNICASE_CHARACTER *row;

  /* Out of the covered range: there is no page to look at */
  if (code > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  row= uni_plane->page[code >> 8];
  if (row)
    *wc= row[code & 0xFF].sort;
}
static inline void
my_tolower_unicode_bmp(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
{

View file

@ -4638,23 +4638,6 @@ MY_UNICASE_INFO my_unicase_unicode520=
};
/*
  Convert a character code to its sort weight, in place.

  @param uni_plane  Case/sort information; its "maxchar" member is the
                    largest code it covers and "page" is an array of
                    256-entry pages indexed by (code >> 8).
  @param wc         IN: the character code. OUT: the weight.
  @param flags      If MY_CS_LOWER_SORT is set, the "tolower" member of
                    the page entry is used instead of "sort".

  Codes above uni_plane->maxchar are replaced with
  MY_CS_REPLACEMENT_CHARACTER. Codes whose page pointer is NULL
  are left unchanged.
*/
static inline void
my_tosort_unicode(MY_UNICASE_INFO *uni_plane, my_wc_t *wc, uint flags)
{
  const my_wc_t code= *wc;
  MY_UNICASE_CHARACTER *row;

  /* Out of the covered range: there is no page to look at */
  if (code > uni_plane->maxchar)
  {
    *wc= MY_CS_REPLACEMENT_CHARACTER;
    return;
  }

  row= uni_plane->page[code >> 8];
  if (!row)
    return;

  if (flags & MY_CS_LOWER_SORT)
    *wc= row[code & 0xFF].tolower;
  else
    *wc= row[code & 0xFF].sort;
}
static uint
my_casefold_multiply_utf8mbx(CHARSET_INFO *cs)
@ -4734,8 +4717,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
{
if (weights)
{
my_tosort_unicode(weights, &s_wc, cs->state);
my_tosort_unicode(weights, &w_wc, cs->state);
my_tosort_unicode(weights, &s_wc);
my_tosort_unicode(weights, &w_wc);
}
if (s_wc != w_wc)
return 1; /* No match */
@ -4803,8 +4786,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
return 1;
if (weights)
{
my_tosort_unicode(weights, &s_wc, cs->state);
my_tosort_unicode(weights, &w_wc, cs->state);
my_tosort_unicode(weights, &s_wc);
my_tosort_unicode(weights, &w_wc);
}
if (s_wc == w_wc)
@ -5242,7 +5225,7 @@ static void my_hash_sort_utf8mb3_nopad(CHARSET_INFO *cs, const uchar *s, size_t
while ((s < e) && (res=my_utf8mb3_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 )
{
my_tosort_unicode(uni_plane, &wc, cs->state);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, wc);
s+= res;
}
@ -5976,8 +5959,8 @@ static int my_strnncoll_utf8mb3_cs(CHARSET_INFO *cs,
save_diff = ((int)s_wc) - ((int)t_wc);
}
my_tosort_unicode(uni_plane, &s_wc, cs->state);
my_tosort_unicode(uni_plane, &t_wc, cs->state);
my_tosort_unicode(uni_plane, &s_wc);
my_tosort_unicode(uni_plane, &t_wc);
if ( s_wc != t_wc )
{
@ -6018,8 +6001,8 @@ static int my_strnncollsp_utf8mb3_cs(CHARSET_INFO *cs,
save_diff = ((int)s_wc) - ((int)t_wc);
}
my_tosort_unicode(uni_plane, &s_wc, cs->state);
my_tosort_unicode(uni_plane, &t_wc, cs->state);
my_tosort_unicode(uni_plane, &s_wc);
my_tosort_unicode(uni_plane, &t_wc);
if ( s_wc != t_wc )
{
@ -7697,7 +7680,7 @@ my_hash_sort_utf8mb4_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
{
my_tosort_unicode(uni_plane, &wc, cs->state);
my_tosort_unicode(uni_plane, &wc);
MY_HASH_ADD_16(m1, m2, (uint) (wc & 0xFFFF));
if (wc > 0xFFFF)
{