MDEV-31069 Reuse duplicate char-to-weight conversion code in ctype-utf8.c and ctype-ucs2.c

Removing similar functions from ctype-utf8.c and ctype-ucs2.c:
- my_tosort_utf16()
- my_tosort_utf32()
- my_tosort_ucs2()
- my_tosort_unicode()

Adding new shared functions into ctype-unidata.h:
- my_tosort_unicode_bmp() - reused for utf8mb3, ucs2
- my_tosort_unicode() - reused for utf8mb4, utf16, utf32

For simplicity, the new version of my_tosort_unicode*() does not include the code handling the MY_CS_LOWER_SORT flag because:
- it affects performance negatively
- we don't have any collations with this flag yet anyway

(This code was most likely merged erroneously from MySQL's utf8_tolower_ci at some point.)
2025-01-15 19:42:28 +01:00 · 2023-04-18 09:40:41 +04:00 · 2023-04-18 09:40:41 +04:00 · 2ad287caad
commit 2ad287caad
parent 30b4bb4204
3 changed files with 39 additions and 70 deletions
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1284,22 +1284,6 @@ my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
 const char charset_name_utf16le[]= "utf16le";
 #define charset_name_utf16le_length (sizeof(charset_name_utf16le)-1)

-static inline void
-my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
-{
-  if (*wc <= uni_plane->maxchar)
-  {
-    MY_UNICASE_CHARACTER *page;
-    if ((page= uni_plane->page[*wc >> 8]))
-      *wc= page[*wc & 0xFF].sort;
-  }
-  else
-  {
-    *wc= MY_CS_REPLACEMENT_CHARACTER;
-  }
-}
-
-

 static size_t
 my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
@@ -1341,7 +1325,7 @@ my_hash_sort_utf16_nopad(CHARSET_INFO *cs,

  while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
  {
-    my_tosort_utf16(uni_plane, &wc);
+    my_tosort_unicode(uni_plane, &wc);
    MY_HASH_ADD_16(m1, m2, wc);
    s+= res;
  }
@@ -2178,22 +2162,6 @@ my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
 }


-static inline void
-my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
-{
-  if (*wc <= uni_plane->maxchar)
-  {
-    MY_UNICASE_CHARACTER *page;
-    if ((page= uni_plane->page[*wc >> 8]))
-      *wc= page[*wc & 0xFF].sort;
-  }
-  else
-  {
-    *wc= MY_CS_REPLACEMENT_CHARACTER;
-  }
-}
-
-
 static size_t
 my_lengthsp_utf32(CHARSET_INFO *cs __attribute__((unused)),
                  const char *ptr, size_t length)
@@ -2242,7 +2210,7 @@ my_hash_sort_utf32_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,

  while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
  {
-    my_tosort_utf32(uni_plane, &wc);
+    my_tosort_unicode(uni_plane, &wc);
    MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
    MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
    MY_HASH_ADD(m1, m2, (uint) (wc >> 8)  & 0xFF);
@@ -3082,14 +3050,6 @@ static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
 }


-static inline void
-my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
-{
-  MY_UNICASE_CHARACTER *page;
-  if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
-    *wc= page[*wc & 0xFF].sort;
-}
-
 static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                           char *dst, size_t dstlen)
 {
@@ -3125,7 +3085,7 @@ my_hash_sort_ucs2_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,

  while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
  {
-    my_tosort_ucs2(uni_plane, &wc);
+    my_tosort_unicode_bmp(uni_plane, &wc);
    MY_HASH_ADD_16(m1, m2, wc);
    s+=res;
  }
--- a/strings/ctype-unidata.h
+++ b/strings/ctype-unidata.h
@@ -36,6 +36,32 @@ static inline my_wc_t my_u300_toupper_7bit(uchar ch)
 }


+static inline void my_tosort_unicode_bmp(MY_UNICASE_INFO *uni_plane,
+                                         my_wc_t *wc)
+{
+  const MY_UNICASE_CHARACTER *page;
+  DBUG_ASSERT(*wc <= uni_plane->maxchar);
+  if ((page= uni_plane->page[*wc >> 8]))
+    *wc= page[*wc & 0xFF].sort;
+}
+
+
+static inline void my_tosort_unicode(MY_UNICASE_INFO *uni_plane,
+                                     my_wc_t *wc)
+{
+  if (*wc <= uni_plane->maxchar)
+  {
+    const MY_UNICASE_CHARACTER *page;
+    if ((page= uni_plane->page[*wc >> 8]))
+      *wc= page[*wc & 0xFF].sort;
+  }
+  else
+  {
+    *wc= MY_CS_REPLACEMENT_CHARACTER;
+  }
+}
+
+
 static inline void
 my_tolower_unicode_bmp(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
 {
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -4638,23 +4638,6 @@ MY_UNICASE_INFO my_unicase_unicode520=
 };


-static inline void
-my_tosort_unicode(MY_UNICASE_INFO *uni_plane, my_wc_t *wc, uint flags)
-{
-  if (*wc <= uni_plane->maxchar)
-  {
-    MY_UNICASE_CHARACTER *page;
-    if ((page= uni_plane->page[*wc >> 8]))
-      *wc= (flags & MY_CS_LOWER_SORT) ?
-           page[*wc & 0xFF].tolower :
-           page[*wc & 0xFF].sort;
-  }
-  else
-  {
-    *wc= MY_CS_REPLACEMENT_CHARACTER;
-  }
-}
-

 static uint
 my_casefold_multiply_utf8mbx(CHARSET_INFO *cs)
@@ -4734,8 +4717,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
      {
        if (weights)
        {
-          my_tosort_unicode(weights, &s_wc, cs->state);
-          my_tosort_unicode(weights, &w_wc, cs->state);
+          my_tosort_unicode(weights, &s_wc);
+          my_tosort_unicode(weights, &w_wc);
        }
        if (s_wc != w_wc)
          return 1;                               /* No match */
@@ -4803,8 +4786,8 @@ int my_wildcmp_unicode_impl(CHARSET_INFO *cs,
            return 1;
          if (weights)
          {
-            my_tosort_unicode(weights, &s_wc, cs->state);
-            my_tosort_unicode(weights, &w_wc, cs->state);
+            my_tosort_unicode(weights, &s_wc);
+            my_tosort_unicode(weights, &w_wc);
          }

          if (s_wc == w_wc)
@@ -5242,7 +5225,7 @@ static void my_hash_sort_utf8mb3_nopad(CHARSET_INFO *cs, const uchar *s, size_t

  while ((s < e) && (res=my_utf8mb3_uni(cs,&wc, (uchar *)s, (uchar*)e))>0 )
  {
-    my_tosort_unicode(uni_plane, &wc, cs->state);
+    my_tosort_unicode(uni_plane, &wc);
    MY_HASH_ADD_16(m1, m2, wc);
    s+= res;
  }
@@ -5976,8 +5959,8 @@ static int my_strnncoll_utf8mb3_cs(CHARSET_INFO *cs,
      save_diff = ((int)s_wc) - ((int)t_wc);
    }

-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
+    my_tosort_unicode(uni_plane, &s_wc);
+    my_tosort_unicode(uni_plane, &t_wc);

    if ( s_wc != t_wc )
    {
@@ -6018,8 +6001,8 @@ static int my_strnncollsp_utf8mb3_cs(CHARSET_INFO *cs,
      save_diff = ((int)s_wc) - ((int)t_wc);
    }

-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
+    my_tosort_unicode(uni_plane, &s_wc);
+    my_tosort_unicode(uni_plane, &t_wc);

    if ( s_wc != t_wc )
    {
@@ -7697,7 +7680,7 @@ my_hash_sort_utf8mb4_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,

  while ((res= my_mb_wc_utf8mb4(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
  {
-    my_tosort_unicode(uni_plane, &wc, cs->state);
+    my_tosort_unicode(uni_plane, &wc);
    MY_HASH_ADD_16(m1, m2, (uint) (wc & 0xFFFF));
    if (wc > 0xFFFF)
    {