MDEV-17474 Change Unicode collation implementation from "handler" to "inline" style

2025-01-15 19:42:28 +01:00 · 2018-10-16 19:10:57 +04:00 · 2018-10-16 19:10:57 +04:00 · 6eae037c4c
commit 6eae037c4c
parent fee24b1281
9 changed files with 1323 additions and 1268 deletions
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@ -362,7 +362,6 @@ extern MY_COLLATION_HANDLER my_collation_8bit_bin_handler;
 extern MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler;
 extern MY_COLLATION_HANDLER my_collation_8bit_nopad_bin_handler;
 extern MY_COLLATION_HANDLER my_collation_8bit_simple_nopad_ci_handler;
-extern MY_COLLATION_HANDLER my_collation_ucs2_uca_handler;

 /* Some typedef to make it easy for C++ to make function pointers */
 typedef int (*my_charset_conv_mb_wc)(CHARSET_INFO *, my_wc_t *,
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
--- a/strings/ctype-uca.ic
+++ b/strings/ctype-uca.ic
@ -0,0 +1,763 @@
+/*
+  Copyright (c) 2018 MariaDB Corporation
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+
+#ifndef MY_FUNCTION_NAME
+#error MY_FUNCTION_NAME is not defined
+#endif
+#ifndef MY_MB_WC
+#error MY_MB_WC is not defined
+#endif
+#ifndef MY_LIKE_RANGE
+#error MY_LIKE_RANGE is not defined
+#endif
+
+
+static inline int
+MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
+{
+  /*
+    Return the next weight for the string being scanned,
+    or -1 when there are no more bytes left to scan.
+
+    First check if the weights for the previous character have been
+    already fully scanned. If yes, then get the next character and
+    point wbeg to its weight string.
+  */
+
+  if (scanner->wbeg[0])      /* More weights left from the previous step: */
+    return *scanner->wbeg++; /* return the next weight from expansion     */
+
+  do
+  {
+    const uint16 *wpage;
+    my_wc_t wc[MY_UCA_MAX_CONTRACTION];
+    int mblen;
+
+    /* Get next character */
+    if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
+                                       scanner->send)) <= 0))
+    {
+      if (scanner->sbeg >= scanner->send)
+        return -1; /* No more bytes, end of line reached */
+      /*
+        There are some more bytes left. Non-positive mblen means that
+        we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
+      */
+      if ((scanner->sbeg+= scanner->cs->mbminlen) > scanner->send)
+      {
+        /* For safety purposes don't go beyond the string range. */
+        scanner->sbeg= scanner->send;
+      }
+      /*
+        Treat every complete or incomplete mbminlen unit as a weight which is
+        greater than weight for any possible normal character.
+        0xFFFF is greater than any possible weight in the UCA weight table.
+      */
+      return 0xFFFF;
+    }
+
+    scanner->sbeg+= mblen;
+    if (wc[0] > scanner->level->maxchar)
+    {
+      /*
+        Return 0xFFFD as weight for all characters above level->maxchar
+        (i.e. outside BMP when maxchar is 0xFFFF).
+        Pointing wbeg to nochar also makes the "very first character"
+        test below skip the previous context lookup for the next character.
+      */
+      scanner->wbeg= nochar;
+      return 0xFFFD;
+    }
+
+    if (my_uca_have_contractions_quick(scanner->level))
+    {
+      uint16 *cweight;
+      /*
+        If we have scanned a character which can have previous context,
+        and there were some more characters already before,
+        then reconstruct codepoint of the previous character
+        from "page" and "code" into wc[1], and verify that {wc[1], wc[0]}
+        together form a real previous context pair.
+        Note, we support only 2-character long sequences with previous
+        context at the moment. CLDR does not have longer sequences.
+      */
+      if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
+                                              wc[0]) &&
+          scanner->wbeg != nochar &&     /* if not the very first character */
+          my_uca_can_be_previous_context_head(&scanner->level->contractions,
+                                              (wc[1]= ((scanner->page << 8) +
+                                                        scanner->code))) &&
+          (cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
+      {
+        scanner->page= scanner->code= 0; /* Clear for the next character */
+        return *cweight;
+      }
+      else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
+                                              wc[0]))
+      {
+        /* Check if wc[0] starts a contraction */
+        if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
+          return *cweight;
+      }
+    }
+
+    /*
+      Process single character: remember its page (high bits) and
+      code (low 8 bits), they are also used to reconstruct the previous
+      character in the previous context check above.
+    */
+    scanner->page= wc[0] >> 8;
+    scanner->code= wc[0] & 0xFF;
+
+    /*
+      If weight page for wc[0] does not exist,
+      then calculate the weight algorithmically
+    */
+    if (!(wpage= scanner->level->weights[scanner->page]))
+      return my_uca_scanner_next_implicit(scanner);
+
+    /* Calculate pointer to wc[0]'s weight, using page and offset */
+    scanner->wbeg= wpage +
+                   scanner->code * scanner->level->lengths[scanner->page];
+  } while (!scanner->wbeg[0]); /* Skip ignorable characters */
+
+  return *scanner->wbeg++;
+}
+
+
+
+/*
+  Compares two strings according to the collation
+
+  SYNOPSIS:
+    strnncoll_onelevel()
+    cs		Character set information
+    level       UCA weight level to compare on
+    s		First string
+    slen	First string length
+    t		Second string
+    tlen	Second string length
+    t_is_prefix	If true, return 0 when the second string ends
+                while all weights were equal so far,
+                i.e. when t is a prefix of s
+  
+  NOTES:
+    Initializes two weight scanners and gets weights
+    corresponding to two strings in a loop. If weights are not
+    the same at some step then returns their difference.
+    
+    In the while() comparison these situations are possible:
+    1. (s_res>0) and (t_res>0) and (s_res == t_res)
+       Weights are the same so far, continue comparison
+    2. (s_res>0) and (t_res>0) and (s_res!=t_res)
+       A difference has been found, return.
+    3. (s_res>0) and (t_res<0)
+       We have reached the end of the second string, or found
+       an illegal multibyte sequence in the second string.
+       Return a positive number, i.e. the first string is bigger.
+    4. (s_res<0) and (t_res>0)   
+       We have reached the end of the first string, or found
+       an illegal multibyte sequence in the first string.
+       Return a negative number, i.e. the second string is bigger.
+    5. (s_res<0) and (t_res<0)
+       Both scanners returned -1. It means we have reached
+       the end-of-string or an illegal sequence in both strings
+       at the same time. Return 0, strings are equal.
+
+    NOTE(review): scanner_next() in this file returns 0xFFFF (not a
+    negative value) for an illegal or incomplete sequence, so the
+    "illegal multibyte sequence" wording in cases 3-5 looks stale - confirm.
+    
+  RETURN
+    Difference between two strings, according to the collation:
+    0               - means strings are equal
+    negative number - means the first string is smaller
+    positive number - means the first string is bigger
+*/
+
+static int
+MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs, 
+                                     const MY_UCA_WEIGHT_LEVEL *level,
+                                     const uchar *s, size_t slen,
+                                     const uchar *t, size_t tlen,
+                                     my_bool t_is_prefix)
+{
+  my_uca_scanner sscanner;
+  my_uca_scanner tscanner;
+  int s_res;
+  int t_res;
+  
+  my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
+  my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
+  
+  do
+  {
+    s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner);
+    t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner);
+  } while ( s_res == t_res && s_res >0);
+  
+  return  (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res);
+}
+
+
+/*
+  One-level strnncoll(): compares on the primary level (level[0]) only.
+  Used by both the PAD SPACE and the NO PAD one-level handlers below.
+*/
+static int
+MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs,
+                            const uchar *s, size_t slen,
+                            const uchar *t, size_t tlen,
+                            my_bool t_is_prefix)
+{
+  return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0],
+                                              s, slen, t, tlen, t_is_prefix);
+}
+
+
+/*
+  Multi-level strnncoll(): compares the strings level by level,
+  for the first cs->levels_for_order levels, and returns
+  the first non-zero difference (or 0 if all levels are equal).
+  Used by both the PAD SPACE and the NO PAD multi-level handlers below.
+*/
+static int
+MY_FUNCTION_NAME(strnncoll_multilevel)(CHARSET_INFO *cs,
+                                       const uchar *s, size_t slen,
+                                       const uchar *t, size_t tlen,
+                                       my_bool t_is_prefix)
+{
+  uint i, num_level= cs->levels_for_order;
+  for (i= 0; i != num_level; i++)
+  {
+    int ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i],
+                                                  s, slen, t, tlen,
+                                                  t_is_prefix);
+    if (ret)
+       return ret;
+  }
+  return 0;
+}
+
+
+/*
+  Compares two strings according to the collation,
+  ignoring trailing spaces.
+
+  SYNOPSIS:
+    strnncollsp_onelevel()
+    cs		Character set information
+    level       UCA weight level to compare on
+    s		First string
+    slen	First string length
+    t		Second string
+    tlen	Second string length
+
+  NOTES:
+    Works the same way as strnncoll_onelevel(),
+    but ignores trailing spaces.
+
+    In the while() comparison these situations are possible:
+    1. (s_res>0) and (t_res>0) and (s_res == t_res)
+       Weights are the same so far, continue comparison
+    2. (s_res>0) and (t_res>0) and (s_res!=t_res)
+       A difference has been found, return.
+    3. (s_res>0) and (t_res<0)
+       We have reached the end of the second string, or found
+       an illegal multibyte sequence in the second string.
+       Compare the first string to an infinite array of
+       space characters until difference is found, or until
+       the end of the first string.
+    4. (s_res<0) and (t_res>0)
+       We have reached the end of the first string, or found
+       an illegal multibyte sequence in the first string.
+       Compare the second string to an infinite array of
+       space characters until difference is found or until
+       the end of the second string.
+    5. (s_res<0) and (t_res<0)
+       Both scanners returned -1. It means we have reached
+       the end-of-string or an illegal sequence in both strings
+       at the same time. Return 0, strings are equal.
+
+    NOTE(review): scanner_next() in this file returns 0xFFFF (not a
+    negative value) for an illegal or incomplete sequence, so the
+    "illegal multibyte sequence" wording in cases 3-5 looks stale - confirm.
+
+  RETURN
+    Difference between two strings, according to the collation:
+    0               - means strings are equal
+    negative number - means the first string is smaller
+    positive number - means the first string is bigger
+*/
+
+static int
+MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
+                                       const MY_UCA_WEIGHT_LEVEL *level,
+                                       const uchar *s, size_t slen,
+                                       const uchar *t, size_t tlen)
+{
+  my_uca_scanner sscanner, tscanner;
+  int s_res, t_res;
+
+  my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
+  my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
+
+  do
+  {
+    s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner);
+    t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner);
+  } while ( s_res == t_res && s_res >0);
+
+  if (s_res > 0 && t_res < 0)
+  {
+    /* Calculate weight for SPACE character */
+    t_res= my_space_weight(level);
+
+    /*
+      Compare the rest of the first string to spaces:
+      return on the first weight that differs from the space weight,
+      or 0 if the first string ends with space weights only.
+    */
+    do
+    {
+      if (s_res != t_res)
+        return (s_res - t_res);
+      s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner);
+    } while (s_res > 0);
+    return 0;
+  }
+
+  if (s_res < 0 && t_res > 0)
+  {
+    /* Calculate weight for SPACE character */
+    s_res= my_space_weight(level);
+
+    /*
+      Compare the rest of the second string to spaces:
+      return on the first weight that differs from the space weight,
+      or 0 if the second string ends with space weights only.
+    */
+    do
+    {
+      if (s_res != t_res)
+        return (s_res - t_res);
+      t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner);
+    } while (t_res > 0);
+    return 0;
+  }
+
+  return ( s_res - t_res );
+}
+
+
+/*
+  One-level, PAD SPACE:
+  compares on the primary level (level[0]) using strnncollsp_onelevel().
+*/
+static int
+MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs,
+                              const uchar *s, size_t slen,
+                              const uchar *t, size_t tlen)
+{
+  return MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[0],
+                                                s, slen, t, tlen);
+}
+
+
+/*
+  One-level, NO PAD:
+  no space padding is done, so this is just strnncoll_onelevel()
+  on the primary level (level[0]) with t_is_prefix=FALSE.
+*/
+static int
+MY_FUNCTION_NAME(strnncollsp_nopad)(CHARSET_INFO *cs,
+                                    const uchar *s, size_t slen,
+                                    const uchar *t, size_t tlen)
+{
+  return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0],
+                                              s, slen, t, tlen, FALSE);
+}
+
+
+/*
+  Multi-level, PAD SPACE:
+  runs strnncollsp_onelevel() on each of the first cs->levels_for_order
+  levels and returns the first non-zero difference
+  (or 0 if all levels are equal).
+*/
+static int
+MY_FUNCTION_NAME(strnncollsp_multilevel)(CHARSET_INFO *cs,
+                                         const uchar *s, size_t slen,
+                                         const uchar *t, size_t tlen)
+{
+
+  uint i, num_level= cs->levels_for_order;
+  for (i= 0; i != num_level; i++)
+  {
+    int ret= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[i],
+                                                    s, slen, t, tlen);
+    if (ret)
+      return ret;
+  }
+  return 0;
+}
+
+
+/*
+  Multi-level, NO PAD:
+  runs strnncoll_onelevel() with t_is_prefix=FALSE on each of the first
+  cs->levels_for_order levels and returns the first non-zero difference
+  (or 0 if all levels are equal).
+*/
+static int
+MY_FUNCTION_NAME(strnncollsp_nopad_multilevel)(CHARSET_INFO *cs,
+                                               const uchar *s, size_t slen,
+                                               const uchar *t, size_t tlen)
+{
+  uint num_level= cs->levels_for_order;
+  uint i;
+  for (i= 0; i != num_level; i++)
+  {
+    int ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i],
+                                                  s, slen, t, tlen, FALSE);
+    if (ret)
+       return ret;
+  }
+  return 0;
+}
+
+
+
+/*
+  Calculates hash value for the given string,
+  according to the collation, and ignoring trailing spaces.
+
+  SYNOPSIS:
+    hash_sort()
+    cs		Character set information
+    s		String
+    slen	String's length
+    nr1		First hash parameter (read on entry, updated on return)
+    nr2		Second hash parameter (read on entry, updated on return)
+
+  NOTES:
+    Scans weights consecutively and updates
+    hash parameters nr1 and nr2. In a case insensitive collation,
+    upper and lower case of the same letter will return the same
+    weight sequence, and thus will produce the same hash values
+    in nr1 and nr2.
+
+    This function is used for one-level and for multi-level collations.
+    We intentionally use only primary level in multi-level collations.
+    This helps to have PARTITION BY KEY put primarily equal records
+    into the same partition. E.g. in utf8_thai_520_ci records that differ
+    only in tone marks go into the same partition.
+
+  RETURN
+    N/A
+*/
+
+static void
+MY_FUNCTION_NAME(hash_sort)(CHARSET_INFO *cs,
+                            const uchar *s, size_t slen,
+                            ulong *nr1, ulong *nr2)
+{
+  int   s_res;
+  my_uca_scanner scanner;
+  int space_weight= my_space_weight(&cs->uca->level[0]);
+  register ulong m1= *nr1, m2= *nr2;
+
+  my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen);
+
+  while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) >0)
+  {
+    if (s_res == space_weight)
+    {
+      /* Combine all spaces to be able to skip end spaces */
+      uint count= 0;
+      do
+      {
+        count++;
+        if ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) <= 0)
+        {
+          /* Skip spaces at end of string */
+          goto end;
+        }
+      }
+      while (s_res == space_weight);
+
+      /*
+        The spaces were not trailing:
+        add back the hash for the space characters
+      */
+      do
+      {
+        /*
+          We can't use MY_HASH_ADD_16() here because of a mistake
+          in the original code, where we added the 16-bit value the
+          opposite way.  Changing this would cause old partitioned tables
+          to fail.
+        */
+        MY_HASH_ADD(m1, m2, space_weight >> 8);
+        MY_HASH_ADD(m1, m2, space_weight & 0xFF);
+      }
+      while (--count != 0);
+
+    }
+    /* See comment above why we can't use MY_HASH_ADD_16() */
+    MY_HASH_ADD(m1, m2, s_res >> 8);
+    MY_HASH_ADD(m1, m2, s_res & 0xFF);
+  }
+end:
+  *nr1= m1;
+  *nr2= m2;
+}
+
+/*
+  NO PAD version of hash_sort(): hashes every primary level (level[0])
+  weight of the string into nr1 and nr2, without any special
+  handling of space weights.
+*/
+static void
+MY_FUNCTION_NAME(hash_sort_nopad)(CHARSET_INFO *cs,
+                                  const uchar *s, size_t slen,
+                                  ulong *nr1, ulong *nr2)
+{
+  int   s_res;
+  my_uca_scanner scanner;
+  register ulong m1= *nr1, m2= *nr2;
+
+  my_uca_scanner_init_any(&scanner, cs, &cs->uca->level[0], s, slen);
+
+  while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) >0)
+  {
+    /*
+      Add the high byte of the weight, then the low byte.
+      See the comment in hash_sort() why we can't use MY_HASH_ADD_16()
+    */
+    MY_HASH_ADD(m1, m2, s_res >> 8);
+    MY_HASH_ADD(m1, m2, s_res & 0xFF);
+  }
+  *nr1= m1;
+  *nr2= m2;
+}
+
+
+
+/*
+  For the given string creates its "binary image", suitable
+  to be used in binary comparison, i.e. in memcmp(). 
+  
+  SYNOPSIS:
+    strnxfrm_onelevel_internal()
+    cs		Character set information
+    level	UCA weight level to take weights from
+    dst		Where to write the image
+    de		End of the space available for the image
+    nweights	In/out: maximum number of weights to write;
+                decremented for every weight written
+    src		The source string
+    srclen	Length of the source string, in bytes
+  
+  NOTES:
+    In a loop, scans weights from the source string and writes
+    them into the binary image, high byte first. In a case insensitive
+    collation, upper and lower cases of the same letter will produce the
+    same image subsequences. The loop stops when the scanner returns
+    a non-positive value (end-of-string), when dst reaches de,
+    or when *nweights becomes 0. If only one byte is left before de,
+    then only the high byte of the last weight is written.
+
+    It is impossible to restore the original string using its
+    binary image. 
+    
+    Binary images are used for bulk comparison purposes,
+    e.g. in ORDER BY, when it is more efficient to create
+    a binary image and use it instead of weight scanner
+    for the original strings for every comparison.
+  
+  RETURN
+    Pointer to the byte after the last byte
+    that has been written into the binary image.
+*/
+
+static uchar *
+MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(CHARSET_INFO *cs,
+                                             MY_UCA_WEIGHT_LEVEL *level,
+                                             uchar *dst, uchar *de,
+                                             uint *nweights,
+                                             const uchar *src, size_t srclen)
+{
+  my_uca_scanner scanner;
+  int s_res;
+
+  DBUG_ASSERT(src || !srclen);
+
+  my_uca_scanner_init_any(&scanner, cs, level, src, srclen);
+  for (; dst < de && *nweights &&
+         (s_res= MY_FUNCTION_NAME(scanner_next)(&scanner)) > 0 ; (*nweights)--)
+  {
+    *dst++= s_res >> 8;
+    if (dst < de)
+      *dst++= s_res & 0xFF;
+  }
+  return dst;
+}
+
+/*
+  PAD SPACE image of src on one level: writes the weights into [dst, de),
+  then, if MY_STRXFRM_PAD_WITH_SPACE is set and both space and nweights
+  are left, pads using the space weight of this level. The written range
+  is finally passed to my_strxfrm_desc_and_reverse() together with flags.
+  Returns the pointer to the end of the written image.
+*/
+static uchar *
+MY_FUNCTION_NAME(strnxfrm_onelevel)(CHARSET_INFO *cs,
+                                    MY_UCA_WEIGHT_LEVEL *level,
+                                    uchar *dst, uchar *de, uint nweights,
+                                    const uchar *src, size_t srclen, uint flags)
+{
+  uchar *d0= dst;
+  dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
+                                                    dst, de, &nweights,
+                                                    src, srclen);
+  DBUG_ASSERT(dst <= de);
+  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
+    dst= my_strnxfrm_uca_padn(dst, de, nweights, my_space_weight(level));
+  DBUG_ASSERT(dst <= de);
+  my_strxfrm_desc_and_reverse(d0, dst, flags, 0);
+  return dst;
+}
+
+
+/*
+  NO PAD image of src on one level: same as strnxfrm_onelevel(),
+  but pads with min_weight_on_level(level) instead of the space weight.
+  Returns the pointer to the end of the written image.
+*/
+static uchar *
+MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(CHARSET_INFO *cs,
+                                          MY_UCA_WEIGHT_LEVEL *level,
+                                          uchar *dst, uchar *de, uint nweights,
+                                          const uchar *src, size_t srclen,
+                                          uint flags)
+{
+  uchar *d0= dst;
+  dst= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
+                                                    dst, de, &nweights,
+                                                    src, srclen);
+  DBUG_ASSERT(dst <= de);
+  /*  Pad with the minimum possible weight on this level */
+  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
+    dst= my_strnxfrm_uca_padn(dst, de, nweights, min_weight_on_level(level));
+  DBUG_ASSERT(dst <= de);
+  my_strxfrm_desc_and_reverse(d0, dst, flags, 0);
+  return dst;
+}
+
+/*
+  One-level, PAD SPACE strnxfrm(): creates the binary image of src
+  on the primary level (level[0]) in the buffer dst of dstlen bytes.
+  With MY_STRXFRM_PAD_TO_MAXLEN the unused rest of the buffer is passed
+  to my_strnxfrm_uca_pad() together with the space weight.
+  Returns the number of bytes in the image.
+*/
+static size_t
+MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
+                           uchar *dst, size_t dstlen, uint nweights,
+                           const uchar *src, size_t srclen, uint flags)
+{
+  uchar *d0= dst;
+  uchar *de= dst + dstlen;
+
+  dst= MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, &cs->uca->level[0],
+                                           dst, de, nweights,
+                                           src, srclen, flags);
+  /*
+    This can probably be changed to memset(dst, 0, de - dst),
+    like strnxfrm_multilevel() does.
+  */
+  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
+    dst= my_strnxfrm_uca_pad(dst, de, my_space_weight(&cs->uca->level[0]));
+  return dst - d0;
+}
+
+/*
+  One-level, NO PAD strnxfrm(): creates the binary image of src
+  on the primary level (level[0]) in the buffer dst of dstlen bytes.
+  With MY_STRXFRM_PAD_TO_MAXLEN the unused rest of the buffer
+  is filled with zero bytes.
+  Returns the number of bytes in the image.
+*/
+static size_t
+MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
+                                 uchar *dst, size_t dstlen,
+                                 uint nweights,
+                                 const uchar *src, size_t srclen,
+                                 uint flags)
+{
+  uchar *d0= dst;
+  uchar *de= dst + dstlen;
+
+  dst= MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs, &cs->uca->level[0],
+                                                 dst, de, nweights,
+                                                 src, srclen, flags);
+  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
+  {
+    memset(dst, 0, de - dst);
+    dst= de;
+  }
+  return dst - d0;
+}
+
+/*
+  Multi-level strnxfrm(), for both PAD SPACE and NO PAD collations.
+  For each of the first cs->levels_for_order levels which is requested
+  in flags (all of them if no MY_STRXFRM_LEVEL_ALL bit is set), appends
+  the image of src on that level, using strnxfrm_nopad_onelevel()
+  if cs->state has MY_CS_NOPAD, or strnxfrm_onelevel() otherwise.
+  With MY_STRXFRM_PAD_TO_MAXLEN the unused rest of the buffer
+  is filled with zero bytes.
+  Returns the number of bytes in the image.
+*/
+static size_t
+MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs, 
+                                      uchar *dst, size_t dstlen,
+                                      uint nweights,
+                                      const uchar *src, size_t srclen,
+                                      uint flags)
+{
+  uint num_level= cs->levels_for_order;
+  uchar *d0= dst;
+  uchar *de= dst + dstlen;
+  uint current_level;
+
+  for (current_level= 0; current_level != num_level; current_level++)
+  {
+    if (!(flags & MY_STRXFRM_LEVEL_ALL) ||
+        (flags & (MY_STRXFRM_LEVEL1 << current_level)))
+      dst= cs->state & MY_CS_NOPAD ?
+           MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs,
+                                          &cs->uca->level[current_level],
+                                          dst, de, nweights,
+                                          src, srclen, flags) :
+           MY_FUNCTION_NAME(strnxfrm_onelevel)(cs,
+                                    &cs->uca->level[current_level],
+                                    dst, de, nweights,
+                                    src, srclen, flags);
+  }
+
+  if (dst < de && (flags & MY_STRXFRM_PAD_TO_MAXLEN))
+  {
+    memset(dst, 0, de - dst);
+    dst= de;
+  }
+
+  return dst - d0;
+}
+
+
+/*
+  One-level, PAD SPACE
+  Uses the primary level only functions defined above:
+  strnncoll, strnncollsp, strnxfrm and hash_sort.
+*/
+MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
+{
+  my_coll_init_uca,
+  MY_FUNCTION_NAME(strnncoll),
+  MY_FUNCTION_NAME(strnncollsp),
+  MY_FUNCTION_NAME(strnxfrm),
+  my_strnxfrmlen_any_uca,
+  MY_LIKE_RANGE,
+  my_wildcmp_uca,
+  NULL,                                /* strcasecmp() */
+  my_instr_mb,
+  MY_FUNCTION_NAME(hash_sort),
+  my_propagate_complex
+};
+
+
+/*
+  One-level, NO PAD
+  For character sets with mbminlen==1 use MY_LIKE_RANGE=my_like_range_mb
+  For character sets with mbminlen>=2 use MY_LIKE_RANGE=my_like_range_generic
+
+  strnncoll is shared with the PAD SPACE handler; strnncollsp, strnxfrm
+  and hash_sort are replaced with their _nopad versions.
+*/
+MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
+{
+  my_coll_init_uca,
+  MY_FUNCTION_NAME(strnncoll),
+  MY_FUNCTION_NAME(strnncollsp_nopad),
+  MY_FUNCTION_NAME(strnxfrm_nopad),
+  my_strnxfrmlen_any_uca,
+  MY_LIKE_RANGE,    /* my_like_range_mb or my_like_range_generic */
+  my_wildcmp_uca,
+  NULL,                                /* strcasecmp() */
+  my_instr_mb,
+  MY_FUNCTION_NAME(hash_sort_nopad),
+  my_propagate_complex
+};
+
+
+/*
+  Multi-level, PAD SPACE
+  Comparison and strnxfrm use the _multilevel functions;
+  hash_sort is the primary level only function (see its NOTES).
+*/
+MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
+{
+  my_coll_init_uca,
+  MY_FUNCTION_NAME(strnncoll_multilevel),
+  MY_FUNCTION_NAME(strnncollsp_multilevel),
+  MY_FUNCTION_NAME(strnxfrm_multilevel),
+  my_strnxfrmlen_any_uca_multilevel,
+  MY_LIKE_RANGE,
+  my_wildcmp_uca,
+  NULL,                                /* strcasecmp() */
+  my_instr_mb,
+  MY_FUNCTION_NAME(hash_sort),
+  my_propagate_complex
+};
+
+
+/*
+  Multi-level, NO PAD
+  strnxfrm_multilevel is shared with the PAD SPACE handler:
+  it checks MY_CS_NOPAD in cs->state by itself.
+
+  NOTE(review): this handler uses hash_sort (which skips trailing spaces)
+  rather than hash_sort_nopad, unlike the one-level NO PAD handler -
+  presumably intentional, confirm.
+*/
+MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
+{
+  my_coll_init_uca,
+  MY_FUNCTION_NAME(strnncoll_multilevel),
+  MY_FUNCTION_NAME(strnncollsp_nopad_multilevel),
+  MY_FUNCTION_NAME(strnxfrm_multilevel),
+  my_strnxfrmlen_any_uca_multilevel,
+  MY_LIKE_RANGE,
+  my_wildcmp_uca,
+  NULL,                                /* strcasecmp() */
+  my_instr_mb,
+  MY_FUNCTION_NAME(hash_sort),
+  my_propagate_complex
+};
+
+
+#undef MY_FUNCTION_NAME
+#undef MY_MB_WC
+#undef MY_LIKE_RANGE
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@ -1184,35 +1184,7 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
  but the JSON functions needed my_utf16_uni()
  so the #ifdef was moved lower.
 */
-
-
-/*
-  D800..DB7F - Non-provate surrogate high (896 pages)
-  DB80..DBFF - Private surrogate high     (128 pages)
-  DC00..DFFF - Surrogate low              (1024 codes in a page)
-*/
-#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
-#define MY_UTF16_SURROGATE_HIGH_LAST  0xDBFF
-#define MY_UTF16_SURROGATE_LOW_FIRST  0xDC00
-#define MY_UTF16_SURROGATE_LOW_LAST   0xDFFF
-
-#define MY_UTF16_HIGH_HEAD(x)      ((((uchar) (x)) & 0xFC) == 0xD8)
-#define MY_UTF16_LOW_HEAD(x)       ((((uchar) (x)) & 0xFC) == 0xDC)
-/* Test if a byte is a leading byte of a high or low surrogate head: */
-#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
-/* Test if a Unicode code point is a high or low surrogate head */
-#define MY_UTF16_SURROGATE(x)      (((x) & 0xF800) == 0xD800)
-
-#define MY_UTF16_WC2(a, b)         ((a << 8) + b)
-
-/*
-  a= 110110??  (<< 18)
-  b= ????????  (<< 10)
-  c= 110111??  (<<  8)
-  d= ????????  (<<  0)
-*/
-#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \
-                                  ((c & 3) << 8) + d + 0x10000)
+#include "ctype-utf16.h"

 #define IS_MB2_CHAR(b0,b1)       (!MY_UTF16_SURROGATE_HEAD(b0))
 #define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
@ -1261,32 +1233,7 @@ static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
 my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
             my_wc_t *pwc, const uchar *s, const uchar *e)
 {
-  if (s + 2 > e)
-    return MY_CS_TOOSMALL2;
-  
-  /*
-    High bytes: 0xD[89AB] = B'110110??'
-    Low bytes:  0xD[CDEF] = B'110111??'
-    Surrogate mask:  0xFC = B'11111100'
-  */
-
-  if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
-  {
-    if (s + 4 > e)
-      return MY_CS_TOOSMALL4;
-
-    if (!MY_UTF16_LOW_HEAD(s[2]))  /* Broken surrigate pair */
-      return MY_CS_ILSEQ;
-
-    *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
-    return 4;
-  }
-
-  if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
-    return MY_CS_ILSEQ;
-
-  *pwc= MY_UTF16_WC2(s[0], s[1]);
-  return 2;
+  return my_mb_wc_utf16_quick(pwc, s, e);
 }


@ -2109,6 +2056,8 @@ struct charset_info_st my_charset_utf16le_nopad_bin=

 #ifdef HAVE_CHARSET_utf32

+#include "ctype-utf32.h"
+
 /*
  Check is b0 and b1 start a valid UTF32 four-byte sequence.
  Don't accept characters greater than U+10FFFF.
@ -2117,8 +2066,6 @@ struct charset_info_st my_charset_utf16le_nopad_bin=

 #define IS_MB4_CHAR(b0,b1,b2,b3)   (IS_UTF32_MBHEAD4(b0,b1))

-#define MY_UTF32_WC4(b0,b1,b2,b3)  ((((my_wc_t)b0) << 24) + (b1 << 16) + \
-                                                (b2 << 8) + (b3))

 static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
                                             uchar b2, uchar b3)
@ -2161,10 +2108,7 @@ static int
 my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
             my_wc_t *pwc, const uchar *s, const uchar *e)
 {
-  if (s + 4 > e)
-    return MY_CS_TOOSMALL4;
-  *pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
-  return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
+  return my_mb_wc_utf32_quick(pwc, s, e);
 }


@ -2928,6 +2872,8 @@ struct charset_info_st my_charset_utf32_nopad_bin=

 #ifdef HAVE_CHARSET_ucs2

+#include "ctype-ucs2.h"
+
 static const uchar ctype_ucs2[] = {
    0,
   32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
@ -3037,11 +2983,7 @@ my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
 		       my_wc_t * pwc, const uchar *s, const uchar *e)
 {
-  if (s+2 > e) /* Need 2 characters */
-    return MY_CS_TOOSMALL2;
-  
-  *pwc= ((uchar)s[0]) * 256  + ((uchar)s[1]);
-  return 2;
+  return my_mb_wc_ucs2_quick(pwc, s, e);
 }

 static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
--- a/strings/ctype-ucs2.h
+++ b/strings/ctype-ucs2.h
@ -0,0 +1,32 @@
+/*
+  Copyright (c) 2018 MariaDB Corporation
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef _CTYPE_UCS2_H
+#define _CTYPE_UCS2_H
+
+/*
+  Decode one UCS2 character starting at s into *pwc;
+  e points to the end of the available bytes.
+  The two bytes are combined with s[0] as the high byte.
+  Returns 2 on success, or MY_CS_TOOSMALL2 if fewer than
+  2 bytes are available (*pwc is not written in this case).
+*/
+static inline int
+my_mb_wc_ucs2_quick(my_wc_t * pwc, const uchar *s, const uchar *e)
+{
+  if (s+2 > e) /* Need 2 bytes */
+    return MY_CS_TOOSMALL2;
+  *pwc= ((uchar)s[0]) * 256  + ((uchar)s[1]);
+  return 2;
+}
+
+
+#endif /* _CTYPE_UCS2_H */
--- a/strings/ctype-utf16.h
+++ b/strings/ctype-utf16.h
@ -0,0 +1,80 @@
+/*
+  Copyright (c) 2018 MariaDB Corporation
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef _CTYPE_UTF16_H
+#define _CTYPE_UTF16_H
+
+/*
+  D800..DB7F - Non-private surrogate high (896 pages)
+  DB80..DBFF - Private surrogate high     (128 pages)
+  DC00..DFFF - Surrogate low              (1024 codes in a page)
+*/
+#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
+#define MY_UTF16_SURROGATE_HIGH_LAST  0xDBFF
+#define MY_UTF16_SURROGATE_LOW_FIRST  0xDC00
+#define MY_UTF16_SURROGATE_LOW_LAST   0xDFFF
+
+#define MY_UTF16_HIGH_HEAD(x)      ((((uchar) (x)) & 0xFC) == 0xD8) /* byte in D8..DB */
+#define MY_UTF16_LOW_HEAD(x)       ((((uchar) (x)) & 0xFC) == 0xDC) /* byte in DC..DF */
+/* Test if a byte is the leading byte of a high or low surrogate (D8..DF): */
+#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
+/* Test if a Unicode code point is a high or low surrogate (D800..DFFF) */
+#define MY_UTF16_SURROGATE(x)      (((x) & 0xF800) == 0xD800)
+
+#define MY_UTF16_WC2(a, b)         ((a << 8) + b) /* a is the high byte; args are expanded unparenthesized */
+
+/* Combine the four bytes of a surrogate pair (args are expanded unparenthesized):
+  a= 110110??  (<< 18)
+  b= ????????  (<< 10)
+  c= 110111??  (<<  8)
+  d= ????????  (<<  0)
+*/
+#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \
+                                  ((c & 3) << 8) + d + 0x10000)
+
+static inline int
+my_mb_wc_utf16_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
+{ /* Decode one UTF-16 character (2 or 4 bytes, high byte first) from [s, e) into *pwc */
+  if (s + 2 > e) /* Fewer than 2 bytes left */
+    return MY_CS_TOOSMALL2;
+
+  /*
+    High bytes: 0xD[89AB] = B'110110??'
+    Low bytes:  0xD[CDEF] = B'110111??'
+    Surrogate mask:  0xFC = B'11111100'
+  */
+
+  if (MY_UTF16_HIGH_HEAD(*s)) /* High surrogate head: a 4-byte sequence is expected */
+  {
+    if (s + 4 > e) /* The pair is truncated */
+      return MY_CS_TOOSMALL4;
+
+    if (!MY_UTF16_LOW_HEAD(s[2]))  /* Broken surrogate pair */
+      return MY_CS_ILSEQ;
+
+    *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]); /* The macro adds the 0x10000 offset */
+    return 4;
+  }
+
+  if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
+    return MY_CS_ILSEQ;
+
+  *pwc= MY_UTF16_WC2(s[0], s[1]); /* Non-surrogate 2-byte character */
+  return 2;
+}
+
+#endif /* _CTYPE_UTF16_H */
--- a/strings/ctype-utf32.h
+++ b/strings/ctype-utf32.h
@ -0,0 +1,33 @@
+/*
+  Copyright (c) 2018 MariaDB Corporation
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef _CTYPE_UTF32_H
+#define _CTYPE_UTF32_H
+
+#define MY_UTF32_WC4(b0,b1,b2,b3)  ((((my_wc_t)b0) << 24) + (b1 << 16) + \
+                                                (b2 << 8) + (b3)) /* b0 is the most significant byte */
+
+static inline int
+my_mb_wc_utf32_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
+{ /* Decode one 4-byte UTF-32 character (high byte first) from [s, e) into *pwc */
+  if (s + 4 > e) /* Fewer than 4 bytes left: *pwc is not written */
+    return MY_CS_TOOSMALL4;
+  *pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]); /* Stored before the range check below */
+  return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4; /* Values above U+10FFFF are rejected */
+}
+
+#endif /* _CTYPE_UTF32_H */
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@ -26,78 +26,9 @@
 #define EILSEQ ENOENT
 #endif

-/* Detect special bytes and sequences */
-#define IS_CONTINUATION_BYTE(c)   (((uchar) (c) ^ 0x80) < 0x40)

-/*
-  Check MB2 character assuming that b0 is alredy known to be >= 0xC2.
-  Use this macro if the caller already checked b0 for:
-  - an MB1 character
-  - an unused gap between MB1 and MB2HEAD
-*/
-#define IS_UTF8MB2_STEP2(b0,b1)     (((uchar) (b0) < 0xE0) && \
-                                     IS_CONTINUATION_BYTE((uchar) b1))
+#include "ctype-utf8.h"

-/*
-  Check MB3 character assuming that b0 is already known to be
-  in the valid MB3HEAD range [0xE0..0xEF].
-*/
-#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \
-                                    IS_CONTINUATION_BYTE(b2) && \
-                                    ((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0))
-
-/*
-  Check MB3 character assuming that b0 is already known to be >= 0xE0,
-  but is not checked for the high end 0xF0 yet.
-  Use this macro if the caller already checked b0 for:
-  - an MB1 character
-  - an unused gap between MB1 and MB2HEAD
-  - an MB2HEAD
-*/
-#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
-                                    IS_UTF8MB3_STEP2(b0,b1,b2))
-
-/*
-  UTF-8 quick four-byte mask:
-  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-  Encoding allows to encode U+00010000..U+001FFFFF
-
-  The maximum character defined in the Unicode standard is U+0010FFFF.
-  Higher characters U+00110000..U+001FFFFF are not used.
-
-  11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
-  11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
-
-  Valid codes:
-  [F0][90..BF][80..BF][80..BF]
-  [F1][80..BF][80..BF][80..BF]
-  [F2][80..BF][80..BF][80..BF]
-  [F3][80..BF][80..BF][80..BF]
-  [F4][80..8F][80..BF][80..BF]
-*/
-
-/*
-  Check MB4 character assuming that b0 is already
-  known to be in the range [0xF0..0xF4]
-*/
-#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
-                                       IS_CONTINUATION_BYTE(b2) && \
-                                       IS_CONTINUATION_BYTE(b3) && \
-                                       (b0 >= 0xf1 || b1 >= 0x90) && \
-                                       (b0 <= 0xf3 || b1 <= 0x8F))
-#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
-                                       IS_UTF8MB4_STEP2(b0,b1,b2,b3))
-
-/* Convert individual bytes to Unicode code points */
-#define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) b0 & 0x1f) << 6)  |\
-                                   ((my_wc_t) ((uchar) b1 ^ 0x80)))
-#define UTF8MB3_CODE(b0,b1,b2)    (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\
-                                   ((my_wc_t) ((uchar) b1 ^ 0x80) << 6)  |\
-                                   ((my_wc_t) ((uchar) b2 ^ 0x80)))
-#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\
-                                   ((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\
-                                   ((my_wc_t) ((uchar) b2 ^ 0x80) << 6)  |\
-                                    (my_wc_t) ((uchar) b3 ^ 0x80))

 /* Definitions for strcoll.ic */
 #define IS_MB1_CHAR(x)              ((uchar) (x) < 0x80)
@ -4981,42 +4912,7 @@ static const uchar to_upper_utf8[] = {
 static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
                       my_wc_t * pwc, const uchar *s, const uchar *e)
 {
-  uchar c;
-
-  if (s >= e)
-    return MY_CS_TOOSMALL;
-
-  c= s[0];
-  if (c < 0x80)
-  {
-    *pwc = c;
-    return 1;
-  }
-  else if (c < 0xc2)
-    return MY_CS_ILSEQ;
-  else if (c < 0xe0)
-  {
-    if (s+2 > e) /* We need 2 characters */
-      return MY_CS_TOOSMALL2;
-
-    if (!(IS_CONTINUATION_BYTE(s[1])))
-      return MY_CS_ILSEQ;
-
-    *pwc= UTF8MB2_CODE(c, s[1]);
-    return 2;
-  }
-  else if (c < 0xf0)
-  {
-    if (s+3 > e) /* We need 3 characters */
-      return MY_CS_TOOSMALL3;
-
-    if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
-      return MY_CS_ILSEQ;
-
-    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
-    return 3;
-  }
-  return MY_CS_ILSEQ;
+  return my_mb_wc_utf8mb3_quick(pwc, s, e); /* Inline decoder from ctype-utf8.h; cs is not used */
 }


@ -7379,52 +7275,7 @@ static int
 my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
                  my_wc_t * pwc, const uchar *s, const uchar *e)
 {
-  uchar c;
-
-  if (s >= e)
-    return MY_CS_TOOSMALL;
-
-  c= s[0];
-  if (c < 0x80)
-  {
-    *pwc= c;
-    return 1;
-  }
-  else if (c < 0xc2)
-    return MY_CS_ILSEQ;
-  else if (c < 0xe0)
-  {
-    if (s + 2 > e) /* We need 2 characters */
-      return MY_CS_TOOSMALL2;
-
-    if (!(IS_CONTINUATION_BYTE(s[1])))
-      return MY_CS_ILSEQ;
-
-    *pwc= UTF8MB2_CODE(c, s[1]);
-    return 2;
-  }
-  else if (c < 0xf0)
-  {
-    if (s + 3 > e) /* We need 3 characters */
-      return MY_CS_TOOSMALL3;
-
-    if (!IS_UTF8MB3_STEP2(c, s[1], s[2]))
-      return MY_CS_ILSEQ;
-
-    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
-    return 3;
-  }
-  else if (c < 0xf5)
-  {
-    if (s + 4 > e) /* We need 4 characters */
-      return MY_CS_TOOSMALL4;
-
-    if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3]))
-      return MY_CS_ILSEQ;
-    *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
-    return 4;
-  }
-  return MY_CS_ILSEQ;
+  return my_mb_wc_utf8mb4_quick(pwc, s, e); /* Inline decoder from ctype-utf8.h; cs is not used */
 }


--- a/strings/ctype-utf8.h
+++ b/strings/ctype-utf8.h
@ -0,0 +1,190 @@
+/*
+  Copyright (c) 2018 MariaDB Corporation
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; version 2 of the License.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef _CTYPE_UTF8_H
+#define _CTYPE_UTF8_H
+
+/* Detect special bytes and sequences */
+#define IS_CONTINUATION_BYTE(c)   (((uchar) (c) ^ 0x80) < 0x40)
+
+/*
+  Check MB2 character assuming that b0 is already known to be >= 0xC2.
+  Use this macro if the caller already checked b0 for:
+  - an MB1 character
+  - an unused gap between MB1 and MB2HEAD
+*/
+#define IS_UTF8MB2_STEP2(b0,b1)     (((uchar) (b0) < 0xE0) && \
+                                     IS_CONTINUATION_BYTE((uchar) b1))
+
+/*
+  Check MB3 character assuming that b0 is already known to be
+  in the valid MB3HEAD range [0xE0..0xEF].
+*/
+#define IS_UTF8MB3_STEP2(b0,b1,b2) (IS_CONTINUATION_BYTE(b1) && \
+                                    IS_CONTINUATION_BYTE(b2) && \
+                                    ((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0))
+
+/*
+  Check MB3 character assuming that b0 is already known to be >= 0xE0,
+  but is not checked for the high end 0xF0 yet.
+  Use this macro if the caller already checked b0 for:
+  - an MB1 character
+  - an unused gap between MB1 and MB2HEAD
+  - an MB2HEAD
+*/
+#define IS_UTF8MB3_STEP3(b0,b1,b2) (((uchar) (b0) < 0xF0) && \
+                                    IS_UTF8MB3_STEP2(b0,b1,b2))
+
+/*
+  UTF-8 quick four-byte mask:
+  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  This form can encode U+00010000..U+001FFFFF
+
+  The maximum character defined in the Unicode standard is U+0010FFFF.
+  Higher characters U+00110000..U+001FFFFF are not used.
+
+  11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
+  11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+
+  Valid codes:
+  [F0][90..BF][80..BF][80..BF]
+  [F1][80..BF][80..BF][80..BF]
+  [F2][80..BF][80..BF][80..BF]
+  [F3][80..BF][80..BF][80..BF]
+  [F4][80..8F][80..BF][80..BF]
+*/
+
+/*
+  Check MB4 character assuming that b0 is already known to be in the
+  range [0xF0..0xF4]. b0 and b1 are compared uncast: pass uchar values.
+*/
+#define IS_UTF8MB4_STEP2(b0,b1,b2,b3) (IS_CONTINUATION_BYTE(b1) && \
+                                       IS_CONTINUATION_BYTE(b2) && \
+                                       IS_CONTINUATION_BYTE(b3) && \
+                                       (b0 >= 0xf1 || b1 >= 0x90) && \
+                                       (b0 <= 0xf3 || b1 <= 0x8F))
+#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
+                                       IS_UTF8MB4_STEP2(b0,b1,b2,b3))
+
+/* Convert individual bytes to Unicode code points */
+#define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) b0 & 0x1f) << 6)  |\
+                                   ((my_wc_t) ((uchar) b1 ^ 0x80)))
+#define UTF8MB3_CODE(b0,b1,b2)    (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\
+                                   ((my_wc_t) ((uchar) b1 ^ 0x80) << 6)  |\
+                                   ((my_wc_t) ((uchar) b2 ^ 0x80)))
+#define UTF8MB4_CODE(b0,b1,b2,b3) (((my_wc_t) ((uchar) b0 & 0x07) << 18) |\
+                                   ((my_wc_t) ((uchar) b1 ^ 0x80) << 12) |\
+                                   ((my_wc_t) ((uchar) b2 ^ 0x80) << 6)  |\
+                                    (my_wc_t) ((uchar) b3 ^ 0x80))
+
+static inline int
+my_mb_wc_utf8mb3_quick(my_wc_t * pwc, const uchar *s, const uchar *e)
+{ /* Decode one UTF-8 character of 1 to 3 bytes from [s, e) into *pwc */
+  uchar c; /* Leading byte */
+
+  if (s >= e) /* Empty input */
+    return MY_CS_TOOSMALL;
+
+  c= s[0];
+  if (c < 0x80) /* MB1: the byte itself is the code point */
+  {
+    *pwc = c;
+    return 1;
+  }
+  else if (c < 0xc2) /* 0x80..0xC1: unused gap between MB1 and MB2HEAD */
+    return MY_CS_ILSEQ;
+  else if (c < 0xe0) /* MB2HEAD: 0xC2..0xDF */
+  {
+    if (s+2 > e) /* We need 2 bytes */
+      return MY_CS_TOOSMALL2;
+
+    if (!(IS_CONTINUATION_BYTE(s[1])))
+      return MY_CS_ILSEQ;
+
+    *pwc= UTF8MB2_CODE(c, s[1]);
+    return 2;
+  }
+  else if (c < 0xf0) /* MB3HEAD: 0xE0..0xEF */
+  {
+    if (s+3 > e) /* We need 3 bytes */
+      return MY_CS_TOOSMALL3;
+
+    if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) /* Bad continuation, or E0 with s[1] < 0xA0 */
+      return MY_CS_ILSEQ;
+
+    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
+    return 3;
+  }
+  return MY_CS_ILSEQ; /* c >= 0xF0 is not accepted by this 3-byte decoder */
+}
+
+
+#ifdef HAVE_CHARSET_utf8mb4
+static inline int
+my_mb_wc_utf8mb4_quick(my_wc_t *pwc, const uchar *s, const uchar *e)
+{ /* Decode one UTF-8 character of 1 to 4 bytes from [s, e) into *pwc */
+  uchar c; /* Leading byte */
+
+  if (s >= e) /* Empty input */
+    return MY_CS_TOOSMALL;
+
+  c= s[0];
+  if (c < 0x80) /* MB1: the byte itself is the code point */
+  {
+    *pwc= c;
+    return 1;
+  }
+  else if (c < 0xc2) /* 0x80..0xC1: unused gap between MB1 and MB2HEAD */
+    return MY_CS_ILSEQ;
+  else if (c < 0xe0) /* MB2HEAD: 0xC2..0xDF */
+  {
+    if (s + 2 > e) /* We need 2 bytes */
+      return MY_CS_TOOSMALL2;
+
+    if (!(IS_CONTINUATION_BYTE(s[1])))
+      return MY_CS_ILSEQ;
+
+    *pwc= UTF8MB2_CODE(c, s[1]);
+    return 2;
+  }
+  else if (c < 0xf0) /* MB3HEAD: 0xE0..0xEF */
+  {
+    if (s + 3 > e) /* We need 3 bytes */
+      return MY_CS_TOOSMALL3;
+
+    if (!IS_UTF8MB3_STEP2(c, s[1], s[2])) /* Bad continuation, or E0 with s[1] < 0xA0 */
+      return MY_CS_ILSEQ;
+
+    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
+    return 3;
+  }
+  else if (c < 0xf5) /* MB4HEAD: 0xF0..0xF4 */
+  {
+    if (s + 4 > e) /* We need 4 bytes */
+      return MY_CS_TOOSMALL4;
+
+    if (!IS_UTF8MB4_STEP2(c, s[1], s[2], s[3])) /* Bad continuation, F0 with s[1] < 0x90, or F4 with s[1] > 0x8F */
+      return MY_CS_ILSEQ;
+    *pwc= UTF8MB4_CODE(c, s[1], s[2], s[3]);
+    return 4;
+  }
+  return MY_CS_ILSEQ; /* c >= 0xF5 never starts a sequence */
+}
+#endif /* HAVE_CHARSET_utf8mb4*/
+
+
+#endif /* _CTYPE_UTF8_H */