MDEV-30879 Add support for up to BASE 62 to CONV()

Base 62 uses the digits 0-9, then A-Z, then a-z to represent the values 0-61, so letters are case-sensitive for bases above 36. This patch extends the string/number conversion functions to cover that range. Based on ideas and tests in PR #2589, but rewritten into the charset functions. Includes a fix by Sergei for a UBSAN complaint: ctype-simple.c:683:38: runtime error: negation of -9223372036854775808 cannot be represented in type 'long long int'; cast to an unsigned type to negate this value to itself Co-authored-by: Weijun Huang <huangweijun1001@gmail.com> Co-authored-by: Sergei Golubchik <serg@mariadb.org>
2025-01-15 19:42:28 +01:00 · 2023-11-17 17:41:23 +00:00 · 2023-11-17 17:41:23 +00:00 · f552febe43
commit f552febe43
parent be6d48fd53
10 changed files with 160 additions and 22 deletions
--- a/include/m_string.h
+++ b/include/m_string.h
@ -74,6 +74,7 @@ extern "C" {
 #endif

 /* Declared in int2str() */
+extern const char _dig_vec_base62[];
 extern const char _dig_vec_upper[];
 extern const char _dig_vec_lower[];

--- a/mysql-test/main/func_str.result
+++ b/mysql-test/main/func_str.result
@ -1078,8 +1078,8 @@ lpad(12345, 5, "#")
 SELECT conv(71, 10, 36), conv('1Z', 36, 10);
 conv(71, 10, 36)	conv('1Z', 36, 10)
 1Z	71
-SELECT conv(71, 10, 37), conv('1Z', 37, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);
-conv(71, 10, 37)	conv('1Z', 37, 10)	conv(0,1,10)	conv(0,0,10)	conv(0,-1,10)
+SELECT conv(71, 10, 63), conv('1Z', 63, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);
+conv(71, 10, 63)	conv('1Z', 63, 10)	conv(0,1,10)	conv(0,0,10)	conv(0,-1,10)
 NULL	NULL	NULL	NULL	NULL
 create table t1 (id int(1), str varchar(10)) DEFAULT CHARSET=utf8;
 insert into t1 values (1,'aaaaaaaaaa'), (2,'bbbbbbbbbb');
@ -5535,3 +5535,63 @@ aes_encrypt(a,a) is null
 #
 # End of 11.2 tests
 #
+#
+# MDEV-30879 Add conversion to based 62 for CONV function
+#
+SELECT CONV('1z', 62, 10);
+CONV('1z', 62, 10)
+123
+SELECT CONV('1Z', 62, 10);
+CONV('1Z', 62, 10)
+97
+SELECT CONV('-1Z', 62, 10);
+CONV('-1Z', 62, 10)
+18446744073709551519
+SELECT CONV('-1Z', -62, 10);
+CONV('-1Z', -62, 10)
+18446744073709551519
+SELECT CONV('-1Z', 62, -10);
+CONV('-1Z', 62, -10)
+-97
+SELECT CONV('-1Z', -62, -10);
+CONV('-1Z', -62, -10)
+-97
+SELECT CONV('AzL8n0Y58m7', 62, 10);
+CONV('AzL8n0Y58m7', 62, 10)
+9223372036854775807
+SELECT CONV('LygHa16AHYE', 62, 10);
+CONV('LygHa16AHYE', 62, 10)
+18446744073709551614
+SELECT CONV('LygHa16AHYF', 62, 10);
+CONV('LygHa16AHYF', 62, 10)
+18446744073709551615
+SELECT CONV('LygHa16AHZ0', 62, 10);
+CONV('LygHa16AHZ0', 62, 10)
+18446744073709551615
+SELECT CONV('-AzL8n0Y58m7', -62, -10);
+CONV('-AzL8n0Y58m7', -62, -10)
+-9223372036854775807
+SELECT CONV('-AzL8n0Y58m8', -62, -10);
+CONV('-AzL8n0Y58m8', -62, -10)
+-9223372036854775808
+SELECT CONV('-AzL8n0Y58m9', -62, -10);
+CONV('-AzL8n0Y58m9', -62, -10)
+-9223372036854775808
+SELECT CONV('-LygHa16AHZ0', -62, -10);
+CONV('-LygHa16AHZ0', -62, -10)
+-9223372036854775808
+SELECT CONV('LygHa16AHYF', 63, 10);
+CONV('LygHa16AHYF', 63, 10)
+NULL
+SELECT CONV(18446744073709551615, 10, 63);
+CONV(18446744073709551615, 10, 63)
+NULL
+SELECT CONV(18446744073709551615, 10, 62);
+CONV(18446744073709551615, 10, 62)
+LygHa16AHYF
+SELECT CONV(-9223372036854775808, -10, -62);
+CONV(-9223372036854775808, -10, -62)
+-AzL8n0Y58m8
+#
+# End of 11.4 tests
+#
--- a/mysql-test/main/func_str.test
+++ b/mysql-test/main/func_str.test
@ -570,7 +570,7 @@ SELECT lpad(12345, 5, "#");
 #
 
 SELECT conv(71, 10, 36), conv('1Z', 36, 10);
-SELECT conv(71, 10, 37), conv('1Z', 37, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);
+SELECT conv(71, 10, 63), conv('1Z', 63, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);

 #
 # Bug in SUBSTRING when mixed with CONCAT and ORDER BY (Bug #3089)
@ -2481,3 +2481,40 @@ select aes_encrypt(a,a) is null from (values('a'),(NULL),('b')) x;
 --echo #
 --echo # End of 11.2 tests
 --echo #
+
+--echo #
+--echo # MDEV-30879 Add conversion to based 62 for CONV function
+--echo #
+
+SELECT CONV('1z', 62, 10);
+SELECT CONV('1Z', 62, 10);
+
+SELECT CONV('-1Z', 62, 10);
+SELECT CONV('-1Z', -62, 10);
+SELECT CONV('-1Z', 62, -10);
+SELECT CONV('-1Z', -62, -10);
+
+# Check limits
+SELECT CONV('AzL8n0Y58m7', 62, 10);
+SELECT CONV('LygHa16AHYE', 62, 10);
+SELECT CONV('LygHa16AHYF', 62, 10);
+
+# Overflow does not produce a warning; the result is clamped to the maximum value
+SELECT CONV('LygHa16AHZ0', 62, 10);
+
+SELECT CONV('-AzL8n0Y58m7', -62, -10);
+SELECT CONV('-AzL8n0Y58m8', -62, -10);
+SELECT CONV('-AzL8n0Y58m9', -62, -10);
+SELECT CONV('-LygHa16AHZ0', -62, -10);
+
+# Should return NULL: base 63 is out of range
+SELECT CONV('LygHa16AHYF', 63, 10);
+SELECT CONV(18446744073709551615, 10, 63);
+
+# Test conversion from base 10 to base 62
+SELECT CONV(18446744073709551615, 10, 62);
+SELECT CONV(-9223372036854775808, -10, -62);
+
+--echo #
+--echo # End of 11.4 tests
+--echo #
--- a/sql/item_strfunc.cc
+++ b/sql/item_strfunc.cc
@ -3936,8 +3936,8 @@ String *Item_func_conv::val_str(String *str)
  // Note that abs(INT_MIN) is undefined.
  if (args[0]->null_value || args[1]->null_value || args[2]->null_value ||
      from_base == INT_MIN || to_base == INT_MIN ||
-      abs(to_base) > 36 || abs(to_base) < 2 ||
-      abs(from_base) > 36 || abs(from_base) < 2 || !(res->length()))
+      abs(to_base) > 62 || abs(to_base) < 2 ||
+      abs(from_base) > 62 || abs(from_base) < 2 || !(res->length()))
  {
    null_value= 1;
    return NULL;
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@ -451,7 +451,11 @@ long my_strntol_8bit(CHARSET_INFO *cs,
    else if (c>='A' && c<='Z')
      c = c - 'A' + 10;
    else if (c>='a' && c<='z')
+    {
      c = c - 'a' + 10;
+      if (base > 36)
+        c += 26;
+    }
    else
      break;
    if (c >= base)
@ -546,7 +550,11 @@ ulong my_strntoul_8bit(CHARSET_INFO *cs,
    else if (c>='A' && c<='Z')
      c = c - 'A' + 10;
    else if (c>='a' && c<='z')
+    {
      c = c - 'a' + 10;
+      if (base > 36)
+        c += 26;
+    }
    else
      break;
    if (c >= base)
@ -634,7 +642,11 @@ longlong my_strntoll_8bit(CHARSET_INFO *cs __attribute__((unused)),
    else if (c>='A' && c<='Z')
      c = c - 'A' + 10;
    else if (c>='a' && c<='z')
+    {
      c = c - 'a' + 10;
+      if (base > 36)
+        c += 26;
+    }
    else
      break;
    if (c >= base)
@ -656,9 +668,13 @@ longlong my_strntoll_8bit(CHARSET_INFO *cs __attribute__((unused)),

  if (negative)
  {
-    if (i  > (ulonglong) LONGLONG_MIN)
+    if (i >= (ulonglong) LONGLONG_MIN)
+    {
+      if (i == (ulonglong) LONGLONG_MIN)
+        return LONGLONG_MIN;
      overflow = 1;
    }
+  }
  else if (i > (ulonglong) LONGLONG_MAX)
    overflow = 1;

@ -731,7 +747,11 @@ ulonglong my_strntoull_8bit(CHARSET_INFO *cs,
    else if (c>='A' && c<='Z')
      c = c - 'A' + 10;
    else if (c>='a' && c<='z')
+    {
      c = c - 'a' + 10;
+      if (base > 36)
+        c += 26;
+    }
    else
      break;
    if (c >= base)
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@ -462,7 +462,11 @@ bs:
      else if ( wc>='A' && wc<='Z')
        wc = wc - 'A' + 10;
      else if ( wc>='a' && wc<='z')
+      {
        wc = wc - 'a' + 10;
+        if (base > 36)
+          wc += 26;
+      }
      else
        break;
      if ((int)wc >= base)
@ -575,7 +579,11 @@ bs:
      else if ( wc>='A' && wc<='Z')
        wc = wc - 'A' + 10;
      else if ( wc>='a' && wc<='z')
+      {
        wc = wc - 'a' + 10;
+        if (base > 36)
+          wc += 26;
+      }
      else
        break;
      if ((int)wc >= base)
--- a/strings/int2str.c
+++ b/strings/int2str.c
@ -31,6 +31,8 @@
 /*
  _dig_vec arrays are public because they are used in several outer places.
 */
+const char _dig_vec_base62[] =
+  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 const char _dig_vec_upper[] =
  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 const char _dig_vec_lower[] =
@ -50,7 +52,7 @@ const char _dig_vec_lower[] =
  DESCRIPTION
    Converts the (long) integer value to its character form and moves it to 
    the destination buffer followed by a terminating NUL. 
-    If radix is -2..-36, val is taken to be SIGNED, if radix is  2..36, val is
+    If radix is -2..-62, val is taken to be SIGNED, if radix is  2..62, val is
    taken to be UNSIGNED. That is, val is signed if and only if radix is. 
    All other radixes treated as bad and nothing will be changed in this case.

@ -68,12 +70,17 @@ int2str(register long int val, register char *dst, register int radix,
  char buffer[65];
  register char *p;
  long int new_val;
-  const char *dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
+  const char *dig_vec;
  ulong uval= (ulong) val;

+  if (radix < -36 || radix > 36)
+    dig_vec= _dig_vec_base62;
+  else
+    dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
+
  if (radix < 0)
  {
-    if (radix < -36 || radix > -2)
+    if (radix < -62 || radix > -2)
      return NullS;
    if (val < 0)
    {
@ -83,7 +90,7 @@ int2str(register long int val, register char *dst, register int radix,
    }
    radix = -radix;
  }
-  else if (radix > 36 || radix < 2)
+  else if (radix > 62 || radix < 2)
    return NullS;

  /*
--- a/strings/longlong2str.c
+++ b/strings/longlong2str.c
@ -35,8 +35,8 @@
  result is normally a pointer to this NUL character, but if the radix
  is dud the result will be NullS and nothing will be changed.

-  If radix is -2..-36, val is taken to be SIGNED.
-  If radix is  2.. 36, val is taken to be UNSIGNED.
+  If radix is -2..-62, val is taken to be SIGNED.
+  If radix is  2.. 62, val is taken to be UNSIGNED.
  That is, val is signed if and only if radix is.  You will normally
  use radix -10 only through itoa and ltoa, for radix 2, 8, or 16
  unsigned is what you generally want.
@ -63,12 +63,17 @@ char *ll2str(longlong val,char *dst,int radix, int upcase)
  char buffer[65];
  register char *p;
  long long_val;
-  const char *dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
+  const char *dig_vec;
  ulonglong uval= (ulonglong) val;

+  if (radix < -36 || radix > 36)
+    dig_vec= _dig_vec_base62;
+  else
+    dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
+
  if (radix < 0)
  {
-    if (radix < -36 || radix > -2) return (char*) 0;
+    if (radix < -62 || radix > -2) return (char*) 0;
    if (val < 0) {
      *dst++ = '-';
      /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
@ -78,7 +83,7 @@ char *ll2str(longlong val,char *dst,int radix, int upcase)
  }
  else
  {
-    if (radix > 36 || radix < 2) return (char*) 0;
+    if (radix > 62 || radix < 2) return (char*) 0;
  }
  if (uval == 0)
  {
--- a/strings/str2int.c
+++ b/strings/str2int.c
@ -55,9 +55,9 @@
 #include "my_sys.h"			/* defines errno */
 #include <errno.h>

-#define char_val(X) (X >= '0' && X <= '9' ? X-'0' :\
+#define char_val(X, Y) (X >= '0' && X <= '9' ? X-'0' :\
 		     X >= 'A' && X <= 'Z' ? X-'A'+10 :\
-		     X >= 'a' && X <= 'z' ? X-'a'+10 :\
+		     X >= 'a' && X <= 'z' ? (Y <= 36 ? X-'a'+10 : X-'a'+36) :\
 		     '\177')

 char *str2int(register const char *src, register int radix, long int lower,
@ -76,10 +76,10 @@ char *str2int(register const char *src, register int radix, long int lower,

  *val = 0;

-  /*  Check that the radix is in the range 2..36  */
+  /*  Check that the radix is in the range 2..62  */

 #ifndef DBUG_OFF
-  if (radix < 2 || radix > 36) {
+  if (radix < 2 || radix > 62) {
    errno=EDOM;
    return NullS;
  }
@ -126,7 +126,7 @@ char *str2int(register const char *src, register int radix, long int lower,
      to left in order to avoid overflow.  Answer is after last digit.
      */

-  for (n = 0; (digits[n]=char_val(*src)) < radix && n < 20; n++,src++) ;
+  for (n = 0; (digits[n]=char_val(*src, radix)) < radix && n < 20; n++,src++) ;

  /*  Check that there is at least one digit  */

--- a/strings/string.doc
+++ b/strings/string.doc
@ -22,8 +22,8 @@ Speciella anv
  the destination string "dst" followed by a terminating NUL.  The
  result is normally a pointer to this NUL character, but if the radix
  is dud the result will be NullS and nothing will be changed.
-  If radix is -2..-36, val is taken to be SIGNED.
-  If radix is  2.. 36, val is taken to be UNSIGNED.
+  If radix is -2..-62, val is taken to be SIGNED.
+  If radix is  2.. 62, val is taken to be UNSIGNED.
  That is, val is signed if and only if radix is.  You will normally
  use radix -10 only through itoa and ltoa, for radix 2, 8, or 16
  unsigned is what you generally want.