MDEV-30879 Add support for up to BASE 62 to CONV()

BASE 62 uses 0-9, A-Z and then a-z to give the numbers 0-61. This patch
increases the range of the string functions to cover this.

Based on ideas and tests in PR #2589, but re-written into the charset
functions.

Includes a fix by Sergei for the following UBSAN complaint:
ctype-simple.c:683:38: runtime error: negation of -9223372036854775808
cannot be represented in type 'long long int'; cast to an unsigned
type to negate this value to itself

Co-authored-by: Weijun Huang <huangweijun1001@gmail.com>
Co-authored-by: Sergei Golubchik <serg@mariadb.org>
This commit is contained in:
Andrew Hutchings 2023-11-17 17:41:23 +00:00 committed by Andrew Hutchings
parent be6d48fd53
commit f552febe43
10 changed files with 160 additions and 22 deletions

View file

@ -74,6 +74,7 @@ extern "C" {
#endif
/* Declared in int2str() */
extern const char _dig_vec_base62[];
extern const char _dig_vec_upper[];
extern const char _dig_vec_lower[];

View file

@ -1078,8 +1078,8 @@ lpad(12345, 5, "#")
SELECT conv(71, 10, 36), conv('1Z', 36, 10);
conv(71, 10, 36) conv('1Z', 36, 10)
1Z 71
SELECT conv(71, 10, 37), conv('1Z', 37, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);
conv(71, 10, 37) conv('1Z', 37, 10) conv(0,1,10) conv(0,0,10) conv(0,-1,10)
SELECT conv(71, 10, 63), conv('1Z', 63, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);
conv(71, 10, 63) conv('1Z', 63, 10) conv(0,1,10) conv(0,0,10) conv(0,-1,10)
NULL NULL NULL NULL NULL
create table t1 (id int(1), str varchar(10)) DEFAULT CHARSET=utf8;
insert into t1 values (1,'aaaaaaaaaa'), (2,'bbbbbbbbbb');
@ -5535,3 +5535,63 @@ aes_encrypt(a,a) is null
#
# End of 11.2 tests
#
#
# MDEV-30879 Add conversion to based 62 for CONV function
#
SELECT CONV('1z', 62, 10);
CONV('1z', 62, 10)
123
SELECT CONV('1Z', 62, 10);
CONV('1Z', 62, 10)
97
SELECT CONV('-1Z', 62, 10);
CONV('-1Z', 62, 10)
18446744073709551519
SELECT CONV('-1Z', -62, 10);
CONV('-1Z', -62, 10)
18446744073709551519
SELECT CONV('-1Z', 62, -10);
CONV('-1Z', 62, -10)
-97
SELECT CONV('-1Z', -62, -10);
CONV('-1Z', -62, -10)
-97
SELECT CONV('AzL8n0Y58m7', 62, 10);
CONV('AzL8n0Y58m7', 62, 10)
9223372036854775807
SELECT CONV('LygHa16AHYE', 62, 10);
CONV('LygHa16AHYE', 62, 10)
18446744073709551614
SELECT CONV('LygHa16AHYF', 62, 10);
CONV('LygHa16AHYF', 62, 10)
18446744073709551615
SELECT CONV('LygHa16AHZ0', 62, 10);
CONV('LygHa16AHZ0', 62, 10)
18446744073709551615
SELECT CONV('-AzL8n0Y58m7', -62, -10);
CONV('-AzL8n0Y58m7', -62, -10)
-9223372036854775807
SELECT CONV('-AzL8n0Y58m8', -62, -10);
CONV('-AzL8n0Y58m8', -62, -10)
-9223372036854775808
SELECT CONV('-AzL8n0Y58m9', -62, -10);
CONV('-AzL8n0Y58m9', -62, -10)
-9223372036854775808
SELECT CONV('-LygHa16AHZ0', -62, -10);
CONV('-LygHa16AHZ0', -62, -10)
-9223372036854775808
SELECT CONV('LygHa16AHYF', 63, 10);
CONV('LygHa16AHYF', 63, 10)
NULL
SELECT CONV(18446744073709551615, 10, 63);
CONV(18446744073709551615, 10, 63)
NULL
SELECT CONV(18446744073709551615, 10, 62);
CONV(18446744073709551615, 10, 62)
LygHa16AHYF
SELECT CONV(-9223372036854775808, -10, -62);
CONV(-9223372036854775808, -10, -62)
-AzL8n0Y58m8
#
# End of 11.4 tests
#

View file

@ -570,7 +570,7 @@ SELECT lpad(12345, 5, "#");
#
SELECT conv(71, 10, 36), conv('1Z', 36, 10);
SELECT conv(71, 10, 37), conv('1Z', 37, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);
SELECT conv(71, 10, 63), conv('1Z', 63, 10), conv(0,1,10),conv(0,0,10), conv(0,-1,10);
#
# Bug in SUBSTRING when mixed with CONCAT and ORDER BY (Bug #3089)
@ -2481,3 +2481,40 @@ select aes_encrypt(a,a) is null from (values('a'),(NULL),('b')) x;
--echo #
--echo # End of 11.2 tests
--echo #
--echo #
--echo # MDEV-30879 Add conversion to based 62 for CONV function
--echo #
SELECT CONV('1z', 62, 10);
SELECT CONV('1Z', 62, 10);
SELECT CONV('-1Z', 62, 10);
SELECT CONV('-1Z', -62, 10);
SELECT CONV('-1Z', 62, -10);
SELECT CONV('-1Z', -62, -10);
# Check limits
SELECT CONV('AzL8n0Y58m7', 62, 10);
SELECT CONV('LygHa16AHYE', 62, 10);
SELECT CONV('LygHa16AHYF', 62, 10);
# Out-of-range input does not appear to raise a warning; the result is clamped to the 64-bit limit
SELECT CONV('LygHa16AHZ0', 62, 10);
SELECT CONV('-AzL8n0Y58m7', -62, -10);
SELECT CONV('-AzL8n0Y58m8', -62, -10);
SELECT CONV('-AzL8n0Y58m9', -62, -10);
SELECT CONV('-LygHa16AHZ0', -62, -10);
# Base 63 is out of range, so these should return NULL
SELECT CONV('LygHa16AHYF', 63, 10);
SELECT CONV(18446744073709551615, 10, 63);
# Test 10 -> 62
SELECT CONV(18446744073709551615, 10, 62);
SELECT CONV(-9223372036854775808, -10, -62);
--echo #
--echo # End of 11.4 tests
--echo #

View file

@ -3936,8 +3936,8 @@ String *Item_func_conv::val_str(String *str)
// Note that abs(INT_MIN) is undefined.
if (args[0]->null_value || args[1]->null_value || args[2]->null_value ||
from_base == INT_MIN || to_base == INT_MIN ||
abs(to_base) > 36 || abs(to_base) < 2 ||
abs(from_base) > 36 || abs(from_base) < 2 || !(res->length()))
abs(to_base) > 62 || abs(to_base) < 2 ||
abs(from_base) > 62 || abs(from_base) < 2 || !(res->length()))
{
null_value= 1;
return NULL;

View file

@ -451,7 +451,11 @@ long my_strntol_8bit(CHARSET_INFO *cs,
else if (c>='A' && c<='Z')
c = c - 'A' + 10;
else if (c>='a' && c<='z')
{
c = c - 'a' + 10;
if (base > 36)
c += 26;
}
else
break;
if (c >= base)
@ -546,7 +550,11 @@ ulong my_strntoul_8bit(CHARSET_INFO *cs,
else if (c>='A' && c<='Z')
c = c - 'A' + 10;
else if (c>='a' && c<='z')
{
c = c - 'a' + 10;
if (base > 36)
c += 26;
}
else
break;
if (c >= base)
@ -634,7 +642,11 @@ longlong my_strntoll_8bit(CHARSET_INFO *cs __attribute__((unused)),
else if (c>='A' && c<='Z')
c = c - 'A' + 10;
else if (c>='a' && c<='z')
{
c = c - 'a' + 10;
if (base > 36)
c += 26;
}
else
break;
if (c >= base)
@ -656,9 +668,13 @@ longlong my_strntoll_8bit(CHARSET_INFO *cs __attribute__((unused)),
if (negative)
{
if (i > (ulonglong) LONGLONG_MIN)
if (i >= (ulonglong) LONGLONG_MIN)
{
if (i == (ulonglong) LONGLONG_MIN)
return LONGLONG_MIN;
overflow = 1;
}
}
else if (i > (ulonglong) LONGLONG_MAX)
overflow = 1;
@ -731,7 +747,11 @@ ulonglong my_strntoull_8bit(CHARSET_INFO *cs,
else if (c>='A' && c<='Z')
c = c - 'A' + 10;
else if (c>='a' && c<='z')
{
c = c - 'a' + 10;
if (base > 36)
c += 26;
}
else
break;
if (c >= base)

View file

@ -462,7 +462,11 @@ bs:
else if ( wc>='A' && wc<='Z')
wc = wc - 'A' + 10;
else if ( wc>='a' && wc<='z')
{
wc = wc - 'a' + 10;
if (base > 36)
wc += 26;
}
else
break;
if ((int)wc >= base)
@ -575,7 +579,11 @@ bs:
else if ( wc>='A' && wc<='Z')
wc = wc - 'A' + 10;
else if ( wc>='a' && wc<='z')
{
wc = wc - 'a' + 10;
if (base > 36)
wc += 26;
}
else
break;
if ((int)wc >= base)

View file

@ -31,6 +31,8 @@
/*
_dig_vec arrays are public because they are used in several outer places.
*/
/*
  Digit alphabet selected when the radix is above 36 (or below -36):
  '0'-'9' stand for 0..9, 'A'-'Z' for 10..35 and 'a'-'z' for 36..61,
  so letter case is significant with this table.
*/
const char _dig_vec_base62[] =
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
const char _dig_vec_upper[] =
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
const char _dig_vec_lower[] =
@ -50,7 +52,7 @@ const char _dig_vec_lower[] =
DESCRIPTION
Converts the (long) integer value to its character form and moves it to
the destination buffer followed by a terminating NUL.
If radix is -2..-36, val is taken to be SIGNED, if radix is 2..36, val is
If radix is -2..-62, val is taken to be SIGNED, if radix is 2..62, val is
taken to be UNSIGNED. That is, val is signed if and only if radix is.
All other radixes treated as bad and nothing will be changed in this case.
@ -68,12 +70,17 @@ int2str(register long int val, register char *dst, register int radix,
char buffer[65];
register char *p;
long int new_val;
const char *dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
const char *dig_vec;
ulong uval= (ulong) val;
if (radix < -36 || radix > 36)
dig_vec= _dig_vec_base62;
else
dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
if (radix < 0)
{
if (radix < -36 || radix > -2)
if (radix < -62 || radix > -2)
return NullS;
if (val < 0)
{
@ -83,7 +90,7 @@ int2str(register long int val, register char *dst, register int radix,
}
radix = -radix;
}
else if (radix > 36 || radix < 2)
else if (radix > 62 || radix < 2)
return NullS;
/*

View file

@ -35,8 +35,8 @@
result is normally a pointer to this NUL character, but if the radix
is dud the result will be NullS and nothing will be changed.
If radix is -2..-36, val is taken to be SIGNED.
If radix is 2.. 36, val is taken to be UNSIGNED.
If radix is -2..-62, val is taken to be SIGNED.
If radix is 2.. 62, val is taken to be UNSIGNED.
That is, val is signed if and only if radix is. You will normally
use radix -10 only through itoa and ltoa, for radix 2, 8, or 16
unsigned is what you generally want.
@ -63,12 +63,17 @@ char *ll2str(longlong val,char *dst,int radix, int upcase)
char buffer[65];
register char *p;
long long_val;
const char *dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
const char *dig_vec;
ulonglong uval= (ulonglong) val;
if (radix < -36 || radix > 36)
dig_vec= _dig_vec_base62;
else
dig_vec= upcase ? _dig_vec_upper : _dig_vec_lower;
if (radix < 0)
{
if (radix < -36 || radix > -2) return (char*) 0;
if (radix < -62 || radix > -2) return (char*) 0;
if (val < 0) {
*dst++ = '-';
/* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
@ -78,7 +83,7 @@ char *ll2str(longlong val,char *dst,int radix, int upcase)
}
else
{
if (radix > 36 || radix < 2) return (char*) 0;
if (radix > 62 || radix < 2) return (char*) 0;
}
if (uval == 0)
{

View file

@ -55,9 +55,9 @@
#include "my_sys.h" /* defines errno */
#include <errno.h>
#define char_val(X) (X >= '0' && X <= '9' ? X-'0' :\
#define char_val(X, Y) (X >= '0' && X <= '9' ? X-'0' :\
X >= 'A' && X <= 'Z' ? X-'A'+10 :\
X >= 'a' && X <= 'z' ? X-'a'+10 :\
X >= 'a' && X <= 'z' ? (Y <= 36 ? X-'a'+10 : X-'a'+36) :\
'\177')
char *str2int(register const char *src, register int radix, long int lower,
@ -76,10 +76,10 @@ char *str2int(register const char *src, register int radix, long int lower,
*val = 0;
/* Check that the radix is in the range 2..36 */
/* Check that the radix is in the range 2..62 */
#ifndef DBUG_OFF
if (radix < 2 || radix > 36) {
if (radix < 2 || radix > 62) {
errno=EDOM;
return NullS;
}
@ -126,7 +126,7 @@ char *str2int(register const char *src, register int radix, long int lower,
to left in order to avoid overflow. Answer is after last digit.
*/
for (n = 0; (digits[n]=char_val(*src)) < radix && n < 20; n++,src++) ;
for (n = 0; (digits[n]=char_val(*src, radix)) < radix && n < 20; n++,src++) ;
/* Check that there is at least one digit */

View file

@ -22,8 +22,8 @@ Speciella anv
the destination string "dst" followed by a terminating NUL. The
result is normally a pointer to this NUL character, but if the radix
is dud the result will be NullS and nothing will be changed.
If radix is -2..-36, val is taken to be SIGNED.
If radix is 2.. 36, val is taken to be UNSIGNED.
If radix is -2..-62, val is taken to be SIGNED.
If radix is 2.. 62, val is taken to be UNSIGNED.
That is, val is signed if and only if radix is. You will normally
use radix -10 only through itoa and ltoa, for radix 2, 8, or 16
unsigned is what you generally want.