MDEV-4742 - provide function to sort string in "natural" order

The numbers should be compared as numbers,  while the rest should be compared
as string.

Introduce natural_sort_key() function that transforms original string
so that the lexicographic order of such keys is suitable for
natural sort.
This commit is contained in:
Vladislav Vaintroub 2021-07-29 13:28:11 +02:00
parent b4911f5a34
commit 5b29d407f6
5 changed files with 525 additions and 0 deletions

View file

@ -0,0 +1,169 @@
SET NAMES utf8mb4;
SELECT NATURAL_SORT_KEY(NULL);
NATURAL_SORT_KEY(NULL)
NULL
SELECT '' c WHERE 0 UNION VALUES('a10'),('a9'),('a1000'), ('a0'),('b'),('b0') ORDER BY NATURAL_SORT_KEY(c);
c
a0
a9
a10
a1000
b
b0
SELECT NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1));
NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1))
NULL
Warnings:
Warning 1301 Result of natural_sort_key() was larger than max_allowed_packet (16777216) - truncated
CREATE TABLE t1(
c VARCHAR(30) CHARACTER SET latin1 COLLATE latin1_bin,
k VARCHAR(45) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) INVISIBLE,
KEY(k,c)) ENGINE=InnoDB;
INSERT INTO t1 values
('A1'),('a1'),('A100'),('a100'),('A2'),('ä2'),('a2'),('A99'),
('äb'),('B1'),('B100'),('B9'),('C'),('100');
EXPLAIN SELECT c FROM t1 ORDER BY k,c;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 index NULL k 216 NULL 14 Using index
#Natural sort order.
SELECT c FROM t1 ORDER BY k,c;
c
100
A1
a1
A2
a2
ä2
A99
A100
a100
äb
B1
B9
B100
C
#Unnatural but unicode aware) sort order
SELECT c FROM t1 ORDER BY CONVERT(c USING utf8mb4) COLLATE utf8mb4_unicode_ci,c;
c
100
A1
a1
A100
a100
A2
a2
ä2
A99
äb
B1
B100
B9
C
CREATE TABLE t2 AS SELECT c, NATURAL_SORT_KEY(c) FROM t1 WHERE 0;
SHOW CREATE TABLE t2;
Table Create Table
t2 CREATE TABLE `t2` (
`c` varchar(30) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`NATURAL_SORT_KEY(c)` varchar(45) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
DROP TABLE t1,t2;
SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
FROM
(
SELECT 0 val
UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001')
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20
) AS numbers ORDER BY sortkey;
value sortkey encoding_overhead
0 00 1
0,15 00,115 2
0.0 00.00 2
0.1 00.01 2
1 01 1
1.1 01.01 2
1.001 01.011 1
1.02 01.020 2
1.002 01.021 1
1.3 01.03 2
1.010 01.1100 2
01 010 1
0001 012 -1
10 110 1
100 2100 1
1000 31000 1
10000 410000 1
100000 5100000 1
1000000 61000000 1
10000000 710000000 1
100000000 8100000000 1
1000000000 901000000000 2
10000000000 9110000000000 2
100000000000 92100000000000 2
1000000000000 931000000000000 2
10000000000000 9410000000000000 2
100000000000000 95100000000000000 2
1000000000000000 961000000000000000 2
10000000000000000 9710000000000000000 2
100000000000000000 98100000000000000000 2
1000000000000000000 9901000000000000000000 3
10000000000000000000 9912010000000000000000000 5
100000000000000000000 99121100000000000000000000 5
# Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY
SELECT val value
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL);
value
0
0,1
0.1
1
1,1
1,001
1,02
1,002
1,3
1,010
1.1
1.001
1.02
1.002
1.3
1.010
# Use ',' as decimal separator for NATURAL_SORT_KEY
SELECT val value, NATURAL_SORT_KEY(val,',') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY sortkey;
value sortkey
0 00
0,1 00,1
1 01
1,001 01,001
1,002 01,002
1,010 01,010
1,02 01,02
1,1 01,1
1,3 01,3
# Use '.' as decimal separator for NATURAL_SORT_KEY
SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'))
AS numbers ORDER BY sortkey;
value sortkey
0 00
0.1 00.1
1 01
1.001 01.001
1.002 01.002
1.010 01.010
1.02 01.02
1.1 01.1
1.3 01.3
SET NAMES DEFAULT;

View file

@ -0,0 +1,64 @@
--source include/have_innodb.inc
--source include/have_sequence.inc
SET NAMES utf8mb4;
SELECT NATURAL_SORT_KEY(NULL);
# Sort without tables
SELECT '' c WHERE 0 UNION VALUES('a10'),('a9'),('a1000'), ('a0'),('b'),('b0') ORDER BY NATURAL_SORT_KEY(c);
#Test that max packet overflow produces NULL plus warning
SELECT NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1));
#Test with virtual(index only) key
CREATE TABLE t1(
c VARCHAR(30) CHARACTER SET latin1 COLLATE latin1_bin,
k VARCHAR(45) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) INVISIBLE,
KEY(k,c)) ENGINE=InnoDB;
INSERT INTO t1 values
('A1'),('a1'),('A100'),('a100'),('A2'),('ä2'),('a2'),('A99'),
('äb'),('B1'),('B100'),('B9'),('C'),('100');
EXPLAIN SELECT c FROM t1 ORDER BY k,c;
-- echo #Natural sort order.
# We sort by 2 colums, for stable sort,as we do not currenly have a case and accent insensitive Unicode collation.
SELECT c FROM t1 ORDER BY k,c;
-- echo #Unnatural but unicode aware) sort order
SELECT c FROM t1 ORDER BY CONVERT(c USING utf8mb4) COLLATE utf8mb4_unicode_ci,c;
# CREATE TABLE AS SELECT, to see that length of the column is correct.
CREATE TABLE t2 AS SELECT c, NATURAL_SORT_KEY(c) FROM t1 WHERE 0;
SHOW CREATE TABLE t2;
DROP TABLE t1,t2;
#Show encoding of numbers, including fractions, and leading whitespace.
SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
FROM
(
SELECT 0 val
UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001')
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20
) AS numbers ORDER BY sortkey;
--echo # Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY
SELECT val value
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL);
--echo # Use ',' as decimal separator for NATURAL_SORT_KEY
SELECT val value, NATURAL_SORT_KEY(val,',') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1'))
AS numbers ORDER BY sortkey;
--echo # Use '.' as decimal separator for NATURAL_SORT_KEY
SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey
FROM
(
SELECT 0 val
UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'))
AS numbers ORDER BY sortkey;
SET NAMES DEFAULT;

View file

@ -1622,6 +1622,16 @@ protected:
virtual ~Create_func_name_const() {}
};
class Create_func_natural_sort_key : public Create_native_func
{
public:
virtual Item *create_native(THD *thd, LEX_CSTRING *name,
List<Item> *item_list) override;
static Create_func_natural_sort_key s_singleton;
protected:
Create_func_natural_sort_key() {}
virtual ~Create_func_natural_sort_key() {}
};
class Create_func_nullif : public Create_func_arg2
{
@ -4652,6 +4662,36 @@ Create_func_md5::create_1_arg(THD *thd, Item *arg1)
return new (thd->mem_root) Item_func_md5(thd, arg1);
}
Create_func_natural_sort_key Create_func_natural_sort_key::s_singleton;
Item *Create_func_natural_sort_key::create_native(THD *thd, LEX_CSTRING *name,
List<Item> *item_list)
{
Item *func= NULL;
int arg_count= 0;
if (item_list != NULL)
arg_count= item_list->elements;
Item *param_1, *param_2;
switch (arg_count)
{
case 1:
param_1= item_list->pop();
func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1);
break;
case 2:
param_1= item_list->pop();
param_2= item_list->pop();
func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1, param_2);
break;
default:
my_error(ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT, MYF(0), name->str);
break;
}
return func;
}
Create_func_monthname Create_func_monthname::s_singleton;
@ -5678,6 +5718,7 @@ Native_func_registry func_array[] =
{ { STRING_WITH_LEN("MD5") }, BUILDER(Create_func_md5)},
{ { STRING_WITH_LEN("MONTHNAME") }, BUILDER(Create_func_monthname)},
{ { STRING_WITH_LEN("NAME_CONST") }, BUILDER(Create_func_name_const)},
{ {STRING_WITH_LEN("NATURAL_SORT_KEY")}, BUILDER(Create_func_natural_sort_key)},
{ { STRING_WITH_LEN("NVL") }, BUILDER(Create_func_ifnull)},
{ { STRING_WITH_LEN("NVL2") }, BUILDER(Create_func_nvl2)},
{ { STRING_WITH_LEN("NULLIF") }, BUILDER(Create_func_nullif)},

View file

@ -5439,6 +5439,232 @@ String *Item_temptable_rowid::val_str(String *str)
return &str_value;
}
/**
Here is how we generate a key suitable for lexicographic comparison
from a numeric string (representing positive number).
We prepend variable length prefix, that only encodes string length
The longer the string, the lexicographically larger is the prefix.
Rules for generating prefix is following.
1. Small length (1 to 9)
Prefix is single char '0' + length -1
This gives the range '0' - '8'
2. Medium length(10 to 18)
Prefix is 2 chars - concat('9', '0'+length-10)'
This gives range '90'-'98'
3. Large length(19)
Prefix is "990"
4. Huge length (20 to 2^32-1)
Prefix stars with '99', then log10 of the length is added as char
then the number itself is added
The range is '99120' - '9994294967295'
*/
#define NATSORT_BUFSIZE 16
static size_t natsort_num2str(size_t n, char *s)
{
if (n < 10)
{
s[0]= '0' + (char)n -1;
return 1;
}
if (n < 19)
{
s[0]= '9';
s[1]= '0'+ (char)n- 10;
return 2;
}
if (n == 19)
{
s[0]= '9';
s[1]= '9';
s[2]= '0';
return 3;
}
/* Here, n > 19*/
s[0]= '9';
s[1]= '9';
size_t log10n= 0;
for (size_t tmp= n/10; tmp; tmp /= 10)
log10n++;
DBUG_ASSERT(log10n && log10n < 10);
s[2]='0' + (char)log10n;
return ll2str(n,s+3,10,0) - s;
}
/**
Convert a string to natural sort key.
@param[in] in - input string
@param[out] s - output string
We assume that memory is preallocated for the output
string, so that appends do not fail.
The lead zeros count, if positive, is stored after
the numeric string.
Fractional parts are stored and compared as strings.
Even if they have numbers in them, they should just
be compared as strings.
*/
static void to_natsort_key(const String *in, String *s, my_wc_t decimal_sep)
{
CHARSET_INFO *cs= in->charset();
uchar *pos= (uchar *) in->ptr();
uchar *end= pos + in->length();
size_t num_len= 0;
size_t leading_zeros= 0;
uchar *num_start= nullptr;
bool in_fraction= false;
size_t fraction_len=0;
for (;;)
{
my_wc_t c= 0;
int charlen= cs->mb_wc(&c, pos, end);
bool is_digit= charlen >= 1 && (c >= '0' && c <= '9');
if (!is_digit && (num_start || leading_zeros))
{
/*
Handle end of digits run.
Write prefix, natsort_num2str(length) is without leading zeros
Write numeric value, without leading zeros
If there were leading zeros, write variable length suffix
*/
char buf[NATSORT_BUFSIZE];
if (!num_len)
{
// number was zeros only
leading_zeros--;
num_len= 1;
}
s->append(buf, natsort_num2str(num_len, buf));
if (!num_start)
s->append('0');
else
s->append((const char *) num_start, pos - num_start, cs);
if (leading_zeros)
s->append(buf, natsort_num2str(leading_zeros, buf));
/* Reset state.*/
num_len= 0;
num_start= nullptr;
leading_zeros= 0;
in_fraction= c == decimal_sep;
fraction_len= 0;
}
if (charlen < 1)
break;
if (is_digit && !in_fraction)
{
/* Do not count leading zeros in num_len, and num_pos */
if (c != '0' || num_len)
{
num_len++;
if (!num_start)
num_start= pos;
}
else
leading_zeros++;
}
else
{
/* Append a non-digit, or fractional part of the number (handling the same as string) */
s->append((const char *) pos, charlen);
if (in_fraction)
{
if (!is_digit && fraction_len)
in_fraction= false;
fraction_len++;
}
}
pos+= charlen;
}
}
String *Item_func_natural_sort_key::val_str(String *s)
{
if (args[0]->is_null())
{
null_value= true;
return nullptr;
}
String *in= args[0]->val_str();
CHARSET_INFO *cs= in->charset();
ulong max_allowed_packet= current_thd->variables.max_allowed_packet;
size_t reserve_length= (size_t) in->length() + in->length() / 2 + 2;
if (s->alloc((uint32)reserve_length))
goto error_exit;
s->length(0);
s->set_charset(cs);
to_natsort_key(in, s, decimal_separator());
DBUG_ASSERT(s->length() < reserve_length);
if (s->length() > max_allowed_packet)
{
push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN,
ER_WARN_ALLOWED_PACKET_OVERFLOWED,
ER(ER_WARN_ALLOWED_PACKET_OVERFLOWED), func_name(),
max_allowed_packet);
goto error_exit;
}
return s;
error_exit:
null_value=true;
return nullptr;
}
my_wc_t Item_func_natural_sort_key::decimal_separator()
{
if (m_decimal_separator != DECIMAL_SEP_UNDEFINED)
return m_decimal_separator;
DBUG_ASSERT(arg_count > 1);
String *s= args[1]->val_str();
if (s && s->length())
{
my_wc_t c;
if (s->charset()->mb_wc(&c, (const uchar *) s->ptr(),
(const uchar *) s->end()))
return c;
}
return DECIMAL_SEP_NONE;
}
bool Item_func_natural_sort_key::fix_length_and_dec(void)
{
if (agg_arg_charsets_for_string_result(collation, args, 1))
return true;
DBUG_ASSERT(collation.collation != NULL);
uint32 max_clen= args[0]->max_char_length();
fix_char_length(max_clen + max_clen/2 + ((max_clen <= 3)?1:0));
set_maybe_null();
if (arg_count> 1 && args[1]->can_eval_in_optimize())
{
/* Initialize decimal separator */
m_decimal_separator= decimal_separator();
}
return false;
}
#ifdef WITH_WSREP
#include "wsrep_mysqld.h"

View file

@ -271,6 +271,31 @@ public:
{ return get_item_copy<Item_func_aes_decrypt>(thd, this); }
};
class Item_func_natural_sort_key : public Item_str_func
{
my_wc_t m_decimal_separator;
static constexpr my_wc_t DECIMAL_SEP_UNDEFINED=ULONG_MAX-1;
static constexpr my_wc_t DECIMAL_SEP_NONE= ULONG_MAX;
public:
Item_func_natural_sort_key(THD *thd, Item *a)
: Item_str_func(thd, a), m_decimal_separator(DECIMAL_SEP_NONE){}
Item_func_natural_sort_key(THD *thd, Item *a, Item *b)
: Item_str_func(thd, a, b), m_decimal_separator(DECIMAL_SEP_UNDEFINED){}
my_wc_t decimal_separator();
String *val_str(String *) override;
LEX_CSTRING func_name_cstring() const override
{
static LEX_CSTRING name= {STRING_WITH_LEN("natural_sort_key")};
return name;
}
bool fix_length_and_dec(void) override;
Item *get_copy(THD *thd) override
{
return get_item_copy<Item_func_natural_sort_key>(thd, this);
}
};
class Item_func_concat :public Item_str_func
{