diff --git a/mysql-test/main/natural_sort_key.result b/mysql-test/main/natural_sort_key.result new file mode 100644 index 00000000000..6cfa04d76fa --- /dev/null +++ b/mysql-test/main/natural_sort_key.result @@ -0,0 +1,169 @@ +SET NAMES utf8mb4; +SELECT NATURAL_SORT_KEY(NULL); +NATURAL_SORT_KEY(NULL) +NULL +SELECT '' c WHERE 0 UNION VALUES('a10'),('a9'),('a1000'), ('a0'),('b'),('b0') ORDER BY NATURAL_SORT_KEY(c); +c +a0 +a9 +a10 +a1000 +b +b0 +SELECT NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1)); +NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1)) +NULL +Warnings: +Warning 1301 Result of natural_sort_key() was larger than max_allowed_packet (16777216) - truncated +CREATE TABLE t1( +c VARCHAR(30) CHARACTER SET latin1 COLLATE latin1_bin, +k VARCHAR(45) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) INVISIBLE, +KEY(k,c)) ENGINE=InnoDB; +INSERT INTO t1 values +('A1'),('a1'),('A100'),('a100'),('A2'),('ä2'),('a2'),('A99'), +('äb'),('B1'),('B100'),('B9'),('C'),('100'); +EXPLAIN SELECT c FROM t1 ORDER BY k,c; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index NULL k 216 NULL 14 Using index +#Natural sort order. +SELECT c FROM t1 ORDER BY k,c; +c +100 +A1 +a1 +A2 +a2 +ä2 +A99 +A100 +a100 +äb +B1 +B9 +B100 +C +#Unnatural but unicode aware) sort order +SELECT c FROM t1 ORDER BY CONVERT(c USING utf8mb4) COLLATE utf8mb4_unicode_ci,c; +c +100 +A1 +a1 +A100 +a100 +A2 +a2 +ä2 +A99 +äb +B1 +B100 +B9 +C +CREATE TABLE t2 AS SELECT c, NATURAL_SORT_KEY(c) FROM t1 WHERE 0; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `c` varchar(30) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL, + `NATURAL_SORT_KEY(c)` varchar(45) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +DROP TABLE t1,t2; +SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead +FROM +( +SELECT 0 val +UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001') +UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20 +) AS numbers ORDER BY sortkey; +value sortkey encoding_overhead +0 00 1 +0,15 00,115 2 +0.0 00.00 2 +0.1 00.01 2 +1 01 1 +1.1 01.01 2 +1.001 01.011 1 +1.02 01.020 2 +1.002 01.021 1 +1.3 01.03 2 +1.010 01.1100 2 +01 010 1 +0001 012 -1 +10 110 1 +100 2100 1 +1000 31000 1 +10000 410000 1 +100000 5100000 1 +1000000 61000000 1 +10000000 710000000 1 +100000000 8100000000 1 +1000000000 901000000000 2 +10000000000 9110000000000 2 +100000000000 92100000000000 2 +1000000000000 931000000000000 2 +10000000000000 9410000000000000 2 +100000000000000 95100000000000000 2 +1000000000000000 961000000000000000 2 +10000000000000000 9710000000000000000 2 +100000000000000000 98100000000000000000 2 +1000000000000000000 9901000000000000000000 3 +10000000000000000000 9912010000000000000000000 5 +100000000000000000000 99121100000000000000000000 5 +# Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY +SELECT val value +FROM +( +SELECT 0 val +UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) +AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL); +value +0 +0,1 +0.1 +1 +1,1 +1,001 +1,02 +1,002 +1,3 +1,010 +1.1 +1.001 +1.02 +1.002 +1.3 +1.010 +# Use ',' as decimal separator for NATURAL_SORT_KEY +SELECT val value, NATURAL_SORT_KEY(val,',') sortkey +FROM +( +SELECT 0 val +UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) +AS numbers ORDER BY sortkey; +value sortkey +0 00 +0,1 00,1 +1 01 +1,001 01,001 +1,002 01,002 +1,010 01,010 +1,02 01,02 +1,1 01,1 +1,3 01,3 +# Use '.' as decimal separator for NATURAL_SORT_KEY +SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey +FROM +( +SELECT 0 val +UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1')) +AS numbers ORDER BY sortkey; +value sortkey +0 00 +0.1 00.1 +1 01 +1.001 01.001 +1.002 01.002 +1.010 01.010 +1.02 01.02 +1.1 01.1 +1.3 01.3 +SET NAMES DEFAULT; diff --git a/mysql-test/main/natural_sort_key.test b/mysql-test/main/natural_sort_key.test new file mode 100644 index 00000000000..bee0d8e76bd --- /dev/null +++ b/mysql-test/main/natural_sort_key.test @@ -0,0 +1,64 @@ +--source include/have_innodb.inc +--source include/have_sequence.inc + +SET NAMES utf8mb4; +SELECT NATURAL_SORT_KEY(NULL); + +# Sort without tables +SELECT '' c WHERE 0 UNION VALUES('a10'),('a9'),('a1000'), ('a0'),('b'),('b0') ORDER BY NATURAL_SORT_KEY(c); + +#Test that max packet overflow produces NULL plus warning +SELECT NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1)); + +#Test with virtual(index only) key +CREATE TABLE t1( + c VARCHAR(30) CHARACTER SET latin1 COLLATE latin1_bin, + k VARCHAR(45) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) INVISIBLE, + KEY(k,c)) ENGINE=InnoDB; +INSERT INTO t1 values + ('A1'),('a1'),('A100'),('a100'),('A2'),('ä2'),('a2'),('A99'), + ('äb'),('B1'),('B100'),('B9'),('C'),('100'); +EXPLAIN SELECT c FROM t1 ORDER BY k,c; +-- echo #Natural sort order. +# We sort by 2 colums, for stable sort,as we do not currenly have a case and accent insensitive Unicode collation. +SELECT c FROM t1 ORDER BY k,c; +-- echo #Unnatural but unicode aware) sort order +SELECT c FROM t1 ORDER BY CONVERT(c USING utf8mb4) COLLATE utf8mb4_unicode_ci,c; +# CREATE TABLE AS SELECT, to see that length of the column is correct. +CREATE TABLE t2 AS SELECT c, NATURAL_SORT_KEY(c) FROM t1 WHERE 0; +SHOW CREATE TABLE t2; +DROP TABLE t1,t2; + +#Show encoding of numbers, including fractions, and leading whitespace. +SELECT RPAD(val,21,' ') value , RPAD(NATURAL_SORT_KEY(val),26,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead +FROM +( +SELECT 0 val +UNION VALUES ('0.0'),('0.1'), ('0,15'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'),('01'),('0001') +UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_20 +) AS numbers ORDER BY sortkey; + +--echo # Disable fractions handling by passing NULL as second parameter to NATURAL_SORT_KEY +SELECT val value +FROM +( +SELECT 0 val +UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1'), ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) +AS numbers ORDER BY NATURAL_SORT_KEY(val, NULL); + +--echo # Use ',' as decimal separator for NATURAL_SORT_KEY +SELECT val value, NATURAL_SORT_KEY(val,',') sortkey +FROM +( +SELECT 0 val +UNION VALUES ('0,1'),('1,001'),('1,002'),('1,010'),('1,02'),('1,1'),('1,3'),('1')) +AS numbers ORDER BY sortkey; + +--echo # Use '.' as decimal separator for NATURAL_SORT_KEY +SELECT val value,NATURAL_SORT_KEY(val,'.') sortkey +FROM +( +SELECT 0 val +UNION VALUES ('0.1'),('1.001'),('1.002'),('1.010'),('1.02'),('1.1'),('1.3'),('1')) +AS numbers ORDER BY sortkey; +SET NAMES DEFAULT; diff --git a/sql/item_create.cc b/sql/item_create.cc index d414892ee77..a8b43deb727 100644 --- a/sql/item_create.cc +++ b/sql/item_create.cc @@ -1622,6 +1622,16 @@ protected: virtual ~Create_func_name_const() {} }; +class Create_func_natural_sort_key : public Create_native_func +{ +public: + virtual Item *create_native(THD *thd, LEX_CSTRING *name, + List *item_list) override; + static Create_func_natural_sort_key s_singleton; +protected: + Create_func_natural_sort_key() {} + virtual ~Create_func_natural_sort_key() {} +}; class Create_func_nullif : public Create_func_arg2 { @@ -4652,6 +4662,36 @@ Create_func_md5::create_1_arg(THD *thd, Item *arg1) return new (thd->mem_root) Item_func_md5(thd, arg1); } +Create_func_natural_sort_key Create_func_natural_sort_key::s_singleton; + +Item *Create_func_natural_sort_key::create_native(THD *thd, LEX_CSTRING *name, + List *item_list) +{ + Item *func= NULL; + int arg_count= 0; + + if (item_list != NULL) + arg_count= item_list->elements; + + Item *param_1, *param_2; + + switch (arg_count) + { + case 1: + param_1= item_list->pop(); + func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1); + break; + case 2: + param_1= item_list->pop(); + param_2= item_list->pop(); + func= new (thd->mem_root) Item_func_natural_sort_key(thd, param_1, param_2); + break; + default: + my_error(ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT, MYF(0), name->str); + break; + } + return func; +} Create_func_monthname Create_func_monthname::s_singleton; @@ -5678,6 +5718,7 @@ Native_func_registry func_array[] = { { STRING_WITH_LEN("MD5") }, BUILDER(Create_func_md5)}, { { STRING_WITH_LEN("MONTHNAME") }, BUILDER(Create_func_monthname)}, { { STRING_WITH_LEN("NAME_CONST") }, BUILDER(Create_func_name_const)}, + { {STRING_WITH_LEN("NATURAL_SORT_KEY")}, BUILDER(Create_func_natural_sort_key)}, { { STRING_WITH_LEN("NVL") }, BUILDER(Create_func_ifnull)}, { { STRING_WITH_LEN("NVL2") }, BUILDER(Create_func_nvl2)}, { { STRING_WITH_LEN("NULLIF") }, BUILDER(Create_func_nullif)}, diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc index 5254872fef2..59460134ef8 100644 --- a/sql/item_strfunc.cc +++ b/sql/item_strfunc.cc @@ -5439,6 +5439,232 @@ String *Item_temptable_rowid::val_str(String *str) return &str_value; } +/** + Here is how we generate a key suitable for lexicographic comparison + from a numeric string (representing positive number). + + We prepend variable length prefix, that only encodes string length + The longer the string, the lexicographically larger is the prefix. + + Rules for generating prefix is following. + + 1. Small length (1 to 9) + Prefix is single char '0' + length -1 + This gives the range '0' - '8' + 2. Medium length(10 to 18) + Prefix is 2 chars - concat('9', '0'+length-10)' + This gives range '90'-'98' + 3. Large length(19) + Prefix is "990" + 4. Huge length (20 to 2^32-1) + Prefix stars with '99', then log10 of the length is added as char + then the number itself is added + The range is '99120' - '9994294967295' +*/ + +#define NATSORT_BUFSIZE 16 +static size_t natsort_num2str(size_t n, char *s) +{ + if (n < 10) + { + s[0]= '0' + (char)n -1; + return 1; + } + + if (n < 19) + { + s[0]= '9'; + s[1]= '0'+ (char)n- 10; + return 2; + } + + if (n == 19) + { + s[0]= '9'; + s[1]= '9'; + s[2]= '0'; + return 3; + } + + /* Here, n > 19*/ + s[0]= '9'; + s[1]= '9'; + size_t log10n= 0; + for (size_t tmp= n/10; tmp; tmp /= 10) + log10n++; + DBUG_ASSERT(log10n && log10n < 10); + s[2]='0' + (char)log10n; + return ll2str(n,s+3,10,0) - s; +} + +/** + Convert a string to natural sort key. + @param[in] in - input string + @param[out] s - output string + + We assume that memory is preallocated for the output + string, so that appends do not fail. + + The lead zeros count, if positive, is stored after + the numeric string. + + Fractional parts are stored and compared as strings. + Even if they have numbers in them, they should just + be compared as strings. +*/ +static void to_natsort_key(const String *in, String *s, my_wc_t decimal_sep) +{ + CHARSET_INFO *cs= in->charset(); + uchar *pos= (uchar *) in->ptr(); + uchar *end= pos + in->length(); + size_t num_len= 0; + size_t leading_zeros= 0; + uchar *num_start= nullptr; + bool in_fraction= false; + size_t fraction_len=0; + + for (;;) + { + my_wc_t c= 0; + int charlen= cs->mb_wc(&c, pos, end); + bool is_digit= charlen >= 1 && (c >= '0' && c <= '9'); + + if (!is_digit && (num_start || leading_zeros)) + { + /* + Handle end of digits run. + + Write prefix, natsort_num2str(length) is without leading zeros + Write numeric value, without leading zeros + If there were leading zeros, write variable length suffix + */ + char buf[NATSORT_BUFSIZE]; + if (!num_len) + { + // number was zeros only + leading_zeros--; + num_len= 1; + } + + s->append(buf, natsort_num2str(num_len, buf)); + if (!num_start) + s->append('0'); + else + s->append((const char *) num_start, pos - num_start, cs); + + if (leading_zeros) + s->append(buf, natsort_num2str(leading_zeros, buf)); + + /* Reset state.*/ + num_len= 0; + num_start= nullptr; + leading_zeros= 0; + in_fraction= c == decimal_sep; + fraction_len= 0; + } + + if (charlen < 1) + break; + + if (is_digit && !in_fraction) + { + /* Do not count leading zeros in num_len, and num_pos */ + if (c != '0' || num_len) + { + num_len++; + if (!num_start) + num_start= pos; + } + else + leading_zeros++; + } + else + { + /* Append a non-digit, or fractional part of the number (handling the same as string) */ + s->append((const char *) pos, charlen); + if (in_fraction) + { + if (!is_digit && fraction_len) + in_fraction= false; + fraction_len++; + } + } + pos+= charlen; + } +} + + +String *Item_func_natural_sort_key::val_str(String *s) +{ + if (args[0]->is_null()) + { + null_value= true; + return nullptr; + } + + String *in= args[0]->val_str(); + CHARSET_INFO *cs= in->charset(); + ulong max_allowed_packet= current_thd->variables.max_allowed_packet; + size_t reserve_length= (size_t) in->length() + in->length() / 2 + 2; + if (s->alloc((uint32)reserve_length)) + goto error_exit; + s->length(0); + s->set_charset(cs); + + to_natsort_key(in, s, decimal_separator()); + + DBUG_ASSERT(s->length() < reserve_length); + + if (s->length() > max_allowed_packet) + { + push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN, + ER_WARN_ALLOWED_PACKET_OVERFLOWED, + ER(ER_WARN_ALLOWED_PACKET_OVERFLOWED), func_name(), + max_allowed_packet); + goto error_exit; + } + return s; + +error_exit: + null_value=true; + return nullptr; +} + + +my_wc_t Item_func_natural_sort_key::decimal_separator() +{ + if (m_decimal_separator != DECIMAL_SEP_UNDEFINED) + return m_decimal_separator; + + DBUG_ASSERT(arg_count > 1); + String *s= args[1]->val_str(); + if (s && s->length()) + { + my_wc_t c; + if (s->charset()->mb_wc(&c, (const uchar *) s->ptr(), + (const uchar *) s->end())) + return c; + } + return DECIMAL_SEP_NONE; +} + +bool Item_func_natural_sort_key::fix_length_and_dec(void) +{ + if (agg_arg_charsets_for_string_result(collation, args, 1)) + return true; + DBUG_ASSERT(collation.collation != NULL); + uint32 max_clen= args[0]->max_char_length(); + fix_char_length(max_clen + max_clen/2 + ((max_clen <= 3)?1:0)); + set_maybe_null(); + + if (arg_count> 1 && args[1]->can_eval_in_optimize()) + { + /* Initialize decimal separator */ + m_decimal_separator= decimal_separator(); + } + return false; +} + #ifdef WITH_WSREP #include "wsrep_mysqld.h" diff --git a/sql/item_strfunc.h b/sql/item_strfunc.h index 5ba6b12ee06..eb36422dbeb 100644 --- a/sql/item_strfunc.h +++ b/sql/item_strfunc.h @@ -271,6 +271,31 @@ public: { return get_item_copy(thd, this); } }; +class Item_func_natural_sort_key : public Item_str_func +{ + my_wc_t m_decimal_separator; + static constexpr my_wc_t DECIMAL_SEP_UNDEFINED=ULONG_MAX-1; + static constexpr my_wc_t DECIMAL_SEP_NONE= ULONG_MAX; + +public: + Item_func_natural_sort_key(THD *thd, Item *a) + : Item_str_func(thd, a), m_decimal_separator(DECIMAL_SEP_NONE){} + + Item_func_natural_sort_key(THD *thd, Item *a, Item *b) + : Item_str_func(thd, a, b), m_decimal_separator(DECIMAL_SEP_UNDEFINED){} + my_wc_t decimal_separator(); + String *val_str(String *) override; + LEX_CSTRING func_name_cstring() const override + { + static LEX_CSTRING name= {STRING_WITH_LEN("natural_sort_key")}; + return name; + } + bool fix_length_and_dec(void) override; + Item *get_copy(THD *thd) override + { + return get_item_copy(thd, this); + } +}; class Item_func_concat :public Item_str_func {