2018-10-16 17:10:57 +02:00
|
|
|
/*
|
2020-01-26 17:27:13 +01:00
|
|
|
Copyright (c) 2018, 2020, MariaDB Corporation
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; version 2 of the License.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
2021-09-23 16:46:37 +02:00
|
|
|
#include "ctype-mb.h"
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
#ifndef MY_FUNCTION_NAME
|
|
|
|
#error MY_FUNCTION_NAME is not defined
|
|
|
|
#endif
|
|
|
|
#ifndef MY_MB_WC
|
|
|
|
#error MY_MB_WC is not defined
|
|
|
|
#endif
|
|
|
|
#ifndef MY_LIKE_RANGE
|
|
|
|
#error MY_LIKE_RANGE is not defined
|
|
|
|
#endif
|
2018-10-16 17:10:57 +02:00
|
|
|
#ifndef MY_UCA_ASCII_OPTIMIZE
|
|
|
|
#error MY_ASCII_OPTIMIZE is not defined
|
|
|
|
#endif
|
|
|
|
#ifndef MY_UCA_COMPILE_CONTRACTIONS
|
|
|
|
#error MY_UCA_COMPILE_CONTRACTIONS is not defined
|
|
|
|
#endif
|
|
|
|
#ifndef MY_UCA_COLL_INIT
|
|
|
|
#error MY_UCA_COLL_INIT is not defined
|
|
|
|
#endif
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
|
2021-09-29 13:13:57 +02:00
|
|
|
#include "ctype-uca-scanner_next.inl"
|
|
|
|
#define SCANNER_NEXT_NCHARS
|
|
|
|
#include "ctype-uca-scanner_next.inl"
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
Compares two strings according to the collation
|
|
|
|
|
|
|
|
SYNOPSIS:
|
|
|
|
strnncoll_onelevel()
|
|
|
|
cs Character set information
|
|
|
|
level Weight level (0 primary, 1 secondary, 2 tertiary, etc)
|
|
|
|
s First string
|
|
|
|
slen First string length
|
|
|
|
t Second string
|
|
|
|
tlen Seconf string length
|
|
|
|
level DUCETweight level
|
|
|
|
|
|
|
|
NOTES:
|
|
|
|
Initializes two weight scanners and gets weights
|
|
|
|
corresponding to two strings in a loop. If weights are not
|
|
|
|
the same at some step then returns their difference.
|
|
|
|
|
|
|
|
In the while() comparison these situations are possible:
|
|
|
|
1. (s_res>0) and (t_res>0) and (s_res == t_res)
|
|
|
|
Weights are the same so far, continue comparison
|
|
|
|
2. (s_res>0) and (t_res>0) and (s_res!=t_res)
|
|
|
|
A difference has been found, return.
|
|
|
|
3. (s_res>0) and (t_res<0)
|
|
|
|
We have reached the end of the second string, or found
|
|
|
|
an illegal multibyte sequence in the second string.
|
|
|
|
Return a positive number, i.e. the first string is bigger.
|
|
|
|
4. (s_res<0) and (t_res>0)
|
|
|
|
We have reached the end of the first string, or found
|
|
|
|
an illegal multibyte sequence in the first string.
|
|
|
|
Return a negative number, i.e. the second string is bigger.
|
|
|
|
5. (s_res<0) and (t_res<0)
|
|
|
|
Both scanners returned -1. It means we have riched
|
|
|
|
the end-of-string of illegal-sequence in both strings
|
|
|
|
at the same time. Return 0, strings are equal.
|
|
|
|
|
|
|
|
RETURN
|
|
|
|
Difference between two strings, according to the collation:
|
|
|
|
0 - means strings are equal
|
|
|
|
negative number - means the first string is smaller
|
|
|
|
positive number - means the first string is bigger
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs,
|
|
|
|
const MY_UCA_WEIGHT_LEVEL *level,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen,
|
|
|
|
my_bool t_is_prefix)
|
|
|
|
{
|
|
|
|
my_uca_scanner sscanner;
|
|
|
|
my_uca_scanner tscanner;
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param param;
|
2018-10-16 17:10:57 +02:00
|
|
|
int s_res;
|
|
|
|
int t_res;
|
2022-02-25 10:54:59 +01:00
|
|
|
|
|
|
|
#if MY_UCA_ASCII_OPTIMIZE
|
|
|
|
{
|
|
|
|
size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
|
|
|
|
s, slen, t, tlen);
|
|
|
|
s+= prefix, slen-= prefix;
|
|
|
|
t+= prefix, tlen-= prefix;
|
|
|
|
}
|
|
|
|
#endif
|
2022-09-02 11:23:24 +02:00
|
|
|
|
|
|
|
my_uca_scanner_param_init(¶m, cs, level);
|
|
|
|
my_uca_scanner_init_any(&sscanner, s, slen);
|
|
|
|
my_uca_scanner_init_any(&tscanner, t, tlen);
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
do
|
|
|
|
{
|
2022-09-02 11:23:24 +02:00
|
|
|
s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner, ¶m);
|
|
|
|
t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner, ¶m);
|
2018-10-16 17:10:57 +02:00
|
|
|
} while ( s_res == t_res && s_res >0);
|
|
|
|
|
|
|
|
return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
One-level, PAD SPACE.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen,
|
|
|
|
my_bool t_is_prefix)
|
|
|
|
{
|
|
|
|
return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0],
|
|
|
|
s, slen, t, tlen, t_is_prefix);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Multi-level, PAD SPACE.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncoll_multilevel)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen,
|
|
|
|
my_bool t_is_prefix)
|
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no a need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behavioir of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display show collation names.
2021-11-28 13:55:15 +01:00
|
|
|
uint i, level_flags= cs->levels_for_order;
|
|
|
|
for (i= 0; level_flags; i++, level_flags>>= 1)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no a need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behavioir of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display show collation names.
2021-11-28 13:55:15 +01:00
|
|
|
int ret;
|
|
|
|
if (!(level_flags & 1))
|
|
|
|
continue;
|
|
|
|
ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[i],
|
|
|
|
s, slen, t, tlen,
|
|
|
|
t_is_prefix);
|
2018-10-16 17:10:57 +02:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Compares two strings according to the collation,
|
|
|
|
ignoring trailing spaces.
|
|
|
|
|
|
|
|
SYNOPSIS:
|
|
|
|
strnncollsp_onelevel()
|
|
|
|
cs Character set information
|
|
|
|
level UCA weight level
|
|
|
|
s First string
|
|
|
|
slen First string length
|
|
|
|
t Second string
|
|
|
|
tlen Seconf string length
|
|
|
|
level DUCETweight level
|
|
|
|
|
|
|
|
NOTES:
|
|
|
|
Works exactly the same with my_strnncoll_uca(),
|
|
|
|
but ignores trailing spaces.
|
|
|
|
|
|
|
|
In the while() comparison these situations are possible:
|
|
|
|
1. (s_res>0) and (t_res>0) and (s_res == t_res)
|
|
|
|
Weights are the same so far, continue comparison
|
|
|
|
2. (s_res>0) and (t_res>0) and (s_res!=t_res)
|
|
|
|
A difference has been found, return.
|
|
|
|
3. (s_res>0) and (t_res<0)
|
|
|
|
We have reached the end of the second string, or found
|
|
|
|
an illegal multibyte sequence in the second string.
|
|
|
|
Compare the first string to an infinite array of
|
|
|
|
space characters until difference is found, or until
|
|
|
|
the end of the first string.
|
|
|
|
4. (s_res<0) and (t_res>0)
|
|
|
|
We have reached the end of the first string, or found
|
|
|
|
an illegal multibyte sequence in the first string.
|
|
|
|
Compare the second string to an infinite array of
|
|
|
|
space characters until difference is found or until
|
|
|
|
the end of the second steing.
|
|
|
|
5. (s_res<0) and (t_res<0)
|
|
|
|
Both scanners returned -1. It means we have riched
|
|
|
|
the end-of-string of illegal-sequence in both strings
|
|
|
|
at the same time. Return 0, strings are equal.
|
|
|
|
|
|
|
|
RETURN
|
|
|
|
Difference between two strings, according to the collation:
|
|
|
|
0 - means strings are equal
|
|
|
|
negative number - means the first string is smaller
|
|
|
|
positive number - means the first string is bigger
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
|
|
|
|
const MY_UCA_WEIGHT_LEVEL *level,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen)
|
|
|
|
{
|
|
|
|
my_uca_scanner sscanner, tscanner;
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param param;
|
2018-10-16 17:10:57 +02:00
|
|
|
int s_res, t_res;
|
|
|
|
|
2022-02-25 10:54:59 +01:00
|
|
|
#if MY_UCA_ASCII_OPTIMIZE
|
|
|
|
{
|
|
|
|
size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
|
|
|
|
s, slen, t, tlen);
|
|
|
|
s+= prefix, slen-= prefix;
|
|
|
|
t+= prefix, tlen-= prefix;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param_init(¶m, cs, level);
|
|
|
|
my_uca_scanner_init_any(&sscanner, s, slen);
|
|
|
|
my_uca_scanner_init_any(&tscanner, t, tlen);
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
do
|
|
|
|
{
|
2022-09-02 11:23:24 +02:00
|
|
|
s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner, ¶m);
|
|
|
|
t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner, ¶m);
|
2018-10-16 17:10:57 +02:00
|
|
|
} while ( s_res == t_res && s_res >0);
|
|
|
|
|
|
|
|
if (s_res > 0 && t_res < 0)
|
|
|
|
{
|
|
|
|
/* Calculate weight for SPACE character */
|
|
|
|
t_res= my_space_weight(level);
|
|
|
|
|
|
|
|
/* compare the first string to spaces */
|
|
|
|
do
|
|
|
|
{
|
|
|
|
if (s_res != t_res)
|
|
|
|
return (s_res - t_res);
|
2022-09-02 11:23:24 +02:00
|
|
|
s_res= MY_FUNCTION_NAME(scanner_next)(&sscanner, ¶m);
|
2018-10-16 17:10:57 +02:00
|
|
|
} while (s_res > 0);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s_res < 0 && t_res > 0)
|
|
|
|
{
|
|
|
|
/* Calculate weight for SPACE character */
|
|
|
|
s_res= my_space_weight(level);
|
|
|
|
|
|
|
|
/* compare the second string to spaces */
|
|
|
|
do
|
|
|
|
{
|
|
|
|
if (s_res != t_res)
|
|
|
|
return (s_res - t_res);
|
2022-09-02 11:23:24 +02:00
|
|
|
t_res= MY_FUNCTION_NAME(scanner_next)(&tscanner, ¶m);
|
2018-10-16 17:10:57 +02:00
|
|
|
} while (t_res > 0);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ( s_res - t_res );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
One-level, PAD SPACE
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen)
|
|
|
|
{
|
|
|
|
return MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[0],
|
|
|
|
s, slen, t, tlen);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
One-level, NO PAD
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_nopad)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen)
|
|
|
|
{
|
|
|
|
return MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0],
|
|
|
|
s, slen, t, tlen, FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Multi-level, PAD SPACE
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_multilevel)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen)
|
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no a need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behavioir of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display show collation names.
2021-11-28 13:55:15 +01:00
|
|
|
uint i, level_flags= cs->levels_for_order;
|
|
|
|
for (i= 0; level_flags; i++, level_flags>>= 1)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no a need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behavioir of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display show collation names.
2021-11-28 13:55:15 +01:00
|
|
|
int ret;
|
|
|
|
if (!(level_flags & 1))
|
|
|
|
continue;
|
|
|
|
ret= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[i],
|
|
|
|
s, slen, t, tlen);
|
2018-10-16 17:10:57 +02:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Multi-level, NO PAD
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen)
|
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no a need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behavioir of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display show collation names.
2021-11-28 13:55:15 +01:00
|
|
|
uint i, level_flags;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* Compare only the primary level using NO PAD */
|
|
|
|
if ((ret= MY_FUNCTION_NAME(strnncoll_onelevel)(cs, &cs->uca->level[0],
|
|
|
|
s, slen, t, tlen, FALSE)))
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
Compare the other levels using PAD SPACE.
|
|
|
|
These are Unicode-14.0.0 DUCTET weights:
|
|
|
|
|
|
|
|
0020 ; [*0209.0020.0002] # SPACE
|
|
|
|
|
|
|
|
0035 ; [.2070.0020.0002] # DIGIT FIVE
|
|
|
|
248C ; [.2070.0020.0004][*0281.0020.0004] # DIGIT FIVE FULL STOP
|
|
|
|
|
|
|
|
0041 ; [.2075.0020.0008] # LATIN CAPITAL LETTER A
|
|
|
|
0061 ; [.2075.0020.0002] # LATIN SMALL LETTER A
|
|
|
|
00C1 ; [.2075.0020.0008][.0000.0024.0002] # LATIN CAPITAL LETTER A WITH ACUTE
|
|
|
|
00E1 ; [.2075.0020.0002][.0000.0024.0002] # LATIN SMALL LETTER A WITH ACUTE
|
|
|
|
|
|
|
|
Examples demonstrating that it's important to use PAD SPACE
|
|
|
|
on the tertiary level:
|
|
|
|
|
|
|
|
The third level weights for "SMALL LETTER A"
|
|
|
|
- U+0061 produces one weight 0002
|
|
|
|
- U+00E1 produces two weights 0002+0002
|
|
|
|
For _ai_cs collations these two letters must be equal.
|
|
|
|
Therefore, the difference in trailing 0002 should be ignored.
|
|
|
|
|
|
|
|
The third level weights for "CAPITAL LETTER A"
|
|
|
|
- U+0041 produces one weight 0008
|
|
|
|
- U+00C1 produces two weights 0008+0002
|
|
|
|
For _ai_cs collations these two letters must be equal.
|
|
|
|
Therefore, the difference in trailing 0002 should be ignored.
|
|
|
|
|
|
|
|
Examples demonstrating that it's important to use PAD SPACE
|
|
|
|
on the secondary level:
|
|
|
|
|
|
|
|
When we implement variable shifted alternative weighting collations,
|
|
|
|
U+0035 will be equal to U+248C on the primary level in these collations.
|
|
|
|
The second level weights for "DIGIT FIVE" are:
|
|
|
|
- U+0035 produces one weight 0020
|
|
|
|
- U+248C produces two weights 0020+0020.
|
|
|
|
The difference for these two characters must be found only
|
|
|
|
on the tertiary level. Therefore, the trailing 0020 should be ignored.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (i= 1, level_flags= cs->levels_for_order >> 1;
|
|
|
|
level_flags;
|
|
|
|
i++, level_flags>>= 1)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no a need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behavioir of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display show collation names.
2021-11-28 13:55:15 +01:00
|
|
|
if (!(level_flags & 1))
|
|
|
|
continue;
|
|
|
|
ret= MY_FUNCTION_NAME(strnncollsp_onelevel)(cs, &cs->uca->level[i],
|
|
|
|
s, slen, t, tlen);
|
2018-10-16 17:10:57 +02:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-09-29 13:13:57 +02:00
|
|
|
/*
|
|
|
|
Scan the next weight and perform space padding
|
|
|
|
or trimming according to "nchars".
|
|
|
|
*/
|
|
|
|
static inline weight_and_nchars_t
|
|
|
|
MY_FUNCTION_NAME(scanner_next_pad_trim)(my_uca_scanner *scanner,
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param *param,
|
2021-09-29 13:13:57 +02:00
|
|
|
size_t nchars,
|
2023-03-31 15:20:03 +02:00
|
|
|
uint flags,
|
2021-09-29 13:13:57 +02:00
|
|
|
uint *generated)
|
|
|
|
{
|
|
|
|
weight_and_nchars_t res;
|
|
|
|
if (nchars > 0 ||
|
|
|
|
scanner->wbeg[0] /* Some weights from a previous expansion left */)
|
|
|
|
{
|
2022-09-02 11:23:24 +02:00
|
|
|
if ((res= MY_FUNCTION_NAME(scanner_next_with_nchars)(scanner, param,
|
2021-09-29 13:13:57 +02:00
|
|
|
nchars)).weight < 0)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
We reached the end of the string, but the caller wants more weights.
|
|
|
|
Perform space padding.
|
|
|
|
*/
|
2023-03-31 15:20:03 +02:00
|
|
|
res.weight=
|
|
|
|
flags & MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES ?
|
2023-04-14 11:24:14 +02:00
|
|
|
my_space_weight(param->level) : 0;
|
2023-03-31 15:20:03 +02:00
|
|
|
|
2021-09-29 13:13:57 +02:00
|
|
|
(*generated)++;
|
MDEV-26743 InnoDB: CHAR+nopad does not work well
The patch for "MDEV-25440: Indexed CHAR ... broken with NO_PAD collations"
fixed these scenarios from MDEV-26743:
- Basic latin letter vs equal accented letter
- Two letters vs equal (but space padded) expansion
However, this scenario was still broken:
- Basic latin letter (but followed by an ignorable character)
vs equal accented letter
Fix:
When processing for a NOPAD collation a string with trailing ignorable
characters, like:
'<non-ignorable><ignorable><ignorable>'
the string gets virtually converted to:
'<non-ignorable><ignorable><ignorable><space><space><space>...'
After the fix the code works differently in these two cases:
1. <space> fits into the "nchars" limit
2. <space> does not fit into the "nchars" limit
Details:
1. If "nchars" is large enough (4+ in this example),
return weights as follows:
'[weight-for-non-ignorable, 1 char] [weight-for-space-character, 3 chars]'
i.e. the weight for the virtual trailing space character now indicates
that it corresponds to total 3 characters:
- two ignorable characters
- one virtual trailing space character
2. If "nchars" is small (3), then the virtual trailing space character
does not fit into the "nchar" limit, so return 0x00 as weight, e.g.:
'[weight-for-non-ignorable, 1 char] [0x00, 2 chars]'
Adding corresponding MTR tests and unit tests.
2021-10-01 15:12:00 +02:00
|
|
|
res.nchars++; /* Count all ignorable characters and the padded space */
|
|
|
|
if (res.nchars > nchars)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
We scanned a number of ignorable characters at the end of the
|
|
|
|
string and reached the "nchars" limit, so the virtual padded space
|
|
|
|
does not fit. This is possible with CONCAT('a', x'00') with
|
|
|
|
nchars=2 on the second iteration when we scan the x'00'.
|
|
|
|
*/
|
2023-12-17 16:56:38 +01:00
|
|
|
if (param->cs->state & MY_CS_NOPAD)
|
MDEV-26743 InnoDB: CHAR+nopad does not work well
The patch for "MDEV-25440: Indexed CHAR ... broken with NO_PAD collations"
fixed these scenarios from MDEV-26743:
- Basic latin letter vs equal accented letter
- Two letters vs equal (but space padded) expansion
However, this scenario was still broken:
- Basic latin letter (but followed by an ignorable character)
vs equal accented letter
Fix:
When processing for a NOPAD collation a string with trailing ignorable
characters, like:
'<non-ignorable><ignorable><ignorable>'
the string gets virtually converted to:
'<non-ignorable><ignorable><ignorable><space><space><space>...'
After the fix the code works differently in these two cases:
1. <space> fits into the "nchars" limit
2. <space> does not fit into the "nchars" limit
Details:
1. If "nchars" is large enough (4+ in this example),
return weights as follows:
'[weight-for-non-ignorable, 1 char] [weight-for-space-character, 3 chars]'
i.e. the weight for the virtual trailing space character now indicates
that it corresponds to total 3 characters:
- two ignorable characters
- one virtual trailing space character
2. If "nchars" is small (3), then the virtual trailing space character
does not fit into the "nchar" limit, so return 0x00 as weight, e.g.:
'[weight-for-non-ignorable, 1 char] [0x00, 2 chars]'
Adding corresponding MTR tests and unit tests.
2021-10-01 15:12:00 +02:00
|
|
|
res.weight= 0;
|
|
|
|
res.nchars= (uint) nchars;
|
|
|
|
}
|
2021-09-29 13:13:57 +02:00
|
|
|
}
|
|
|
|
else if (res.nchars > nchars)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
We scanned the next collation element, but it does not fit into
|
|
|
|
the "nchars" limit. This is possible in case of:
|
|
|
|
- A contraction, e.g. Czech 'ch' with nchars=1
|
|
|
|
- A sequence of ignorable characters followed by non-ignorable ones,
|
|
|
|
e.g. CONCAT(x'00','a') with nchars=1.
|
|
|
|
Perform trimming.
|
|
|
|
*/
|
2022-09-02 11:23:24 +02:00
|
|
|
res.weight= param->cs->state & MY_CS_NOPAD ?
|
|
|
|
0 : my_space_weight(param->level);
|
2021-09-29 13:13:57 +02:00
|
|
|
res.nchars= (uint) nchars;
|
|
|
|
(*generated)++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* The caller wants nchars==0. Perform trimming. */
|
2022-09-02 11:23:24 +02:00
|
|
|
res.weight= param->cs->state & MY_CS_NOPAD ?
|
|
|
|
0 : my_space_weight(param->level);
|
2021-09-29 13:13:57 +02:00
|
|
|
res.nchars= 0;
|
|
|
|
(*generated)++;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs,
|
|
|
|
const MY_UCA_WEIGHT_LEVEL *level,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen,
|
2023-03-31 15:20:03 +02:00
|
|
|
size_t nchars,
|
|
|
|
uint flags)
|
2021-09-29 13:13:57 +02:00
|
|
|
{
|
|
|
|
my_uca_scanner sscanner;
|
|
|
|
my_uca_scanner tscanner;
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param param;
|
2021-09-29 13:13:57 +02:00
|
|
|
size_t s_nchars_left= nchars;
|
|
|
|
size_t t_nchars_left= nchars;
|
|
|
|
|
2022-02-25 10:54:59 +01:00
|
|
|
/*
|
|
|
|
TODO: strnncollsp_nchars_onelevel
|
|
|
|
#if MY_UCA_ASCII_OPTIMIZE
|
|
|
|
{
|
|
|
|
size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
|
|
|
|
s, slen, t, tlen);
|
|
|
|
s+= prefix, slen-= prefix;
|
|
|
|
t+= prefix, tlen-= prefix;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
*/
|
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param_init(¶m, cs, level);
|
|
|
|
my_uca_scanner_init_any(&sscanner, s, slen);
|
|
|
|
my_uca_scanner_init_any(&tscanner, t, tlen);
|
2021-09-29 13:13:57 +02:00
|
|
|
|
|
|
|
for ( ; ; )
|
|
|
|
{
|
|
|
|
weight_and_nchars_t s_res;
|
|
|
|
weight_and_nchars_t t_res;
|
|
|
|
uint generated= 0;
|
|
|
|
int diff;
|
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
s_res= MY_FUNCTION_NAME(scanner_next_pad_trim)(&sscanner, ¶m,
|
|
|
|
s_nchars_left,
|
2023-03-31 15:20:03 +02:00
|
|
|
flags, &generated);
|
2022-09-02 11:23:24 +02:00
|
|
|
t_res= MY_FUNCTION_NAME(scanner_next_pad_trim)(&tscanner, ¶m,
|
|
|
|
t_nchars_left,
|
2023-03-31 15:20:03 +02:00
|
|
|
flags, &generated);
|
2021-09-29 13:13:57 +02:00
|
|
|
if ((diff= (s_res.weight - t_res.weight)))
|
|
|
|
return diff;
|
|
|
|
|
|
|
|
if (generated == 2)
|
|
|
|
{
|
2023-03-31 15:20:03 +02:00
|
|
|
if ((cs->state & MY_CS_NOPAD) &&
|
|
|
|
(flags & MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES))
|
2021-09-29 13:13:57 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
Both values are auto-generated. There's no real data any more.
|
|
|
|
We need to handle the remaining virtual trailing spaces.
|
|
|
|
The two strings still have s_nchars_left and t_nchars_left imaginary
|
|
|
|
trailing spaces at the end. If s_nchars_left != t_nchars_left,
|
|
|
|
the strings will be not equal in case of a NOPAD collation.
|
|
|
|
|
|
|
|
Example:
|
|
|
|
"B" is German "U+00DF LATIN SMALL LETTER SHARP S"
|
|
|
|
When we have these values in a
|
|
|
|
CHAR(3) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_nopad_ci
|
|
|
|
column:
|
|
|
|
'B ' (one character, two trailing spaces)
|
|
|
|
'ss ' (two characters, one trailing space)
|
|
|
|
The 'B ' is greater than the 'ss '.
|
|
|
|
They are compared in the following steps:
|
|
|
|
1. 'B' == 'ss'
|
|
|
|
2. ' ' == ' '
|
|
|
|
3. ' ' > ''
|
|
|
|
|
|
|
|
We need to emulate the same behavior in this function even if
|
|
|
|
it's called with strings 'B' and 'ss' (with space trimmed).
|
|
|
|
The side which has more remaining virtual spaces at the end
|
|
|
|
is greater.
|
|
|
|
*/
|
|
|
|
if (s_nchars_left < t_nchars_left)
|
|
|
|
return -1;
|
|
|
|
if (s_nchars_left > t_nchars_left)
|
|
|
|
return +1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
DBUG_ASSERT(s_nchars_left >= s_res.nchars);
|
|
|
|
DBUG_ASSERT(t_nchars_left >= t_res.nchars);
|
|
|
|
s_nchars_left-= s_res.nchars;
|
|
|
|
t_nchars_left-= t_res.nchars;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
One-level collations.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_nchars)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen,
|
2023-03-31 15:20:03 +02:00
|
|
|
size_t nchars,
|
|
|
|
uint flags)
|
2021-09-29 13:13:57 +02:00
|
|
|
{
|
|
|
|
return MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(cs, &cs->uca->level[0],
|
|
|
|
s, slen, t, tlen,
|
2023-03-31 15:20:03 +02:00
|
|
|
nchars, flags);
|
2021-09-29 13:13:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Multi-level collations.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_nchars_multilevel)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
const uchar *t, size_t tlen,
|
2023-03-31 15:20:03 +02:00
|
|
|
size_t nchars,
|
|
|
|
uint flags)
|
2021-09-29 13:13:57 +02:00
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing the concept of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
uint i, level_flags= cs->levels_for_order;
|
|
|
|
for (i= 0; level_flags; i++, level_flags>>= 1)
|
2021-09-29 13:13:57 +02:00
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing the concept of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
int ret;
|
|
|
|
if (!(level_flags & 1))
|
|
|
|
continue;
|
|
|
|
ret= MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(cs,
|
|
|
|
&cs->uca->level[i],
|
|
|
|
s, slen,
|
|
|
|
t, tlen,
|
2023-04-14 11:24:14 +02:00
|
|
|
nchars, flags);
|
2021-09-29 13:13:57 +02:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
Calculates hash value for the given string,
|
|
|
|
according to the collation, and ignoring trailing spaces.
|
|
|
|
|
|
|
|
SYNOPSIS:
|
|
|
|
hash_sort()
|
|
|
|
cs Character set information
|
|
|
|
s String
|
|
|
|
slen String's length
|
|
|
|
n1 First hash parameter
|
|
|
|
n2 Second hash parameter
|
|
|
|
|
|
|
|
NOTES:
|
|
|
|
Scans consequently weights and updates
|
|
|
|
hash parameters n1 and n2. In a case insensitive collation,
|
|
|
|
upper and lower case of the same letter will return the same
|
|
|
|
weight sequence, and thus will produce the same hash values
|
|
|
|
in n1 and n2.
|
|
|
|
|
|
|
|
This functions is used for one-level and for multi-level collations.
|
|
|
|
We intentionally use only primary level in multi-level collations.
|
|
|
|
This helps to have PARTITION BY KEY put primarily equal records
|
2019-06-28 07:05:12 +02:00
|
|
|
into the same partition. E.g. in utf8mb3_thai_520_ci records that differ
|
2018-10-16 17:10:57 +02:00
|
|
|
only in tone marks go into the same partition.
|
|
|
|
|
|
|
|
RETURN
|
|
|
|
N/A
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void
|
|
|
|
MY_FUNCTION_NAME(hash_sort)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
ulong *nr1, ulong *nr2)
|
|
|
|
{
|
|
|
|
int s_res;
|
|
|
|
my_uca_scanner scanner;
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param param;
|
2018-10-16 17:10:57 +02:00
|
|
|
int space_weight= my_space_weight(&cs->uca->level[0]);
|
|
|
|
register ulong m1= *nr1, m2= *nr2;
|
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param_init(¶m, cs, &cs->uca->level[0]);
|
|
|
|
my_uca_scanner_init_any(&scanner, s, slen);
|
2018-10-16 17:10:57 +02:00
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) >0)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
|
|
|
if (s_res == space_weight)
|
|
|
|
{
|
|
|
|
/* Combine all spaces to be able to skip end spaces */
|
|
|
|
uint count= 0;
|
|
|
|
do
|
|
|
|
{
|
|
|
|
count++;
|
2022-09-02 11:23:24 +02:00
|
|
|
if ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) <= 0)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
|
|
|
/* Skip strings at end of string */
|
|
|
|
goto end;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (s_res == space_weight);
|
|
|
|
|
|
|
|
/* Add back that has for the space characters */
|
|
|
|
do
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
We can't use MY_HASH_ADD_16() here as we, because of a misstake
|
|
|
|
in the original code, where we added the 16 byte variable the
|
|
|
|
opposite way. Changing this would cause old partitioned tables
|
|
|
|
to fail.
|
|
|
|
*/
|
|
|
|
MY_HASH_ADD(m1, m2, space_weight >> 8);
|
|
|
|
MY_HASH_ADD(m1, m2, space_weight & 0xFF);
|
|
|
|
}
|
|
|
|
while (--count != 0);
|
|
|
|
|
|
|
|
}
|
|
|
|
/* See comment above why we can't use MY_HASH_ADD_16() */
|
|
|
|
MY_HASH_ADD(m1, m2, s_res >> 8);
|
|
|
|
MY_HASH_ADD(m1, m2, s_res & 0xFF);
|
|
|
|
}
|
|
|
|
end:
|
|
|
|
*nr1= m1;
|
|
|
|
*nr2= m2;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
MY_FUNCTION_NAME(hash_sort_nopad)(CHARSET_INFO *cs,
|
|
|
|
const uchar *s, size_t slen,
|
|
|
|
ulong *nr1, ulong *nr2)
|
|
|
|
{
|
|
|
|
int s_res;
|
|
|
|
my_uca_scanner scanner;
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param param;
|
2018-10-16 17:10:57 +02:00
|
|
|
register ulong m1= *nr1, m2= *nr2;
|
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param_init(¶m, cs, &cs->uca->level[0]);
|
|
|
|
my_uca_scanner_init_any(&scanner, s, slen);
|
2018-10-16 17:10:57 +02:00
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
while ((s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) >0)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
|
|
|
/* See comment above why we can't use MY_HASH_ADD_16() */
|
|
|
|
MY_HASH_ADD(m1, m2, s_res >> 8);
|
|
|
|
MY_HASH_ADD(m1, m2, s_res & 0xFF);
|
|
|
|
}
|
|
|
|
*nr1= m1;
|
|
|
|
*nr2= m2;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
For the given string creates its "binary image", suitable
|
|
|
|
to be used in binary comparison, i.e. in memcmp().
|
|
|
|
|
|
|
|
SYNOPSIS:
|
|
|
|
my_strnxfrm_uca()
|
|
|
|
cs Character set information
|
|
|
|
dst Where to write the image
|
|
|
|
dstlen Space available for the image, in bytes
|
|
|
|
src The source string
|
|
|
|
srclen Length of the source string, in bytes
|
|
|
|
|
|
|
|
NOTES:
|
|
|
|
In a loop, scans weights from the source string and writes
|
|
|
|
them into the binary image. In a case insensitive collation,
|
|
|
|
upper and lower cases of the same letter will produce the
|
|
|
|
same image subsequences. When we have reached the end-of-string
|
|
|
|
or found an illegal multibyte sequence, the loop stops.
|
|
|
|
|
|
|
|
It is impossible to restore the original string using its
|
|
|
|
binary image.
|
|
|
|
|
|
|
|
Binary images are used for bulk comparison purposes,
|
|
|
|
e.g. in ORDER BY, when it is more efficient to create
|
|
|
|
a binary image and use it instead of weight scanner
|
|
|
|
for the original strings for every comparison.
|
|
|
|
|
|
|
|
RETURN
|
|
|
|
Number of bytes that have been written into the binary image.
|
|
|
|
*/
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
static my_strnxfrm_ret_t
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(CHARSET_INFO *cs,
|
|
|
|
MY_UCA_WEIGHT_LEVEL *level,
|
|
|
|
uchar *dst, uchar *de,
|
|
|
|
uint *nweights,
|
|
|
|
const uchar *src, size_t srclen)
|
|
|
|
{
|
|
|
|
my_uca_scanner scanner;
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param param;
|
2018-10-16 17:10:57 +02:00
|
|
|
int s_res;
|
2024-08-17 10:56:28 +02:00
|
|
|
const uchar *src0= src;
|
|
|
|
const uchar *dst0= dst;
|
|
|
|
const uchar *de2= de - 1; /* Last position where 2 bytes fit */
|
2018-10-16 17:10:57 +02:00
|
|
|
|
|
|
|
DBUG_ASSERT(src || !srclen);
|
|
|
|
|
2018-10-24 13:12:38 +02:00
|
|
|
#if MY_UCA_ASCII_OPTIMIZE && !MY_UCA_COMPILE_CONTRACTIONS
|
|
|
|
/*
|
|
|
|
Fast path for the ASCII range with no contractions.
|
|
|
|
*/
|
|
|
|
{
|
|
|
|
const uint16 *weights0= level->weights[0];
|
|
|
|
uint lengths0= level->lengths[0];
|
|
|
|
for ( ; ; src++, srclen--)
|
|
|
|
{
|
|
|
|
const uint16 *weight;
|
2024-08-17 10:56:28 +02:00
|
|
|
if (!srclen)
|
|
|
|
return my_strnxfrm_ret_construct(dst - dst0, src - src0, 0);
|
2018-10-24 13:12:38 +02:00
|
|
|
if (*src > 0x7F)
|
|
|
|
break; /* Non-ASCII */
|
|
|
|
|
|
|
|
weight= weights0 + (((uint) *src) * lengths0);
|
|
|
|
if (!(s_res= *weight))
|
|
|
|
continue; /* Ignorable */
|
|
|
|
if (weight[1]) /* Expansion (e.g. in a user defined collation */
|
|
|
|
break;
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
if (!*nweights)
|
|
|
|
return my_strnxfrm_ret_construct(dst - dst0, src - src0,
|
|
|
|
MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR);
|
2018-10-24 13:12:38 +02:00
|
|
|
/* Here we have a character with extactly one 2-byte UCA weight */
|
|
|
|
if (dst < de2) /* Most typical case is when both bytes fit */
|
|
|
|
{
|
|
|
|
*dst++= s_res >> 8;
|
|
|
|
*dst++= s_res & 0xFF;
|
|
|
|
(*nweights)--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (dst >= de) /* No space left in "dst" */
|
2024-08-17 10:56:28 +02:00
|
|
|
return my_strnxfrm_ret_construct(dst - dst0, src - src0,
|
|
|
|
MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR);
|
2018-10-24 13:12:38 +02:00
|
|
|
*dst++= s_res >> 8; /* There is space only for one byte */
|
|
|
|
(*nweights)--;
|
2024-08-17 10:56:28 +02:00
|
|
|
return my_strnxfrm_ret_construct(dst - dst0, src + 1 - src0,
|
|
|
|
MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR);
|
2018-10-24 13:12:38 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-09-02 11:23:24 +02:00
|
|
|
my_uca_scanner_param_init(¶m, cs, level);
|
|
|
|
my_uca_scanner_init_any(&scanner, src, srclen);
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
for (; (s_res= MY_FUNCTION_NAME(scanner_next)(&scanner, ¶m)) > 0 ;
|
2022-09-02 11:23:24 +02:00
|
|
|
(*nweights)--)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
2024-08-17 10:56:28 +02:00
|
|
|
if (!*nweights)
|
|
|
|
return my_strnxfrm_ret_construct(dst - dst0, scanner.sbeg - src0,
|
|
|
|
MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR);
|
|
|
|
if (dst < de2)
|
|
|
|
{
|
|
|
|
*dst++= s_res >> 8;
|
2018-10-16 17:10:57 +02:00
|
|
|
*dst++= s_res & 0xFF;
|
2024-08-17 10:56:28 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (dst < de)
|
|
|
|
*dst++= s_res >> 8;
|
|
|
|
return my_strnxfrm_ret_construct(dst - dst0, scanner.sbeg - src0,
|
|
|
|
MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR);
|
|
|
|
}
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
2024-08-17 10:56:28 +02:00
|
|
|
return my_strnxfrm_ret_construct(dst - dst0, scanner.sbeg - src0,
|
|
|
|
my_uca_scanner_next_expansion_weight(&scanner) > 0 ?
|
|
|
|
MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR : 0);
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
static my_strnxfrm_ret_t
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_onelevel)(CHARSET_INFO *cs,
|
|
|
|
MY_UCA_WEIGHT_LEVEL *level,
|
|
|
|
uchar *dst, uchar *de, uint nweights,
|
|
|
|
const uchar *src, size_t srclen, uint flags)
|
|
|
|
{
|
2024-08-17 10:56:28 +02:00
|
|
|
my_strnxfrm_ret_t rc= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
|
|
|
|
dst, de, &nweights,
|
|
|
|
src, srclen);
|
|
|
|
DBUG_ASSERT(dst + rc.m_result_length <= de);
|
|
|
|
if (nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
|
|
|
|
{
|
|
|
|
my_strnxfrm_pad_ret_t rcpad= my_strnxfrm_uca_padn(dst + rc.m_result_length,
|
|
|
|
de, nweights,
|
|
|
|
my_space_weight(level));
|
|
|
|
my_strnxfrm_ret_join_pad(&rc, &rcpad);
|
|
|
|
DBUG_ASSERT(dst + rc.m_result_length <= de);
|
|
|
|
}
|
|
|
|
my_strxfrm_desc_and_reverse(dst, dst + rc.m_result_length, flags, 0);
|
|
|
|
return rc;
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
static my_strnxfrm_ret_t
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(CHARSET_INFO *cs,
|
|
|
|
MY_UCA_WEIGHT_LEVEL *level,
|
|
|
|
uchar *dst, uchar *de, uint nweights,
|
|
|
|
const uchar *src, size_t srclen,
|
|
|
|
uint flags)
|
|
|
|
{
|
2024-08-17 10:56:28 +02:00
|
|
|
my_strnxfrm_ret_t rc= MY_FUNCTION_NAME(strnxfrm_onelevel_internal)(cs, level,
|
|
|
|
dst, de, &nweights,
|
|
|
|
src, srclen);
|
|
|
|
DBUG_ASSERT(dst + rc.m_result_length <= de);
|
2018-10-16 17:10:57 +02:00
|
|
|
/* Pad with the minimum possible weight on this level */
|
2024-08-17 10:56:28 +02:00
|
|
|
if (nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
|
|
|
|
{
|
|
|
|
my_strnxfrm_pad_ret_t rcpad= my_strnxfrm_uca_padn(dst + rc.m_result_length,
|
|
|
|
de, nweights,
|
|
|
|
min_weight_on_level(level));
|
|
|
|
my_strnxfrm_ret_join_pad(&rc, &rcpad);
|
|
|
|
DBUG_ASSERT(dst + rc.m_result_length <= de);
|
|
|
|
}
|
|
|
|
my_strxfrm_desc_and_reverse(dst, dst + rc.m_result_length, flags, 0);
|
|
|
|
return rc;
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
static my_strnxfrm_ret_t
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
|
|
|
|
uchar *dst, size_t dstlen, uint nweights,
|
|
|
|
const uchar *src, size_t srclen, uint flags)
|
|
|
|
{
|
|
|
|
uchar *d0= dst;
|
|
|
|
uchar *de= dst + dstlen;
|
2024-08-17 10:56:28 +02:00
|
|
|
my_strnxfrm_ret_t rc;
|
2018-10-16 17:10:57 +02:00
|
|
|
|
2018-10-21 03:02:38 +02:00
|
|
|
/*
|
|
|
|
There are two ways to handle trailing spaces for PAD SPACE collations:
|
|
|
|
1. Keep trailing spaces as they are, so have strnxfrm_onelevel() scan
|
|
|
|
spaces as normal characters. This will call scanner_next() for every
|
|
|
|
trailing space and calculate its weight using UCA weights.
|
2018-10-21 19:28:11 +02:00
|
|
|
2. Strip trailing spaces before calling strnxfrm_onelevel(), as it will
|
|
|
|
append weights for implicit spaces anyway, up to the desired key size.
|
2018-10-21 03:02:38 +02:00
|
|
|
This will effectively generate exactly the same sortable key result.
|
|
|
|
The latter is much faster.
|
|
|
|
*/
|
|
|
|
|
2018-10-21 19:28:11 +02:00
|
|
|
if (flags & MY_STRXFRM_PAD_WITH_SPACE)
|
2020-01-26 17:27:13 +01:00
|
|
|
srclen= my_ci_lengthsp(cs, (const char*) src, srclen);
|
2024-08-17 10:56:28 +02:00
|
|
|
rc= MY_FUNCTION_NAME(strnxfrm_onelevel)(cs, &cs->uca->level[0],
|
|
|
|
dst, de, nweights,
|
|
|
|
src, srclen, flags);
|
|
|
|
dst+= rc.m_result_length;
|
2018-10-16 17:10:57 +02:00
|
|
|
/*
|
|
|
|
This can probably be changed to memset(dst, 0, de - dst),
|
|
|
|
like my_strnxfrm_uca_multilevel() does.
|
|
|
|
*/
|
|
|
|
if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
|
|
|
|
dst= my_strnxfrm_uca_pad(dst, de, my_space_weight(&cs->uca->level[0]));
|
2024-08-17 10:56:28 +02:00
|
|
|
rc.m_result_length= dst - d0;
|
|
|
|
return rc;
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
static my_strnxfrm_ret_t
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
|
|
|
|
uchar *dst, size_t dstlen,
|
|
|
|
uint nweights,
|
|
|
|
const uchar *src, size_t srclen,
|
|
|
|
uint flags)
|
|
|
|
{
|
|
|
|
uchar *d0= dst;
|
|
|
|
uchar *de= dst + dstlen;
|
2024-08-17 10:56:28 +02:00
|
|
|
my_strnxfrm_ret_t rc= MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs,
|
|
|
|
&cs->uca->level[0],
|
|
|
|
dst, de, nweights,
|
|
|
|
src, srclen, flags);
|
|
|
|
dst+= rc.m_result_length;
|
2018-10-16 17:10:57 +02:00
|
|
|
if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
|
|
|
|
{
|
|
|
|
memset(dst, 0, de - dst);
|
|
|
|
dst= de;
|
|
|
|
}
|
2024-08-17 10:56:28 +02:00
|
|
|
rc.m_result_length= dst - d0;
|
|
|
|
return rc;
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
static my_strnxfrm_ret_t
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs,
|
|
|
|
uchar *dst, size_t dstlen,
|
|
|
|
uint nweights,
|
|
|
|
const uchar *src, size_t srclen,
|
|
|
|
uint flags)
|
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
uint level_flags= cs->levels_for_order;
|
2018-10-16 17:10:57 +02:00
|
|
|
uchar *d0= dst;
|
|
|
|
uchar *de= dst + dstlen;
|
2024-08-17 10:56:28 +02:00
|
|
|
uchar *de_for_levels= dst + dstlen;
|
2018-10-16 17:10:57 +02:00
|
|
|
uint current_level;
|
2024-08-17 10:56:28 +02:00
|
|
|
my_strnxfrm_ret_t rc= my_strnxfrm_ret_construct(0, 0, 0);
|
2018-10-16 17:10:57 +02:00
|
|
|
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing a conception of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collations names can be used in any parts of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
for (current_level= 0; level_flags; current_level++, level_flags>>= 1)
|
2018-10-16 17:10:57 +02:00
|
|
|
{
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing the concept of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collation names can be used in any part of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
if (!(level_flags & 1))
|
|
|
|
continue;
|
2018-10-16 17:10:57 +02:00
|
|
|
if (!(flags & MY_STRXFRM_LEVEL_ALL) ||
|
|
|
|
(flags & (MY_STRXFRM_LEVEL1 << current_level)))
|
2024-08-17 10:56:28 +02:00
|
|
|
{
|
|
|
|
const my_strnxfrm_ret_t rc1= cs->state & MY_CS_NOPAD ?
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_nopad_onelevel)(cs,
|
|
|
|
&cs->uca->level[current_level],
|
2024-08-17 10:56:28 +02:00
|
|
|
dst, de_for_levels, nweights,
|
2018-10-16 17:10:57 +02:00
|
|
|
src, srclen, flags) :
|
|
|
|
MY_FUNCTION_NAME(strnxfrm_onelevel)(cs,
|
|
|
|
&cs->uca->level[current_level],
|
2024-08-17 10:56:28 +02:00
|
|
|
dst, de_for_levels, nweights,
|
2018-10-16 17:10:57 +02:00
|
|
|
src, srclen, flags);
|
2024-08-17 10:56:28 +02:00
|
|
|
rc.m_source_length_used+= rc1.m_source_length_used;
|
|
|
|
rc.m_warnings|= rc1.m_warnings;
|
|
|
|
dst+= rc1.m_result_length;
|
|
|
|
DBUG_ASSERT(dst <= de);
|
|
|
|
if (rc1.m_warnings)
|
|
|
|
{
|
|
|
|
if (rc1.m_warnings & MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR)
|
|
|
|
break;
|
|
|
|
/*
|
|
|
|
A weight for a padding space did not fit on the current level.
|
|
|
|
Characters may be ignorable on this level, but non-ignorable
|
|
|
|
on the next level. Let's continue with the next levels
|
|
|
|
only to find non-ignorable characters and set
|
|
|
|
MY_STRNXFRM_TRUNCATED_WEIGHT_REAL_CHAR if found.
|
|
|
|
But let's set de_for_levels to dst to prevent putting
|
|
|
|
any weights into the destination buffer on the next levels.
|
|
|
|
*/
|
|
|
|
de_for_levels= dst;
|
|
|
|
}
|
|
|
|
}
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (dst < de && (flags & MY_STRXFRM_PAD_TO_MAXLEN))
|
|
|
|
{
|
|
|
|
memset(dst, 0, de - dst);
|
|
|
|
dst= de;
|
|
|
|
}
|
|
|
|
|
2024-08-17 10:56:28 +02:00
|
|
|
rc.m_result_length= dst - d0;
|
|
|
|
return rc;
|
2018-10-16 17:10:57 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
One-level, PAD SPACE
|
|
|
|
*/
|
|
|
|
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
|
|
|
|
{
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_UCA_COLL_INIT,
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncoll),
|
|
|
|
MY_FUNCTION_NAME(strnncollsp),
|
2021-09-29 13:13:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncollsp_nchars),
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm),
|
|
|
|
my_strnxfrmlen_any_uca,
|
|
|
|
MY_LIKE_RANGE,
|
|
|
|
my_wildcmp_uca,
|
|
|
|
my_instr_mb,
|
|
|
|
MY_FUNCTION_NAME(hash_sort),
|
2021-09-23 16:46:37 +02:00
|
|
|
my_propagate_complex,
|
|
|
|
my_min_str_mb_simple,
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing the concept of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collation names can be used in any part of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
my_max_str_mb_simple,
|
|
|
|
my_ci_get_id_uca,
|
|
|
|
my_ci_get_collation_name_uca
|
2018-10-16 17:10:57 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
One-level, NO PAD
|
|
|
|
For character sets with mbminlen==1 use MY_LIKE_RANGE=my_like_range_mb
|
|
|
|
For character sets with mbminlen>=2 use MY_LIKE_RANGE=my_like_range_generic
|
|
|
|
*/
|
|
|
|
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
|
|
|
|
{
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_UCA_COLL_INIT,
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncoll),
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_nopad),
|
2021-09-29 13:13:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncollsp_nchars),
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_nopad),
|
|
|
|
my_strnxfrmlen_any_uca,
|
|
|
|
MY_LIKE_RANGE, /* my_like_range_mb or my_like_range_generic */
|
|
|
|
my_wildcmp_uca,
|
|
|
|
my_instr_mb,
|
|
|
|
MY_FUNCTION_NAME(hash_sort_nopad),
|
2021-09-23 16:46:37 +02:00
|
|
|
my_propagate_complex,
|
|
|
|
my_min_str_mb_simple_nopad,
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing the concept of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collation names can be used in any part of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
my_max_str_mb_simple,
|
|
|
|
my_ci_get_id_uca,
|
|
|
|
my_ci_get_collation_name_uca
|
2018-10-16 17:10:57 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Multi-level, PAD SPACE
|
|
|
|
*/
|
|
|
|
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
|
|
|
|
{
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_UCA_COLL_INIT,
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncoll_multilevel),
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_multilevel),
|
2021-09-29 13:13:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncollsp_nchars_multilevel),
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_multilevel),
|
|
|
|
my_strnxfrmlen_any_uca_multilevel,
|
|
|
|
MY_LIKE_RANGE,
|
|
|
|
my_wildcmp_uca,
|
|
|
|
my_instr_mb,
|
|
|
|
MY_FUNCTION_NAME(hash_sort),
|
2021-09-23 16:46:37 +02:00
|
|
|
my_propagate_complex,
|
|
|
|
my_min_str_mb_simple,
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing the concept of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collation names can be used in any part of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including character sets prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
my_max_str_mb_simple,
|
|
|
|
my_ci_get_id_uca,
|
|
|
|
my_ci_get_collation_name_uca
|
2018-10-16 17:10:57 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
Multi-level, NO PAD
|
|
|
|
*/
|
|
|
|
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
|
|
|
|
{
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_UCA_COLL_INIT,
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncoll_multilevel),
|
|
|
|
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel),
|
2021-09-29 13:13:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnncollsp_nchars_multilevel),
|
2018-10-16 17:10:57 +02:00
|
|
|
MY_FUNCTION_NAME(strnxfrm_multilevel),
|
|
|
|
my_strnxfrmlen_any_uca_multilevel,
|
|
|
|
MY_LIKE_RANGE,
|
|
|
|
my_wildcmp_uca,
|
|
|
|
my_instr_mb,
|
|
|
|
MY_FUNCTION_NAME(hash_sort),
|
2021-09-23 16:46:37 +02:00
|
|
|
my_propagate_complex,
|
|
|
|
my_min_str_mb_simple_nopad,
|
MDEV-27009 Add UCA-14.0.0 collations
- Added one neutral and 22 tailored (language specific) collations based on
Unicode Collation Algorithm version 14.0.0.
Collations were added for Unicode character sets
utf8mb3, utf8mb4, ucs2, utf16, utf32.
Every tailoring was added with four accent and case
sensitivity flag combinations, e.g:
* utf8mb4_uca1400_swedish_as_cs
* utf8mb4_uca1400_swedish_as_ci
* utf8mb4_uca1400_swedish_ai_cs
* utf8mb4_uca1400_swedish_ai_ci
and their _nopad_ variants:
* utf8mb4_uca1400_swedish_nopad_as_cs
* utf8mb4_uca1400_swedish_nopad_as_ci
* utf8mb4_uca1400_swedish_nopad_ai_cs
* utf8mb4_uca1400_swedish_nopad_ai_ci
- Introducing the concept of contextually typed named collations:
CREATE DATABASE db1 CHARACTER SET utf8mb4;
CREATE TABLE db1.t1 (a CHAR(10) COLLATE uca1400_as_ci);
The idea is that there is no need to specify the character set prefix
in the new collation names. It's enough to type just the suffix
"uca1400_as_ci". The character set is taken from the context.
In the above example script the context character set is utf8mb4.
So the CREATE TABLE will make a column with the collation
utf8mb4_uca1400_as_ci.
Short collation names can be used in any part of the SQL syntax
where the COLLATE clause is understood.
- New collations are displayed only one time
(without character set combinations) by these statements:
SELECT * FROM INFORMATION_SCHEMA.COLLATIONS;
SHOW COLLATION;
For example, all these collations:
- utf8mb3_uca1400_swedish_as_ci
- utf8mb4_uca1400_swedish_as_ci
- ucs2_uca1400_swedish_as_ci
- utf16_uca1400_swedish_as_ci
- utf32_uca1400_swedish_as_ci
have just one entry in INFORMATION_SCHEMA.COLLATIONS and SHOW COLLATION,
with COLLATION_NAME equal to "uca1400_swedish_as_ci", which is the suffix
without the character set name:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+
| COLLATION_NAME |
+-----------------------+
| uca1400_swedish_as_ci |
+-----------------------+
Note, the behaviour of old collations did not change.
Non-unicode collations (e.g. latin1_swedish_ci) and
old UCA-4.0.0 collations (e.g. utf8mb4_unicode_ci)
are still displayed with the character set prefix, as before.
- The structure of the table INFORMATION_SCHEMA.COLLATIONS was changed.
The NOT NULL constraint was removed from these columns:
- CHARACTER_SET_NAME
- ID
- IS_DEFAULT
and from the corresponding columns in SHOW COLLATION.
For example:
SELECT COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATIONS
WHERE COLLATION_NAME LIKE '%uca1400_swedish_as_ci';
+-----------------------+--------------------+------+------------+
| COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------+--------------------+------+------------+
| uca1400_swedish_as_ci | NULL | NULL | NULL |
+-----------------------+--------------------+------+------------+
The NULL value in these columns now means that the collation
is applicable to multiple character sets.
The behaviour of old collations did not change.
Make sure your client programs can handle NULL values in these columns.
- The structure of the table
INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY was changed.
Three new NOT NULL columns were added:
- FULL_COLLATION_NAME
- ID
- IS_DEFAULT
New collations have multiple entries in COLLATION_CHARACTER_SET_APPLICABILITY.
The column COLLATION_NAME contains the collation name without the character
set prefix. The column FULL_COLLATION_NAME contains the collation name with
the character set prefix.
Old collations have the full collation name in both FULL_COLLATION_NAME and
COLLATION_NAME.
SELECT COLLATION_NAME, FULL_COLLATION_NAME, CHARACTER_SET_NAME, ID, IS_DEFAULT
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
WHERE FULL_COLLATION_NAME RLIKE '^(utf8mb4|latin1).*swedish.*ci$';
+-----------------------------+-------------------------------------+--------------------+------+------------+
| COLLATION_NAME | FULL_COLLATION_NAME | CHARACTER_SET_NAME | ID | IS_DEFAULT |
+-----------------------------+-------------------------------------+--------------------+------+------------+
| latin1_swedish_ci | latin1_swedish_ci | latin1 | 8 | Yes |
| latin1_swedish_nopad_ci | latin1_swedish_nopad_ci | latin1 | 1032 | |
| utf8mb4_swedish_ci | utf8mb4_swedish_ci | utf8mb4 | 232 | |
| uca1400_swedish_ai_ci | utf8mb4_uca1400_swedish_ai_ci | utf8mb4 | 2368 | |
| uca1400_swedish_as_ci | utf8mb4_uca1400_swedish_as_ci | utf8mb4 | 2370 | |
| uca1400_swedish_nopad_ai_ci | utf8mb4_uca1400_swedish_nopad_ai_ci | utf8mb4 | 2372 | |
| uca1400_swedish_nopad_as_ci | utf8mb4_uca1400_swedish_nopad_as_ci | utf8mb4 | 2374 | |
+-----------------------------+-------------------------------------+--------------------+------+------------+
- Other INFORMATION_SCHEMA queries:
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.COLUMNS;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.PARAMETERS;
SELECT TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES;
SELECT DEFAULT_COLLATION_NAME FROM INFORMATION_SCHEMA.SCHEMATA;
SELECT COLLATION_NAME FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.EVENTS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.EVENTS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.ROUTINES;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT DATABASE_COLLATION FROM INFORMATION_SCHEMA.TRIGGERS;
SELECT COLLATION_CONNECTION FROM INFORMATION_SCHEMA.VIEWS;
display full collation names, including the character set prefix,
for all collations, including new collations.
Corresponding SHOW commands also display full collation names
in collation related columns:
SHOW CREATE TABLE t1;
SHOW CREATE DATABASE db1;
SHOW TABLE STATUS;
SHOW CREATE FUNCTION f1;
SHOW CREATE PROCEDURE p1;
SHOW CREATE EVENT ev1;
SHOW CREATE TRIGGER tr1;
SHOW CREATE VIEW;
These INFORMATION_SCHEMA queries and SHOW statements may change in
the future, to display short collation names.
2021-11-28 13:55:15 +01:00
|
|
|
my_max_str_mb_simple,
|
|
|
|
my_ci_get_id_uca,
|
|
|
|
my_ci_get_collation_name_uca
|
2018-10-16 17:10:57 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2018-10-16 17:10:57 +02:00
|
|
|
/*
  Handler package for this MY_FUNCTION_NAME instantiation.
  Holds pointers to the four handler objects generated under the same
  MY_FUNCTION_NAME prefix, in this order:
    collation_handler, collation_handler_nopad,
    collation_handler_multilevel, collation_handler_nopad_multilevel.
*/
MY_COLLATION_HANDLER_PACKAGE MY_FUNCTION_NAME(package)=
|
|
|
|
{
|
|
|
|
&MY_FUNCTION_NAME(collation_handler),
|
|
|
|
&MY_FUNCTION_NAME(collation_handler_nopad),
|
|
|
|
&MY_FUNCTION_NAME(collation_handler_multilevel),
|
|
|
|
&MY_FUNCTION_NAME(collation_handler_nopad_multilevel)
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2018-10-16 17:10:57 +02:00
|
|
|
/*
  Undefine the six parameter macros that this file requires from its
  includer (they are checked with #ifndef/#error at the top of the file).
  NOTE(review): presumably this allows the file to be included again
  with a different set of definitions - confirm against the includer.
*/
#undef MY_FUNCTION_NAME
|
|
|
|
#undef MY_MB_WC
|
|
|
|
#undef MY_LIKE_RANGE
|
2018-10-16 17:10:57 +02:00
|
|
|
#undef MY_UCA_ASCII_OPTIMIZE
|
|
|
|
#undef MY_UCA_COMPILE_CONTRACTIONS
|
|
|
|
#undef MY_UCA_COLL_INIT
|