mariadb/sql/charset_collations.cc
Alexander Barkov 75f25e4ca7 MDEV-30164 System variable for default collations
This patch adds a way to override default collations
(or "character set collations") for desired character sets.

The SQL standard says:
> Each collation known in an SQL-environment is applicable to one
> or more character sets, and for each character set, one or more
> collations are applicable to it, one of which is associated with
> it as its character set collation.

In MariaDB, character set collations have been hard-coded so far,
e.g. utf8mb4_general_ci has been the hard-coded character set collation
for utf8mb4.

This patch allows overriding character set collations (globally per
server, or per session), so that, for example, uca1400_ai_ci can be set as
the character set collation for Unicode character sets
(instead of the compiled-in xxx_general_ci).

The array of overridden character set collations is stored in a new
(session and global) system variable @@character_set_collations and
can be set as a comma separated list of charset=collation pairs, e.g.:

SET @@character_set_collations='utf8mb3=uca1400_ai_ci,utf8mb4=uca1400_ai_ci';

The variable is empty by default, which means that the hard-coded
character set collations are used (e.g. utf8mb4_general_ci for utf8mb4).

The variable can also be set globally by passing to the server startup command
line, and/or in my.cnf.
2023-07-17 14:56:17 +04:00

117 lines
3.8 KiB
C++

/* Copyright (c) 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#include "my_global.h"
#include "my_sys.h"
#include "lex_charset.h"
#include "mysqld_error.h"
#include "charset_collations.h"
#include "simple_tokenizer.h"
bool Charset_collation_map_st::insert_or_replace(
const Lex_exact_charset &charset,
const Lex_extended_collation &collation,
bool error_on_conflicting_duplicate)
{
Lex_exact_charset_opt_extended_collate res(charset);
Sql_used used;
if (res.merge_collation_override(&used, *this, collation))
return true;
if (error_on_conflicting_duplicate)
{
const Elem_st *dup;
if ((dup= find_elem_by_charset_id(charset.charset_info()->number)) &&
dup->to() != res.collation().charset_info())
{
my_error(ER_CONFLICTING_DECLARATIONS, MYF(0),
"", dup->to()->coll_name.str,
"", res.collation().charset_info()->coll_name.str);
return true;
}
}
return insert_or_replace(Elem(charset.charset_info(),
res.collation().charset_info()));
}
bool Charset_collation_map_st::insert_or_replace(
const LEX_CSTRING &cs_name,
const LEX_CSTRING &cl_name,
bool error_on_conflicting_duplicate,
myf utf8_flag)
{
char charset_name_c[MY_CS_CHARACTER_SET_NAME_SIZE + 1/*for '\0'*/];
strmake(charset_name_c, cs_name.str, cs_name.length);
CHARSET_INFO *cs= get_charset_by_csname(charset_name_c,
MY_CS_PRIMARY, utf8_flag);
if (!cs)
{
my_error(ER_UNKNOWN_CHARACTER_SET, MYF(0), charset_name_c);
return true;
}
char collation_name_c[MY_CS_COLLATION_NAME_SIZE + 1/*for '\0'*/];
strmake(collation_name_c, cl_name.str, cl_name.length);
Lex_exact_collation tmpec(&my_charset_bin);
Lex_extended_collation tmp(tmpec);
if (tmp.set_by_name(collation_name_c, utf8_flag))
return true;
return insert_or_replace(Lex_exact_charset(cs), tmp,
error_on_conflicting_duplicate);
}
bool Charset_collation_map_st::from_text(const LEX_CSTRING &str, myf utf8_flag)
{
init();
Simple_tokenizer stream(str.str, str.length);
/*
Allow relaxed comma parsing:
SET @@character_set_collations=
',,,utf8mb3 = utf8mb3_bin,,latin1 = latin1_bin,,,';
It makes it easier for the user to edit the value
using SQL functions CONCAT or REGEXP_REPLACE.
*/
for ( ; ; )
{
LEX_CSTRING charset_name= stream.get_ident();
if (charset_name.length)
{
if (stream.get_char('='))
return true;
LEX_CSTRING collation_name= stream.get_ident();
if (!collation_name.length)
return true;
/*
Don't allow duplicate conflicting declarations within the same string:
SET @@var='utf8mb3=utf8mb3_general_ci,utf8mb3=utf8mb3_bin';
*/
if (insert_or_replace(charset_name, collation_name,
true/*err on dup*/, utf8_flag))
return true;
}
if (!stream.get_char(','))
continue;
if (stream.eof())
return false;
return true;
}
return false;
}