mariadb/sql/charset_collations.h
Alexander Barkov 75f25e4ca7 MDEV-30164 System variable for default collations
This patch adds a way to override default collations
(or "character set collations") for desired character sets.

The SQL standard says:
> Each collation known in an SQL-environment is applicable to one
> or more character sets, and for each character set, one or more
> collations are applicable to it, one of which is associated with
> it as its character set collation.

In MariaDB, character set collations have been hard-coded so far,
e.g. utf8mb4_general_ci has been a hard-coded character set collation
for utf8mb4.

This patch allows overriding (globally per server, or per session)
character set collations, so for example, uca1400_ai_ci can be set as a
character set collation for Unicode character sets
(instead of compiled xxx_general_ci).

The array of overridden character set collations is stored in a new
(session and global) system variable @@character_set_collations and
can be set as a comma separated list of charset=collation pairs, e.g.:

SET @@character_set_collations='utf8mb3=uca1400_ai_ci,utf8mb4=uca1400_ai_ci';

The variable is empty by default, which means that the hard-coded
character set collations are used (e.g. utf8mb4_general_ci for utf8mb4).

The variable can also be set globally by passing it on the server startup
command line, and/or in my.cnf.
2023-07-17 14:56:17 +04:00

247 lines
6.4 KiB
C++

/* Copyright (c) 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#ifndef LEX_CHARSET_COLLATIONS_INCLUDED
#define LEX_CHARSET_COLLATIONS_INCLUDED
#include "sql_used.h"
/*
  A small fixed-capacity map "character set -> collation".

  Every element says which collation (m_to) should be used for a
  character set (m_from). Elements are stored in an in-place array that is
  re-sorted by the numeric ID of the "from" character set after every
  insertion, which lets lookups use binary search.

  The struct declares no constructor; init() puts it into the empty state.
*/
struct Charset_collation_map_st
{
public:

  /*
    One "character set -> collation" pair.
  */
  struct Elem_st
  {
  protected:
    CHARSET_INFO *m_from; // The character set this pair applies to
    CHARSET_INFO *m_to;   // The collation chosen for that character set

    /*
      Copy the bytes of "str" to "dst" (no terminator is appended).
      @return the number of bytes copied
    */
    static size_t print_lex_string(char *dst, const LEX_CSTRING &str)
    {
      memcpy(dst, str.str, str.length);
      return str.length;
    }

  public:
    /*
      Upper bound for the text form of one pair,
      e.g. 'utf8mb4=utf8mb4_unicode_ai_ci':
      a character set name, the '=' sign, a collation name.
    */
    static constexpr size_t text_size_max()
    {
      return MY_CS_CHARACTER_SET_NAME_SIZE + 1 +
             MY_CS_COLLATION_NAME_SIZE;
    }

    CHARSET_INFO *from() const
    {
      return m_from;
    }

    CHARSET_INFO *to() const
    {
      return m_to;
    }

    void set_to(CHARSET_INFO *cl)
    {
      m_to= cl;
    }

    /*
      Write "charset_name=collation_name" to "dst".
      No terminating zero is written and the buffer size is not checked
      here, so the caller must provide enough room.
      @return the number of bytes written
    */
    size_t print(char *dst) const
    {
      char *cur= dst;
      cur+= print_lex_string(cur, m_from->cs_name);
      *cur++= '=';
      cur+= print_lex_string(cur, m_to->coll_name);
      return (size_t) (cur - dst);
    }

    /*
      Three-way comparison by the ID of the "from" character set.
      @return -1, 0 or +1
    */
    int cmp_by_charset_id(const Elem_st &rhs) const
    {
      const uint lhs_id= m_from->number;
      const uint rhs_id= rhs.m_from->number;
      if (lhs_id < rhs_id)
        return -1;
      if (lhs_id > rhs_id)
        return +1;
      return 0;
    }
  };


  /*
    Elem_st with a constructor, for convenient creation of a pair.
  */
  class Elem: public Elem_st
  {
  public:
    Elem(CHARSET_INFO *from, CHARSET_INFO *to)
    {
      m_from= from;
      m_to= to;
    }
  };

protected:
  Elem_st m_element[8]; // Should be enough for now
  uint m_count;         // The number of used slots in m_element
  uint m_version;       // Advanced by set(), reset by init()

  /* qsort() callback: orders elements by the "from" character set ID */
  static int cmp_by_charset_id(const void *a, const void *b)
  {
    const Elem_st *lhs= static_cast<const Elem_st*>(a);
    const Elem_st *rhs= static_cast<const Elem_st*>(b);
    return lhs->cmp_by_charset_id(*rhs);
  }

  /* Restore the ordering that find_elem_by_charset_id() relies on */
  void sort()
  {
    qsort(m_element, m_count, sizeof(m_element[0]), cmp_by_charset_id);
  }

  /*
    Binary search for the element whose "from" character set has the
    given ID.
    @return the element, or NULL if there is no such element
  */
  const Elem_st *find_elem_by_charset_id(uint id) const
  {
    /*
      Closed search window [lo, hi].
      For an empty map hi is -1, so the loop body is never entered.
    */
    int lo= 0;
    int hi= ((int) m_count) - 1;
    while (lo <= hi)
    {
      const int mid= (lo + hi) / 2;
      DBUG_ASSERT(mid >= 0);
      DBUG_ASSERT(mid < (int) m_count);
      const uint mid_id= m_element[mid].from()->number;
      if (mid_id < id)
        lo= mid + 1;
      else if (mid_id > id)
        hi= mid - 1;
      else
        return &m_element[mid];
    }
    return NULL;
  }

  /*
    Append a new element and re-sort the array.
    @return false on success, true if the array is already full
  */
  bool insert(const Elem_st &elem)
  {
    DBUG_ASSERT(elem.from()->state & MY_CS_PRIMARY);
    if (m_count >= array_elements(m_element))
      return true;
    m_element[m_count++]= elem;
    sort();
    return false;
  }

  /*
    If the character set of "elem" is already in the map, replace its
    collation; otherwise insert "elem" as a new element.
    @return false on success, true if a new element did not fit
  */
  bool insert_or_replace(const Elem_st &elem)
  {
    DBUG_ASSERT(elem.from()->state & MY_CS_PRIMARY);
    const Elem_st *found= find_elem_by_charset_id(elem.from()->number);
    if (!found)
      return insert(elem);
    // "found" points into m_element: update that slot through its index
    m_element[found - m_element].set_to(elem.to());
    return false;
  }

public:
  /* Make the map empty and reset its version counter */
  void init()
  {
    m_version= 0;
    m_count= 0;
  }

  uint count() const
  {
    return m_count;
  }

  uint version() const
  {
    return m_version;
  }

  /*
    Copy the contents of "rhs" into this map. The version does not come
    from "rhs": it becomes the version this map had before the call
    plus "version_increment".
  */
  void set(const Charset_collation_map_st &rhs, uint version_increment)
  {
    const uint new_version= m_version + version_increment;
    *this= rhs;
    m_version= new_version;
  }

  /* Access an element by its position, 0 <= pos < count() */
  const Elem_st & operator[](uint pos) const
  {
    DBUG_ASSERT(pos < m_count);
    return m_element[pos];
  }

  bool insert_or_replace(const class Lex_exact_charset &cs,
                         const class Lex_extended_collation &cl,
                         bool error_on_conflicting_duplicate);
  bool insert_or_replace(const LEX_CSTRING &cs,
                         const LEX_CSTRING &cl,
                         bool error_on_conflicting_duplicate,
                         myf utf8_flag);

  /*
    Get the collation to use for the character set "cs".
    CHARACTER_SET_COLLATIONS_USED is raised in "used" on every call,
    whether or not the map has an element for "cs".
    @return the collation from the map, or "cs" itself if there is none
  */
  CHARSET_INFO *get_collation_for_charset(Sql_used *used,
                                          CHARSET_INFO *cs) const
  {
    DBUG_ASSERT(cs->state & MY_CS_PRIMARY);
    used->used|= Sql_used::CHARACTER_SET_COLLATIONS_USED;
    const Elem_st *hit= find_elem_by_charset_id(cs->number);
    return hit ? hit->to() : cs;
  }

  /*
    The buffer size with which print() is guaranteed to output
    all elements.
  */
  size_t text_format_nbytes_needed() const
  {
    return m_count * (Elem_st::text_size_max() + 1/* for ',' */);
  }

  /*
    Print the map as a comma separated list of "charset=collation" pairs.
    No terminating zero is written. Printing stops before the first
    element for which less than the maximum possible element size
    (including the comma) is left in the buffer.
    @return the number of bytes written
  */
  size_t print(char *dst, size_t nbytes_available) const
  {
    char *cur= dst;
    const char *const limit= dst + nbytes_available;
    const size_t room_per_elem= Elem_st::text_size_max() + 1/* for ',' */;
    for (uint i= 0; i < m_count; i++)
    {
      if ((size_t) (limit - cur) < room_per_elem)
        break;
      if (i)
        *cur++= ',';
      cur+= m_element[i].print(cur);
    }
    return (size_t) (cur - dst);
  }

  /* The maximum number of bytes that to_binary() can produce */
  static constexpr size_t binary_size_max()
  {
    return 1/*count*/ + 4 * array_elements(m_element);
  }

  /*
    Pack the map: one byte with the number of elements, then for every
    element two 2-byte IDs written with int2store(): the character set
    ID followed by the collation ID.
    @return the number of bytes written
  */
  size_t to_binary(char *dst) const
  {
    char *cur= dst;
    *cur++= (char) (uchar) m_count;
    const Elem_st *const elem_end= m_element + m_count;
    for (const Elem_st *elem= m_element; elem < elem_end; elem++)
    {
      int2store(cur, (uint16) elem->from()->number);
      cur+= 2;
      int2store(cur, (uint16) elem->to()->number);
      cur+= 2;
    }
    return (size_t) (cur - dst);
  }

  /*
    Unpack the map from the format produced by to_binary().
    The previous contents are always discarded first.
    @return the number of bytes consumed;
            0 if "srclen" is 0 or the input is shorter than
            its count byte requires
  */
  size_t from_binary(const char *src, size_t srclen)
  {
    init();
    if (srclen == 0)
      return 0; // Empty
    const char *cur= src;
    const uint count= (uchar) *cur++;
    if (srclen < 1 + 4 * count)
      return 0; // Truncated
    for (const char *stop= cur + 4 * count; cur < stop; cur+= 4)
    {
      /*
        Unpacking from the binary format happens on the slave side.
        If the slave does not know the character set or the collation
        of a pair, the pair is skipped: it might not even be needed.
        The collation is looked up only if the character set was found.
      */
      CHARSET_INFO *cs= get_charset(uint2korr(cur), MYF(0));
      CHARSET_INFO *cl= cs ? get_charset(uint2korr(cur + 2), MYF(0)) : NULL;
      if (cs && cl)
      {
        // The result is not checked: a pair that does not fit is dropped
        insert_or_replace(Elem(cs, cl));
      }
    }
    return (size_t) (cur - src);
  }

  bool from_text(const LEX_CSTRING &str, myf utf8_flag);
};
#endif // LEX_CHARSET_COLLATIONS_INCLUDED