mariadb/sql/simple_tokenizer.h
Alexander Barkov 75f25e4ca7 MDEV-30164 System variable for default collations
This patch adds a way to override default collations
(or "character set collations") for desired character sets.

The SQL standard says:
> Each collation known in an SQL-environment is applicable to one
> or more character sets, and for each character set, one or more
> collations are applicable to it, one of which is associated with
> it as its character set collation.

In MariaDB, character set collations have been hard-coded so far,
e.g. utf8mb4_general_ci has been the hard-coded character set collation
for utf8mb4.

This patch allows overriding (globally per server, or per session)
character set collations, so that, for example, uca1400_ai_ci can be set as
the character set collation for Unicode character sets
(instead of the compiled-in xxx_general_ci).

The array of overridden character set collations is stored in a new
(session and global) system variable @@character_set_collations and
can be set as a comma separated list of charset=collation pairs, e.g.:

SET @@character_set_collations='utf8mb3=uca1400_ai_ci,utf8mb4=uca1400_ai_ci';

The variable is empty by default, which means that the hard-coded
character set collations are used (e.g. utf8mb4_general_ci for utf8mb4).

The variable can also be set globally by passing it on the server startup
command line and/or by setting it in my.cnf.
2023-07-17 14:56:17 +04:00

85 lines
2 KiB
C++

/* Copyright (c) 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#ifndef SIMPLE_TOKENIZER_INCLUDED
#define SIMPLE_TOKENIZER_INCLUDED
/**
  A minimal forward-only tokenizer over the byte range [str, str + length).

  The tokenizer only stores pointers into the buffer it was constructed
  with; identifiers returned by get_ident() point into that same buffer.

  Recognized lexical elements:
  - spaces:      runs of the ' ' character (no other character is skipped)
  - identifiers: [A-Za-z_][A-Za-z0-9_]*
  - single characters requested explicitly through get_char()
*/
class Simple_tokenizer
{
  const char *m_ptr;  // Current read position
  const char *m_end;  // One past the last byte of the input

public:
  /**
    @param str     Start of the input
    @param length  Number of bytes available at str
  */
  Simple_tokenizer(const char *str, size_t length)
   :m_ptr(str), m_end(str + length)
  { }

  /// The current read position.
  const char *ptr() const { return m_ptr; }

  /// True when the read position has reached (or passed) the end of input.
  bool eof() const { return !(m_ptr < m_end); }

  /// Advance the read position over a (possibly empty) run of ' '.
  void get_spaces()
  {
    while (!eof() && *m_ptr == ' ')
      m_ptr++;
  }

  /// True if ch may begin an identifier: an ASCII letter or '_'.
  bool is_ident_start(char ch) const
  {
    if (ch == '_')
      return true;
    if (ch >= 'a' && ch <= 'z')
      return true;
    return ch >= 'A' && ch <= 'Z';
  }

  /// True if ch may continue an identifier: an ASCII letter, '_' or a digit.
  bool is_ident_body(char ch) const
  {
    if (ch >= '0' && ch <= '9')
      return true;
    return is_ident_start(ch);
  }

  /// True if input remains and the current character may begin an identifier.
  bool is_ident_start() const
  {
    return eof() ? false : is_ident_start(*m_ptr);
  }

  /// True if input remains and the current character may continue an identifier.
  bool is_ident_body() const
  {
    return eof() ? false : is_ident_body(*m_ptr);
  }

  /**
    Skip leading spaces, then consume an identifier.

    @return  The identifier as a {pointer, length} pair referencing the
             input buffer. If no identifier starts at the position reached
             after skipping spaces, the length is 0, the pointer is that
             position, and nothing beyond the spaces is consumed.
  */
  LEX_CSTRING get_ident()
  {
    get_spaces();
    if (!is_ident_start())
      return {m_ptr, 0};

    const char *ident_begin= m_ptr;
    // The first character is already known to be valid: step over it,
    // then keep going while identifier body characters follow.
    do
    {
      m_ptr++;
    } while (is_ident_body());

    return {ident_begin, (size_t) (m_ptr - ident_begin)};
  }

  /**
    Skip leading spaces, then consume the character ch if it comes next.

    @param ch  The expected character
    @retval false  ch was found and consumed
    @retval true   End of input was reached or a different character
                   follows; only the leading spaces were consumed
  */
  bool get_char(char ch)
  {
    get_spaces();
    if (!eof() && *m_ptr == ch)
    {
      m_ptr++;
      return false;
    }
    return true;
  }
};
#endif // SIMPLE_TOKENIZER_INCLUDED