mariadb/sql/lex_string.h

186 lines
4.7 KiB
C
Raw Normal View History

/*
Copyright (c) 2018, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
#ifndef LEX_STRING_INCLUDED
#define LEX_STRING_INCLUDED
typedef struct st_mysql_const_lex_string LEX_CSTRING;
class Lex_cstring : public LEX_CSTRING
{
public:
MDEV-31340 Remove MY_COLLATION_HANDLER::strcasecmp() This patch also fixes: MDEV-33050 Build-in schemas like oracle_schema are accent insensitive MDEV-33084 LASTVAL(t1) and LASTVAL(T1) do not work well with lower-case-table-names=0 MDEV-33085 Tables T1 and t1 do not work well with ENGINE=CSV and lower-case-table-names=0 MDEV-33086 SHOW OPEN TABLES IN DB1 -- is case insensitive with lower-case-table-names=0 MDEV-33088 Cannot create triggers in the database `MYSQL` MDEV-33103 LOCK TABLE t1 AS t2 -- alias is not case sensitive with lower-case-table-names=0 MDEV-33109 DROP DATABASE MYSQL -- does not drop SP with lower-case-table-names=0 MDEV-33110 HANDLER commands are case insensitive with lower-case-table-names=0 MDEV-33119 User is case insensitive in INFORMATION_SCHEMA.VIEWS MDEV-33120 System log table names are case insensitive with lower-case-table-names=0 - Removing the virtual function strcasecmp() from MY_COLLATION_HANDLER - Adding a wrapper function CHARSET_INFO::streq(), to compare two strings for equality. For now it calls strnncoll() internally. In the future it will turn into a virtual function. - Adding new accent sensitive case insensitive collations: - utf8mb4_general1400_as_ci - utf8mb3_general1400_as_ci They implement accent sensitive case insensitive comparison. The weight of a character is equal to the code point of its upper case variant. These collations use Unicode-14.0.0 casefolding data. The result of my_charset_utf8mb3_general1400_as_ci.strcoll() is very close to the former my_charset_utf8mb3_general_ci.strcasecmp() There is only a difference in a couple dozen rare characters, because: - the switch from "tolower" to "toupper" comparison, to make utf8mb3_general1400_as_ci closer to utf8mb3_general_ci - the switch from Unicode-3.0.0 to Unicode-14.0.0 This difference should be tolerable. See the list of affected characters in the MDEV description. Note, utf8mb4_general1400_as_ci correctly handles non-BMP characters! 
Unlike utf8mb4_general_ci, it does not treat all non-BMP characters as equal. - Adding classes representing names of the file based database objects: Lex_ident_db Lex_ident_table Lex_ident_trigger Their comparison collation depends on the underlying file system case sensitivity and on --lower-case-table-names and can be either my_charset_bin or my_charset_utf8mb3_general1400_as_ci. - Adding classes representing names of other database objects, whose names have case insensitive comparison style, using my_charset_utf8mb3_general1400_as_ci: Lex_ident_column Lex_ident_sys_var Lex_ident_user_var Lex_ident_sp_var Lex_ident_ps Lex_ident_i_s_table Lex_ident_window Lex_ident_func Lex_ident_partition Lex_ident_with_element Lex_ident_rpl_filter Lex_ident_master_info Lex_ident_host Lex_ident_locale Lex_ident_plugin Lex_ident_engine Lex_ident_server Lex_ident_savepoint Lex_ident_charset engine_option_value::Name - All the mentioned Lex_ident_xxx classes implement a method streq(): if (ident1.streq(ident2)) do_equal(); This method works as a wrapper for CHARSET_INFO::streq(). - Changing a lot of "LEX_CSTRING name" to "Lex_ident_xxx name" in class members and in function/method parameters. - Replacing all calls like system_charset_info->coll->strcasecmp(ident1, ident2) with ident1.streq(ident2) - Taking advantage of the C++11 user defined literal operator for LEX_CSTRING (see m_strings.h) and Lex_ident_xxx (see lex_ident.h) data types. Use example: const Lex_ident_column primary_key_name= "PRIMARY"_Lex_ident_column; is now a shorter version of: const Lex_ident_column primary_key_name= Lex_ident_column({STRING_WITH_LEN("PRIMARY")});
2023-04-26 15:27:01 +04:00
/* Construct an empty string: NULL pointer and zero length. */
constexpr Lex_cstring()
:LEX_CSTRING({NULL, 0})
{ }
/* Wrap an existing LEX_CSTRING: the base sub-object is copy-initialized from "str". */
constexpr Lex_cstring(const LEX_CSTRING &str)
:LEX_CSTRING(str)
{ }
/* Construct from a pointer "_str" and an explicit length "_len". */
constexpr Lex_cstring(const char *_str, size_t _len)
:LEX_CSTRING({_str, _len})
{ }
Lex_cstring(const char *start, const char *end)
{
DBUG_ASSERT(start <= end);
str= start;
length= end - start;
}
MDEV-31340 Remove MY_COLLATION_HANDLER::strcasecmp() This patch also fixes: MDEV-33050 Build-in schemas like oracle_schema are accent insensitive MDEV-33084 LASTVAL(t1) and LASTVAL(T1) do not work well with lower-case-table-names=0 MDEV-33085 Tables T1 and t1 do not work well with ENGINE=CSV and lower-case-table-names=0 MDEV-33086 SHOW OPEN TABLES IN DB1 -- is case insensitive with lower-case-table-names=0 MDEV-33088 Cannot create triggers in the database `MYSQL` MDEV-33103 LOCK TABLE t1 AS t2 -- alias is not case sensitive with lower-case-table-names=0 MDEV-33109 DROP DATABASE MYSQL -- does not drop SP with lower-case-table-names=0 MDEV-33110 HANDLER commands are case insensitive with lower-case-table-names=0 MDEV-33119 User is case insensitive in INFORMATION_SCHEMA.VIEWS MDEV-33120 System log table names are case insensitive with lower-case-table-names=0 - Removing the virtual function strcasecmp() from MY_COLLATION_HANDLER - Adding a wrapper function CHARSET_INFO::streq(), to compare two strings for equality. For now it calls strnncoll() internally. In the future it will turn into a virtual function. - Adding new accent sensitive case insensitive collations: - utf8mb4_general1400_as_ci - utf8mb3_general1400_as_ci They implement accent sensitive case insensitive comparison. The weight of a character is equal to the code point of its upper case variant. These collations use Unicode-14.0.0 casefolding data. The result of my_charset_utf8mb3_general1400_as_ci.strcoll() is very close to the former my_charset_utf8mb3_general_ci.strcasecmp() There is only a difference in a couple dozen rare characters, because: - the switch from "tolower" to "toupper" comparison, to make utf8mb3_general1400_as_ci closer to utf8mb3_general_ci - the switch from Unicode-3.0.0 to Unicode-14.0.0 This difference should be tolerable. See the list of affected characters in the MDEV description. Note, utf8mb4_general1400_as_ci correctly handles non-BMP characters! 
Unlike utf8mb4_general_ci, it does not treat all non-BMP characters as equal. - Adding classes representing names of the file based database objects: Lex_ident_db Lex_ident_table Lex_ident_trigger Their comparison collation depends on the underlying file system case sensitivity and on --lower-case-table-names and can be either my_charset_bin or my_charset_utf8mb3_general1400_as_ci. - Adding classes representing names of other database objects, whose names have case insensitive comparison style, using my_charset_utf8mb3_general1400_as_ci: Lex_ident_column Lex_ident_sys_var Lex_ident_user_var Lex_ident_sp_var Lex_ident_ps Lex_ident_i_s_table Lex_ident_window Lex_ident_func Lex_ident_partition Lex_ident_with_element Lex_ident_rpl_filter Lex_ident_master_info Lex_ident_host Lex_ident_locale Lex_ident_plugin Lex_ident_engine Lex_ident_server Lex_ident_savepoint Lex_ident_charset engine_option_value::Name - All the mentioned Lex_ident_xxx classes implement a method streq(): if (ident1.streq(ident2)) do_equal(); This method works as a wrapper for CHARSET_INFO::streq(). - Changing a lot of "LEX_CSTRING name" to "Lex_ident_xxx name" in class members and in function/method parameters. - Replacing all calls like system_charset_info->coll->strcasecmp(ident1, ident2) with ident1.streq(ident2) - Taking advantage of the C++11 user defined literal operator for LEX_CSTRING (see m_strings.h) and Lex_ident_xxx (see lex_ident.h) data types. Use example: const Lex_ident_column primary_key_name= "PRIMARY"_Lex_ident_column; is now a shorter version of: const Lex_ident_column primary_key_name= Lex_ident_column({STRING_WITH_LEN("PRIMARY")});
2023-04-26 15:27:01 +04:00
/*
  Byte-wise (binary) equality with another LEX_CSTRING.

  @param rhs  the string to compare with
  @return     true if both strings have the same length and the same bytes,
              false otherwise
*/
bool bin_eq(const LEX_CSTRING &rhs) const
{
  if (length != rhs.length)
    return false;
  /*
    Two empty strings are equal. Do not call memcmp() for them:
    an empty string may carry a NULL "str", and passing NULL to memcmp()
    is undefined even with a zero size.
  */
  return length == 0 || !memcmp(str, rhs.str, length);
}
/*
  Re-point this string at "_str" with the length "_len".
  Only the pointer and the length are stored, no data is copied.
*/
void set(const char *_str, size_t _len)
{
  length= _len;
  str= _str;
}
MDEV-30662 SQL/PL package body does not appear in I_S.ROUTINES.ROUTINE_DEFINITION - Moving the code from a public function trim_whitespaces() to the class Lex_cstring as methods. This code may be useful in other contexts, and also this code becomes visible inside sql_class.h - Adding a helper method THD::strmake_lex_cstring_trim_whitespaces() - Unifying the way CREATE PROCEDURE/CREATE FUNCTION and CREATE PACKAGE/CREATE PACKAGE BODY work: a) Now CREATE PACKAGE/CREATE PACKAGE BODY also calls Lex->sphead->set_body_start() to remember the cpp body start inside an sp_head member. b) adding a "const char *cpp_body_end" parameter to sp_head::set_stmt_end(). These changes made it possible to reuse sp_head::set_stmt_end() inside LEX::create_package_finalize() and remove the duplicate code. - Renaming sp_head::m_body_begin to m_cpp_body_begin and adding a comment to make it clear that this member is used only during parsing, and points to a fragment inside the cpp buffer. - Changed sp_head::set_body_start() and sp_head::set_stmt_end() to skip the calls related to "body_utf8" in cases when m_parent is not NULL. A non-NULL m_parent means that we're inside a package routine. "body_utf8" in such case belongs not to the current sphead itself, but to parent (the package) sphead. So an sphead instance of a package routine should neither initialize, nor finalize, nor change in any other way the "body_utf8" related members of Lex_input_stream, and should not take over or copy "body_utf8" data from Lex_input_stream to "this".
2023-07-14 12:14:56 +04:00
/*
  Return this string without its leading white-space characters,
  as classified by my_isspace() for the character set "cs".
  The object itself is not changed.

  Assumes that no multi-byte character can be considered white-space
  (cs->mbminlen == 1 is asserted).
*/
Lex_cstring ltrim_whitespace(CHARSET_INFO *cs) const
{
  DBUG_ASSERT(cs->mbminlen == 1);
  size_t skipped= 0;
  while (skipped < length && my_isspace(cs, str[skipped]))
    skipped++;
  return Lex_cstring(str + skipped, length - skipped);
}
/*
  Return this string without its trailing white-space characters,
  as classified by my_isspace() for the character set "cs".
  The object itself is not changed.

  Assumes that no multi-byte character can be considered white-space
  (cs->mbminlen == 1 is asserted), and that the character set
  supports backward space parsing.
*/
Lex_cstring rtrim_whitespace(CHARSET_INFO *cs) const
{
  DBUG_ASSERT(cs->mbminlen == 1);
  size_t kept= length;
  for ( ; kept > 0; kept--)
  {
    if (!my_isspace(cs, str[kept - 1]))
      break;
  }
  return Lex_cstring(str, kept);
}
/*
  Return this string with white space removed on both sides:
  the left side is trimmed first, then the right side.
*/
Lex_cstring trim_whitespace(CHARSET_INFO *cs) const
{
  const Lex_cstring head_trimmed= ltrim_whitespace(cs);
  return head_trimmed.rtrim_whitespace(cs);
}
/*
  Return this string with white space removed on both sides.
  If "prefix_length" is not NULL, it receives the length of the
  leading white-space sequence that was skipped.
*/
Lex_cstring trim_whitespace(CHARSET_INFO *cs, size_t *prefix_length) const
{
  const Lex_cstring head_trimmed= ltrim_whitespace(cs);
  if (prefix_length)
  {
    // Distance between the original start and the first kept byte
    *prefix_length= head_trimmed.str - str;
  }
  return head_trimmed.rtrim_whitespace(cs);
}
/*
  Return the "n" leftmost bytes if this string is longer than "n" bytes,
  or the whole string otherwise.
*/
Lex_cstring left(size_t n) const
{
  const size_t prefix_length= length < n ? length : n;
  return Lex_cstring(str, prefix_length);
}
/*
  Return the substring starting at the byte position "pos" and
  running until the end of this string.
  If this string is not longer than "pos" bytes, an empty string
  pointing at the end of this string is returned.
*/
Lex_cstring substr(size_t pos) const
{
  if (pos < length)
    return Lex_cstring(str + pos, length - pos);
  return Lex_cstring(str + length, (size_t) 0);
}
/*
  Check if this string begins with the bytes of "rhs".
  Both this->str and rhs.str must be non-NULL (asserted with DBUG_ASSERT).
*/
bool starts_with(const LEX_CSTRING &rhs) const
{
  DBUG_ASSERT(str);
  DBUG_ASSERT(rhs.str);
  if (length < rhs.length)
    return false;                       // Too short to contain "rhs"
  return memcmp(str, rhs.str, rhs.length) == 0;
}
};
/*
  A Lex_cstring initialized from a C string, with the length
  calculated by strlen(). A NULL pointer gives a zero length.
*/
class Lex_cstring_strlen: public Lex_cstring
{
public:
  Lex_cstring_strlen(const char *from)
   :Lex_cstring(from, !from ? (size_t) 0 : strlen(from))
  { }
};
/* Functions to compare if two lex strings are equal */
/*
Compare to LEX_CSTRING's and return 0 if equal
*/
static inline bool cmp(const LEX_CSTRING *a, const LEX_CSTRING *b)
{
2020-11-02 15:48:47 +02:00
return a->length != b->length ||
(a->length && memcmp(a->str, b->str, a->length));
}
static inline bool cmp(const LEX_CSTRING a, const LEX_CSTRING b)
{
2020-11-03 14:49:17 +02:00
return a.length != b.length || (a.length && memcmp(a.str, b.str, a.length));
}
/*
  Compare if two LEX_CSTRING are equal, ignoring letter case.
  Assumption is that character set is ASCII (like for plugin names).

  The comparison is bounded by the stored length, so the buffers
  do not have to be NUL-terminated.

  @param a  the first string
  @param b  the second string
  @return   true if the strings are equal, false otherwise
*/
static inline bool lex_string_eq(const LEX_CSTRING *a, const LEX_CSTRING *b)
{
  if (a->length != b->length)
    return false;                               /* Different */
  if (a->length == 0)
    return true;                                /* Both empty, "str" unused */
  return strncasecmp(a->str, b->str, a->length) == 0;
}
/*
  Compare a LEX_CSTRING with a pointer/length pair, ignoring letter case.
  To be used when calling lex_string_eq with STRING_WITH_LEN() as second
  argument.

  The comparison is bounded by the given length, so the buffers
  do not have to be NUL-terminated.

  @param a         the first string
  @param b         pointer to the second string
  @param b_length  length of the second string
  @return          true if the strings are equal, false otherwise
*/
static inline bool lex_string_eq(const LEX_CSTRING *a, const char *b, size_t b_length)
{
  if (a->length != b_length)
    return false;                               /* Different */
  if (b_length == 0)
    return true;                                /* Both empty, pointers unused */
  return strncasecmp(a->str, b, b_length) == 0;
}
#endif /* LEX_STRING_INCLUDED */