diff --git a/include/m_ctype.h b/include/m_ctype.h index a55222682b0..ee49e94ef99 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -533,6 +533,7 @@ struct my_charset_handler_st extern MY_CHARSET_HANDLER my_charset_8bit_handler; extern MY_CHARSET_HANDLER my_charset_ucs2_handler; +extern MY_CHARSET_HANDLER my_charset_utf8_handler; /* @@ -889,6 +890,18 @@ uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, const char *from, uint32 from_length, CHARSET_INFO *from_cs, uint *errors); +/** + An extended version of my_convert(), to pass non-default mb_wc() and wc_mb(): wc_mb() is paired with to_cs, mb_wc() with from_cs. + For example, String::copy_printable() which is used in + Protocol::store_warning() uses this to escape control + and non-convertible characters. +*/ +uint32 my_convert_using_func(char *to, uint32 to_length, CHARSET_INFO *to_cs, + my_charset_conv_wc_mb wc_mb, + const char *from, uint32 from_length, + CHARSET_INFO *from_cs, + my_charset_conv_mb_wc mb_wc, + uint *errors); /* Convert a string between two character sets. Bad byte sequences as well as characters that cannot be diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 90bc6b51177..112f768ac8f 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -10259,5 +10259,146 @@ Warnings: Note 1003 select `test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`c` = 'A') DROP TABLE t1; # +# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside. 
+# +CREATE PROCEDURE p1() +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2); +SELECT '''', """", '\'', "\""; +SELECT ' \t'; +SELECT ' +\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT _binary'test'; +SELECT _binary'test\0'; +SELECT N'''', N"""", N'\'', N"\""; +SELECT N' \t'; +SELECT N' +\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END$$ +SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES +WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1'; +ROUTINE_DEFINITION +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2); +SELECT '''', """", '''', """"; +SELECT '\t\t'; +SELECT '\n\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT 'test'; +SELECT 'test\0'; +SELECT N'''', N"""", N'''', N""""; +SELECT N'\t\t'; +SELECT N'\n\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END +SELECT body_utf8 FROM mysql.proc WHERE name='p1'; +body_utf8 +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2); +SELECT '''', """", '''', """"; +SELECT '\t\t'; +SELECT '\n\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT 'test'; +SELECT 'test\0'; +SELECT N'''', N"""", N'''', N""""; +SELECT N'\t\t'; +SELECT N'\n\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END +DROP PROCEDURE p1; +SET @@SQL_MODE='NO_BACKSLASH_ESCAPES'; +CREATE PROCEDURE p1() +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2); +SELECT '''', """"; +SELECT ' \t'; +SELECT ' +\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT _binary'test'; +SELECT _binary'test\0'; +SELECT N'''', N""""; +SELECT N' \t'; +SELECT N' +\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END$$ +SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES +WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1'; +ROUTINE_DEFINITION +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = 
',2); +SELECT '''', """"; +SELECT ' \t'; +SELECT ' +\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT 'test'; +SELECT 'test\0'; +SELECT N'''', N""""; +SELECT N' \t'; +SELECT N' +\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END +SELECT body_utf8 FROM mysql.proc WHERE name='p1'; +body_utf8 +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2); +SELECT '''', """"; +SELECT ' \t'; +SELECT ' +\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT 'test'; +SELECT 'test\0'; +SELECT N'''', N""""; +SELECT N' \t'; +SELECT N' +\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END +DROP PROCEDURE p1; +SET @@SQL_MODE=default; +# # End of 10.1 tests # diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result index ac53a7e5a4e..f969369242e 100644 --- a/mysql-test/r/ctype_utf8mb4.result +++ b/mysql-test/r/ctype_utf8mb4.result @@ -3382,5 +3382,19 @@ SET NAMES utf8mb4; SELECT * FROM `test😁😁test`; ERROR HY000: Invalid utf8mb4 character string: 'test\xF0\x9F\x98\x81\xF0\x9F\x98\x81test' # +# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside. 
+# +SET NAMES utf8mb4; +CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4 +RETURN CONCAT('😎','x😎','😎y','x😎y'); +SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES +WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1'; +ROUTINE_DEFINITION +RETURN CONCAT('?','x?','?y','x?y') +SELECT body_utf8 FROM mysql.proc WHERE name='f1'; +body_utf8 +RETURN CONCAT('?','x?','?y','x?y') +DROP FUNCTION f1; +# # End of 10.1 tests # diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index 014194d78e7..85a7ad0b92d 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -1871,6 +1871,82 @@ SELECT * FROM t1 WHERE c>=_utf8'a' COLLATE utf8_general_ci AND c='A'; DROP TABLE t1; +--echo # +--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside. +--echo # +DELIMITER $$; +CREATE PROCEDURE p1() +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2); +SELECT '''', """", '\'', "\""; +SELECT ' \t'; +SELECT ' +\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT _binary'test'; +SELECT _binary'test\0'; +SELECT N'''', N"""", N'\'', N"\""; +SELECT N' \t'; +SELECT N' +\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END$$ +DELIMITER ;$$ +SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES +WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1'; +SELECT body_utf8 FROM mysql.proc WHERE name='p1'; +DROP PROCEDURE p1; + +SET @@SQL_MODE='NO_BACKSLASH_ESCAPES'; +DELIMITER $$; +CREATE PROCEDURE p1() +BEGIN +SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2); +SELECT '''', """"; +SELECT ' \t'; +SELECT ' +\n'; +SELECT 'test'; +SELECT 'tëst'; +SELECT 'test\0'; +SELECT 'tëst\0'; +SELECT _binary'test'; +SELECT _binary'test\0'; +SELECT N'''', N""""; +SELECT N' \t'; +SELECT N' +\n'; +SELECT N'test'; +SELECT N'tëst'; +SELECT N'test\0'; +SELECT N'tëst\0'; +END$$ +DELIMITER ;$$ +SELECT ROUTINE_DEFINITION FROM 
INFORMATION_SCHEMA.ROUTINES +WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1'; +SELECT body_utf8 FROM mysql.proc WHERE name='p1'; +DROP PROCEDURE p1; +SET @@SQL_MODE=default; + + +# TODO: Uncomment the below test when we fix: +# MDEV-9623 INFORMATION_SCHEMA.ROUTINES.ROUTINE_DEFINITION does not handle binary literals well +# +#SET NAMES binary; +#CREATE FUNCTION f1() RETURNS TEXT RETURN CONCAT('i','й'); +#SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES +#WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1'; +#SELECT body_utf8 FROM mysql.proc WHERE name='f1'; +#DROP FUNCTION f1; +#SET NAMES utf8; + + --echo # --echo # End of 10.1 tests --echo # diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test index 3f2b6000597..2fe9b5e6544 100644 --- a/mysql-test/t/ctype_utf8mb4.test +++ b/mysql-test/t/ctype_utf8mb4.test @@ -1904,6 +1904,18 @@ SET NAMES utf8mb4; --error ER_INVALID_CHARACTER_STRING SELECT * FROM `test😁😁test`; +--echo # +--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside. +--echo # +# Non-BMP characters should be replaced with '?' 
in ROUTINE_DEFINITION/body_utf8 +SET NAMES utf8mb4; +CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4 +RETURN CONCAT('😎','x😎','😎y','x😎y'); +SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES +WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1'; +SELECT body_utf8 FROM mysql.proc WHERE name='f1'; +DROP FUNCTION f1; + --echo # --echo # End of 10.1 tests --echo # diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index 898e3ae33c6..994221c60d1 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -324,9 +324,7 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr) DBUG_ASSERT(begin_ptr); DBUG_ASSERT(m_cpp_buf <= begin_ptr && begin_ptr <= m_cpp_buf + m_buf_length); - uint body_utf8_length= - (m_buf_length / thd->variables.character_set_client->mbminlen) * - my_charset_utf8_bin.mbmaxlen; + uint body_utf8_length= get_body_utf8_maximum_length(thd); m_body_utf8= (char *) thd->alloc(body_utf8_length + 1); m_body_utf8_ptr= m_body_utf8; @@ -335,6 +333,22 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr) m_cpp_utf8_processed_ptr= begin_ptr; } + +uint Lex_input_stream::get_body_utf8_maximum_length(THD *thd) +{ + /* + String literals can grow during escaping: + 1a. Character string '[TAB]' (one literal tab character) can grow to '\t', 3 bytes to 4 bytes growth. + 1b. Character string '1000 times [TAB]' grows from + 1002 to 2002 bytes (including quotes), which gives a little bit + less than 2 times growth. + "2" should be a reasonable multiplier that safely covers escaping needs. + */ + return (m_buf_length / thd->variables.character_set_client->mbminlen) * + my_charset_utf8_bin.mbmaxlen * 2/*for escaping*/; +} + + /** @brief The operation appends unprocessed part of pre-processed buffer till the given pointer (ptr) and sets m_cpp_utf8_processed_ptr to end_ptr. @@ -402,15 +416,15 @@ void Lex_input_stream::body_utf8_append(const char *ptr) operation. 
*/ -void Lex_input_stream::body_utf8_append_literal(THD *thd, - const LEX_STRING *txt, - CHARSET_INFO *txt_cs, - const char *end_ptr) +void Lex_input_stream::body_utf8_append_ident(THD *thd, + const LEX_STRING *txt, + const char *end_ptr) { if (!m_cpp_utf8_processed_ptr) return; LEX_STRING utf_txt; + CHARSET_INFO *txt_cs= thd->charset(); if (!my_charset_same(txt_cs, &my_charset_utf8_general_ci)) { @@ -434,6 +448,189 @@ void Lex_input_stream::body_utf8_append_literal(THD *thd, m_cpp_utf8_processed_ptr= end_ptr; } + + + +extern "C" { + +/** + Escape a character. Consecutively puts "escape" and "wc" characters into + the destination utf8 string. + @param cs - the character set (utf8) + @param escape - the escape character (backslash, single quote, double quote) + @param wc - the character to be escaped + @param str - the destination string + @param end - the end of the destination string + @returns - a code according to the wc_mb() convention. +*/ +int my_wc_mb_utf8_with_escape(CHARSET_INFO *cs, my_wc_t escape, my_wc_t wc, + uchar *str, uchar *end) +{ + DBUG_ASSERT(escape > 0); + if (str + 1 >= end) + return MY_CS_TOOSMALL2; // Not enough space, need at least two bytes. + *str= escape; + int cnvres= my_charset_utf8_handler.wc_mb(cs, wc, str + 1, end); + if (cnvres > 0) + return cnvres + 1; // The character was normally put + if (cnvres == MY_CS_ILUNI) + return MY_CS_ILUNI; // Could not encode "wc" (e.g. non-BMP character) + DBUG_ASSERT(cnvres <= MY_CS_TOOSMALL); + return cnvres - 1; // Not enough space; presumably reports one more byte than wc_mb() asked for, to cover the escape - TODO confirm +} + + +/** + Optionally escape a character. + If "escape" is non-zero, then both "escape" and "ewc" are put to + the destination string. Otherwise, only "wc" is put. + @param cs - the character set (utf8) + @param wc - the character to be optionally escaped + @param escape - the escape character, or 0 + @param ewc - the escaped replacement of "wc" (e.g. 
't' for '\t') + @param str - the destination string + @param end - the end of the destination string + @returns - a code according to the wc_mb() convention. +*/ +int my_wc_mb_utf8_opt_escape(CHARSET_INFO *cs, + my_wc_t wc, my_wc_t escape, my_wc_t ewc, + uchar *str, uchar *end) +{ + return escape ? my_wc_mb_utf8_with_escape(cs, escape, ewc, str, end) : + my_charset_utf8_handler.wc_mb(cs, wc, str, end); +} + +/** + Encode a character with optional backslash escaping and quote escaping. + Quote marks are escaped using another quote mark. + Additionally, if "escape" is non-zero, then special characters are + also escaped using "escape". + Otherwise (if "escape" is zero, e.g. in case of MODE_NO_BACKSLASH_ESCAPES), + then special characters are not escaped and handled as normal characters. + + @param cs - the character set (utf8) + @param wc - the character to be encoded + @param str - the destination string + @param end - the end of the destination string + @param sep - the string delimiter (e.g. ' or ") + @param escape - the escape character (backslash, or 0) + @returns - a code according to the wc_mb() convention. 
+*/ +int my_wc_mb_utf8_escape(CHARSET_INFO *cs, my_wc_t wc, uchar *str, uchar *end, + my_wc_t sep, my_wc_t escape) +{ + DBUG_ASSERT(escape == 0 || escape == '\\'); + DBUG_ASSERT(sep == '"' || sep == '\''); + switch (wc) { + case 0: return my_wc_mb_utf8_opt_escape(cs, wc, escape, '0', str, end); + case '\t': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 't', str, end); + case '\r': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'r', str, end); + case '\n': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'n', str, end); + case '\032': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'Z', str, end); + case '\'': + case '\"': + if (wc == sep) + return my_wc_mb_utf8_with_escape(cs, wc, wc, str, end); // The delimiter is escaped by doubling it + } + return my_charset_utf8_handler.wc_mb(cs, wc, str, end); // No escaping needed. NOTE(review): a backslash is not in the list above, so it is stored unescaped even when "escape" is a backslash - confirm this is intended +} + + +/** wc_mb() compatible routines for all sql_mode and delimiter combinations; each one forwards to my_wc_mb_utf8_escape() with a fixed delimiter and escape character */ +int my_wc_mb_utf8_escape_single_quote_and_backslash(CHARSET_INFO *cs, + my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', '\\'); +} + + +int my_wc_mb_utf8_escape_double_quote_and_backslash(CHARSET_INFO *cs, + my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '"', '\\'); +} + + +int my_wc_mb_utf8_escape_single_quote(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', 0); +} + + +int my_wc_mb_utf8_escape_double_quote(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '"', 0); +} + +}; // End of extern "C" + + +/** + Get an escaping function, depending on the current sql_mode and the + string separator. +*/ +my_charset_conv_wc_mb +Lex_input_stream::get_escape_func(THD *thd, my_wc_t sep) const +{ + return thd->backslash_escapes() ? + (sep == '"' ? my_wc_mb_utf8_escape_double_quote_and_backslash: + my_wc_mb_utf8_escape_single_quote_and_backslash) : + (sep == '"' ? 
my_wc_mb_utf8_escape_double_quote: + my_wc_mb_utf8_escape_single_quote); +} + + +/** + Append a text literal to the end of m_body_utf8. + The string is escaped according to the current sql_mode and the + string delimiter (e.g. ' or "). + + @param thd - current THD + @param txt - the string to be appended to m_body_utf8. + Note, the string must be already unescaped. + @param cs - the character set of the string + @param end_ptr - m_cpp_utf8_processed_ptr will be set to this value + (see body_utf8_append_ident for details) + @param sep - the string delimiter (single or double quote) +*/ +void Lex_input_stream::body_utf8_append_escape(THD *thd, + const LEX_STRING *txt, + CHARSET_INFO *cs, + const char *end_ptr, + my_wc_t sep) +{ + DBUG_ASSERT(sep == '\'' || sep == '"'); + if (!m_cpp_utf8_processed_ptr) + return; + uint errors; + /** + We previously allocated m_body_utf8 to be able to store the query with all + strings properly escaped. See get_body_utf8_maximum_length(). + So here we are guaranteed to have enough space to append any string literal + with escaping. Passing txt->length*2 as "available space" is always safe. NOTE(review): it also caps the output, so a literal needing more than 2x its byte length after conversion and escaping may be truncated - confirm. + For better safety purposes we could calculate get_body_utf8_maximum_length() + every time we append a string, but this would affect performance negatively, + so let's check that we don't get beyond the allocated buffer in + debug build only. 
+ */ + DBUG_ASSERT(m_body_utf8 + get_body_utf8_maximum_length(thd) >= + m_body_utf8_ptr + txt->length * 2); + uint32 cnv_length= my_convert_using_func(m_body_utf8_ptr, txt->length * 2, + &my_charset_utf8_general_ci, + get_escape_func(thd, sep), + txt->str, txt->length, + cs, cs->cset->mb_wc, + &errors); + m_body_utf8_ptr+= cnv_length; + *m_body_utf8_ptr= 0; + m_cpp_utf8_processed_ptr= end_ptr; +} + + void Lex_input_stream::add_digest_token(uint token, LEX_YYSTYPE yylval) { if (m_digest != NULL) @@ -797,14 +994,14 @@ Lex_input_stream::unescape(CHARSET_INFO *cs, char *to, Fix sometimes to do only one scan of the string */ -bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip) +bool Lex_input_stream::get_text(LEX_STRING *dst, uint sep, + int pre_skip, int post_skip) { - reg1 uchar c,sep; + reg1 uchar c; uint found_escape=0; CHARSET_INFO *cs= m_thd->charset(); tok_bitmap= 0; - sep= yyGetLast(); // String should end with this while (! eof()) { c= yyGet(); @@ -1169,6 +1366,8 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) return((int) c); case MY_LEX_IDENT_OR_NCHAR: + { + uint sep; if (lip->yyPeek() != '\'') { state= MY_LEX_IDENT; @@ -1176,14 +1375,20 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) } /* Found N'string' */ lip->yySkip(); // Skip ' - if (lip->get_text(&yylval->lex_str, 2, 1)) + if (lip->get_text(&yylval->lex_str, (sep= lip->yyGetLast()), 2, 1)) { state= MY_LEX_CHAR; // Read char by char break; } + + lip->body_utf8_append(lip->m_cpp_text_start); + lip->body_utf8_append_escape(thd, &yylval->lex_str, + national_charset_info, + lip->m_cpp_text_end, sep); + lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 
0 : 1; return(NCHAR_STRING); - + } case MY_LEX_IDENT_OR_HEX: if (lip->yyPeek() == '\'') { // Found x'hex-number' @@ -1286,8 +1491,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) lip->body_utf8_append(lip->m_cpp_text_start); - lip->body_utf8_append_literal(thd, &yylval->lex_str, cs, - lip->m_cpp_text_end); + lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end); return(result_state); // IDENT or IDENT_QUOTED @@ -1391,8 +1595,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) lip->body_utf8_append(lip->m_cpp_text_start); - lip->body_utf8_append_literal(thd, &yylval->lex_str, cs, - lip->m_cpp_text_end); + lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end); return(result_state); @@ -1435,8 +1638,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) lip->body_utf8_append(lip->m_cpp_text_start); - lip->body_utf8_append_literal(thd, &yylval->lex_str, cs, - lip->m_cpp_text_end); + lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end); return(IDENT_QUOTED); } @@ -1541,23 +1743,23 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) } /* " used for strings */ case MY_LEX_STRING: // Incomplete text string - if (lip->get_text(&yylval->lex_str, 1, 1)) + { + uint sep; + if (lip->get_text(&yylval->lex_str, (sep= lip->yyGetLast()), 1, 1)) { state= MY_LEX_CHAR; // Read char by char break; } - + CHARSET_INFO *strcs= lip->m_underscore_cs ? lip->m_underscore_cs : cs; lip->body_utf8_append(lip->m_cpp_text_start); - lip->body_utf8_append_literal(thd, &yylval->lex_str, - lip->m_underscore_cs ? lip->m_underscore_cs : cs, - lip->m_cpp_text_end); - + lip->body_utf8_append_escape(thd, &yylval->lex_str, strcs, + lip->m_cpp_text_end, sep); lip->m_underscore_cs= NULL; lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 
0 : 1; return(TEXT_STRING); - + } case MY_LEX_COMMENT: // Comment lex->select_lex.options|= OPTION_FOUND_COMMENT; while ((c = lip->yyGet()) != '\n' && c) ; @@ -1806,8 +2008,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) lip->body_utf8_append(lip->m_cpp_text_start); - lip->body_utf8_append_literal(thd, &yylval->lex_str, cs, - lip->m_cpp_text_end); + lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end); return(result_state); } diff --git a/sql/sql_lex.h b/sql/sql_lex.h index 644d1a19cfd..72a9ac14936 100644 --- a/sql/sql_lex.h +++ b/sql/sql_lex.h @@ -1807,6 +1807,7 @@ class Lex_input_stream { size_t unescape(CHARSET_INFO *cs, char *to, const char *str, const char *end, int sep); + my_charset_conv_wc_mb get_escape_func(THD *thd, my_wc_t sep) const; public: Lex_input_stream() { @@ -2077,14 +2078,23 @@ public: return (uint) (m_body_utf8_ptr - m_body_utf8); } + /** + Get the maximum length of the utf8-body buffer. + The utf8 body can grow because of the character set conversion and escaping. + */ + uint get_body_utf8_maximum_length(THD *thd); + void body_utf8_start(THD *thd, const char *begin_ptr); void body_utf8_append(const char *ptr); void body_utf8_append(const char *ptr, const char *end_ptr); - void body_utf8_append_literal(THD *thd, - const LEX_STRING *txt, - CHARSET_INFO *txt_cs, - const char *end_ptr); - + void body_utf8_append_ident(THD *thd, + const LEX_STRING *txt, + const char *end_ptr); + void body_utf8_append_escape(THD *thd, + const LEX_STRING *txt, + CHARSET_INFO *txt_cs, + const char *end_ptr, + my_wc_t sep); /** Current thread. 
*/ THD *m_thd; @@ -2105,7 +2115,7 @@ public: /** LALR(2) resolution, value of the look ahead token.*/ LEX_YYSTYPE lookahead_yylval; - bool get_text(LEX_STRING *to, int pre_skip, int post_skip); + bool get_text(LEX_STRING *to, uint sep, int pre_skip, int post_skip); void add_digest_token(uint token, LEX_YYSTYPE yylval); diff --git a/strings/ctype.c b/strings/ctype.c index f871a219245..620c7e13503 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -1030,19 +1030,18 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs) @return Number of bytes copied to 'to' string */ -static uint32 -my_convert_internal(char *to, uint32 to_length, - CHARSET_INFO *to_cs, - const char *from, uint32 from_length, - CHARSET_INFO *from_cs, uint *errors) +uint32 +my_convert_using_func(char *to, uint32 to_length, + CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb, + const char *from, uint32 from_length, + CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc, + uint *errors) { int cnvres; my_wc_t wc; const uchar *from_end= (const uchar*) from + from_length; char *to_start= to; uchar *to_end= (uchar*) to + to_length; - my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc; - my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; uint error_count= 0; while (1) @@ -1119,8 +1118,11 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, immediately switch to slow mb_wc->wc_mb method. 
*/ if ((to_cs->state | from_cs->state) & MY_CS_NONASCII) - return my_convert_internal(to, to_length, to_cs, - from, from_length, from_cs, errors); + return my_convert_using_func(to, to_length, + to_cs, to_cs->cset->wc_mb, + from, from_length, + from_cs, from_cs->cset->mb_wc, + errors); length= length2= MY_MIN(to_length, from_length); @@ -1152,9 +1154,11 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, uint32 copied_length= length2 - length; to_length-= copied_length; from_length-= copied_length; - return copied_length + my_convert_internal(to, to_length, to_cs, - from, from_length, from_cs, - errors); + return copied_length + my_convert_using_func(to, to_length, to_cs, + to_cs->cset->wc_mb, + from, from_length, from_cs, + from_cs->cset->mb_wc, + errors); } }