Adding thorough tests covering what happens with escaped sequences

in the SQL parser.

Various backslash escapes and quote-quote escaped sequences are covered
in combination with single and multi-byte characters.

This is especially important for the character sets that can have 0x5C
as the second byte in a multi-byte character (big5, cp932, gbk, sjis).

swe7 is also a special character set, because in swe7 0x5C is used for
both escape character and for "LATIN CAPITAL LETTER O WITH DIAERESIS".
This commit is contained in:
Alexander Barkov 2014-09-12 12:57:27 +04:00
parent d2ae40ac2e
commit 6a576f1a58
17 changed files with 21909 additions and 0 deletions

View file

@ -0,0 +1,9 @@
#
# Create a table with all byte values
#
CREATE TABLE halfs (a INT);
INSERT INTO halfs VALUES (0x00),(0x01),(0x02),(0x03),(0x04),(0x05),(0x06),(0x07);
INSERT INTO halfs VALUES (0x08),(0x09),(0x0A),(0x0B),(0x0C),(0x0D),(0x0E),(0x0F);
CREATE TEMPORARY TABLE bytes (a BINARY(1), KEY(a)) ENGINE=MyISAM;
INSERT INTO bytes SELECT CHAR((t1.a << 4) | t2.a USING BINARY) FROM halfs t1, halfs t2;
DROP TABLE halfs;

View file

@ -0,0 +1,21 @@
#
# Create a table with all 2 byte sequence values
#
--source include/bytes.inc
CREATE TABLE halfs (a BINARY(1));
# "bytes" is a temporary table, hence is not allowed in joins
# Create a non-temporary copy.
INSERT INTO halfs SELECT * FROM bytes;
CREATE TEMPORARY TABLE bytes2 (
a BINARY(2),
hi BINARY(1),
lo BINARY(1),
KEY(a),
KEY(lo)
) ENGINE=MyISAM;
INSERT INTO bytes2
SELECT CONCAT(t1.a, t2.a), t1.a, t2.a FROM halfs t1, halfs t2
ORDER BY t1.a, t2.a;
DROP TABLE halfs;

View file

@ -0,0 +1,327 @@
--echo # Start of ctype_unescape.inc
#
# Testing how string literals with backslash and quote-quote are unescaped.
# The tests assume that single quote (') is used as a delimiter.
#
#
# Make sure that the parser really works using the character set we need.
# We use binary strings to compose strings, to be able to test get malformed
# sequences, which are possible as a result of mysql_real_escape_string().
# The important thing for this test is to make the parser unescape using
# the client character set, rather than binary. Currently it works exactly
# that way by default, so the query below should return @@character_set_client
#
SET @query=_binary'SELECT CHARSET(\'test\'),@@character_set_client,@@character_set_connection';
PREPARE stmt FROM @query;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
let $CHARSET=`SELECT @@character_set_connection`;
CREATE TABLE allbytes (a VARBINARY(10));
#
# Create various byte sequences to test. Testing the full banch of
# possible combinations takes about 2 minutes. So this test provides
# variants to run with:
# - the full set of possible combinations
# - a reduced test of combinations for selected bytes only
#
# Create selected byte combinations
if ($ctype_unescape_combinations == 'selected')
{
--echo # Using selected bytes combinations
--source include/bytes.inc
#
# Populate "selected_bytes" with bytes that have a special meaning.
# We'll use "selected_bytes" to generate byte seqeunces,
# instead of the full possible byte combinations, to reduce test time.
#
CREATE TABLE selected_bytes (a VARBINARY(10));
# Bytes that have a special meaning in all character sets:
# 0x00 - mysql_real_escape_string() quotes this to '\0'
# 0x0D - mysql_real_escape_string() quotes this to '\r'
# 0x0A - mysql_real_escape_string() quotes this to '\n'
# 0x1A - mysql_real_escape_string() quotes this to '\Z'
# 0x08 - mysql_real_escape_string() does not quote this,
# but '\b' is unescaped to 0x08.
# 0x09 - mysql_real_escape_string() does not quote this,
# but '\t' is unescaped to 0x09.
# 0x30 - '0', as in '\0'
# 0x5A - 'Z', as in '\Z'
# 0x62 - 'b', as in '\b'
# 0x6E - 'n', as in '\n'
# 0x72 - 't', as in '\r'
# 0x74 - 't', as in '\t'
INSERT INTO selected_bytes (a) VALUES ('\0'),('\b'),('\t'),('\r'),('\n'),('\Z');
INSERT INTO selected_bytes (a) VALUES ('0'),('b'),('t'),('r'),('n'),('Z');
# 0x22 - double quote
# 0x25 - percent sign, '\%' is preserved as is for LIKE.
# 0x27 - single quote
# 0x5C - backslash
# 0x5F - underscore, '\_' is preserved as is for LIKE.
INSERT INTO selected_bytes (a) VALUES ('\\'),('_'),('%'),(0x22),(0x27);
# Some bytes do not have any special meaning, for example basic Latin letters.
# Let's add, one should be enough for a good enough coverage.
INSERT INTO selected_bytes (a) VALUES ('a');
#
# This maps summarizes bytes that have a special
# meaning in various character sets:
#
# MBHEAD MBTAIL NONASCII-8BIT BAD
# ------ ------ -------------- ----------
# big5: [A1..F9] [40..7E,A1..FE] N/A [80..A0,FA..FF]
# cp932: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
# gbk: [81..FE] [40..7E,80..FE] N/A [FF]
# sjis: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
# swe7: N/A N/A [5B..5E,7B..7E] [80..FF]
#
INSERT INTO selected_bytes (a) VALUES
(0x3F), # 7bit
(0x40), # 7bit mbtail
(0x7E), # 7bit mbtail nonascii-8bit
(0x7F), # 7bit nonascii-8bit
(0x80), # mbtail bad-mb
(0x81), # mbhead mbtail
(0x9F), # mbhead mbtail bad-mb
(0xA0), # mbhead mbtail bad-mb
(0xA1), # mbhead mbtail nonascii-8bit
(0xE0), # mbhead mbtai
(0xEF), # mbhead mbtail
(0xF9), # mbhead mbtail
(0xFA), # mbhead mbtail bad-mb
(0xFC), # mbhead mbtail bad-mb
(0xFD), # mbhead mbtail bad-mb
(0xFE), # mbhead mbtial bad-mb
(0xFF); # bad-mb
#
# Now populate the test table
#
# Use all single bytes, this is cheap, there are only 256 values.
INSERT INTO allbytes (a) SELECT a FROM bytes;
# Add selected bytes combinations
INSERT INTO allbytes (a) SELECT CONCAT(t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,0x5C,t2.a) FROM selected_bytes t1,selected_bytes t2;
DROP TABLE selected_bytes;
# Delete all non-single byte sequences that do not have
# backslashes or quotes at all. There is nothing special with these strings.
DELETE FROM allbytes WHERE
OCTET_LENGTH(a)>1 AND
LOCATE(0x5C,a)=0 AND
a NOT LIKE '%\'%' AND
a NOT LIKE '%"%';
}
if ($ctype_unescape_combinations=='')
{
--echo # Using full byte combinations
--source include/bytes2.inc
INSERT INTO allbytes (a) SELECT a FROM bytes;
INSERT INTO allbytes (a) SELECT CONCAT(hi,lo) FROM bytes2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,lo) FROM bytes2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,0x5C,lo) FROM bytes2;
}
DELIMITER //;
#
# A procedure that make an SQL query using 'val' as a string literal.
# The result of the query execution is written into the table 't1'.
# NULL in t1.b means that query failed due to syntax error,
# typically because of mis-interpreted closing quote delimiter.
#
CREATE PROCEDURE p1(val VARBINARY(10))
BEGIN
DECLARE EXIT HANDLER FOR SQLSTATE '42000' INSERT INTO t1 (a,b) VALUES(val,NULL);
SET @query=CONCAT(_binary"INSERT INTO t1 (a,b) VALUES (0x",HEX(val),",'",val,"')");
PREPARE stmt FROM @query;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
END//
#
# A procedure that iterates through all records in "allbytes".
# And runs p1() for every record.
#
CREATE PROCEDURE p2()
BEGIN
DECLARE val VARBINARY(10);
DECLARE done INT DEFAULT FALSE;
DECLARE stmt CURSOR FOR SELECT a FROM allbytes;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=TRUE;
OPEN stmt;
read_loop1: LOOP
FETCH stmt INTO val;
IF done THEN
LEAVE read_loop1;
END IF;
CALL p1(val);
END LOOP;
CLOSE stmt;
END//
# A function that converts the value from binary to $CHARSET
# and check if it has changed. CONVERT() fixes malformed strings.
# So if the string changes in CONVERT(), it means it was not wellformed.
--eval CREATE FUNCTION iswellformed(a VARBINARY(256)) RETURNS INT RETURN a=BINARY CONVERT(a USING $CHARSET);
#
# A function that approximately reproduces how the SQL parser
# would unescape a binary string.
#
CREATE FUNCTION unescape(a VARBINARY(256)) RETURNS VARBINARY(256)
BEGIN
# We need to do it in a way to avoid producing new escape sequences
# First, enclose all known escsape sequences to '{{xx}}'
# - Backslash not followed by a LIKE pattern characters _ and %
# - Double escapes
# This uses PCRE Branch Reset Groups: (?|(alt1)|(alt2)|(alt3)).
# So '\\1' in the last argument always means the match, no matter
# which alternative it came from.
SET a=REGEXP_REPLACE(a,'(?|(\\\\[^_%])|(\\x{27}\\x{27}))','{{\\1}}');
# Now unescape all enclosed standard escape sequences
SET a=REPLACE(a,'{{\\0}}', '\0');
SET a=REPLACE(a,'{{\\b}}', '\b');
SET a=REPLACE(a,'{{\\t}}', '\t');
SET a=REPLACE(a,'{{\\r}}', '\r');
SET a=REPLACE(a,'{{\\n}}', '\n');
SET a=REPLACE(a,'{{\\Z}}', '\Z');
SET a=REPLACE(a,'{{\\\'}}', '\'');
# Unescape double quotes
SET a=REPLACE(a,'{{\'\'}}', '\'');
# Unescape the rest: all other \x sequences mean just 'x'
SET a=REGEXP_REPLACE(a, '{{\\\\(.|\\R)}}', '\\1');
RETURN a;
END//
#
# A function that checks what happened during unescaping.
#
# @param a - the value before unescaping
# @param b - the value after unescaping
#
# The following return values are possible:
# - SyntErr - b IS NULL, which means syntax error happened in p1().
# - Preserv - the value was not modified during unescaping.
# This is possible if 0x5C was treated as mbtail.
# Or only LIKE escape sequences were found: '\_' and '\%'.
# - Trivial - only 0x5C were removed.
# - Regular - the value was unescaped like a binary string.
# Some standard escape sequences were found.
# No special multi-byte handling happened.
# - Special - Something else happened. Should not happen.
#
CREATE FUNCTION unescape_type(a VARBINARY(256),b VARBINARY(256)) RETURNS VARBINARY(256)
BEGIN
RETURN CASE
WHEN b IS NULL THEN '[SyntErr]'
WHEN a=b THEN CASE
WHEN OCTET_LENGTH(a)=1 THEN '[Preserve]'
WHEN a RLIKE '\\\\[_%]' THEN '[Preserve][LIKE]'
WHEN a RLIKE '^[[:ascii:]]+$' THEN '[Preserve][ASCII]'
ELSE '[Preserv][MB]' END
WHEN REPLACE(a,0x5C,'')=b THEN '[Trivial]'
WHEN UNESCAPE(a)=b THEN '[Regular]'
ELSE '[Special]' END;
END//
#
# Check what happened with wellformedness during unescaping
# @param a - the value before unescaping
# @param b - the value after unescaping
#
# Returned values:
# [FIXED] - the value was malformed and become wellformed after unescaping
# [BROKE] - the value was wellformed and become malformed after unescaping
# [ILSEQ] - both values (before unescaping and after unescaping) are malformed
# '' - both values are wellformed
#
CREATE FUNCTION wellformedness(a VARBINARY(256), b VARBINARY(256))
RETURNS VARBINARY(256)
BEGIN
RETURN CASE
WHEN b IS NULL THEN ''
WHEN NOT iswellformed(a) AND iswellformed(b) THEN '[FIXED]'
WHEN iswellformed(a) AND NOT iswellformed(b) THEN '[BROKE]'
WHEN NOT iswellformed(a) AND NOT iswellformed(b) THEN '[ILSEQ]'
ELSE ''
END;
END//
#
# Check if the value could be generated by mysql_real_escape_string(),
# or can only come from a direct user input.
#
# @param a - the value before unescaping
#
# Returns:
# [USER] - if the value could not be generated by mysql_real_escape_string()
# '' - if the value was possibly generated by mysql_real_escape_string()
#
#
CREATE FUNCTION mysql_real_escape_string_generated(a VARBINARY(256))
RETURNS VARBINARY(256)
BEGIN
DECLARE a1 BINARY(1) DEFAULT SUBSTR(a,1,1);
DECLARE a2 BINARY(1) DEFAULT SUBSTR(a,2,1);
DECLARE a3 BINARY(1) DEFAULT SUBSTR(a,3,1);
DECLARE a4 BINARY(1) DEFAULT SUBSTR(a,4,1);
DECLARE a2a4 BINARY(2) DEFAULT CONCAT(a2,a4);
RETURN CASE
WHEN (a1=0x5C) AND
(a3=0x5C) AND
(a2>0x7F) AND
(a4 NOT IN ('_','%','0','t','r','n','Z')) AND
iswellformed(a2a4) THEN '[USER]'
ELSE ''
END;
END//
DELIMITER ;//
CREATE TABLE t1 (a VARBINARY(10),b VARBINARY(10));
CALL p2();
# Avoid "Invalid XXX character string" warnings
# We mark malformed strings in the output anyway
--disable_warnings
# All records marked with '[BAD]' mean that the string was unescaped
# in a unexpected way, that means there is a bug in UNESCAPE() above.
SELECT HEX(a),HEX(b),
CONCAT(unescape_type(a,b),
wellformedness(a,b),
mysql_real_escape_string_generated(a),
IF(UNESCAPE(a)<>b,CONCAT('[BAD',HEX(UNESCAPE(a)),']'),'')) AS comment
FROM t1 ORDER BY LENGTH(a),a;
--enable_warnings
DROP TABLE t1;
DROP PROCEDURE p1;
DROP PROCEDURE p2;
DROP FUNCTION unescape;
DROP FUNCTION unescape_type;
DROP FUNCTION wellformedness;
DROP FUNCTION mysql_real_escape_string_generated;
DROP FUNCTION iswellformed;
DROP TABLE allbytes;
--echo # End of ctype_backslash.inc

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -169,3 +169,14 @@ set collation_connection=big5_bin;
--echo #
--echo # End of 5.6 tests
--echo #
--echo #
--echo # Start of 10.0 tests
--echo #
let $ctype_unescape_combinations=selected;
--source include/ctype_unescape.inc
--echo #
--echo # End of 10.0 tests
--echo #

View file

@ -149,3 +149,14 @@ set collation_connection=gb2312_bin;
--echo #
--echo # End of 5.6 tests
--echo #
--echo #
--echo # Start of 10.0 tests
--echo #
let $ctype_unescape_combinations=selected;
--source include/ctype_unescape.inc
--echo #
--echo # End of 10.0 tests
--echo #

View file

@ -187,3 +187,14 @@ set collation_connection=gbk_bin;
--echo #
--echo # End of 5.6 tests
--echo #
--echo #
--echo # Start of 10.0 tests
--echo #
let $ctype_unescape_combinations=selected;
--source include/ctype_unescape.inc
--echo #
--echo # End of 10.0 tests
--echo #

View file

@ -202,3 +202,14 @@ set names latin1;
--echo #
--echo # End of 5.6 tests
--echo #
--echo #
--echo # Start of 10.0 tests
--echo #
let $ctype_unescape_combinations=selected;
--source include/ctype_unescape.inc
--echo #
--echo # End of 10.0 tests
--echo #

View file

@ -215,3 +215,14 @@ set collation_connection=sjis_bin;
--echo #
--echo # End of 5.6 tests
--echo #
--echo #
--echo # Start of 10.0 tests
--echo #
let $ctype_unescape_combinations=selected;
--source include/ctype_unescape.inc
--echo #
--echo # End of 10.0 tests
--echo #

View file

@ -0,0 +1,10 @@
--echo #
--echo # Start of 10.0 tests
--echo #
let $ctype_unescape_combinations=selected;
--source include/ctype_unescape.inc
--echo #
--echo # End of 10.0 tests
--echo #

View file

@ -1738,6 +1738,9 @@ DEALLOCATE PREPARE stmt;
DROP TABLE t1;
let $ctype_unescape_combinations=selected;
--source include/ctype_unescape.inc
--echo #
--echo # End of 10.0 tests
--echo #