From 1bcd2bebc69163c541baa548f1204ccd1b830d25 Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Tue, 8 Oct 2013 18:25:17 +0400 Subject: [PATCH] MDEV-4425 Regexp enhancements Do not pass PCRE_UCP flag for binary data. As a result, bytes 0x80..0xFF no longer belong to the generic character classes \d (digit) and \w (word character). SELECT 0xFF RLIKE '\\w'; -> 0 Note that this change does not affect non-binary data, which is still examined with the PCRE_UCP flag by default. --- mysql-test/r/func_regexp_pcre.result | 6 ++++++ mysql-test/t/func_regexp_pcre.test | 4 ++++ sql/item_cmpfunc.h | 5 +++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mysql-test/r/func_regexp_pcre.result b/mysql-test/r/func_regexp_pcre.result index 4ae66e24eda..ebbdedc6999 100644 --- a/mysql-test/r/func_regexp_pcre.result +++ b/mysql-test/r/func_regexp_pcre.result @@ -234,6 +234,12 @@ class ch ch RLIKE class \p{Tamil} 㐗 0 \p{Tamil} 갷 0 DROP TABLE t1, t2; +SELECT 0xFF RLIKE '\\w'; +0xFF RLIKE '\\w' +0 +SELECT 0xFF RLIKE '(*UCP)\\w'; +0xFF RLIKE '(*UCP)\\w' +1 SELECT '\n' RLIKE '(*CR)'; '\n' RLIKE '(*CR)' 1 diff --git a/mysql-test/t/func_regexp_pcre.test b/mysql-test/t/func_regexp_pcre.test index 7a4be163001..4892d9931da 100644 --- a/mysql-test/t/func_regexp_pcre.test +++ b/mysql-test/t/func_regexp_pcre.test @@ -46,6 +46,10 @@ INSERT INTO t2 VALUES ('[[:alpha:]]'),('[[:digit:]]'); SELECT class, ch, ch RLIKE class FROM t1, t2 ORDER BY class, BINARY ch; DROP TABLE t1, t2; +# Checking that UCP is disabled by default for binary data +SELECT 0xFF RLIKE '\\w'; +SELECT 0xFF RLIKE '(*UCP)\\w'; + # newline character SELECT '\n' RLIKE '(*CR)'; SELECT '\n' RLIKE '(*LF)'; diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h index e8574a573e7..2d89b8d65db 100644 --- a/sql/item_cmpfunc.h +++ b/sql/item_cmpfunc.h @@ -1511,8 +1511,9 @@ public: {} void init(CHARSET_INFO *data_charset, int extra_flags, uint nsubpatterns) { - m_library_flags= PCRE_UCP | extra_flags | - (data_charset != &my_charset_bin ? 
PCRE_UTF8 : 0) | + m_library_flags= extra_flags | + (data_charset != &my_charset_bin ? + (PCRE_UTF8 | PCRE_UCP) : 0) | ((data_charset->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE_CASELESS);