MDEV-4425 Regexp enhancements

Do not pass the PCRE_UCP flag for binary data.
As a result, bytes 0x80..0xFF no longer belong to the
generic character classes \d (digit) and \w (word character).

SELECT 0xFF RLIKE '\\w';
 -> 0

Note, this change does not affect non-binary data,
which is still examined with the PCRE_UCP flag by default.
This commit is contained in:
Alexander Barkov 2013-10-08 18:25:17 +04:00
parent 43c09c15ff
commit 1bcd2bebc6
3 changed files with 13 additions and 2 deletions

View file

@ -234,6 +234,12 @@ class ch ch RLIKE class
\p{Tamil} 㐗 0
\p{Tamil} 갷 0
DROP TABLE t1, t2;
SELECT 0xFF RLIKE '\\w';
0xFF RLIKE '\\w'
0
SELECT 0xFF RLIKE '(*UCP)\\w';
0xFF RLIKE '(*UCP)\\w'
1
SELECT '\n' RLIKE '(*CR)';
'\n' RLIKE '(*CR)'
1

View file

@ -46,6 +46,10 @@ INSERT INTO t2 VALUES ('[[:alpha:]]'),('[[:digit:]]');
SELECT class, ch, ch RLIKE class FROM t1, t2 ORDER BY class, BINARY ch;
DROP TABLE t1, t2;
# Checking that UCP is disabled by default for binary data
SELECT 0xFF RLIKE '\\w';
SELECT 0xFF RLIKE '(*UCP)\\w';
# Checking the newline convention prefixes (*CR) and (*LF)
SELECT '\n' RLIKE '(*CR)';
SELECT '\n' RLIKE '(*LF)';

View file

@ -1511,8 +1511,9 @@ public:
{}
void init(CHARSET_INFO *data_charset, int extra_flags, uint nsubpatterns)
{
m_library_flags= PCRE_UCP | extra_flags |
(data_charset != &my_charset_bin ? PCRE_UTF8 : 0) |
m_library_flags= extra_flags |
(data_charset != &my_charset_bin ?
(PCRE_UTF8 | PCRE_UCP) : 0) |
((data_charset->state &
(MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE_CASELESS);