Bug#57737 Character sets: search fails with like, contraction, index

Problem: LIKE over an indexed column optimized away good results,
because my_like_range_utf32/utf16 returned wrong ranges for contractions.
Contraction related code was missing in my_like_range_utf32/utf16,
but did exist in my_like_range_ucs2/utf8.
It was forgotten in utf32/utf16 versions (during mysql-6.0 push/revert mess).

Fix:
The patch removes individual functions my_like_range_ucs2,
my_like_range_utf16, my_like_range_utf32 and introduces a single function
my_like_range_generic() instead. The new function handles contractions
correctly. It can handle any character set with cs->min_sort_char and
cs->max_sort_char represented in Unicode code points.

added:
  @ mysql-test/include/ctype_czech.inc
  @ mysql-test/include/ctype_like_ignorable.inc
  @ mysql-test/r/ctype_like_range.result
  @ mysql-test/t/ctype_like_range.test
  Adding tests


modified:

  @ include/m_ctype.h
  - Adding helper functions for contractions.
  - Prototypes: removing ucs2,utf16,utf32 functions, adding generic function.
  @ mysql-test/r/ctype_uca.result
  @ mysql-test/r/ctype_utf16_uca.result
  @ mysql-test/r/ctype_utf32_uca.result
  @ mysql-test/t/ctype_uca.test
  @ mysql-test/t/ctype_utf16_uca.test
  @ mysql-test/t/ctype_utf32_uca.test
  - Adding tests.

  @ strings/ctype-mb.c
  - Pad function did not put the last character.
  - Implementing my_like_range_generic() - an universal replacement
    for three separate functions
    my_like_range_ucs2(), my_like_range_utf16() and my_like_range_utf32(),
    with correct contraction handling.

  @ strings/ctype-ucs2.c
  - my_fill_mb2 did not put the high byte, as previously
    it was used to put only characters in ASCII range.
    Now it puts high byte as well
    (needed to pupulate cs->max_sort_char correctly).
  - Adding DBUG_ASSERT()
  - Removing character set specific functions:
    my_like_range_ucs2(), my_like_range_utf16() and my_like_range_utf32().
  - Using my_like_range_generic() instead of the old functions.

  @ strings/ctype-uca.c
  - Using generic function instead of the old character set specific ones.

  @ sql/item_create.cc
  @ sql/item_strfunc.cc
  @ sql/item_strfunc.h
  - Adding SQL functions LIKE_RANGE_MIN and LIKE_RANGE_MAX,
    available only in debug build to make sure like_range()
    works correctly for all character sets and collations.
This commit is contained in:
Alexander Barkov 2010-11-26 13:44:39 +03:00
commit e3dee8a7fd
17 changed files with 3001 additions and 344 deletions

View file

@ -636,7 +636,7 @@ static void pad_max_char(CHARSET_INFO *cs, char *str, char *end)
DBUG_ASSERT(buflen > 0);
do
{
if ((str + buflen) < end)
if ((str + buflen) <= end)
{
/* Enough space for the characer */
memcpy(str, buf, buflen);
@ -802,6 +802,192 @@ fill_max_and_min:
}
/**
Calculate min_str and max_str that ranges a LIKE string.
Generic function, currently used for ucs2, utf16, utf32,
but should be suitable for any other character sets with
cs->min_sort_char and cs->max_sort_char represented in
Unicode code points.
@param cs Character set and collation pointer
@param ptr Pointer to LIKE pattern.
@param ptr_length Length of LIKE pattern.
@param escape Escape character pattern, typically '\'.
@param w_one 'One character' pattern, typically '_'.
@param w_many 'Many characters' pattern, typically '%'.
@param res_length Length of min_str and max_str.
@param[out] min_str Smallest string that ranges LIKE.
@param[out] max_str Largest string that ranges LIKE.
@param[out] min_len Length of min_str
@param[out] max_len Length of max_str
@return Optimization status.
@retval FALSE if LIKE pattern can be optimized
@rerval TRUE if LIKE can't be optimized.
*/
my_bool
my_like_range_generic(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str,char *max_str,
size_t *min_length,size_t *max_length)
{
const char *end= ptr + ptr_length;
const char *min_org= min_str;
const char *max_org= max_str;
char *min_end= min_str + res_length;
char *max_end= max_str + res_length;
size_t charlen= res_length / cs->mbmaxlen;
size_t res_length_diff;
my_bool have_contractions= my_cs_have_contractions(cs);
for ( ; charlen > 0; charlen--)
{
my_wc_t wc, wc2;
int res;
if ((res= cs->cset->mb_wc(cs, &wc, (uchar*) ptr, (uchar*) end)) <= 0)
{
if (res == MY_CS_ILSEQ) /* Bad sequence */
return TRUE; /* min_length and max_length are not important */
break; /* End of the string */
}
ptr+= res;
if (wc == (my_wc_t) escape)
{
if ((res= cs->cset->mb_wc(cs, &wc, (uchar*) ptr, (uchar*) end)) <= 0)
{
if (res == MY_CS_ILSEQ)
return TRUE; /* min_length and max_length are not important */
/*
End of the string: Escape is the last character.
Put escape as a normal character.
We'll will leave the loop on the next iteration.
*/
}
else
ptr+= res;
/* Put escape character to min_str and max_str */
if ((res= cs->cset->wc_mb(cs, wc,
(uchar*) min_str, (uchar*) min_end)) <= 0)
goto pad_set_lengths; /* No space */
min_str+= res;
if ((res= cs->cset->wc_mb(cs, wc,
(uchar*) max_str, (uchar*) max_end)) <= 0)
goto pad_set_lengths; /* No space */
max_str+= res;
continue;
}
else if (wc == (my_wc_t) w_one)
{
if ((res= cs->cset->wc_mb(cs, cs->min_sort_char,
(uchar*) min_str, (uchar*) min_end)) <= 0)
goto pad_set_lengths;
min_str+= res;
if ((res= cs->cset->wc_mb(cs, cs->max_sort_char,
(uchar*) max_str, (uchar*) max_end)) <= 0)
goto pad_set_lengths;
max_str+= res;
continue;
}
else if (wc == (my_wc_t) w_many)
{
/*
Calculate length of keys:
a\min\min... is the smallest possible string
a\max\max... is the biggest possible string
*/
*min_length= ((cs->state & MY_CS_BINSORT) ?
(size_t) (min_str - min_org) :
res_length);
*max_length= res_length;
goto pad_min_max;
}
if (have_contractions &&
my_cs_can_be_contraction_head(cs, wc) &&
(res= cs->cset->mb_wc(cs, &wc2, (uchar*) ptr, (uchar*) end)) > 0)
{
uint16 *weight;
if ((wc2 == (my_wc_t) w_one || wc2 == (my_wc_t) w_many))
{
/* Contraction head followed by a wildcard */
*min_length= *max_length= res_length;
goto pad_min_max;
}
if (my_cs_can_be_contraction_tail(cs, wc2) &&
(weight= my_cs_contraction2_weight(cs, wc, wc2)) && weight[0])
{
/* Contraction found */
if (charlen == 1)
{
/* contraction does not fit to result */
*min_length= *max_length= res_length;
goto pad_min_max;
}
ptr+= res;
charlen--;
/* Put contraction head */
if ((res= cs->cset->wc_mb(cs, wc,
(uchar*) min_str, (uchar*) min_end)) <= 0)
goto pad_set_lengths;
min_str+= res;
if ((res= cs->cset->wc_mb(cs, wc,
(uchar*) max_str, (uchar*) max_end)) <= 0)
goto pad_set_lengths;
max_str+= res;
wc= wc2; /* Prepare to put contraction tail */
}
}
/* Normal character, or contraction tail */
if ((res= cs->cset->wc_mb(cs, wc,
(uchar*) min_str, (uchar*) min_end)) <= 0)
goto pad_set_lengths;
min_str+= res;
if ((res= cs->cset->wc_mb(cs, wc,
(uchar*) max_str, (uchar*) max_end)) <= 0)
goto pad_set_lengths;
max_str+= res;
}
pad_set_lengths:
*min_length= (size_t) (min_str - min_org);
*max_length= (size_t) (max_str - max_org);
pad_min_max:
/*
Fill up max_str and min_str to res_length.
fill() cannot set incomplete characters and
requires that "length" argument is divisible to mbminlen.
Make sure to call fill() with proper "length" argument.
*/
res_length_diff= res_length % cs->mbminlen;
cs->cset->fill(cs, min_str, min_end - min_str - res_length_diff,
cs->min_sort_char);
cs->cset->fill(cs, max_str, max_end - max_str - res_length_diff,
cs->max_sort_char);
/* In case of incomplete characters set the remainder to 0x00's */
if (res_length_diff)
{
/* Example: odd res_length for ucs2 */
memset(min_end - res_length_diff, 0, res_length_diff);
memset(max_end - res_length_diff, 0, res_length_diff);
}
return FALSE;
}
int
my_wildcmp_mb_bin(CHARSET_INFO *cs,
const char *str,const char *str_end,

View file

@ -8127,7 +8127,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
my_strnncollsp_ucs2_uca,
my_strnxfrm_ucs2_uca,
my_strnxfrmlen_simple,
my_like_range_ucs2,
my_like_range_generic,
my_wildcmp_uca,
NULL,
my_instr_mb,
@ -10134,7 +10134,7 @@ MY_COLLATION_HANDLER my_collation_utf32_uca_handler =
my_strnncollsp_any_uca,
my_strnxfrm_any_uca,
my_strnxfrmlen_simple,
my_like_range_utf32,
my_like_range_generic,
my_wildcmp_uca,
NULL,
my_instr_mb,
@ -10801,7 +10801,7 @@ MY_COLLATION_HANDLER my_collation_utf16_uca_handler =
my_strnncollsp_any_uca,
my_strnxfrm_any_uca,
my_strnxfrmlen_simple,
my_like_range_utf16,
my_like_range_generic,
my_wildcmp_uca,
NULL,
my_instr_mb,

View file

@ -903,7 +903,8 @@ static void
my_fill_mb2(CHARSET_INFO *cs __attribute__((unused)),
char *s, size_t l, int fill)
{
for ( ; l >= 2; s[0]= 0, s[1]= fill, s+= 2, l-= 2);
DBUG_ASSERT(fill <= 0xFFFF);
for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
}
@ -1563,98 +1564,6 @@ my_hash_sort_utf16_bin(CHARSET_INFO *cs __attribute__((unused)),
}
/**
Calculate min_str and max_str that ranges a LIKE string.
@param ptr Pointer to LIKE pattern.
@param ptr_length Length of LIKE pattern.
@param escape Escape character in LIKE. (Normally '\').
All escape characters should be removed
from min_str and max_str.
@param res_length Length of min_str and max_str.
@param min_str Smallest case sensitive string that ranges LIKE.
Should be space padded to res_length.
@param max_str Largest case sensitive string that ranges LIKE.
Normally padded with the biggest character sort value.
@return Optimization status.
@retval FALSE if LIKE pattern can be optimized
@rerval TRUE if LIKE can't be optimized.
*/
my_bool
my_like_range_utf16(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str,char *max_str,
size_t *min_length,size_t *max_length)
{
const char *end=ptr+ptr_length;
char *min_org=min_str;
char *min_end=min_str+res_length;
size_t charlen= res_length / cs->mbmaxlen;
for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
; ptr+=2, charlen--)
{
if (ptr[0] == '\0' && ptr[1] == escape && ptr + 1 < end)
{
ptr+=2; /* Skip escape */
*min_str++= *max_str++ = ptr[0];
*min_str++= *max_str++ = ptr[1];
continue;
}
if (ptr[0] == '\0' && ptr[1] == w_one) /* '_' in SQL */
{
*min_str++= (char) (cs->min_sort_char >> 8);
*min_str++= (char) (cs->min_sort_char & 255);
*max_str++= (char) (cs->max_sort_char >> 8);
*max_str++= (char) (cs->max_sort_char & 255);
continue;
}
if (ptr[0] == '\0' && ptr[1] == w_many) /* '%' in SQL */
{
/*
Calculate length of keys:
'a\0\0... is the smallest possible string when we have space expand
a\ff\ff... is the biggest possible string
*/
*min_length= ((cs->state & MY_CS_BINSORT) ? (size_t) (min_str - min_org) :
res_length);
*max_length= res_length;
do {
*min_str++ = 0;
*min_str++ = 0;
*max_str++ = (char) (cs->max_sort_char >> 8);
*max_str++ = (char) (cs->max_sort_char & 255);
} while (min_str + 1 < min_end);
return FALSE;
}
*min_str++= *max_str++ = ptr[0];
*min_str++= *max_str++ = ptr[1];
}
/* Temporary fix for handling w_one at end of string (key compression) */
{
char *tmp;
for (tmp= min_str ; tmp-1 > min_org && tmp[-1] == '\0' && tmp[-2]=='\0';)
{
*--tmp=' ';
*--tmp='\0';
}
}
*min_length= *max_length = (size_t) (min_str - min_org);
while (min_str + 1 < min_end)
{
*min_str++ = *max_str++ = '\0';
*min_str++ = *max_str++ = ' '; /* Because if key compression */
}
return FALSE;
}
static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
{
NULL, /* init */
@ -1662,7 +1571,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
my_strnncollsp_utf16,
my_strnxfrm_unicode,
my_strnxfrmlen_simple,
my_like_range_utf16,
my_like_range_generic,
my_wildcmp_utf16_ci,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,
@ -1678,7 +1587,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
my_strnncollsp_utf16_bin,
my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin,
my_like_range_utf16,
my_like_range_generic,
my_wildcmp_utf16_bin,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,
@ -2551,113 +2460,6 @@ my_strnncollsp_utf32_bin(CHARSET_INFO *cs __attribute__((unused)),
}
/**
Calculate min_str and max_str that ranges a LIKE string.
@param ptr Pointer to LIKE pattern.
@param ptr_length Length of LIKE pattern.
@param escape Escape character in LIKE. (Normally '\').
All escape characters should be removed
from min_str and max_str.
@param res_length Length of min_str and max_str.
@param min_str Smallest case sensitive string that ranges LIKE.
Should be space padded to res_length.
@param max_str Largest case sensitive string that ranges LIKE.
Normally padded with the biggest character sort value.
@return Optimization status.
@retval FALSE if LIKE pattern can be optimized
@rerval TRUE if LIKE can't be optimized.
*/
my_bool
my_like_range_utf32(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str,char *max_str,
size_t *min_length,size_t *max_length)
{
const char *end= ptr + ptr_length;
char *min_org= min_str;
char *min_end= min_str + res_length;
char *max_end= max_str + res_length;
size_t charlen= res_length / cs->mbmaxlen;
DBUG_ASSERT((res_length % 4) == 0);
for ( ; charlen > 0; ptr+= 4, charlen--)
{
my_wc_t wc;
int res;
if ((res= my_utf32_uni(cs, &wc, (uchar*) ptr, (uchar*) end)) < 0)
{
my_fill_utf32(cs, min_str, min_end - min_str, cs->min_sort_char);
my_fill_utf32(cs, max_str, min_end - min_str, cs->max_sort_char);
/* min_length and max_legnth are not important */
return TRUE;
}
if (wc == (my_wc_t) escape)
{
ptr+= 4; /* Skip escape */
if ((res= my_utf32_uni(cs, &wc, (uchar*) ptr, (uchar*) end)) < 0)
{
my_fill_utf32(cs, min_str, min_end - min_str, cs->min_sort_char);
my_fill_utf32(cs, max_str, max_end - min_str, cs->max_sort_char);
/* min_length and max_length are not important */
return TRUE;
}
if (my_uni_utf32(cs, wc, (uchar*) min_str, (uchar*) min_end) != 4 ||
my_uni_utf32(cs, wc, (uchar*) max_str, (uchar*) max_end) != 4)
goto pad_set_lengths;
*min_str++= 4;
*max_str++= 4;
continue;
}
if (wc == (my_wc_t) w_one)
{
if (my_uni_utf32(cs, cs->min_sort_char, (uchar*) min_str, (uchar*) min_end) != 4 ||
my_uni_utf32(cs, cs->max_sort_char, (uchar*) max_str, (uchar*) max_end) != 4)
goto pad_set_lengths;
min_str+= 4;
max_str+= 4;
continue;
}
if (wc == (my_wc_t) w_many)
{
/*
Calculate length of keys:
'a\0\0... is the smallest possible string when we have space expand
a\ff\ff... is the biggest possible string
*/
*min_length= ((cs->state & MY_CS_BINSORT) ?
(size_t) (min_str - min_org) :
res_length);
*max_length= res_length;
goto pad_min_max;
}
/* Normal character */
if (my_uni_utf32(cs, wc, (uchar*) min_str, (uchar*) min_end) != 4 ||
my_uni_utf32(cs, wc, (uchar*) max_str, (uchar*) max_end) != 4)
goto pad_set_lengths;
min_str+= 4;
max_str+= 4;
}
pad_set_lengths:
*min_length= *max_length= (size_t) (min_str - min_org);
pad_min_max:
my_fill_utf32(cs, min_str, min_end - min_str, cs->min_sort_char);
my_fill_utf32(cs, max_str, max_end - max_str, cs->max_sort_char);
return FALSE;
}
static size_t
my_scan_utf32(CHARSET_INFO *cs,
const char *str, const char *end, int sequence_type)
@ -2689,7 +2491,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
my_strnncollsp_utf32,
my_strnxfrm_unicode,
my_strnxfrmlen_utf32,
my_like_range_utf32,
my_like_range_generic,
my_wildcmp_utf32_ci,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,
@ -2705,7 +2507,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
my_strnncollsp_utf32_bin,
my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin,
my_like_range_utf32,
my_like_range_generic,
my_wildcmp_utf32_bin,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,
@ -3252,120 +3054,6 @@ void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
}
}
/*
** Calculate min_str and max_str that ranges a LIKE string.
** Arguments:
** ptr Pointer to LIKE string.
** ptr_length Length of LIKE string.
** escape Escape character in LIKE. (Normally '\').
** All escape characters should be removed from min_str and max_str
** res_length Length of min_str and max_str.
** min_str Smallest case sensitive string that ranges LIKE.
** Should be space padded to res_length.
** max_str Largest case sensitive string that ranges LIKE.
** Normally padded with the biggest character sort value.
**
** The function should return 0 if ok and 1 if the LIKE string can't be
** optimized !
*/
my_bool my_like_range_ucs2(CHARSET_INFO *cs,
const char *ptr, size_t ptr_length,
pbool escape, pbool w_one, pbool w_many,
size_t res_length,
char *min_str,char *max_str,
size_t *min_length,size_t *max_length)
{
const char *end=ptr+ptr_length;
char *min_org=min_str;
char *min_end=min_str+res_length;
size_t charlen= res_length / cs->mbmaxlen;
const char *contraction_flags= cs->contractions ?
((const char*) cs->contractions) + 0x40*0x40 : NULL;
for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
; ptr+=2, charlen--)
{
if (ptr[0] == '\0' && ptr[1] == escape && ptr + 1 < end)
{
ptr+=2; /* Skip escape */
*min_str++= *max_str++ = ptr[0];
*min_str++= *max_str++ = ptr[1];
continue;
}
if (ptr[0] == '\0' && ptr[1] == w_one) /* '_' in SQL */
{
*min_str++= (char) (cs->min_sort_char >> 8);
*min_str++= (char) (cs->min_sort_char & 255);
*max_str++= (char) (cs->max_sort_char >> 8);
*max_str++= (char) (cs->max_sort_char & 255);
continue;
}
if (ptr[0] == '\0' && ptr[1] == w_many) /* '%' in SQL */
{
fill_max_and_min:
/*
Calculate length of keys:
'a\0\0... is the smallest possible string when we have space expand
a\ff\ff... is the biggest possible string
*/
*min_length= ((cs->state & MY_CS_BINSORT) ? (size_t) (min_str - min_org) :
res_length);
*max_length= res_length;
do {
*min_str++ = 0;
*min_str++ = 0;
*max_str++ = (char) (cs->max_sort_char >> 8);
*max_str++ = (char) (cs->max_sort_char & 255);
} while (min_str + 1 < min_end);
return 0;
}
if (contraction_flags && ptr + 3 < end &&
ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]])
{
/* Contraction head found */
if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many))
{
/* Contraction head followed by a wildcard, quit */
goto fill_max_and_min;
}
/*
Check if the second letter can be contraction part,
and if two letters really produce a contraction.
*/
if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] &&
cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40])
{
/* Contraction found */
if (charlen == 1 || min_str + 2 >= min_end)
{
/* Full contraction doesn't fit, quit */
goto fill_max_and_min;
}
/* Put contraction head */
*min_str++= *max_str++= *ptr++;
*min_str++= *max_str++= *ptr++;
charlen--;
}
}
/* Put contraction tail, or a single character */
*min_str++= *max_str++ = ptr[0];
*min_str++= *max_str++ = ptr[1];
}
*min_length= *max_length = (size_t) (min_str - min_org);
while (min_str + 1 < min_end)
{
*min_str++ = *max_str++ = '\0';
*min_str++ = *max_str++ = ' '; /* Because if key compression */
}
return 0;
}
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
{
@ -3374,7 +3062,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
my_strnncollsp_ucs2,
my_strnxfrm_unicode,
my_strnxfrmlen_simple,
my_like_range_ucs2,
my_like_range_generic,
my_wildcmp_ucs2_ci,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,
@ -3390,7 +3078,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
my_strnncollsp_ucs2_bin,
my_strnxfrm_unicode,
my_strnxfrmlen_simple,
my_like_range_ucs2,
my_like_range_generic,
my_wildcmp_ucs2_bin,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,