Fixed that multibyte charsets didn't honor multibyte

sequence boundaries in functions LIKE and LOCATE in
the case of "binary" collation. Comparison was done
like if the strings were just a binary strings without
character set assumption.
This commit is contained in:
unknown 2003-09-19 15:18:19 +05:00
commit 44bffa0b05
21 changed files with 437 additions and 66 deletions

View file

@ -115,12 +115,17 @@ typedef struct my_collation_handler_st
int (*strcasecmp)(struct charset_info_st *, const char *, const char *);
int (*instr)(struct charset_info_st *,
const char *big, uint b_length,
const char *small, uint s_length);
/* Hash calculation */
void (*hash_sort)(struct charset_info_st *cs, const uchar *key, uint len,
ulong *nr1, ulong *nr2);
} MY_COLLATION_HANDLER;
extern MY_COLLATION_HANDLER my_collation_bin_handler;
extern MY_COLLATION_HANDLER my_collation_mb_bin_handler;
extern MY_COLLATION_HANDLER my_collation_8bit_bin_handler;
extern MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler;
@ -243,6 +248,10 @@ extern void my_hash_sort_simple(CHARSET_INFO *cs,
extern uint my_lengthsp_8bit(CHARSET_INFO *cs, const char *ptr, uint length);
extern int my_instr_simple(struct charset_info_st *,
const char *big, uint b_length,
const char *small, uint s_length);
/* Functions for 8bit */
extern void my_caseup_str_8bit(CHARSET_INFO *, char *);
@ -307,6 +316,9 @@ int my_wildcmp_mb(CHARSET_INFO *,
int escape, int w_one, int w_many);
uint my_numchars_mb(CHARSET_INFO *, const char *b, const char *e);
uint my_charpos_mb(CHARSET_INFO *, const char *b, const char *e, uint pos);
int my_instr_mb(struct charset_info_st *,
const char *big, uint b_length,
const char *small, uint s_length);
extern my_bool my_parse_charset_xml(const char *bug, uint len,

View file

@ -1,4 +1,5 @@
drop table if exists t1;
set names ujis;
create table t1 (c text character set ujis);
insert into t1 values (0xa4a2),(0xa4a3);
select hex(left(c,1)) from t1 group by c;
@ -6,3 +7,60 @@ hex(left(c,1))
A4A2
A4A3
drop table t1;
select locate(0xa2a1,0xa1a2a1a3);
locate(0xa2a1,0xa1a2a1a3)
2
select locate(_ujis 0xa2a1,_ujis 0xa1a2a1a3);
locate(_ujis 0xa2a1,_ujis 0xa1a2a1a3)
0
select locate(_ujis 0xa2a1,_ujis 0xa1a2a1a3 collate ujis_bin);
locate(_ujis 0xa2a1,_ujis 0xa1a2a1a3 collate ujis_bin)
0
select locate('he','hello');
locate('he','hello')
1
select locate('he','hello',2);
locate('he','hello',2)
0
select locate('lo','hello',2);
locate('lo','hello',2)
4
select locate('HE','hello');
locate('HE','hello')
1
select locate('HE','hello',2);
locate('HE','hello',2)
0
select locate('LO','hello',2);
locate('LO','hello',2)
4
select locate('HE','hello' collate ujis_bin);
locate('HE','hello' collate ujis_bin)
0
select locate('HE','hello' collate ujis_bin,2);
locate('HE','hello' collate ujis_bin,2)
0
select locate('LO','hello' collate ujis_bin,2);
locate('LO','hello' collate ujis_bin,2)
0
select locate(_ujis 0xa1a3,_ujis 0xa1a2a1a3);
locate(_ujis 0xa1a3,_ujis 0xa1a2a1a3)
2
select 0xa1a2a1a3 like concat(_binary'%',0xa2a1,_binary'%');
0xa1a2a1a3 like concat(_binary'%',0xa2a1,_binary'%')
1
select _ujis 0xa1a2a1a3 like concat(_ujis'%',_ujis 0xa2a1, _ujis'%');
_ujis 0xa1a2a1a3 like concat(_ujis'%',_ujis 0xa2a1, _ujis'%')
0
select _ujis 0xa1a2a1a3 like concat(_ujis'%',_ujis 0xa2a1, _ujis'%') collate ujis_bin;
_ujis 0xa1a2a1a3 like concat(_ujis'%',_ujis 0xa2a1, _ujis'%') collate ujis_bin
0
select 'a' like 'a';
'a' like 'a'
1
select 'A' like 'a';
'A' like 'a'
1
select 'A' like 'a' collate ujis_bin;
'A' like 'a' collate ujis_bin
0

View file

@ -1,2 +1,2 @@
Collation Charset Id Default Compiled Sortlen
ujis_japanese_ci ujis 12 Yes Yes 0
ujis_japanese_ci ujis 12 Yes Yes 1

View file

@ -7,6 +7,8 @@
drop table if exists t1;
--enable_warnings
set names ujis;
#
# Test problem with LEFT()
#
@ -15,3 +17,27 @@ create table t1 (c text character set ujis);
insert into t1 values (0xa4a2),(0xa4a3);
select hex(left(c,1)) from t1 group by c;
drop table t1;
#
#
#
select locate(0xa2a1,0xa1a2a1a3);
select locate(_ujis 0xa2a1,_ujis 0xa1a2a1a3);
select locate(_ujis 0xa2a1,_ujis 0xa1a2a1a3 collate ujis_bin);
select locate('he','hello');
select locate('he','hello',2);
select locate('lo','hello',2);
select locate('HE','hello');
select locate('HE','hello',2);
select locate('LO','hello',2);
select locate('HE','hello' collate ujis_bin);
select locate('HE','hello' collate ujis_bin,2);
select locate('LO','hello' collate ujis_bin,2);
select locate(_ujis 0xa1a3,_ujis 0xa1a2a1a3);
select 0xa1a2a1a3 like concat(_binary'%',0xa2a1,_binary'%');
select _ujis 0xa1a2a1a3 like concat(_ujis'%',_ujis 0xa2a1, _ujis'%');
select _ujis 0xa1a2a1a3 like concat(_ujis'%',_ujis 0xa2a1, _ujis'%') collate ujis_bin;
select 'a' like 'a';
select 'A' like 'a';
select 'A' like 'a' collate ujis_bin;

View file

@ -119,7 +119,7 @@ static void simple_cs_init_functions(CHARSET_INFO *cs)
if (cs->state & MY_CS_BINSORT)
{
cs->coll= &my_collation_bin_handler;
cs->coll= &my_collation_8bit_bin_handler;
}
else
{

View file

@ -1153,7 +1153,6 @@ longlong Item_func_locate::val_int()
{
String *a=args[0]->val_str(&value1);
String *b=args[1]->val_str(&value2);
bool binary_cmp= (cmp_collation.collation->state & MY_CS_BINSORT) ? 1 : 0;
if (!a || !b)
{
null_value=1;
@ -1161,55 +1160,26 @@ longlong Item_func_locate::val_int()
}
null_value=0;
uint start=0;
#ifdef USE_MB
uint start0=0;
#endif
int ind;
if (arg_count == 3)
{
start=(uint) args[2]->val_int()-1;
#ifdef USE_MB
if (use_mb(cmp_collation.collation))
{
start0=start;
if (!binary_cmp)
start=a->charpos(start);
}
#endif
start0= start =(uint) args[2]->val_int()-1;
start=a->charpos(start);
if (start > a->length() || start+b->length() > a->length())
return 0;
}
if (!b->length()) // Found empty string at start
return (longlong) (start+1);
#ifdef USE_MB
if (use_mb(cmp_collation.collation) && !binary_cmp)
{
const char *ptr=a->ptr()+start;
const char *search=b->ptr();
const char *strend = ptr+a->length();
const char *end=strend-b->length()+1;
const char *search_end=search+b->length();
register uint32 l;
while (ptr < end)
{
if (*ptr == *search)
{
register char *i,*j;
i=(char*) ptr+1; j=(char*) search+1;
while (j != search_end)
if (*i++ != *j++) goto skipp;
return (longlong) start0+1;
}
skipp:
if ((l=my_ismbchar(cmp_collation.collation,ptr,strend)))
ptr+=l;
else ++ptr;
++start0;
}
return 0;
}
#endif /* USE_MB */
return (longlong) (binary_cmp ? a->strstr(*b,start) :
(a->strstr_case(*b,start)))+1;
ind= cmp_collation.collation->coll->instr(cmp_collation.collation,
a->ptr()+start, a->length()-start,
b->ptr(), b->length());
return (longlong) (ind >= 0 ? ind + start0 + 1 : ind + 1);
}

View file

@ -6234,6 +6234,7 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
my_like_range_big5,
my_wildcmp_mb,
my_strcasecmp_mb,
my_instr_mb,
my_hash_sort_simple
};
@ -6305,7 +6306,7 @@ CHARSET_INFO my_charset_big5_bin=
2, /* mbmaxlen */
0,
&my_charset_big5_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};

View file

@ -262,8 +262,46 @@ static int my_strnxfrm_bin(CHARSET_INFO *cs __attribute__((unused)),
return len;
}
static
int my_instr_bin(CHARSET_INFO *cs __attribute__((unused)),
const char *big, uint b_length,
const char *small, uint s_length)
{
register const uchar *str, *search, *end, *search_end;
if (s_length <= b_length)
{
if (!s_length)
return 0; // Empty string is always found
str= (const uchar*) big;
search= (const uchar*) small;
end= (const uchar*) big+b_length-s_length+1;
search_end= (const uchar*) small + s_length;
skipp:
while (str != end)
{
if ( (*str++) == (*search))
{
register const uchar *i,*j;
i= str;
j= search+1;
while (j != search_end)
if ((*i++) != (*j++))
goto skipp;
return (int) (str- (const uchar*)big) -1;
}
}
}
return -1;
}
MY_COLLATION_HANDLER my_collation_bin_handler =
MY_COLLATION_HANDLER my_collation_8bit_bin_handler =
{
my_strnncoll_binary,
my_strnncollsp_binary,
@ -271,6 +309,7 @@ MY_COLLATION_HANDLER my_collation_bin_handler =
my_like_range_simple,
my_wildcmp_bin,
my_strcasecmp_bin,
my_instr_bin,
my_hash_sort_bin
};
@ -317,5 +356,5 @@ CHARSET_INFO my_charset_bin =
1, /* mbmaxlen */
(char) 255, /* max_sort_char */
&my_charset_handler,
&my_collation_bin_handler
&my_collation_8bit_bin_handler
};

View file

@ -612,6 +612,7 @@ static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler =
my_like_range_czech,
my_wildcmp_8bit,
my_strcasecmp_8bit,
my_instr_simple,
my_hash_sort_simple,
};

View file

@ -8637,12 +8637,13 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
static MY_COLLATION_HANDLER my_collation_ci_handler =
{
my_strnncoll_simple,/* strnncoll */
my_strnncoll_simple, /* strnncoll */
my_strnncollsp_simple,
my_strnxfrm_simple, /* strnxfrm */
my_like_range_simple,/* like_range */
my_like_range_simple, /* like_range */
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_mb,
my_instr_mb,
my_hash_sort_simple,
};
@ -8714,7 +8715,7 @@ CHARSET_INFO my_charset_euckr_bin=
2, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};
#endif

View file

@ -5687,12 +5687,13 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
static MY_COLLATION_HANDLER my_collation_ci_handler =
{
my_strnncoll_simple,/* strnncoll */
my_strnncoll_simple, /* strnncoll */
my_strnncollsp_simple,
my_strnxfrm_simple, /* strnxfrm */
my_like_range_simple,/* like_range */
my_like_range_simple, /* like_range */
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_mb,
my_strcasecmp_mb, /* instr */
my_instr_mb,
my_hash_sort_simple,
};
@ -5763,7 +5764,7 @@ CHARSET_INFO my_charset_gb2312_bin=
2, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};
#endif

View file

@ -9890,6 +9890,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_like_range_gbk,
my_wildcmp_mb,
my_strcasecmp_mb,
my_instr_mb,
my_hash_sort_simple,
};
@ -9960,7 +9961,7 @@ CHARSET_INFO my_charset_gbk_bin=
2, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};

View file

@ -390,6 +390,7 @@ static MY_COLLATION_HANDLER my_collation_german2_ci_handler=
my_like_range_simple,
my_wildcmp_8bit,
my_strcasecmp_8bit,
my_instr_simple,
my_hash_sort_simple
};
@ -435,6 +436,6 @@ CHARSET_INFO my_charset_latin1_bin=
1, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_8bit_bin_handler
};

View file

@ -126,11 +126,7 @@ int my_strcasecmp_mb(CHARSET_INFO * cs,const char *s, const char *t)
#define INC_PTR(cs,A,B) A+=((use_mb_flag && \
my_ismbchar(cs,A,B)) ? my_ismbchar(cs,A,B) : 1)
#ifdef LIKE_CMP_TOUPPER
#define likeconv(s,A) (uchar) my_toupper(s,A)
#else
#define likeconv(s,A) (uchar) (s)->sort_order[(uchar) (A)]
#endif
int my_wildcmp_mb(CHARSET_INFO *cs,
const char *str,const char *str_end,
@ -278,5 +274,224 @@ uint my_charpos_mb(CHARSET_INFO *cs __attribute__((unused)),
return b-b0;
}
int my_instr_mb(CHARSET_INFO *cs,
const char *big, uint b_length,
const char *small, uint s_length)
{
register const char *end;
int res= 0;
if (s_length <= b_length)
{
if (!s_length)
return 0; // Empty string is always found
end= big+b_length-s_length+1;
while (big < end)
{
int mblen;
if (!cs->coll->strnncoll(cs, (unsigned char*) big, s_length,
(unsigned char*) small, s_length))
return res;
mblen= (mblen= my_ismbchar(cs, big, end)) ? mblen : 1;
big+= mblen;
b_length-= mblen;
res++;
}
}
return -1;
}
/* BINARY collations handlers for MB charsets */
static int my_strnncoll_mb_bin(CHARSET_INFO * cs __attribute__((unused)),
const uchar *s, uint slen,
const uchar *t, uint tlen)
{
int cmp= memcmp(s,t,min(slen,tlen));
return cmp ? cmp : (int) (slen - tlen);
}
static int my_strnncollsp_mb_bin(CHARSET_INFO * cs,
const uchar *s, uint slen,
const uchar *t, uint tlen)
{
int len, cmp;
for ( ; slen && my_isspace(cs, s[slen-1]) ; slen--);
for ( ; tlen && my_isspace(cs, t[tlen-1]) ; tlen--);
len = ( slen > tlen ) ? tlen : slen;
cmp= memcmp(s,t,len);
return cmp ? cmp : (int) (slen - tlen);
}
static int my_strnxfrm_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
uchar * dest, uint len,
const uchar *src,
uint srclen __attribute__((unused)))
{
if (dest != src)
memcpy(dest,src,len= min(len,srclen));
return len;
}
static int my_strcasecmp_mb_bin(CHARSET_INFO * cs __attribute__((unused)),
const char *s, const char *t)
{
return strcmp(s,t);
}
static void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
const uchar *key, uint len,ulong *nr1, ulong *nr2)
{
const uchar *pos = key;
key+= len;
for (; pos < (uchar*) key ; pos++)
{
nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) *
((uint)*pos)) + (nr1[0] << 8);
nr2[0]+=3;
}
}
static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
const char *str,const char *str_end,
const char *wildstr,const char *wildend,
int escape, int w_one, int w_many)
{
int result= -1; /* Not found, using wildcards */
bool use_mb_flag=use_mb(cs);
while (wildstr != wildend)
{
while (*wildstr != w_many && *wildstr != w_one)
{
int l;
if (*wildstr == escape && wildstr+1 != wildend)
wildstr++;
if (use_mb_flag &&
(l = my_ismbchar(cs, wildstr, wildend)))
{
if (str+l > str_end || memcmp(str, wildstr, l) != 0)
return 1;
str += l;
wildstr += l;
}
else
if (str == str_end || *wildstr++ != *str++)
return(1); /* No match */
if (wildstr == wildend)
return (str != str_end); /* Match if both are at end */
result=1; /* Found an anchor char */
}
if (*wildstr == w_one)
{
do
{
if (str == str_end) /* Skip one char if possible */
return (result);
INC_PTR(cs,str,str_end);
} while (++wildstr < wildend && *wildstr == w_one);
if (wildstr == wildend)
break;
}
if (*wildstr == w_many)
{ /* Found w_many */
uchar cmp;
const char* mb = wildstr;
int mblen=0;
wildstr++;
/* Remove any '%' and '_' from the wild search string */
for (; wildstr != wildend ; wildstr++)
{
if (*wildstr == w_many)
continue;
if (*wildstr == w_one)
{
if (str == str_end)
return (-1);
INC_PTR(cs,str,str_end);
continue;
}
break; /* Not a wild character */
}
if (wildstr == wildend)
return(0); /* Ok if w_many is last */
if (str == str_end)
return -1;
if ((cmp= *wildstr) == escape && wildstr+1 != wildend)
cmp= *++wildstr;
mb=wildstr;
LINT_INIT(mblen);
if (use_mb_flag)
mblen = my_ismbchar(cs, wildstr, wildend);
INC_PTR(cs,wildstr,wildend); /* This is compared trough cmp */
do
{
if (use_mb_flag)
{
for (;;)
{
if (str >= str_end)
return -1;
if (mblen)
{
if (str+mblen <= str_end && memcmp(str, mb, mblen) == 0)
{
str += mblen;
break;
}
}
else if (!my_ismbchar(cs, str, str_end) && *str == cmp)
{
str++;
break;
}
INC_PTR(cs,str, str_end);
}
}
else
{
while (str != str_end && *str != cmp)
str++;
if (str++ == str_end) return (-1);
}
{
int tmp=my_wildcmp_mb(cs,str,str_end,wildstr,wildend,escape,w_one,w_many);
if (tmp <= 0)
return (tmp);
}
} while (str != str_end && wildstr[0] != w_many);
return(-1);
}
}
return (str != str_end ? 1 : 0);
}
MY_COLLATION_HANDLER my_collation_mb_bin_handler =
{
my_strnncoll_mb_bin,
my_strnncollsp_mb_bin,
my_strnxfrm_mb_bin,
my_like_range_simple,
my_wildcmp_mb_bin,
my_strcasecmp_mb_bin,
my_instr_mb,
my_hash_sort_mb_bin
};
#endif

View file

@ -1030,6 +1030,44 @@ uint my_lengthsp_8bit(CHARSET_INFO *cs __attribute__((unused)),
}
int my_instr_simple(CHARSET_INFO *cs,
const char *big, uint b_length,
const char *small, uint s_length)
{
register const uchar *str, *search, *end, *search_end;
if (s_length <= b_length)
{
if (!s_length)
return 0; // Empty string is always found
str= (const uchar*) big;
search= (const uchar*) small;
end= (const uchar*) big+b_length-s_length+1;
search_end= (const uchar*) small + s_length;
skipp:
while (str != end)
{
if (cs->sort_order[*str++] == cs->sort_order[*search])
{
register const uchar *i,*j;
i= str;
j= search+1;
while (j != search_end)
if (cs->sort_order[*i++] != cs->sort_order[*j++])
goto skipp;
return (int) (str- (const uchar*)big) -1;
}
}
}
return -1;
}
MY_CHARSET_HANDLER my_charset_8bit_handler=
{
NULL, /* ismbchar */
@ -1063,5 +1101,6 @@ MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler =
my_like_range_simple,
my_wildcmp_8bit,
my_strcasecmp_8bit,
my_instr_simple,
my_hash_sort_simple
};

View file

@ -4477,6 +4477,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_like_range_sjis,
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_8bit,
my_instr_mb,
my_hash_sort_simple,
};
@ -4547,7 +4548,7 @@ CHARSET_INFO my_charset_sjis_bin=
2, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};
#endif

View file

@ -710,6 +710,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_like_range_tis620,
my_wildcmp_8bit, /* wildcmp */
my_strcasecmp_8bit,
NULL,
my_hash_sort_simple,
};
@ -781,7 +782,7 @@ CHARSET_INFO my_charset_tis620_bin=
1, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_8bit_bin_handler
};

View file

@ -1028,6 +1028,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_like_range_simple,
my_wildcmp_mb,
my_strcasecmp_ucs2,
my_instr_mb,
my_hash_sort_ucs2
};
@ -1100,7 +1101,7 @@ CHARSET_INFO my_charset_ucs2_bin=
2, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};

View file

@ -8434,6 +8434,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_like_range_simple,/* like_range */
my_wildcmp_mb, /* wildcmp */
my_strcasecmp_mb,
my_instr_mb,
my_hash_sort_simple,
};
@ -8504,7 +8505,7 @@ CHARSET_INFO my_charset_ujis_bin=
3, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};

View file

@ -1959,6 +1959,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_like_range_simple,
my_wildcmp_mb,
my_strcasecmp_utf8,
my_instr_mb,
my_hash_sort_utf8
};
@ -2031,7 +2032,7 @@ CHARSET_INFO my_charset_utf8_bin=
3, /* mbmaxlen */
0,
&my_charset_handler,
&my_collation_bin_handler
&my_collation_mb_bin_handler
};

View file

@ -652,6 +652,7 @@ static MY_COLLATION_HANDLER my_collation_czech_ci_handler =
my_like_range_win1250ch,
my_wildcmp_8bit,
my_strcasecmp_8bit,
my_instr_simple,
my_hash_sort_simple
};