Simplify caseup() and casedn() in charsets

After the MDEV-13118 fix there's no code in the server that
wants caseup/casedn to change the argument in place for simple
charsets.  Let's remove this logic and always return the result in a
new string for all charsets, both simple and complex.

1. Removing the optimization that *some* character sets used in casedn()
  and caseup(), which allowed (and required) to change the case in-place,
  overwriting the string passed as the "src" argument.
  Now all CHARSET_INFO's work in the same way:
  non of them change the source string in-place, all of them now convert
  case from the source string to the destination string, leaving
  the source string untouched.

2. Adding "const" qualifier to the "char *src" parameter
   to caseup() and casedn().

3. Removing duplicate implementations in ctype-mb.c.
  Now both caseup() and casedn() implementations for all CJK character sets
  use internally the same function my_casefold_mb()
  (the former my_casefold_mb_varlen()).

4. Removing the "unused" attribute from parameters of some my_case{up|dn}_xxx()
   implementations, as the affected parameters are now *used* in the code.
   Previously these parameters were used only in DBUG_ASSERT().
This commit is contained in:
Alexander Barkov 2018-07-19 13:02:14 +04:00
parent ab58493db2
commit e2ac4098ed
9 changed files with 106 additions and 177 deletions

View file

@ -365,7 +365,7 @@ typedef int (*my_charset_conv_mb_wc)(CHARSET_INFO *, my_wc_t *,
typedef int (*my_charset_conv_wc_mb)(CHARSET_INFO *, my_wc_t,
uchar *, uchar *);
typedef size_t (*my_charset_conv_case)(CHARSET_INFO *,
char *, size_t, char *, size_t);
const char *, size_t, char *, size_t);
/* See strings/CHARSET_INFO.txt about information on this structure */
@ -565,9 +565,11 @@ extern uint my_instr_simple(CHARSET_INFO *,
/* Functions for 8bit */
extern size_t my_caseup_str_8bit(CHARSET_INFO *, char *);
extern size_t my_casedn_str_8bit(CHARSET_INFO *, char *);
extern size_t my_caseup_8bit(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_caseup_8bit(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_8bit(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_casedn_8bit(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern int my_strcasecmp_8bit(CHARSET_INFO * cs, const char *, const char *);
@ -658,17 +660,17 @@ uint my_mbcharlen_8bit(CHARSET_INFO *, uint c);
/* Functions for multibyte charsets */
extern size_t my_caseup_str_mb(CHARSET_INFO *, char *);
extern size_t my_casedn_str_mb(CHARSET_INFO *, char *);
extern size_t my_caseup_mb(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_mb(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_caseup_mb_varlen(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_mb_varlen(CHARSET_INFO *, char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_caseup_ujis(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_caseup_mb(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_mb(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_caseup_ujis(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern size_t my_casedn_ujis(CHARSET_INFO *, char *src, size_t srclen,
extern size_t my_casedn_ujis(CHARSET_INFO *,
const char *src, size_t srclen,
char *dst, size_t dstlen);
extern int my_strcasecmp_mb(CHARSET_INFO * cs,const char *, const char *);

View file

@ -1572,19 +1572,10 @@ String *Item_str_conv::val_str(String *str)
str->alloc((alloced_length= res->length() * multiply)))))
return 0;
if (multiply == 1)
{
str->copy(*res); // Should not fail (it was alloced above)
len= converter(collation.collation, (char*) str->ptr(), str->length(),
(char*) str->ptr(), alloced_length);
}
else
{
len= converter(collation.collation, (char*) res->ptr(), res->length(),
(char*) str->ptr(), alloced_length);
str->set_charset(collation.collation);
}
len= converter(collation.collation, (char*) res->ptr(), res->length(),
(char*) str->ptr(), alloced_length);
DBUG_ASSERT(len <= alloced_length);
str->set_charset(collation.collation);
str->length(len);
return str;
}

View file

@ -220,11 +220,11 @@ static size_t my_case_str_bin(CHARSET_INFO *cs __attribute__((unused)),
static size_t my_case_bin(CHARSET_INFO *cs __attribute__((unused)),
char *src __attribute__((unused)),
size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(srclen <= dstlen);
memcpy(dst, src, srclen);
return srclen;
}

View file

@ -9994,8 +9994,8 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb_varlen, /* UPPER() can reduce length: Turkish DOTLESS i -> I */
my_casedn_mb, /* LOWER() does not change length, use simple version*/
my_caseup_mb, /* UPPER() can reduce length: Turkish DOTLESS i -> I */
my_casedn_mb, /* LOWER() does not change length */
my_snprintf_8bit,
my_long10_to_str_8bit,
my_longlong10_to_str_8bit,

View file

@ -71,81 +71,8 @@ get_case_info_for_ch(const CHARSET_INFO *cs, uint page, uint offs)
/*
For character sets which don't change octet length in case conversion.
*/
size_t my_caseup_mb(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
{
register uint32 l;
register char *srcend= src + srclen;
register const uchar *map= cs->to_upper;
DBUG_ASSERT(cs->caseup_multiply == 1);
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(cs->mbmaxlen == 2);
while (src < srcend)
{
if ((l=my_ismbchar(cs, src, srcend)))
{
MY_UNICASE_CHARACTER *ch;
if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1])))
{
*src++= ch->toupper >> 8;
*src++= ch->toupper & 0xFF;
}
else
src+= l;
}
else
{
*src=(char) map[(uchar) *src];
src++;
}
}
return srclen;
}
size_t my_casedn_mb(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
{
register uint32 l;
register char *srcend= src + srclen;
register const uchar *map=cs->to_lower;
DBUG_ASSERT(cs->casedn_multiply == 1);
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(cs->mbmaxlen == 2);
while (src < srcend)
{
if ((l= my_ismbchar(cs, src, srcend)))
{
MY_UNICASE_CHARACTER *ch;
if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1])))
{
*src++= ch->tolower >> 8;
*src++= ch->tolower & 0xFF;
}
else
src+= l;
}
else
{
*src= (char) map[(uchar)*src];
src++;
}
}
return srclen;
}
/*
Case folding functions for character set
where case conversion can change string octet length.
Case folding functions for CJK character set.
Case conversion can optionally reduce string octet length.
For example, in EUCKR,
_euckr 0xA9A5 == "LATIN LETTER DOTLESS I" (Turkish letter)
is upper-cased to to
@ -153,13 +80,14 @@ size_t my_casedn_mb(CHARSET_INFO * cs, char *src, size_t srclen,
Length is reduced in this example from two bytes to one byte.
*/
static size_t
my_casefold_mb_varlen(CHARSET_INFO *cs,
char *src, size_t srclen,
char *dst, size_t dstlen __attribute__((unused)),
const uchar *map,
size_t is_upper)
my_casefold_mb(CHARSET_INFO *cs,
const char *src, size_t srclen,
char *dst, size_t dstlen __attribute__((unused)),
const uchar *map,
size_t is_upper)
{
char *srcend= src + srclen, *dst0= dst;
const char *srcend= src + srclen;
char *dst0= dst;
DBUG_ASSERT(cs->mbmaxlen == 2);
@ -193,22 +121,22 @@ my_casefold_mb_varlen(CHARSET_INFO *cs,
size_t
my_casedn_mb_varlen(CHARSET_INFO * cs, char *src, size_t srclen,
my_casedn_mb(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(dstlen >= srclen * cs->casedn_multiply);
DBUG_ASSERT(src != dst || cs->casedn_multiply == 1);
return my_casefold_mb_varlen(cs, src, srclen, dst, dstlen, cs->to_lower, 0);
return my_casefold_mb(cs, src, srclen, dst, dstlen, cs->to_lower, 0);
}
size_t
my_caseup_mb_varlen(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst, size_t dstlen)
my_caseup_mb(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(dstlen >= srclen * cs->caseup_multiply);
DBUG_ASSERT(src != dst || cs->caseup_multiply == 1);
return my_casefold_mb_varlen(cs, src, srclen, dst, dstlen, cs->to_upper, 1);
return my_casefold_mb(cs, src, srclen, dst, dstlen, cs->to_upper, 1);
}

View file

@ -214,28 +214,26 @@ size_t my_casedn_str_8bit(CHARSET_INFO * cs,char *str)
}
size_t my_caseup_8bit(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
size_t my_caseup_8bit(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
char *end= src + srclen;
const char *end= src + srclen;
register const uchar *map= cs->to_upper;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
for ( ; src != end ; src++)
*src= (char) map[(uchar) *src];
*dst++= (char) map[(uchar) *src];
return srclen;
}
size_t my_casedn_8bit(CHARSET_INFO * cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
size_t my_casedn_8bit(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
char *end= src + srclen;
const char *end= src + srclen;
register const uchar *map=cs->to_lower;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
for ( ; src != end ; src++)
*src= (char) map[(uchar) *src];
*dst++= (char) map[(uchar) *src];
return srclen;
}

View file

@ -1196,25 +1196,26 @@ my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
static size_t
my_caseup_utf16(CHARSET_INFO *cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
int res;
char *srcend= src + srclen;
const char *srcend= src + srclen;
char *dstend= dst + dstlen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
while ((src < srcend) &&
(res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
{
my_toupper_utf16(uni_plane, &wc);
if (res != wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
break;
src+= res;
dst+= res;
}
return srclen;
}
@ -1243,25 +1244,26 @@ my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen,
static size_t
my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
my_casedn_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
int res;
char *srcend= src + srclen;
const char *srcend= src + srclen;
char *dstend= dst + dstlen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
while ((src < srcend) &&
(res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
{
my_tolower_utf16(uni_plane, &wc);
if (res != wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
break;
src+= res;
dst+= res;
}
return srclen;
}
@ -1987,23 +1989,24 @@ my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
static size_t
my_caseup_utf32(CHARSET_INFO *cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
my_caseup_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int res;
char *srcend= src + srclen;
const char *srcend= src + srclen;
char *dstend= dst + dstlen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
while ((src < srcend) &&
(res= my_utf32_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
{
my_toupper_utf32(uni_plane, &wc);
if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
break;
src+= res;
dst+= res;
}
return srclen;
}
@ -2038,22 +2041,23 @@ my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen,
static size_t
my_casedn_utf32(CHARSET_INFO *cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
my_casedn_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int res;
char *srcend= src + srclen;
const char *srcend= src + srclen;
char *dstend= dst + dstlen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
{
my_tolower_utf32(uni_plane,&wc);
if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
break;
src+= res;
dst+= res;
}
return srclen;
}
@ -2950,23 +2954,24 @@ my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
*wc= page[*wc & 0xFF].sort;
}
static size_t my_caseup_ucs2(CHARSET_INFO *cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int res;
char *srcend= src + srclen;
const char *srcend= src + srclen;
char *dstend= dst + dstlen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
while ((src < srcend) &&
(res= my_ucs2_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
{
my_toupper_ucs2(uni_plane, &wc);
if (res != my_uni_ucs2(cs, wc, (uchar*) src, (uchar*) srcend))
if (res != my_uni_ucs2(cs, wc, (uchar*) dst, (uchar*) dstend))
break;
src+= res;
dst+= res;
}
return srclen;
}
@ -2995,23 +3000,24 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen,
}
static size_t my_casedn_ucs2(CHARSET_INFO *cs, char *src, size_t srclen,
char *dst __attribute__((unused)),
size_t dstlen __attribute__((unused)))
static size_t my_casedn_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int res;
char *srcend= src + srclen;
const char *srcend= src + srclen;
char *dstend= dst + dstlen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src == dst && srclen == dstlen);
DBUG_ASSERT(srclen <= dstlen);
while ((src < srcend) &&
(res= my_ucs2_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
{
my_tolower_ucs2(uni_plane, &wc);
if (res != my_uni_ucs2(cs, wc, (uchar*) src, (uchar*) srcend))
if (res != my_uni_ucs2(cs, wc, (uchar*) dst, (uchar*) dstend))
break;
src+= res;
dst+= res;
}
return srclen;
}

View file

@ -67179,12 +67179,12 @@ get_case_info_for_ch(CHARSET_INFO *cs, uint plane, uint page, uint offs)
*/
static size_t
my_casefold_ujis(CHARSET_INFO *cs,
char *src, size_t srclen,
const char *src, size_t srclen,
char *dst, size_t dstlen __attribute__((unused)),
const uchar * const map,
size_t is_upper)
{
char *srcend= src + srclen, *dst0= dst;
const char *srcend= src + srclen, *dst0= dst;
while (src < srcend)
{
@ -67226,7 +67226,7 @@ my_casefold_ujis(CHARSET_INFO *cs,
LOWER()
*/
size_t
my_casedn_ujis(CHARSET_INFO * cs, char *src, size_t srclen,
my_casedn_ujis(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(dstlen >= srclen * cs->casedn_multiply);
@ -67239,7 +67239,7 @@ my_casedn_ujis(CHARSET_INFO * cs, char *src, size_t srclen,
UPPER()
*/
size_t
my_caseup_ujis(CHARSET_INFO * cs, char *src, size_t srclen,
my_caseup_ujis(CHARSET_INFO * cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
DBUG_ASSERT(dstlen >= srclen * cs->caseup_multiply);

View file

@ -5065,12 +5065,13 @@ my_tosort_utf8mb3(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
*wc= page[*wc & 0xFF].sort;
}
static size_t my_caseup_utf8(CHARSET_INFO *cs, char *src, size_t srclen,
static size_t my_caseup_utf8(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int srcres, dstres;
char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst;
const char *srcend= src + srclen;
char *dstend= dst + dstlen, *dst0= dst;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src != dst || cs->caseup_multiply == 1);
@ -5136,12 +5137,13 @@ static size_t my_caseup_str_utf8(CHARSET_INFO *cs, char *src)
}
static size_t my_casedn_utf8(CHARSET_INFO *cs, char *src, size_t srclen,
static size_t my_casedn_utf8(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int srcres, dstres;
char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst;
const char *srcend= src + srclen;
char *dstend= dst + dstlen, *dst0= dst;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src != dst || cs->casedn_multiply == 1);
@ -7579,12 +7581,13 @@ my_toupper_utf8mb4(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
static size_t
my_caseup_utf8mb4(CHARSET_INFO *cs, char *src, size_t srclen,
my_caseup_utf8mb4(CHARSET_INFO *cs, const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int srcres, dstres;
char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst;
const char *srcend= src + srclen;
char *dstend= dst + dstlen, *dst0= dst;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src != dst || cs->caseup_multiply == 1);
@ -7666,12 +7669,13 @@ my_caseup_str_utf8mb4(CHARSET_INFO *cs, char *src)
static size_t
my_casedn_utf8mb4(CHARSET_INFO *cs,
char *src, size_t srclen,
const char *src, size_t srclen,
char *dst, size_t dstlen)
{
my_wc_t wc;
int srcres, dstres;
char *srcend= src + srclen, *dstend= dst + dstlen, *dst0= dst;
const char *srcend= src + srclen;
char *dstend= dst + dstlen, *dst0= dst;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT(src != dst || cs->casedn_multiply == 1);