Bug #14057034 : WASTED CPU CYCLES IN MY_UTF8_UNI WHERE

RESULTING MY_WC_T RESULT IS NOT USED
Issue         : handler functions my_ismbchar_utf8,
              my_well_formed_len_mb for charset utf8
              are calling the unicode conversion function
              to validate and to find the character
              length. Because of this, instructions
              which will convert the utf8 to unicode
              are executed for no use.
              A similar issue exists with charset utf8mb4
Solution      : reorganized the code such that character
              validation part of unicode conversion
              handler is extracted (duplicated) into a
              separate function. Hence
              my_ismbchar_utf8, my_well_formed_len_mb
              will call the new function which only
              validates and returns the length of mb(utf8).
              A similar fix for charset utf8mb4.

strings/ctype-utf8.c:
  New functions have been added for charset utf8 and utf8mb4
  to validate and to get the length of the character.
This commit is contained in:
mithun 2013-11-12 16:42:46 +05:30
parent 4189e05c13
commit 7c9112b9c7

View file

@ -27,6 +27,7 @@
#define EILSEQ ENOENT
#endif
/*
  True when byte c has the bit pattern 10xxxxxx (0x80..0xBF), i.e. is a
  UTF-8 continuation byte. Pass an unsigned byte value: a negative int
  would also compare below 0x40 after the XOR.
*/
#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)
#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
@ -57,6 +58,46 @@
#define HAVE_UNIDATA
#endif
#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4)
/**
  Validate a one- to three-byte UTF-8 sequence and report its length,
  without decoding it.

  The caller must pass a non-empty range (s < e) whose lead byte is
  below 0xf0; both conditions are checked only by debug assertions.

  @param s  first byte of the sequence
  @param e  one past the last readable byte

  @return 1, 2 or 3 for a valid sequence of that many bytes;
          MY_CS_ILSEQ for a lead byte in 0x80..0xc1, a bad continuation
          byte, or lead 0xe0 followed by a second byte below 0xa0;
          MY_CS_TOOSMALL2 / MY_CS_TOOSMALL3 when the buffer ends before
          the sequence does.
*/
static inline
int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
{
  uchar lead;

  DBUG_ASSERT(s < e);
  lead= s[0];

  if (lead < 0x80)
    return 1;                           /* one-byte character */

  if (lead < 0xc2)
    return MY_CS_ILSEQ;                 /* not accepted as a lead byte */

  if (lead >= 0xe0)
  {
    /* Three-byte form. */
    DBUG_ASSERT(lead < 0xf0);
    if (s + 3 > e)
      return MY_CS_TOOSMALL3;
    if (!IS_CONTINUATION_BYTE(s[1]) || !IS_CONTINUATION_BYTE(s[2]))
      return MY_CS_ILSEQ;
    if (lead < 0xe1 && s[1] < 0xa0)
      return MY_CS_ILSEQ;
    return 3;
  }

  /* Two-byte form. */
  if (s + 2 > e)
    return MY_CS_TOOSMALL2;
  return IS_CONTINUATION_BYTE(s[1]) ? 2 : MY_CS_ILSEQ;
}
#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
#ifdef HAVE_UNIDATA
#include "my_uctype.h"
@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
if (!((s[1] ^ 0x80) < 0x40))
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90)))
return MY_CS_ILSEQ;
@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+5 >e) /* We need 5 characters */
return MY_CS_TOOSMALL5;
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
(s[4] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
IS_CONTINUATION_BYTE(s[4]) &&
(c >= 0xf9 || s[1] >= 0x88)))
return MY_CS_ILSEQ;
@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if ( s+6 >e ) /* We need 6 characters */
return MY_CS_TOOSMALL6;
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
(s[4] ^ 0x80) < 0x40 &&
(s[5] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
IS_CONTINUATION_BYTE(s[4]) &&
IS_CONTINUATION_BYTE(s[5]) &&
(c >= 0xfd || s[1] >= 0x84)))
return MY_CS_ILSEQ;
@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
if (c < 0xf0)
{
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
}
/**
  Validate the UTF-8 sequence starting at s and report its byte length
  without converting it to a code point.

  Lead bytes below 0xf0 are delegated to my_valid_mbcharlen_utf8mb3().
  Longer forms (4 to 6 bytes) are accepted only when UNICODE_32BIT is
  defined and my_wc_t is at least 32 bits wide.

  @param cs  unused
  @param s   first byte to examine
  @param e   one past the last readable byte

  @return byte length of the valid sequence;
          MY_CS_TOOSMALL when s >= e;
          MY_CS_TOOSMALLn when an n-byte sequence is cut off by e;
          MY_CS_ILSEQ otherwise.
*/
static
int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
                            const uchar *s, const uchar *e)
{
  uchar lead;

  if (s >= e)
    return MY_CS_TOOSMALL;

  lead= s[0];
  if (lead < 0xf0)
    return my_valid_mbcharlen_utf8mb3(s, e);

#ifdef UNICODE_32BIT
  if (sizeof(my_wc_t) * 8 >= 32)
  {
    if (lead < 0xf8)
    {
      /* Four-byte form. */
      if (s + 4 > e)
        return MY_CS_TOOSMALL4;
      if (!IS_CONTINUATION_BYTE(s[1]) ||
          !IS_CONTINUATION_BYTE(s[2]) ||
          !IS_CONTINUATION_BYTE(s[3]) ||
          (lead < 0xf1 && s[1] < 0x90))
        return MY_CS_ILSEQ;
      return 4;
    }

    if (lead < 0xfc)
    {
      /* Five-byte form. */
      if (s + 5 > e)
        return MY_CS_TOOSMALL5;
      if (!IS_CONTINUATION_BYTE(s[1]) ||
          !IS_CONTINUATION_BYTE(s[2]) ||
          !IS_CONTINUATION_BYTE(s[3]) ||
          !IS_CONTINUATION_BYTE(s[4]) ||
          (lead < 0xf9 && s[1] < 0x88))
        return MY_CS_ILSEQ;
      return 5;
    }

    if (lead < 0xfe)
    {
      /* Six-byte form. */
      if (s + 6 > e)
        return MY_CS_TOOSMALL6;
      if (!IS_CONTINUATION_BYTE(s[1]) ||
          !IS_CONTINUATION_BYTE(s[2]) ||
          !IS_CONTINUATION_BYTE(s[3]) ||
          !IS_CONTINUATION_BYTE(s[4]) ||
          !IS_CONTINUATION_BYTE(s[5]) ||
          (lead < 0xfd && s[1] < 0x84))
        return MY_CS_ILSEQ;
      return 6;
    }
  }
#endif

  return MY_CS_ILSEQ;
}
/**
  Measure the leading part of [b, e) that consists of valid utf8
  sequences, stopping after at most pos characters.

  @param cs          charset, passed through to my_valid_mbcharlen_utf8()
  @param b           start of the buffer
  @param e           one past the end of the buffer
  @param pos         maximum number of characters to accept
  @param[out] error  set to 1 when scanning stopped because the validator
                     returned a non-positive result while b < e; set to 0
                     otherwise

  @return number of bytes from the original b covered by the accepted
          characters
*/
static size_t
my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
                        size_t pos, int *error)
{
  const char *b_start= b;
  *error= 0;
  while (pos)
  {
    /* The validator only reads the bytes, so keep the const qualifier. */
    int mb_len= my_valid_mbcharlen_utf8(cs, (const uchar*) b,
                                        (const uchar*) e);
    if (mb_len <= 0)
    {
      /* Running out of input (b == e) is not reported as an error. */
      *error= b < e ? 1 : 0;
      break;
    }
    b+= mb_len;
    pos--;
  }
  return (size_t) (b - b_start);
}
/*
  Returns the length computed for the sequence at b when it is greater
  than one byte, and 0 otherwise.

  NOTE(review): this view shows two declarations of `res`. The one calling
  my_utf8_uni (together with `wc`) appears to be the pre-change text and
  the one calling my_valid_mbcharlen_utf8 its replacement -- confirm
  against the actual source file.
*/
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
{
my_wc_t wc;
int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e);
int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
return (res>1) ? res : 0;
}
@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_mbcharlen_utf8,
my_numchars_mb,
my_charpos_mb,
my_well_formed_len_mb,
my_well_formed_len_utf8,
my_lengthsp_8bit,
my_numcells_mb,
my_utf8_uni,
@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
if (!((s[1] ^ 0x80) < 0x40))
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
[F4][80..8F][80..BF][80..BF]
*/
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
if (c < 0xe0)
{
if (!((s[1] ^ 0x80) < 0x40))
if (!IS_CONTINUATION_BYTE(s[1]))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
if (c < 0xf0)
{
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x0f) << 12) |
@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
}
else if (c < 0xf5)
{
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len)
}
/**
  Validate the utf8mb4 sequence starting at s and report its byte length
  without converting it to a code point.

  Lead bytes below 0xf0 are delegated to my_valid_mbcharlen_utf8mb3();
  lead bytes 0xf0..0xf4 are checked here as four-byte sequences.

  @param cs  unused
  @param s   first byte to examine
  @param e   one past the last readable byte

  @return byte length of the valid sequence;
          MY_CS_TOOSMALL when s >= e;
          MY_CS_TOOSMALL4 when a four-byte sequence is cut off by e
          (shorter forms report what the utf8mb3 helper returns);
          MY_CS_ILSEQ otherwise.
*/
static int
my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
                           const uchar *s, const uchar *e)
{
  uchar lead;

  if (s >= e)
    return MY_CS_TOOSMALL;

  lead= s[0];
  if (lead < 0xf0)
    return my_valid_mbcharlen_utf8mb3(s, e);

  if (lead >= 0xf5)
    return MY_CS_ILSEQ;

  if (s + 4 > e)                        /* four bytes are required */
    return MY_CS_TOOSMALL4;

  /*
    Four-byte layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

    The layout could encode U+00010000..U+001FFFFF, but the highest
    character defined by the Unicode standard is U+0010FFFF, so
    U+00110000..U+001FFFFF are not accepted.

      F0.90.80.80 == U+00010000 (min)
      F4.8F.BF.BF == U+0010FFFF (max)

    Accepted byte ranges:
      [F0][90..BF][80..BF][80..BF]
      [F1][80..BF][80..BF][80..BF]
      [F2][80..BF][80..BF][80..BF]
      [F3][80..BF][80..BF][80..BF]
      [F4][80..8F][80..BF][80..BF]
  */
  if (!IS_CONTINUATION_BYTE(s[1]) ||
      !IS_CONTINUATION_BYTE(s[2]) ||
      !IS_CONTINUATION_BYTE(s[3]))
    return MY_CS_ILSEQ;
  if (lead < 0xf1 && s[1] < 0x90)       /* below F0.90 */
    return MY_CS_ILSEQ;
  if (lead > 0xf3 && s[1] > 0x8F)       /* above F4.8F */
    return MY_CS_ILSEQ;

  return 4;
}
/**
  Measure the leading part of [b, e) that consists of valid utf8mb4
  sequences, stopping after at most pos characters.

  @param cs          charset, passed through to
                     my_valid_mbcharlen_utf8mb4()
  @param b           start of the buffer
  @param e           one past the end of the buffer
  @param pos         maximum number of characters to accept
  @param[out] error  set to 1 when scanning stopped because the validator
                     returned a non-positive result while b < e; set to 0
                     otherwise

  @return number of bytes from the original b covered by the accepted
          characters
*/
static
size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
                                  const char *b, const char *e,
                                  size_t pos, int *error)
{
  const char *b_start= b;
  *error= 0;
  while (pos)
  {
    /* The validator only reads the bytes, so keep the const qualifier. */
    int mb_len= my_valid_mbcharlen_utf8mb4(cs, (const uchar*) b,
                                           (const uchar*) e);
    if (mb_len <= 0)
    {
      /* Running out of input (b == e) is not reported as an error. */
      *error= b < e ? 1 : 0;
      break;
    }
    b+= mb_len;
    pos--;
  }
  return (size_t) (b - b_start);
}
/*
  Returns the length computed for the sequence at b when it is greater
  than one byte, and 0 otherwise.

  NOTE(review): this view shows two declarations of `res`. The one calling
  my_mb_wc_utf8mb4 (together with `wc`) appears to be the pre-change text
  and the one calling my_valid_mbcharlen_utf8mb4 its replacement -- confirm
  against the actual source file.
*/
static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{
my_wc_t wc;
int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e);
int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
return (res > 1) ? res : 0;
}
@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_mbcharlen_utf8mb4,
my_numchars_mb,
my_charpos_mb,
my_well_formed_len_mb,
my_well_formed_len_utf8mb4,
my_lengthsp_8bit,
my_numcells_mb,
my_mb_wc_utf8mb4,