10.0-base merge

This commit is contained in:
Sergei Golubchik 2014-02-26 15:28:07 +01:00
commit 0dc23679c8
1449 changed files with 6137 additions and 2205 deletions

View file

@ -26,6 +26,7 @@
#define EILSEQ ENOENT
#endif
#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)
#define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
#define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
@ -56,6 +57,46 @@
#define HAVE_UNIDATA
#endif
#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4)
static inline
int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
{
uchar c;
DBUG_ASSERT(s < e);
c= s[0];
if (c < 0x80)
return 1;
if (c < 0xc2)
return MY_CS_ILSEQ;
if (c < 0xe0)
{
if (s+2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
return 2;
}
DBUG_ASSERT(c < 0xf0);
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
return 3;
}
#endif /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
#ifdef HAVE_UNIDATA
#include "my_uctype.h"
@ -4806,7 +4847,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
if (!((s[1] ^ 0x80) < 0x40))
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@ -4817,7 +4858,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@ -4833,9 +4874,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90)))
return MY_CS_ILSEQ;
@ -4851,10 +4892,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if (s+5 >e) /* We need 5 characters */
return MY_CS_TOOSMALL5;
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
(s[4] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
IS_CONTINUATION_BYTE(s[4]) &&
(c >= 0xf9 || s[1] >= 0x88)))
return MY_CS_ILSEQ;
@ -4870,11 +4911,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
if ( s+6 >e ) /* We need 6 characters */
return MY_CS_TOOSMALL6;
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
(s[4] ^ 0x80) < 0x40 &&
(s[5] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
IS_CONTINUATION_BYTE(s[4]) &&
IS_CONTINUATION_BYTE(s[5]) &&
(c >= 0xfd || s[1] >= 0x84)))
return MY_CS_ILSEQ;
@ -4918,11 +4959,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
if (c < 0xf0)
{
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@ -5404,10 +5445,90 @@ int my_wildcmp_utf8(CHARSET_INFO *cs,
}
static
int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
const uchar *s, const uchar *e)
{
uchar c;
if (s >= e)
return MY_CS_TOOSMALL;
c= s[0];
if (c < 0xf0)
return my_valid_mbcharlen_utf8mb3(s, e);
#ifdef UNICODE_32BIT
if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
{
if (s+4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90)))
return MY_CS_ILSEQ;
return 4;
}
if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
{
if (s+5 >e) /* We need 5 characters */
return MY_CS_TOOSMALL5;
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
IS_CONTINUATION_BYTE(s[4]) &&
(c >= 0xf9 || s[1] >= 0x88)))
return MY_CS_ILSEQ;
return 5;
}
if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
{
if ( s+6 >e ) /* We need 6 characters */
return MY_CS_TOOSMALL6;
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
IS_CONTINUATION_BYTE(s[4]) &&
IS_CONTINUATION_BYTE(s[5]) &&
(c >= 0xfd || s[1] >= 0x84)))
return MY_CS_ILSEQ;
return 6;
}
#endif
return MY_CS_ILSEQ;
}
static size_t
my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
size_t pos, int *error)
{
const char *b_start= b;
*error= 0;
while (pos)
{
int mb_len;
if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
{
*error= b < e ? 1 : 0;
break;
}
b+= mb_len;
pos--;
}
return (size_t) (b - b_start);
}
static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
{
my_wc_t wc;
int res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e);
int res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
return (res>1) ? res : 0;
}
@ -5472,7 +5593,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_mbcharlen_utf8,
my_numchars_mb,
my_charpos_mb,
my_well_formed_len_mb,
my_well_formed_len_utf8,
my_lengthsp_8bit,
my_numcells_mb,
my_utf8_uni,
@ -7244,7 +7365,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 2 > e) /* We need 2 characters */
return MY_CS_TOOSMALL2;
if (!((s[1] ^ 0x80) < 0x40))
if (!(IS_CONTINUATION_BYTE(s[1])))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@ -7255,7 +7376,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
if (s + 3 > e) /* We need 3 characters */
return MY_CS_TOOSMALL3;
if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
@ -7288,9 +7409,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
[F4][80..8F][80..BF][80..BF]
*/
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@ -7326,17 +7447,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
if (c < 0xe0)
{
if (!((s[1] ^ 0x80) < 0x40))
if (!IS_CONTINUATION_BYTE(s[1]))
return MY_CS_ILSEQ;
*pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
return 2;
}
if (c < 0xf0)
{
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
(c >= 0xe1 || s[1] >= 0xa0)))
return MY_CS_ILSEQ;
*pwc= ((my_wc_t) (c & 0x0f) << 12) |
@ -7347,9 +7468,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
}
else if (c < 0xf5)
{
if (!((s[1] ^ 0x80) < 0x40 &&
(s[2] ^ 0x80) < 0x40 &&
(s[3] ^ 0x80) < 0x40 &&
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
@ -7836,11 +7957,84 @@ my_wildcmp_utf8mb4(CHARSET_INFO *cs,
}
static int
my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
const uchar *s, const uchar *e)
{
uchar c;
if (s >= e)
return MY_CS_TOOSMALL;
c= s[0];
if (c < 0xf0)
return my_valid_mbcharlen_utf8mb3(s, e);
if (c < 0xf5)
{
if (s + 4 > e) /* We need 4 characters */
return MY_CS_TOOSMALL4;
/*
UTF-8 quick four-byte mask:
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
Encoding allows to encode U+00010000..U+001FFFFF
The maximum character defined in the Unicode standard is U+0010FFFF.
Higher characters U+00110000..U+001FFFFF are not used.
11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
Valid codes:
[F0][90..BF][80..BF][80..BF]
[F1][80..BF][80..BF][80..BF]
[F2][80..BF][80..BF][80..BF]
[F3][80..BF][80..BF][80..BF]
[F4][80..8F][80..BF][80..BF]
*/
if (!(IS_CONTINUATION_BYTE(s[1]) &&
IS_CONTINUATION_BYTE(s[2]) &&
IS_CONTINUATION_BYTE(s[3]) &&
(c >= 0xf1 || s[1] >= 0x90) &&
(c <= 0xf3 || s[1] <= 0x8F)))
return MY_CS_ILSEQ;
return 4;
}
return MY_CS_ILSEQ;
}
static
size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
const char *b, const char *e,
size_t pos, int *error)
{
const char *b_start= b;
*error= 0;
while (pos)
{
int mb_len;
if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
{
*error= b < e ? 1 : 0;
break;
}
b+= mb_len;
pos--;
}
return (size_t) (b - b_start);
}
static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{
my_wc_t wc;
int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e);
int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
return (res > 1) ? res : 0;
}
@ -7901,7 +8095,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_mbcharlen_utf8mb4,
my_numchars_mb,
my_charpos_mb,
my_well_formed_len_mb,
my_well_formed_len_utf8mb4,
my_lengthsp_8bit,
my_numcells_mb,
my_mb_wc_utf8mb4,
@ -7963,7 +8157,8 @@ struct charset_info_st my_charset_utf8mb4_general_ci=
struct charset_info_st my_charset_utf8mb4_bin=
{
46,0,0, /* number */
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT, /* state */
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|
MY_CS_UNICODE_SUPPLEMENT, /* state */
MY_UTF8MB4, /* cs name */
MY_UTF8MB4_BIN, /* name */
"UTF-8 Unicode", /* comment */