WL#1386 - CTYPE table for unicode character sets

A prerequisite for several fulltext and XML bugs.
MY_CHARSET_HANDLER now has a new function "ctype"
to detect the type of the next character in a string
(e.g. digit, letter, space, punctuation, control, etc.),
which now works correctly for both 8bit and multibyte charsets.
Previously only 8bit charsets worked correctly,
while any multibyte character was considered a letter
in multibyte charsets.
Many files:
  Adding new function
Makefile.am:
  Adding build rules for uctypedump,
  a dump tool to create my_uctype.h
  using Unicode Character Database file.
m_ctype.h:
  Adding declaration of my_uni_ctype,
  ctype data for Unicode.
  Adding new member into MY_CHARSET_HANDLER
Makefile.am:
  Adding my_uctype.h into noinst_HEADERS
my_uctype.h, uctypedump.c:
  new files:
  ctype data for unicode,
  and the tool to generate it from 
  a Unicode Character Database file.



include/Makefile.am:
  Adding my_uctype.h
include/m_ctype.h:
  Adding declaration of my_uni_ctype,
  ctype data for Unicode.
strings/Makefile.am:
  Adding build rules for uctypedump,
  a dump tool to create my_uctype.h
  using Unicode Character Database file.
strings/ctype-big5.c:
  Adding new function
strings/ctype-bin.c:
  Adding new function
strings/ctype-cp932.c:
  Adding new function
strings/ctype-euc_kr.c:
  Adding new function
strings/ctype-eucjpms.c:
  Adding new function
strings/ctype-gb2312.c:
  Adding new function
strings/ctype-gbk.c:
  Adding new function
strings/ctype-latin1.c:
  Adding new function
strings/ctype-mb.c:
  Adding new function
strings/ctype-simple.c:
  Adding new function
strings/ctype-sjis.c:
  Adding new function
strings/ctype-tis620.c:
  Adding new function
strings/ctype-ucs2.c:
  Adding new function
strings/ctype-ujis.c:
  Adding new function
strings/ctype-utf8.c:
  Adding new function
This commit is contained in:
unknown 2006-02-02 10:07:47 +04:00
parent 55c304a17b
commit 4fa4383ba8
20 changed files with 1757 additions and 1 deletions

View file

@ -24,7 +24,7 @@ pkginclude_HEADERS = my_dbug.h m_string.h my_sys.h my_list.h my_xml.h \
sslopt-vars.h sslopt-case.h sql_common.h keycache.h \
mysql_time.h plugin.h $(BUILT_SOURCES)
noinst_HEADERS = config-win.h config-os2.h config-netware.h \
heap.h my_bitmap.h\
heap.h my_bitmap.h my_uctype.h \
myisam.h myisampack.h myisammrg.h ft_global.h\
mysys_err.h my_base.h help_start.h help_end.h \
my_nosys.h my_alarm.h queues.h rijndael.h sha1.h \

View file

@ -47,6 +47,15 @@ typedef struct unicase_info_st
extern MY_UNICASE_INFO *my_unicase_default[256];
extern MY_UNICASE_INFO *my_unicase_turkish[256];
/*
  Character-type (ctype) data for one 256-character page of Unicode.
  If "ctype" is non-NULL it is indexed by the low byte of the code point;
  if it is NULL, "pctype" is used as the type of every character in the page.
*/
typedef struct uni_ctype_st
{
unsigned char pctype;  /* page-wide type, used when ctype is NULL */
unsigned char *ctype;  /* per-character types for the page, or NULL */
} MY_UNI_CTYPE;
extern MY_UNI_CTYPE my_uni_ctype[256];
#define MY_CS_ILSEQ 0
#define MY_CS_ILUNI 0
#define MY_CS_TOOSMALL -1
@ -165,6 +174,10 @@ typedef struct my_charset_handler_st
int (*wc_mb)(struct charset_info_st *cs,my_wc_t wc,
unsigned char *s,unsigned char *e);
/* CTYPE scanner */
int (*ctype)(struct charset_info_st *cs, int *ctype,
const unsigned char *s, const unsigned char *e);
/* Functions for case and sort convertion */
void (*caseup_str)(struct charset_info_st *, char *);
void (*casedn_str)(struct charset_info_st *, char *);
@ -308,6 +321,9 @@ extern int my_strcasecmp_8bit(CHARSET_INFO * cs, const char *, const char *);
int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc, const uchar *s,const uchar *e);
int my_wc_mb_8bit(CHARSET_INFO *cs,my_wc_t wc, uchar *s, uchar *e);
int my_mb_ctype_8bit(CHARSET_INFO *,int *, const uchar *,const uchar *);
int my_mb_ctype_mb(CHARSET_INFO *,int *, const uchar *,const uchar *);
ulong my_scan_8bit(CHARSET_INFO *cs, const char *b, const char *e, int sq);
int my_snprintf_8bit(struct charset_info_st *, char *to, uint n,

1464
include/my_uctype.h Normal file

File diff suppressed because it is too large Load diff

View file

@ -40,6 +40,7 @@ endif
libmystrings_a_SOURCES = $(ASRCS) $(CSRCS)
noinst_PROGRAMS = conf_to_src
CLEANFILES = str_test uctypedump test_decimal
# Default charset definitions
EXTRA_DIST = ctype-big5.c ctype-cp932.c ctype-czech.c ctype-eucjpms.c ctype-euc_kr.c ctype-win1250ch.c \
ctype-gb2312.c ctype-gbk.c ctype-sjis.c ctype-utf8.c \
@ -77,6 +78,9 @@ FLAGS=$(DEFS) $(INCLUDES) $(CPPFLAGS) $(CFLAGS) @NOINST_LDFLAGS@
str_test: str_test.c $(pkglib_LIBRARIES)
$(LINK) $(FLAGS) -DMAIN $(INCLUDES) $(srcdir)/str_test.c $(LDADD) $(pkglib_LIBRARIES)
uctypedump: uctypedump.c
$(LINK) $(INCLUDES) $(srcdir)/uctypedump.c
test_decimal$(EXEEXT): decimal.c $(pkglib_LIBRARIES)
$(CP) $(srcdir)/decimal.c ./test_decimal.c
$(LINK) $(FLAGS) -DMAIN ./test_decimal.c $(LDADD) $(pkglib_LIBRARIES)

View file

@ -6356,6 +6356,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
my_numcells_8bit,
my_mb_wc_big5, /* mb_wc */
my_wc_mb_big5, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb,

View file

@ -503,6 +503,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_8bit,
my_mb_wc_bin,
my_wc_mb_bin,
my_mb_ctype_8bit,
my_case_str_bin,
my_case_str_bin,
my_case_bin,

View file

@ -5478,6 +5478,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_cp932,
my_mb_wc_cp932, /* mb_wc */
my_wc_mb_cp932, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_8bit,
my_casedn_str_8bit,
my_caseup_8bit,

View file

@ -8697,6 +8697,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_8bit,
my_mb_wc_euc_kr, /* mb_wc */
my_wc_mb_euc_kr, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb,

View file

@ -8663,6 +8663,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_eucjp,
my_mb_wc_euc_jp, /* mb_wc */
my_wc_mb_euc_jp, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb,

View file

@ -5748,6 +5748,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_8bit,
my_mb_wc_gb2312, /* mb_wc */
my_wc_mb_gb2312, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb,

View file

@ -10001,6 +10001,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_8bit,
my_mb_wc_gbk,
my_wc_mb_gbk,
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb,

View file

@ -397,6 +397,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_8bit,
my_mb_wc_latin1,
my_wc_mb_latin1,
my_mb_ctype_8bit,
my_caseup_str_8bit,
my_casedn_str_8bit,
my_caseup_8bit,

View file

@ -914,6 +914,22 @@ uint my_numcells_mb(CHARSET_INFO *cs, const char *b, const char *e)
}
/*
  Detect the type of the next character of a multibyte string.

  The character is decoded with the charset's mb_wc() and its type is
  looked up in my_uni_ctype[], which is indexed by the high byte of the
  code point (256 pages of 256 characters each).

  @param cs     Character set whose mb_wc() is used for decoding.
  @param ctype  OUT: type flags of the character; 0 if the character
                could not be decoded or lies outside the table.
  @param s      Start of the string.
  @param e      End of the string.

  @return The value returned by mb_wc(), unchanged.
*/
int my_mb_ctype_mb(CHARSET_INFO *cs, int *ctype,
                   const unsigned char *s, const unsigned char *e)
{
  my_wc_t wc;
  int res= cs->cset->mb_wc(cs, &wc, s, e);

  /*
    my_uni_ctype[] has only 256 pages, i.e. it covers U+0000..U+FFFF.
    A code point above that range would index past the end of the table,
    so report "no type" for it instead.
  */
  if (res <= 0 || wc > 0xFFFF)
    *ctype= 0;
  else
  {
    const MY_UNI_CTYPE *page= &my_uni_ctype[wc >> 8];
    /* A NULL per-character array means the whole page shares pctype */
    *ctype= page->ctype ? page->ctype[wc & 0xFF] : page->pctype;
  }
  return res;
}
MY_COLLATION_HANDLER my_collation_mb_bin_handler =
{
NULL, /* init */

View file

@ -1354,6 +1354,19 @@ longlong my_strtoll10_8bit(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  Detect the type of the next character of an 8bit string.

  @param cs     Character set providing the ctype table.
  @param ctype  OUT: type flags of the byte at s, or 0 if the string
                is exhausted.
  @param s      Start of the string.
  @param e      End of the string.

  @return 1 (one byte scanned), or MY_CS_TOOFEW(0) when s >= e.
*/
int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype,
                     const unsigned char *s, const unsigned char *e)
{
  if (s >= e)
  {
    *ctype= 0;
    return MY_CS_TOOFEW(0);
  }
  /*
    The 8bit ctype tables have a leading slot before the entry for
    byte 0 (the my_is*() macros access them as (cs->ctype + 1)[c]),
    so the entry for byte *s lives at index *s + 1.
  */
  *ctype= cs->ctype[*s + 1];
  return 1;
}
/*
Check if a constant can be propagated
@ -1420,6 +1433,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
my_numcells_8bit,
my_mb_wc_8bit,
my_wc_mb_8bit,
my_mb_ctype_8bit,
my_caseup_str_8bit,
my_casedn_str_8bit,
my_caseup_8bit,

View file

@ -4649,6 +4649,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_sjis,
my_mb_wc_sjis, /* mb_wc */
my_wc_mb_sjis, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_8bit,
my_casedn_str_8bit,
my_caseup_8bit,

View file

@ -877,6 +877,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_8bit,
my_mb_wc_tis620, /* mb_wc */
my_wc_mb_tis620, /* wc_mb */
my_mb_ctype_8bit,
my_caseup_str_8bit,
my_casedn_str_8bit,
my_caseup_8bit,

View file

@ -1615,6 +1615,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
my_numcells_mb,
my_ucs2_uni, /* mb_wc */
my_uni_ucs2, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_ucs2,
my_casedn_str_ucs2,
my_caseup_ucs2,

View file

@ -8531,6 +8531,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_numcells_eucjp,
my_mb_wc_euc_jp, /* mb_wc */
my_wc_mb_euc_jp, /* wc_mb */
my_mb_ctype_mb,
my_caseup_str_mb,
my_casedn_str_mb,
my_caseup_mb,

View file

@ -41,6 +41,8 @@
#ifdef HAVE_UNIDATA
#include "my_uctype.h"
static MY_UNICASE_INFO plane00[]={
{0x0000,0x0000,0x0000}, {0x0001,0x0001,0x0001},
{0x0002,0x0002,0x0002}, {0x0003,0x0003,0x0003},
@ -2534,6 +2536,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
my_numcells_mb,
my_utf8_uni,
my_uni_utf8,
my_mb_ctype_mb,
my_caseup_str_utf8,
my_casedn_str_utf8,
my_caseup_utf8,
@ -4027,6 +4030,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
my_numcells_mb,
my_mb_wc_filename,
my_wc_mb_filename,
my_mb_ctype_mb,
my_caseup_str_utf8,
my_casedn_str_utf8,
my_caseup_utf8,

226
strings/uctypedump.c Normal file
View file

@ -0,0 +1,226 @@
/*
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
*/
#include <my_global.h>
#include <m_string.h>
#include <m_ctype.h>
#include "m_ctype.h"
/*
  One entry of the category-name lookup table:
  a category abbreviation and the ctype flag mask it maps to.
*/
typedef struct my_ctype_name_st
{
const char *name;  /* category abbreviation; NULL marks the end of the table */
int val;           /* combination of _MY_* ctype flags for this category */
} MY_CTYPE_NAME_ST;
/*
  Mapping from two-letter category abbreviations (as found in field 2
  of the input records parsed by main()) to _MY_* ctype flag masks.
  The table is terminated by an entry whose name is NULL;
  ctypestr2num() scans it linearly.
*/
static MY_CTYPE_NAME_ST my_ctype_name[]=
{
{"Lu", _MY_U}, /* Letter, Uppercase */
{"Ll", _MY_L}, /* Letter, Lowercase */
{"Lt", _MY_U}, /* Letter, Titlecase */
{"Lm", _MY_L}, /* Letter, Modifier */
{"Lo", _MY_L}, /* Letter, other */
{"Nd", _MY_NMR}, /* Number, Decimal Digit */
{"Nl", _MY_NMR|_MY_U|_MY_L}, /* Number, Letter */
{"No", _MY_NMR|_MY_PNT}, /* Number, Other */
{"Mn", _MY_L|_MY_PNT}, /* Mark, Nonspacing */
{"Mc", _MY_L|_MY_PNT}, /* Mark, Spacing Combining */
{"Me", _MY_L|_MY_PNT}, /* Mark, Enclosing */
{"Pc", _MY_PNT}, /* Punctuation, Connector */
{"Pd", _MY_PNT}, /* Punctuation, Dash */
{"Ps", _MY_PNT}, /* Punctuation, Open */
{"Pe", _MY_PNT}, /* Punctuation, Close */
{"Pi", _MY_PNT}, /* Punctuation, Initial quote */
{"Pf", _MY_PNT}, /* Punctuation, Final quote */
{"Po", _MY_PNT}, /* Punctuation, Other */
{"Sm", _MY_PNT}, /* Symbol, Math */
{"Sc", _MY_PNT}, /* Symbol, Currency */
{"Sk", _MY_PNT}, /* Symbol, Modifier */
{"So", _MY_PNT}, /* Symbol, Other */
{"Zs", _MY_SPC}, /* Separator, Space */
{"Zl", _MY_SPC}, /* Separator, Line */
{"Zp", _MY_SPC}, /* Separator, Paragraph */
{"Cc", _MY_CTR}, /* Other, Control */
{"Cf", _MY_CTR}, /* Other, Format */
{"Cs", _MY_CTR}, /* Other, Surrogate */
{"Co", _MY_CTR}, /* Other, Private Use */
{"Cn", _MY_CTR}, /* Other, Not Assigned */
{NULL, 0}
};
/*
  Translate a category abbreviation into its ctype flag mask.

  Only the first two characters of "tok" are compared, ignoring case,
  against each name in my_ctype_name[] until the NULL terminator.

  @param tok  Category abbreviation to look up.
  @return The flag mask of the first matching entry, or 0 if none matches.
*/
static int
ctypestr2num(const char *tok)
{
  size_t idx= 0;

  while (my_ctype_name[idx].name != NULL)
  {
    if (strncasecmp(my_ctype_name[idx].name, tok, 2) == 0)
      return my_ctype_name[idx].val;
    idx++;
  }
  return 0;
}
/*
  Generate the C source of the Unicode ctype tables.

  Input: ';'-separated records, one per line, read from the file named
  by av[1] or from stdin when no argument is given. Field 0 is a
  hexadecimal code point, field 2 is a category abbreviation understood
  by ctypestr2num(). Only code points up to 0xFFFF are stored.

  Output (stdout): one "uctype_pageXX[256]" array for every page whose
  characters do not all share one type, followed by the
  "my_uni_ctype[256]" page index.

  @return 0 on success. Exits with status 1 if the input file
          cannot be opened.
*/
int main(int ac, char ** av)
{
  char str[1024];
  unsigned char ctypea[64*1024];
  size_t i;
  size_t plane;
  MY_UNI_CTYPE uctype[256];
  FILE *f= stdin;
  const char *input_name= (ac > 1 && av[1]) ? av[1] : "stdin";

  if (ac > 1 && av[1] && !(f= fopen(av[1],"r")))
  {
    fprintf(stderr, "Can't open file %s\n", av[1]);
    exit(1);
  }
  bzero(&ctypea,sizeof(ctypea));
  bzero(&uctype, sizeof(uctype));

  printf("/*\n");
  printf(" Unicode ctype data\n");
  printf(" Generated from %s\n", input_name);
  printf("*/\n");

  while (fgets(str, sizeof(str), f))
  {
    size_t n= 0;        /* index of the current field */
    long code= -1;      /* -1 until a code point has been parsed */
    int ctype= 0;
    char *s, *e;

    for (s= str; s; n++)
    {
      /* A field is never longer than the line it was cut from */
      char tok[sizeof(str)];
      size_t len;

      e= strchr(s, ';');
      len= e ? (size_t) (e - s) : strlen(s);
      memcpy(tok, s, len);
      tok[len]= 0;

      switch (n)
      {
      case 0:
      {
        char *end;
        code= strtol(tok, &end, 16);
        /*
          No hex digits at all (e.g. a blank line): ignore the record
          instead of treating it as code point 0 and overwriting the
          entry for U+0000.
        */
        if (end == tok)
          code= -1;
        break;
      }
      case 2:
        ctype= ctypestr2num(tok);
        break;
      default:
        break;
      }
      s= e ? e + 1 : NULL;
    }
    if (code >= 0 && code <= 0xFFFF)
      ctypea[code]= (unsigned char) ctype;
  }

  if (f != stdin)
    fclose(f);

  /* Fill digits */
  for (i= '0'; i <= '9'; i++)
    ctypea[i]= _MY_NMR;

  /*
    NOTE(review): _MY_X is added to every ASCII letter here. If _MY_X is
    meant to flag hexadecimal digits only, these ranges should presumably
    be 'a'..'f' and 'A'..'F' - confirm against m_ctype.h before changing,
    because it alters the generated table.
  */
  for (i= 'a'; i <= 'z'; i++)
    ctypea[i]|= _MY_X;
  for (i= 'A'; i <= 'Z'; i++)
    ctypea[i]|= _MY_X;

  /* Fill ideographs */

  /* CJK Ideographs Extension A (U+3400 - U+4DB5) */
  for (i= 0x3400; i <= 0x4DB5; i++)
    ctypea[i]= _MY_L | _MY_U;

  /* CJK Ideographs (U+4E00 - U+9FA5) */
  for (i= 0x4E00; i <= 0x9FA5; i++)
    ctypea[i]= _MY_L | _MY_U;

  /* Hangul Syllables (U+AC00 - U+D7A3) */
  for (i= 0xAC00; i <= 0xD7A3; i++)
    ctypea[i]= _MY_L | _MY_U;

  /* Calc plane parameters */
  for (plane= 0; plane < 256; plane++)
  {
    size_t character;

    uctype[plane].ctype= ctypea + plane * 256;
    uctype[plane].pctype= uctype[plane].ctype[0];
    for (character= 1; character < 256; character++)
    {
      if (uctype[plane].ctype[character] != uctype[plane].pctype)
      {
        uctype[plane].pctype= 0; /* Mixed plane */
        break;
      }
    }
    if (character == 256) /* All the same, no need to dump the whole plane */
      uctype[plane].ctype= NULL;
  }

  /* Dump mixed planes */
  for (plane= 0; plane < 256; plane++)
  {
    if (uctype[plane].ctype)
    {
      int charnum;
      int num= 0;   /* values printed on the current output line */

      /* %02X expects unsigned int; plane is size_t, so cast explicitly */
      printf("static unsigned char uctype_page%02X[256]=\n{\n",
             (unsigned int) plane);
      for (charnum= 0; charnum < 256; charnum++)
      {
        printf(" %2d%s",uctype[plane].ctype[charnum],charnum<255?",":"");
        num++;
        if (num == 16)
        {
          printf("\n");
          num= 0;
        }
      }
      printf("};\n\n");
    }
  }

  /* Dump plane index */
  printf("MY_UNI_CTYPE my_uni_ctype[256]={\n");
  for (plane= 0; plane < 256; plane++)
  {
    char plane_name[128]="NULL";

    if (uctype[plane].ctype)
      sprintf(plane_name,"uctype_page%02X",(unsigned int) plane);
    printf("\t{%d,%s}%s\n",uctype[plane].pctype,plane_name,plane<255?",":"");
  }
  printf("};\n");

  return 0;
}