mariadb/strings/uctypedump.c
bar@mysql.com d7c773834b WL#1386 - CTYPE table for unicode character sets
A prerequisite for several fulltext and XML bugs.
MY_CHARSET_HANDLER now has a new function "ctype"
to detect the type of the next character in a string
(e.g. digit, letter, space, punctuation, control, etc.),
which now works correctly for both 8-bit and multibyte charsets.
Previously only 8-bit charsets worked correctly,
while any multibyte character was considered a letter
in multibyte charsets.
Many files:
  Adding new function
Makefile.am:
  Adding build rules for uctypedump,
  a dump tool to create my_uctype.h
  using Unicode Character Database file.
m_ctype.h:
  Adding declaration of my_uni_ctype,
  ctype data for Unicode.
  Adding new member into MY_CHARSET_HANDLER
Makefile.am:
  Adding my_uctype.h into noinst_HEADERS
my_uctype.h, uctypedump.c:
  new files:
  ctype data for unicode,
  and the tool to generate it from 
  a Unicode Character Database file.
2006-02-02 10:07:47 +04:00

226 lines
5.4 KiB
C

/*
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
*/
#include <my_global.h>
#include <m_string.h>
#include <m_ctype.h>
#include "m_ctype.h"
/*
  One row of a name -> flags lookup table: a category abbreviation
  string and the integer ctype flag value it translates to.
*/
struct my_ctype_name_st
{
  const char *name;   /* abbreviation string, e.g. "Lu"           */
  int val;            /* flag bits associated with that category  */
};

typedef struct my_ctype_name_st MY_CTYPE_NAME_ST;
/*
  Translation table from two-letter general category abbreviations
  to ctype flag combinations.  The list ends with a {NULL, 0} sentinel,
  which is what the lookup loop in ctypestr2num() stops on.
*/
static MY_CTYPE_NAME_ST my_ctype_name[]=
{
  /* L*: letters */
  {"Lu", _MY_U},                    /* uppercase letter            */
  {"Ll", _MY_L},                    /* lowercase letter            */
  {"Lt", _MY_U},                    /* titlecase letter            */
  {"Lm", _MY_L},                    /* modifier letter             */
  {"Lo", _MY_L},                    /* other letter                */

  /* N*: numbers */
  {"Nd", _MY_NMR},                  /* decimal digit               */
  {"Nl", _MY_NMR|_MY_U|_MY_L},      /* letter-like number          */
  {"No", _MY_NMR|_MY_PNT},          /* other number                */

  /* M*: marks */
  {"Mn", _MY_L|_MY_PNT},            /* nonspacing mark             */
  {"Mc", _MY_L|_MY_PNT},            /* spacing combining mark      */
  {"Me", _MY_L|_MY_PNT},            /* enclosing mark              */

  /* P*: punctuation */
  {"Pc", _MY_PNT},                  /* connector                   */
  {"Pd", _MY_PNT},                  /* dash                        */
  {"Ps", _MY_PNT},                  /* opening punctuation         */
  {"Pe", _MY_PNT},                  /* closing punctuation         */
  {"Pi", _MY_PNT},                  /* initial quote               */
  {"Pf", _MY_PNT},                  /* final quote                 */
  {"Po", _MY_PNT},                  /* other punctuation           */

  /* S*: symbols */
  {"Sm", _MY_PNT},                  /* math symbol                 */
  {"Sc", _MY_PNT},                  /* currency symbol             */
  {"Sk", _MY_PNT},                  /* modifier symbol             */
  {"So", _MY_PNT},                  /* other symbol                */

  /* Z*: separators */
  {"Zs", _MY_SPC},                  /* space separator             */
  {"Zl", _MY_SPC},                  /* line separator              */
  {"Zp", _MY_SPC},                  /* paragraph separator         */

  /* C*: other */
  {"Cc", _MY_CTR},                  /* control                     */
  {"Cf", _MY_CTR},                  /* format                      */
  {"Cs", _MY_CTR},                  /* surrogate                   */
  {"Co", _MY_CTR},                  /* private use                 */
  {"Cn", _MY_CTR},                  /* not assigned                */

  {NULL, 0}                         /* end-of-table sentinel       */
};
/*
  Translate a general category abbreviation into ctype flags.

  Walks my_ctype_name[] up to its NULL-name sentinel and compares each
  entry name with "tok" case-insensitively over at most two characters,
  so only the first two characters of "tok" take part in the match.

  @param tok  NUL-terminated category string

  @return the "val" of the first matching entry, or 0 if nothing matches
*/
static int
ctypestr2num(const char *tok)
{
  MY_CTYPE_NAME_ST *entry= my_ctype_name;

  while (entry->name != NULL)
  {
    if (strncasecmp(entry->name, tok, 2) == 0)
      return entry->val;
    entry++;
  }

  return 0;
}
/*
  Parse one input line as a list of ';'-separated fields.

  Field 0 is read as a hexadecimal code point, field 2 is translated
  into ctype flags with ctypestr2num().  All other fields are ignored.

  @param line   NUL-terminated input line
  @param code   receives the code point
  @param ctype  receives the ctype flags (0 for an unknown category)

  @return 1 if field 0 held a non-negative hexadecimal number and
          field 2 was present; 0 for blank, truncated or otherwise
          malformed lines, in which case *code and *ctype must not be used
*/
static int
parse_ucd_line(const char *line, size_t *code, int *ctype)
{
  char tok[1024];
  const char *s= line;
  size_t field;
  int have_code= 0;
  int have_ctype= 0;

  for (field= 0; s != NULL && field <= 2; field++)
  {
    const char *e= strchr(s, ';');
    size_t len= e ? (size_t) (e - s) : strlen(s);

    if (len >= sizeof(tok))
      return 0;                         /* field does not fit: reject line */
    memcpy(tok, s, len);
    tok[len]= '\0';

    if (field == 0)
    {
      char *end;
      long val= strtol(tok, &end, 16);
      /* end == tok means strtol() found no hexadecimal digits at all */
      if (end == tok || val < 0)
        return 0;
      *code= (size_t) val;
      have_code= 1;
    }
    else if (field == 2)
    {
      *ctype= ctypestr2num(tok);
      have_ctype= 1;
    }

    s= e ? e + 1 : NULL;
  }

  return have_code && have_ctype;
}


/* Store "val" into tab[first..last], both bounds inclusive. */
static void
fill_range(unsigned char *tab, size_t first, size_t last, unsigned char val)
{
  size_t i;
  for (i= first; i <= last; i++)
    tab[i]= val;
}


/*
  Generate C source with ctype data for code points U+0000..U+FFFF.

  Reads a Unicode Character Database file (path in av[1], or stdin when
  no argument is given) and prints to stdout:
    - one "uctype_pageXX" array for every 256-character page whose
      characters do not all share the same ctype value;
    - the "my_uni_ctype" index with one entry per page.

  @return 0 on success, 1 if the input cannot be opened or read
*/
int main(int ac, char ** av)
{
  char str[1024];
  /* Static storage: zero-initialised and keeps ~64KB off the stack */
  static unsigned char ctypea[64*1024];
  static MY_UNI_CTYPE uctype[256];
  const char *src_name= (ac > 1 && av[1]) ? av[1] : NULL;
  size_t i;
  size_t plane;
  FILE *f= stdin;

  if (src_name && !(f= fopen(src_name,"r")))
  {
    fprintf(stderr, "Can't open file %s\n", src_name);
    return 1;
  }

  printf("/*\n");
  printf(" Unicode ctype data\n");
  printf(" Generated from %s\n", src_name ? src_name : "stdin");
  printf("*/\n");

  while (fgets(str, sizeof(str), f))
  {
    size_t code;
    int ctype;

    /*
      Skip lines that carry no usable code/category pair; storing them
      would overwrite the entry for code 0 with an empty ctype.
    */
    if (!parse_ucd_line(str, &code, &ctype))
      continue;
    if (code <= 0xFFFF)
      ctypea[code]= (unsigned char) ctype;
  }

  if (ferror(f))
  {
    fprintf(stderr, "Error reading %s\n", src_name ? src_name : "stdin");
    if (f != stdin)
      fclose(f);
    return 1;
  }
  if (f != stdin)
    fclose(f);

  /* Fill digits */
  /* NOTE(review): digits get _MY_NMR only, as before - confirm whether
     consumers also expect _MY_X on '0'..'9'. */
  fill_range(ctypea, '0', '9', _MY_NMR);

  /*
    Mark hexadecimal letters.  Only a-f/A-F are hexadecimal digits;
    flagging the whole alphabet would misclassify g-z/G-Z.
  */
  for (i= 'a'; i <= 'f'; i++)
    ctypea[i]|= _MY_X;
  for (i= 'A'; i <= 'F'; i++)
    ctypea[i]|= _MY_X;

  /* Fill ideographs */
  /* CJK Ideographs Extension A (U+3400 - U+4DB5) */
  fill_range(ctypea, 0x3400, 0x4DB5, _MY_L | _MY_U);
  /* CJK Ideographs (U+4E00 - U+9FA5) */
  fill_range(ctypea, 0x4E00, 0x9FA5, _MY_L | _MY_U);
  /* Hangul Syllables (U+AC00 - U+D7A3) */
  fill_range(ctypea, 0xAC00, 0xD7A3, _MY_L | _MY_U);

  /* Calc plane parameters */
  for (plane= 0; plane < 256; plane++)
  {
    unsigned char *page= ctypea + plane * 256;
    size_t character;

    for (character= 1; character < 256; character++)
    {
      if (page[character] != page[0])
        break;
    }

    if (character == 256)
    {
      /* All the same: no need to dump the whole plane */
      uctype[plane].pctype= page[0];
      uctype[plane].ctype= NULL;
    }
    else
    {
      /* Mixed plane: per-character data is required */
      uctype[plane].pctype= 0;
      uctype[plane].ctype= page;
    }
  }

  /* Dump mixed planes */
  for (plane= 0; plane < 256; plane++)
  {
    int charnum;

    if (!uctype[plane].ctype)
      continue;

    /* "plane" is size_t; %X requires unsigned int, hence the cast */
    printf("static unsigned char uctype_page%02X[256]=\n{\n",
           (unsigned int) plane);
    for (charnum= 0; charnum < 256; charnum++)
    {
      printf(" %2d%s", uctype[plane].ctype[charnum], charnum < 255 ? "," : "");
      if (charnum % 16 == 15)           /* 16 values per output line */
        printf("\n");
    }
    printf("};\n\n");
  }

  /* Dump plane index */
  printf("MY_UNI_CTYPE my_uni_ctype[256]={\n");
  for (plane= 0; plane < 256; plane++)
  {
    const char *sep= plane < 255 ? "," : "";

    if (uctype[plane].ctype)
      printf("\t{%d,uctype_page%02X}%s\n",
             (int) uctype[plane].pctype, (unsigned int) plane, sep);
    else
      printf("\t{%d,NULL}%s\n", (int) uctype[plane].pctype, sep);
  }
  printf("};\n");

  return 0;
}