WL#1386 - CTYPE table for unicode character sets

A prerequisite for fixing several fulltext and XML bugs. MY_CHARSET_HANDLER now has a new function "ctype" that detects the type of the next character in a string (e.g. digit, letter, space, punctuation, control), and it works correctly for both 8-bit and multibyte charsets. Previously only 8-bit charsets worked correctly, while any multibyte character in a multibyte charset was considered a letter. Many files: Adding new function Makefile.am: Adding build rules for uctypedump, a dump tool to create my_uctype.h using Unicode Character Database file. m_ctype.h: Adding declaration of my_uni_ctype, ctype data for Unicode. Adding new member into MY_CHARSET_HANDLER Makefile.am: Adding my_uctype.h into noinst_HEADERS my_uctype.h, uctypedump.c: new files: ctype data for unicode, and the tool to generate it from a Unicode Character Database file. include/Makefile.am: Adding my_uctype.h include/m_ctype.h: Adding declaration of my_uni_ctype, ctype data for Unicode. strings/Makefile.am: Adding build rules for uctypedump, a dump tool to create my_uctype.h using Unicode Character Database file. strings/ctype-big5.c: Adding new function strings/ctype-bin.c: Adding new function strings/ctype-cp932.c: Adding new function strings/ctype-euc_kr.c: Adding new function strings/ctype-eucjpms.c: Adding new function strings/ctype-gb2312.c: Adding new function strings/ctype-gbk.c: Adding new function strings/ctype-latin1.c: Adding new function strings/ctype-mb.c: Adding new function strings/ctype-simple.c: Adding new function strings/ctype-sjis.c: Adding new function strings/ctype-tis620.c: Adding new function strings/ctype-ucs2.c: Adding new function strings/ctype-ujis.c: Adding new function strings/ctype-utf8.c: Adding new function
2025-01-28 17:54:16 +01:00 · 2006-02-02 10:07:47 +04:00 · 2006-02-02 10:07:47 +04:00 · 4fa4383ba8
commit 4fa4383ba8
parent 55c304a17b
20 changed files with 1757 additions and 1 deletions
--- a/include/Makefile.am
+++ b/include/Makefile.am
@ -24,7 +24,7 @@ pkginclude_HEADERS =	my_dbug.h m_string.h my_sys.h my_list.h my_xml.h \
 			sslopt-vars.h sslopt-case.h sql_common.h keycache.h \
 			mysql_time.h plugin.h $(BUILT_SOURCES)
 noinst_HEADERS =	config-win.h config-os2.h config-netware.h \
-			heap.h my_bitmap.h\
+			heap.h my_bitmap.h my_uctype.h \
 			myisam.h myisampack.h myisammrg.h ft_global.h\
 			mysys_err.h my_base.h help_start.h help_end.h \
 			my_nosys.h my_alarm.h queues.h rijndael.h sha1.h \
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@ -47,6 +47,15 @@ typedef struct unicase_info_st
 extern MY_UNICASE_INFO *my_unicase_default[256];
 extern MY_UNICASE_INFO *my_unicase_turkish[256];

+/*
+  Ctype data for one block of 256 consecutive Unicode code points.
+  The my_uni_ctype[] array holds 256 of these and is indexed by the
+  high byte of the code point (wc >> 8).
+*/
+typedef struct uni_ctype_st
+{
+  /* Ctype value used for the whole block when "ctype" is NULL */
+  unsigned char  pctype;
+  /* Per-character ctype values indexed by (wc & 0xFF), or NULL */
+  unsigned char  *ctype;
+} MY_UNI_CTYPE;
+
+extern MY_UNI_CTYPE my_uni_ctype[256];
+
+
 #define MY_CS_ILSEQ	0
 #define MY_CS_ILUNI	0
 #define MY_CS_TOOSMALL	-1
@ -165,6 +174,10 @@ typedef struct my_charset_handler_st
  int (*wc_mb)(struct charset_info_st *cs,my_wc_t wc,
 	       unsigned char *s,unsigned char *e);
  
+  /* CTYPE scanner */
+  int (*ctype)(struct charset_info_st *cs, int *ctype,
+               const unsigned char *s, const unsigned char *e);
+  
  /* Functions for case and sort convertion */
  void    (*caseup_str)(struct charset_info_st *, char *);
  void    (*casedn_str)(struct charset_info_st *, char *);
@ -308,6 +321,9 @@ extern int my_strcasecmp_8bit(CHARSET_INFO * cs, const char *, const char *);
 int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc, const uchar *s,const uchar *e);
 int my_wc_mb_8bit(CHARSET_INFO *cs,my_wc_t wc, uchar *s, uchar *e);

+int my_mb_ctype_8bit(CHARSET_INFO *,int *, const uchar *,const uchar *);
+int my_mb_ctype_mb(CHARSET_INFO *,int *, const uchar *,const uchar *);
+
 ulong my_scan_8bit(CHARSET_INFO *cs, const char *b, const char *e, int sq);

 int my_snprintf_8bit(struct charset_info_st *, char *to, uint n,
--- a/include/my_uctype.h
+++ b/include/my_uctype.h
--- a/strings/Makefile.am
+++ b/strings/Makefile.am
@ -40,6 +40,7 @@ endif

 libmystrings_a_SOURCES = $(ASRCS) $(CSRCS)
 noinst_PROGRAMS = conf_to_src
+CLEANFILES = str_test uctypedump test_decimal
 # Default charset definitions
 EXTRA_DIST =		ctype-big5.c ctype-cp932.c ctype-czech.c ctype-eucjpms.c ctype-euc_kr.c ctype-win1250ch.c \
 			ctype-gb2312.c ctype-gbk.c ctype-sjis.c ctype-utf8.c \
@ -77,6 +78,9 @@ FLAGS=$(DEFS) $(INCLUDES) $(CPPFLAGS) $(CFLAGS) @NOINST_LDFLAGS@
 str_test: str_test.c $(pkglib_LIBRARIES)
 	$(LINK) $(FLAGS) -DMAIN $(INCLUDES) $(srcdir)/str_test.c $(LDADD) $(pkglib_LIBRARIES)

+uctypedump: uctypedump.c
+	$(LINK) $(INCLUDES) $(srcdir)/uctypedump.c
+
 test_decimal$(EXEEXT): decimal.c $(pkglib_LIBRARIES)
 	$(CP) $(srcdir)/decimal.c ./test_decimal.c
 	$(LINK) $(FLAGS) -DMAIN  ./test_decimal.c $(LDADD) $(pkglib_LIBRARIES)
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@ -6356,6 +6356,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
  my_numcells_8bit,
  my_mb_wc_big5,	/* mb_wc       */
  my_wc_mb_big5,	/* wc_mb       */
+  my_mb_ctype_mb,
  my_caseup_str_mb,
  my_casedn_str_mb,
  my_caseup_mb,
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@ -503,6 +503,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
    my_numcells_8bit,
    my_mb_wc_bin,
    my_wc_mb_bin,
+    my_mb_ctype_8bit,
    my_case_str_bin,
    my_case_str_bin,
    my_case_bin,
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@ -5478,6 +5478,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
  my_numcells_cp932,
  my_mb_wc_cp932,	/* mb_wc */
  my_wc_mb_cp932,	/* wc_mb */
+  my_mb_ctype_mb,
  my_caseup_str_8bit,
  my_casedn_str_8bit,
  my_caseup_8bit,
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@ -8697,6 +8697,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
  my_numcells_8bit,
  my_mb_wc_euc_kr,	/* mb_wc   */
  my_wc_mb_euc_kr,	/* wc_mb   */
+  my_mb_ctype_mb,
  my_caseup_str_mb,
  my_casedn_str_mb,
  my_caseup_mb,
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@ -8663,6 +8663,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
    my_numcells_eucjp,
    my_mb_wc_euc_jp,	/* mb_wc       */
    my_wc_mb_euc_jp,	/* wc_mb       */
+    my_mb_ctype_mb,
    my_caseup_str_mb,
    my_casedn_str_mb,
    my_caseup_mb,
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@ -5748,6 +5748,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
  my_numcells_8bit,
  my_mb_wc_gb2312,	/* mb_wc      */
  my_wc_mb_gb2312,	/* wc_mb      */
+  my_mb_ctype_mb,
  my_caseup_str_mb,
  my_casedn_str_mb,
  my_caseup_mb,
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@ -10001,6 +10001,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
  my_numcells_8bit,
  my_mb_wc_gbk,
  my_wc_mb_gbk,
+  my_mb_ctype_mb,
  my_caseup_str_mb,
  my_casedn_str_mb,
  my_caseup_mb,
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@ -397,6 +397,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
    my_numcells_8bit,
    my_mb_wc_latin1,
    my_wc_mb_latin1,
+    my_mb_ctype_8bit,
    my_caseup_str_8bit,
    my_casedn_str_8bit,
    my_caseup_8bit,
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@ -914,6 +914,22 @@ uint my_numcells_mb(CHARSET_INFO *cs, const char *b, const char *e)
 }


+/*
+  Detect the ctype of the next character in a multibyte string.
+
+  SYNOPSIS
+    my_mb_ctype_mb()
+    cs      Character set; its mb_wc handler decodes the character
+    ctype   OUT: ctype value of the decoded character taken from
+            my_uni_ctype[], or 0 when no value can be looked up
+    s       Start of the input bytes
+    e       End of the input bytes
+
+  RETURN
+    The value returned by cs->cset->mb_wc() for this input, unchanged.
+*/
+int my_mb_ctype_mb(CHARSET_INFO *cs, int *ctype,
+                   const unsigned char *s, const unsigned char *e)
+{
+  my_wc_t wc;
+  int res= cs->cset->mb_wc(cs, &wc, s, e);
+  /*
+    wc is only examined when mb_wc() succeeded (res > 0).
+    my_uni_ctype[] has 256 entries, so it only covers code points
+    up to 0xFFFF; anything above that would index past its end.
+  */
+  if (res <= 0 || wc > 0xFFFF)
+    *ctype= 0;
+  else
+  {
+    const MY_UNI_CTYPE *page= &my_uni_ctype[wc >> 8];
+    /* A NULL table means every character of the block shares pctype */
+    *ctype= page->ctype ? page->ctype[wc & 0xFF] : page->pctype;
+  }
+  return res;
+}
+
+
+
 MY_COLLATION_HANDLER my_collation_mb_bin_handler =
 {
    NULL,		/* init */
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@ -1354,6 +1354,19 @@ longlong my_strtoll10_8bit(CHARSET_INFO *cs __attribute__((unused)),
 }


+/*
+  Detect the ctype of the next character in an 8-bit string.
+
+  SYNOPSIS
+    my_mb_ctype_8bit()
+    cs      Character set whose ctype table is consulted
+    ctype   OUT: ctype value of the byte at s, or 0 on empty input
+    s       Start of the input bytes
+    e       End of the input bytes
+
+  RETURN
+    1                 one byte was classified
+    MY_CS_TOOFEW(0)   there is no input left (s >= e)
+*/
+int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype,
+                   const unsigned char *s, const unsigned char *e)
+{
+  if (s >= e)
+  {
+    *ctype= 0;
+    return MY_CS_TOOFEW(0);
+  }
+  /*
+    NOTE(review): the +1 assumes the charset ctype table carries one
+    leading entry, so that character c lives at ctype[c + 1] - the same
+    way the my_isalpha()-style macros in m_ctype.h index it. Indexing
+    with plain *s would return the value of the preceding character.
+    Confirm against m_ctype.h.
+  */
+  *ctype= cs->ctype[*s + 1];
+  return 1;
+}
+
+
 /*
  Check if a constant can be propagated

@ -1420,6 +1433,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
    my_numcells_8bit,
    my_mb_wc_8bit,
    my_wc_mb_8bit,
+    my_mb_ctype_8bit,
    my_caseup_str_8bit,
    my_casedn_str_8bit,
    my_caseup_8bit,
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@ -4649,6 +4649,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
  my_numcells_sjis,
  my_mb_wc_sjis,	/* mb_wc */
  my_wc_mb_sjis,	/* wc_mb */
+  my_mb_ctype_mb,
  my_caseup_str_8bit,
  my_casedn_str_8bit,
  my_caseup_8bit,
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@ -877,6 +877,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
    my_numcells_8bit,
    my_mb_wc_tis620,	/* mb_wc     */
    my_wc_mb_tis620,	/* wc_mb     */
+    my_mb_ctype_8bit,
    my_caseup_str_8bit,
    my_casedn_str_8bit,
    my_caseup_8bit,
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@ -1615,6 +1615,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
    my_numcells_mb,
    my_ucs2_uni,	/* mb_wc        */
    my_uni_ucs2,	/* wc_mb        */
+    my_mb_ctype_mb,
    my_caseup_str_ucs2,
    my_casedn_str_ucs2,
    my_caseup_ucs2,
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@ -8531,6 +8531,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
    my_numcells_eucjp,
    my_mb_wc_euc_jp,	/* mb_wc       */
    my_wc_mb_euc_jp,	/* wc_mb       */
+    my_mb_ctype_mb,
    my_caseup_str_mb,
    my_casedn_str_mb,
    my_caseup_mb,
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@ -41,6 +41,8 @@

 #ifdef HAVE_UNIDATA

+#include "my_uctype.h"
+
 static MY_UNICASE_INFO plane00[]={
  {0x0000,0x0000,0x0000},  {0x0001,0x0001,0x0001},
  {0x0002,0x0002,0x0002},  {0x0003,0x0003,0x0003},
@ -2534,6 +2536,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
    my_numcells_mb,
    my_utf8_uni,
    my_uni_utf8,
+    my_mb_ctype_mb,
    my_caseup_str_utf8,
    my_casedn_str_utf8,
    my_caseup_utf8,
@ -4027,6 +4030,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
    my_numcells_mb,
    my_mb_wc_filename,
    my_wc_mb_filename,
+    my_mb_ctype_mb,
    my_caseup_str_utf8,
    my_casedn_str_utf8,
    my_caseup_utf8,
--- a/strings/uctypedump.c
+++ b/strings/uctypedump.c
@ -0,0 +1,226 @@
+/*
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+*/
+#include <my_global.h>
+#include <m_string.h>
+#include <m_ctype.h>
+#include "m_ctype.h"
+
+
+/* One entry of the category-name to ctype-flags lookup table */
+typedef struct my_ctype_name_st
+{
+  const char *name;  /* Two-letter category name; NULL ends the table */
+  int val;           /* Combination of _MY_* flags for this category  */
+} MY_CTYPE_NAME_ST;
+
+
+/*
+  Category names, as found in the third ';'-separated field of an
+  input line, and the _MY_* flag combination each one is dumped as.
+  ctypestr2num() scans this table; the {NULL, 0} entry terminates it.
+*/
+static MY_CTYPE_NAME_ST my_ctype_name[]=
+{
+  {"Lu", _MY_U},                /* Letter, Uppercase          */
+  {"Ll", _MY_L},                /* Letter, Lowercase          */
+  {"Lt", _MY_U},                /* Letter, Titlecase          */
+  {"Lm", _MY_L},                /* Letter, Modifier           */
+  {"Lo", _MY_L},                /* Letter, other              */
+  
+  {"Nd", _MY_NMR},              /* Number, Decimal Digit      */
+  {"Nl", _MY_NMR|_MY_U|_MY_L},  /* Number, Letter             */
+  {"No", _MY_NMR|_MY_PNT},      /* Number, Other              */
+  
+  {"Mn", _MY_L|_MY_PNT},        /* Mark, Nonspacing           */
+  {"Mc", _MY_L|_MY_PNT},        /* Mark, Spacing Combining    */
+  {"Me", _MY_L|_MY_PNT},        /* Mark, Enclosing            */
+  
+  {"Pc", _MY_PNT},              /* Punctuation, Connector     */
+  {"Pd", _MY_PNT},              /* Punctuation, Dash          */
+  {"Ps", _MY_PNT},              /* Punctuation, Open          */
+  {"Pe", _MY_PNT},              /* Punctuation, Close         */
+  {"Pi", _MY_PNT},              /* Punctuation, Initial quote */
+  {"Pf", _MY_PNT},              /* Punctuation, Final quote   */
+  {"Po", _MY_PNT},              /* Punctuation, Other         */
+  
+  {"Sm", _MY_PNT},              /* Symbol, Math               */
+  {"Sc", _MY_PNT},              /* Symbol, Currency           */
+  {"Sk", _MY_PNT},              /* Symbol, Modifier           */
+  {"So", _MY_PNT},              /* Symbol, Other              */
+  
+  {"Zs", _MY_SPC},              /* Separator, Space           */
+  {"Zl", _MY_SPC},              /* Separator, Line            */
+  {"Zp", _MY_SPC},              /* Separator, Paragraph       */
+  
+  {"Cc", _MY_CTR},              /* Other, Control             */
+  {"Cf", _MY_CTR},              /* Other, Format              */
+  {"Cs", _MY_CTR},              /* Other, Surrogate           */
+  {"Co", _MY_CTR},              /* Other, Private Use         */
+  {"Cn", _MY_CTR},              /* Other, Not Assigned        */
+  {NULL, 0}
+};
+
+
+/*
+  Translate a category name into its ctype flags.
+
+  Walks my_ctype_name[] up to the NULL terminator and returns the value
+  of the first entry whose name matches the first two characters of
+  "tok", ignoring letter case. Returns 0 when nothing matches.
+*/
+static int
+ctypestr2num(const char *tok)
+{
+  MY_CTYPE_NAME_ST *entry= my_ctype_name;
+
+  while (entry->name)
+  {
+    if (strncasecmp(entry->name, tok, 2) == 0)
+      return entry->val;
+    entry++;
+  }
+  return 0;
+}
+
+
+/*
+  Dump Unicode ctype data as C source on stdout.
+
+  Input is read line by line from the file named by the first argument,
+  or from stdin when no argument is given. Every line is split on ';':
+  field 0 is parsed as a hexadecimal code point and field 2 is mapped
+  to ctype flags by ctypestr2num(). Only code points up to 0xFFFF are
+  stored. ASCII digits/letters and the CJK and Hangul ranges are then
+  patched by hand, and the result is printed as one 256-byte array per
+  non-uniform block plus the my_uni_ctype[] index.
+
+  RETURN
+    0   success
+    exits with status 1 if the input file cannot be opened
+*/
+int main(int ac, char ** av)
+{
+  char str[1024];
+  unsigned char ctypea[64*1024];
+  size_t i;
+  size_t plane;
+  MY_UNI_CTYPE uctype[256];
+  FILE *f= stdin;
+  /* Only look at av[1] when the argument is really there */
+  const char *fname= (ac > 1 && av[1]) ? av[1] : NULL;
+
+  if (fname && !(f= fopen(fname, "r")))
+  {
+    fprintf(stderr, "Can't open file %s\n", fname);
+    exit(1);
+  }
+  bzero(&ctypea,sizeof(ctypea));
+  bzero(&uctype, sizeof(uctype));
+
+  printf("/*\n");
+  printf("  Unicode ctype data\n");
+  printf("  Generated from %s\n", fname ? fname : "stdin");
+  printf("*/\n");
+
+  while (fgets(str, sizeof(str), f))
+  {
+    size_t n= 0, code= 0;
+    int have_code= 0;
+    int ctype= 0;
+    char *s;
+
+    for (s= str; s; n++)
+    {
+      /* A token is never longer than the line, so it fits into tok */
+      char tok[1024];
+      char *e= strchr(s, ';');
+      size_t len= e ? (size_t) (e - s) : strlen(s);
+
+      strncpy(tok, s, len);
+      tok[len]= 0;
+
+      if (n == 0)
+      {
+        char *end;
+        long val= strtol(tok, &end, 16);
+        /*
+          Accept the line only if at least one hex digit was parsed.
+          Otherwise a blank or malformed line would be stored as code
+          point 0 and wipe the entry of U+0000.
+        */
+        if (end != tok && val >= 0)
+        {
+          code= (size_t) val;
+          have_code= 1;
+        }
+      }
+      else if (n == 2)
+        ctype= ctypestr2num(tok);
+
+      s= e ? e + 1 : NULL;
+    }
+    if (have_code && code <= 0xFFFF)
+      ctypea[code]= (unsigned char) ctype;
+  }
+
+  if (f != stdin)
+    fclose(f);
+
+  /* Fill digits */
+  for (i= '0'; i <= '9'; i++)
+    ctypea[i]= _MY_NMR;
+
+  /*
+    NOTE(review): _MY_X is taken to be the hexadecimal-digit flag from
+    m_ctype.h, so only a-f and A-F get it. The previous loops set it on
+    all of a-z and A-Z - confirm. Whether '0'..'9' should carry _MY_X
+    as well is left unchanged here and should be verified.
+  */
+  for (i= 'a'; i <= 'f'; i++)
+    ctypea[i]|= _MY_X;
+  for (i= 'A'; i <= 'F'; i++)
+    ctypea[i]|= _MY_X;
+
+
+  /* Fill ideographs  */
+
+  /* CJK Ideographs Extension A (U+3400 - U+4DB5) */
+  for (i= 0x3400; i <= 0x4DB5; i++)
+    ctypea[i]= _MY_L | _MY_U;
+
+  /* CJK Ideographs (U+4E00 - U+9FA5) */
+  for (i= 0x4E00; i <= 0x9FA5; i++)
+    ctypea[i]= _MY_L | _MY_U;
+
+  /* Hangul Syllables (U+AC00 - U+D7A3)  */
+  for (i= 0xAC00; i <= 0xD7A3; i++)
+    ctypea[i]= _MY_L | _MY_U;
+
+
+  /* Calc plane parameters */
+  for (plane= 0; plane < 256; plane++)
+  {
+    size_t character;
+    uctype[plane].ctype= ctypea + plane * 256;
+
+    uctype[plane].pctype= uctype[plane].ctype[0];
+    for (character= 1; character < 256; character++)
+    {
+      if (uctype[plane].ctype[character] != uctype[plane].pctype)
+      {
+        uctype[plane].pctype= 0; /* Mixed plane */
+        break;
+      }
+    }
+    if (character == 256)	/* All the same, no need to dump whole plane */
+      uctype[plane].ctype= NULL;
+  }
+
+  /* Dump mixed planes */
+  for (plane= 0; plane < 256; plane++)
+  {
+    if (uctype[plane].ctype)
+    {
+      size_t charnum;
+
+      /* plane is a size_t; %02X needs an unsigned int argument */
+      printf("static unsigned char uctype_page%02X[256]=\n{\n",
+             (unsigned int) plane);
+
+      for (charnum= 0; charnum < 256; charnum++)
+      {
+        printf(" %2d%s", uctype[plane].ctype[charnum],
+               charnum < 255 ? "," : "");
+        /* 16 values per output line */
+        if ((charnum & 15) == 15)
+          printf("\n");
+      }
+      printf("};\n\n");
+    }
+  }
+
+
+  /* Dump plane index */
+  printf("MY_UNI_CTYPE my_uni_ctype[256]={\n");
+  for (plane= 0; plane < 256; plane++)
+  {
+    char plane_name[128]= "NULL";
+    if (uctype[plane].ctype)
+      sprintf(plane_name, "uctype_page%02X", (unsigned int) plane);
+    printf("\t{%d,%s}%s\n", uctype[plane].pctype, plane_name,
+           plane < 255 ? "," : "");
+  }
+  printf("};\n");
+
+  return 0;
+}