mirror of
https://github.com/MariaDB/server.git
synced 2025-01-15 19:42:28 +01:00
6075f12c65
This is a non-functional change. It changes how case folding data and weight data (for simple Unicode collations) are stored: - Removing data types MY_UNICASE_CHARACTER, MY_UNICASE_INFO - Using data types MY_CASEFOLD_CHARACTER, MY_CASEFOLD_INFO instead. This patch changes simple Unicode collations in a way similar to how MDEV-30695 previously changed Asian collations. No new MTR tests are needed. The underlying code is thoroughly covered by a number of ctype_*_ws.test and ctype_*_casefold.test files, which were added recently as a preparation for this change. Old and new Unicode data layout ------------------------------- Case folding data is now stored in separate tables consisting of MY_CASEFOLD_CHARACTER elements with two members: typedef struct casefold_info_char_t { uint32 toupper; uint32 tolower; } MY_CASEFOLD_CHARACTER; while weight data (for simple non-UCA collations xxx_general_ci and xxx_general_mysql500_ci) is stored in separate arrays of uint16 elements. Before this change case folding data and simple weight data were stored together, in tables of the following elements with three members: typedef struct unicase_info_char_st { uint32 toupper; uint32 tolower; uint32 sort; /* weights for simple collations */ } MY_UNICASE_CHARACTER; This data format was redundant, because weights (the "sort" member) were needed only for these two simple Unicode collations: - xxx_general_ci - xxx_general_mysql500_ci Adding case folding information for Unicode-14.0.0 using the old format would waste memory without purpose. Detailed changes ---------------- - Changing the underlying data types as described above - Including unidata-dump.c into the sources. This program was earlier used to dump UnicodeData.txt (e.g. https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt) into MySQL / MariaDB source files. It was originally written in 2002, but has not yet been distributed together with MySQL / MariaDB sources. 
- Removing the old format Unicode data earlier dumped from UnicodeData.txt (versions 3.0.0 and 5.2.0) from ctype-utf8.c. Adding Unicode data in the new format into separate header files, to make the code easier to maintain: - ctype-unicode300-casefold.h - ctype-unicode300-casefold-tr.h - ctype-unicode300-general_ci.h - ctype-unicode300-general_mysql500_ci.h - ctype-unicode520-casefold.h - Adding a new file ctype-unidata.c as an aggregator for the header files listed above.
1110 lines
28 KiB
C
1110 lines
28 KiB
C
/*
  License notice, itself formatted as a C comment.
  main() prints it to stdout before the generated tables.
*/
const char COPYING[]= "\
|
|
/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.\n\
|
|
Copyright (c) 2009, 2023, MariaDB Corporation.\n\
|
|
\n\
|
|
This program is free software; you can redistribute it and/or modify\n\
|
|
it under the terms of the GNU General Public License as published by\n\
|
|
the Free Software Foundation; version 2 of the License.\n\
|
|
\n\
|
|
This program is distributed in the hope that it will be useful,\n\
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
|
|
GNU General Public License for more details.\n\
|
|
\n\
|
|
You should have received a copy of the GNU General Public License\n\
|
|
along with this program; if not, write to the Free Software\n\
|
|
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA\n\
|
|
*/\n";
|
|
|
|
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
|
|
|
|
/* The highest Unicode code point, and the 256-character page it lives on */
#define MAX_UNI_CHAR 0x10FFFF
#define MAX_UNI_PAGE (MAX_UNI_CHAR >> 8)

/*
  Expand a string literal into two function arguments:
  the literal itself and its length without the trailing '\0'.
*/
#define STRING_WITH_LEN(X) (X), ((size_t) (sizeof(X) - 1))

/* Holds a code point (and values derived from it, e.g. a page number) */
typedef unsigned int my_wchar_t;
|
|
|
|
/*
  Character type flags, as in m_ctype.h.
  Each flag occupies its own bit, so flags can be OR-ed together.
*/
#define _MY_U   01      /* Upper case */
#define _MY_L   02      /* Lower case */
#define _MY_NMR 04      /* Numeral (digit) */
#define _MY_SPC 010     /* Spacing character */
#define _MY_PNT 020     /* Punctuation */
#define _MY_CTR 040     /* Control character */
#define _MY_B   0100    /* Blank */
#define _MY_X   0200    /* heXadecimal digit */

/* The highest single flag */
#define CT_MAX    _MY_X

/*
  Flag combinations assigned to whole ranges of characters.
  The expansions are parenthesized, so that the macros stay
  correct when used inside a larger expression,
  e.g. (CT_CJK & _MY_NMR).
*/
#define CT_CJK    (_MY_L | _MY_U)
#define CT_HANGUL (_MY_L | _MY_U)
#define CT_NONE   0
|
|
|
|
|
|
/*
  Decomposition types.
  Every value except DT_UNKNOWN corresponds to one "<tag>"
  recognized by get_decomposition_type().
*/
typedef enum
{
  DT_UNKNOWN=  0,    /* The tag was not recognized */
  DT_FONT=     1,    /* <font>     */
  DT_NOBREAK=  2,    /* <noBreak>  */
  DT_INITIAL=  3,    /* <initial>  */
  DT_MEDIAL=   4,    /* <medial>   */
  DT_FINAL=    5,    /* <final>    */
  DT_ISOLATED= 6,    /* <isolated> */
  DT_CIRCLE=   7,    /* <circle>   */
  DT_SUPER=    8,    /* <super>    */
  DT_SUB=      9,    /* <sub>      */
  DT_VERTICAL= 10,   /* <vertical> */
  DT_WIDE=     11,   /* <wide>     */
  DT_NARROW=   12,   /* <narrow>   */
  DT_SMALL=    13,   /* <small>    */
  DT_SQUARE=   14,   /* <square>   */
  DT_FRACTION= 15,   /* <fraction> */
  DT_COMPAT=   16    /* <compat>   */
} decomposition_type_t;
|
|
|
|
|
|
/* What has to be printed for a 256-character page */
typedef enum
{
  /* Nothing: the page gets a NULL entry in the page index */
  PAGE_DATA_USELESS= 0,
  /* The page carries data that has to be printed */
  PAGE_DATA_IMPORTANT= 1,
  /*
    The page has no important data, but it lies at or below
    dummy_pages_codepoint_max, so it is printed anyway
    (marked with the comment "This page is dummy").
  */
  PAGE_DATA_DUMMY= 2
} page_data_type_t;
|
|
|
|
|
|
typedef struct
|
|
{
|
|
page_data_type_t page_tab;
|
|
int page_overridden;
|
|
int page_ctype;
|
|
} PAGE_STAT;
|
|
|
|
|
|
/*
  Output mode: which members of every character are printed
  and how the generated C arrays are named and laid out.
*/
typedef struct
{
  const char *mode_name;              /* The name accepted by --mode=       */

  /* Which character members are printed, in this order */
  int print_ctype;
  int print_toupper;
  int print_tolower;
  int print_noaccent;
  int print_noaccent_tolower;
  int print_noaccent_toupper;

  /* Layout of the generated code */
  int print_curly_brackets_in_items;  /* Wrap every character into {}       */
  int print_curly_brackets_in_index;  /* Wrap every index entry into {}     */
  int chars_per_line;                 /* Characters per line in a page      */
  int single_array;                   /* Do not close pages, skip the index */
  int pages_per_line_in_index;        /* Entries per line in the index      */
  int const_data;                     /* Declare the arrays as "const"      */

  /* Types and names of the generated arrays */
  const char *page_data_type_name;    /* Element type of a page array       */
  const char *page_name;              /* Name prefix of a page array        */
  const char *page_name_derived;      /* Name prefix used in the index for
                                         non-overridden pages, or NULL      */
  const char *index_data_type_name;   /* Element type of the page index     */
  const char *index_name;             /* Name of the page index array       */
} UNIDATA_OPT_MODE;
|
|
|
|
|
|
typedef struct
|
|
{
|
|
my_wchar_t max_char;
|
|
my_wchar_t dummy_pages_codepoint_max;
|
|
const char *filename;
|
|
UNIDATA_OPT_MODE mode;
|
|
} UNIDATA_OPT;
|
|
|
|
|
|
my_wchar_t npages_by_opt(const UNIDATA_OPT *opt)
|
|
{
|
|
return (opt->max_char + 1) / 256;
|
|
}
|
|
|
|
|
|
/* Properties assigned to a two-letter general category name */
typedef struct my_ctype_name_st
{
  const char *name;       /* The category name, e.g. "Lu"                  */
  int val;                /* _MY_xxx flags for characters of the category  */
  int to_be_decomposed;   /* Non-zero: load decompositions of such chars   */
} MY_CTYPE_NAME_ST;
|
|
|
|
|
|
static MY_CTYPE_NAME_ST my_ctype_name[]=
|
|
{
|
|
{"Lu", _MY_U, 1}, /* Letter, Uppercase */
|
|
{"Ll", _MY_L, 1}, /* Letter, Lowercase */
|
|
{"Lt", _MY_U, 1}, /* Letter, Titlecase */
|
|
{"Lo", _MY_L, 1}, /* Letter, other */
|
|
{"Lm", _MY_L, 0}, /* Letter, Modifier */
|
|
|
|
{"Nd", _MY_NMR, 0}, /* Number, Decimal Digit */
|
|
{"Nl", _MY_NMR|_MY_U|_MY_L, 0}, /* Number, Letter */
|
|
{"No", _MY_NMR|_MY_PNT, 0}, /* Number, Other */
|
|
|
|
{"Mn", _MY_L|_MY_PNT, 0}, /* Mark, Nonspacing */
|
|
{"Mc", _MY_L|_MY_PNT, 1}, /* Mark, Spacing Combining */
|
|
{"Me", _MY_L|_MY_PNT, 0}, /* Mark, Enclosing */
|
|
|
|
{"Pc", _MY_PNT, 0}, /* Punctuation, Connector */
|
|
{"Pd", _MY_PNT, 0}, /* Punctuation, Dash */
|
|
{"Ps", _MY_PNT, 0}, /* Punctuation, Open */
|
|
{"Pe", _MY_PNT, 0}, /* Punctuation, Close */
|
|
{"Pi", _MY_PNT, 0}, /* Punctuation, Initial quote */
|
|
{"Pf", _MY_PNT, 0}, /* Punctuation, Final quote */
|
|
{"Po", _MY_PNT, 0}, /* Punctuation, Other */
|
|
|
|
{"Sm", _MY_PNT, 0}, /* Symbol, Math */
|
|
{"Sc", _MY_PNT, 0}, /* Symbol, Currency */
|
|
{"Sk", _MY_PNT, 0}, /* Symbol, Modifier */
|
|
{"So", _MY_PNT, 0}, /* Symbol, Other */
|
|
|
|
{"Zs", _MY_SPC, 0}, /* Separator, Space */
|
|
{"Zl", _MY_SPC, 0}, /* Separator, Line */
|
|
{"Zp", _MY_SPC, 0}, /* Separator, Paragraph */
|
|
|
|
{"Cc", _MY_CTR, 0}, /* Other, Control */
|
|
{"Cf", _MY_CTR, 0}, /* Other, Format */
|
|
{"Cs", _MY_CTR, 0}, /* Other, Surrogate */
|
|
{"Co", _MY_CTR, 0}, /* Other, Private Use */
|
|
{"Cn", _MY_CTR, 0}, /* Other, Not Assigned */
|
|
{NULL, 0, 0}
|
|
};
|
|
|
|
|
|
static const MY_CTYPE_NAME_ST *
|
|
ctype_name_st_find(my_wchar_t codepoint, const char *tok)
|
|
{
|
|
MY_CTYPE_NAME_ST *p;
|
|
for (p= my_ctype_name; p->name; p++)
|
|
{
|
|
if (!strncasecmp(p->name, tok, 2))
|
|
return p;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
static int
|
|
ctype_name_st_to_num(const MY_CTYPE_NAME_ST *st, my_wchar_t codepoint)
|
|
{
|
|
if ((codepoint >= 'a' && codepoint <= 'z') ||
|
|
(codepoint >= 'A' && codepoint <= 'Z'))
|
|
return st->val | _MY_X;
|
|
return st->val;
|
|
}
|
|
|
|
|
|
static UNIDATA_OPT opt_caseinfo=
|
|
{
|
|
0x10FFFF, /* max_char */
|
|
0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */
|
|
NULL, /*filename*/
|
|
{
|
|
"caseinfo", /* mode name */
|
|
0, /* print_ctype */
|
|
1, /* print_toupper */
|
|
1, /* print_tolower */
|
|
0, /* print_noaccent */
|
|
0, /* print_noaccent_tolower */
|
|
1, /* print_noaccent_toupper */
|
|
1, /* print_curly_brackets_in_items */
|
|
0, /* print_curly_brackets_in_index */
|
|
2, /* chars_per_line */
|
|
0, /* single_array */
|
|
8, /* pages_per_line_in_index */
|
|
0, /* const_data */
|
|
"MY_UNICASE_CHARACTER", /* page_data_type_name */
|
|
"plane", /* page_name */
|
|
NULL, /* page_name_derived */
|
|
"MY_UNICASE_CHARACTER *", /* index_data_type_name */
|
|
"my_unicase_default_pages" /* index_name */
|
|
}
|
|
};
|
|
|
|
|
|
static UNIDATA_OPT opt_casefold=
|
|
{
|
|
0x10FFFF, /* max_char */
|
|
0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */
|
|
NULL, /*filename*/
|
|
{
|
|
"casefold", /* mode name */
|
|
0, /* print_ctype */
|
|
1, /* print_toupper */
|
|
1, /* print_tolower */
|
|
0, /* print_noaccent */
|
|
0, /* print_noaccent_tolower */
|
|
0, /* print_noaccent_toupper */
|
|
1, /* print_curly_brackets_in_items */
|
|
0, /* print_curly_brackets_in_index */
|
|
2, /* chars_per_line */
|
|
0, /* single_array */
|
|
8, /* pages_per_line_in_index */
|
|
1, /* const_data */
|
|
"MY_CASEFOLD_CHARACTER" , /* page_data_type_name */
|
|
"page", /* page_name */
|
|
NULL, /* page_name_derived */
|
|
"MY_CASEFOLD_CHARACTER *", /* index_data_type_name */
|
|
"my_casefold_default_pages" /* index_name */
|
|
}
|
|
};
|
|
|
|
|
|
static UNIDATA_OPT opt_casefold_tr=
|
|
{
|
|
0x10FFFF, /* max_char */
|
|
0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */
|
|
NULL, /*filename*/
|
|
{
|
|
"casefold-tr", /* mode name */
|
|
0, /* print_ctype */
|
|
1, /* print_toupper */
|
|
1, /* print_tolower */
|
|
0, /* print_noaccent */
|
|
0, /* print_noaccent_tolower */
|
|
0, /* print_noaccent_toupper */
|
|
1, /* print_curly_brackets_in_items */
|
|
0, /* print_curly_brackets_in_index */
|
|
2, /* chars_per_line */
|
|
0, /* single_array */
|
|
8, /* pages_per_line_in_index */
|
|
1, /* const_data */
|
|
"MY_CASEFOLD_CHARACTER" , /* page_data_type_name */
|
|
"page_tr", /* page_name */
|
|
"page", /* page_name_derived */
|
|
"MY_CASEFOLD_CHARACTER *", /* index_data_type_name */
|
|
"my_casefold_tr_pages" /* index_name */
|
|
}
|
|
};
|
|
|
|
|
|
static UNIDATA_OPT opt_weight_general_ci=
|
|
{
|
|
0xFFFF, /* max_char */
|
|
0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */
|
|
NULL, /*filename*/
|
|
{
|
|
"weight_general_ci", /* mode name */
|
|
0, /* print_ctype */
|
|
0, /* print_toupper */
|
|
0, /* print_tolower */
|
|
0, /* print_noaccent */
|
|
0, /* print_noaccent_tolower */
|
|
1, /* print_noaccent_toupper */
|
|
0, /* print_curly_brackets_in_items */
|
|
0, /* print_curly_brackets_in_index */
|
|
8, /* chars_per_line */
|
|
0, /* single_array */
|
|
2, /* pages_per_line_in_index */
|
|
1, /* const_data */
|
|
"uint16", /* page_data_type_name */
|
|
"weight_general_ci_page", /* page_name */
|
|
NULL, /* page_name_derived */
|
|
"uint16 *", /* index_data_type_name */
|
|
"weight_general_ci_index" /* index_name */
|
|
}
|
|
};
|
|
|
|
|
|
static UNIDATA_OPT opt_weight_general_mysql500_ci=
|
|
{
|
|
0xFFFF, /* max_char */
|
|
0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */
|
|
NULL, /*filename*/
|
|
{
|
|
"weight_general_mysql500_ci", /* mode name */
|
|
0, /* print_ctype */
|
|
0, /* print_toupper */
|
|
0, /* print_tolower */
|
|
0, /* print_noaccent */
|
|
0, /* print_noaccent_tolower */
|
|
1, /* print_noaccent_toupper */
|
|
0, /* print_curly_brackets_in_items */
|
|
0, /* print_curly_brackets_in_index */
|
|
8, /* chars_per_line */
|
|
0, /* single_array */
|
|
2, /* pages_per_line_in_index */
|
|
1, /* const_data */
|
|
"uint16", /* page_data_type_name */
|
|
"weight_general_mysql500_ci_page", /* page_name */
|
|
"weight_general_ci_page", /* page_name_derived */
|
|
"uint16 *", /* index_data_type_name */
|
|
"weight_general_mysql500_ci_index" /* index_name */
|
|
}
|
|
};
|
|
|
|
|
|
static UNIDATA_OPT opt_ctype=
|
|
{
|
|
0x10FFFF, /* max_char */
|
|
0x7FF, /* dummy_pages_codepoint_max == utf8 mb2 range */
|
|
NULL, /*filename*/
|
|
{
|
|
"ctype", /* mode name */
|
|
1, /* print_ctype */
|
|
0, /* print_toupper */
|
|
0, /* print_tolower */
|
|
0, /* print_noaccent */
|
|
0, /* print_noaccent_tolower */
|
|
0, /* print_noaccent_toupper */
|
|
0, /* print_curly_brackets_in_items */
|
|
1, /* print_curly_brackets_in_index */
|
|
16, /* chars_per_line */
|
|
0, /* single_array */
|
|
1, /* pages_per_line_in_index */
|
|
1, /* const_data */
|
|
"unsigned char", /* page_data_type_name */
|
|
"uctype_page", /* page_name */
|
|
NULL, /* page_name_derived */
|
|
"MY_UNI_CTYPE", /* index_data_type_name */
|
|
"my_uni_ctype" /* index_name */
|
|
}
|
|
};
|
|
|
|
|
|
/*
  Handle the value of the --mode= option.

  Only the "mode" part of the matching preset is copied into "to";
  max_char, dummy_pages_codepoint_max and filename of "to"
  are left untouched.

  @param to              The options to modify
  @param name_and_value  The whole command line argument (for the error text)
  @param value           The mode name
  @return                0 on success, 1 if the mode name is unknown
*/
int opt_set_mode(UNIDATA_OPT *to, const char *name_and_value, const char *value)
{
  static const struct
  {
    const char *name;
    const UNIDATA_OPT *preset;
  } known_modes[]=
  {
    {"casefold",                   &opt_casefold},
    {"casefold-tr",                &opt_casefold_tr},
    {"caseinfo",                   &opt_caseinfo},
    {"weight_general_ci",          &opt_weight_general_ci},
    {"weight_general_mysql500_ci", &opt_weight_general_mysql500_ci},
    {"ctype",                      &opt_ctype}
  };
  size_t i;

  for (i= 0; i < sizeof(known_modes) / sizeof(known_modes[0]); i++)
  {
    if (strcmp(value, known_modes[i].name) == 0)
    {
      to->mode= known_modes[i].preset->mode;
      return 0;
    }
  }

  fprintf(stderr, "Bad option: %s\n", name_and_value);
  return 1;
}
|
|
|
|
|
|
static decomposition_type_t
|
|
get_decomposition_type(const char *str)
|
|
{
|
|
if (!strcmp(str, "<font>")) return DT_FONT;
|
|
if (!strcmp(str, "<noBreak>")) return DT_NOBREAK;
|
|
if (!strcmp(str, "<initial>")) return DT_INITIAL;
|
|
if (!strcmp(str, "<medial>")) return DT_MEDIAL;
|
|
if (!strcmp(str, "<final>")) return DT_FINAL;
|
|
if (!strcmp(str, "<isolated>")) return DT_ISOLATED;
|
|
if (!strcmp(str, "<circle>")) return DT_CIRCLE;
|
|
if (!strcmp(str, "<super>")) return DT_SUPER;
|
|
if (!strcmp(str, "<sub>")) return DT_SUB;
|
|
if (!strcmp(str, "<vertical>")) return DT_VERTICAL;
|
|
if (!strcmp(str, "<wide>")) return DT_WIDE;
|
|
if (!strcmp(str, "<narrow>")) return DT_NARROW;
|
|
if (!strcmp(str, "<small>")) return DT_SMALL;
|
|
if (!strcmp(str, "<square>")) return DT_SQUARE;
|
|
if (!strcmp(str, "<fraction>")) return DT_FRACTION;
|
|
if (!strcmp(str, "<compat>")) return DT_COMPAT;
|
|
return DT_UNKNOWN;
|
|
}
|
|
|
|
|
|
/* Capacity of UNIDATA_CHAR::decomp, including the terminating 0 */
#define MAX_DECOMP 20

/* Everything the program knows about one code point */
typedef struct
{
  int ctype;              /* OR-ed _MY_xxx flags                            */
  int toupper;            /* Upper case mapping                             */
  int tolower;            /* Lower case mapping                             */
  int noaccent;           /* The code point with the accent stripped        */
  int noaccent_tolower;   /* tolower of the accent-stripped code point      */
  int noaccent_toupper;   /* toupper of the accent-stripped code point      */
  int decomp_type;        /* A decomposition_type_t value                   */
  int decomp[MAX_DECOMP]; /* Decomposition code points, 0-terminated        */
  int to_be_decomposed;   /* Non-zero: the decomposition is to be loaded    */
} UNIDATA_CHAR;
|
|
|
|
|
|
|
|
/************* Initialization functions *********/
|
|
|
|
|
|
static int
|
|
strip_accent(UNIDATA_CHAR *code, int i)
|
|
{
|
|
if (code[i].decomp[0] &&
|
|
code[i].decomp[1] >= 0x0300 &&
|
|
code[i].decomp[1] <= 0x036F &&
|
|
code[i].decomp[2] == 0)
|
|
return strip_accent(code, code[i].decomp[0]);
|
|
return i;
|
|
}
|
|
|
|
|
|
static void
|
|
set_noaccent(const UNIDATA_OPT *opt, UNIDATA_CHAR *code)
|
|
{
|
|
my_wchar_t i;
|
|
for (i= 0; i <= opt->max_char; i++)
|
|
{
|
|
code[i].noaccent= strip_accent(code, i);
|
|
}
|
|
}
|
|
|
|
|
|
static void
|
|
set_noaccent_tolower(const UNIDATA_OPT *opt, UNIDATA_CHAR *code)
|
|
{
|
|
my_wchar_t i;
|
|
for (i= 0; i <= opt->max_char; i++)
|
|
{
|
|
code[i].noaccent_tolower= code[code[i].noaccent].tolower;
|
|
}
|
|
}
|
|
|
|
|
|
static void
|
|
set_noaccent_toupper(const UNIDATA_OPT *opt, UNIDATA_CHAR *code)
|
|
{
|
|
my_wchar_t i;
|
|
for (i= 0; i <= opt->max_char; i++)
|
|
{
|
|
code[i].noaccent_toupper= code[code[i].noaccent].toupper;
|
|
}
|
|
}
|
|
|
|
|
|
static void
|
|
set_default_case_folding(const UNIDATA_OPT *opt, UNIDATA_CHAR *code)
|
|
{
|
|
my_wchar_t i;
|
|
for (i= 0; i <= opt->max_char; i++)
|
|
{
|
|
code[i].tolower= i;
|
|
code[i].toupper= i;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
Fill ideographs
|
|
*/
|
|
|
|
static void
|
|
fill_cjk(UNIDATA_CHAR *code)
|
|
{
|
|
size_t i;
|
|
/* CJK Ideographs Extension A (U+3400 - U+4DB5) */
|
|
for(i=0x3400;i<=0x4DB5;i++)
|
|
{
|
|
code[i].tolower=i;
|
|
code[i].ctype= CT_CJK;
|
|
}
|
|
/* CJK Ideographs (U+4E00 - U+9FA5) */
|
|
for(i=0x4E00;i<=0x9FA5;i++)
|
|
{
|
|
code[i].tolower=i;
|
|
code[i].ctype= CT_CJK;
|
|
}
|
|
/* Hangul Syllables (U+AC00 - U+D7A3) */
|
|
for(i=0xAC00;i<=0xD7A3;i++)
|
|
{
|
|
code[i].tolower=i;
|
|
code[i].ctype= CT_HANGUL;
|
|
}
|
|
}
|
|
|
|
|
|
/************* Loading functions ***************/
|
|
|
|
|
|
static void handle_general_category(const UNIDATA_OPT *opt,
|
|
UNIDATA_CHAR *ch,
|
|
const char *tok,
|
|
my_wchar_t codepoint)
|
|
{
|
|
/*
|
|
TODO: check if ctype is set correctly.
|
|
A difference can break fulltext indexes.
|
|
*/
|
|
|
|
const MY_CTYPE_NAME_ST *ct= ctype_name_st_find(
|
|
(my_wchar_t) codepoint, tok);
|
|
if (ct)
|
|
{
|
|
ch->ctype|= ctype_name_st_to_num(
|
|
ct,
|
|
(my_wchar_t) codepoint);
|
|
ch->to_be_decomposed= ct->to_be_decomposed;
|
|
}
|
|
}
|
|
|
|
|
|
/*
  Handle the "decomposition" field of a character.

  The field is a space separated list. An item starting with '<'
  is a decomposition type tag, any other item is a hexadecimal
  code point. The code points are stored into ch->decomp[],
  which is kept 0-terminated.

  Nothing is done if ch->to_be_decomposed is not set.

  @param ch   The character to store the decomposition into
  @param tok  The field value; it is modified by strtok_r()
  @param str  The line being parsed (for error messages only)
  @return     0 on success,
              1 on an unknown decomposition type or on too many parts
*/
int handle_decomposition(UNIDATA_CHAR *ch, char *tok, const char *str)
{
  char *saveptr, *part;
  size_t num= 0;

  if (!ch->to_be_decomposed)
    return 0; /* Decompose only letters */

  for (part= strtok_r(tok, " ", &saveptr);
       part;
       part= strtok_r(NULL, " ", &saveptr))
  {
    if (part[0] == '<')
    {
      if ((ch->decomp_type= get_decomposition_type(part)) == DT_UNKNOWN)
      {
        fprintf(stderr, "Unknown decomposition type:\n%s\n", str);
        return 1;
      }
      continue;
    }

    /* One slot must stay available for the terminating 0 */
    if (num + 1 >= MAX_DECOMP)
    {
      fprintf(stderr, "Too many decomposition parts:\n%s\n", str);
      return 1;
    }
    ch->decomp[num]= (int) strtol(part, NULL, 16);
    ch->decomp[num + 1]= 0;
    num++;
  }
  return 0;
}
|
|
|
|
|
|
static int
|
|
parse_unidata_line(const UNIDATA_OPT *opt, char *str, UNIDATA_CHAR *unidata)
|
|
{
|
|
unsigned long codepoint= 0;
|
|
int fieldno= 0;
|
|
char *s;
|
|
|
|
for (s= str; *s; fieldno++)
|
|
{
|
|
char *tok= s, *e;
|
|
|
|
if ((e= strchr(s,';')))
|
|
{
|
|
*e= '\0';
|
|
s= e + 1;
|
|
}
|
|
else
|
|
{
|
|
s+= strlen(s);
|
|
}
|
|
|
|
switch (fieldno)
|
|
{
|
|
case 0: /* Code point */
|
|
codepoint= strtoul(tok, NULL, 16);
|
|
if (codepoint > opt->max_char)
|
|
return 1;
|
|
break;
|
|
case 1: /* name */
|
|
break;
|
|
case 2: /* general category */
|
|
handle_general_category(opt, &unidata[codepoint],
|
|
tok, (my_wchar_t) codepoint);
|
|
break;
|
|
case 3: /* Canonical combining class */
|
|
break;
|
|
case 4: /* BiDi class */
|
|
break;
|
|
case 5: /* Decomposition type */
|
|
if (tok[0] && handle_decomposition(&unidata[codepoint], tok, str))
|
|
return -1;
|
|
break;
|
|
case 6: /* Numeric_Type, Numeric Value */
|
|
break;
|
|
case 7: /* Numeric_Type, Numeric Value */
|
|
break;
|
|
case 8: /* Numeric_Type, Numeric Value */
|
|
break;
|
|
case 9: /* BiDi mirrored */
|
|
break;
|
|
case 10: /* Unicode_1_Name */
|
|
break;
|
|
case 11: /* ISO_Comment */
|
|
break;
|
|
case 12: /*Simple_Uppercase_Mapping*/
|
|
if (tok[0])
|
|
unidata[codepoint].toupper= strtol(tok, NULL, 16);
|
|
break;
|
|
case 13: /*Simple_Lowercase_Mapping*/
|
|
if (tok[0])
|
|
unidata[codepoint].tolower= strtol(tok, NULL, 16);
|
|
break;
|
|
case 14: /* Simple_Titlecase_Mapping */
|
|
break;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
static int
|
|
load_unidata_file(const UNIDATA_OPT *opt, FILE *f, UNIDATA_CHAR *unidata)
|
|
{
|
|
char str[1024];
|
|
|
|
while (fgets(str, sizeof(str), f))
|
|
{
|
|
if (parse_unidata_line(opt, str, unidata) < 0)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
static int
|
|
load_unidata(const UNIDATA_OPT *opt, UNIDATA_CHAR *unidata)
|
|
{
|
|
FILE *f;
|
|
int rc;
|
|
if (!(f= fopen(opt->filename, "r")))
|
|
{
|
|
fprintf(stderr, "Could not open file '%s'\n", opt->filename);
|
|
return 1;
|
|
}
|
|
rc= load_unidata_file(opt, f, unidata);
|
|
fclose(f);
|
|
return rc;
|
|
}
|
|
|
|
/************** Printing functions ********************/
|
|
|
|
static void
|
|
print_one_char(const UNIDATA_OPT *opt, UNIDATA_CHAR *data, int code)
|
|
{
|
|
UNIDATA_CHAR *ch= &data[code];
|
|
const char *comma= "";
|
|
|
|
if (opt->mode.print_curly_brackets_in_items)
|
|
printf("{");
|
|
|
|
if (opt->mode.print_ctype)
|
|
{
|
|
printf("%s", comma);
|
|
printf("%3d", ch->ctype);
|
|
comma= ",";
|
|
}
|
|
|
|
if (opt->mode.print_toupper)
|
|
{
|
|
printf("%s", comma);
|
|
printf("0x%04X", ch->toupper);
|
|
comma= ",";
|
|
}
|
|
|
|
if (opt->mode.print_tolower)
|
|
{
|
|
printf("%s", comma);
|
|
printf("0x%04X", ch->tolower);
|
|
comma= ",";
|
|
}
|
|
|
|
if (opt->mode.print_noaccent)
|
|
{
|
|
printf("%s", comma);
|
|
printf("0x%04X", ch->noaccent);
|
|
comma= ",";
|
|
}
|
|
|
|
if (opt->mode.print_noaccent_tolower)
|
|
{
|
|
printf("%s", comma);
|
|
printf("0x%04X", ch->noaccent_tolower);
|
|
comma= ",";
|
|
}
|
|
|
|
if (opt->mode.print_noaccent_toupper)
|
|
{
|
|
printf("%s", comma);
|
|
printf("0x%04X", ch->noaccent_toupper);
|
|
comma= ",";
|
|
}
|
|
|
|
if (opt->mode.print_curly_brackets_in_items)
|
|
printf("}");
|
|
|
|
if (opt->mode.single_array ||
|
|
(code & 0xFF) != 0xFF) /* Don't print comma for the last char in a page */
|
|
printf(",");
|
|
else
|
|
printf(" ");
|
|
}
|
|
|
|
|
|
static void
|
|
print_one_page(const UNIDATA_OPT *opt, UNIDATA_CHAR *data,
|
|
my_wchar_t pageno, const PAGE_STAT *pstat)
|
|
{
|
|
my_wchar_t charnum;
|
|
|
|
if (!opt->mode.single_array || pageno == 0)
|
|
{
|
|
printf("%s%s%s %s%02X[256]={%s\n",
|
|
pageno == 0 ? "" : "static ",
|
|
opt->mode.const_data ? "const " : "",
|
|
opt->mode.page_data_type_name, opt->mode.page_name,
|
|
(unsigned int) pageno,
|
|
pstat[pageno].page_tab == PAGE_DATA_DUMMY ?
|
|
" /* This page is dummy */" : "");
|
|
}
|
|
|
|
for (charnum= 0; charnum < 256; charnum++)
|
|
{
|
|
my_wchar_t codepoint= (pageno << 8) + charnum;
|
|
my_wchar_t rem= charnum % opt->mode.chars_per_line;
|
|
if (!rem)
|
|
printf(" ");
|
|
print_one_char(opt, data, codepoint);
|
|
if (rem + 1 == opt->mode.chars_per_line)
|
|
{
|
|
printf(" /* %04X */", (codepoint + 1) - opt->mode.chars_per_line);
|
|
printf("\n");
|
|
}
|
|
}
|
|
if (!opt->mode.single_array)
|
|
printf("};\n\n");
|
|
}
|
|
|
|
|
|
static const char *page_name_in_index(const UNIDATA_OPT *opt,
|
|
const PAGE_STAT *pstat,
|
|
my_wchar_t pageno)
|
|
{
|
|
if (!opt->mode.page_name_derived)
|
|
return opt->mode.page_name;
|
|
|
|
return pstat[pageno].page_overridden ?
|
|
opt->mode.page_name :
|
|
opt->mode.page_name_derived;
|
|
}
|
|
|
|
|
|
static void print_page_index(const UNIDATA_OPT *opt,
|
|
const PAGE_STAT *pstat)
|
|
{
|
|
my_wchar_t page;
|
|
my_wchar_t npages= npages_by_opt(opt);
|
|
int printing_ctype= !strcmp(opt->mode.index_data_type_name, "MY_UNI_CTYPE");
|
|
|
|
printf("%s%s %s[%d]={\n",
|
|
opt->mode.const_data ? "const " : "",
|
|
opt->mode.index_data_type_name, opt->mode.index_name,
|
|
(unsigned int) npages);
|
|
|
|
for (page= 0; page < npages; page++)
|
|
{
|
|
my_wchar_t rem= page % opt->mode.pages_per_line_in_index;
|
|
if (!rem)
|
|
printf(" ");
|
|
if (opt->mode.print_curly_brackets_in_index)
|
|
printf("{");
|
|
if (printing_ctype)
|
|
printf("%d,", pstat[page].page_ctype);
|
|
|
|
if (pstat[page].page_tab)
|
|
printf("%s%02X", page_name_in_index(opt, pstat, page), page);
|
|
else
|
|
printf("NULL");
|
|
|
|
if (opt->mode.print_curly_brackets_in_index)
|
|
printf("}");
|
|
|
|
if (page + 1 < npages)
|
|
printf(",");
|
|
|
|
if (rem + 1 == opt->mode.pages_per_line_in_index)
|
|
printf("\n");
|
|
else
|
|
printf(" ");
|
|
}
|
|
printf("};\n");
|
|
}
|
|
|
|
|
|
/*
  Print all generated data: the pages, followed by the page index
  (no index is printed in the single array mode).

  In a mode with a derived page name only overridden pages are printed.
  Among the remaining candidates a page is printed if it has data,
  or unconditionally in the single array mode.
*/
static void print(UNIDATA_OPT *opt, UNIDATA_CHAR *unidata, const PAGE_STAT *pstat)
{
  const my_wchar_t npages= npages_by_opt(opt);
  my_wchar_t page;

  /* Print all pages */
  for (page= 0; page < npages; page++)
  {
    const int taken_from_base_mode= opt->mode.page_name_derived &&
                                    !pstat[page].page_overridden;
    if (!taken_from_base_mode &&
        (opt->mode.single_array || pstat[page].page_tab))
      print_one_page(opt, unidata, page, pstat);
  }

  /* Print index */
  if (!opt->mode.single_array)
    print_page_index(opt, pstat);
}
|
|
|
|
|
|
/*
  Print a C comment telling which command line generated the output:
  one argument per line, with a trailing backslash on every line
  except the last one.
*/
void print_command_line_options(int ac, char **av)
{
  int argno= 0;

  printf("/*\n");
  printf(" Generated by:\n");
  while (argno < ac)
  {
    const char *indent= argno > 0 ? " " : "";
    const char *continuation= argno + 1 < ac ? " \\" : "";
    printf(" %s%s%s\n", indent, av[argno], continuation);
    argno++;
  }
  printf("\n");
  printf("*/\n");
}
|
|
|
|
|
|
static void calc_page_parameters(const UNIDATA_OPT *opt, const UNIDATA_CHAR *code,
|
|
PAGE_STAT *pstat)
|
|
{
|
|
my_wchar_t npages= npages_by_opt(opt);
|
|
my_wchar_t page;
|
|
for(page= 0; page < npages; page++)
|
|
{
|
|
int ntype[CT_MAX + 1], t;
|
|
int character, done=0;
|
|
|
|
memset(ntype,0,sizeof(ntype));
|
|
for(character= 0;character < 256; character++)
|
|
{
|
|
size_t cod= (page << 8) + character;
|
|
const UNIDATA_CHAR *ch= &code[cod];
|
|
ntype[ch->ctype]++;
|
|
|
|
if((ch->tolower != cod ||
|
|
ch->toupper != cod ||
|
|
ch->noaccent != cod ||
|
|
ch->noaccent_toupper != cod) &&
|
|
(opt->mode.print_tolower ||
|
|
opt->mode.print_toupper ||
|
|
opt->mode.print_noaccent ||
|
|
opt->mode.print_noaccent_toupper))
|
|
{
|
|
pstat[page].page_tab= PAGE_DATA_IMPORTANT;
|
|
}
|
|
}
|
|
|
|
if (opt->mode.print_ctype)
|
|
{
|
|
for (t= 0; t <= CT_MAX; t++)
|
|
{
|
|
if(ntype[t]==256)
|
|
{
|
|
/* All ctypes are the same */
|
|
pstat[page].page_ctype= t;
|
|
done=1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
done= 1; /* Don't need ctype */
|
|
}
|
|
|
|
if(!done)
|
|
{
|
|
/* Mixed page, lets create the table */
|
|
pstat[page].page_ctype= CT_NONE;
|
|
pstat[page].page_tab= PAGE_DATA_IMPORTANT;
|
|
}
|
|
if (!pstat[page].page_tab &&
|
|
page <= (opt->dummy_pages_codepoint_max >> 8))
|
|
pstat[page].page_tab= PAGE_DATA_DUMMY;
|
|
}
|
|
}
|
|
|
|
|
|
static UNIDATA_CHAR code[MAX_UNI_CHAR + 1];
|
|
static PAGE_STAT pstat[MAX_UNI_PAGE + 1];
|
|
|
|
|
|
/*
  Print the usage line to stderr.
  @return Always 1, so that callers can "return usage(ac, av);"
*/
int usage(int ac, char **av)
{
  const char *program= av[0];

  (void) ac;
  fprintf(stderr, "Usage: %s filename\n", program);
  return 1;
}
|
|
|
|
|
|
/*
  Check if a command line argument starts with the given option name.

  @param option  The command line argument, e.g. "--mode=ctype"
  @param name    The option name including '=', e.g. "--mode="
  @param length  The length of the name
  @return        The option value (the part of the argument after the name),
                 or NULL if the argument does not start with the name
*/
const char *one_opt(const char *option, const char *name, size_t length)
{
  return strncmp(option, name, length) == 0 ? option + length : NULL;
}
|
|
|
|
|
|
/*
  Store the value of a boolean option.

  @param to              Where to store the value
  @param name_and_value  The whole command line argument (for the error text)
  @param value           The option value, "0" or "1"
  @return                0 on success,
                         1 on any other value (*to is not changed then)
*/
int get_option_bool(int *to, const char *name_and_value, const char *value)
{
  const int is_true= strcmp(value, "1") == 0;

  if (!is_true && strcmp(value, "0") != 0)
  {
    fprintf(stderr, "Bad option: %s\n", name_and_value);
    return 1;
  }
  *to= is_true;
  return 0;
}
|
|
|
|
|
|
/*
  Store the value of an option whose value is a code point.

  The value is a decimal number, or a hexadecimal number with
  the "0x" prefix. A value which is not a number, or which has
  trailing characters after the number, is rejected rather than
  silently treated as 0 or truncated.

  @param to              Where to store the code point
  @param name_and_value  The whole command line argument (for the error text)
  @param value           The option value
  @return                0 on success,
                         1 if the value is not a number or is greater
                           than MAX_UNI_CHAR (*to is not changed then)
*/
int get_option_codepoint(my_wchar_t *to, const char *name_and_value, const char *value)
{
  const int is_hex= value[0] == '0' && value[1] == 'x';
  const char *digits= is_hex ? value + 2 : value;
  char *end;
  unsigned long codepoint= strtoul(digits, &end, is_hex ? 16 : 10);

  if (end == digits || *end != '\0')
  {
    fprintf(stderr, "Bad option: %s\n", name_and_value);
    return 1;
  }
  /* This also covers an overflow: strtoul() returns ULONG_MAX then */
  if (codepoint > MAX_UNI_CHAR)
  {
    fprintf(stderr, "Too large --max-char: %s\n", name_and_value);
    return 1;
  }
  *to= (my_wchar_t) codepoint;
  return 0;
}
|
|
|
|
|
|
/*
  Parse the command line.

  Options (arguments of the form --name=value) come first and must be
  followed by exactly one non-option argument: the input file name,
  which is stored into opt->filename.

  @return 0 on success,
          1 on a bad or unknown option, or on a wrong number of arguments
*/
int process_param(UNIDATA_OPT *opt, int ac, char **av)
{
  int i;

  if (ac < 2)
    return usage(ac, av);

  for (i= 1; i < ac; i++)
  {
    const char *arg= av[i];
    const char *val;

    if ((val= one_opt(arg, STRING_WITH_LEN("--mode="))))
    {
      if (opt_set_mode(opt, arg, val))
        return 1;
      continue;
    }
    if ((val= one_opt(arg, STRING_WITH_LEN("--max-char="))))
    {
      if (get_option_codepoint(&opt->max_char, arg, val))
        return 1;
      continue;
    }
    if ((val= one_opt(arg, STRING_WITH_LEN("--print-toupper="))))
    {
      if (get_option_bool(&opt->mode.print_toupper, arg, val))
        return 1;
      continue;
    }
    if ((val= one_opt(arg, STRING_WITH_LEN("--print-tolower="))))
    {
      if (get_option_bool(&opt->mode.print_tolower, arg, val))
        return 1;
      continue;
    }
    if ((val= one_opt(arg, STRING_WITH_LEN("--print-noaccent-toupper="))))
    {
      if (get_option_bool(&opt->mode.print_noaccent_toupper, arg, val))
        return 1;
      continue;
    }
    if ((val= one_opt(arg, STRING_WITH_LEN("--page-name="))))
    {
      opt->mode.page_name= val;
      continue;
    }
    if ((val= one_opt(arg, STRING_WITH_LEN("--page-name-derived="))))
    {
      opt->mode.page_name_derived= val;
      continue;
    }
    if ((val= one_opt(arg, STRING_WITH_LEN("--index-name="))))
    {
      opt->mode.index_name= val;
      continue;
    }

    /* Not a known option: either an unknown one, or the file name */
    if (arg[0] == '-' && arg[1] == '-')
    {
      fprintf(stderr, "Unknown option: %s\n", arg);
      return 1;
    }
    break;
  }

  /* Exactly one argument, the file name, must be left */
  if (i + 1 != ac)
    return usage(ac, av);

  opt->filename= av[i];
  return 0;
}
|
|
|
|
|
|
int main(int ac,char **av)
|
|
{
|
|
UNIDATA_OPT opt= opt_caseinfo;
|
|
|
|
if (process_param(&opt, ac, av))
|
|
return 1;
|
|
|
|
memset(code,0,sizeof(code));
|
|
memset(pstat,0,sizeof(pstat));
|
|
|
|
set_default_case_folding(&opt, code);
|
|
|
|
fill_cjk(code);
|
|
|
|
if (load_unidata(&opt, code))
|
|
return 1;
|
|
|
|
set_noaccent(&opt, code);
|
|
set_noaccent_tolower(&opt, code);
|
|
set_noaccent_toupper(&opt, code);
|
|
|
|
/*
|
|
Bug#8385: utf8_general_ci treats cyrillic letters I and SHORT I as the same
|
|
Because of decomposition applied, noaccent_toupper for the following letters:
|
|
U+0419 CYRILLIC CAPITAL LETTER SHORT I
|
|
U+0439 CYRILLIC SMALL LETTER SHORT I
|
|
was set to:
|
|
U+418 CYRILLIC CAPITAL LETTER I
|
|
Reset it back to U+0419.
|
|
*/
|
|
code[0x0419].noaccent_toupper= 0x0419;
|
|
code[0x0439].noaccent_toupper= 0x0419;
|
|
|
|
/*
|
|
Bug#27877 incorrect german order in utf8_general_ci
|
|
*/
|
|
if (strcmp(opt.mode.mode_name, "weight_general_mysql500_ci"))
|
|
{
|
|
code[0x00DF].noaccent_toupper= code['s'].noaccent_toupper;
|
|
}
|
|
else
|
|
pstat[0].page_overridden= 1;
|
|
|
|
if (!strcmp(opt.mode.mode_name, "casefold-tr"))
|
|
{
|
|
code[0x49].tolower= 0x0131;
|
|
code[0x69].toupper= 0x0130;
|
|
pstat[0].page_overridden= 1;
|
|
}
|
|
|
|
calc_page_parameters(&opt, code, pstat);
|
|
|
|
printf("%s\n", COPYING);
|
|
print_command_line_options(ac, av);
|
|
print(&opt, code, pstat);
|
|
|
|
return 0;
|
|
}
|