mirror of
https://github.com/MariaDB/server.git
synced 2025-01-15 19:42:28 +01:00
MDEV-27009 Add UCA-14.0.0 collations - dump logical positions and contractions
- uca-dump can now dump logical positions as a set of "#define" directives. Logical positions for 4.0.0 and for 5.2.0 were calculated and put into ctype-uca.c manually. That required some effort: analyzing allkeys.txt with the help of grep and sort. Now, when defining a new MY_UCA_INFO, it's possible to use the new #define's instead of calculating logical positions manually. Logical positions also print their weights in DUCET format as a comment before the define: /* [.0000.0021.0002] [.0000.0117.0002] */ The comment helps to know weight ranges on various levels, which makes it easier to debug the code. - uca-dump can now dump built-in DUCET contractions. - Adding a new uca-dump command line option --no-contractions. This is useful if one needs to re-dump 4.0.0 and 5.2.0 data in a ctype-uca.c compatible way. - Adding a new uca-dump command line option --case-first=upper|lower. This can be useful if one needs to dump with UPPER case first by default. It's not yet decided if we'll use --case-first=upper during the dump though. - Moving parts of the code from the main loop into separate functions parse_chars() and parse_weights(). This allows reusing the code between single characters and contractions. - Adding a new function my_ducet_weight_normalize(), to cut zero weights from a weight string, e.g. [AAAA][0000][BBBB] -> [AAAA][BBBB]. This helps to reuse the code between single characters and contractions. - Weight normalization is now done before printing, in separate loops inside my_ducet_normalize(). Before this change, normalization was done during printing, inside the printing loop. This helps to separate steps: loading -> normalizing -> printing. This makes it easier to follow what's going on, e.g. while debugging. - Fixing ctype-uca.c to handle built-in contractions of any length. Previously we had only built-in contractions in utf8mb4_thai_520_w2, which contains only 2-character contractions.
This commit is contained in:
parent
0736c03d56
commit
d7ffb7c3dd
2 changed files with 567 additions and 104 deletions
|
@ -33720,16 +33720,11 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
|
|||
for (i= 0; i != src->contractions.nitems; i++)
|
||||
{
|
||||
MY_CONTRACTION *item= &src->contractions.item[i];
|
||||
/*
|
||||
TODO: calculate length from item->ch.
|
||||
Generally contractions can consist of more than 2 characters.
|
||||
*/
|
||||
uint length= 2;
|
||||
uint length= my_wstrnlen(item->ch, array_elements(item->ch));
|
||||
uint16 *weights= my_uca_init_one_contraction(&dst->contractions,
|
||||
item->ch, length,
|
||||
item->with_context);
|
||||
memcpy(weights, item->weight, length * sizeof(uint16));
|
||||
weights[length]= 0;
|
||||
memcpy(weights, item->weight, sizeof(item->weight));
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
|
|
@ -24,17 +24,6 @@
|
|||
#include "m_ctype.h"
|
||||
#include "ctype-uca.h"
|
||||
|
||||
#if 0
|
||||
#define MY_UCA_NPAGES 1024
|
||||
#define MY_UCA_NCHARS 64
|
||||
#define MY_UCA_CMASK 63
|
||||
#define MY_UCA_PSHIFT 6
|
||||
#else
|
||||
#define MY_UCA_NPAGES 4352 /* 0x110000 characters / 0x100 chars per page */
|
||||
#define MY_UCA_NCHARS 256
|
||||
#define MY_UCA_CMASK 255
|
||||
#define MY_UCA_PSHIFT 8
|
||||
#endif
|
||||
|
||||
#define MAX_ALLOWED_CODE 0x10FFFF
|
||||
|
||||
|
@ -44,6 +33,8 @@ typedef struct opt_st
|
|||
const char *name_prefix; /* Name that goes into all array names */
|
||||
const char *filename; /* The filename or "-" for stdin */
|
||||
uint levels; /* The number of levels to dump */
|
||||
my_bool no_contractions;
|
||||
my_bool case_first_upper;
|
||||
} OPT;
|
||||
|
||||
|
||||
|
@ -51,7 +42,9 @@ static OPT defaults=
|
|||
{
|
||||
"uca",
|
||||
"-",
|
||||
3
|
||||
3,
|
||||
FALSE,
|
||||
FALSE
|
||||
};
|
||||
|
||||
|
||||
|
@ -65,22 +58,344 @@ typedef struct my_ducet_weight_st
|
|||
typedef struct my_ducet_single_char_t
|
||||
{
|
||||
MY_DUCET_WEIGHT weight;
|
||||
my_bool is_variable;
|
||||
} MY_DUCET_SINGLE_CHAR;
|
||||
|
||||
|
||||
typedef struct my_ducet_char_t
|
||||
{
|
||||
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
|
||||
size_t length;
|
||||
} MY_DUCET_CHARS;
|
||||
|
||||
|
||||
typedef struct my_ducet_contraction_t
|
||||
{
|
||||
MY_DUCET_CHARS chars;
|
||||
MY_DUCET_WEIGHT weights;
|
||||
} MY_DUCET_CONTRACTION;
|
||||
|
||||
|
||||
typedef struct my_ducet_contraction_list_st
|
||||
{
|
||||
size_t nitems;
|
||||
MY_DUCET_CONTRACTION item[4*1024];
|
||||
} MY_DUCET_CONTRACTION_LIST;
|
||||
|
||||
|
||||
typedef struct my_ducet_logical_posision_st
|
||||
{
|
||||
my_wc_t first;
|
||||
my_wc_t last;
|
||||
} MY_DUCET_LOGICAL_POSITION;
|
||||
|
||||
|
||||
typedef struct my_ducet_logical_positions_st
|
||||
{
|
||||
MY_DUCET_LOGICAL_POSITION tertiary_ignorable;
|
||||
MY_DUCET_LOGICAL_POSITION secondary_ignorable;
|
||||
MY_DUCET_LOGICAL_POSITION primary_ignorable;
|
||||
MY_DUCET_LOGICAL_POSITION variable;
|
||||
MY_DUCET_LOGICAL_POSITION non_ignorable;
|
||||
} MY_DUCET_LOGICAL_POSITIONS;
|
||||
|
||||
|
||||
typedef struct my_allkeys_st
|
||||
{
|
||||
MY_DUCET_SINGLE_CHAR single_chars[MAX_ALLOWED_CODE+1];
|
||||
MY_DUCET_CONTRACTION_LIST contractions;
|
||||
MY_DUCET_LOGICAL_POSITIONS logical_positions;
|
||||
uint version;
|
||||
char version_str[32];
|
||||
} MY_DUCET;
|
||||
|
||||
|
||||
static int
|
||||
my_ducet_weight_cmp_on_level(const MY_DUCET_WEIGHT *a,
|
||||
const MY_DUCET_WEIGHT *b,
|
||||
uint level)
|
||||
{
|
||||
uint i;
|
||||
for (i= 0; i < array_elements(a->weight[level]); i++)
|
||||
{
|
||||
int diff= (int) a->weight[level][i] - (int) b->weight[level][i];
|
||||
if (diff)
|
||||
return diff;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
my_ducet_weight_cmp(const MY_DUCET_WEIGHT *a,
|
||||
const MY_DUCET_WEIGHT *b)
|
||||
{
|
||||
uint level;
|
||||
for (level= 0; level < array_elements(a->weight); level++)
|
||||
{
|
||||
int diff= my_ducet_weight_cmp_on_level(a, b, level);
|
||||
if (diff)
|
||||
return diff;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
"3.11 Logical Reset Positions" says:
|
||||
|
||||
The CLDR table (based on UCA) has the following overall structure for weights,
|
||||
going from low to high.
|
||||
|
||||
*/
|
||||
|
||||
static my_bool
|
||||
my_ducet_weight_is_tertiary_ignorable(const MY_DUCET_WEIGHT *w)
|
||||
{
|
||||
return w->weight[0][0] == 0 &&
|
||||
w->weight[1][0] == 0 &&
|
||||
w->weight[2][0] == 0;
|
||||
}
|
||||
|
||||
|
||||
static my_bool
|
||||
my_ducet_weight_is_secondary_ignorable(const MY_DUCET_WEIGHT *w)
|
||||
{
|
||||
return w->weight[0][0] == 0 &&
|
||||
w->weight[1][0] == 0 &&
|
||||
w->weight[2][0] != 0;
|
||||
}
|
||||
|
||||
|
||||
static my_bool
|
||||
my_ducet_weight_is_primary_ignorable(const MY_DUCET_WEIGHT *w)
|
||||
{
|
||||
return w->weight[0][0] == 0 &&
|
||||
w->weight[1][0] != 0 &&
|
||||
w->weight[2][0] != 0;
|
||||
}
|
||||
|
||||
|
||||
static my_bool
|
||||
my_ducet_weight_is_primary_non_ignorable(const MY_DUCET_WEIGHT *w)
|
||||
{
|
||||
return w->weight[0][0] > 0 && w->weight[0][0] < 0xFB00;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
if alternate = non-ignorable
|
||||
p != ignore,
|
||||
if alternate = shifted
|
||||
p, s, t = ignore
|
||||
*/
|
||||
static my_bool
|
||||
my_ducet_single_char_is_variable(const MY_DUCET_SINGLE_CHAR *ch)
|
||||
{
|
||||
return ch->is_variable &&
|
||||
my_ducet_weight_is_primary_non_ignorable(&ch->weight);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_logical_position_set(MY_DUCET_LOGICAL_POSITION *dst, my_wc_t wc)
|
||||
{
|
||||
dst->first= dst->last= wc;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_logical_position_update(MY_DUCET_LOGICAL_POSITION *dst,
|
||||
const MY_DUCET *ducet, my_wc_t current)
|
||||
{
|
||||
const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars;
|
||||
int diff;
|
||||
if (current >= array_elements(ducet->single_chars))
|
||||
return;
|
||||
if ((diff= my_ducet_weight_cmp(&chars[current].weight,
|
||||
&chars[dst->first].weight)) < 0 ||
|
||||
(diff == 0 && current < dst->first))
|
||||
dst->first= current;
|
||||
if ((diff= my_ducet_weight_cmp(&chars[current].weight,
|
||||
&chars[dst->last].weight)) > 0 ||
|
||||
(diff == 0 && current > dst->last))
|
||||
dst->last= current;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_logical_positions_init(MY_DUCET_LOGICAL_POSITIONS *dst,
|
||||
const MY_DUCET *ducet)
|
||||
{
|
||||
uint i;
|
||||
const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars;
|
||||
|
||||
for (i= 0; i < array_elements(ducet->single_chars); i++)
|
||||
{
|
||||
if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight))
|
||||
{
|
||||
my_ducet_logical_position_set(&dst->tertiary_ignorable, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (i= 0; i < array_elements(ducet->single_chars); i++)
|
||||
{
|
||||
if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight))
|
||||
{
|
||||
my_ducet_logical_position_set(&dst->secondary_ignorable, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (i= 0; i < array_elements(ducet->single_chars); i++)
|
||||
{
|
||||
if (my_ducet_weight_is_primary_ignorable(&chars[i].weight))
|
||||
{
|
||||
my_ducet_logical_position_set(&dst->primary_ignorable, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (i= 0; i < array_elements(ducet->single_chars); i++)
|
||||
{
|
||||
if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight))
|
||||
{
|
||||
my_ducet_logical_position_set(&dst->non_ignorable, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (i= 0; i < array_elements(ducet->single_chars); i++)
|
||||
{
|
||||
if (my_ducet_single_char_is_variable(&chars[i]))
|
||||
{
|
||||
my_ducet_logical_position_set(&dst->variable, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (i= 1; i < array_elements(ducet->single_chars); i++)
|
||||
{
|
||||
if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight))
|
||||
my_ducet_logical_position_update(&dst->non_ignorable, ducet, i);
|
||||
if (my_ducet_weight_is_primary_ignorable(&chars[i].weight))
|
||||
my_ducet_logical_position_update(&dst->primary_ignorable, ducet, i);
|
||||
if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight))
|
||||
my_ducet_logical_position_update(&dst->secondary_ignorable, ducet, i);
|
||||
if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight))
|
||||
my_ducet_logical_position_update(&dst->tertiary_ignorable, ducet, i);
|
||||
if (my_ducet_single_char_is_variable(&chars[i]))
|
||||
my_ducet_logical_position_update(&dst->variable, ducet, i);
|
||||
}
|
||||
|
||||
/*
|
||||
DUCET as of Unicode-14.0.0 does not have any secondary ignorable
|
||||
characters, i.e. with weights [p=0000, s=0000, t!=0000]
|
||||
For compatibility with 4.0.0 and 5.2.0 data in ctype-uca.c,
|
||||
let copy tertiary_ignorable to secondary_ignorable.
|
||||
It gives effectively the same result with just leaving
|
||||
secondary_ignorable as {first=U+0000,last=U+0000}.
|
||||
*/
|
||||
if (dst->secondary_ignorable.first == 0 && dst->secondary_ignorable.last == 0)
|
||||
{
|
||||
dst->secondary_ignorable.first= dst->tertiary_ignorable.first;
|
||||
dst->secondary_ignorable.last= dst->tertiary_ignorable.last;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_weight_normalize_on_level(MY_DUCET_WEIGHT *weights,
|
||||
uint level,
|
||||
const OPT *options)
|
||||
{
|
||||
uint dst, src;
|
||||
for (src= 0, dst= 0; src < array_elements(weights->weight[level]); src++)
|
||||
{
|
||||
if (weights->weight[level][src] != 0)
|
||||
weights->weight[level][dst++]= weights->weight[level][src];
|
||||
}
|
||||
for ( ; dst < array_elements(weights->weight[level]) ; dst++)
|
||||
weights->weight[level][dst]= 0;
|
||||
if (options->case_first_upper && level == 2)
|
||||
{
|
||||
/*
|
||||
Invert weights for secondary level to
|
||||
sort upper case letters before their
|
||||
lower case counter part.
|
||||
*/
|
||||
for (dst= 0; dst < array_elements(weights->weight[level]); dst++)
|
||||
{
|
||||
if (weights->weight[level][dst] == 0)
|
||||
break;
|
||||
if (weights->weight[level][dst] >= 0x20)
|
||||
{
|
||||
fprintf(stderr, "Secondary level is too large: %04X\n",
|
||||
(int) weights->weight[level][dst]);
|
||||
}
|
||||
weights->weight[level][dst]= (uint16) (0x20 - weights->weight[level][dst]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_weight_normalize(MY_DUCET_WEIGHT *weights, const OPT *options)
|
||||
{
|
||||
uint i;
|
||||
for (i= 0; i < array_elements(weights->weight); i++)
|
||||
my_ducet_weight_normalize_on_level(weights, i, options);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_normalize(MY_DUCET *ducet, const OPT *options)
|
||||
{
|
||||
uint i;
|
||||
for (i= 0; i < array_elements(ducet->single_chars); i++)
|
||||
my_ducet_weight_normalize(&ducet->single_chars[i].weight, options);
|
||||
for (i= 0; i < array_elements(ducet->contractions.item); i++)
|
||||
my_ducet_weight_normalize(&ducet->contractions.item[i].weights, options);
|
||||
}
|
||||
|
||||
|
||||
static my_bool
|
||||
my_ducet_contraction_list_add(MY_DUCET_CONTRACTION_LIST *dst,
|
||||
const MY_DUCET_CHARS *chars,
|
||||
const MY_DUCET_WEIGHT *weights)
|
||||
{
|
||||
if (dst->nitems >= array_elements(dst->item))
|
||||
{
|
||||
fprintf(stderr, "Too many contractions\n");
|
||||
return TRUE;
|
||||
}
|
||||
dst->item[dst->nitems].chars= *chars;
|
||||
dst->item[dst->nitems].weights= *weights;
|
||||
dst->nitems++;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
#define MY_UCA_NPAGES 1024
|
||||
#define MY_UCA_NCHARS 64
|
||||
#define MY_UCA_CMASK 63
|
||||
#define MY_UCA_PSHIFT 6
|
||||
#else
|
||||
#define MY_UCA_NPAGES 4352 /* 0x110000 characters / 0x100 chars per page */
|
||||
#define MY_UCA_NCHARS 256
|
||||
#define MY_UCA_CMASK 255
|
||||
#define MY_UCA_PSHIFT 8
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* Name prefix that goes into page weight array names after global_name_prefix */
|
||||
static const char *pname_prefix[]= {"_p", "_p", "_p"};
|
||||
|
||||
/* Name suffix that goes into page weight array names after page number */
|
||||
static const char *pname_suffix[]= {"", "_w2", "_w3"};
|
||||
static const char *pname_suffix[]= {"", "_secondary", "_tertiary"};
|
||||
|
||||
|
||||
void usage(const char *prog)
|
||||
|
@ -100,6 +415,8 @@ int process_option(OPT *options, const char *opt)
|
|||
{
|
||||
static const LEX_CSTRING opt_name_prefix= {STRING_WITH_LEN("--name-prefix=")};
|
||||
static const LEX_CSTRING opt_levels= {STRING_WITH_LEN("--levels=")};
|
||||
static const LEX_CSTRING opt_no_contractions= {STRING_WITH_LEN("--no-contractions")};
|
||||
static const LEX_CSTRING opt_case_first= {STRING_WITH_LEN("--case-first=")};
|
||||
if (!lstrncmp(opt, opt_name_prefix))
|
||||
{
|
||||
options->name_prefix= opt + opt_name_prefix.length;
|
||||
|
@ -115,6 +432,27 @@ int process_option(OPT *options, const char *opt)
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
if (!lstrncmp(opt, opt_case_first))
|
||||
{
|
||||
const char *value= opt + opt_case_first.length;
|
||||
if (!strcasecmp(value, "upper"))
|
||||
{
|
||||
options->case_first_upper= TRUE;
|
||||
return 0;
|
||||
}
|
||||
if (!strcasecmp(value, "lower"))
|
||||
{
|
||||
options->case_first_upper= FALSE;
|
||||
return 0;
|
||||
}
|
||||
fprintf(stderr, "Bad option: %s\n", opt);
|
||||
return 1;
|
||||
}
|
||||
if (!strcmp(opt, opt_no_contractions.str))
|
||||
{
|
||||
options->no_contractions= TRUE;
|
||||
return 0;
|
||||
}
|
||||
printf("Unknown option: %s\n", opt);
|
||||
return 1;
|
||||
}
|
||||
|
@ -189,7 +527,7 @@ char *strrtrim(char *str)
|
|||
my_bool parse_at_line(MY_DUCET *ducet, const char *str)
|
||||
{
|
||||
static const LEX_CSTRING version= {STRING_WITH_LEN("@version ")};
|
||||
if (!strncmp(str, version.str, version.length))
|
||||
if (!lstrncmp(str, version))
|
||||
{
|
||||
/*
|
||||
Examples:
|
||||
|
@ -220,6 +558,119 @@ my_bool parse_at_line(MY_DUCET *ducet, const char *str)
|
|||
}
|
||||
|
||||
|
||||
static void
|
||||
parse_chars(MY_DUCET_CHARS *dst, char *str)
|
||||
{
|
||||
char *s;
|
||||
const char *delim= " \t";
|
||||
dst->length= 0;
|
||||
for (s= strtok(str, delim); s ; s= strtok(NULL, delim))
|
||||
{
|
||||
my_wc_t code= (my_wc_t) strtoul(s, NULL, 16);
|
||||
if (dst->length < array_elements(dst->wc))
|
||||
dst->wc[dst->length]= code;
|
||||
dst->length++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
parse_weights(MY_DUCET_WEIGHT *dst, my_bool *is_variable, char *weight)
|
||||
{
|
||||
const char *delim= " []";
|
||||
size_t w;
|
||||
char *weights[64];
|
||||
char *s;
|
||||
dst->weight_length= 0;
|
||||
*is_variable= FALSE;
|
||||
for (s= strtok(weight, delim) ; s ; s= strtok(NULL, delim))
|
||||
{
|
||||
if (dst->weight_length < array_elements(weights))
|
||||
weights[dst->weight_length]= s;
|
||||
dst->weight_length++;
|
||||
}
|
||||
|
||||
set_if_smaller(dst->weight_length, MY_UCA_MAX_WEIGHT_SIZE-1);
|
||||
|
||||
for (w= 0; w < dst->weight_length ; w++)
|
||||
{
|
||||
size_t partnum= 0;
|
||||
for (s= weights[w]; *s ;)
|
||||
{
|
||||
char *endptr;
|
||||
uint part= (uint) strtoul(s + 1, &endptr, 16);
|
||||
if (w == 0 && s[0] == '*')
|
||||
*is_variable= TRUE;
|
||||
if (part > 0xFFFF)
|
||||
fprintf(stderr, "Weight is too large: %X\n", (uint) part);
|
||||
dst->weight[partnum][w]= (uint16) part;
|
||||
s= endptr;
|
||||
partnum++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
print_one_logical_position(const OPT *options,
|
||||
const char *name,
|
||||
const char *name2,
|
||||
my_wc_t value)
|
||||
{
|
||||
printf("#define %s_%s%s 0x%04X\n",
|
||||
options->name_prefix, name, name2, (int) value);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_weight_print_canonical(const MY_DUCET_WEIGHT *src)
|
||||
{
|
||||
uint i;
|
||||
for (i= 0; i < array_elements(src->weight[0]); i++)
|
||||
{
|
||||
my_bool zero= src->weight[0][i] == 0 &&
|
||||
src->weight[1][i] == 0 &&
|
||||
src->weight[2][i] == 0;
|
||||
if (zero && i > 0)
|
||||
break;
|
||||
printf("[.%04X.%04X.%04X]",
|
||||
src->weight[0][i],
|
||||
src->weight[1][i],
|
||||
src->weight[2][i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_ducet_logical_position_print(const MY_DUCET_LOGICAL_POSITION *src,
|
||||
const char *name,
|
||||
const MY_DUCET *ducet,
|
||||
const OPT *options)
|
||||
{
|
||||
printf("/*\n");
|
||||
my_ducet_weight_print_canonical(&ducet->single_chars[src->first].weight);
|
||||
printf("\n");
|
||||
my_ducet_weight_print_canonical(&ducet->single_chars[src->last].weight);
|
||||
printf("\n*/\n");
|
||||
print_one_logical_position(options, name, "_first", src->first);
|
||||
print_one_logical_position(options, name, "_last", src->last);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
print_logical_positions(const MY_DUCET_LOGICAL_POSITIONS *src,
|
||||
const MY_DUCET *ducet,
|
||||
const OPT *opt)
|
||||
{
|
||||
my_ducet_logical_position_print(&src->tertiary_ignorable, "tertiary_ignorable", ducet, opt);
|
||||
my_ducet_logical_position_print(&src->secondary_ignorable, "secondary_ignorable", ducet, opt);
|
||||
my_ducet_logical_position_print(&src->primary_ignorable, "primary_ignorable", ducet, opt);
|
||||
my_ducet_logical_position_print(&src->variable, "variable", ducet, opt);
|
||||
my_ducet_logical_position_print(&src->non_ignorable, "non_ignorable", ducet, opt);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
print_version(const MY_DUCET *ducet, const OPT *opt)
|
||||
{
|
||||
|
@ -230,10 +681,64 @@ print_version(const MY_DUCET *ducet, const OPT *opt)
|
|||
}
|
||||
|
||||
|
||||
static void
|
||||
print_contraction(const MY_DUCET_CONTRACTION *c,
|
||||
uint level,
|
||||
const OPT *options)
|
||||
{
|
||||
size_t j;
|
||||
printf("{");
|
||||
printf("{");
|
||||
for (j= 0; j < array_elements(c->chars.wc); j++)
|
||||
{
|
||||
if (j > 0)
|
||||
printf(", ");
|
||||
if (c->chars.wc[j])
|
||||
printf("0x%04X", (uint) c->chars.wc[j]);
|
||||
else
|
||||
{
|
||||
printf("0");
|
||||
break;
|
||||
}
|
||||
}
|
||||
printf("}, ");
|
||||
printf("{");
|
||||
for (j= 0; j < array_elements(c->weights.weight[level]); j++)
|
||||
{
|
||||
if (j > 0)
|
||||
printf(", ");
|
||||
if (c->weights.weight[level][j])
|
||||
printf("0x%04X", (uint) c->weights.weight[level][j]);
|
||||
else
|
||||
{
|
||||
printf("0");
|
||||
break;
|
||||
}
|
||||
}
|
||||
printf("}, FALSE");
|
||||
printf("},\n");
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
print_contraction_list(const MY_DUCET_CONTRACTION_LIST *src, uint level, const OPT *opt)
|
||||
{
|
||||
size_t i;
|
||||
printf("\n\n/* Contractions, level %d */\n", level);
|
||||
printf("static MY_CONTRACTION %s_contractions%s[%d]={\n",
|
||||
opt->name_prefix, pname_suffix[level], (int) src->nitems);
|
||||
for (i= 0; i < src->nitems; i++)
|
||||
{
|
||||
const MY_DUCET_CONTRACTION *c= &src->item[i];
|
||||
print_contraction(c, level, opt);
|
||||
}
|
||||
printf("};\n\n");
|
||||
}
|
||||
|
||||
|
||||
int main(int ac, char **av)
|
||||
{
|
||||
char str[1024];
|
||||
char *weights[64];
|
||||
static MY_DUCET ducet;
|
||||
my_wc_t code;
|
||||
uint w;
|
||||
|
@ -257,8 +762,10 @@ int main(int ac, char **av)
|
|||
{
|
||||
char *comment;
|
||||
char *weight;
|
||||
char *s;
|
||||
size_t codenum;
|
||||
MY_DUCET_CHARS chr = {0};
|
||||
|
||||
if (str[0] == '#')
|
||||
continue;
|
||||
|
||||
if (str[0] == '@')
|
||||
{
|
||||
|
@ -266,18 +773,7 @@ int main(int ac, char **av)
|
|||
continue;
|
||||
}
|
||||
|
||||
code= (my_wc_t) strtol(str,NULL,16);
|
||||
|
||||
if (str[0]=='#' || (code > MAX_ALLOWED_CODE))
|
||||
continue;
|
||||
if ((comment=strchr(str,'#')))
|
||||
{
|
||||
*comment++= '\0';
|
||||
for ( ; *comment==' ' ; comment++);
|
||||
}else
|
||||
continue;
|
||||
|
||||
if ((weight=strchr(str,';')))
|
||||
if ((weight= strchr(str, ';')))
|
||||
{
|
||||
*weight++= '\0';
|
||||
for ( ; *weight==' ' ; weight++);
|
||||
|
@ -285,51 +781,39 @@ int main(int ac, char **av)
|
|||
else
|
||||
continue;
|
||||
|
||||
codenum= 0;
|
||||
s= strtok(str, " \t");
|
||||
while (s)
|
||||
if ((comment=strchr(weight, '#')))
|
||||
{
|
||||
s= strtok(NULL, " \t");
|
||||
codenum++;
|
||||
}
|
||||
*comment++= '\0';
|
||||
}else
|
||||
continue;
|
||||
|
||||
if (codenum>1)
|
||||
parse_chars(&chr, str);
|
||||
if (!chr.length)
|
||||
continue;
|
||||
|
||||
if (chr.length == 1)
|
||||
{
|
||||
/* Multi-character weight,
|
||||
i.e. contraction.
|
||||
Not supported yet.
|
||||
*/
|
||||
if (chr.wc[0] > MAX_ALLOWED_CODE)
|
||||
continue;
|
||||
parse_weights(&ducet.single_chars[chr.wc[0]].weight,
|
||||
&ducet.single_chars[chr.wc[0]].is_variable,
|
||||
weight);
|
||||
/* Mark that a character from this page was loaded */
|
||||
pageloaded[chr.wc[0] >> MY_UCA_PSHIFT]++;
|
||||
}
|
||||
else
|
||||
{
|
||||
MY_DUCET_WEIGHT weights= {0};
|
||||
my_bool dummy;
|
||||
if (chr.length >= MY_UCA_MAX_CONTRACTION)
|
||||
{
|
||||
fprintf(stderr, "Too long contraction: %d\n", (int) chr.length);
|
||||
continue;
|
||||
}
|
||||
parse_weights(&weights, &dummy, weight);
|
||||
my_ducet_contraction_list_add(&ducet.contractions, &chr, &weights);
|
||||
continue;
|
||||
}
|
||||
|
||||
ducet.single_chars[code].weight.weight_length= 0;
|
||||
s= strtok(weight, " []");
|
||||
while (s)
|
||||
{
|
||||
weights[ducet.single_chars[code].weight.weight_length]= s;
|
||||
s= strtok(NULL, " []");
|
||||
ducet.single_chars[code].weight.weight_length++;
|
||||
}
|
||||
|
||||
set_if_smaller(ducet.single_chars[code].weight.weight_length, MY_UCA_MAX_WEIGHT_SIZE-1);
|
||||
|
||||
for (w=0; w < ducet.single_chars[code].weight.weight_length ; w++)
|
||||
{
|
||||
size_t partnum;
|
||||
|
||||
partnum= 0;
|
||||
s= weights[w];
|
||||
while (*s)
|
||||
{
|
||||
char *endptr;
|
||||
uint part= (uint) strtoul(s + 1, &endptr, 16);
|
||||
ducet.single_chars[code].weight.weight[partnum][w]= (uint16) part;
|
||||
s= endptr;
|
||||
partnum++;
|
||||
}
|
||||
}
|
||||
/* Mark that a character from this page was loaded */
|
||||
pageloaded[code >> MY_UCA_PSHIFT]++;
|
||||
}
|
||||
|
||||
close_file(file);
|
||||
|
@ -352,6 +836,9 @@ int main(int ac, char **av)
|
|||
ducet.single_chars[code].weight.weight_length= 2;
|
||||
}
|
||||
|
||||
my_ducet_normalize(&ducet, &options);
|
||||
my_ducet_logical_positions_init(&ducet.logical_positions, &ducet);
|
||||
|
||||
printf("/*\n");
|
||||
printf(" Generated from allkeys.txt version '%s'\n", ducet.version_str);
|
||||
printf("*/\n");
|
||||
|
@ -445,35 +932,13 @@ int main(int ac, char **av)
|
|||
|
||||
for (offs=0; offs < MY_UCA_NCHARS; offs++)
|
||||
{
|
||||
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];
|
||||
size_t num, i;
|
||||
size_t i;
|
||||
|
||||
code= page*MY_UCA_NCHARS+offs;
|
||||
|
||||
bzero(weight,sizeof(weight));
|
||||
|
||||
/* Copy non-zero weights */
|
||||
for (num=0, i=0; i < ducet.single_chars[code].weight.weight_length; i++)
|
||||
{
|
||||
if (ducet.single_chars[code].weight.weight[w][i])
|
||||
{
|
||||
weight[num]= ducet.single_chars[code].weight.weight[w][i];
|
||||
num++;
|
||||
}
|
||||
}
|
||||
|
||||
for (i=0; i < maxnum; i++)
|
||||
{
|
||||
/*
|
||||
Invert weights for secondary level to
|
||||
sort upper case letters before their
|
||||
lower case counter part.
|
||||
*/
|
||||
int tmp= weight[i];
|
||||
if (w == 2 && tmp)
|
||||
tmp= (int)(0x20 - weight[i]);
|
||||
|
||||
|
||||
int tmp= ducet.single_chars[code].weight.weight[w][i];
|
||||
printf("0x%04X", tmp);
|
||||
if ((offs+1 != MY_UCA_NCHARS) || (i+1!=maxnum))
|
||||
printf(",");
|
||||
|
@ -518,9 +983,12 @@ int main(int ac, char **av)
|
|||
comma, nline);
|
||||
}
|
||||
printf("};\n");
|
||||
}
|
||||
|
||||
if (!options.no_contractions)
|
||||
print_contraction_list(&ducet.contractions, w, &options);
|
||||
}
|
||||
print_version(&ducet, &options);
|
||||
print_logical_positions(&ducet.logical_positions, &ducet, &options);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue