MDEV-27009 Add UCA-14.0.0 collations - adding version-aware implicit weight handling

Implicit weights are now handled according to the Unicode version
(14.0.0 vs earlier versions).

- Adding a new member MY_UCA_INFO::version

- Copy logical positions and the version from "src_uca" to "new_uca"
  in create_tailoring().

- Adding a "const MY_UCA_INFO *" parameter to a few functions
  so that they know the Unicode version and generate implicit weights
  accordingly:
  - at collation initialization time, for pages which are
    a mixture of explicit and implicit weights
  - at comparison time, for fully implicit pages
This commit is contained in:
Alexander Barkov 2021-11-28 16:48:13 +04:00 committed by Oleksandr Byelkin
parent d7ffb7c3dd
commit 6bc10f8026
2 changed files with 40 additions and 13 deletions

View file

@ -168,6 +168,9 @@ typedef struct uca_info_st
my_wc_t first_variable;
my_wc_t last_variable;
/* Unicode version */
uint version;
} MY_UCA_INFO;

View file

@ -6580,6 +6580,9 @@ MY_UCA_INFO my_uca_v400=
0x0009, /* first_variable */
0x2183, /* last_variable */
/* Misc */
400 /* Version */
};
/******************************************************/
@ -30136,6 +30139,9 @@ MY_UCA_INFO my_uca_v520_th=
0x0009, /* first_variable if alt=non-ignorable: p != ignore */
0x1D371, /* last_variable if alt=shifter: p,s,t == ignore */
/* Misc */
520 /* Version */
};
MY_UCA_INFO my_uca_v520=
@ -30183,6 +30189,9 @@ MY_UCA_INFO my_uca_v520=
0x0009, /* first_variable if alt=non-ignorable: p != ignore */
0x1D371, /* last_variable if alt=shifter: p,s,t == ignore */
/* Misc */
520 /* Version */
};
@ -31691,10 +31700,11 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
/****************************************************************/
static inline void
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
my_uca_implicit_weight_put(uint16 *to, const MY_UCA_INFO *src_uca,
my_wc_t code, uint level)
{
MY_UCA_IMPLICIT_WEIGHT weight;
weight= my_uca_implicit_weight_on_level(520, code, level);
weight= my_uca_implicit_weight_on_level(src_uca->version, code, level);
to[0]= weight.weight[0];
to[1]= weight.weight[1];
to[2]= 0;
@ -31718,7 +31728,8 @@ static inline int
my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
{
my_wc_t wc= (scanner->page << 8) + scanner->code;
MY_UCA_IMPLICIT_WEIGHT weight= my_uca_520_implicit_weight_primary(wc);
uint version= scanner->cs->uca->version;
MY_UCA_IMPLICIT_WEIGHT weight= my_uca_implicit_weight_primary(version, wc);
scanner->implicit[0]= weight.weight[1]; /* The second weight */
scanner->implicit[1]= 0; /* 0 terminator */
scanner->wbeg= scanner->implicit;
@ -32509,7 +32520,7 @@ typedef struct my_coll_rules_st
{
uint version; /* Unicode version, e.g. 400 or 520 */
uint strength; /* Number of levels */
MY_UCA_INFO *uca; /* Unicode weight data */
const MY_UCA_INFO *uca; /* Unicode weight data */
size_t nrules; /* Number of rules in the rule array */
size_t mrules; /* Number of allocated rules */
MY_COLL_RULE *rule; /* Rule array */
@ -33199,8 +33210,8 @@ my_coll_rule_parse(MY_COLL_RULES *rules,
Copies UCA weights for a given "uint" string
to the given location.
@dst destination UCA weight level data
@src_uca source UCA weight data
@dst_uca destination UCA weight data
@to destination address
@to_length size of destination
@nweights OUT number of weights put to "to"
@ -33212,6 +33223,7 @@ my_coll_rule_parse(MY_COLL_RULES *rules,
static my_bool
my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
const MY_UCA_INFO *src_uca,
uint16 *to, size_t to_length, size_t *nweights,
my_wc_t *str, size_t len)
{
@ -33249,7 +33261,7 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
if (!from)
{
from= implicit_weights;
my_uca_implicit_weight_put(implicit_weights, *str, dst->levelno);
my_uca_implicit_weight_put(implicit_weights, src_uca, *str, dst->levelno);
}
str++;
len--;
@ -33306,6 +33318,7 @@ my_uca_copy_page(MY_CHARSET_LOADER *loader,
static my_bool
my_uca_generate_implicit_page(MY_CHARSET_LOADER *loader,
MY_UCA_WEIGHT_LEVEL *dst,
const MY_UCA_INFO *src_uca,
uint page)
{
uint chc, size= 256 * dst->lengths[page] * sizeof(uint16);
@ -33316,7 +33329,7 @@ my_uca_generate_implicit_page(MY_CHARSET_LOADER *loader,
for (chc= 0 ; chc < 256; chc++)
{
uint16 *w= dst->weights[page] + chc * dst->lengths[page];
my_uca_implicit_weight_put(w, (page << 8) + chc, dst->levelno);
my_uca_implicit_weight_put(w, src_uca, (page << 8) + chc, dst->levelno);
}
return FALSE;
}
@ -33495,7 +33508,7 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
r->curr, (uint)nshift, r->with_context);
/* Store weights of the "reset to" character */
dst->contractions.nitems--; /* Temporarily hide - it's incomplete */
rc= my_char_weight_put(dst,
rc= my_char_weight_put(dst, rules->uca,
to, MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE, &nweights,
r->base, nreset);
dst->contractions.nitems++; /* Activate, now it's complete */
@ -33506,7 +33519,8 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
DBUG_ASSERT(dst->weights[pagec]);
to= my_char_weight_addr(dst, r->curr[0]);
/* Store weights of the "reset to" character */
rc= my_char_weight_put(dst, to, dst->lengths[pagec], &nweights, r->base, nreset);
rc= my_char_weight_put(dst, rules->uca,
to, dst->lengths[pagec], &nweights, r->base, nreset);
}
if (rc)
{
@ -33575,7 +33589,9 @@ static uint my_weight_size_on_page(const MY_UCA_WEIGHT_LEVEL *src, uint page)
*/
static my_bool
my_uca_generate_page(MY_CHARSET_LOADER *loader,
MY_UCA_WEIGHT_LEVEL *dst, const MY_UCA_WEIGHT_LEVEL *src,
MY_UCA_WEIGHT_LEVEL *dst,
const MY_UCA_INFO *src_uca,
const MY_UCA_WEIGHT_LEVEL *src,
uint pageno)
{
DBUG_ASSERT(dst->levelno == src->levelno);
@ -33590,7 +33606,7 @@ my_uca_generate_page(MY_CHARSET_LOADER *loader,
Generate default weights for all characters on this page
algorithmically now, at initialization time.
*/
my_uca_generate_implicit_page(loader, dst, pageno);
my_uca_generate_implicit_page(loader, dst, src_uca, pageno);
}
@ -33601,6 +33617,7 @@ my_uca_generate_page(MY_CHARSET_LOADER *loader,
static my_bool
my_uca_generate_pages(MY_CHARSET_LOADER *loader,
MY_UCA_WEIGHT_LEVEL *dst,
const MY_UCA_INFO *src_uca,
const MY_UCA_WEIGHT_LEVEL *src,
uint npages)
{
@ -33625,7 +33642,7 @@ my_uca_generate_pages(MY_CHARSET_LOADER *loader,
}
/* Found a page with some special rules. */
if (my_uca_generate_page(loader, dst, src, page))
if (my_uca_generate_page(loader, dst, src_uca, src, page))
return TRUE;
}
return FALSE;
@ -33691,7 +33708,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
ncontractions += (int)src->contractions.nitems;
if ((my_uca_generate_pages(loader, dst, src, (uint)npages)))
if ((my_uca_generate_pages(loader, dst, rules->uca, src, (uint)npages)))
return TRUE;
if (ncontractions)
@ -33860,6 +33877,13 @@ create_tailoring(struct charset_info_st *cs,
}
cs->levels_for_order= rules.strength ? rules.strength : 1;
/*
Copy logical positions, version, but don't copy levels -
they will be initialized below.
*/
new_uca= *src_uca;
bzero(&new_uca.level, sizeof(new_uca.level));
for (i= 0; i != cs->levels_for_order; i++)
{
if ((rc= (src_uca->level[i].maxchar == 0)))