mirror of
https://github.com/MariaDB/server.git
synced 2025-01-29 02:05:57 +01:00
A few minor Unicode collation customization improvements were made,
which make it possible to add more world language collations with very complex collation rules (e.g. Myanmar): - The weight string for a single character in a user-defined collation was erroneously limited to 7 weights (instead of 8 weights). Added an extra element to the user-defined weight arrays, to fit 8 non-zero weights. - The weight string limit for contractions was made twice as long (16 weights), which allows longer contractions without affecting the performance of filesort. - A user-defined collation now refuses to initialize and reports an error if a weight string gets longer than 8 weights for a single character, or longer than 16 weights for a contraction. Previously, weight strings for such characters (and contractions) were truncated, so a collation could silently start with wrong rules. - Fixed a bug in handling rules like "&a << b" in combination with shift-after-method="expand". The primary weight for "b" was not correctly calculated, which erroneously made "b" primary greater than "a" instead of primary equal to "a".
This commit is contained in:
parent
eea91f633f
commit
bd3dc54261
5 changed files with 136 additions and 48 deletions
|
@ -88,13 +88,25 @@ extern MY_UNICASE_INFO my_unicase_mysql500;
|
|||
extern MY_UNICASE_INFO my_unicase_unicode520;
|
||||
|
||||
#define MY_UCA_MAX_CONTRACTION 6
|
||||
#define MY_UCA_MAX_WEIGHT_SIZE 8
|
||||
/*
|
||||
The DUCET tables in ctype-uca.c are dumped with a limit of 8 weights
|
||||
per character. cs->strxfrm_multiply is set to 8 for all UCA based collations.
|
||||
|
||||
In language-specific UCA collations (with tailorings) we also do not allow
|
||||
a single character to have more than 8 weights to stay with the same
|
||||
strxfrm_multiply limit. Note, contractions are allowed to have twice as long
|
||||
weight strings (up to 16 weights). As a contraction consists of at
|
||||
least 2 characters, this makes sure that strxfrm_multiply ratio of 8
|
||||
is respected.
|
||||
*/
|
||||
#define MY_UCA_MAX_WEIGHT_SIZE (8+1) /* Including 0 terminator */
|
||||
#define MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE (2*8+1) /* Including 0 terminator */
|
||||
#define MY_UCA_WEIGHT_LEVELS 1
|
||||
|
||||
typedef struct my_contraction_t
|
||||
{
|
||||
my_wc_t ch[MY_UCA_MAX_CONTRACTION]; /* Character sequence */
|
||||
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
|
||||
uint16 weight[MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
|
||||
my_bool with_context;
|
||||
} MY_CONTRACTION;
|
||||
|
||||
|
|
|
@ -425,6 +425,7 @@ ucs2_test_ci ucs2 358 8
|
|||
ucs2_vn_ci ucs2 359 8
|
||||
ucs2_5624_1 ucs2 360 8
|
||||
utf8_5624_5 utf8 368 8
|
||||
utf8_5624_5_bad utf8 369 8
|
||||
utf32_test_ci utf32 391 8
|
||||
utf8_maxuserid_ci utf8 2047 8
|
||||
show collation like '%test%';
|
||||
|
@ -1030,9 +1031,12 @@ INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
|
|||
INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
|
||||
INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
|
||||
INSERT INTO t1 VALUES ('AA'),('AAA');
|
||||
INSERT INTO t1 VALUES ('001'),('002');
|
||||
SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
|
||||
a HEX(WEIGHT_STRING(a))
|
||||
0 0E29
|
||||
001 0E29
|
||||
002 0E29
|
||||
0z 0E290E292357
|
||||
0ン 0E291E81
|
||||
a 0E29233E
|
||||
|
@ -1093,6 +1097,12 @@ AA 0E293358
|
|||
AAA 0E293359
|
||||
1 0E2A
|
||||
DROP TABLE t1;
|
||||
SET NAMES utf8 COLLATE utf8_5624_5_bad;
|
||||
ERROR HY000: Unknown collation: 'utf8_5624_5_bad'
|
||||
SHOW WARNINGS;
|
||||
Level Code Message
|
||||
Error 1273 Unknown collation: 'utf8_5624_5_bad'
|
||||
Warning 1273 Expansion too long: 'a\u002Daaaaaa10'
|
||||
#
|
||||
# End of WL#5624
|
||||
#
|
||||
|
|
|
@ -114,13 +114,25 @@
|
|||
weight space between 0 and 1 in DUCET.
|
||||
Also, to test it works with contractions, put some after 'z'.
|
||||
-->
|
||||
<reset>0</reset>
|
||||
<reset>0</reset><s>001</s><s>002</s>
|
||||
<pc>abcdefghijklmnopqrstuvwxyz</pc><p>aa</p><p>aaa</p>
|
||||
<reset before="primary">1</reset>
|
||||
<pc>ABCDEFGHIJKLMNOPQRSTUVWXYZ</pc><p>AA</p><p>AAA</p>
|
||||
</rules>
|
||||
</collation>
|
||||
|
||||
<collation name="utf8_5624_5_bad" id="369" shift-after-method="expand">
|
||||
<rules>
|
||||
<reset>a-a4</reset><p>xxx04</a>
|
||||
<reset>a-aa5</reset><p>xxx05</a>
|
||||
<reset>a-aaa6</reset><p>xxx06</a>
|
||||
<reset>a-aaaa7</reset><p>xxx07</a>
|
||||
<reset>a-aaaaa8</reset><p>xxx08</a>
|
||||
<reset>a-aaaaaa9</reset><p>xxx09</a>
|
||||
<reset>a-aaaaaa10</reset><p>xxx10</a>
|
||||
</rules>
|
||||
</collation>
|
||||
|
||||
<collation name="utf8_hugeid_ci" id="2047000000">
|
||||
<rules>
|
||||
<reset>a</reset>
|
||||
|
|
|
@ -342,10 +342,14 @@ INSERT INTO t1 VALUES ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H'),('I');
|
|||
INSERT INTO t1 VALUES ('J'),('K'),('L'),('M'),('N'),('O'),('P'),('Q'),('R');
|
||||
INSERT INTO t1 VALUES ('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
|
||||
INSERT INTO t1 VALUES ('AA'),('AAA');
|
||||
INSERT INTO t1 VALUES ('001'),('002');
|
||||
|
||||
SELECT a, HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, LENGTH(a), BINARY(a);
|
||||
DROP TABLE t1;
|
||||
|
||||
--error ER_UNKNOWN_COLLATION
|
||||
SET NAMES utf8 COLLATE utf8_5624_5_bad;
|
||||
SHOW WARNINGS;
|
||||
|
||||
--echo #
|
||||
--echo # End of WL#5624
|
||||
|
|
|
@ -8211,7 +8211,7 @@ ex:
|
|||
Collation rule item
|
||||
*/
|
||||
|
||||
#define MY_UCA_MAX_EXPANSION 6 /* Maximum expansion length */
|
||||
#define MY_UCA_MAX_EXPANSION 10 /* Maximum expansion length */
|
||||
|
||||
typedef struct my_coll_rule_item_st
|
||||
{
|
||||
|
@ -8821,42 +8821,6 @@ my_coll_parser_scan_reset_sequence(MY_COLL_RULE_PARSER *p)
|
|||
MY_UCA_MAX_EXPANSION, "Expansion"))
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (p->rules->shift_after_method == my_shift_method_expand ||
|
||||
p->rule.before_level == 1) /* Apply "before primary" option */
|
||||
{
|
||||
/*
|
||||
Suppose we have this rule: &B[before primary] < C
|
||||
i.e. we need to put C before B, but after A, so
|
||||
the result order is: A < C < B.
|
||||
|
||||
Let primary weight of B be [BBBB].
|
||||
|
||||
We cannot just use [BBBB-1] as weight for C:
|
||||
DUCET does not have enough unused weights between any two characters,
|
||||
so using [BBBB-1] will likely make C equal to the previous character,
|
||||
which is A, so we'll get this order instead of the desired: A = C < B.
|
||||
|
||||
To guarantee that C is sorted after A, we'll use expansion
|
||||
with a kind of "biggest possible character".
|
||||
As "biggest possible character" we'll use "last_non_ignorable":
|
||||
|
||||
We'll compose weight for C as: [BBBB-1][MMMM+1]
|
||||
where [MMMM] is weight for "last_non_ignorable".
|
||||
|
||||
We also do the same trick for "reset after" if the collation
|
||||
option says so. E.g. for the rules "&B < C", weight for
|
||||
C will be calculated as: [BBBB][MMMM+1]
|
||||
|
||||
At this point we only need to store codepoints
|
||||
'B' and 'last_non_ignorable'. Actual weights for 'C'
|
||||
will be calculated according to the above formula later,
|
||||
in create_tailoring().
|
||||
*/
|
||||
if (!my_coll_rule_expand(p->rule.base, MY_UCA_MAX_EXPANSION,
|
||||
p->rules->uca->last_non_ignorable))
|
||||
return my_coll_parser_too_long_error(p, "Expansion");
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -9056,20 +9020,25 @@ my_coll_rule_parse(MY_COLL_RULES *rules,
|
|||
@dst_uca destination UCA weight data
|
||||
@to destination address
|
||||
@to_length size of destination
|
||||
@nweights OUT number of weights put to "to"
|
||||
@str wide string
|
||||
@len string length
|
||||
|
||||
@return number of weights put
|
||||
@return FALSE on success, TRUE if the weights did not fit.
|
||||
*/
|
||||
|
||||
static size_t
|
||||
static my_bool
|
||||
my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
|
||||
uint16 *to, size_t to_length,
|
||||
uint16 *to, size_t to_length, size_t *nweights,
|
||||
my_wc_t *str, size_t len)
|
||||
{
|
||||
size_t count;
|
||||
int rc= FALSE;
|
||||
if (!to_length)
|
||||
return 0;
|
||||
{
|
||||
*nweights= 0;
|
||||
return len > 0;
|
||||
}
|
||||
to_length--; /* Without trailing zero */
|
||||
|
||||
for (count= 0; len; )
|
||||
|
@ -9099,10 +9068,13 @@ my_char_weight_put(MY_UCA_WEIGHT_LEVEL *dst,
|
|||
*to++= *from++;
|
||||
count++;
|
||||
}
|
||||
if (count == to_length && from && * from)
|
||||
rc= TRUE; /* All weights did not fit */
|
||||
}
|
||||
|
||||
*to= 0;
|
||||
return count;
|
||||
*nweights= count;
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
@ -9191,6 +9163,37 @@ apply_shift(MY_CHARSET_LOADER *loader,
|
|||
}
|
||||
|
||||
|
||||
static void
|
||||
wstr_to_str(char *str, size_t length, my_wc_t *wc, size_t wlength)
|
||||
{
|
||||
const char *end= str + length;
|
||||
char *s;
|
||||
size_t i, rem;
|
||||
for (s= str, i= 0; (rem= (end - s)) > 0 && i < wlength; i++)
|
||||
{
|
||||
if ((wc[i] >= '0' && wc[i] <= '9') ||
|
||||
(wc[i] >= 'a' && wc[i] <= 'z') ||
|
||||
(wc[i] >= 'A' && wc[i] <= 'Z'))
|
||||
s+= my_snprintf(s, rem, "%c", (int) wc[i]);
|
||||
else
|
||||
s+= my_snprintf(s, rem, "\\u%04X", (int) wc[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_charset_loader_error_for_rule(MY_CHARSET_LOADER *loader,
|
||||
const MY_COLL_RULE *r,
|
||||
const char *name,
|
||||
my_wc_t *wc, size_t wlength)
|
||||
{
|
||||
char tmp[128];
|
||||
wstr_to_str(tmp, sizeof(tmp), wc, wlength);
|
||||
my_snprintf(loader->error, sizeof(loader->error),
|
||||
"%s too long: '%s'", name, tmp);
|
||||
}
|
||||
|
||||
|
||||
static my_bool
|
||||
apply_one_rule(MY_CHARSET_LOADER *loader,
|
||||
MY_COLL_RULES *rules, MY_COLL_RULE *r, int level,
|
||||
|
@ -9200,6 +9203,47 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
|
|||
size_t nreset= my_coll_rule_reset_length(r); /* Length of reset sequence */
|
||||
size_t nshift= my_coll_rule_shift_length(r); /* Length of shift sequence */
|
||||
uint16 *to;
|
||||
my_bool rc;
|
||||
|
||||
if ((rules->shift_after_method == my_shift_method_expand && r->diff[0]) ||
|
||||
r->before_level == 1)
|
||||
{
|
||||
/*
|
||||
Suppose we have this rule: &B[before primary] < C
|
||||
i.e. we need to put C before B, but after A, so
|
||||
the result order is: A < C < B.
|
||||
|
||||
Let primary weight of B be [BBBB].
|
||||
|
||||
We cannot just use [BBBB-1] as weight for C:
|
||||
DUCET does not have enough unused weights between any two characters,
|
||||
so using [BBBB-1] will likely make C equal to the previous character,
|
||||
which is A, so we'll get this order instead of the desired: A = C < B.
|
||||
|
||||
To guarantee that C is sorted after A, we'll use expansion
|
||||
with a kind of "biggest possible character".
|
||||
As "biggest possible character" we'll use "last_non_ignorable":
|
||||
|
||||
We'll compose weight for C as: [BBBB-1][MMMM+1]
|
||||
where [MMMM] is weight for "last_non_ignorable".
|
||||
|
||||
We also do the same trick for "reset after" if the collation
|
||||
option says so. E.g. for the rules "&B < C", weight for
|
||||
C will be calculated as: [BBBB][MMMM+1]
|
||||
|
||||
At this point we only need to store codepoints
|
||||
'B' and 'last_non_ignorable'. Actual weights for 'C'
|
||||
will be calculated according to the above formula later,
|
||||
in create_tailoring().
|
||||
*/
|
||||
if (!my_coll_rule_expand(r->base, MY_UCA_MAX_EXPANSION,
|
||||
rules->uca->last_non_ignorable))
|
||||
{
|
||||
my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset);
|
||||
return TRUE;
|
||||
}
|
||||
nreset= my_coll_rule_reset_length(r);
|
||||
}
|
||||
|
||||
if (nshift >= 2) /* Contraction */
|
||||
{
|
||||
|
@ -9222,8 +9266,9 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
|
|||
r->with_context)->weight;
|
||||
/* Store weights of the "reset to" character */
|
||||
dst->contractions.nitems--; /* Temporarily hide - it's incomplete */
|
||||
nweights= my_char_weight_put(dst, to, MY_UCA_MAX_WEIGHT_SIZE,
|
||||
r->base, nreset);
|
||||
rc= my_char_weight_put(dst,
|
||||
to, MY_UCA_CONTRACTION_MAX_WEIGHT_SIZE, &nweights,
|
||||
r->base, nreset);
|
||||
dst->contractions.nitems++; /* Activate, now it's complete */
|
||||
}
|
||||
else
|
||||
|
@ -9232,7 +9277,12 @@ apply_one_rule(MY_CHARSET_LOADER *loader,
|
|||
DBUG_ASSERT(dst->weights[pagec]);
|
||||
to= my_char_weight_addr(dst, r->curr[0]);
|
||||
/* Store weights of the "reset to" character */
|
||||
nweights= my_char_weight_put(dst, to, dst->lengths[pagec], r->base, nreset);
|
||||
rc= my_char_weight_put(dst, to, dst->lengths[pagec], &nweights, r->base, nreset);
|
||||
}
|
||||
if (rc)
|
||||
{
|
||||
my_charset_loader_error_for_rule(loader, r, "Expansion", r->base, nreset);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Apply level difference. */
|
||||
|
|
Loading…
Add table
Reference in a new issue