mirror of
https://github.com/MariaDB/server.git
synced 2026-04-24 09:15:30 +02:00
Added more general support for sorting 2 characters as one (contractions)
Added support for the Croatian collations utf8_croatian_ci and ucs2_croatian_ci. Patch done by Alexander Barkov. See http://www.collation-charts.org/articles/croatian.htm. mysql-test/r/ctype_uca.result: Added testing of Croatian sort order. mysql-test/t/ctype_uca.test: Added testing of Croatian sort order.
This commit is contained in:
parent
84911a9fd0
commit
4c14f9f23c
7 changed files with 476 additions and 80 deletions
|
|
@ -49,6 +49,24 @@ typedef struct unicase_info_st
|
|||
extern MY_UNICASE_INFO *my_unicase_default[256];
|
||||
extern MY_UNICASE_INFO *my_unicase_turkish[256];
|
||||
|
||||
#define MY_UCA_MAX_CONTRACTION 4
|
||||
#define MY_UCA_MAX_WEIGHT_SIZE 8
|
||||
|
||||
/*
  One contraction: a sequence of characters that is sorted as a single
  unit, together with the weight string assigned to that sequence.
  ch[] holds up to MY_UCA_MAX_CONTRACTION code points and weight[] up to
  MY_UCA_MAX_WEIGHT_SIZE 16-bit weights.
*/
typedef struct my_contraction_t
|
||||
{
|
||||
my_wc_t ch[MY_UCA_MAX_CONTRACTION]; /* Character sequence (Unicode code points) */
|
||||
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
|
||||
} MY_CONTRACTION;
|
||||
|
||||
|
||||
/*
  List of all contractions known to a collation, plus a per-character
  flag array used to quickly tell whether a character can take part
  in a contraction at all.
*/
typedef struct my_contraction_list_t
|
||||
{
|
||||
size_t nitems; /* Number of items in the list */
|
||||
MY_CONTRACTION *item; /* List of contractions */
|
||||
char *flags; /* Character flags, e.g. "is contraction head" */
|
||||
} MY_CONTRACTIONS;
|
||||
|
||||
|
||||
typedef struct uni_ctype_st
|
||||
{
|
||||
uchar pctype;
|
||||
|
|
@ -262,7 +280,7 @@ typedef struct charset_info_st
|
|||
uchar *to_lower;
|
||||
uchar *to_upper;
|
||||
uchar *sort_order;
|
||||
uint16 *contractions;
|
||||
MY_CONTRACTIONS *contractions;
|
||||
uint16 **sort_order_big;
|
||||
uint16 *tab_to_uni;
|
||||
MY_UNI_IDX *tab_from_uni;
|
||||
|
|
@ -475,6 +493,13 @@ my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
|
|||
my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
|
||||
uint my_charset_repertoire(CHARSET_INFO *cs);
|
||||
|
||||
my_bool my_uca_have_contractions(CHARSET_INFO *cs);
|
||||
my_bool my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc);
|
||||
my_bool my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc);
|
||||
uint16 *my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2);
|
||||
|
||||
|
||||
|
||||
|
||||
#define _MY_U 01 /* Upper case */
|
||||
#define _MY_L 02 /* Lower case */
|
||||
|
|
|
|||
|
|
@ -159,6 +159,7 @@ insert into t1 values (_ucs2 0x01fc),(_ucs2 0x01fd),(_ucs2 0x01fe),(_ucs2 0x01ff
|
|||
insert into t1 values ('AA'),('Aa'),('aa'),('aA');
|
||||
insert into t1 values ('CH'),('Ch'),('ch'),('cH');
|
||||
insert into t1 values ('DZ'),('Dz'),('dz'),('dZ');
|
||||
insert into t1 values ('DŽ'),('Dž'),('dž'),('dŽ');
|
||||
insert into t1 values ('IJ'),('Ij'),('ij'),('iJ');
|
||||
insert into t1 values ('LJ'),('Lj'),('lj'),('lJ');
|
||||
insert into t1 values ('LL'),('Ll'),('ll'),('lL');
|
||||
|
|
@ -181,7 +182,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -286,7 +287,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Ð,ð
|
||||
Đ,đ
|
||||
Ɖ
|
||||
|
|
@ -400,6 +401,7 @@ CH,Ch,cH,ch
|
|||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DŽ,Dž,dŽ,dž
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -513,7 +515,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -622,6 +624,7 @@ CH,Ch,cH,ch
|
|||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DŽ,Dž,dŽ,dž
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -729,7 +732,7 @@ CH,Ch,cH,ch
|
|||
Ć,ć
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -840,6 +843,7 @@ CH,Ch,cH,ch
|
|||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz
|
||||
DŽ,Dž,dŽ,dž
|
||||
DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
|
|
@ -951,7 +955,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1056,7 +1060,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1164,7 +1168,7 @@ CH,Ch,cH,ch
|
|||
Ç,ç
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1275,6 +1279,7 @@ cH
|
|||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DŽ,Dž,dŽ,dž
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1382,7 +1387,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1491,6 +1496,7 @@ cH
|
|||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DŽ,Dž,dŽ,dž
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1599,6 +1605,7 @@ cH
|
|||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DŽ,Dž,dŽ,dž
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1707,7 +1714,7 @@ cH
|
|||
CH,Ch,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1813,7 +1820,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -1921,7 +1928,7 @@ CH,Ch,cH,ch
|
|||
Ĉ,ĉ
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -2030,7 +2037,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
|
|||
CH,Ch,cH,ch
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
|
||||
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
|
|
@ -2121,6 +2128,118 @@ Z,z,Ź,ź,Ż,ż,Ž,ž
|
|||
ǁ
|
||||
ǂ
|
||||
ǃ
|
||||
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci;
|
||||
group_concat(c1 order by c1)
|
||||
÷
|
||||
×
|
||||
A,a,À,Á,Â,Ã,Ä,Å,à,á,â,ã,ä,å,Ā,ā,Ă,ă,Ą,ą,Ǎ,ǎ,Ǟ,ǟ,Ǡ,ǡ,Ǻ,ǻ
|
||||
AA,Aa,aA,aa
|
||||
Æ,æ,Ǣ,ǣ,Ǽ,ǽ
|
||||
B,b
|
||||
ƀ
|
||||
Ɓ
|
||||
Ƃ,ƃ
|
||||
C,c,Ç,ç,Ĉ,ĉ,Ċ,ċ
|
||||
CH,Ch,cH,ch
|
||||
Č,č
|
||||
Ć,ć
|
||||
Ƈ,ƈ
|
||||
D,d,Ď,ď
|
||||
DZ,Dz,dZ,dz,DZ,Dz,dz
|
||||
dŽ
|
||||
DŽ,Dž,dž,DŽ,Dž,dž
|
||||
Đ,đ
|
||||
Ɖ
|
||||
Ɗ
|
||||
Ƌ,ƌ
|
||||
Ð,ð
|
||||
E,e,È,É,Ê,Ë,è,é,ê,ë,Ē,ē,Ĕ,ĕ,Ė,ė,Ę,ę,Ě,ě
|
||||
Ǝ,ǝ
|
||||
Ə
|
||||
Ɛ
|
||||
F,f
|
||||
Ƒ,ƒ
|
||||
G,g,Ĝ,ĝ,Ğ,ğ,Ġ,ġ,Ģ,ģ,Ǧ,ǧ,Ǵ,ǵ
|
||||
Ǥ,ǥ
|
||||
Ɠ
|
||||
Ɣ
|
||||
Ƣ,ƣ
|
||||
H,h,Ĥ,ĥ
|
||||
ƕ,Ƕ
|
||||
Ħ,ħ
|
||||
I,i,Ì,Í,Î,Ï,ì,í,î,ï,Ĩ,ĩ,Ī,ī,Ĭ,ĭ,Į,į,İ,Ǐ,ǐ
|
||||
IJ,Ij,iJ,ij,IJ,ij
|
||||
ı
|
||||
Ɨ
|
||||
Ɩ
|
||||
J,j,Ĵ,ĵ,ǰ
|
||||
K,k,Ķ,ķ,Ǩ,ǩ
|
||||
Ƙ,ƙ
|
||||
L,l,Ĺ,ĺ,Ļ,ļ,Ľ,ľ
|
||||
Ŀ,ŀ
|
||||
lJ
|
||||
LL,Ll,lL,ll
|
||||
LJ,Lj,lj,LJ,Lj,lj
|
||||
Ł,ł
|
||||
ƚ
|
||||
ƛ
|
||||
M,m
|
||||
N,n,Ñ,ñ,Ń,ń,Ņ,ņ,Ň,ň,Ǹ,ǹ
|
||||
nJ
|
||||
NJ,Nj,nj,NJ,Nj,nj
|
||||
Ɲ
|
||||
ƞ
|
||||
Ŋ,ŋ
|
||||
O,o,Ò,Ó,Ô,Õ,Ö,ò,ó,ô,õ,ö,Ō,ō,Ŏ,ŏ,Ő,ő,Ơ,ơ,Ǒ,ǒ,Ǫ,ǫ,Ǭ,ǭ
|
||||
OE,Oe,oE,oe,Œ,œ
|
||||
Ø,ø,Ǿ,ǿ
|
||||
Ɔ
|
||||
Ɵ
|
||||
P,p
|
||||
Ƥ,ƥ
|
||||
Q,q
|
||||
ĸ
|
||||
R,r,Ŕ,ŕ,Ŗ,ŗ,Ř,ř
|
||||
RR,Rr,rR,rr
|
||||
Ʀ
|
||||
S,s,Ś,ś,Ŝ,ŝ,Ş,ş,ſ
|
||||
SS,Ss,sS,ss,ß
|
||||
Š,š
|
||||
Ʃ
|
||||
ƪ
|
||||
T,t,Ţ,ţ,Ť,ť
|
||||
ƾ
|
||||
Ŧ,ŧ
|
||||
ƫ
|
||||
Ƭ,ƭ
|
||||
Ʈ
|
||||
U,u,Ù,Ú,Û,Ü,ù,ú,û,ü,Ũ,ũ,Ū,ū,Ŭ,ŭ,Ů,ů,Ű,ű,Ų,ų,Ư,ư,Ǔ,ǔ,Ǖ,ǖ,Ǘ,ǘ,Ǚ,ǚ,Ǜ,ǜ
|
||||
Ɯ
|
||||
Ʊ
|
||||
V,v
|
||||
Ʋ
|
||||
W,w,Ŵ,ŵ
|
||||
X,x
|
||||
Y,y,Ý,ý,ÿ,Ŷ,ŷ,Ÿ
|
||||
Ƴ,ƴ
|
||||
Z,z,Ź,ź,Ż,ż
|
||||
ƍ
|
||||
Ž,ž
|
||||
Ƶ,ƶ
|
||||
Ʒ,Ǯ,ǯ
|
||||
Ƹ,ƹ
|
||||
ƺ
|
||||
Þ,þ
|
||||
ƿ,Ƿ
|
||||
ƻ
|
||||
Ƨ,ƨ
|
||||
Ƽ,ƽ
|
||||
Ƅ,ƅ
|
||||
ʼn
|
||||
ǀ
|
||||
ǁ
|
||||
ǂ
|
||||
ǃ
|
||||
drop table t1;
|
||||
SET NAMES utf8;
|
||||
CREATE TABLE t1 (c varchar(255) NOT NULL COLLATE utf8_general_ci, INDEX (c));
|
||||
|
|
|
|||
|
|
@ -186,6 +186,7 @@ insert into t1 values (_ucs2 0x01fc),(_ucs2 0x01fd),(_ucs2 0x01fe),(_ucs2 0x01ff
|
|||
insert into t1 values ('AA'),('Aa'),('aa'),('aA');
|
||||
insert into t1 values ('CH'),('Ch'),('ch'),('cH');
|
||||
insert into t1 values ('DZ'),('Dz'),('dz'),('dZ');
|
||||
insert into t1 values ('DŽ'),('Dž'),('dž'),('dŽ');
|
||||
insert into t1 values ('IJ'),('Ij'),('ij'),('iJ');
|
||||
insert into t1 values ('LJ'),('Lj'),('lj'),('lJ');
|
||||
insert into t1 values ('LL'),('Ll'),('ll'),('lL');
|
||||
|
|
@ -213,6 +214,7 @@ select group_concat(c1 order by c1) from t1 group by c1 collate utf8_spanish2_ci
|
|||
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_roman_ci;
|
||||
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_esperanto_ci;
|
||||
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_hungarian_ci;
|
||||
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci;
|
||||
|
||||
drop table t1;
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ extern CHARSET_INFO my_charset_ucs2_roman_uca_ci;
|
|||
extern CHARSET_INFO my_charset_ucs2_persian_uca_ci;
|
||||
extern CHARSET_INFO my_charset_ucs2_esperanto_uca_ci;
|
||||
extern CHARSET_INFO my_charset_ucs2_hungarian_uca_ci;
|
||||
extern CHARSET_INFO my_charset_ucs2_croatian_uca_ci;
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CHARSET_utf8
|
||||
|
|
@ -63,6 +64,7 @@ extern CHARSET_INFO my_charset_utf8_roman_uca_ci;
|
|||
extern CHARSET_INFO my_charset_utf8_persian_uca_ci;
|
||||
extern CHARSET_INFO my_charset_utf8_esperanto_uca_ci;
|
||||
extern CHARSET_INFO my_charset_utf8_hungarian_uca_ci;
|
||||
extern CHARSET_INFO my_charset_utf8_croatian_uca_ci;
|
||||
#ifdef HAVE_UTF8_GENERAL_CS
|
||||
extern CHARSET_INFO my_charset_utf8_general_cs;
|
||||
#endif
|
||||
|
|
@ -152,6 +154,7 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused)))
|
|||
add_compiled_collation(&my_charset_ucs2_persian_uca_ci);
|
||||
add_compiled_collation(&my_charset_ucs2_esperanto_uca_ci);
|
||||
add_compiled_collation(&my_charset_ucs2_hungarian_uca_ci);
|
||||
add_compiled_collation(&my_charset_ucs2_croatian_uca_ci);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
@ -186,6 +189,7 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused)))
|
|||
add_compiled_collation(&my_charset_utf8_persian_uca_ci);
|
||||
add_compiled_collation(&my_charset_utf8_esperanto_uca_ci);
|
||||
add_compiled_collation(&my_charset_utf8_hungarian_uca_ci);
|
||||
add_compiled_collation(&my_charset_utf8_croatian_uca_ci);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -567,8 +567,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
|
|||
char *min_end= min_str + res_length;
|
||||
char *max_end= max_str + res_length;
|
||||
size_t maxcharlen= res_length / cs->mbmaxlen;
|
||||
const char *contraction_flags= cs->contractions ?
|
||||
((const char*) cs->contractions) + 0x40*0x40 : NULL;
|
||||
my_bool have_contractions= my_uca_have_contractions(cs);
|
||||
|
||||
for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--)
|
||||
{
|
||||
|
|
@ -636,8 +635,8 @@ fill_max_and_min:
|
|||
'ab\min\min\min\min' and 'ab\max\max\max\max'.
|
||||
|
||||
*/
|
||||
if (contraction_flags && ptr + 1 < end &&
|
||||
contraction_flags[(uchar) *ptr])
|
||||
if (have_contractions && ptr + 1 < end &&
|
||||
my_uca_can_be_contraction_head(cs, (uchar) *ptr))
|
||||
{
|
||||
/* Ptr[0] is a contraction head. */
|
||||
|
||||
|
|
@ -659,8 +658,8 @@ fill_max_and_min:
|
|||
is not a contraction, then we put only ptr[0],
|
||||
and continue with ptr[1] on the next loop.
|
||||
*/
|
||||
if (contraction_flags[(uchar) ptr[1]] &&
|
||||
cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40])
|
||||
if (my_uca_can_be_contraction_tail(cs, (uchar) ptr[1]) &&
|
||||
my_uca_contraction2_weight(cs, (uchar) ptr[0], (uchar) ptr[1]))
|
||||
{
|
||||
/* Contraction found */
|
||||
if (maxcharlen == 1 || min_str + 1 >= min_end)
|
||||
|
|
|
|||
|
|
@ -6713,6 +6713,16 @@ static const char hungarian[]=
|
|||
"&U < \\u00FC <<< \\u00DC << \\u0171 <<< \\u0170";
|
||||
|
||||
|
||||
/*
  Tailoring rules for the Croatian collations (utf8_croatian_ci and
  ucs2_croatian_ci).  Besides single letters, the rules list
  two-character sequences such as "lj", "nj" and "d\u017E", i.e.
  contractions, next to their single-code-point digraph forms.
  NOTE(review): '<' is presumably a primary and '<<<' a tertiary
  difference from the preceding item - confirm against the rule parser.
*/
static const char croatian[]=
|
||||
|
||||
"&C < \\u010D <<< \\u010C < \\u0107 <<< \\u0106 "
|
||||
"&D < d\\u017E <<< \\u01C6 <<< D\\u017E <<< \\u01C5 <<< D\\u017D <<< \\u01C4 "
|
||||
" < \\u0111 <<< \\u0110 "
|
||||
"&L < lj <<< \\u01C9 <<< Lj <<< \\u01C8 <<< LJ <<< \\u01C7 "
|
||||
"&N < nj <<< \\u01CC <<< Nj <<< \\u01CB <<< NJ <<< \\u01CA "
|
||||
"&S < \\u0161 <<< \\u0160 "
|
||||
"&Z < \\u017E <<< \\u017D";
|
||||
|
||||
/*
|
||||
Unicode Collation Algorithm:
|
||||
Collation element (weight) scanner,
|
||||
|
|
@ -6726,7 +6736,7 @@ typedef struct my_uca_scanner_st
|
|||
const uchar *send; /* End of the input string */
|
||||
uchar *uca_length;
|
||||
uint16 **uca_weight;
|
||||
uint16 *contractions;
|
||||
MY_CONTRACTIONS *contractions;
|
||||
uint16 implicit[2];
|
||||
int page;
|
||||
int code;
|
||||
|
|
@ -6747,6 +6757,164 @@ typedef struct my_uca_scanner_handler_st
|
|||
static uint16 nochar[]= {0,0};
|
||||
|
||||
|
||||
#define MY_UCA_CNT_FLAG_SIZE 4096 /* Size in bytes of the contraction flags array */
|
||||
#define MY_UCA_CNT_FLAG_MASK 4095 /* MY_UCA_CNT_FLAG_SIZE - 1; "wc & mask" maps a code point to a flags index */
|
||||
|
||||
#define MY_UCA_CNT_HEAD 1 /* Flag bit: character can be a contraction head */
|
||||
#define MY_UCA_CNT_TAIL 2 /* Flag bit: character can be a contraction tail */
|
||||
|
||||
|
||||
|
||||
|
||||
/********** Helper functions to handle contraction ************/
|
||||
|
||||
|
||||
/**
|
||||
Mark a character as a contraction part
|
||||
|
||||
@cs Pointer to CHARSET_INFO data
|
||||
@wc Unicode code point
|
||||
@flag flag: "is contraction head", "is contraction tail"
|
||||
*/
|
||||
|
||||
static void
|
||||
my_uca_add_contraction_flag(CHARSET_INFO *cs, my_wc_t wc, int flag)
|
||||
{
|
||||
cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK]|= flag;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Add a new contraction into contraction list
|
||||
|
||||
@cs Pointer to CHARSET_INFO data
|
||||
@wc Unicode code points of the characters
|
||||
@len Number of characters
|
||||
|
||||
@return New contraction
|
||||
@retval Pointer to a newly added contraction
|
||||
*/
|
||||
|
||||
static MY_CONTRACTION *
|
||||
my_uca_add_contraction(CHARSET_INFO *cs,
|
||||
my_wc_t *wc, int len __attribute__((unused)))
|
||||
{
|
||||
MY_CONTRACTIONS *list= cs->contractions;
|
||||
MY_CONTRACTION *next= &list->item[list->nitems];
|
||||
DBUG_ASSERT(len == 2); /* We currently support only contraction2 */
|
||||
next->ch[0]= wc[0];
|
||||
next->ch[1]= wc[1];
|
||||
list->nitems++;
|
||||
return next;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Allocate and initialize memory for contraction list and flags
|
||||
|
||||
@cs Pointer to CHARSET_INFO data
|
||||
@alloc Memory allocation function (typically points to my_alloc_once)
|
||||
@n Number of contractions
|
||||
|
||||
@return Error code
|
||||
@retval 0 - memory allocated successfully
|
||||
@retval 1 - not enough memory
|
||||
*/
|
||||
|
||||
static my_bool
|
||||
my_uca_alloc_contractions(CHARSET_INFO *cs, void *(*alloc)(size_t), size_t n)
|
||||
{
|
||||
uint size= n * sizeof(MY_CONTRACTION);
|
||||
if (!(cs->contractions= (*alloc)(sizeof(MY_CONTRACTIONS))))
|
||||
return 1;
|
||||
bzero(cs->contractions, sizeof(MY_CONTRACTIONS));
|
||||
if (!(cs->contractions->item= (*alloc)(size)) ||
|
||||
!(cs->contractions->flags= (char*) (*alloc)(MY_UCA_CNT_FLAG_SIZE)))
|
||||
return 1;
|
||||
bzero((void*) cs->contractions->item, size);
|
||||
bzero((void*) cs->contractions->flags, MY_UCA_CNT_FLAG_SIZE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Check if UCA data has contractions (public version)
|
||||
|
||||
@cs Pointer to CHARSET_INFO data
|
||||
@retval 0 - no contraction, 1 - have contractions.
|
||||
*/
|
||||
|
||||
my_bool
|
||||
my_uca_have_contractions(CHARSET_INFO *cs)
|
||||
{
|
||||
return cs->contractions != NULL;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Check if a character can be contraction head
|
||||
|
||||
@cs Pointer to CHARSET_INFO data
|
||||
@wc Code point
|
||||
|
||||
@retval 0 - cannot be contraction head
|
||||
@retval 1 - can be contraction head
|
||||
*/
|
||||
|
||||
my_bool
|
||||
my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc)
|
||||
{
|
||||
return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_HEAD;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Check if a character can be contraction tail
|
||||
|
||||
@cs Pointer to CHARSET_INFO data
|
||||
@wc Code point
|
||||
|
||||
@retval 0 - cannot be contraction tail
|
||||
@retval 1 - can be contraction tail
|
||||
*/
|
||||
|
||||
my_bool
|
||||
my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc)
|
||||
{
|
||||
return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_TAIL;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Find a contraction and return its weight array
|
||||
|
||||
@cs Pointer to CHARSET data
|
||||
@wc1 First character
|
||||
@wc2 Second character
|
||||
|
||||
@return Weight array
|
||||
@retval NULL - no contraction found
|
||||
@retval ptr - contraction weight array
|
||||
*/
|
||||
|
||||
uint16 *
|
||||
my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
|
||||
{
|
||||
MY_CONTRACTIONS *list= cs->contractions;
|
||||
MY_CONTRACTION *c, *last;
|
||||
for (c= list->item, last= &list->item[list->nitems]; c < last; c++)
|
||||
{
|
||||
if (c->ch[0] == wc1 && c->ch[1] == wc2)
|
||||
{
|
||||
return c->weight;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#ifdef HAVE_CHARSET_ucs2
|
||||
/*
|
||||
Initialize collation weight scanner
|
||||
|
|
@ -6766,7 +6934,7 @@ static uint16 nochar[]= {0,0};
|
|||
*/
|
||||
|
||||
static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner,
|
||||
CHARSET_INFO *cs __attribute__((unused)),
|
||||
CHARSET_INFO *cs,
|
||||
const uchar *str, size_t length)
|
||||
{
|
||||
scanner->wbeg= nochar;
|
||||
|
|
@ -6777,6 +6945,7 @@ static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner,
|
|||
scanner->uca_length= cs->sort_order;
|
||||
scanner->uca_weight= cs->sort_order_big;
|
||||
scanner->contractions= cs->contractions;
|
||||
scanner->cs= cs;
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -6865,18 +7034,23 @@ static int my_uca_scanner_next_ucs2(my_uca_scanner *scanner)
|
|||
|
||||
if (scanner->contractions && (scanner->sbeg <= scanner->send))
|
||||
{
|
||||
int cweight;
|
||||
my_wc_t wc1= ((scanner->page << 8) | scanner->code);
|
||||
|
||||
if (!scanner->page && !scanner->sbeg[0] &&
|
||||
(scanner->sbeg[1] > 0x40) && (scanner->sbeg[1] < 0x80) &&
|
||||
(scanner->code > 0x40) && (scanner->code < 0x80) &&
|
||||
(cweight= scanner->contractions[(scanner->code-0x40)*0x40+scanner->sbeg[1]-0x40]))
|
||||
if (my_uca_can_be_contraction_head(scanner->cs, wc1))
|
||||
{
|
||||
uint16 *cweight;
|
||||
my_wc_t wc2= (((my_wc_t) scanner->sbeg[0]) << 8) | scanner->sbeg[1];
|
||||
if (my_uca_can_be_contraction_tail(scanner->cs, wc2) &&
|
||||
(cweight= my_uca_contraction2_weight(scanner->cs,
|
||||
scanner->code,
|
||||
scanner->sbeg[1])))
|
||||
{
|
||||
scanner->implicit[0]= 0;
|
||||
scanner->wbeg= scanner->implicit;
|
||||
scanner->sbeg+=2;
|
||||
return cweight;
|
||||
return *cweight;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!ucaw[scanner->page])
|
||||
|
|
@ -6959,23 +7133,22 @@ static int my_uca_scanner_next_any(my_uca_scanner *scanner)
|
|||
scanner->code= wc & 0xFF;
|
||||
scanner->sbeg+= mb_len;
|
||||
|
||||
if (scanner->contractions && !scanner->page &&
|
||||
(scanner->code > 0x40) && (scanner->code < 0x80))
|
||||
if (my_uca_have_contractions(scanner->cs) &&
|
||||
my_uca_can_be_contraction_head(scanner->cs, wc))
|
||||
{
|
||||
uint page1, code1, cweight;
|
||||
my_wc_t wc2;
|
||||
uint16 *cweight;
|
||||
|
||||
if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc,
|
||||
if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc2,
|
||||
scanner->sbeg,
|
||||
scanner->send)) >=0) &&
|
||||
(!(page1= (wc >> 8))) &&
|
||||
((code1= (wc & 0xFF)) > 0x40) &&
|
||||
(code1 < 0x80) &&
|
||||
(cweight= scanner->contractions[(scanner->code-0x40)*0x40 + code1-0x40]))
|
||||
my_uca_can_be_contraction_tail(scanner->cs, wc2) &&
|
||||
(cweight= my_uca_contraction2_weight(scanner->cs, wc, wc2)))
|
||||
{
|
||||
scanner->implicit[0]= 0;
|
||||
scanner->wbeg= scanner->implicit;
|
||||
scanner->sbeg+= mb_len;
|
||||
return cweight;
|
||||
return *cweight;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -7012,6 +7185,33 @@ static my_uca_scanner_handler my_any_uca_scanner_handler=
|
|||
my_uca_scanner_next_any
|
||||
};
|
||||
|
||||
|
||||
|
||||
/**
|
||||
Helper function:
|
||||
Find address of weights of the given character.
|
||||
|
||||
@weights UCA weight array
|
||||
@lengths UCA length array
|
||||
@ch character Unicode code point
|
||||
|
||||
@return Weight array
|
||||
@retval pointer to weight array for the given character,
|
||||
or NULL if this page does not have implicit weights.
|
||||
*/
|
||||
|
||||
static inline uint16 *
|
||||
my_char_weight_addr(CHARSET_INFO *cs, uint wc)
|
||||
{
|
||||
uint page= (wc >> 8);
|
||||
uint ofst= wc & 0xFF;
|
||||
return cs->sort_order_big[page] ?
|
||||
cs->sort_order_big[page] + ofst * cs->sort_order[page] :
|
||||
NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Compares two strings according to the collation
|
||||
|
||||
|
|
@ -7683,8 +7883,8 @@ ex:
|
|||
|
||||
typedef struct my_coll_rule_item_st
|
||||
{
|
||||
uint base; /* Base character */
|
||||
uint curr[2]; /* Current character */
|
||||
my_wc_t base; /* Base character */
|
||||
my_wc_t curr[2]; /* Current character */
|
||||
int diff[3]; /* Primary, Secondary and Tertiary difference */
|
||||
} MY_COLL_RULE;
|
||||
|
||||
|
|
@ -7834,6 +8034,7 @@ static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
|
|||
static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t))
|
||||
{
|
||||
MY_COLL_RULE rule[MY_MAX_COLL_RULE];
|
||||
MY_COLL_RULE *r, *rfirst, *rlast;
|
||||
char errstr[128];
|
||||
uchar *newlengths;
|
||||
uint16 **newweights;
|
||||
|
|
@ -7858,6 +8059,9 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t))
|
|||
return 1;
|
||||
}
|
||||
|
||||
rfirst= rule;
|
||||
rlast= rule + rc;
|
||||
|
||||
if (!cs->caseinfo)
|
||||
cs->caseinfo= my_unicase_default;
|
||||
|
||||
|
|
@ -7941,44 +8145,21 @@ static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t))
|
|||
/* Now process contractions */
|
||||
if (ncontractions)
|
||||
{
|
||||
/*
|
||||
8K for weights for basic latin letter pairs,
|
||||
plus 256 bytes for "is contraction part" flags.
|
||||
*/
|
||||
uint size= 0x40*0x40*sizeof(uint16) + 256;
|
||||
char *contraction_flags;
|
||||
if (!(cs->contractions= (uint16*) (*alloc)(size)))
|
||||
return 1;
|
||||
bzero((void*)cs->contractions, size);
|
||||
contraction_flags= ((char*) cs->contractions) + 0x40*0x40;
|
||||
for (i=0; i < rc; i++)
|
||||
if (my_uca_alloc_contractions(cs, alloc, ncontractions))
|
||||
return 1;
|
||||
for (r= rfirst; r < rlast; r++)
|
||||
{
|
||||
if (rule[i].curr[1])
|
||||
uint16 *to;
|
||||
if (r->curr[1]) /* Contraction */
|
||||
{
|
||||
uint pageb= (rule[i].base >> 8) & 0xFF;
|
||||
uint chb= rule[i].base & 0xFF;
|
||||
uint16 *offsb= defweights[pageb] + chb*deflengths[pageb];
|
||||
uint offsc;
|
||||
|
||||
if (offsb[1] ||
|
||||
rule[i].curr[0] < 0x40 || rule[i].curr[0] > 0x7f ||
|
||||
rule[i].curr[1] < 0x40 || rule[i].curr[1] > 0x7f)
|
||||
{
|
||||
/*
|
||||
TODO: add error reporting;
|
||||
We support only basic latin letters contractions at this point.
|
||||
Also, We don't support contractions with weight longer than one.
|
||||
Otherwise, we'd need much more memory.
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
offsc= (rule[i].curr[0]-0x40)*0x40+(rule[i].curr[1]-0x40);
|
||||
|
||||
/* Copy base weight applying primary difference */
|
||||
cs->contractions[offsc]= offsb[0] + rule[i].diff[0];
|
||||
/* Mark both letters as "is contraction part */
|
||||
contraction_flags[rule[i].curr[0]]= 1;
|
||||
contraction_flags[rule[i].curr[1]]= 1;
|
||||
/* Mark both letters as "is contraction part" */
|
||||
my_uca_add_contraction_flag(cs, r->curr[0], MY_UCA_CNT_HEAD);
|
||||
my_uca_add_contraction_flag(cs, r->curr[1], MY_UCA_CNT_TAIL);
|
||||
to= my_uca_add_contraction(cs, r->curr, 2)->weight;
|
||||
/* Copy weight from the reset character */
|
||||
to[0]= my_char_weight_addr(cs, r->base)[0];
|
||||
/* Apply primary difference */
|
||||
to[0]+= r->diff[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -8701,6 +8882,39 @@ CHARSET_INFO my_charset_ucs2_hungarian_uca_ci=
|
|||
};
|
||||
|
||||
|
||||
/*
  Compiled-in collation "ucs2_croatian_ci": character set "ucs2" with
  the Croatian tailoring rules (see croatian[]).  The table pointers
  that are NULL here, including the contraction list, are not provided
  statically.
  NOTE(review): they are presumably filled in when the tailoring is
  processed at collation initialization - confirm.
*/
CHARSET_INFO my_charset_ucs2_croatian_uca_ci=
|
||||
{
|
||||
149,0,0, /* number */
|
||||
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* state flags */
|
||||
"ucs2", /* cs name */
|
||||
"ucs2_croatian_ci", /* name */
|
||||
"", /* comment */
|
||||
croatian, /* tailoring */
|
||||
NULL, /* ctype */
|
||||
NULL, /* to_lower */
|
||||
NULL, /* to_upper */
|
||||
NULL, /* sort_order */
|
||||
NULL, /* contractions */
|
||||
NULL, /* sort_order_big */
|
||||
NULL, /* tab_to_uni */
|
||||
NULL, /* tab_from_uni */
|
||||
my_unicase_default, /* caseinfo */
|
||||
NULL, /* state_map */
|
||||
NULL, /* ident_map */
|
||||
8, /* strxfrm_multiply */
|
||||
1, /* caseup_multiply */
|
||||
1, /* casedn_multiply */
|
||||
2, /* mbminlen */
|
||||
2, /* mbmaxlen */
|
||||
9, /* min_sort_char */
|
||||
0xFFFF, /* max_sort_char */
|
||||
' ', /* pad char */
|
||||
0, /* escape_with_backslash_is_dangerous */
|
||||
&my_charset_ucs2_handler, /* character set handler */
|
||||
&my_collation_ucs2_uca_handler /* collation handler */
|
||||
};
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
@ -9358,6 +9572,38 @@ CHARSET_INFO my_charset_utf8_hungarian_uca_ci=
|
|||
&my_collation_any_uca_handler
|
||||
};
|
||||
|
||||
/*
  Compiled-in collation "utf8_croatian_ci": character set "utf8" with
  the Croatian tailoring rules (see croatian[]).  The table pointers
  that are NULL here, including the contraction list, are not provided
  statically.
  NOTE(review): they are presumably filled in when the tailoring is
  processed at collation initialization - confirm.
*/
CHARSET_INFO my_charset_utf8_croatian_uca_ci=
|
||||
{
|
||||
213,0,0, /* number */
|
||||
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* state flags */
|
||||
"utf8", /* cs name */
|
||||
"utf8_croatian_ci", /* name */
|
||||
"", /* comment */
|
||||
croatian, /* tailoring */
|
||||
ctype_utf8, /* ctype */
|
||||
NULL, /* to_lower */
|
||||
NULL, /* to_upper */
|
||||
NULL, /* sort_order */
|
||||
NULL, /* contractions */
|
||||
NULL, /* sort_order_big */
|
||||
NULL, /* tab_to_uni */
|
||||
NULL, /* tab_from_uni */
|
||||
my_unicase_default, /* caseinfo */
|
||||
NULL, /* state_map */
|
||||
NULL, /* ident_map */
|
||||
8, /* strxfrm_multiply */
|
||||
1, /* caseup_multiply */
|
||||
1, /* casedn_multiply */
|
||||
1, /* mbminlen */
|
||||
3, /* mbmaxlen */
|
||||
9, /* min_sort_char */
|
||||
0xFFFF, /* max_sort_char */
|
||||
' ', /* pad char */
|
||||
0, /* escape_with_backslash_is_dangerous */
|
||||
&my_charset_utf8_handler, /* character set handler */
|
||||
&my_collation_any_uca_handler /* collation handler */
|
||||
};
|
||||
|
||||
#endif /* HAVE_CHARSET_utf8 */
|
||||
|
||||
#endif /* HAVE_UCA_COLLATIONS */
|
||||
|
|
|
|||
|
|
@ -1526,8 +1526,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
|
|||
char *min_org=min_str;
|
||||
char *min_end=min_str+res_length;
|
||||
size_t charlen= res_length / cs->mbmaxlen;
|
||||
const char *contraction_flags= cs->contractions ?
|
||||
((const char*) cs->contractions) + 0x40*0x40 : NULL;
|
||||
my_bool have_contractions= my_uca_have_contractions(cs);
|
||||
|
||||
for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
|
||||
; ptr+=2, charlen--)
|
||||
|
|
@ -1567,8 +1566,9 @@ fill_max_and_min:
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (contraction_flags && ptr + 3 < end &&
|
||||
ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]])
|
||||
if (have_contractions && ptr + 3 < end &&
|
||||
ptr[0] == '\0' &&
|
||||
my_uca_can_be_contraction_head(cs, (uchar) ptr[1]))
|
||||
{
|
||||
/* Contraction head found */
|
||||
if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many))
|
||||
|
|
@ -1581,8 +1581,9 @@ fill_max_and_min:
|
|||
Check if the second letter can be contraction part,
|
||||
and if two letters really produce a contraction.
|
||||
*/
|
||||
if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] &&
|
||||
cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40])
|
||||
if (ptr[2] == '\0' &&
|
||||
my_uca_can_be_contraction_tail(cs, (uchar) ptr[3]) &&
|
||||
my_uca_contraction2_weight(cs,(uchar) ptr[1], (uchar) ptr[3]))
|
||||
{
|
||||
/* Contraction found */
|
||||
if (charlen == 1 || min_str + 2 >= min_end)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue