MDEV-27265 Improve contraction performance in UCA collations

Adding a hash table for contractions.

The old code iterated through all items in MY_CONTRACTIONS
and was much slower, especially for contractions
located near the end of the list.
This commit is contained in:
Alexander Barkov 2022-02-24 17:54:52 +04:00 committed by Oleksandr Byelkin
parent 133446828c
commit a0858b2cff
2 changed files with 194 additions and 18 deletions

View file

@ -141,6 +141,13 @@ const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
my_wc_t wc1, my_wc_t wc2);
/*
  Hash table used to look up contractions.
  A cell whose ch[0] is 0 is treated as unused by the lookup
  and insertion routines.
*/
typedef struct my_uca_contraction_hash_t
{
/* Number of cells in the "item" array (0 while nothing is allocated) */
size_t nitems_alloced;
/* Array of hash cells, each holding a copy of a contraction */
MY_CONTRACTION *item;
} MY_UCA_CONTRACTION_HASH;
/* Collation weights on a single level (e.g. primary, secondary, tertiary) */
typedef struct my_uca_level_info_st
{
@ -149,6 +156,7 @@ typedef struct my_uca_level_info_st
uint16 **weights;
MY_CONTRACTIONS contractions;
uint levelno;
MY_UCA_CONTRACTION_HASH contraction_hash;
} MY_UCA_WEIGHT_LEVEL;

View file

@ -6548,7 +6548,8 @@ MY_UCA_INFO my_uca_v400=
NULL, /* item */
NULL /* flags */
},
0 /* levelno */
0, /* levelno */
{0} /* contraction_hash */
},
{
0,
@ -6559,7 +6560,8 @@ MY_UCA_INFO my_uca_v400=
NULL,
NULL
},
1 /* levelno */
1, /* levelno */
{0} /* contraction_hash */
},
{0}
},
@ -30109,7 +30111,8 @@ MY_UCA_INFO my_uca_v520_th=
thai_contractions, /* item */
NULL /* flags */
},
0 /* levelno */
0, /* levelno */
{0} /* contraction_hash */
},
{
0x10FFFF, /* maxchar */
@ -30120,7 +30123,8 @@ MY_UCA_INFO my_uca_v520_th=
thai_contractions_w2, /* item */
NULL /* flags */
},
1 /* levelno */
1, /* levelno */
{0} /* contraction_hash */
},
{0}
},
@ -30159,7 +30163,8 @@ MY_UCA_INFO my_uca_v520=
NULL, /* item */
NULL /* flags */
},
0 /* levelno */
0, /* levelno */
{0} /* contraction_hash */
},
{
@ -30171,7 +30176,8 @@ MY_UCA_INFO my_uca_v520=
NULL, /* item */
NULL /* flags */
},
1 /* levelno */
1, /* levelno */
{0} /* contraction_hash */
},
{0}
@ -30214,7 +30220,8 @@ static MY_UCA_INFO my_uca_v1400=
uca1400_contractions, /* item */
NULL /* flags */
},
0 /* levelno */
0, /* levelno */
{0} /* contraction_hash */
},
{
@ -30226,7 +30233,8 @@ static MY_UCA_INFO my_uca_v1400=
uca1400_contractions_secondary, /* item */
NULL /* flags */
},
1 /* levelno */
1, /* levelno */
{0} /* contraction_hash */
},
{
@ -30238,7 +30246,8 @@ static MY_UCA_INFO my_uca_v1400=
uca1400_contractions_tertiary, /* item */
NULL /* flags */
},
2 /* levelno */
2, /* levelno */
{0} /* contraction_hash */
}
},
@ -31662,6 +31671,150 @@ static inline uint my_contraction_char_length(const MY_CONTRACTION *cnt)
}
/*
  The number of elements must be a power of 2.
  This makes it possible to use the faster & operator instead of the
  slow % operator to find the remainder of the division:
    pos= (start_pos + iteration) & MASK
  instead of:
    pos= (start_pos + iteration) % NUMBER_OF_PREALLOCED_HASH_ELEMENTS
  NOTE(review): the probing loops in my_uca_contraction_hash_find_empty()
  and my_uca_contraction_hash_find() compute the position with
  "% nitems_alloced"; only the hash function itself applies a mask.

  DUCET as of Unicode-14.0.0 has 939 default contractions.
  CLDR-40 has around 2601 contractions (all collations total).
  The built-in Myanmar collation tailoring has 912 contractions.
  4096 as the contraction prealloced hash size should be enough
  for all collations.
*/
/* Number of cells allocated for a contraction hash table */
#define MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS 4096
/*
  The hash function shifts its masked value left by this many bits,
  so every start position is a multiple of (1 << LSHIFT).
*/
#define MY_UCA_CONTRACTION_HASH_LSHIFT 2
/* Mask applied by the hash function before the left shift */
#define MY_UCA_CONTRACTION_HASH_MASK \
((MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS-1)>>MY_UCA_CONTRACTION_HASH_LSHIFT)
/*
  Maximum number of cells probed, starting from the hashed position,
  when inserting or searching. This is one less than the table size.
*/
#define MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS \
(MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS-1)
/* Uncomment the define below to collect collision statistics */
/*#define DBUG_UCA_CONTRACTIONS*/
#ifdef DBUG_UCA_CONTRACTIONS
/* Counters updated while populating a contraction hash table */
static ulonglong collisions= 0;
static ulonglong collisions_eq= 0;
#endif
/*
  Empirically chosen hash function for contractions.
  According to the original author it produces no collisions for
  the built-in DUCET contractions as of Unicode-14.0.0.

  @param a  The first character of the contraction
  @param b  The second character of the contraction
  @return   The start position in the hash table. The masked value is
            shifted left by MY_UCA_CONTRACTION_HASH_LSHIFT bits, so the
            low bits of the result are always zero.
*/
static uint16
my_uca_contraction_hash_func(my_wc_t a, my_wc_t b)
{
  my_wc_t bucket= (a * 465 + b) & MY_UCA_CONTRACTION_HASH_MASK;
  return (uint16) (bucket << MY_UCA_CONTRACTION_HASH_LSHIFT);
}
/*
  Locate an unused cell in the contraction hash table.

  Cells are probed linearly, wrapping around at the end of the table,
  beginning at "start". A cell is considered unused when its ch[0] is 0.
  At most MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS cells are probed.

  @param cnt         The hash table
  @param start       The position to start probing from
  @param[OUT] ppos   Receives the position of the unused cell found
  @retval FALSE      An unused cell was found, *ppos is set
  @retval TRUE       No unused cell was found, *ppos is not modified
*/
static my_bool
my_uca_contraction_hash_find_empty(const MY_UCA_CONTRACTION_HASH *cnt,
                                   uint16 start,
                                   uint16 *ppos)
{
  uint16 probe= 0;
  while (probe < MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS)
  {
    uint16 cell= (probe + start) % cnt->nitems_alloced;
    if (cnt->item[cell].ch[0] == 0)
    {
      *ppos= cell;
      return FALSE;
    }
    probe++;
  }
  return TRUE;
}
/*
  Look up a contraction in the hash table.

  The start position is derived from the first two characters, then
  cells are probed linearly (with wrap-around) until either a matching
  contraction or an unused cell (ch[0] == 0) is met.

  @param cnt   The hash table
  @param wc    The characters of the candidate contraction
  @param len   The number of characters in wc
  @retval NULL - no such contraction in the hash table
  @retval ptr  - the address of the matching cell
*/
static inline const MY_CONTRACTION *
my_uca_contraction_hash_find(const MY_UCA_CONTRACTION_HASH *cnt,
                             const my_wc_t *wc, size_t len)
{
  uint16 home= my_uca_contraction_hash_func(wc[0], wc[1]);
  uint16 probe;
  DBUG_ASSERT(len <= MY_UCA_MAX_CONTRACTION);
  for (probe= 0; probe < MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS; probe++)
  {
    uint16 slot= (probe + home) % cnt->nitems_alloced;
    const MY_CONTRACTION *cell= &cnt->item[slot];
    if (cell->ch[0] == 0)
      break;       /* An unused cell terminates the chain: not found */
    if (my_uca_true_contraction_eq(cell, wc, len))
      return cell; /* The given contraction was found */
  }
  /*
    Either an unused cell was met, or all
    MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS probes were used up
    without a match (possible only with an almost full table).
    In both cases the contraction is not in the hash.
  */
  return NULL;
}
/*
  Create an empty contraction hash table.

  "dst" is zeroed first, then an array of
  MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS cells is requested from
  loader->once_alloc and filled with zeros.

  @param[OUT] dst  The hash table to initialize
  @param loader    Provides the once_alloc allocation callback
  @retval FALSE    Success
  @retval TRUE     Allocation failed; dst is left zeroed
*/
static my_bool
my_uca_contraction_hash_allocate(MY_UCA_CONTRACTION_HASH *dst,
                                 MY_CHARSET_LOADER *loader)
{
  size_t table_size= sizeof(MY_CONTRACTION) *
                     MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS;
  MY_CONTRACTION *cells;

  bzero(dst, sizeof(*dst));
  cells= (MY_CONTRACTION*) (loader->once_alloc)(table_size);
  if (!cells)
    return TRUE;
  bzero(cells, table_size);
  dst->item= cells;
  dst->nitems_alloced= MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS;
  return FALSE;
}
/*
  Copy every contraction from the list "src" into the hash table "dst".

  Each contraction goes into the cell given by the hash of its first
  two characters, or, when that cell is occupied, into the next unused
  cell found by my_uca_contraction_hash_find_empty().

  @param dst    The hash table; must already be allocated
  @param src    The list of contractions to add
  @retval FALSE Success
  @retval TRUE  No unused cell could be found for some contraction
*/
static my_bool
my_uca_contraction_hash_populate(MY_UCA_CONTRACTION_HASH *dst,
                                 const MY_CONTRACTIONS *src)
{
  size_t n;
  DBUG_ASSERT(dst->nitems_alloced > 0);
  for (n= 0; n < src->nitems; n++)
  {
    const MY_CONTRACTION *cur= &src->item[n];
    uint16 home= my_uca_contraction_hash_func(cur->ch[0], cur->ch[1]);
    uint16 target= home;
    if (dst->item[home].ch[0])
    {
      /* The home cell is occupied: fall back to linear probing */
#ifdef DBUG_UCA_CONTRACTIONS
      /*
        NOTE(review): an occupant sharing at least one of the two
        leading characters is counted in collisions_eq; only an occupant
        differing in both goes to collisions. Confirm this is intended.
      */
      if (dst->item[home].ch[0] == cur->ch[0] ||
          dst->item[home].ch[1] == cur->ch[1])
        collisions_eq++;
      else
        collisions++;
#endif
      if (my_uca_contraction_hash_find_empty(dst, home, &target))
        return TRUE;
    }
    dst->item[target]= *cur;
  }
  return FALSE;
}
/**
Check if a string is a contraction,
and return its weight array on success.
@ -31691,7 +31844,7 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
/**
Find a contraction in the input stream and return its weight array
Find a contraction in the input stream
Scan input characters while their flags tell that they can be
a contraction part. Then try to find real contraction among the
@ -31701,14 +31854,15 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
@param[OUT] *wc Where to store the scanned string
@param max_char_length The longest contraction character length allowed
@return Weight array
@return
@retval NULL - no contraction found
@retval ptr - the address of MY_CONTRACTION found
*/
static const MY_CONTRACTION *
my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc,
size_t max_char_length)
my_uca_scanner_contraction_hash_find(my_uca_scanner *scanner,
my_wc_t currwc,
size_t max_char_length)
{
size_t clen= 1;
int flag;
@ -31738,8 +31892,8 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc,
const MY_CONTRACTION *cnt;
if (my_uca_can_be_contraction_tail(&scanner->level->contractions,
wc[clen - 1]) &&
(cnt= my_uca_contraction_find(&scanner->level->contractions,
wc, clen)))
(cnt= my_uca_contraction_hash_find(&scanner->level->contraction_hash,
wc, clen)))
{
scanner->sbeg= beg[clen - 1];
return cnt;
@ -31752,7 +31906,6 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc,
/**
Find weight for contraction with previous context
and return its weight array.
@param scanner Pointer to UCA scanner
@param wc0 Previous character
@ -31827,8 +31980,9 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
currwc))
{
/* Check if currwc starts a contraction */
if ((cnt= my_uca_scanner_contraction_find(scanner, currwc, max_char_length)))
/* Check if w[0] starts a contraction */
if ((cnt= my_uca_scanner_contraction_hash_find(scanner, currwc,
max_char_length)))
return cnt;
}
return NULL;
@ -33887,6 +34041,20 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
item->with_context);
memcpy(weights, item->weight, sizeof(item->weight));
}
if (ncontractions)
{
if (ncontractions > MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS ||
my_uca_contraction_hash_allocate(&dst->contraction_hash, loader) ||
my_uca_contraction_hash_populate(&dst->contraction_hash,
&dst->contractions))
{
my_snprintf(loader->error, sizeof(loader->error),
"Can't initialize %d contractions", (int) ncontractions);
return TRUE;
}
}
return FALSE;
}