mirror of
https://github.com/MariaDB/server.git
synced 2025-01-15 19:42:28 +01:00
MDEV-27265 Improve contraction performance in UCA collations
Adding a hash table for contractions. The old code iterated through all items in MY_CONTRACTIONS and was much slower, especially for contractions at the end of the list.
This commit is contained in:
parent
133446828c
commit
a0858b2cff
2 changed files with 194 additions and 18 deletions
|
@ -141,6 +141,13 @@ const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
|
|||
my_wc_t wc1, my_wc_t wc2);
|
||||
|
||||
|
||||
/*
  Hash table of contractions.
  Cells are probed linearly, starting from the position returned by
  my_uca_contraction_hash_func().
*/
typedef struct my_uca_contraction_hash_t
|
||||
{
|
||||
/* Number of cells in the "item" array (0 while the table is not allocated) */
size_t nitems_alloced;
|
||||
/* Array of cells; a cell whose ch[0] is 0 is treated as unused */
MY_CONTRACTION *item;
|
||||
} MY_UCA_CONTRACTION_HASH;
|
||||
|
||||
|
||||
/* Collation weights on a single level (e.g. primary, secondary, tertiary) */
|
||||
typedef struct my_uca_level_info_st
|
||||
{
|
||||
|
@ -149,6 +156,7 @@ typedef struct my_uca_level_info_st
|
|||
uint16 **weights;
|
||||
MY_CONTRACTIONS contractions;
|
||||
uint levelno;
|
||||
MY_UCA_CONTRACTION_HASH contraction_hash;
|
||||
} MY_UCA_WEIGHT_LEVEL;
|
||||
|
||||
|
||||
|
|
|
@ -6548,7 +6548,8 @@ MY_UCA_INFO my_uca_v400=
|
|||
NULL, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
0 /* levelno */
|
||||
0, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
{
|
||||
0,
|
||||
|
@ -6559,7 +6560,8 @@ MY_UCA_INFO my_uca_v400=
|
|||
NULL,
|
||||
NULL
|
||||
},
|
||||
1 /* levelno */
|
||||
1, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
{0}
|
||||
},
|
||||
|
@ -30109,7 +30111,8 @@ MY_UCA_INFO my_uca_v520_th=
|
|||
thai_contractions, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
0 /* levelno */
|
||||
0, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
{
|
||||
0x10FFFF, /* maxchar */
|
||||
|
@ -30120,7 +30123,8 @@ MY_UCA_INFO my_uca_v520_th=
|
|||
thai_contractions_w2, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
1 /* levelno */
|
||||
1, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
{0}
|
||||
},
|
||||
|
@ -30159,7 +30163,8 @@ MY_UCA_INFO my_uca_v520=
|
|||
NULL, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
0 /* levelno */
|
||||
0, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
|
||||
{
|
||||
|
@ -30171,7 +30176,8 @@ MY_UCA_INFO my_uca_v520=
|
|||
NULL, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
1 /* levelno */
|
||||
1, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
|
||||
{0}
|
||||
|
@ -30214,7 +30220,8 @@ static MY_UCA_INFO my_uca_v1400=
|
|||
uca1400_contractions, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
0 /* levelno */
|
||||
0, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
|
||||
{
|
||||
|
@ -30226,7 +30233,8 @@ static MY_UCA_INFO my_uca_v1400=
|
|||
uca1400_contractions_secondary, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
1 /* levelno */
|
||||
1, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
},
|
||||
|
||||
{
|
||||
|
@ -30238,7 +30246,8 @@ static MY_UCA_INFO my_uca_v1400=
|
|||
uca1400_contractions_tertiary, /* item */
|
||||
NULL /* flags */
|
||||
},
|
||||
2 /* levelno */
|
||||
2, /* levelno */
|
||||
{0} /* contraction_hash */
|
||||
}
|
||||
|
||||
},
|
||||
|
@ -31662,6 +31671,150 @@ static inline uint my_contraction_char_length(const MY_CONTRACTION *cnt)
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
The number of elements must be a power of 2.
|
||||
This allows using the faster & operator instead of the
|
||||
slow % operator to find the remainder of the division:
|
||||
pos= (start_pos + iteration) & MASK
|
||||
instead of:
|
||||
pos= (start_pos + iteration) % NUMBER_OF_PREALLOCED_HASH_ELEMENTS
|
||||
|
||||
DUCET as of Unicode-14.0.0 has 939 default contractions.
|
||||
CLDR-40 has around 2601 contractions (all collations total).
|
||||
The built-in Myanmar collation tailoring has 912 contractions.
|
||||
4096 as the contraction prealloced hash size should be enough
|
||||
for all collations.
|
||||
*/
|
||||
/* Number of cells allocated for a contraction hash table */
#define MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS 4096
|
||||
/* The hash function shifts its masked value left by this number of bits */
#define MY_UCA_CONTRACTION_HASH_LSHIFT 2
|
||||
/* Mask applied to the raw hash value before it is shifted left */
#define MY_UCA_CONTRACTION_HASH_MASK \
|
||||
((MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS-1)>>MY_UCA_CONTRACTION_HASH_LSHIFT)
|
||||
/* Upper bound on the number of cells probed by one lookup or one insert */
#define MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS \
|
||||
(MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS-1)
|
||||
|
||||
/*
  Define DBUG_UCA_CONTRACTIONS to compile in the counters below.
  They are updated by my_uca_contraction_hash_populate() whenever
  the start cell of a new item is already occupied.
*/
/*#define DBUG_UCA_CONTRACTIONS*/
|
||||
#ifdef DBUG_UCA_CONTRACTIONS
|
||||
/* The occupying item differs from the new one in both ch[0] and ch[1] */
static ulonglong collisions= 0;
|
||||
/* All other cases of an occupied start cell */
static ulonglong collisions_eq= 0;
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
An empirical hash function for contractions.
|
||||
It does not produce collisions for built-in DUCET contractions
|
||||
as of Unicode-14.0.0.
|
||||
*/
|
||||
static uint16
|
||||
my_uca_contraction_hash_func(my_wc_t a, my_wc_t b)
|
||||
{
|
||||
return (uint16) (((a * 465 + b) & MY_UCA_CONTRACTION_HASH_MASK) <<
|
||||
MY_UCA_CONTRACTION_HASH_LSHIFT);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Find an unused cell in the contraction hash table.
|
||||
*/
|
||||
static my_bool
|
||||
my_uca_contraction_hash_find_empty(const MY_UCA_CONTRACTION_HASH *cnt,
|
||||
uint16 start,
|
||||
uint16 *ppos)
|
||||
{
|
||||
uint16 i;
|
||||
for (i= 0; i < MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS; i++)
|
||||
{
|
||||
uint16 pos= (i + start) % cnt->nitems_alloced;
|
||||
if (!cnt->item[pos].ch[0])
|
||||
{
|
||||
*ppos= pos;
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Find a contraction in the hash table
|
||||
*/
|
||||
static inline const MY_CONTRACTION *
|
||||
my_uca_contraction_hash_find(const MY_UCA_CONTRACTION_HASH *cnt,
|
||||
const my_wc_t *wc, size_t len)
|
||||
{
|
||||
uint16 start= my_uca_contraction_hash_func(wc[0], wc[1]);
|
||||
uint16 i;
|
||||
DBUG_ASSERT(len <= MY_UCA_MAX_CONTRACTION);
|
||||
|
||||
for (i=0 ; i < MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS; i++)
|
||||
{
|
||||
uint16 pos= (i + start) % cnt->nitems_alloced;
|
||||
const MY_CONTRACTION *c= &cnt->item[pos];
|
||||
if (!c->ch[0])
|
||||
return NULL; /* An empty cell found - there is no such contraction */
|
||||
if (my_uca_true_contraction_eq(c, wc, len))
|
||||
return c; /* The given contraction was found */
|
||||
}
|
||||
/*
|
||||
We scanned every single cell in the hash table and neither found
|
||||
the given contraction nor met an empty cell. This is a very unlikely
|
||||
scenario and is possible only if the hash table is full.
|
||||
Anyway, the given contraction was not found in the hash.
|
||||
*/
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Allocate an empty hash table for contractions.
|
||||
*/
|
||||
static my_bool
|
||||
my_uca_contraction_hash_allocate(MY_UCA_CONTRACTION_HASH *dst,
|
||||
MY_CHARSET_LOADER *loader)
|
||||
{
|
||||
size_t nbytes= MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS * sizeof(MY_CONTRACTION);
|
||||
bzero(dst, sizeof(*dst));
|
||||
if (!(dst->item= (MY_CONTRACTION*) (loader->once_alloc)(nbytes)))
|
||||
return TRUE;
|
||||
bzero(dst->item, nbytes);
|
||||
dst->nitems_alloced= MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Add all contractions from the list "src" into the hash table "dst".
|
||||
*/
|
||||
static my_bool
|
||||
my_uca_contraction_hash_populate(MY_UCA_CONTRACTION_HASH *dst,
|
||||
const MY_CONTRACTIONS *src)
|
||||
{
|
||||
size_t i;
|
||||
DBUG_ASSERT(dst->nitems_alloced > 0);
|
||||
for (i= 0; i < src->nitems; i++)
|
||||
{
|
||||
const MY_CONTRACTION *c= &src->item[i];
|
||||
uint16 start= my_uca_contraction_hash_func(c->ch[0], c->ch[1]);
|
||||
if (!dst->item[start].ch[0])
|
||||
dst->item[start]= src->item[i];
|
||||
else
|
||||
{
|
||||
uint16 pos;
|
||||
#ifdef DBUG_UCA_CONTRACTIONS
|
||||
if (dst->item[start].ch[0] != c->ch[0] &&
|
||||
dst->item[start].ch[1] != c->ch[1])
|
||||
collisions++;
|
||||
else
|
||||
collisions_eq++;
|
||||
#endif
|
||||
if (my_uca_contraction_hash_find_empty(dst, start, &pos))
|
||||
return TRUE;
|
||||
dst->item[pos]= src->item[i];
|
||||
}
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Check if a string is a contraction,
|
||||
and return its weight array on success.
|
||||
|
@ -31691,7 +31844,7 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
|
|||
|
||||
|
||||
/**
|
||||
Find a contraction in the input stream and return its weight array
|
||||
Find a contraction in the input stream
|
||||
|
||||
Scan input characters while their flags tell that they can be
|
||||
a contraction part. Then try to find real contraction among the
|
||||
|
@ -31701,14 +31854,15 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
|
|||
@param[OUT] *wc Where to store the scanned string
|
||||
@param max_char_length The longest contraction character length allowed
|
||||
|
||||
@return Weight array
|
||||
@return
|
||||
@retval NULL - no contraction found
|
||||
@retval ptr - the address of MY_CONTRACTION found
|
||||
*/
|
||||
|
||||
static const MY_CONTRACTION *
|
||||
my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc,
|
||||
size_t max_char_length)
|
||||
my_uca_scanner_contraction_hash_find(my_uca_scanner *scanner,
|
||||
my_wc_t currwc,
|
||||
size_t max_char_length)
|
||||
{
|
||||
size_t clen= 1;
|
||||
int flag;
|
||||
|
@ -31738,8 +31892,8 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc,
|
|||
const MY_CONTRACTION *cnt;
|
||||
if (my_uca_can_be_contraction_tail(&scanner->level->contractions,
|
||||
wc[clen - 1]) &&
|
||||
(cnt= my_uca_contraction_find(&scanner->level->contractions,
|
||||
wc, clen)))
|
||||
(cnt= my_uca_contraction_hash_find(&scanner->level->contraction_hash,
|
||||
wc, clen)))
|
||||
{
|
||||
scanner->sbeg= beg[clen - 1];
|
||||
return cnt;
|
||||
|
@ -31752,7 +31906,6 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc,
|
|||
|
||||
/**
|
||||
Find weight for contraction with previous context
|
||||
and return its weight array.
|
||||
|
||||
@param scanner Pointer to UCA scanner
|
||||
@param wc0 Previous character
|
||||
|
@ -31827,8 +31980,9 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
|
|||
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
|
||||
currwc))
|
||||
{
|
||||
/* Check if currwc starts a contraction */
|
||||
if ((cnt= my_uca_scanner_contraction_find(scanner, currwc, max_char_length)))
|
||||
/* Check if w[0] starts a contraction */
|
||||
if ((cnt= my_uca_scanner_contraction_hash_find(scanner, currwc,
|
||||
max_char_length)))
|
||||
return cnt;
|
||||
}
|
||||
return NULL;
|
||||
|
@ -33887,6 +34041,20 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules,
|
|||
item->with_context);
|
||||
memcpy(weights, item->weight, sizeof(item->weight));
|
||||
}
|
||||
|
||||
if (ncontractions)
|
||||
{
|
||||
if (ncontractions > MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS ||
|
||||
my_uca_contraction_hash_allocate(&dst->contraction_hash, loader) ||
|
||||
my_uca_contraction_hash_populate(&dst->contraction_hash,
|
||||
&dst->contractions))
|
||||
{
|
||||
my_snprintf(loader->error, sizeof(loader->error),
|
||||
"Can't initialize %d contractions", (int) ncontractions);
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue