Fixed LP bug #670380.

Lifted the limitation that hash join could not be used over 
varchar fields with non-binary collation.
This commit is contained in:
Igor Babaev 2010-12-22 00:37:35 -08:00
parent 4f28dcbe32
commit a095346a9d
9 changed files with 469 additions and 28 deletions

View file

@ -6,7 +6,7 @@ CREATE TABLE Country (
Capital int(11) default NULL,
PRIMARY KEY (Code),
UNIQUE INDEX (Name)
) COLLATE latin1_bin;
);
CREATE TABLE City (
ID int(11) NOT NULL auto_increment,
Name char(35) NOT NULL default '',
@ -15,11 +15,11 @@ CREATE TABLE City (
PRIMARY KEY (ID),
INDEX (Population),
INDEX (Country)
) COLLATE latin1_bin;
);
CREATE TABLE CountryLanguage (
Country char(3) NOT NULL default '',
Language char(30) NOT NULL default '',
Percentage float(3,1) NOT NULL default '0.0',
PRIMARY KEY (Country, Language),
INDEX (Percentage)
) COLLATE latin1_bin;
);

View file

@ -4,15 +4,15 @@ CREATE TABLE Country (
SurfaceArea float(10,2) NOT NULL default '0.00',
Population int(11) NOT NULL default '0',
Capital int(11) default NULL
) COLLATE latin1_bin;
);
CREATE TABLE City (
ID int(11) NOT NULL,
Name char(35) NOT NULL default '',
Country char(3) NOT NULL default '',
Population int(11) NOT NULL default '0'
) COLLATE latin1_bin;
);
CREATE TABLE CountryLanguage (
Country char(3) NOT NULL default '',
Language char(30) NOT NULL default '',
Percentage float(3,1) NOT NULL default '0.0'
) COLLATE latin1_bin;
);

View file

@ -12,18 +12,18 @@ Name char(52) NOT NULL default '',
SurfaceArea float(10,2) NOT NULL default '0.00',
Population int(11) NOT NULL default '0',
Capital int(11) default NULL
) COLLATE latin1_bin;
);
CREATE TABLE City (
ID int(11) NOT NULL,
Name char(35) NOT NULL default '',
Country char(3) NOT NULL default '',
Population int(11) NOT NULL default '0'
) COLLATE latin1_bin;
);
CREATE TABLE CountryLanguage (
Country char(3) NOT NULL default '',
Language char(30) NOT NULL default '',
Percentage float(3,1) NOT NULL default '0.0'
) COLLATE latin1_bin;
);
SELECT COUNT(*) FROM Country;
COUNT(*)
239
@ -810,7 +810,7 @@ Population int(11) NOT NULL default '0',
Capital int(11) default NULL,
PRIMARY KEY (Code),
UNIQUE INDEX (Name)
) COLLATE latin1_bin;
);
CREATE TABLE City (
ID int(11) NOT NULL auto_increment,
Name char(35) NOT NULL default '',
@ -819,14 +819,14 @@ Population int(11) NOT NULL default '0',
PRIMARY KEY (ID),
INDEX (Population),
INDEX (Country)
) COLLATE latin1_bin;
);
CREATE TABLE CountryLanguage (
Country char(3) NOT NULL default '',
Language char(30) NOT NULL default '',
Percentage float(3,1) NOT NULL default '0.0',
PRIMARY KEY (Country, Language),
INDEX (Percentage)
) COLLATE latin1_bin;
);
show variables like 'join_buffer_size';
Variable_name Value
join_buffer_size 131072
@ -5582,7 +5582,7 @@ EXPLAIN
SELECT t2.i FROM t1,t2 WHERE t1.cu = t2.cl ;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 6
1 SIMPLE t1 ref cu cu 33 func 2 Using where; Using index
1 SIMPLE t1 ref cu cu 33 func 2 Using where; Using index; Using join buffer (flat, BNLH join)
SELECT t2.i FROM t1,t2 WHERE t1.cu = t2.cl ;
i
6
@ -6056,4 +6056,35 @@ a4 b5
SET SESSION optimizer_switch = 'outer_join_with_cache=off';
SET SESSION join_cache_level = DEFAULT;
DROP TABLE t1,t2,t3,t4,t5;
#
# Bug #670380: hash join for non-binary collation
#
CREATE TABLE t1 (pk int PRIMARY KEY, a varchar(32));
CREATE TABLE t2 (pk int PRIMARY KEY, a varchar(32), INDEX idx(a));
INSERT INTO t1 VALUES
(10,'AAA'), (20,'BBBB'), (30,'Cc'), (40,'DD'), (50,'ee');
INSERT INTO t2 VALUES
(1,'Bbbb'), (2,'BBB'), (3,'bbbb'), (4,'AaA'), (5,'CC'),
(6,'cC'), (7,'CCC'), (8,'AAA'), (9,'bBbB'), (10,'aaaa'),
(11,'a'), (12,'dd'), (13,'EE'), (14,'ee'), (15,'D');
SET SESSION join_cache_level = 4;
EXPLAIN
SELECT * FROM t1,t2 WHERE t1.a=t2.a;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 Using where
1 SIMPLE t2 ref idx idx 35 test.t1.a 2 Using join buffer (flat, BNLH join)
SELECT * FROM t1,t2 WHERE t1.a=t2.a;
pk a pk a
20 BBBB 1 Bbbb
20 BBBB 3 bbbb
10 AAA 4 AaA
30 Cc 5 CC
30 Cc 6 cC
10 AAA 8 AAA
20 BBBB 9 bBbB
40 DD 12 dd
50 ee 13 EE
50 ee 14 ee
SET SESSION join_cache_level = DEFAULT;
DROP TABLE t1,t2;
set @@optimizer_switch=@save_optimizer_switch;

View file

@ -2700,5 +2700,28 @@ SET SESSION join_cache_level = DEFAULT;
DROP TABLE t1,t2,t3,t4,t5;
--echo #
--echo # Bug #670380: hash join for non-binary collation
--echo #
CREATE TABLE t1 (pk int PRIMARY KEY, a varchar(32));
CREATE TABLE t2 (pk int PRIMARY KEY, a varchar(32), INDEX idx(a));
INSERT INTO t1 VALUES
(10,'AAA'), (20,'BBBB'), (30,'Cc'), (40,'DD'), (50,'ee');
INSERT INTO t2 VALUES
(1,'Bbbb'), (2,'BBB'), (3,'bbbb'), (4,'AaA'), (5,'CC'),
(6,'cC'), (7,'CCC'), (8,'AAA'), (9,'bBbB'), (10,'aaaa'),
(11,'a'), (12,'dd'), (13,'EE'), (14,'ee'), (15,'D');
SET SESSION join_cache_level = 4;
EXPLAIN
SELECT * FROM t1,t2 WHERE t1.a=t2.a;
SELECT * FROM t1,t2 WHERE t1.a=t2.a;
SET SESSION join_cache_level = DEFAULT;
DROP TABLE t1,t2;
# this must be the last command in the file
set @@optimizer_switch=@save_optimizer_switch;

View file

@ -593,6 +593,7 @@ public:
/* Check whether the field can be used as a join attribute in hash join */
virtual bool hash_join_is_possible() { return TRUE; }
virtual bool eq_cmp_as_binary() { return TRUE; }
friend bool reopen_table(THD *,struct st_table *,bool);
friend int cre_myisam(char * name, register TABLE *form, uint options,
@ -769,12 +770,7 @@ public:
my_decimal *val_decimal(my_decimal *);
virtual bool str_needs_quotes() { return TRUE; }
uint is_equal(Create_field *new_field);
bool hash_join_is_possible()
{
/* TODO: support hash joins for non-binary collations */
return (flags & BINARY_FLAG);
}
bool eq_cmp_as_binary() { return test(flags & BINARY_FLAG); }
};

View file

@ -567,3 +567,254 @@ next_loop:
} while (key_info); /* no more keys to test */
DBUG_RETURN(0);
}
/**
  Compute a hash value for a key stored in a key buffer

  @param key_info        the key descriptor
  @param used_key_parts  number of leading key parts that make up the key
  @param key             pointer to the buffer with the key value

  @details
    The buffer is walked key part by key part. Two kinds of components
    need special treatment:
    1. Nullable components: they are preceded by one null-indicator byte.
       A NULL component only perturbs the hash; its value bytes are skipped.
    2. String components: they are hashed with the hash_sort() method of
       their collation (the binary collation for binary types), and for
       variable-length types the 1- or 2-byte length prefix is honoured.
    Any other component is hashed byte by byte.

  @return hash value calculated for the key
*/

ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key)
{
  ulong hash= 1, hash_aux= 4;
  KEY_PART_INFO *part= key_info->key_part;
  KEY_PART_INFO *const parts_end= part + used_key_parts;

  for ( ; part < parts_end; part++)
  {
    /* 'seg' marks the start of this component, 'key' is moved past it */
    uchar *seg= (uchar*) key;
    CHARSET_INFO *cs;
    uint length, pack_length;
    bool is_string= TRUE;
    bool binary_cs= FALSE;
    LINT_INIT(cs);

    key+= part->length;

    if (part->null_bit)
    {
      key++;                                    /* Account for the null byte */
      if (*seg)
      {
        /* The component is NULL: mix that fact in and skip the value */
        hash^= (hash << 1) | 1;
        /* Variable-length segments also carry a length prefix to skip */
        if (part->type == HA_KEYTYPE_VARTEXT1 ||
            part->type == HA_KEYTYPE_VARBINARY1)
          key++;
        else if (part->type == HA_KEYTYPE_VARTEXT2 ||
                 part->type == HA_KEYTYPE_VARBINARY2)
          key+= 2;
        continue;
      }
      seg++;                                    /* Step over the null byte */
    }

    /* Work out where the string data lives and how long it is */
    switch (part->type) {
    case HA_KEYTYPE_BINARY:
      binary_cs= TRUE;
      /* fall through */
    case HA_KEYTYPE_TEXT:
      length= part->length;
      pack_length= 0;
      break;
    case HA_KEYTYPE_VARBINARY1:
      binary_cs= TRUE;
      /* fall through */
    case HA_KEYTYPE_VARTEXT1:
      length= (uint)(seg[0]);
      pack_length= 1;
      break;
    case HA_KEYTYPE_VARBINARY2:
      binary_cs= TRUE;
      /* fall through */
    case HA_KEYTYPE_VARTEXT2:
      length= uint2korr(seg);
      pack_length= 2;
      break;
    default:
      is_string= FALSE;
    }

    if (!is_string)
    {
      /* Not a string: fold in every byte of the component as is */
      while (seg < (uchar*) key)
      {
        hash^= (ulong) ((((uint) hash & 63) + hash_aux) * ((uint) *seg)) +
               (hash << 8);
        hash_aux+= 3;
        seg++;
      }
      continue;
    }

    if (binary_cs)
      cs= &my_charset_bin;
    else
      cs= part->field->charset();

    if (cs->mbmaxlen > 1)
    {
      /* Multi-byte character set: limit the length by character position */
      uint char_length= my_charpos(cs, seg + pack_length,
                                   seg + pack_length + length,
                                   length / cs->mbmaxlen);
      set_if_smaller(length, char_length);
    }
    cs->coll->hash_sort(cs, seg + pack_length, length, &hash, &hash_aux);
    key+= pack_length;                          /* Skip the length prefix */
  }
  DBUG_PRINT("exit", ("hash: %lx", hash));
  return hash;
}
/**
  Check whether two keys in the key buffers are equal

  @param key_info        the key descriptor
  @param used_key_parts  number of key parts used for the keys
  @param key1            pointer to the buffer with the first key
  @param key2            pointer to the buffer with the second key

  @details
    Both buffers are walked in parallel, key part by key part.
    - For a nullable component the leading null-indicator byte is checked
      first: two NULL components are considered equal, while a NULL and
      a non-NULL component make the keys different.
    - String components are compared with the strnncollsp() method of
      their collation (the binary collation for binary types); for
      variable-length types the 1- or 2-byte length prefix is honoured.
    - All other components are compared byte by byte.

  @retval TRUE  keys in the buffers are NOT equal
  @retval FALSE keys in the buffers are equal
*/

bool key_buf_cmp(KEY *key_info, uint used_key_parts,
                 const uchar *key1, const uchar *key2)
{
  KEY_PART_INFO *key_part= key_info->key_part;
  KEY_PART_INFO *end_key_part= key_part + used_key_parts;

  for (; key_part < end_key_part; key_part++)
  {
    /* pos1/pos2 mark the start of the component, key1/key2 its end */
    uchar *pos1= (uchar*)key1;
    uchar *pos2= (uchar*)key2;
    CHARSET_INFO *cs;
    uint length1, length2, pack_length;
    bool is_string= TRUE;
    LINT_INIT(cs);

    key1+= key_part->length;
    key2+= key_part->length;
    if (key_part->null_bit)
    {
      key1++; key2++;                            /* Skip null byte */
      if (*pos1 && *pos2)                        /* Both are null */
      {
        /* Add key pack length to key for VARCHAR segments */
        switch (key_part->type) {
        case HA_KEYTYPE_VARTEXT1:
        case HA_KEYTYPE_VARBINARY1:
          key1++; key2++;
          break;
        case HA_KEYTYPE_VARTEXT2:
        case HA_KEYTYPE_VARBINARY2:
          key1+= 2; key2+= 2;
          break;
        default:
          ;
        }
        continue;
      }
      /*
        Exactly one of the components is NULL: the keys differ, and
        by the contract of this function "differ" is reported as TRUE.
      */
      if (*pos1 != *pos2)
        return TRUE;
      pos1++; pos2++;
    }

    /* If it is string set parameters of the string */
    switch (key_part->type) {
    case HA_KEYTYPE_TEXT:
      cs= key_part->field->charset();
      length1= length2= key_part->length;
      pack_length= 0;
      break;
    case HA_KEYTYPE_BINARY :
      cs= &my_charset_bin;
      length1= length2= key_part->length;
      pack_length= 0;
      break;
    case HA_KEYTYPE_VARTEXT1:
      cs= key_part->field->charset();
      length1= (uint)(pos1[0]);
      length2= (uint)(pos2[0]);
      pack_length= 1;
      break;
    case HA_KEYTYPE_VARBINARY1:
      cs= &my_charset_bin;
      length1= (uint)(pos1[0]);
      length2= (uint)(pos2[0]);
      pack_length= 1;
      break;
    case HA_KEYTYPE_VARTEXT2:
      cs= key_part->field->charset();
      length1= uint2korr(pos1);
      length2= uint2korr(pos2);
      pack_length= 2;
      break;
    case HA_KEYTYPE_VARBINARY2:
      cs= &my_charset_bin;
      length1= uint2korr(pos1);
      length2= uint2korr(pos2);
      pack_length= 2;
      break;
    default:
      is_string= FALSE;
    }

    if (is_string)
    {
      /*
        Compare the strings taking into account length in characters
        and collation
      */
      uint byte_len1= length1, byte_len2= length2;
      if (cs->mbmaxlen > 1)
      {
        uint char_length1= my_charpos(cs, pos1 + pack_length,
                                      pos1 + pack_length + length1,
                                      length1 / cs->mbmaxlen);
        uint char_length2= my_charpos(cs, pos2 + pack_length,
                                      pos2 + pack_length + length2,
                                      length2 / cs->mbmaxlen);
        set_if_smaller(length1, char_length1);
        set_if_smaller(length2, char_length2);
      }
      /*
        NOTE(review): values whose (adjusted) lengths differ are reported
        as different before the collation is consulted; confirm this is
        intended for values that differ only in trailing spaces.
      */
      if (length1 != length2 ||
          cs->coll->strnncollsp(cs,
                                pos1 + pack_length, byte_len1,
                                pos2 + pack_length, byte_len2,
                                1))
        return TRUE;
      key1+= pack_length; key2+= pack_length;    /* Skip length prefixes */
    }
    else
    {
      /* it is OK to compare non-string byte per byte */
      for (; pos1 < (uchar*)key1 ; pos1++, pos2++)
      {
        if (pos1[0] != pos2[0])
          return TRUE;
      }
    }
  }
  return FALSE;
}

View file

@ -1879,6 +1879,9 @@ bool key_cmp_if_same(TABLE *form,const uchar *key,uint index,uint key_length);
void key_unpack(String *to,TABLE *form,uint index);
bool is_key_used(TABLE *table, uint idx, const MY_BITMAP *fields);
int key_cmp(KEY_PART_INFO *key_part, const uchar *key, uint key_length);
ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key);
bool key_buf_cmp(KEY *key_info, uint used_key_parts,
const uchar *key1, const uchar *key2);
extern "C" int key_rec_cmp(void *key_info, uchar *a, uchar *b);
bool init_errmessage(void);

View file

@ -2520,6 +2520,24 @@ int JOIN_CACHE_HASHED::init()
pack_length+= get_size_of_rec_offset();
pack_length_with_blob_ptrs+= get_size_of_rec_offset();
ref_key_info= join_tab->table->key_info+join_tab->ref.key;
ref_used_key_parts= join_tab->ref.key_parts;
hash_func= &JOIN_CACHE_HASHED::get_hash_idx_simple;
hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_simple;
KEY_PART_INFO *key_part= ref_key_info->key_part;
KEY_PART_INFO *key_part_end= key_part+ref_used_key_parts;
for ( ; key_part < key_part_end; key_part++)
{
if (!key_part->field->eq_cmp_as_binary())
{
hash_func= &JOIN_CACHE_HASHED::get_hash_idx_complex;
hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_complex;
break;
}
}
init_hash_table();
rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+
@ -2903,7 +2921,7 @@ bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
uchar **key_ref_ptr)
{
bool is_found= FALSE;
uint idx= get_hash_idx(key, key_length);
uint idx= (this->*hash_func)(key, key_length);
uchar *ref_ptr= hash_table+size_of_key_ofs*idx;
while (!is_null_key_ref(ref_ptr))
{
@ -2912,7 +2930,7 @@ bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
next_key= use_emb_key ? get_emb_key(ref_ptr-get_size_of_rec_offset()) :
ref_ptr-key_length;
if (memcmp(next_key, key, key_len) == 0)
if ((this->*hash_cmp_func)(next_key, key, key_len))
{
is_found= TRUE;
break;
@ -2924,22 +2942,24 @@ bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
/*
Calclulate hash value for a key in the hash table of the join buffer
Hash function that considers a key in the hash table as byte array
SYNOPSIS
get_hash_idx()
get_hash_idx_simple()
key pointer to the key value
key_len key value length
DESCRIPTION
The function calculates an index of the hash entry in the hash table
of the join buffer for the given key
of the join buffer for the given key. It considers the key just as
a sequence of bytes of the length key_len.
RETURN VALUE
the calculated index of the hash entry for the given key.
the calculated index of the hash entry for the given key
*/
uint JOIN_CACHE_HASHED::get_hash_idx(uchar* key, uint key_len)
inline
uint JOIN_CACHE_HASHED::get_hash_idx_simple(uchar* key, uint key_len)
{
ulong nr= 1;
ulong nr2= 4;
@ -2954,6 +2974,93 @@ uint JOIN_CACHE_HASHED::get_hash_idx(uchar* key, uint key_len)
}
/*
  Collation-aware hash function for keys in the join buffer hash table

  SYNOPSIS
    get_hash_idx_complex()
      key         pointer to the key value
      key_len     key value length (not used by this function)

  DESCRIPTION
    The function maps the given key to an index of an entry in the hash
    table of the join buffer. The hash value is obtained from key_hashnr(),
    which is given the index descriptor ref_key_info and the number of
    used key parts ref_used_key_parts, both members of this class.
    This lets the components of the key be hashed according to their
    types and collations, so that two equal keys get the same hash value
    even if they differ as byte sequences.
    The hash value is then reduced modulo hash_entries.

  RETURN VALUE
    the calculated index of the hash entry for the given key
*/

inline
uint JOIN_CACHE_HASHED::get_hash_idx_complex(uchar *key, uint key_len)
{
  ulong hash_value= key_hashnr(ref_key_info, ref_used_key_parts, key);
  return (uint) (hash_value % hash_entries);
}
/*
  Byte-wise equality check for two key entries in the hash table

  SYNOPSIS
    equal_keys_simple()
      key1        pointer to the first key entry
      key2        pointer to the second key entry
      key_len     the length of the key values

  DESCRIPTION
    The function treats the key entries key1 and key2 as two sequences
    of bytes of the length key_len and compares them with memcmp().

  RETURN VALUE
    TRUE    key1 coincides with key2
    FALSE   otherwise
*/

inline
bool JOIN_CACHE_HASHED::equal_keys_simple(uchar *key1, uchar *key2,
                                          uint key_len)
{
  return !memcmp(key1, key2, key_len);
}
/*
  Collation-aware equality check for two key entries in the hash table

  SYNOPSIS
    equal_keys_complex()
      key1        pointer to the first key entry
      key2        pointer to the second key entry
      key_len     the length of the key values (not used by this function)

  DESCRIPTION
    The function checks whether the key entries key1 and key2 are equal
    as, possibly, compound keys whose components may be of a varchar
    type and may employ different collations.
    The comparison is delegated to key_buf_cmp(), which is given the
    descriptor of the key structure from the class member ref_key_info
    and the number of used key parts from ref_used_key_parts.
    Note that key_buf_cmp() returns a non-zero value when the keys
    differ, hence the negation.

  RETURN VALUE
    TRUE    key1 is equal to key2
    FALSE   otherwise
*/

inline
bool JOIN_CACHE_HASHED::equal_keys_complex(uchar *key1, uchar *key2,
                                           uint key_len)
{
  return !key_buf_cmp(ref_key_info, ref_used_key_parts, key1, key2);
}
/*
Clean up the hash table of the join buffer

View file

@ -738,6 +738,10 @@ public:
class JOIN_CACHE_HASHED: public JOIN_CACHE
{
typedef uint (JOIN_CACHE_HASHED::*Hash_func) (uchar *key, uint key_len);
typedef bool (JOIN_CACHE_HASHED::*Hash_cmp_func) (uchar *key1, uchar *key2,
uint key_len);
private:
/* Size of the offset of a key entry in the hash table */
@ -761,14 +765,40 @@ private:
/* The offset of the data fields from the beginning of the record fields */
uint data_fields_offset;
uint get_hash_idx(uchar* key, uint key_len);
inline uint get_hash_idx_simple(uchar *key, uint key_len);
inline uint get_hash_idx_complex(uchar *key, uint key_len);
inline bool equal_keys_simple(uchar *key1, uchar *key2, uint key_len);
inline bool equal_keys_complex(uchar *key1, uchar *key2, uint key_len);
int init_hash_table();
void cleanup_hash_table();
protected:
/*
Index info on the TABLE_REF object used by the hash join
to look for matching records
*/
KEY *ref_key_info;
/*
Number of key parts of the index that the TABLE_REF object used by
the hash join employs to look for matching records
*/
uint ref_used_key_parts;
/*
The hash function used in the hash table,
usually set by the init() method
*/
Hash_func hash_func;
/*
The function to check whether two key entries in the hash table
are equal or not, usually set by the init() method
*/
Hash_cmp_func hash_cmp_func;
/*
Length of a key value.
It is assumed that all key values have the same length.