From 7dbdedcb72afa53f8c2d619e32376a1897bc257b Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 18 Dec 2003 06:08:00 +0300 Subject: [PATCH] Precise read time estimates for index_merge/Unique --- mysql-test/r/index_merge.result | 2 +- mysql-test/t/index_merge.test | 2 +- sql/filesort.cc | 8 +- sql/ha_berkeley.h | 2 +- sql/ha_innodb.cc | 3 +- sql/ha_innodb.h | 2 +- sql/handler.h | 4 +- sql/mysql_priv.h | 20 ++ sql/opt_range.cc | 493 ++++++++++++++++++++++---------- sql/opt_range.h | 96 ++++--- sql/records.cc | 1 - sql/sql_class.h | 3 +- sql/uniques.cc | 182 ++++++++++++ 13 files changed, 606 insertions(+), 212 deletions(-) diff --git a/mysql-test/r/index_merge.result b/mysql-test/r/index_merge.result index 2abdcf202d9..97d46bf39ea 100644 --- a/mysql-test/r/index_merge.result +++ b/mysql-test/r/index_merge.result @@ -1,4 +1,4 @@ -drop table if exists t0, t1, t2, t3; +drop table if exists t0, t1, t2, t3,t4; create table t0 ( key1 int not null, diff --git a/mysql-test/t/index_merge.test b/mysql-test/t/index_merge.test index 79eda7c1f56..029c2e4a013 100644 --- a/mysql-test/t/index_merge.test +++ b/mysql-test/t/index_merge.test @@ -3,7 +3,7 @@ # --disable_warnings -drop table if exists t0, t1, t2, t3; +drop table if exists t0, t1, t2, t3,t4; --enable_warnings # Create and fill a table with simple keys diff --git a/sql/filesort.cc b/sql/filesort.cc index 356afdf748c..27c59a05941 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -88,9 +88,9 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length, #endif FILESORT_INFO table_sort; /* - don't use table->sort in filesort as it is also used by - QUICK_INDEX_MERGE_SELECT. work with a copy of it and put it back at the - end when index_merge select has finished with it. + Don't use table->sort in filesort as it is also used by + QUICK_INDEX_MERGE_SELECT. Work with a copy and put it back at the end + when index_merge select has finished with it. 
*/ memcpy(&table_sort, &table->sort, sizeof(FILESORT_INFO)); table->sort.io_cache= NULL; @@ -452,7 +452,7 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select, if (quick_select) { /* - index_merge quick select uses table->sort when retrieving rows, so free + index_merge quick select uses table->sort when retrieving rows, so free resoures it has allocated. */ end_read_record(&read_record_info); diff --git a/sql/ha_berkeley.h b/sql/ha_berkeley.h index 582a79906a7..f225c24eaf7 100644 --- a/sql/ha_berkeley.h +++ b/sql/ha_berkeley.h @@ -167,7 +167,7 @@ class ha_berkeley: public handler longlong get_auto_increment(); void print_error(int error, myf errflag); uint8 table_cache_type() { return HA_CACHE_TBL_TRANSACT; } - bool primary_key_is_clustered_covering() { return true; } + bool primary_key_is_clustered() { return true; } }; extern bool berkeley_skip, berkeley_shared_data; diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index b92a5ff8c3f..d949f8bcf9c 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -2003,7 +2003,8 @@ build_template( update field->query_id so that the formula thd->query_id == field->query_id did not work. 
*/ - ibool index_contains_field = dict_index_contains_col_or_prefix(index, i); + ibool index_contains_field= + dict_index_contains_col_or_prefix(index, i); if (templ_type == ROW_MYSQL_REC_FIELDS && ((prebuilt->read_just_key && !index_contains_field) || diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h index 6fa66377cd6..c305a019fcd 100644 --- a/sql/ha_innodb.h +++ b/sql/ha_innodb.h @@ -187,7 +187,7 @@ class ha_innobase: public handler void init_table_handle_for_HANDLER(); longlong get_auto_increment(); uint8 table_cache_type() { return HA_CACHE_TBL_ASKTRANSACT; } - bool primary_key_is_clustered_covering() { return true; } + bool primary_key_is_clustered() { return true; } }; extern bool innodb_skip; diff --git a/sql/handler.h b/sql/handler.h index 0bbaba81f96..2ad37233c9e 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -378,10 +378,10 @@ public: /* RETURN - true primary key (if there is one) is clustered key covering all fields + true Primary key (if there is one) is clustered key covering all fields false otherwise */ - virtual bool primary_key_is_clustered_covering() { return false; } + virtual bool primary_key_is_clustered() { return false; } }; /* Some extern variables used with handlers */ diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index 3ace72ea24c..2b25f501a37 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -118,6 +118,26 @@ extern CHARSET_INFO *national_charset_info, *table_alias_charset; */ #define TIME_FOR_COMPARE 5 // 5 compares == one read +/* + Number of comparisons of table rowids equivalent to reading one row from a + table. +*/ +#define TIME_FOR_COMPARE_ROWID (TIME_FOR_COMPARE*2) + +/* + For sequential disk seeks the cost formula is: + DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST * #blocks_to_skip + + The cost of average seek + DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK =1.0. 
+*/ +#define DISK_SEEK_BASE_COST ((double)0.5) + +#define BLOCKS_IN_AVG_SEEK 128 + +#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK) + + /* Number of rows in a reference table when refereed through a not unique key. This value is only used when we don't know anything about the key diff --git a/sql/opt_range.cc b/sql/opt_range.cc index 98002cc5b7a..e3d0f8624b9 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -307,12 +307,18 @@ static ha_rows check_quick_keys(PARAM *param,uint index,SEL_ARG *key_tree, QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index, SEL_ARG *key_tree, MEM_ROOT *alloc = NULL); -static int get_quick_select_params(SEL_TREE *tree, PARAM& param, - key_map& needed_reg, TABLE *head, +static int get_quick_select_params(SEL_TREE *tree, PARAM *param, + key_map& needed_reg, bool index_read_can_be_used, - double* read_time, - ha_rows* records, + double *read_time, + ha_rows *records, SEL_ARG*** key_to_read); +static int get_index_merge_params(PARAM *param, key_map& needed_reg, + SEL_IMERGE *imerge, double *read_time, + ha_rows* imerge_rows); +inline double get_index_only_read_time(PARAM* param, ha_rows records, + int keynr); + #ifndef DBUG_OFF static void print_quick_sel_imerge(QUICK_INDEX_MERGE_SELECT *quick, const key_map *needed_reg); @@ -453,7 +459,7 @@ int SEL_IMERGE::or_sel_tree_with_checks(PARAM *param, SEL_TREE *new_tree) } } - /* new tree cannot be combined with any of existing trees */ + /* New tree cannot be combined with any of existing trees. */ return or_sel_tree(param, new_tree); } @@ -483,7 +489,6 @@ int SEL_IMERGE::or_sel_imerge_with_checks(PARAM *param, SEL_IMERGE* imerge) /* Perform AND operation on two index_merge lists and store result in *im1. - */ inline void imerge_list_and_list(List *im1, List *im2) @@ -503,18 +508,16 @@ inline void imerge_list_and_list(List *im1, List *im2) i.e. all conjuncts except the first one are currently dropped. This is done to avoid producing N*K ways to do index_merge. 
- If (a_1||b_1) produce a condition that is always true, NULL is - returned and index_merge is discarded. (while it is actually - possible to try harder). + If (a_1||b_1) produce a condition that is always true, NULL is returned + and index_merge is discarded (while it is actually possible to try + harder). - As a consequence of this, choice of keys to do index_merge - read may depend on the order of conditions in WHERE part of - the query. + As a consequence of this, choice of keys to do index_merge read may depend + on the order of conditions in WHERE part of the query. RETURN - 0 OK, result is stored in *im1 + 0 OK, result is stored in *im1 other Error, both passed lists are unusable - */ int imerge_list_or_list(PARAM *param, @@ -533,7 +536,7 @@ int imerge_list_or_list(PARAM *param, Perform OR operation on index_merge list and key tree. RETURN - 0 OK, result is stored in *im1 + 0 OK, result is stored in *im1. other Error */ @@ -685,10 +688,10 @@ bool QUICK_INDEX_MERGE_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range) { /* - Save quick_select that does scan on clustered covering primary key as - it will be processed separately + Save quick_select that does scan on clustered primary key as it will be + processed separately. 
*/ - if (head->file->primary_key_is_clustered_covering() && + if (head->file->primary_key_is_clustered() && quick_sel_range->index == head->primary_key) pk_quick_select= quick_sel_range; else @@ -1001,7 +1004,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, ha_rows found_records; double found_read_time= read_time; - if (!get_quick_select_params(tree, param, needed_reg, head, true, + if (!get_quick_select_params(tree, ¶m, needed_reg, true, &found_read_time, &found_records, &best_key)) { @@ -1021,120 +1024,57 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, } /* - btw, tree type SEL_TREE::INDEX_MERGE was not introduced - intentionally + Btw, tree type SEL_TREE::INDEX_MERGE was not introduced + intentionally. */ - /* if no range select could be built, try using index_merge */ + /* If no range select could be built, try using index_merge. */ if (!quick && !tree->merges.is_empty()) { DBUG_PRINT("info",("No range reads possible," " trying to construct index_merge")); SEL_IMERGE *imerge; SEL_IMERGE *min_imerge= NULL; - double min_imerge_cost= DBL_MAX; + double min_imerge_read_time; ha_rows min_imerge_records; + if (!head->used_keys.is_clear_all()) + { + int key_for_use= find_shortest_key(head, &head->used_keys); + ha_rows total_table_records= (0 == head->file->records)? 1 : + head->file->records; + read_time = get_index_only_read_time(¶m, total_table_records, + key_for_use); + DBUG_PRINT("info", + ("'all' scan will be using key %d, read time %g", + key_for_use, read_time)); + } + + min_imerge_read_time=read_time; + /* + Ok, read_time contains best 'all' read time. 
+ Now look for index_merge with cost < read_time + */ List_iterator_fast it(tree->merges); - /* find index_merge with minimal cost */ while ((imerge= it++)) { - bool imerge_failed= false; - double imerge_cost= 0; - ha_rows imerge_total_records= 0; - double tree_read_time; - ha_rows tree_records; - imerge->best_keys= - (SEL_ARG***)alloc_root(&alloc, - (imerge->trees_next - imerge->trees)* - sizeof(void*)); - /* - It may be possible to use different keys for index_merge, e.g for - queries like - ...WHERE (key1 < c2 AND key2 < c2) OR (key3 < c3 AND key4 < c4) - We assume we get the best index_merge if we choose the best key - read inside each of the conjuncts. - */ - for (SEL_TREE **ptree= imerge->trees; - ptree != imerge->trees_next; - ptree++) - { - tree_read_time= read_time; - if (get_quick_select_params(*ptree, param, needed_reg, head, - false, - &tree_read_time, &tree_records, - &(imerge->best_keys[ptree - - imerge->trees]))) - imerge_failed= true; - imerge_cost += tree_read_time; - imerge_total_records += tree_records; - } - - if (!imerge_failed) - { - imerge_total_records= min(imerge_total_records, - head->file->records); - imerge_cost += imerge_total_records / TIME_FOR_COMPARE; - if (imerge_cost < min_imerge_cost) - { - min_imerge= imerge; - min_imerge_cost= imerge_cost; - min_imerge_records= imerge_total_records; - } - } + if (!get_index_merge_params(¶m, needed_reg, imerge, + &min_imerge_read_time, + &min_imerge_records)) + min_imerge= imerge; } if (!min_imerge) goto end_free; records= min_imerge_records; - /* - Ok, got minimal index merge, *min_imerge, with cost min_imerge_cost - Compare its cost with "all" scan cost (or "all+using index" if - it is possible) and choose the best. - */ - - if (!head->used_keys.is_clear_all()) - { - /* check if "ALL" +"using index" read would be faster */ - int key_for_use= find_shortest_key(head, &head->used_keys); - ha_rows total_table_records= (0 == head->file->records)? 
1 : - head->file->records; - uint keys_per_block= (head->file->block_size/2/ - (head->key_info[key_for_use].key_length+ - head->file->ref_length) + 1); - double all_index_scan_read_time= ((double)(total_table_records+ - keys_per_block-1)/ - (double) keys_per_block); - - DBUG_PRINT("info", - ("'all' scan will be using key %d, read time %g", - key_for_use, all_index_scan_read_time)); - if (all_index_scan_read_time < min_imerge_cost) - { - DBUG_PRINT("info", - ("index merge would be slower, " - "will do full 'index' scan")); - goto end_free; - } - } - else - { - /* check if "ALL" would be faster */ - if (read_time < min_imerge_cost) - { - DBUG_PRINT("info", - ("index merge would be slower, " - "will do full table scan")); - goto end_free; - } - } - + + /* Ok, using index_merge is the best option, so construct it. */ if (!(quick= quick_imerge= new QUICK_INDEX_MERGE_SELECT(thd, head))) goto end_free; quick->records= min_imerge_records; - quick->read_time= min_imerge_cost; + quick->read_time= min_imerge_read_time; my_pthread_setspecific_ptr(THR_MALLOC, &quick_imerge->alloc); @@ -1152,10 +1092,10 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, &quick_imerge->alloc))) { new_quick->records= min_imerge_records; - new_quick->read_time= min_imerge_cost; + new_quick->read_time= min_imerge_read_time; /* QUICK_RANGE_SELECT::QUICK_RANGE_SELECT leaves THR_MALLOC - pointing to its allocator, restore it back + pointing to its allocator, restore it back. */ quick_imerge->last_quick_select= new_quick; @@ -1214,15 +1154,265 @@ end: /* - Calculate quick range select read time, # of records, and best key to use - without constructing QUICK_RANGE_SELECT object. + Calculate index merge cost and save parameters for its construction. + + SYNOPSIS + get_index_merge_params() + param in parameter with structure. 
+ needed_reg in/out needed_reg from this SQL_SELECT + imerge in index_merge description structure + read_time in/out in: cost of an existing way to read a table + out: cost of index merge + imerge_rows out pessimistic estimate of # of rows to be retrieved + + RETURN + 0 Cost of this index_merge is less than passed *read_time, + *imerge_rows and *read_time contain new index_merge parameters. + 1 Cost of this index_merge is more than *read_time, + *imerge_rows and *read_time are not modified. + -1 error + + NOTES + index_merge_cost = + cost(index_reads) + (see #1) + cost(rowid_to_row_scan) + (see #2) + cost(unique_use) (see #3) + + 1. cost(index_reads) =SUM_i(cost(index_read_i)) + For non-CPK scans, + cost(index_read_i) = {cost of ordinary 'index only' scan} + For CPK scan, + cost(index_read_i) = {cost of non-'index only' scan} + + 2. cost(rowid_to_row_scan) + If table PK is clustered then + cost(rowid_to_row_scan) = + {cost of ordinary clustered PK scan with n_ranges=n_rows} + + Otherwise, we use the following model to calculate costs: + We need to retrieve n_rows rows from file that occupies n_blocks blocks. + We assume that offsets of rows we need are independent variates with + uniform distribution in [0..max_file_offset] range. + + We'll denote block as "busy" if it contains row(s) we need to retrieve + and "empty" if doesn't contain rows we need. + + Probability that a block is empty is (1 - 1/n_blocks)^n_rows (this + applies to any block in file). Let x_i be a variate taking value 1 if + block #i is empty and 0 otherwise. + + Then E(x_i) = (1 - 1/n_blocks)^n_rows; + + E(n_empty_blocks) = E(sum(x_i)) = sum(E(x_i)) = + = n_blocks * ((1 - 1/n_blocks)^n_rows) = + ~= n_blocks * exp(-n_rows/n_blocks). + + E(n_busy_blocks) = n_blocks*(1 - (1 - 1/n_blocks)^n_rows) = + ~= n_blocks * (1 - exp(-n_rows/n_blocks)). + + Average size of "hole" between neighbor non-empty blocks is + E(hole_size) = n_blocks/E(n_busy_blocks). 
+ + The total cost of reading all needed blocks in one "sweep" is: + + E(n_busy_blocks)* + (DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*n_blocks/E(n_busy_blocks)). + + 3. Cost of Unique use is calculated in Unique::get_use_cost function. */ -static int get_quick_select_params(SEL_TREE *tree, PARAM& param, - key_map& needed_reg, TABLE *head, +static int get_index_merge_params(PARAM *param, key_map& needed_reg, + SEL_IMERGE *imerge, double *read_time, + ha_rows* imerge_rows) +{ + double imerge_cost= 0.0; /* cost of this index_merge */ + bool imerge_too_expensive= false; + double tree_read_time; + ha_rows tree_records; + bool pk_is_clustered= param->table->file->primary_key_is_clustered(); + bool have_cpk_scan; + ha_rows records_for_unique= 0; + ha_rows cpk_records= 0; + + DBUG_ENTER("get_index_merge_params"); + + /* allocate structs to save construction info */ + imerge->best_keys= + (SEL_ARG***)alloc_root(param->mem_root, + (imerge->trees_next - imerge->trees)* + sizeof(void*)); + /* + PHASE 1: get the best keys to use for this index_merge + */ + + /* + It may be possible to use different keys for index_merge scans, + e.g. for query like + ...WHERE (key1 < c2 AND key2 < c2) OR (key3 < c3 AND key4 < c4) + we have to make choice between key1 and key2 for one scan and + between key3,key4 for another. + We assume we'll get the best way if we choose the best key read + inside each of the conjuncts. Comparison is done without 'using index'. + */ + for (SEL_TREE **ptree= imerge->trees; + ptree != imerge->trees_next; + ptree++) + { + SEL_ARG **tree_best_key; + uint keynr; + + tree_read_time= *read_time; + if (get_quick_select_params(*ptree, param, needed_reg, false, + &tree_read_time, &tree_records, + &tree_best_key)) + { + /* + Non-'index only' range scan on a one in index_merge key is more + expensive than other available option. The entire index_merge will be + more expensive then, too. We continue here only to update SQL_SELECT + members. 
+ */ + imerge_too_expensive= true; + } + + if (imerge_too_expensive) + continue; + + imerge->best_keys[ptree - imerge->trees]= tree_best_key; + keynr= param->real_keynr[(tree_best_key-(*ptree)->keys)]; + + if (pk_is_clustered && keynr == param->table->primary_key) + { + /* This is a Clustered PK scan, it will be done without 'index only' */ + imerge_cost += tree_read_time; + have_cpk_scan= true; + cpk_records= tree_records; + } + else + { + /* Non-CPK scan, calculate time to do it using 'index only' */ + imerge_cost += get_index_only_read_time(param, tree_records,keynr); + records_for_unique += tree_records; + } + } + + if (imerge_too_expensive) + DBUG_RETURN(1); + + if ((imerge_cost > *read_time) || + ((records_for_unique + cpk_records) >= param->table->file->records) && + *read_time != DBL_MAX) + { + /* Bail out if it is obvious that index_merge would be more expensive */ + DBUG_RETURN(1); + } + + if (have_cpk_scan) + { + /* + Add one ROWID comparison for each row retrieved on non-CPK scan. + (it is done in QUICK_RANGE_SELECT::row_in_ranges) + */ + imerge_cost += records_for_unique / TIME_FOR_COMPARE_ROWID; + } + + /* PHASE 2: Calculate cost(rowid_to_row_scan) */ + if (pk_is_clustered) + { + imerge_cost += + param->table->file->read_time(param->table->primary_key, + records_for_unique, + records_for_unique) + + rows2double(records_for_unique) / TIME_FOR_COMPARE; + } + else + { + double n_blocks= + ceil((double)(longlong)param->table->file->data_file_length / IO_SIZE); + /* Don't ceil the following as it is an estimate */ + double busy_blocks= + n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, records_for_unique)); + + JOIN *join= param->thd->lex->select_lex.join; + if (!join || join->tables == 1) + { + imerge_cost += busy_blocks*(DISK_SEEK_BASE_COST + + DISK_SEEK_PROP_COST*n_blocks/busy_blocks); + } + else + { + /* + It can be a join with source table being non-last table, so assume + that disk seeks are random here. 
+ (TODO it is possible to determine if this *is* a last table in 'index + checked for each record'-type join) + */ + imerge_cost += busy_blocks; + } + } + + /* PHASE 3: Add Unique operations cost */ + imerge_cost += Unique::get_use_cost(param->mem_root, records_for_unique, + param->table->file->ref_length, + param->thd->variables.sortbuff_size); + if (imerge_cost < *read_time) + { + *read_time= imerge_cost; + records_for_unique += cpk_records; + *imerge_rows= min(records_for_unique, param->table->file->records); + DBUG_RETURN(0); + } + DBUG_RETURN(1); +} + + +/* + Calculate cost of 'index only' scan for given index and number of records. + (We can resolve this by only reading through this key.) + + SYNOPSIS + get_whole_index_read_time() + param parameters structure + records #of records to read + keynr key to read + + NOTES + It is assumed that we will read trough the whole key range and that all + key blocks are half full (normally things are much better). +*/ + +inline double get_index_only_read_time(PARAM* param, ha_rows records, int keynr) +{ + double read_time; + uint keys_per_block= (param->table->file->block_size/2/ + (param->table->key_info[keynr].key_length+ + param->table->file->ref_length) + 1); + read_time=((double) (records+keys_per_block-1)/ + (double) keys_per_block); + return read_time; +} + + +/* + Calculate quick range select read time, # of records, and best key to use + without constructing QUICK_RANGE_SELECT object. + SYNOPSIS + get_quick_select_params + tree in make range select for this SEL_TREE + param in parameters from test_quick_select + needed_reg in/out other table data needed by this quick_select + index_read_can_be_used if false, assume that 'index only' option is not + available. 
+ read_time out read time estimate + records out # of records estimate + key_to_read out SEL_ARG to be used for creating quick select +*/ + +static int get_quick_select_params(SEL_TREE *tree, PARAM *param, + key_map& needed_reg, bool index_read_can_be_used, - double* read_time, ha_rows* records, - SEL_ARG*** key_to_read) + double *read_time, ha_rows *records, + SEL_ARG ***key_to_read) { int idx; int result = 1; @@ -1233,7 +1423,7 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param, */ SEL_ARG **key,**end; - for (idx= 0,key=tree->keys, end=key+param.keys ; + for (idx= 0,key=tree->keys, end=key+param->keys ; key != end ; key++,idx++) { @@ -1241,16 +1431,18 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param, double found_read_time; if (*key) { - uint keynr= param.real_keynr[idx]; + uint keynr= param->real_keynr[idx]; if ((*key)->type == SEL_ARG::MAYBE_KEY || (*key)->maybe_flag) needed_reg.set_bit(keynr); - bool read_index_only= index_read_can_be_used? head->used_keys.is_set(keynr): false; - found_records=check_quick_select(¶m, idx, *key); + bool read_index_only= index_read_can_be_used? + param->table->used_keys.is_set(keynr): false; + found_records=check_quick_select(param, idx, *key); + if (found_records != HA_POS_ERROR && found_records > 2 && read_index_only && - (head->file->index_flags(keynr) & HA_KEY_READ_ONLY)) + (param->table->file->index_flags(keynr) & HA_KEY_READ_ONLY)) { /* We can resolve this by only reading through this key. @@ -1258,21 +1450,17 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param, and that all key blocks are half full (normally things are much better). 
*/ - uint keys_per_block= (head->file->block_size/2/ - (head->key_info[keynr].key_length+ - head->file->ref_length) + 1); - found_read_time=((double) (found_records+keys_per_block-1)/ - (double) keys_per_block); + found_read_time=get_index_only_read_time(param, found_records, keynr); } else - found_read_time= (head->file->read_time(keynr, - param.range_count, + found_read_time= (param->table->file->read_time(keynr, + param->range_count, found_records)+ (double) found_records / TIME_FOR_COMPARE); if (*read_time > found_read_time && found_records != HA_POS_ERROR) { *read_time= found_read_time; - *records= found_records; + *records= found_records; *key_to_read= key; result = 0; } @@ -3118,8 +3306,8 @@ err: /* Fetch all row ids into unique. - If table has a clustered primary key(PK) that contains all rows (bdb and - innodb currently) and one of the index_merge scans is a scan on primary key, + If table has a clustered primary key that covers all rows (true for bdb + and innodb currently) and one of the index_merge scans is a scan on PK, then primary key scan rowids are not put into Unique and also rows that will be retrieved by PK scan are not put into Unique @@ -3134,15 +3322,15 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique() int result; DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::prepare_unique"); - /* we're going to just read rowids. */ + /* We're going to just read rowids. */ head->file->extra(HA_EXTRA_KEYREAD); /* Make innodb retrieve all PK member fields, so * ha_innobase::position (which uses them) call works. - * we filter out rows retrieved by CCPK. + * We can filter out rows that will be retrieved by clustered PK. (This also creates a deficiency - it is possible that we will retrieve - parts of key that are not used by current query at all) + parts of key that are not used by current query at all.) 
*/ head->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); @@ -3170,22 +3358,15 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique() if (result) { - /* - table read error (including HA_ERR_END_OF_FILE on last quick select - in index_merge) - */ if (result != HA_ERR_END_OF_FILE) - { DBUG_RETURN(result); - } - else - break; + break; } if (thd->killed) DBUG_RETURN(1); - /* skip row if it will be retrieved by clustered covering PK scan */ + /* skip row if it will be retrieved by clustered PK scan */ if (pk_quick_select && pk_quick_select->row_in_ranges()) continue; @@ -3207,14 +3388,16 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique() DBUG_RETURN(result); } + /* Get next row for index_merge. NOTES - The rows are read from - 1. rowids stored in Unique. - 2. QUICK_RANGE_SELECT with clustered primary key (if any). - the sets of rows retrieved in 1) and 2) are guaranteed to be disjoint. + The rows are read from + 1. rowids stored in Unique. + 2. QUICK_RANGE_SELECT with clustered primary key (if any). + The sets of rows retrieved in 1) and 2) are guaranteed to be disjoint. */ + int QUICK_INDEX_MERGE_SELECT::get_next() { int result; @@ -3228,8 +3411,8 @@ int QUICK_INDEX_MERGE_SELECT::get_next() if (result == -1) { result= HA_ERR_END_OF_FILE; - /* All rows from Unique have been retrieved, do a CCPK scan */ end_read_record(&read_record); + /* All rows from Unique have been retrieved, do a clustered PK scan */ if(pk_quick_select) { doing_pk_scan= true; @@ -3275,8 +3458,9 @@ int QUICK_RANGE_SELECT::get_next() if (!cur_range) range= *(cur_range= (QUICK_RANGE**)ranges.buffer); else - range= (cur_range == ((QUICK_RANGE**)ranges.buffer + ranges.elements - 1))? - NULL: *(++cur_range); + range= + (cur_range == ((QUICK_RANGE**)ranges.buffer + ranges.elements - 1))? 
+ NULL: *(++cur_range); if (!range) DBUG_RETURN(HA_ERR_END_OF_FILE); // All ranges used @@ -3371,16 +3555,17 @@ int QUICK_RANGE_SELECT::cmp_next(QUICK_RANGE *range_arg) /* Check if current row will be retrieved by this QUICK_RANGE_SELECT - (this is used to filter out CCPK scan rows in index_merge). NOTES It is assumed that currently a scan is being done on another index which reads all necessary parts of the index that is scanned by this quick select. - The implementation does a binary search on sorted array of disjoint ranges, without taking size of range into account. + This function is used to filter out clustered PK scan rows in + index_merge quick select. + RETURN true if current row will be retrieved by this quick select false if not diff --git a/sql/opt_range.h b/sql/opt_range.h index 35a0cb5df88..7c2981795a2 100644 --- a/sql/opt_range.h +++ b/sql/opt_range.h @@ -118,11 +118,13 @@ public: protected: friend void print_quick_sel_range(QUICK_RANGE_SELECT *quick, const key_map* needed_reg); - friend QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, - struct st_table_ref *ref); + friend + QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, + struct st_table_ref *ref); friend bool get_quick_keys(struct st_qsel_param *param, QUICK_RANGE_SELECT *quick,KEY_PART *key, - SEL_ARG *key_tree,char *min_key,uint min_key_flag, + SEL_ARG *key_tree, + char *min_key, uint min_key_flag, char *max_key, uint max_key_flag); friend QUICK_RANGE_SELECT *get_quick_select(struct st_qsel_param*,uint idx, SEL_ARG *key_tree, @@ -160,58 +162,62 @@ public: /* -QUICK_INDEX_MERGE_SELECT - index_merge acces method quick select. + QUICK_INDEX_MERGE_SELECT - index_merge access method quick select. 
- QUICK_INDEX_MERGE_SELECT uses - * QUICK_RANGE_SELECTs to get rows - * Unique class to remove duplicate rows + QUICK_INDEX_MERGE_SELECT uses + * QUICK_RANGE_SELECTs to get rows + * Unique class to remove duplicate rows -INDEX MERGE OPTIMIZER - Current implementation doesn't detect all cases where index_merge could be - used, in particular: - * index_merge will never be used if range scan is possible (even if range - scan is more expensive) + INDEX MERGE OPTIMIZER + Current implementation doesn't detect all cases where index_merge could + be used, in particular: + * index_merge will never be used if range scan is possible (even if + range scan is more expensive) - * index_merge+'using index' is not supported (this the consequence of the - above restriction) + * index_merge+'using index' is not supported (this the consequence of + the above restriction) - * If WHERE part contains complex nested AND and OR conditions, some ways to - retrieve rows using index_merge will not be considered. The choice of - read plan may depend on the order of conjuncts/disjuncts in WHERE part of - the query, see comments near SEL_IMERGE::or_sel_tree_with_checks and - imerge_list_or_list function for details. + * If WHERE part contains complex nested AND and OR conditions, some ways + to retrieve rows using index_merge will not be considered. The choice + of read plan may depend on the order of conjuncts/disjuncts in WHERE + part of the query, see comments near imerge_list_or_list and + SEL_IMERGE::or_sel_tree_with_checks functions for details. - * there is no "index_merge_ref" method (but index_merge on non-first table - in join is possible with 'range checked for each record'). + * There is no "index_merge_ref" method (but index_merge on non-first + table in join is possible with 'range checked for each record'). - See comments around SEL_IMERGE class and test_quick_select for more details. + See comments around SEL_IMERGE class and test_quick_select for more + details. 
-ROW RETRIEVAL ALGORITHM + ROW RETRIEVAL ALGORITHM - index_merge uses Unique class for duplicates removal. Index merge takes - advantage of clustered covering primary key (CCPK) if the table has one. - The algorithm is as follows: + index_merge uses Unique class for duplicates removal. index_merge takes + advantage of Clustered Primary Key (CPK) if the table has one. + The index_merge algorithm consists of two phases: - prepare() //implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique - { - activate 'index only'; - while(retrieve next row for non-CCPK scan) + Phase 1 (implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique): + prepare() { - if (there is a CCPK scan and row will be retrieved by it) - skip this row; - else - put rowid into Unique; + activate 'index only'; + while(retrieve next row for non-CPK scan) + { + if (there is a CPK scan and row will be retrieved by it) + skip this row; + else + put its rowid into Unique; + } + deactivate 'index only'; } - deactivate 'index only'; - } - - fetch() //implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next calls - { - retrieve all rows from row pointers stored in Unique; - free Unique; - retrieve all rows for CCPK scan; - } + + Phase 2 (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next + calls): + fetch() + { + retrieve all rows from row pointers stored in Unique; + free Unique; + retrieve all rows for CPK scan; + } */ class QUICK_INDEX_MERGE_SELECT : public QUICK_SELECT_I @@ -239,10 +245,10 @@ public: /* last element in quick_selects list */ QUICK_RANGE_SELECT* last_quick_select; - /* quick select that uses Covering Clustered Primary Key (NULL if none) */ + /* quick select that uses clustered primary key (NULL if none) */ QUICK_RANGE_SELECT* pk_quick_select; - /* true if this select is currently doing a CCPK scan */ + /* true if this select is currently doing a clustered PK scan */ bool doing_pk_scan; Unique *unique; diff --git a/sql/records.cc b/sql/records.cc index f8fbfe62187..b29b113a1bc 
100644 --- a/sql/records.cc +++ b/sql/records.cc @@ -98,7 +98,6 @@ void init_read_record(READ_RECORD *info,THD *thd, TABLE *table, } } else if (select && select->quick) - //&& (select->quick->get_type() != QUICK_SELECT_I::QS_TYPE_INDEX_MERGE)) { DBUG_PRINT("info",("using rr_quick")); info->read_record=rr_quick; diff --git a/sql/sql_class.h b/sql/sql_class.h index 8263789a2a2..76d2eae1bb5 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -1233,7 +1233,8 @@ public: } bool get(TABLE *table); - + static double get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size, + ulong max_in_memory_size); friend int unique_write_to_file(gptr key, element_count count, Unique *unique); friend int unique_write_to_ptrs(gptr key, element_count count, Unique *unique); }; diff --git a/sql/uniques.cc b/sql/uniques.cc index 02146426782..48233f29bee 100644 --- a/sql/uniques.cc +++ b/sql/uniques.cc @@ -63,12 +63,194 @@ Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg, comp_func_fixed_arg); /* If the following fail's the next add will also fail */ my_init_dynamic_array(&file_ptrs, sizeof(BUFFPEK), 16, 16); + /* + If you change the following, change it in get_max_elements function, too. + */ max_elements= max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+size); open_cached_file(&file, mysql_tmpdir,TEMP_PREFIX, DISK_BUFFER_SIZE, MYF(MY_WME)); } +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#define M_E (exp(1)) + +inline double log2_n_fact(double x) +{ + return (2 * ( ((x)+1) * log(((x)+1)/M_E) + log(2*M_PI*((x)+1))/2 ) / log(2)); +} + +/* + Calculate cost of merge_buffers call. + + NOTE + See comment near Unique::get_use_cost for cost formula derivation. 
+*/ +static double get_merge_buffers_cost(uint* buff_sizes, uint elem_size, + int last, int f,int t) +{ + uint sum= 0; + for (int i=f; i <= t; i++) + sum+= buff_sizes[i]; + buff_sizes[last]= sum; + + int n_buffers= t - f + 1; + double buf_length= sum*elem_size; + + return (((double)buf_length/(n_buffers+1)) / IO_SIZE) * 2 * n_buffers + + buf_length * log(n_buffers) / (TIME_FOR_COMPARE_ROWID * log(2.0)); +} + +/* + Calculate cost of merging buffers into one in Unique::get, i.e. calculate + how long (in terms of disk seeks) the two call + merge_many_buffs(...); + merge_buffers(...); + will take. + + SYNOPSIS + get_merge_many_buffs_cost() + alloc memory pool to use + maxbuffer # of full buffers. + max_n_elems # of elements in first maxbuffer buffers. + last_n_elems # of elements in last buffer. + elem_size size of buffer element. + + NOTES + It is assumed that maxbuffer+1 buffers are merged, first maxbuffer buffers + contain max_n_elems each, last buffer contains last_n_elems elements. + + The current implementation does a dumb simulation of merge_many_buffs + actions. + + RETURN + >=0 Cost of merge in disk seeks. + <0 Out of memory. 
+*/
+static double get_merge_many_buffs_cost(MEM_ROOT *alloc,
+                                        uint maxbuffer, uint max_n_elems,
+                                        uint last_n_elems, int elem_size)
+{
+  register int i;
+  double total_cost= 0.0;
+  int lastbuff;
+  uint* buff_sizes;
+
+  if (!(buff_sizes= (uint*)alloc_root(alloc, sizeof(uint) * (maxbuffer + 1))))
+    return -1.0;
+  for(i = 0; i < (int)maxbuffer; i++)
+    buff_sizes[i]= max_n_elems;
+
+  buff_sizes[maxbuffer]= last_n_elems;
+
+  if (maxbuffer >= MERGEBUFF2)
+  {
+    /* Simulate merge_many_buff */
+    while (maxbuffer >= MERGEBUFF2)
+    {
+      lastbuff=0;
+      for (i = 0; i <= (int) maxbuffer - MERGEBUFF*3/2; i += MERGEBUFF)
+        total_cost += get_merge_buffers_cost(buff_sizes, elem_size,
+                                             lastbuff++, i, i+MERGEBUFF-1);
+
+      total_cost += get_merge_buffers_cost(buff_sizes, elem_size,
+                                           lastbuff++, i, maxbuffer);
+      maxbuffer= (uint)lastbuff-1;
+    }
+  }
+
+  /* Simulate final merge_buff call. */
+  total_cost += get_merge_buffers_cost(buff_sizes, elem_size, 0, 0,
+                                       maxbuffer);
+  return total_cost;
+}
+
+
+/*
+  Calculate cost of using Unique for processing nkeys elements of size
+  key_size using max_in_memory_size memory.
+
+  RETURN
+    Use cost as # of disk seeks.
+
+  NOTES
+    cost(using_unique) =
+      cost(create_trees) +  (see #1)
+      cost(merge) +         (see #2)
+      cost(read_result)     (see #3)
+
+  1. Cost of trees creation
+     For each Unique::put operation there will be 2*log2(n+1) elements
+     comparisons, where n runs from 1 to tree_size (we assume that all added
+     elements are different). Together this gives:
+
+     n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!) =
+
+      = 2*ln((N+1)!) / ln(2) = {using Stirling formula} =
+
+      = 2*( (N+1)*ln((N+1)/e) + (1/2)*ln(2*pi*(N+1)) ) / ln(2).
+
+     then cost(tree_creation) = n_compares*ROWID_COMPARE_COST;
+
+     Total cost of creating trees:
+       (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
+
+  2. Cost of merging.
+     If only one tree is created by Unique no merging will be necessary.
+     Otherwise, we model execution of merge_many_buff function and count
+     # of merges. (The reason behind this is that number of buffers is small,
+     while size of buffers is big and we don't want to lose precision with
+     O(x)-style formula)
+
+  3. If only one tree is created by Unique no disk I/O will happen.
+     Otherwise, ceil(key_len*n_keys/IO_SIZE) disk seeks are necessary. We
+     assume these will be random seeks.
+*/
+
+double Unique::get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size,
+                            ulong max_in_memory_size)
+{
+  ulong max_elements_in_tree;
+  ulong last_tree_elems;
+  int n_full_trees; /* number of trees in unique - 1 */
+  double result;
+
+  max_elements_in_tree= max_in_memory_size /
+                        ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size);
+  n_full_trees=    nkeys / max_elements_in_tree;
+  last_tree_elems= nkeys % max_elements_in_tree;
+
+  /* Calculate cost of creating trees */
+  result= log2_n_fact(last_tree_elems);
+  if (n_full_trees)
+    result+= n_full_trees * log2_n_fact(max_elements_in_tree);
+  result /= TIME_FOR_COMPARE_ROWID;
+
+  /* Calculate cost of merging */
+  if (!n_full_trees)
+    return result;
+
+  /* There is more than one tree and merging is necessary. */
+  /* Add cost of writing all trees to disk. */
+  result += n_full_trees * ceil(key_size*max_elements_in_tree / IO_SIZE);
+  result += ceil(key_size*last_tree_elems / IO_SIZE);
+
+  /* Cost of merge */
+  result += get_merge_many_buffs_cost(alloc, n_full_trees,
+                                      max_elements_in_tree,
+                                      last_tree_elems, key_size);
+  /*
+    Add cost of reading the resulting sequence, assuming there were no
+    duplicate elements.
+  */
+  result += ceil((double)key_size*nkeys/IO_SIZE);
+
+  return result;
+}
+
 Unique::~Unique()
 {
   close_cached_file(&file);