From 7dbdedcb72afa53f8c2d619e32376a1897bc257b Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 18 Dec 2003 06:08:00 +0300 Subject: [PATCH] Precise read time estimates for index_merge/Unique --- mysql-test/r/index_merge.result | 2 +- mysql-test/t/index_merge.test | 2 +- sql/filesort.cc | 8 +- sql/ha_berkeley.h | 2 +- sql/ha_innodb.cc | 3 +- sql/ha_innodb.h | 2 +- sql/handler.h | 4 +- sql/mysql_priv.h | 20 ++ sql/opt_range.cc | 493 ++++++++++++++++++++++---------- sql/opt_range.h | 96 ++++--- sql/records.cc | 1 - sql/sql_class.h | 3 +- sql/uniques.cc | 182 ++++++++++++ 13 files changed, 606 insertions(+), 212 deletions(-) diff --git a/mysql-test/r/index_merge.result b/mysql-test/r/index_merge.result index 2abdcf202d9..97d46bf39ea 100644 --- a/mysql-test/r/index_merge.result +++ b/mysql-test/r/index_merge.result @@ -1,4 +1,4 @@ -drop table if exists t0, t1, t2, t3; +drop table if exists t0, t1, t2, t3,t4; create table t0 ( key1 int not null, diff --git a/mysql-test/t/index_merge.test b/mysql-test/t/index_merge.test index 79eda7c1f56..029c2e4a013 100644 --- a/mysql-test/t/index_merge.test +++ b/mysql-test/t/index_merge.test @@ -3,7 +3,7 @@ # --disable_warnings -drop table if exists t0, t1, t2, t3; +drop table if exists t0, t1, t2, t3,t4; --enable_warnings # Create and fill a table with simple keys diff --git a/sql/filesort.cc b/sql/filesort.cc index 356afdf748c..27c59a05941 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -88,9 +88,9 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length, #endif FILESORT_INFO table_sort; /* - don't use table->sort in filesort as it is also used by - QUICK_INDEX_MERGE_SELECT. work with a copy of it and put it back at the - end when index_merge select has finished with it. + Don't use table->sort in filesort as it is also used by + QUICK_INDEX_MERGE_SELECT. Work with a copy and put it back at the end + when index_merge select has finished with it. 
*/ memcpy(&table_sort, &table->sort, sizeof(FILESORT_INFO)); table->sort.io_cache= NULL; @@ -452,7 +452,7 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select, if (quick_select) { /* - index_merge quick select uses table->sort when retrieving rows, so free + index_merge quick select uses table->sort when retrieving rows, so free resoures it has allocated. */ end_read_record(&read_record_info); diff --git a/sql/ha_berkeley.h b/sql/ha_berkeley.h index 582a79906a7..f225c24eaf7 100644 --- a/sql/ha_berkeley.h +++ b/sql/ha_berkeley.h @@ -167,7 +167,7 @@ class ha_berkeley: public handler longlong get_auto_increment(); void print_error(int error, myf errflag); uint8 table_cache_type() { return HA_CACHE_TBL_TRANSACT; } - bool primary_key_is_clustered_covering() { return true; } + bool primary_key_is_clustered() { return true; } }; extern bool berkeley_skip, berkeley_shared_data; diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index b92a5ff8c3f..d949f8bcf9c 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -2003,7 +2003,8 @@ build_template( update field->query_id so that the formula thd->query_id == field->query_id did not work. 
*/ - ibool index_contains_field = dict_index_contains_col_or_prefix(index, i); + ibool index_contains_field= + dict_index_contains_col_or_prefix(index, i); if (templ_type == ROW_MYSQL_REC_FIELDS && ((prebuilt->read_just_key && !index_contains_field) || diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h index 6fa66377cd6..c305a019fcd 100644 --- a/sql/ha_innodb.h +++ b/sql/ha_innodb.h @@ -187,7 +187,7 @@ class ha_innobase: public handler void init_table_handle_for_HANDLER(); longlong get_auto_increment(); uint8 table_cache_type() { return HA_CACHE_TBL_ASKTRANSACT; } - bool primary_key_is_clustered_covering() { return true; } + bool primary_key_is_clustered() { return true; } }; extern bool innodb_skip; diff --git a/sql/handler.h b/sql/handler.h index 0bbaba81f96..2ad37233c9e 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -378,10 +378,10 @@ public: /* RETURN - true primary key (if there is one) is clustered key covering all fields + true Primary key (if there is one) is clustered key covering all fields false otherwise */ - virtual bool primary_key_is_clustered_covering() { return false; } + virtual bool primary_key_is_clustered() { return false; } }; /* Some extern variables used with handlers */ diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index 3ace72ea24c..2b25f501a37 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -118,6 +118,26 @@ extern CHARSET_INFO *national_charset_info, *table_alias_charset; */ #define TIME_FOR_COMPARE 5 // 5 compares == one read +/* + Number of comparisons of table rowids equivalent to reading one row from a + table. +*/ +#define TIME_FOR_COMPARE_ROWID (TIME_FOR_COMPARE*2) + +/* + For sequential disk seeks the cost formula is: + DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST * #blocks_to_skip + + The cost of average seek + DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK =1.0. 
+*/ +#define DISK_SEEK_BASE_COST ((double)0.5) + +#define BLOCKS_IN_AVG_SEEK 128 + +#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK) + + /* Number of rows in a reference table when refereed through a not unique key. This value is only used when we don't know anything about the key diff --git a/sql/opt_range.cc b/sql/opt_range.cc index 98002cc5b7a..e3d0f8624b9 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -307,12 +307,18 @@ static ha_rows check_quick_keys(PARAM *param,uint index,SEL_ARG *key_tree, QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index, SEL_ARG *key_tree, MEM_ROOT *alloc = NULL); -static int get_quick_select_params(SEL_TREE *tree, PARAM& param, - key_map& needed_reg, TABLE *head, +static int get_quick_select_params(SEL_TREE *tree, PARAM *param, + key_map& needed_reg, bool index_read_can_be_used, - double* read_time, - ha_rows* records, + double *read_time, + ha_rows *records, SEL_ARG*** key_to_read); +static int get_index_merge_params(PARAM *param, key_map& needed_reg, + SEL_IMERGE *imerge, double *read_time, + ha_rows* imerge_rows); +inline double get_index_only_read_time(PARAM* param, ha_rows records, + int keynr); + #ifndef DBUG_OFF static void print_quick_sel_imerge(QUICK_INDEX_MERGE_SELECT *quick, const key_map *needed_reg); @@ -453,7 +459,7 @@ int SEL_IMERGE::or_sel_tree_with_checks(PARAM *param, SEL_TREE *new_tree) } } - /* new tree cannot be combined with any of existing trees */ + /* New tree cannot be combined with any of existing trees. */ return or_sel_tree(param, new_tree); } @@ -483,7 +489,6 @@ int SEL_IMERGE::or_sel_imerge_with_checks(PARAM *param, SEL_IMERGE* imerge) /* Perform AND operation on two index_merge lists and store result in *im1. - */ inline void imerge_list_and_list(List *im1, List *im2) @@ -503,18 +508,16 @@ inline void imerge_list_and_list(List *im1, List *im2) i.e. all conjuncts except the first one are currently dropped. This is done to avoid producing N*K ways to do index_merge. 
- If (a_1||b_1) produce a condition that is always true, NULL is - returned and index_merge is discarded. (while it is actually - possible to try harder). + If (a_1||b_1) produce a condition that is always true, NULL is returned + and index_merge is discarded (while it is actually possible to try + harder). - As a consequence of this, choice of keys to do index_merge - read may depend on the order of conditions in WHERE part of - the query. + As a consequence of this, choice of keys to do index_merge read may depend + on the order of conditions in WHERE part of the query. RETURN - 0 OK, result is stored in *im1 + 0 OK, result is stored in *im1 other Error, both passed lists are unusable - */ int imerge_list_or_list(PARAM *param, @@ -533,7 +536,7 @@ int imerge_list_or_list(PARAM *param, Perform OR operation on index_merge list and key tree. RETURN - 0 OK, result is stored in *im1 + 0 OK, result is stored in *im1. other Error */ @@ -685,10 +688,10 @@ bool QUICK_INDEX_MERGE_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range) { /* - Save quick_select that does scan on clustered covering primary key as - it will be processed separately + Save quick_select that does scan on clustered primary key as it will be + processed separately. 
*/ - if (head->file->primary_key_is_clustered_covering() && + if (head->file->primary_key_is_clustered() && quick_sel_range->index == head->primary_key) pk_quick_select= quick_sel_range; else @@ -1001,7 +1004,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, ha_rows found_records; double found_read_time= read_time; - if (!get_quick_select_params(tree, param, needed_reg, head, true, + if (!get_quick_select_params(tree, ¶m, needed_reg, true, &found_read_time, &found_records, &best_key)) { @@ -1021,120 +1024,57 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, } /* - btw, tree type SEL_TREE::INDEX_MERGE was not introduced - intentionally + Btw, tree type SEL_TREE::INDEX_MERGE was not introduced + intentionally. */ - /* if no range select could be built, try using index_merge */ + /* If no range select could be built, try using index_merge. */ if (!quick && !tree->merges.is_empty()) { DBUG_PRINT("info",("No range reads possible," " trying to construct index_merge")); SEL_IMERGE *imerge; SEL_IMERGE *min_imerge= NULL; - double min_imerge_cost= DBL_MAX; + double min_imerge_read_time; ha_rows min_imerge_records; + if (!head->used_keys.is_clear_all()) + { + int key_for_use= find_shortest_key(head, &head->used_keys); + ha_rows total_table_records= (0 == head->file->records)? 1 : + head->file->records; + read_time = get_index_only_read_time(¶m, total_table_records, + key_for_use); + DBUG_PRINT("info", + ("'all' scan will be using key %d, read time %g", + key_for_use, read_time)); + } + + min_imerge_read_time=read_time; + /* + Ok, read_time contains best 'all' read time. 
+ Now look for index_merge with cost < read_time + */ List_iterator_fast it(tree->merges); - /* find index_merge with minimal cost */ while ((imerge= it++)) { - bool imerge_failed= false; - double imerge_cost= 0; - ha_rows imerge_total_records= 0; - double tree_read_time; - ha_rows tree_records; - imerge->best_keys= - (SEL_ARG***)alloc_root(&alloc, - (imerge->trees_next - imerge->trees)* - sizeof(void*)); - /* - It may be possible to use different keys for index_merge, e.g for - queries like - ...WHERE (key1 < c2 AND key2 < c2) OR (key3 < c3 AND key4 < c4) - We assume we get the best index_merge if we choose the best key - read inside each of the conjuncts. - */ - for (SEL_TREE **ptree= imerge->trees; - ptree != imerge->trees_next; - ptree++) - { - tree_read_time= read_time; - if (get_quick_select_params(*ptree, param, needed_reg, head, - false, - &tree_read_time, &tree_records, - &(imerge->best_keys[ptree - - imerge->trees]))) - imerge_failed= true; - imerge_cost += tree_read_time; - imerge_total_records += tree_records; - } - - if (!imerge_failed) - { - imerge_total_records= min(imerge_total_records, - head->file->records); - imerge_cost += imerge_total_records / TIME_FOR_COMPARE; - if (imerge_cost < min_imerge_cost) - { - min_imerge= imerge; - min_imerge_cost= imerge_cost; - min_imerge_records= imerge_total_records; - } - } + if (!get_index_merge_params(¶m, needed_reg, imerge, + &min_imerge_read_time, + &min_imerge_records)) + min_imerge= imerge; } if (!min_imerge) goto end_free; records= min_imerge_records; - /* - Ok, got minimal index merge, *min_imerge, with cost min_imerge_cost - Compare its cost with "all" scan cost (or "all+using index" if - it is possible) and choose the best. - */ - - if (!head->used_keys.is_clear_all()) - { - /* check if "ALL" +"using index" read would be faster */ - int key_for_use= find_shortest_key(head, &head->used_keys); - ha_rows total_table_records= (0 == head->file->records)? 
1 : - head->file->records; - uint keys_per_block= (head->file->block_size/2/ - (head->key_info[key_for_use].key_length+ - head->file->ref_length) + 1); - double all_index_scan_read_time= ((double)(total_table_records+ - keys_per_block-1)/ - (double) keys_per_block); - - DBUG_PRINT("info", - ("'all' scan will be using key %d, read time %g", - key_for_use, all_index_scan_read_time)); - if (all_index_scan_read_time < min_imerge_cost) - { - DBUG_PRINT("info", - ("index merge would be slower, " - "will do full 'index' scan")); - goto end_free; - } - } - else - { - /* check if "ALL" would be faster */ - if (read_time < min_imerge_cost) - { - DBUG_PRINT("info", - ("index merge would be slower, " - "will do full table scan")); - goto end_free; - } - } - + + /* Ok, using index_merge is the best option, so construct it. */ if (!(quick= quick_imerge= new QUICK_INDEX_MERGE_SELECT(thd, head))) goto end_free; quick->records= min_imerge_records; - quick->read_time= min_imerge_cost; + quick->read_time= min_imerge_read_time; my_pthread_setspecific_ptr(THR_MALLOC, &quick_imerge->alloc); @@ -1152,10 +1092,10 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, &quick_imerge->alloc))) { new_quick->records= min_imerge_records; - new_quick->read_time= min_imerge_cost; + new_quick->read_time= min_imerge_read_time; /* QUICK_RANGE_SELECT::QUICK_RANGE_SELECT leaves THR_MALLOC - pointing to its allocator, restore it back + pointing to its allocator, restore it back. */ quick_imerge->last_quick_select= new_quick; @@ -1214,15 +1154,265 @@ end: /* - Calculate quick range select read time, # of records, and best key to use - without constructing QUICK_RANGE_SELECT object. + Calculate index merge cost and save parameters for its construction. + + SYNOPSIS + get_index_merge_params() + param in parameter with structure. 
+ needed_reg in/out needed_reg from this SQL_SELECT + imerge in index_merge description structure + read_time in/out in: cost of an existing way to read a table + out: cost of index merge + imerge_rows out pessimistic estimate of # of rows to be retrieved + + RETURN + 0 Cost of this index_merge is less than passed *read_time, + *imerge_rows and *read_time contain new index_merge parameters. + 1 Cost of this index_merge is more than *read_time, + *imerge_rows and *read_time are not modified. + -1 error + + NOTES + index_merge_cost = + cost(index_reads) + (see #1) + cost(rowid_to_row_scan) + (see #2) + cost(unique_use) (see #3) + + 1. cost(index_reads) =SUM_i(cost(index_read_i)) + For non-CPK scans, + cost(index_read_i) = {cost of ordinary 'index only' scan} + For CPK scan, + cost(index_read_i) = {cost of non-'index only' scan} + + 2. cost(rowid_to_row_scan) + If table PK is clustered then + cost(rowid_to_row_scan) = + {cost of ordinary clustered PK scan with n_ranges=n_rows} + + Otherwise, we use the following model to calculate costs: + We need to retrieve n_rows rows from file that occupies n_blocks blocks. + We assume that offsets of rows we need are independent variates with + uniform distribution in [0..max_file_offset] range. + + We'll denote block as "busy" if it contains row(s) we need to retrieve + and "empty" if doesn't contain rows we need. + + Probability that a block is empty is (1 - 1/n_blocks)^n_rows (this + applies to any block in file). Let x_i be a variate taking value 1 if + block #i is empty and 0 otherwise. + + Then E(x_i) = (1 - 1/n_blocks)^n_rows; + + E(n_empty_blocks) = E(sum(x_i)) = sum(E(x_i)) = + = n_blocks * ((1 - 1/n_blocks)^n_rows) = + ~= n_blocks * exp(-n_rows/n_blocks). + + E(n_busy_blocks) = n_blocks*(1 - (1 - 1/n_blocks)^n_rows) = + ~= n_blocks * (1 - exp(-n_rows/n_blocks)). + + Average size of "hole" between neighbor non-empty blocks is + E(hole_size) = n_blocks/E(n_busy_blocks). 
+ + The total cost of reading all needed blocks in one "sweep" is: + + E(n_busy_blocks)* + (DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*n_blocks/E(n_busy_blocks)). + + 3. Cost of Unique use is calculated in Unique::get_use_cost function. */ -static int get_quick_select_params(SEL_TREE *tree, PARAM& param, - key_map& needed_reg, TABLE *head, +static int get_index_merge_params(PARAM *param, key_map& needed_reg, + SEL_IMERGE *imerge, double *read_time, + ha_rows* imerge_rows) +{ + double imerge_cost= 0.0; /* cost of this index_merge */ + bool imerge_too_expensive= false; + double tree_read_time; + ha_rows tree_records; + bool pk_is_clustered= param->table->file->primary_key_is_clustered(); + bool have_cpk_scan; + ha_rows records_for_unique= 0; + ha_rows cpk_records= 0; + + DBUG_ENTER("get_index_merge_params"); + + /* allocate structs to save construction info */ + imerge->best_keys= + (SEL_ARG***)alloc_root(param->mem_root, + (imerge->trees_next - imerge->trees)* + sizeof(void*)); + /* + PHASE 1: get the best keys to use for this index_merge + */ + + /* + It may be possible to use different keys for index_merge scans, + e.g. for query like + ...WHERE (key1 < c2 AND key2 < c2) OR (key3 < c3 AND key4 < c4) + we have to make choice between key1 and key2 for one scan and + between key3,key4 for another. + We assume we'll get the best way if we choose the best key read + inside each of the conjuncts. Comparison is done without 'using index'. + */ + for (SEL_TREE **ptree= imerge->trees; + ptree != imerge->trees_next; + ptree++) + { + SEL_ARG **tree_best_key; + uint keynr; + + tree_read_time= *read_time; + if (get_quick_select_params(*ptree, param, needed_reg, false, + &tree_read_time, &tree_records, + &tree_best_key)) + { + /* + Non-'index only' range scan on a one in index_merge key is more + expensive than other available option. The entire index_merge will be + more expensive then, too. We continue here only to update SQL_SELECT + members. 
+ */ + imerge_too_expensive= true; + } + + if (imerge_too_expensive) + continue; + + imerge->best_keys[ptree - imerge->trees]= tree_best_key; + keynr= param->real_keynr[(tree_best_key-(*ptree)->keys)]; + + if (pk_is_clustered && keynr == param->table->primary_key) + { + /* This is a Clustered PK scan, it will be done without 'index only' */ + imerge_cost += tree_read_time; + have_cpk_scan= true; + cpk_records= tree_records; + } + else + { + /* Non-CPK scan, calculate time to do it using 'index only' */ + imerge_cost += get_index_only_read_time(param, tree_records,keynr); + records_for_unique += tree_records; + } + } + + if (imerge_too_expensive) + DBUG_RETURN(1); + + if ((imerge_cost > *read_time) || + ((records_for_unique + cpk_records) >= param->table->file->records) && + *read_time != DBL_MAX) + { + /* Bail out if it is obvious that index_merge would be more expensive */ + DBUG_RETURN(1); + } + + if (have_cpk_scan) + { + /* + Add one ROWID comparison for each row retrieved on non-CPK scan. + (it is done in QUICK_RANGE_SELECT::row_in_ranges) + */ + imerge_cost += records_for_unique / TIME_FOR_COMPARE_ROWID; + } + + /* PHASE 2: Calculate cost(rowid_to_row_scan) */ + if (pk_is_clustered) + { + imerge_cost += + param->table->file->read_time(param->table->primary_key, + records_for_unique, + records_for_unique) + + rows2double(records_for_unique) / TIME_FOR_COMPARE; + } + else + { + double n_blocks= + ceil((double)(longlong)param->table->file->data_file_length / IO_SIZE); + /* Don't ceil the following as it is an estimate */ + double busy_blocks= + n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, records_for_unique)); + + JOIN *join= param->thd->lex->select_lex.join; + if (!join || join->tables == 1) + { + imerge_cost += busy_blocks*(DISK_SEEK_BASE_COST + + DISK_SEEK_PROP_COST*n_blocks/busy_blocks); + } + else + { + /* + It can be a join with source table being non-last table, so assume + that disk seeks are random here. 
+ (TODO it is possible to determine if this *is* a last table in 'index + checked for each record'-type join) + */ + imerge_cost += busy_blocks; + } + } + + /* PHASE 3: Add Unique operations cost */ + imerge_cost += Unique::get_use_cost(param->mem_root, records_for_unique, + param->table->file->ref_length, + param->thd->variables.sortbuff_size); + if (imerge_cost < *read_time) + { + *read_time= imerge_cost; + records_for_unique += cpk_records; + *imerge_rows= min(records_for_unique, param->table->file->records); + DBUG_RETURN(0); + } + DBUG_RETURN(1); +} + + +/* + Calculate cost of 'index only' scan for given index and number of records. + (We can resolve this by only reading through this key.) + + SYNOPSIS + get_whole_index_read_time() + param parameters structure + records #of records to read + keynr key to read + + NOTES + It is assumed that we will read trough the whole key range and that all + key blocks are half full (normally things are much better). +*/ + +inline double get_index_only_read_time(PARAM* param, ha_rows records, int keynr) +{ + double read_time; + uint keys_per_block= (param->table->file->block_size/2/ + (param->table->key_info[keynr].key_length+ + param->table->file->ref_length) + 1); + read_time=((double) (records+keys_per_block-1)/ + (double) keys_per_block); + return read_time; +} + + +/* + Calculate quick range select read time, # of records, and best key to use + without constructing QUICK_RANGE_SELECT object. + SYNOPSIS + get_quick_select_params + tree in make range select for this SEL_TREE + param in parameters from test_quick_select + needed_reg in/out other table data needed by this quick_select + index_read_can_be_used if false, assume that 'index only' option is not + available. 
+ read_time out read time estimate + records out # of records estimate + key_to_read out SEL_ARG to be used for creating quick select +*/ + +static int get_quick_select_params(SEL_TREE *tree, PARAM *param, + key_map& needed_reg, bool index_read_can_be_used, - double* read_time, ha_rows* records, - SEL_ARG*** key_to_read) + double *read_time, ha_rows *records, + SEL_ARG ***key_to_read) { int idx; int result = 1; @@ -1233,7 +1423,7 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param, */ SEL_ARG **key,**end; - for (idx= 0,key=tree->keys, end=key+param.keys ; + for (idx= 0,key=tree->keys, end=key+param->keys ; key != end ; key++,idx++) { @@ -1241,16 +1431,18 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param, double found_read_time; if (*key) { - uint keynr= param.real_keynr[idx]; + uint keynr= param->real_keynr[idx]; if ((*key)->type == SEL_ARG::MAYBE_KEY || (*key)->maybe_flag) needed_reg.set_bit(keynr); - bool read_index_only= index_read_can_be_used? head->used_keys.is_set(keynr): false; - found_records=check_quick_select(¶m, idx, *key); + bool read_index_only= index_read_can_be_used? + param->table->used_keys.is_set(keynr): false; + found_records=check_quick_select(param, idx, *key); + if (found_records != HA_POS_ERROR && found_records > 2 && read_index_only && - (head->file->index_flags(keynr) & HA_KEY_READ_ONLY)) + (param->table->file->index_flags(keynr) & HA_KEY_READ_ONLY)) { /* We can resolve this by only reading through this key. @@ -1258,21 +1450,17 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param, and that all key blocks are half full (normally things are much better). 
*/ - uint keys_per_block= (head->file->block_size/2/ - (head->key_info[keynr].key_length+ - head->file->ref_length) + 1); - found_read_time=((double) (found_records+keys_per_block-1)/ - (double) keys_per_block); + found_read_time=get_index_only_read_time(param, found_records, keynr); } else - found_read_time= (head->file->read_time(keynr, - param.range_count, + found_read_time= (param->table->file->read_time(keynr, + param->range_count, found_records)+ (double) found_records / TIME_FOR_COMPARE); if (*read_time > found_read_time && found_records != HA_POS_ERROR) { *read_time= found_read_time; - *records= found_records; + *records= found_records; *key_to_read= key; result = 0; } @@ -3118,8 +3306,8 @@ err: /* Fetch all row ids into unique. - If table has a clustered primary key(PK) that contains all rows (bdb and - innodb currently) and one of the index_merge scans is a scan on primary key, + If table has a clustered primary key that covers all rows (true for bdb + and innodb currently) and one of the index_merge scans is a scan on PK, then primary key scan rowids are not put into Unique and also rows that will be retrieved by PK scan are not put into Unique @@ -3134,15 +3322,15 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique() int result; DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::prepare_unique"); - /* we're going to just read rowids. */ + /* We're going to just read rowids. */ head->file->extra(HA_EXTRA_KEYREAD); /* Make innodb retrieve all PK member fields, so * ha_innobase::position (which uses them) call works. - * we filter out rows retrieved by CCPK. + * We can filter out rows that will be retrieved by clustered PK. (This also creates a deficiency - it is possible that we will retrieve - parts of key that are not used by current query at all) + parts of key that are not used by current query at all.) 
*/ head->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); @@ -3170,22 +3358,15 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique() if (result) { - /* - table read error (including HA_ERR_END_OF_FILE on last quick select - in index_merge) - */ if (result != HA_ERR_END_OF_FILE) - { DBUG_RETURN(result); - } - else - break; + break; } if (thd->killed) DBUG_RETURN(1); - /* skip row if it will be retrieved by clustered covering PK scan */ + /* skip row if it will be retrieved by clustered PK scan */ if (pk_quick_select && pk_quick_select->row_in_ranges()) continue; @@ -3207,14 +3388,16 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique() DBUG_RETURN(result); } + /* Get next row for index_merge. NOTES - The rows are read from - 1. rowids stored in Unique. - 2. QUICK_RANGE_SELECT with clustered primary key (if any). - the sets of rows retrieved in 1) and 2) are guaranteed to be disjoint. + The rows are read from + 1. rowids stored in Unique. + 2. QUICK_RANGE_SELECT with clustered primary key (if any). + The sets of rows retrieved in 1) and 2) are guaranteed to be disjoint. */ + int QUICK_INDEX_MERGE_SELECT::get_next() { int result; @@ -3228,8 +3411,8 @@ int QUICK_INDEX_MERGE_SELECT::get_next() if (result == -1) { result= HA_ERR_END_OF_FILE; - /* All rows from Unique have been retrieved, do a CCPK scan */ end_read_record(&read_record); + /* All rows from Unique have been retrieved, do a clustered PK scan */ if(pk_quick_select) { doing_pk_scan= true; @@ -3275,8 +3458,9 @@ int QUICK_RANGE_SELECT::get_next() if (!cur_range) range= *(cur_range= (QUICK_RANGE**)ranges.buffer); else - range= (cur_range == ((QUICK_RANGE**)ranges.buffer + ranges.elements - 1))? - NULL: *(++cur_range); + range= + (cur_range == ((QUICK_RANGE**)ranges.buffer + ranges.elements - 1))? 
+ NULL: *(++cur_range); if (!range) DBUG_RETURN(HA_ERR_END_OF_FILE); // All ranges used @@ -3371,16 +3555,17 @@ int QUICK_RANGE_SELECT::cmp_next(QUICK_RANGE *range_arg) /* Check if current row will be retrieved by this QUICK_RANGE_SELECT - (this is used to filter out CCPK scan rows in index_merge). NOTES It is assumed that currently a scan is being done on another index which reads all necessary parts of the index that is scanned by this quick select. - The implementation does a binary search on sorted array of disjoint ranges, without taking size of range into account. + This function is used to filter out clustered PK scan rows in + index_merge quick select. + RETURN true if current row will be retrieved by this quick select false if not diff --git a/sql/opt_range.h b/sql/opt_range.h index 35a0cb5df88..7c2981795a2 100644 --- a/sql/opt_range.h +++ b/sql/opt_range.h @@ -118,11 +118,13 @@ public: protected: friend void print_quick_sel_range(QUICK_RANGE_SELECT *quick, const key_map* needed_reg); - friend QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, - struct st_table_ref *ref); + friend + QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table, + struct st_table_ref *ref); friend bool get_quick_keys(struct st_qsel_param *param, QUICK_RANGE_SELECT *quick,KEY_PART *key, - SEL_ARG *key_tree,char *min_key,uint min_key_flag, + SEL_ARG *key_tree, + char *min_key, uint min_key_flag, char *max_key, uint max_key_flag); friend QUICK_RANGE_SELECT *get_quick_select(struct st_qsel_param*,uint idx, SEL_ARG *key_tree, @@ -160,58 +162,62 @@ public: /* -QUICK_INDEX_MERGE_SELECT - index_merge acces method quick select. + QUICK_INDEX_MERGE_SELECT - index_merge access method quick select. 
- QUICK_INDEX_MERGE_SELECT uses - * QUICK_RANGE_SELECTs to get rows - * Unique class to remove duplicate rows + QUICK_INDEX_MERGE_SELECT uses + * QUICK_RANGE_SELECTs to get rows + * Unique class to remove duplicate rows -INDEX MERGE OPTIMIZER - Current implementation doesn't detect all cases where index_merge could be - used, in particular: - * index_merge will never be used if range scan is possible (even if range - scan is more expensive) + INDEX MERGE OPTIMIZER + Current implementation doesn't detect all cases where index_merge could + be used, in particular: + * index_merge will never be used if range scan is possible (even if + range scan is more expensive) - * index_merge+'using index' is not supported (this the consequence of the - above restriction) + * index_merge+'using index' is not supported (this the consequence of + the above restriction) - * If WHERE part contains complex nested AND and OR conditions, some ways to - retrieve rows using index_merge will not be considered. The choice of - read plan may depend on the order of conjuncts/disjuncts in WHERE part of - the query, see comments near SEL_IMERGE::or_sel_tree_with_checks and - imerge_list_or_list function for details. + * If WHERE part contains complex nested AND and OR conditions, some ways + to retrieve rows using index_merge will not be considered. The choice + of read plan may depend on the order of conjuncts/disjuncts in WHERE + part of the query, see comments near imerge_list_or_list and + SEL_IMERGE::or_sel_tree_with_checks functions for details. - * there is no "index_merge_ref" method (but index_merge on non-first table - in join is possible with 'range checked for each record'). + * There is no "index_merge_ref" method (but index_merge on non-first + table in join is possible with 'range checked for each record'). - See comments around SEL_IMERGE class and test_quick_select for more details. + See comments around SEL_IMERGE class and test_quick_select for more + details. 
-ROW RETRIEVAL ALGORITHM + ROW RETRIEVAL ALGORITHM - index_merge uses Unique class for duplicates removal. Index merge takes - advantage of clustered covering primary key (CCPK) if the table has one. - The algorithm is as follows: + index_merge uses Unique class for duplicates removal. index_merge takes + advantage of Clustered Primary Key (CPK) if the table has one. + The index_merge algorithm consists of two phases: - prepare() //implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique - { - activate 'index only'; - while(retrieve next row for non-CCPK scan) + Phase 1 (implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique): + prepare() { - if (there is a CCPK scan and row will be retrieved by it) - skip this row; - else - put rowid into Unique; + activate 'index only'; + while(retrieve next row for non-CPK scan) + { + if (there is a CPK scan and row will be retrieved by it) + skip this row; + else + put its rowid into Unique; + } + deactivate 'index only'; } - deactivate 'index only'; - } - - fetch() //implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next calls - { - retrieve all rows from row pointers stored in Unique; - free Unique; - retrieve all rows for CCPK scan; - } + + Phase 2 (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next + calls): + fetch() + { + retrieve all rows from row pointers stored in Unique; + free Unique; + retrieve all rows for CPK scan; + } */ class QUICK_INDEX_MERGE_SELECT : public QUICK_SELECT_I @@ -239,10 +245,10 @@ public: /* last element in quick_selects list */ QUICK_RANGE_SELECT* last_quick_select; - /* quick select that uses Covering Clustered Primary Key (NULL if none) */ + /* quick select that uses clustered primary key (NULL if none) */ QUICK_RANGE_SELECT* pk_quick_select; - /* true if this select is currently doing a CCPK scan */ + /* true if this select is currently doing a clustered PK scan */ bool doing_pk_scan; Unique *unique; diff --git a/sql/records.cc b/sql/records.cc index f8fbfe62187..b29b113a1bc 
100644 --- a/sql/records.cc +++ b/sql/records.cc @@ -98,7 +98,6 @@ void init_read_record(READ_RECORD *info,THD *thd, TABLE *table, } } else if (select && select->quick) - //&& (select->quick->get_type() != QUICK_SELECT_I::QS_TYPE_INDEX_MERGE)) { DBUG_PRINT("info",("using rr_quick")); info->read_record=rr_quick; diff --git a/sql/sql_class.h b/sql/sql_class.h index 8263789a2a2..76d2eae1bb5 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -1233,7 +1233,8 @@ public: } bool get(TABLE *table); - + static double get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size, + ulong max_in_memory_size); friend int unique_write_to_file(gptr key, element_count count, Unique *unique); friend int unique_write_to_ptrs(gptr key, element_count count, Unique *unique); }; diff --git a/sql/uniques.cc b/sql/uniques.cc index 02146426782..48233f29bee 100644 --- a/sql/uniques.cc +++ b/sql/uniques.cc @@ -63,12 +63,194 @@ Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg, comp_func_fixed_arg); /* If the following fail's the next add will also fail */ my_init_dynamic_array(&file_ptrs, sizeof(BUFFPEK), 16, 16); + /* + If you change the following, change it in get_max_elements function, too. + */ max_elements= max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+size); open_cached_file(&file, mysql_tmpdir,TEMP_PREFIX, DISK_BUFFER_SIZE, MYF(MY_WME)); } +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#define M_E (exp(1)) + +inline double log2_n_fact(double x) +{ + return (2 * ( ((x)+1) * log(((x)+1)/M_E) + log(2*M_PI*((x)+1))/2 ) / log(2)); +} + +/* + Calculate cost of merge_buffers call. + + NOTE + See comment near Unique::get_use_cost for cost formula derivation. 
+*/ +static double get_merge_buffers_cost(uint* buff_sizes, uint elem_size, + int last, int f,int t) +{ + uint sum= 0; + for (int i=f; i <= t; i++) + sum+= buff_sizes[i]; + buff_sizes[last]= sum; + + int n_buffers= t - f + 1; + double buf_length= sum*elem_size; + + return (((double)buf_length/(n_buffers+1)) / IO_SIZE) * 2 * n_buffers + + buf_length * log(n_buffers) / (TIME_FOR_COMPARE_ROWID * log(2.0)); +} + +/* + Calculate cost of merging buffers into one in Unique::get, i.e. calculate + how long (in terms of disk seeks) the two call + merge_many_buffs(...); + merge_buffers(...); + will take. + + SYNOPSIS + get_merge_many_buffs_cost() + alloc memory pool to use + maxbuffer # of full buffers. + max_n_elems # of elements in first maxbuffer buffers. + last_n_elems # of elements in last buffer. + elem_size size of buffer element. + + NOTES + It is assumed that maxbuffer+1 buffers are merged, first maxbuffer buffers + contain max_n_elems each, last buffer contains last_n_elems elements. + + The current implementation does a dumb simulation of merge_many_buffs + actions. + + RETURN + >=0 Cost of merge in disk seeks. + <0 Out of memory. 
+*/
+static double get_merge_many_buffs_cost(MEM_ROOT *alloc,
+                                        uint maxbuffer, uint max_n_elems,
+                                        uint last_n_elems, int elem_size)
+{
+  register int i;
+  double total_cost= 0.0;
+  int lastbuff;
+  uint* buff_sizes;
+
+  if (!(buff_sizes= (uint*)alloc_root(alloc, sizeof(uint) * (maxbuffer + 1))))
+    return -1.0;
+  for(i = 0; i < (int)maxbuffer; i++)
+    buff_sizes[i]= max_n_elems;
+
+  buff_sizes[maxbuffer]= last_n_elems;
+
+  if (maxbuffer >= MERGEBUFF2)
+  {
+    /* Simulate merge_many_buff */
+    while (maxbuffer >= MERGEBUFF2)
+    {
+      lastbuff=0;
+      for (i = 0; i <= (int) maxbuffer - MERGEBUFF*3/2; i += MERGEBUFF)
+        total_cost += get_merge_buffers_cost(buff_sizes, elem_size,
+                                             lastbuff++, i, i+MERGEBUFF-1);
+
+      total_cost += get_merge_buffers_cost(buff_sizes, elem_size,
+                                           lastbuff++, i, maxbuffer);
+      maxbuffer= (uint)lastbuff-1;
+    }
+  }
+
+  /* Simulate final merge_buff call. */
+  total_cost += get_merge_buffers_cost(buff_sizes, elem_size, 0, 0,
+                                       maxbuffer);
+  return total_cost;
+}
+
+
+/*
+  Calculate cost of using Unique for processing nkeys elements of size
+  key_size using max_in_memory_size memory.
+
+  RETURN
+    Use cost as # of disk seeks.
+
+  NOTES
+    cost(using_unique) =
+      cost(create_trees) +  (see #1)
+      cost(merge) +         (see #2)
+      cost(read_result)     (see #3)
+
+  1. Cost of trees creation
+     For each Unique::put operation there will be 2*log2(n+1) elements
+     comparisons, where n runs from 1 to tree_size (we assume that all added
+     elements are different). Together this gives:
+
+     n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!) =
+
+      = 2*ln((N+1)!) / ln(2) = {using Stirling formula} =
+
+      = 2*( (N+1)*ln((N+1)/e) + (1/2)*ln(2*pi*(N+1)) ) / ln(2).
+
+     then cost(tree_creation) = n_compares*ROWID_COMPARE_COST;
+
+     Total cost of creating trees:
+       (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
+
+  2. Cost of merging.
+     If only one tree is created by Unique no merging will be necessary.
+     Otherwise, we model execution of merge_many_buff function and count
+     # of merges. (The reason behind this is that number of buffers is small,
+     while size of buffers is big and we don't want to lose precision with
+     O(x)-style formula)
+
+  3. If only one tree is created by Unique no disk I/O will happen.
+     Otherwise, ceil(key_len*n_keys/IO_SIZE) disk seeks are necessary. We
+     assume these will be random seeks.
+*/
+
+double Unique::get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size,
+                            ulong max_in_memory_size)
+{
+  ulong max_elements_in_tree;
+  ulong last_tree_elems;
+  int n_full_trees; /* number of trees in unique - 1 */
+  double result;
+
+  max_elements_in_tree= max_in_memory_size /
+                        ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size);
+  n_full_trees=    nkeys / max_elements_in_tree;
+  last_tree_elems= nkeys % max_elements_in_tree;
+
+  /* Calculate cost of creating trees */
+  result= log2_n_fact(last_tree_elems);
+  if (n_full_trees)
+    result+= n_full_trees * log2_n_fact(max_elements_in_tree);
+  result /= TIME_FOR_COMPARE_ROWID;
+
+  /* Calculate cost of merging */
+  if (!n_full_trees)
+    return result;
+
+  /* There is more than one tree and merging is necessary. */
+  /* Add cost of writing all trees to disk. */
+  result += n_full_trees * ceil(key_size*max_elements_in_tree / IO_SIZE);
+  result += ceil(key_size*last_tree_elems / IO_SIZE);
+
+  /* Cost of merge */
+  result += get_merge_many_buffs_cost(alloc, n_full_trees,
+                                      max_elements_in_tree,
+                                      last_tree_elems, key_size);
+  /*
+    Add cost of reading the resulting sequence, assuming there were no
+    duplicate elements.
+  */
+  result += ceil((double)key_size*nkeys/IO_SIZE);
+
+  return result;
+}
+
 Unique::~Unique()
 {
   close_cached_file(&file);