Revert these two changes (wrong push) :

MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered% MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value - Made a number of fixes in table_cond_selectivity() so that it returns correct selectivity estimates. - Added comments in related code. Better comments
2025-01-16 20:12:31 +01:00 · 2014-04-28 21:49:39 +04:00 · 2014-04-28 21:49:39 +04:00 · 182f7d76ee
commit 182f7d76ee
parent 46d1b84597
4 changed files with 14 additions and 346 deletions
--- a/mysql-test/r/selectivity_no_engine.result
+++ b/mysql-test/r/selectivity_no_engine.result
@ -139,118 +139,6 @@ Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
 drop table t2, t1;
 # 
 # MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
 # 
 create table t0(a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1(key1 int, col1 int, key(key1));
 insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
 set histogram_size=100;
 set use_stat_tables='preferably';
 set optimizer_use_condition_selectivity=4;
 analyze table t1 persistent for all;
 Table	Op	Msg_type	Msg_text
 test.t1	analyze	status	Engine-independent statistics collected
 test.t1	analyze	status	Table is already up to date
 # 10% is ok
 explain extended select * from t1 where col1=2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` = 2)
 # Must show 100%, not 10%
 explain extended select * from t1 where key1=2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t1	ref	key1	key1	5	const	98	100.00	
 Warnings:
 Note	1003	select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`key1` = 2)
 drop table t0, t1;
 # 
 # MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
 # 
 create table t1(a int);
 insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t2(a int);
 insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
 create table t3 as select * from t2;
 set histogram_size=100;
 set use_stat_tables='preferably';
 set optimizer_use_condition_selectivity=4;
 analyze table t2 persistent for all;
 Table	Op	Msg_type	Msg_text
 test.t2	analyze	status	Engine-independent statistics collected
 test.t2	analyze	status	OK
 analyze table t3 persistent for all;
 Table	Op	Msg_type	Msg_text
 test.t3	analyze	status	Engine-independent statistics collected
 test.t3	analyze	status	OK
 explain extended select * from t2 A where A.a < 40;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
 Warnings:
 Note	1003	select `test`.`A`.`a` AS `a` from `test`.`t2` `A` where (`test`.`A`.`a` < 40)
 explain extended select * from t3 B where B.a < 100;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where
 Warnings:
 Note	1003	select `test`.`B`.`a` AS `a` from `test`.`t3` `B` where (`test`.`B`.`a` < 100)
 explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
 1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where; Using join buffer (flat, BNL join)
 Warnings:
 Note	1003	select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`A`.`a` < 40) and (`test`.`B`.`a` < 100))
 explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
 1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where; Using join buffer (flat, BNL join)
 Warnings:
 Note	1003	select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`B`.`a` = `test`.`A`.`a`) and (`test`.`A`.`a` < 40) and (`test`.`A`.`a` < 100))
 drop table t1, t2, t3;
 select 1;
 1
 1
 # 
 # MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
 # 
 create table t0(a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1 (
 kp1 int, kp2 int, 
 filler1 char(100),
 filler2 char(100),
 key(kp1, kp2)
 );
 insert into t1 
 select 
 A.a,
 B.a,
 'filler-data-1',
 'filler-data-2'
 from t0 A, t0 B, t0 C;
 set histogram_size=100;
 set use_stat_tables='preferably';
 set optimizer_use_condition_selectivity=4;
 analyze table t1 persistent for all;
 Table	Op	Msg_type	Msg_text
 test.t1	analyze	status	Engine-independent statistics collected
 test.t1	analyze	status	Table is already up to date
 # NOTE: 10*100%, 10*100% rows is ok
 explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
 1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,func	10	100.00	Using index condition
 Warnings:
 Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = (`test`.`t0`.`a` + 1)))
 # NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
 explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
 1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,const	10	100.00	
 Warnings:
 Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = 4))
 drop table t0, t1;
 # 
 # End of the test file
 # 
 set use_stat_tables= @save_use_stat_tables;
--- a/mysql-test/t/selectivity_no_engine.test
+++ b/mysql-test/t/selectivity_no_engine.test
@ -101,78 +101,6 @@ analyze table t2 persistent for all;
 explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
 drop table t2, t1;
 --echo # 
 --echo # MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
 --echo # 
 create table t0(a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1(key1 int, col1 int, key(key1));
 insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
 set histogram_size=100;
 set use_stat_tables='preferably';
 set optimizer_use_condition_selectivity=4;
 analyze table t1 persistent for all;
 --echo # 10% is ok
 explain extended select * from t1 where col1=2;
 --echo # Must show 100%, not 10%
 explain extended select * from t1 where key1=2;
 drop table t0, t1;
 ##--disable_parsing
 --echo # 
 --echo # MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
 --echo # 
 create table t1(a int);
 insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t2(a int);
 insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
 create table t3 as select * from t2;
 set histogram_size=100;
 set use_stat_tables='preferably';
 set optimizer_use_condition_selectivity=4;
 analyze table t2 persistent for all;
 analyze table t3 persistent for all;
 explain extended select * from t2 A where A.a < 40;
 explain extended select * from t3 B where B.a < 100;
 explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
 explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
 drop table t1, t2, t3;
 ## --enable_parsing
 select 1;
 ##--disable_parsing
 --echo # 
 --echo # MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
 --echo # 
 create table t0(a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1 (
  kp1 int, kp2 int, 
  filler1 char(100),
  filler2 char(100),
  key(kp1, kp2)
 );
 insert into t1 
 select 
  A.a,
  B.a,
  'filler-data-1',
  'filler-data-2'
 from t0 A, t0 B, t0 C;
 set histogram_size=100;
 set use_stat_tables='preferably';
 set optimizer_use_condition_selectivity=4;
 analyze table t1 persistent for all;
 --echo # NOTE: 10*100%, 10*100% rows is ok
 explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
 --echo # NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
 explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
 drop table t0, t1;
 ##--enable_parsing
 --echo # 
 --echo # End of the test file
 --echo # 
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@ -3406,10 +3406,6 @@ double records_in_column_ranges(PARAM *param, uint idx,
       selectivity (this is used for conditions like "column LIKE '%val%'" 
       where approaches #1 and #2 do not provide selectivity data).
  SEE ALSO
     table_cond_selectivity()
     matching_candidates_in_table()
  NOTE
    Currently the selectivities of range conditions over different columns are
    considered independent. 
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@ -7135,15 +7135,6 @@ double JOIN::get_examined_rows()
  @param rem_tables The bitmap of tables to be joined later
  @param keyparts   The number of key parts to used when joining s
  @param ref_keyuse_steps Array of references to keyuses employed to join s 
  @detail
  Basic idea: if the WHERE clause has an equality in form
    tbl.column= ...
  then this condition will have selectivity 1/#distinct_values(tbl.column),
  unless the equality was used by ref access. If the equality is used by ref
  access, we only get rows that satisfy it, and so its selectivity=1.
 */
 static 
@ -7174,18 +7165,8 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
      the current value of sel by this selectivity
    */
    table_map used_tables= item_equal->used_tables();
    /* 
      Equalities that do not include fields in this table do not matter 
    */
    if (!(used_tables & table_bit))
      continue;
    /* 
      Equalities that include a constant are taken into account in
      table->cond_selectivity. Selectivity from there is taken into account 
      in matching_candidates_in_table() and/or table_cond_selectivity(). 
    */
    if (item_equal->get_const())
      continue;
@ -7197,23 +7178,14 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
      Field *fld= fi.get_curr_field();
      if (fld->table->map != table_bit)
        continue;
      if (pos->key == 0)
      {
        /* 
          No ref access used (and no const in the multi-equality). We will
          need to adjust the selectivity.
        */
        adjust_sel= TRUE;
      }
      else
      {
        /* Ok, [eq]ref access is used */
        uint i;
        KEYUSE *keyuse= pos->key;
        uint key= keyuse->key;
        /* Find which keypart participates in the equality */
        for (i= 0; i < keyparts; i++)
 	{
          uint fldno;
@ -7224,7 +7196,6 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
          if (fld->field_index == fldno)
            break;
        }
        if (i == keyparts)
 	{
          /* 
@ -7250,7 +7221,6 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
        }          
      }
    }
    if (adjust_sel)
    {
      /* 
@ -7296,10 +7266,6 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
    For other access methods, we need to calculate selectivity of the whole
    condition, "COND(this_table) AND COND(this_table, previous_tables)".
  @seealso
    calculate_cond_selectivity_for_table()
    matching_candidates_in_table()
  @retval
    selectivity of the conditions imposed on the rows of s
@ -7310,90 +7276,34 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
                              table_map rem_tables)
 {
  uint16 ref_keyuse_steps[MAX_REF_PARTS - 1];
  Field *field;
  TABLE *table= s->table;
-  double sel;
+  MY_BITMAP *read_set= table->read_set;
  double sel= s->table->cond_selectivity;
  POSITION *pos= &join->positions[idx];
  uint keyparts= 0;
  uint found_part_ref_or_null= 0;
  if (pos->key != 0)
  {
    sel= s->table->cond_selectivity;
    /* 
-      A ref access or hash join is used for this table. ref access is created
+      A ref access or hash join is used for this table.
      from
-        tbl.keypart1=expr1 AND tbl.keypart2=expr2 AND ...
+      It could have some parts with "t.key_part=const". Using ref access
      means that we will only get records where the condition holds, so we
      should remove its selectivity from the condition selectivity.
      and it will only return rows for which this condition is satisified.
      Suppose, certain expr{i} is a constant. Since ref access only returns
      rows that satisfy
         tbl.keypart{i}=const       (*)
      then selectivity of this equality should not be counted in return value 
      of this function. This function uses the value of 
         table->cond_selectivity=selectivity(COND(tbl)) (**)
      as a starting point. This value includes selectivity of equality (*). We
      should somehow discount it. 
      Looking at calculate_cond_selectivity_for_table(), one can see that that
      the value is not necessarily a direct multiplicand in 
      table->cond_selectivity
      There are three possible ways to discount
      1. There is a potential range access on t.keypart{i}=const. 
         (an important special case: the used ref access has a const prefix for
          which a range estimate is available)
      2. The field has a histogram. field[x]->cond_selectivity has the data.
      3. Use index stats on this index:
         rec_per_key[key_part+1]/rec_per_key[key_part]
      (TODO: more details about the "t.key=othertable.col" case)
    */
    KEYUSE *keyuse= pos->key;
    KEYUSE *prev_ref_keyuse= keyuse;
    uint key= keyuse->key;
-    
+    do
    /*
      Check if we have a prefix of key=const that matches a quick select.
    */
    if (!is_hash_join_key_no(key))
    {
      table_map quick_key_map= (table_map(1) << table->quick_key_parts[key]) - 1;
      if (table->quick_rows[key] && 
          !(quick_key_map & ~table->const_key_parts[key]))
      {
        /* 
          Ok, there is an equality for each of the key parts used by the
          quick select. This means, quick select's estimate can be reused to
          discount the selectivity of a prefix of a ref access.
        */
        for (; quick_key_map & 1 ; quick_key_map>>= 1)
        {
          while (keyuse->keypart == keyparts)
            keyuse++;
          keyparts++;
        }
        sel /= table->quick_rows[key] / table->stat_records();
      }
    }
    /*
      Go through the "keypart{N}=..." equalities and find those that were
      already taken into account in table->cond_selectivity.
    */
    while (keyuse->table == table && keyuse->key == key)
    {
      if (!(keyuse->used_tables & (rem_tables | table->map)))
      {
        if (are_tables_local(s, keyuse->val->used_tables()))
-	{                            /// ^^ why val->used_tables here but just
+	{
                                     ///       used_tables above?
          if (is_hash_join_key_no(key))
 	  {
            if (keyparts == keyuse->keypart)
@ -7402,39 +7312,22 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
          else
 	  {
            if (keyparts == keyuse->keypart &&
-                !((keyuse->val->used_tables()) & ~pos->ref_depend_map) &&
+                !(~(keyuse->val->used_tables()) & pos->ref_depend_map) &&
                !(found_part_ref_or_null & keyuse->optimize))
 	    {
              /* Found a KEYUSE object that will be used by ref access */
              keyparts++;
              found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ;
            }
          }
          if (keyparts > keyuse->keypart)
 	  {
            /* Ok this is the keyuse that will be used for ref access */
            uint fldno;
            if (is_hash_join_key_no(key))
 	      fldno= keyuse->keypart;
            else
              fldno= table->key_info[key].key_part[keyparts-1].fieldnr - 1;
            if (keyuse->val->const_item())
-            {              
+              sel*= table->field[fldno]->cond_selectivity; 
              // psergey: not multiply, divide instead.
              // before, we've had:
              // sel*= table->field[fldno]->cond_selectivity; 
              sel /= table->field[fldno]->cond_selectivity;
              /* 
               TODO: we could do better here:
                 1. cond_selectivity might be =1 (the default) because quick 
                    select on some index prevented us from analyzing 
                    histogram for this column.
                 2. we could get an estimate through this?
                     rec_per_key[key_part-1] / rec_per_key[key_part]
              */
            }
            if (keyparts > 1)
 	    {
              ref_keyuse_steps[keyparts-2]= keyuse - prev_ref_keyuse;
@ -7444,59 +7337,23 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
 	}
      }
      keyuse++;
-    }
+    } while (keyuse->table == table && keyuse->key == key);
  }
  else
  {
    /*
      The table is accessed with full table scan, or quick select.
-      Selectivity of COND(this_table) is already accounted for in 
+      Selectivity of COND(table) is already accounted for in 
      matching_candidates_in_table().
      For COND(this_table, previous_tables) we don't have any meaningful
      estimates.
    */
    sel= 1;
  }
-  /*
+  /* 
    Selectivity and multiple equalities. Consider an example:
      select * from t1, t2 where t1.col=t2.col and t2.col<5
    Suppose the join order is t1, t2. When equality propagation is used, we 
    get:
      t1:  t1.col<5
      t2:  t2.col<5  // not generated: AND t2.col=t1.col
    if we use ref access on table t2, we will not get records for which
    "t2.col<5"
      when we get to table t2, we will not get records that have "t2.col < 5"
     COND(t2) = "t2.col<5" 
      ## a variant with key:
      select * from t1, t2 where t1.col=t2.col and t2.col<5 and t2.key=t1.col2
    If the field f from the table is equal to a field from one the
    earlier joined tables then the selectivity of the range conditions
    over the field f must be discounted.
    Suppose, we're now looking at selectivity for table t2.
    - in case t2 uses full table scan (or quick select): all selectivity is
      already accounted for in matching_candidates_in_table().
    - in case t2 uses ref access
       = if the equality is used for ref access, we have already 
         discounted its selectivity above
         (However, we have not discounted selectivity of the induced
         equalities)
       = if the equality is not used for ref access, we should still count its
         selectivity.
  */ 
 #if 0
  for (Field **f_ptr=table->field ; (field= *f_ptr) ; f_ptr++)
  {
    if (!bitmap_is_set(read_set, field->field_index) ||
@ -7514,7 +7371,6 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
      }
    }
  }
 #endif 
  sel*= table_multi_eq_cond_selectivity(join, idx, s, rem_tables,
                                        keyparts, ref_keyuse_steps);