Use binary search to compute range selectivity

* It also adds an "explain select" statement to the test so that the fprintf calls can print the computed intervals to mysqld.1.err.

Signed-off-by: Michael Okoko <okokomichaels@outlook.com>
2025-01-16 12:02:42 +01:00 · 2021-08-16 10:09:56 +01:00 · 2021-08-16 10:09:56 +01:00 · c129689ddc
commit c129689ddc
parent c605285bb8
4 changed files with 89 additions and 99 deletions
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@ -67,33 +67,11 @@ test	t1	d	1	25	0.0000	8.0000	1.0000	10	JSON	[
  "21",
  "23"
 ]
-SELECT * FROM t1;
-a	b	c	d
-1	1	1	1
-2	2	2	2
-3	3	3	3
-4	4	4	4
-5	5	5	5
-6	6	6	6
-7	7	7	7
-8	8	8	8
-9	9	9	9
-10	10	10	10
-11	11	11	11
-12	12	12	12
-13	13	13	13
-14	14	14	14
-15	15	15	15
-16	16	16	16
-17	17	17	17
-18	18	18	18
-19	19	19	19
-20	20	20	20
-21	21	21	21
-22	22	22	22
-23	23	23	23
-24	24	24	24
-25	25	25	25
+explain extended select * from t1 where b between '20' and '70';
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	25	10.00	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where `test`.`t1`.`b` between '20' and '70'
 UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
 FLUSH TABLES;
 SELECT * FROM t1;
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@ -28,7 +28,7 @@ set histogram_size=10;

 ANALYZE TABLE t1 PERSISTENT FOR ALL;
 SELECT * FROM mysql.column_stats WHERE table_name='t1';
-SELECT * FROM t1;
+explain extended select * from t1 where b between '20' and '70';

 # We then test different valid JSON strings that are invalid histograms.
 UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@ -1466,90 +1466,95 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
                                             key_range *max_endp)
 {
  fprintf(stderr, "Histogram_json::range_selectivity_new\n");
-
-
-  /*
-    GSOC-TODO: 
-    The code below is NOT what this function have. 
-
-    == WHAT THIS CODE DOES ==
-    At the moment it does a linear walk through histogram_bounds and compares 
-    min_endp to each of histogram bucket's min and max. 
-    ATTENTION:  This is a demo of how key_cmp() is used to compare the values.
-    
-    When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END, 
-    it computes a position of min_endp within the bucket.
-    ATTENTION: calls to pos_in_interval_.... are a demo of how to compute 
-    position of a value within a [min,max] range.
-
-    == WHAT THIS CODE SHOULD DO ==
-    * Use binary search to locate the range  [MIN_BUCKET; MAX_BUCKET] - the
-      set of buckets that overlaps with the search interval {min_endp, max_endp}.
-
-    * If the search interval covers MIN_BUCKET only partially, compute a
-      position of min_endp within the bucket.
-
-    * The same for max_endp.
-
-    * Compute the final selectivity and return it.
-  */
-  std::string prev_s;
-  bool have_prev_s=false;
-  for (auto &s : histogram_bounds)
+  double min_sel, max_sel;
+  if (min_endp)
  {
-    if (!have_prev_s)
-    {
-      prev_s = s;
-      have_prev_s= true;
-      continue;
-    }
+    const uchar *min_key= min_endp->key;
+    // TODO: also, properly handle SQL NULLs.
+    // in this test patch, we just assume the values are not SQL NULLs.
+    if (field->real_maybe_null())
+      min_key++;

-    // It's a test code, so we only process min_endp.
-    if (min_endp)
-    {
-      const uchar *min_key= min_endp->key;
-      // TODO: also, properly handle SQL NULLs.
-      // in this test patch, we just assume the values are not SQL NULLs.
-      if (field->real_maybe_null())
-        min_key++;
-
-      int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
-      const char *str1="<";
-      if (res1>0) str1=">";
-      if (res1==0) str1="=";
-
-      int res2= field->key_cmp(min_key, (uchar*)s.data());
-      const char *str2="<";
-      if (res2>0) str2=">";
-      if (res2==0) str2="=";
-      fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
-
-      if (res1<0 && res2 < 0)
-      {
-        double sel;
-        if (field->pos_through_val_str())
-          sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(), 
-                                               (uchar*)s.data(), (uchar*)min_key);
-        else
-          sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(), 
-                                                (uchar*)s.data(), (uchar*)min_key);
-
-        fprintf(stderr, "  pos_in_interval=%g\n", sel);
-      }
-
-      prev_s= s;
-    }
+    min_sel= selection_in_interval(field, min_key);
+    fprintf(stderr, "min pos_in_interval(min_endp)=%g\n", min_sel);
  }
+  if (max_endp)
+  {
+    const uchar *max_key= max_endp->key;
+    if (field->real_maybe_null())
+      max_key++;
+
+    max_sel= selection_in_interval(field, max_key);
+    fprintf(stderr, "max pos_in_interval(min_endp)=%g\n", max_sel);
+  }
+
  fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
  return 0.5;
 }

+/*
+  Compute the position of `endpoint` inside the histogram bucket that
+  find_bucket() locates for it.
+
+  The bucket is delimited by histogram_bounds[i] and histogram_bounds[i+1],
+  where i is the index returned by find_bucket().
+
+  @param field     Field used to compare and interpolate the key images
+  @param endpoint  Key image of the range endpoint
+
+  @return the value produced by pos_in_interval_through_strxfrm() or
+          pos_in_interval_through_val_real() for that bucket, or 0 when
+          there is no bucket with both bounds available.
+*/
+double Histogram_json::selection_in_interval(Field *field, const uchar* endpoint)
+{
+  const int min_bucket_idx= find_bucket(field, endpoint);
+
+  // find_bucket() found no bucket for this endpoint.
+  if (min_bucket_idx == -1)
+    return 0;
+
+  const int max_bucket_idx= min_bucket_idx + 1;
+
+  // NOTE(review): when the located bound is the last one there is no upper
+  // bound to interpolate against. An empty string is presumably not a valid
+  // key image for the pos_in_interval_* helpers, so report 0 like the
+  // "no bucket" case above. TODO: confirm the value wanted when min_endp or
+  // max_endp lies at or beyond the last histogram value.
+  if (max_bucket_idx >= (int) histogram_bounds.size())
+    return 0;
+
+  // Refer to the stored bounds directly instead of copying them.
+  const std::string &min_bucket= histogram_bounds[min_bucket_idx];
+  const std::string &max_bucket= histogram_bounds[max_bucket_idx];
+
+  if (field->pos_through_val_str())
+    return pos_in_interval_through_strxfrm(field, (uchar *) min_bucket.data(),
+                                           (uchar *) max_bucket.data(),
+                                           (uchar *) endpoint);
+
+  return pos_in_interval_through_val_real(field, (uchar *) min_bucket.data(),
+                                          (uchar *) max_bucket.data(),
+                                          (uchar *) endpoint);
+}
+
+/*
+  Write the buffer returned by get_values() into `field` through
+  Field::store(). The length is taken with strlen(), so the buffer is
+  treated as a NUL-terminated string, and it is stored with my_charset_bin.
+*/
 void Histogram_json::serialize(Field *field)
 {
   field->store((char*)get_values(), strlen((char*)get_values()),
                &my_charset_bin);
 }

+/*
+  Binary-search histogram_bounds for the bucket that starts at or before
+  `endpoint`.
+
+  The search assumes histogram_bounds is in ascending order according to
+  field->key_cmp().
+
+  @param field     Field whose key_cmp() compares a stored bound with endpoint
+  @param endpoint  Key image to locate
+
+  @return index of the last bound that is not greater than endpoint, or -1
+          if there is none (the histogram is empty, or endpoint sorts before
+          every bound). If a bound compares equal to endpoint, its index is
+          returned.
+*/
+int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
+{
+  int low= 0;
+  int high= (int) histogram_bounds.size() - 1;
+  int min_bucket_index= -1;
+
+  while (low <= high)
+  {
+    const int mid= low + (high - low) / 2;
+
+    // Compare the stored bound in place; no temporary copy is needed.
+    const int res= field->key_cmp((uchar*) histogram_bounds[mid].data(),
+                                  endpoint);
+    if (res == 0)
+      return mid;                 // endpoint is exactly on a bucket boundary
+
+    if (res < 0)
+    {
+      // This bound is below endpoint: it is a candidate answer. Keep it and
+      // continue to the right in case a closer bound exists.
+      min_bucket_index= mid;
+      low= mid + 1;
+    }
+    else
+    {
+      // This bound is above endpoint, so it must never be reported.
+      high= mid - 1;
+    }
+  }
+  return min_bucket_index;
+}
+
 /*
  An object of the class Index_stat is created to read statistical
  data on tables from the statistical table table_stat, to update
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@ -419,6 +419,13 @@ public:
  */
  double range_selectivity_new(Field *field, key_range *min_endp,
                                       key_range *max_endp) override;
+
+  /*
+   * Returns the index of the biggest histogram value that is smaller than endpoint
+   */
+  int find_bucket(Field *field, const uchar *endpoint);
+
+  /*
+   * Returns the position of endpoint within the histogram bucket located by
+   * find_bucket(), or 0 when find_bucket() returns -1.
+   */
+  double selection_in_interval(Field *field, const uchar* endpoint);
 };

 class Columns_statistics;