Use binary search to compute range selectivity

* it also adds an "explain select" statement to the test so that the fprintf calls
  can print the computed intervals to mysqld.1.err

Signed-off-by: Michael Okoko <okokomichaels@outlook.com>
This commit is contained in:
Michael Okoko 2021-08-16 10:09:56 +01:00 committed by Sergei Petrunia
parent c605285bb8
commit c129689ddc
4 changed files with 89 additions and 99 deletions

View file

@ -67,33 +67,11 @@ test t1 d 1 25 0.0000 8.0000 1.0000 10 JSON [
"21",
"23"
]
SELECT * FROM t1;
a b c d
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
8 8 8 8
9 9 9 9
10 10 10 10
11 11 11 11
12 12 12 12
13 13 13 13
14 14 14 14
15 15 15 15
16 16 16 16
17 17 17 17
18 18 18 18
19 19 19 19
20 20 20 20
21 21 21 21
22 22 22 22
23 23 23 23
24 24 24 24
25 25 25 25
explain extended select * from t1 where b between '20' and '70';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 25 10.00 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where `test`.`t1`.`b` between '20' and '70'
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
FLUSH TABLES;
SELECT * FROM t1;

View file

@ -28,7 +28,7 @@ set histogram_size=10;
ANALYZE TABLE t1 PERSISTENT FOR ALL;
SELECT * FROM mysql.column_stats WHERE table_name='t1';
SELECT * FROM t1;
explain extended select * from t1 where b between '20' and '70';
# We then test different valid JSON strings that are invalid histograms.
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';

View file

@ -1466,90 +1466,95 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
key_range *max_endp)
{
/*
  Summary: for each endpoint that is given (min_endp, max_endp), asks
  selection_in_interval() for a position and prints it to stderr, then
  returns a fixed 0.5.
  NOTE(review): the statements below read like a diff rendering that mixes
  the earlier linear walk over histogram_bounds with the binary-search based
  replacement - confirm which statements are live before editing.
*/
fprintf(stderr, "Histogram_json::range_selectivity_new\n");
/*
GSOC-TODO:
The code below is NOT what this function should have.
== WHAT THIS CODE DOES ==
At the moment it does a linear walk through histogram_bounds and compares
min_endp to each of histogram bucket's min and max.
ATTENTION: This is a demo of how key_cmp() is used to compare the values.
When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END,
it computes a position of min_endp within the bucket.
ATTENTION: calls to pos_in_interval_.... are a demo of how to compute
position of a value within a [min,max] range.
== WHAT THIS CODE SHOULD DO ==
* Use binary search to locate the range [MIN_BUCKET; MAX_BUCKET] - the
set of buckets that overlaps with the search interval {min_endp, max_endp}.
* If the search interval covers MIN_BUCKET only partially, compute a
position of min_endp within the bucket.
* The same for max_endp.
* Compute the final selectivity and return it.
*/
std::string prev_s;
bool have_prev_s=false;
for (auto &s : histogram_bounds)
double min_sel, max_sel;
if (min_endp)
{
if (!have_prev_s)
{
prev_s = s;
have_prev_s= true;
continue;
}
const uchar *min_key= min_endp->key;
// TODO: also, properly handle SQL NULLs.
// in this test patch, we just assume the values are not SQL NULLs.
// Nullable field: step over one leading byte of the key
// (presumably the NULL indicator - TODO confirm).
if (field->real_maybe_null())
min_key++;
// It's a test code, so we only process min_endp.
if (min_endp)
{
const uchar *min_key= min_endp->key;
// TODO: also, properly handle SQL NULLs.
// in this test patch, we just assume the values are not SQL NULLs.
if (field->real_maybe_null())
min_key++;
// res1 < 0 together with res2 < 0 means prev_s < min_key < s (see the check below).
int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
const char *str1="<";
if (res1>0) str1=">";
if (res1==0) str1="=";
int res2= field->key_cmp(min_key, (uchar*)s.data());
const char *str2="<";
if (res2>0) str2=">";
if (res2==0) str2="=";
fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
if (res1<0 && res2 < 0)
{
double sel;
if (field->pos_through_val_str())
sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(),
(uchar*)s.data(), (uchar*)min_key);
else
sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(),
(uchar*)s.data(), (uchar*)min_key);
fprintf(stderr, " pos_in_interval=%g\n", sel);
}
prev_s= s;
}
min_sel= selection_in_interval(field, min_key);
fprintf(stderr, "min pos_in_interval(min_endp)=%g\n", min_sel);
}
if (max_endp)
{
const uchar *max_key= max_endp->key;
// Same one-byte skip as for min_key when the field is nullable.
if (field->real_maybe_null())
max_key++;
max_sel= selection_in_interval(field, max_key);
// NOTE(review): the label below says min_endp, but the value printed is the
// one computed from max_endp.
fprintf(stderr, "max pos_in_interval(min_endp)=%g\n", max_sel);
}
fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
// Placeholder result: min_sel and max_sel are only logged above and do not
// influence the returned value.
return 0.5;
}
/*
  Compute the position of endpoint inside the histogram bucket picked by
  find_bucket().

  @param field     Field used to interpret and compare the key bytes
  @param endpoint  value whose position is wanted

  @return the value produced by pos_in_interval_through_strxfrm() or
          pos_in_interval_through_val_real() for the interval
          [histogram_bounds[i], histogram_bounds[i + 1]], where i is the
          index returned by find_bucket(); 0 when find_bucket() returns -1.
*/
double Histogram_json::selection_in_interval(Field *field, const uchar* endpoint)
{
  const int start_idx= find_bucket(field, endpoint);
  if (start_idx == -1)
    return 0;

  const int end_idx= start_idx + 1;
  std::string bucket_start;
  std::string bucket_end;
  bucket_start= histogram_bounds[start_idx];
  // NOTE(review): when start_idx is the last bound there is no following
  // bound; bucket_end then stays empty and its data() is still handed to the
  // pos_in_interval_* helper below. Confirm that this is intended.
  if (end_idx < (int)histogram_bounds.size())
    bucket_end= histogram_bounds[end_idx];

  uchar *start_key= (uchar *) bucket_start.data();
  uchar *end_key= (uchar *) bucket_end.data();
  uchar *probe_key= (uchar *) endpoint;

  if (field->pos_through_val_str())
    return pos_in_interval_through_strxfrm(field, start_key, end_key,
                                           probe_key);
  return pos_in_interval_through_val_real(field, start_key, end_key,
                                          probe_key);
}
/*
  Store the histogram text returned by get_values() into the given field.

  The length passed to store() is taken with strlen(), so the buffer is
  treated as a NUL-terminated string; it is stored with my_charset_bin.
*/
void Histogram_json::serialize(Field *field)
{
  char *histogram_text= (char*)get_values();
  field->store(histogram_text, strlen(histogram_text), &my_charset_bin);
}
/*
  Binary-search histogram_bounds for the bucket that starts at or before
  endpoint.

  Assumes histogram_bounds is ordered ascending with respect to
  field->key_cmp() - TODO confirm against the code that fills it.

  @param field     Field whose key_cmp() compares a bound's bytes with endpoint
  @param endpoint  value to locate, in the format key_cmp() accepts

  @return index of the last bound that compares less than or equal to
          endpoint; -1 when histogram_bounds is empty or every bound compares
          greater than endpoint.
*/
int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
{
  int low= 0;
  int high= (int)histogram_bounds.size() - 1;
  int min_bucket_index= -1;

  while (low <= high)
  {
    // Upper middle (rounds up), computed without forming low + high.
    const int mid= low + (high - low + 1) / 2;
    // Bind by reference: no need to copy the bound on every probe.
    const std::string &mid_val= histogram_bounds[mid];
    const int res= field->key_cmp((uchar*) mid_val.data(), endpoint);

    if (res > 0)
    {
      // bound > endpoint: it can never be the answer, so do not record it;
      // keep looking to the left.
      high= mid - 1;
      continue;
    }

    // bound <= endpoint: best candidate seen so far.
    min_bucket_index= mid;
    if (res == 0)
      break;                      // todo: endpoint is on a bucket boundary
    low= mid + 1;
  }
  return min_bucket_index;
}
/*
An object of the class Index_stat is created to read statistical
data on tables from the statistical table table_stat, to update

View file

@ -419,6 +419,13 @@ public:
*/
double range_selectivity_new(Field *field, key_range *min_endp,
key_range *max_endp) override;
/*
* Returns the index of the biggest histogram value that is smaller than endpoint
*/
int find_bucket(Field *field, const uchar *endpoint);
double selection_in_interval(Field *field, const uchar* endpoint);
};
class Columns_statistics;