mariadb/sql/sql_statistics.h

626 lines
16 KiB
C
Raw Normal View History

/* Copyright 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#ifndef SQL_STATISTICS_H
#define SQL_STATISTICS_H
#include <vector>
2021-08-28 12:31:13 +03:00
#include <string>
/*
For COMPLEMENTARY_FOR_QUERIES and PREFERABLY_FOR_QUERIES they are
similar to the COMPLEMENTARY and PREFERABLY respectively except that
with these values we would not be collecting EITS for queries like
ANALYZE TABLE t1;
To collect EITS with these values, we have to use PERSISTENT FOR
analyze table t1 persistent for
columns (col1,col2...) index (idx1, idx2...)
or
analyze table t1 persistent for all
*/
/*
  Modes that control the use of the statistical tables.
  The enumerators are numbered consecutively starting from 0;
  get_use_stat_tables_mode() casts an integer setting to this type.
*/
typedef enum enum_use_stat_tables_mode
{
  NEVER= 0,
  COMPLEMENTARY= 1,
  PREFERABLY= 2,
  COMPLEMENTARY_FOR_QUERIES= 3,
  PREFERABLY_FOR_QUERIES= 4
} Use_stat_tables_mode;
/*
  Kinds of histograms. The enumerators are numbered consecutively
  starting from 0.
*/
typedef
enum enum_histogram_type
{
  SINGLE_PREC_HB,     /* binary histogram, one byte per stored value */
  DOUBLE_PREC_HB,     /* binary histogram, two bytes per stored value */
  JSON_HB,            /* NOTE(review): presumably a JSON histogram - confirm */
  INVALID_HISTOGRAM   /* not a valid histogram type */
} Histogram_type;
/* Identifiers of the three statistical tables, numbered from 0. */
enum enum_stat_tables
{
  TABLE_STAT= 0,
  COLUMN_STAT= 1,
  INDEX_STAT= 2
};
/*
These enumeration types comprise the dictionary of the three
statistical tables table_stat, column_stat and index_stat
as they are defined in ../scripts/mysql_system_tables.sql.
It would be nice if the declarations of these types were
generated automatically from the table definitions.
*/
/*
  Column positions in the table_stat table, numbered from 0.
  TABLE_STAT_N_FIELDS is the number of columns.
*/
enum enum_table_stat_col
{
  TABLE_STAT_DB_NAME= 0,
  TABLE_STAT_TABLE_NAME= 1,
  TABLE_STAT_CARDINALITY= 2,
  TABLE_STAT_N_FIELDS= 3
};
/*
  Column positions in the column_stat table, numbered from 0.
  COLUMN_STAT_N_FIELDS is the number of columns.
*/
enum enum_column_stat_col
{
  COLUMN_STAT_DB_NAME,
  COLUMN_STAT_TABLE_NAME,
  COLUMN_STAT_COLUMN_NAME,
  COLUMN_STAT_MIN_VALUE,
  COLUMN_STAT_MAX_VALUE,
  COLUMN_STAT_NULLS_RATIO,
  COLUMN_STAT_AVG_LENGTH,
  COLUMN_STAT_AVG_FREQUENCY,
  COLUMN_STAT_HIST_SIZE,
  COLUMN_STAT_HIST_TYPE,
  COLUMN_STAT_HISTOGRAM,
  COLUMN_STAT_N_FIELDS
};
/*
  Column positions in the index_stat table, numbered from 0.
  INDEX_STAT_N_FIELDS is the number of columns.
*/
enum enum_index_stat_col
{
  INDEX_STAT_DB_NAME= 0,
  INDEX_STAT_TABLE_NAME= 1,
  INDEX_STAT_INDEX_NAME= 2,
  INDEX_STAT_PREFIX_ARITY= 3,
  INDEX_STAT_AVG_FREQUENCY= 4,
  INDEX_STAT_N_FIELDS= 5
};
/*
  Return the value of thd->variables.use_stat_tables converted to
  Use_stat_tables_mode.
*/
inline
Use_stat_tables_mode get_use_stat_tables_mode(THD *thd)
{
  const Use_stat_tables_mode mode=
    (Use_stat_tables_mode) (thd->variables.use_stat_tables);
  return mode;
}
/*
  Tell whether the use_stat_tables mode of the connection is one of
  COMPLEMENTARY or PREFERABLY.

  @retval true   the mode is COMPLEMENTARY or PREFERABLY
  @retval false  any other mode
*/
inline
bool check_eits_collection_allowed(THD *thd)
{
  switch (get_use_stat_tables_mode(thd)) {
  case COMPLEMENTARY:
  case PREFERABLY:
    return true;
  default:
    return false;
  }
}
/*
  Tell whether the use_stat_tables mode of the connection is one of
  PREFERABLY or PREFERABLY_FOR_QUERIES.

  @retval true   the mode is PREFERABLY or PREFERABLY_FOR_QUERIES
  @retval false  any other mode
*/
inline
bool check_eits_preferred(THD *thd)
{
  switch (get_use_stat_tables_mode(thd)) {
  case PREFERABLY:
  case PREFERABLY_FOR_QUERIES:
    return true;
  default:
    return false;
  }
}
int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables);
MDEV-29693 ANALYZE TABLE still flushes table definition cache when engine-independent statistics is used This commits enables reloading of engine-independent statistics without flushing the table from table definition cache. This is achieved by allowing multiple version of the TABLE_STATISTICS_CB object and having independent pointers to it in TABLE and TABLE_SHARE. The TABLE_STATISTICS_CB object have reference pointers and are freed when no one is pointing to it anymore. TABLE's TABLE_STATISTICS_CB pointer is updated to use the TABLE_SHARE's pointer when read_statistics_for_tables() is called at the beginning of a query. Main changes: - read_statistics_for_table() will allocate an new TABLE_STATISTICS_CB object. - All get_stat_values() functions has a new parameter that tells where collected data should be stored. get_stat_values() are not using the table_field object anymore to store data. - All get_stat_values() functions returns 1 if they found any data in the statistics tables. Other things: - Fixed INSERT DELAYED to not read statistics tables. - Removed Statistics_state from TABLE_STATISTICS_CB as this is not needed anymore as wer are not changing TABLE_SHARE->stats_cb while calculating or loading statistics. - Store values used with store_from_statistical_minmax_field() in TABLE_STATISTICS_CB::mem_root. This allowed me to remove the function delete_stat_values_for_table_share(). - Field_blob::store_from_statistical_minmax_field() is implemented but is not normally used as we do not yet support EIS statistics for blobs. For example Field_blob::update_min() and Field_blob::update_max() are not implemented. Note that the function can be called if there is an concurrent "ALTER TABLE MODIFY field BLOB" running because of a bug in ALTER TABLE where it deletes entries from column_stats before it has an exclusive lock on the table. - Use result of field->val_str(&val) as a pointer to the result instead of val (safetly fix). 
- Allocate memory for collected statistics in THD::mem_root, not in in TABLE::mem_root. This could cause the TABLE object to grow if a ANALYZE TABLE was run many times on the same table. This was done in allocate_statistics_for_table(), create_min_max_statistical_fields_for_table() and create_min_max_statistical_fields_for_table_share(). - Store in TABLE_STATISTICS_CB::stats_available which statistics was found in the statistics tables. - Removed index_table from class Index_prefix_calc as it was not used. - Added TABLE_SHARE::LOCK_statistics to ensure we don't load EITS in parallel. First thread will load it, others will reuse the loaded data. - Eliminate read_histograms_for_table(). The loading happens within read_statistics_for_tables() if histograms are needed. One downside is that if we have read statistics without histograms before and someone requires histograms, we have to read all statistics again (once) from the statistics tables. A smaller downside is the need to call alloc_root() for each individual histogram. Before we could allocate all the space for histograms with a single alloc_root. - Fixed bug in MyISAM and Aria where they did not properly notice that table had changed after analyze table. This was not a problem before this patch as then the MyISAM and Aria tables where flushed as part of ANALYZE table which did hide this issue. - Fixed a bug in ANALYZE table where table->records could be seen as 0 in collect_statistics_for_table(). The effect of this unlikely bug was that a full table scan could be done even if analyze_sample_percentage was not set to 1. - Changed multiple mallocs in a row to use multi_alloc_root(). - Added a mutex protection in update_statistics_for_table() to ensure that several tables are not updating the statistics at the same time. 
Some of the changes in sql_statistics.cc are based on a patch from Oleg Smirnov <olernov@gmail.com> Co-authored-by: Oleg Smirnov <olernov@gmail.com> Co-authored-by: Vicentiu Ciorbaru <cvicentiu@gmail.com> Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-05 01:08:05 +03:00
int read_statistics_for_tables(THD *thd, TABLE_LIST *tables,
bool force_reload);
int collect_statistics_for_table(THD *thd, TABLE *table);
MDEV-31957 Concurrent ALTER and ANALYZE collecting statistics can result in stale statistical data Example of what causes the problem: T1: ANALYZE TABLE starts to collect statistics T2: ALTER TABLE starts by deleting statistics for all changed fields, then creates a temp table and copies data to it. T1: ANALYZE ends and writes to the statistics tables. T2: ALTER TABLE renames temp table in place of the old table. Now the statistics from analyze matches the old deleted tables. Fixed by waiting to delete old statistics until ALTER TABLE is the only one using the old table and ensure that rename of columns can handle swapping of column names. rename_columns_in_stat_table() (former rename_column_in_stat_tables()) now takes a list of columns to rename. It uses the following algorithm to update column_stats to be able to handle circular renames - While there are columns to be renamed and it is the first loop or last rename loop did change something. - Loop over all columns to be renamed - Change column name in column_stat - If fail because of duplicate key - If this is first change attempt for this column - Change column name to a temporary column name - If there was a conflicting row, replace it with the current row. else - Remove entry from column list - Loop over all remaining columns in the list - Remove the conflicting row - Change column from temporary name to final name in column_stat Other things: - Don't flush tables for every operation. Only flush when all updates are done. - Rename of columns was not handled in case of ALGORITHM=copy (old bug). - Fixed that we do not collect statistics for hidden hash columns used by UNIQUE constraint on long values. - Fixed that we do not collect statistics for blob columns referred by generated virtual columns. This was achieved by storing the fields for which we want to have statistics in table->has_value_set instead of in table->read_set. - Rename of indexes was not handled for persistent statistics. 
- This is now handled similar as rename of columns. Renamed columns are now stored in 'rename_stat_indexes' and handled in Alter_info::delete_statistics() together with drooped indexes. - ALTER TABLE .. ADD INDEX may instead of creating a new index rename an existing generated foreign key index. This was not reflected in the index_stats table because this was handled in mysql_prepare_create_table instead instead of in the mysql_alter() code. Fixed by adding a call in mysql_prepare_create_table() to drop the changed index. I also had to change the code that 'marked the index' to be ignored with code that would not destroy the original index name. Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-18 18:35:02 +03:00
int alloc_statistics_for_table(THD *thd, TABLE *table, MY_BITMAP *stat_fields);
void free_statistics_for_table(TABLE *table);
int update_statistics_for_table(THD *thd, TABLE *table);
MDEV-31957 Concurrent ALTER and ANALYZE collecting statistics can result in stale statistical data Example of what causes the problem: T1: ANALYZE TABLE starts to collect statistics T2: ALTER TABLE starts by deleting statistics for all changed fields, then creates a temp table and copies data to it. T1: ANALYZE ends and writes to the statistics tables. T2: ALTER TABLE renames temp table in place of the old table. Now the statistics from analyze matches the old deleted tables. Fixed by waiting to delete old statistics until ALTER TABLE is the only one using the old table and ensure that rename of columns can handle swapping of column names. rename_columns_in_stat_table() (former rename_column_in_stat_tables()) now takes a list of columns to rename. It uses the following algorithm to update column_stats to be able to handle circular renames - While there are columns to be renamed and it is the first loop or last rename loop did change something. - Loop over all columns to be renamed - Change column name in column_stat - If fail because of duplicate key - If this is first change attempt for this column - Change column name to a temporary column name - If there was a conflicting row, replace it with the current row. else - Remove entry from column list - Loop over all remaining columns in the list - Remove the conflicting row - Change column from temporary name to final name in column_stat Other things: - Don't flush tables for every operation. Only flush when all updates are done. - Rename of columns was not handled in case of ALGORITHM=copy (old bug). - Fixed that we do not collect statistics for hidden hash columns used by UNIQUE constraint on long values. - Fixed that we do not collect statistics for blob columns referred by generated virtual columns. This was achieved by storing the fields for which we want to have statistics in table->has_value_set instead of in table->read_set. - Rename of indexes was not handled for persistent statistics. 
- This is now handled similar as rename of columns. Renamed columns are now stored in 'rename_stat_indexes' and handled in Alter_info::delete_statistics() together with drooped indexes. - ALTER TABLE .. ADD INDEX may instead of creating a new index rename an existing generated foreign key index. This was not reflected in the index_stats table because this was handled in mysql_prepare_create_table instead instead of in the mysql_alter() code. Fixed by adding a call in mysql_prepare_create_table() to drop the changed index. I also had to change the code that 'marked the index' to be ignored with code that would not destroy the original index name. Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-18 18:35:02 +03:00
int delete_statistics_for_table(THD *thd, const LEX_CSTRING *db,
const LEX_CSTRING *tab);
int delete_statistics_for_column(THD *thd, TABLE *tab, Field *col);
int delete_statistics_for_index(THD *thd, TABLE *tab, KEY *key_info,
bool ext_prefixes_only);
MDEV-31957 Concurrent ALTER and ANALYZE collecting statistics can result in stale statistical data Example of what causes the problem: T1: ANALYZE TABLE starts to collect statistics T2: ALTER TABLE starts by deleting statistics for all changed fields, then creates a temp table and copies data to it. T1: ANALYZE ends and writes to the statistics tables. T2: ALTER TABLE renames temp table in place of the old table. Now the statistics from analyze matches the old deleted tables. Fixed by waiting to delete old statistics until ALTER TABLE is the only one using the old table and ensure that rename of columns can handle swapping of column names. rename_columns_in_stat_table() (former rename_column_in_stat_tables()) now takes a list of columns to rename. It uses the following algorithm to update column_stats to be able to handle circular renames - While there are columns to be renamed and it is the first loop or last rename loop did change something. - Loop over all columns to be renamed - Change column name in column_stat - If fail because of duplicate key - If this is first change attempt for this column - Change column name to a temporary column name - If there was a conflicting row, replace it with the current row. else - Remove entry from column list - Loop over all remaining columns in the list - Remove the conflicting row - Change column from temporary name to final name in column_stat Other things: - Don't flush tables for every operation. Only flush when all updates are done. - Rename of columns was not handled in case of ALGORITHM=copy (old bug). - Fixed that we do not collect statistics for hidden hash columns used by UNIQUE constraint on long values. - Fixed that we do not collect statistics for blob columns referred by generated virtual columns. This was achieved by storing the fields for which we want to have statistics in table->has_value_set instead of in table->read_set. - Rename of indexes was not handled for persistent statistics. 
- This is now handled similar as rename of columns. Renamed columns are now stored in 'rename_stat_indexes' and handled in Alter_info::delete_statistics() together with drooped indexes. - ALTER TABLE .. ADD INDEX may instead of creating a new index rename an existing generated foreign key index. This was not reflected in the index_stats table because this was handled in mysql_prepare_create_table instead instead of in the mysql_alter() code. Fixed by adding a call in mysql_prepare_create_table() to drop the changed index. I also had to change the code that 'marked the index' to be ignored with code that would not destroy the original index name. Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-18 18:35:02 +03:00
int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db,
const LEX_CSTRING *tab,
const LEX_CSTRING *new_db,
const LEX_CSTRING *new_tab);
int rename_columns_in_stat_table(THD *thd, TABLE *tab,
List<Alter_info::RENAME_COLUMN_STAT_PARAMS> *fields);
int rename_indexes_in_stat_table(THD *thd, TABLE *tab,
List<Alter_info::RENAME_INDEX_STAT_PARAMS> *indexes);
void set_statistics_for_table(THD *thd, TABLE *table);
2013-03-11 07:44:24 -07:00
double get_column_avg_frequency(Field * field);
double get_column_range_cardinality(Field *field,
key_range *min_endp,
key_range *max_endp,
uint range_flag);
bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table);
bool is_eits_usable(Field* field);
2013-03-11 07:44:24 -07:00
2021-08-31 13:39:39 +03:00
class Histogram_builder;
/*
Common base for all histograms
*/
class Histogram_base :public Sql_alloc
2013-03-25 23:48:29 -07:00
{
public:
Histogram_base() {}
virtual ~Histogram_base()= default;
virtual bool parse(MEM_ROOT *mem_root,
const char *db_name, const char *table_name,
Field *field, const char *hist_data,
size_t hist_data_len)= 0;
virtual void serialize(Field *to_field)= 0;
virtual Histogram_type get_type()=0;
virtual uint get_width()=0;
/*
The creation-time workflow is:
* create a histogram
* init_for_collection()
* create_builder()
* feed the data to the builder
* serialize();
*/
2021-08-27 22:28:59 +03:00
virtual void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
ulonglong size)=0;
virtual Histogram_builder *create_builder(Field *col, uint col_len,
ha_rows rows)=0;
/*
This function checks that histograms should be usable only when
1) the level of optimizer_use_condition_selectivity > 3
*/
bool is_usable(THD *thd)
{
return thd->variables.optimizer_use_condition_selectivity > 3;
}
2021-08-27 22:28:59 +03:00
virtual double point_selectivity(Field *field, key_range *endpoint,
2022-01-19 18:02:40 +03:00
double avg_sel)=0;
virtual double range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp, double avg_sel)=0;
/*
Legacy: return the size of the histogram on disk.
This will be stored in mysql.column_stats.hist_size column.
The value is not really needed as one can look at
LENGTH(mysql.column_stats.histogram) directly.
*/
virtual uint get_size()=0;
};
2021-08-27 22:28:59 +03:00
/*
A Height-balanced histogram that stores numeric fractions
*/
class Histogram_binary final : public Histogram_base
{
private:
Histogram_type type;
size_t size; /* Size of values array, in bytes */
uchar *values;
uint prec_factor()
{
switch (type) {
case SINGLE_PREC_HB:
return ((uint) (1 << 8) - 1);
case DOUBLE_PREC_HB:
return ((uint) (1 << 16) - 1);
default:
DBUG_ASSERT(0);
}
return 1;
}
2013-03-25 23:48:29 -07:00
2021-08-31 00:53:09 +03:00
public:
Histogram_binary(Histogram_type type_arg) : type(type_arg)
{}
2021-08-31 00:53:09 +03:00
uint get_width() override
{
switch (type) {
case SINGLE_PREC_HB:
return (uint) size;
2021-08-31 00:53:09 +03:00
case DOUBLE_PREC_HB:
return (uint) (size / 2);
2021-08-31 00:53:09 +03:00
default:
DBUG_ASSERT(0);
}
return 0;
}
private:
uint get_value(uint i)
{
DBUG_ASSERT(i < get_width());
switch (type) {
case SINGLE_PREC_HB:
return (uint) (((uint8 *) values)[i]);
case DOUBLE_PREC_HB:
return (uint) uint2korr(values + i * 2);
default:
DBUG_ASSERT(0);
}
2013-04-13 02:36:30 -07:00
return 0;
}
/* Find the bucket which value 'pos' falls into. */
2013-03-25 23:48:29 -07:00
uint find_bucket(double pos, bool first)
{
size_t val= (size_t) (pos * prec_factor());
2013-03-25 23:48:29 -07:00
int lp= 0;
int rp= get_width() - 1;
int d= get_width() / 2;
uint i= lp + d;
for ( ; d; d= (rp - lp) / 2, i= lp + d)
2013-03-25 23:48:29 -07:00
{
if (val == get_value(i))
2013-03-25 23:48:29 -07:00
break;
if (val < get_value(i))
2013-03-25 23:48:29 -07:00
rp= i;
else if (val > get_value(i + 1))
2013-03-25 23:48:29 -07:00
lp= i + 1;
else
break;
}
if (val > get_value(i) && i < (get_width() - 1))
i++;
if (val == get_value(i))
2013-03-25 23:48:29 -07:00
{
if (first)
{
while(i && val == get_value(i - 1))
2013-03-25 23:48:29 -07:00
i--;
}
else
{
while(i + 1 < get_width() && val == get_value(i + 1))
2013-03-25 23:48:29 -07:00
i++;
}
}
return i;
}
public:
uint get_size() override {return (uint)size;}
2021-08-31 00:53:09 +03:00
Histogram_type get_type() override { return type; }
bool parse(MEM_ROOT *mem_root, const char*, const char*, Field*,
const char *hist_data, size_t hist_data_len) override;
2021-08-31 00:53:09 +03:00
void serialize(Field *to_field) override;
void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
ulonglong size) override;
2021-08-31 13:39:39 +03:00
Histogram_builder *create_builder(Field *col, uint col_len,
ha_rows rows) override;
2021-08-31 00:53:09 +03:00
2013-03-25 23:48:29 -07:00
void set_value(uint i, double val)
{
switch (type) {
case SINGLE_PREC_HB:
((uint8 *) values)[i]= (uint8) (val * prec_factor());
return;
case DOUBLE_PREC_HB:
int2store(values + i * 2, val * prec_factor());
return;
default:
DBUG_ASSERT(0);
return;
}
2013-03-25 23:48:29 -07:00
}
void set_prev_value(uint i)
{
switch (type) {
case SINGLE_PREC_HB:
((uint8 *) values)[i]= ((uint8 *) values)[i-1];
return;
case DOUBLE_PREC_HB:
int2store(values + i * 2, uint2korr(values + i * 2 - 2));
return;
default:
DBUG_ASSERT(0);
return;
}
}
2013-03-25 23:48:29 -07:00
double range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp, double avg_sel) override;
2021-08-27 22:28:59 +03:00
/*
Estimate selectivity of "col=const" using a histogram
*/
2021-08-27 22:28:59 +03:00
double point_selectivity(Field *field, key_range *endpoint,
2022-01-19 18:02:40 +03:00
double avg_sel) override;
2013-03-25 23:48:29 -07:00
};
2021-08-27 22:28:59 +03:00
/*
  This is used to collect the basic statistics from a Unique object:
   - count of values
   - count of distinct values
   - count of distinct values that have occurred only once
*/

class Basic_stats_collector
{
  ulonglong count;         /* number of values retrieved */
  ulonglong count_distinct;    /* number of distinct values retrieved */
  /* number of distinct values that occurred only once */
  ulonglong count_distinct_single_occurence;

public:
  Basic_stats_collector()
  {
    count= 0;
    count_distinct= 0;
    count_distinct_single_occurence= 0;
  }

  ulonglong get_count_distinct() const { return count_distinct; }
  ulonglong get_count_single_occurence() const
  {
    return count_distinct_single_occurence;
  }
  ulonglong get_count() const { return count; }

  /*
    Account for one more distinct value that occurred elem_cnt times.
    The value itself (elem) is not inspected.
  */
  void next(void *elem, element_count elem_cnt)
  {
    count_distinct++;
    if (elem_cnt == 1)
      count_distinct_single_occurence++;
    count+= elem_cnt;
  }
};
/*
  Histogram_builder is the abstract helper through which a histogram
  for a column is built.

  The constructor is protected: builder objects are not meant to be
  created directly but obtained from the histogram object through
  create_builder(...).
*/

class Histogram_builder: public Sql_alloc
{
public:
  virtual ~Histogram_builder()= default;

  /* Feed the builder with an element and its number of occurrences */
  virtual int next(void *elem, element_count elem_cnt)=0;

  /* Called after the data has been fed to the builder */
  virtual void finalize()=0;

  /* A histogram builder will also collect the counters */
  Basic_stats_collector counters;

protected:
  Histogram_builder(Field *col, uint col_len, ha_rows rows)
    : column(col),
      col_length(col_len),
      records(rows)
  {
  }

  Field *column;           /* table field for which the histogram is built */
  uint col_length;         /* size of this field */
  ha_rows records;         /* number of records the histogram is built for */
};
2013-03-25 23:48:29 -07:00
class Column_statistics;
class Index_statistics;
/* Statistical data on a table */
class Table_statistics
{
public:
my_bool cardinality_is_null; /* TRUE if the cardinality is unknown */
uint columns; /* Number of columns in table */
ha_rows cardinality; /* Number of rows in the table */
uchar *min_max_record_buffers; /* Record buffers for min/max values */
Column_statistics *column_stats; /* Array of statistical data for columns */
Index_statistics *index_stats; /* Array of statistical data for indexes */
/* Array of records per key for index prefixes */
ulonglong *idx_avg_frequency;
MDEV-29693 ANALYZE TABLE still flushes table definition cache when engine-independent statistics is used This commits enables reloading of engine-independent statistics without flushing the table from table definition cache. This is achieved by allowing multiple version of the TABLE_STATISTICS_CB object and having independent pointers to it in TABLE and TABLE_SHARE. The TABLE_STATISTICS_CB object have reference pointers and are freed when no one is pointing to it anymore. TABLE's TABLE_STATISTICS_CB pointer is updated to use the TABLE_SHARE's pointer when read_statistics_for_tables() is called at the beginning of a query. Main changes: - read_statistics_for_table() will allocate an new TABLE_STATISTICS_CB object. - All get_stat_values() functions has a new parameter that tells where collected data should be stored. get_stat_values() are not using the table_field object anymore to store data. - All get_stat_values() functions returns 1 if they found any data in the statistics tables. Other things: - Fixed INSERT DELAYED to not read statistics tables. - Removed Statistics_state from TABLE_STATISTICS_CB as this is not needed anymore as wer are not changing TABLE_SHARE->stats_cb while calculating or loading statistics. - Store values used with store_from_statistical_minmax_field() in TABLE_STATISTICS_CB::mem_root. This allowed me to remove the function delete_stat_values_for_table_share(). - Field_blob::store_from_statistical_minmax_field() is implemented but is not normally used as we do not yet support EIS statistics for blobs. For example Field_blob::update_min() and Field_blob::update_max() are not implemented. Note that the function can be called if there is an concurrent "ALTER TABLE MODIFY field BLOB" running because of a bug in ALTER TABLE where it deletes entries from column_stats before it has an exclusive lock on the table. - Use result of field->val_str(&val) as a pointer to the result instead of val (safetly fix). 
- Allocate memory for collected statistics in THD::mem_root, not in in TABLE::mem_root. This could cause the TABLE object to grow if a ANALYZE TABLE was run many times on the same table. This was done in allocate_statistics_for_table(), create_min_max_statistical_fields_for_table() and create_min_max_statistical_fields_for_table_share(). - Store in TABLE_STATISTICS_CB::stats_available which statistics was found in the statistics tables. - Removed index_table from class Index_prefix_calc as it was not used. - Added TABLE_SHARE::LOCK_statistics to ensure we don't load EITS in parallel. First thread will load it, others will reuse the loaded data. - Eliminate read_histograms_for_table(). The loading happens within read_statistics_for_tables() if histograms are needed. One downside is that if we have read statistics without histograms before and someone requires histograms, we have to read all statistics again (once) from the statistics tables. A smaller downside is the need to call alloc_root() for each individual histogram. Before we could allocate all the space for histograms with a single alloc_root. - Fixed bug in MyISAM and Aria where they did not properly notice that table had changed after analyze table. This was not a problem before this patch as then the MyISAM and Aria tables where flushed as part of ANALYZE table which did hide this issue. - Fixed a bug in ANALYZE table where table->records could be seen as 0 in collect_statistics_for_table(). The effect of this unlikely bug was that a full table scan could be done even if analyze_sample_percentage was not set to 1. - Changed multiple mallocs in a row to use multi_alloc_root(). - Added a mutex protection in update_statistics_for_table() to ensure that several tables are not updating the statistics at the same time. 
Some of the changes in sql_statistics.cc are based on a patch from Oleg Smirnov <olernov@gmail.com> Co-authored-by: Oleg Smirnov <olernov@gmail.com> Co-authored-by: Vicentiu Ciorbaru <cvicentiu@gmail.com> Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-05 01:08:05 +03:00
uchar *histograms; /* Sequence of histograms */
};
/*
  Statistical data on a column

  Note: objects of this class may be "empty", where they have almost all fields
  as zeros, for example, get_avg_frequency() will return 0.

  objects are allocated in alloc_statistics_for_table[_share].
*/

class Column_statistics :public Sql_alloc
{

private:
  /* Ratios and averages are kept as integers multiplied by these factors */
  static const uint Scale_factor_nulls_ratio= 100000;
  static const uint Scale_factor_avg_length= 100000;
  static const uint Scale_factor_avg_frequency= 100000;

public:
  ~Column_statistics()
  {
    delete histogram;
  }
  /*
    Bitmap indicating what statistical characteristics
    are available for the column
  */
  uint32 column_stat_nulls;

  /* For the below two, see comments in get_column_range_cardinality() */
  /* Minimum value for the column */
  Field *min_value;
  /* Maximum value for the column */
  Field *max_value;

private:

  /*
    The ratio Z/N multiplied by the scale factor Scale_factor_nulls_ratio,
    where
      N is the total number of rows,
      Z is the number of nulls in the column
  */
  ulong nulls_ratio;

  /*
    Average number of bytes occupied by the representation of a
    value of the column in memory buffers such as join buffer
    multiplied by the scale factor Scale_factor_avg_length.
    CHAR values are stripped of trailing spaces.
    Flexible values are stripped of their length prefixes.
  */
  ulonglong avg_length;

  /*
    The ratio N/D multiplied by the scale factor Scale_factor_avg_frequency,
    where
       N is the number of rows with not null value in the column,
       D the number of distinct values among them
  */
  ulonglong avg_frequency;

public:
  Histogram_base *histogram;
  bool histogram_exists;

  /*
    Bitmap with a bit set for every column_stat field from
    COLUMN_STAT_MIN_VALUE to COLUMN_STAT_HISTOGRAM inclusive, that is the
    value of column_stat_nulls when no statistical value is provided.
  */
  uint32 no_values_provided_bitmap()
  {
    return
     ((1 << (COLUMN_STAT_HISTOGRAM-COLUMN_STAT_COLUMN_NAME))-1) <<
      (COLUMN_STAT_COLUMN_NAME+1);
  }

  void set_all_nulls()
  {
    column_stat_nulls= no_values_provided_bitmap();
  }

  void set_not_null(uint stat_field_no)
  {
    column_stat_nulls&= ~(1 << stat_field_no);
  }

  void set_null(uint stat_field_no)
  {
    column_stat_nulls|= (1 << stat_field_no);
  }

  bool is_null(uint stat_field_no)
  {
    return MY_TEST(column_stat_nulls & (1 << stat_field_no));
  }

  double get_nulls_ratio()
  {
    return (double) nulls_ratio /  Scale_factor_nulls_ratio;
  }

  double get_avg_length()
  {
    return (double) avg_length / Scale_factor_avg_length;
  }

  double get_avg_frequency()
  {
    return (double) avg_frequency / Scale_factor_avg_frequency;
  }

  void set_nulls_ratio (double val)
  {
    nulls_ratio= (ulong) (val * Scale_factor_nulls_ratio);
  }

  void set_avg_length (double val)
  {
    avg_length= (ulonglong) (val * Scale_factor_avg_length);
  }

  void set_avg_frequency (double val)
  {
    avg_frequency= (ulonglong) (val * Scale_factor_avg_frequency);
  }

  bool min_max_values_are_provided()
  {
    return !is_null(COLUMN_STAT_MIN_VALUE) &&
      !is_null(COLUMN_STAT_MAX_VALUE);
  }
  /*
    This function checks whether the values for the fields of the statistical
    tables that were NULL by DEFAULT for a column have changed or not.

    @retval
    TRUE: Statistics are not present for a column
    FALSE: Statistics are present for a column
  */
  bool no_stat_values_provided()
  {
    return (column_stat_nulls == no_values_provided_bitmap());
  }
};
/* Statistical data on an index prefixes */
class Index_statistics
{
private:
static const uint Scale_factor_avg_frequency= 100000;
/*
The k-th element of this array contains the ratio N/D
multiplied by the scale factor Scale_factor_avg_frequency,
where N is the number of index entries without nulls
in the first k components, and D is the number of distinct
k-component prefixes among them
*/
ulonglong *avg_frequency;
public:
void init_avg_frequency(ulonglong *ptr) { avg_frequency= ptr; }
bool avg_frequency_is_inited() { return avg_frequency != NULL; }
double get_avg_frequency(uint i) const
{
return (double) avg_frequency[i] / Scale_factor_avg_frequency;
}
void set_avg_frequency(uint i, double val)
{
avg_frequency[i]= (ulonglong) (val * Scale_factor_avg_frequency);
}
};
#endif /* SQL_STATISTICS_H */