mariadb/sql/sql_statistics.h

626 lines
16 KiB
C
Raw Normal View History

/* Copyright 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
2019-05-11 22:19:05 +03:00
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#ifndef SQL_STATISTICS_H
#define SQL_STATISTICS_H
#include <vector>
2021-08-28 12:31:13 +03:00
#include <string>
/*
  COMPLEMENTARY_FOR_QUERIES and PREFERABLY_FOR_QUERIES are similar to
  COMPLEMENTARY and PREFERABLY respectively, except that with these values
  EITS is not collected for statements like
    ANALYZE TABLE t1;
  To collect EITS with these values, PERSISTENT FOR has to be used:
    analyze table t1 persistent for
      columns (col1,col2...) index (idx1, idx2...)
  or
    analyze table t1 persistent for all
*/
/*
  Modes of using the statistical tables.
  get_use_stat_tables_mode() casts thd->variables.use_stat_tables directly
  to this type, so the numeric values below must not change.
*/
typedef enum enum_use_stat_tables_mode
{
  NEVER= 0,
  COMPLEMENTARY= 1,
  PREFERABLY= 2,
  /* The *_FOR_QUERIES modes do not allow EITS collection by a plain ANALYZE */
  COMPLEMENTARY_FOR_QUERIES= 3,
  PREFERABLY_FOR_QUERIES= 4
} Use_stat_tables_mode;
/*
  Kinds of histograms.
  SINGLE_PREC_HB and DOUBLE_PREC_HB are the two types handled by
  Histogram_binary (one byte and two bytes per stored value respectively).
  The enumerators rely on implicit numbering starting at 0.
*/
typedef
enum enum_histogram_type
{
  SINGLE_PREC_HB,
  DOUBLE_PREC_HB,
  JSON_HB,
  INVALID_HISTOGRAM
} Histogram_type;
/* Identifiers of the three statistical tables */
enum enum_stat_tables
{
  TABLE_STAT= 0,
  COLUMN_STAT= 1,
  INDEX_STAT= 2,
};
2013-03-25 23:48:29 -07:00
/*
  These enumeration types comprise the dictionary of the three
  statistical tables table_stat, column_stat and index_stat
  as they are defined in ../scripts/mysql_system_tables.sql.

  It would be nice if the declarations of these types were
  generated automatically from the table definitions.
*/
/*
  Column numbers of the table_stat table.
  TABLE_STAT_N_FIELDS is not a column: it is the number of columns.
*/
enum enum_table_stat_col
{
  TABLE_STAT_DB_NAME= 0,
  TABLE_STAT_TABLE_NAME= 1,
  TABLE_STAT_CARDINALITY= 2,
  TABLE_STAT_N_FIELDS= 3
};
/*
  Column numbers of the column_stat table.
  COLUMN_STAT_N_FIELDS is not a column: it is the number of columns.
  The values are also used as bit positions in
  Column_statistics::column_stat_nulls, so the order must be kept.
*/
enum enum_column_stat_col
{
  COLUMN_STAT_DB_NAME,
  COLUMN_STAT_TABLE_NAME,
  COLUMN_STAT_COLUMN_NAME,
  COLUMN_STAT_MIN_VALUE,
  COLUMN_STAT_MAX_VALUE,
  COLUMN_STAT_NULLS_RATIO,
  COLUMN_STAT_AVG_LENGTH,
  COLUMN_STAT_AVG_FREQUENCY,
  COLUMN_STAT_HIST_SIZE,
  COLUMN_STAT_HIST_TYPE,
  COLUMN_STAT_HISTOGRAM,
  COLUMN_STAT_N_FIELDS
};
/*
  Column numbers of the index_stat table.
  INDEX_STAT_N_FIELDS is not a column: it is the number of columns.
*/
enum enum_index_stat_col
{
  INDEX_STAT_DB_NAME= 0,
  INDEX_STAT_TABLE_NAME= 1,
  INDEX_STAT_INDEX_NAME= 2,
  INDEX_STAT_PREFIX_ARITY= 3,
  INDEX_STAT_AVG_FREQUENCY= 4,
  INDEX_STAT_N_FIELDS= 5
};
/*
  Return the session's use_stat_tables setting as a Use_stat_tables_mode.

  @param thd  session whose variables.use_stat_tables is read
*/
inline Use_stat_tables_mode get_use_stat_tables_mode(THD *thd)
{
  const Use_stat_tables_mode mode=
    (Use_stat_tables_mode) (thd->variables.use_stat_tables);
  return mode;
}
/*
  Tell whether the session's use_stat_tables mode is one of the two modes
  (COMPLEMENTARY, PREFERABLY) for which this check succeeds.

  @retval true   mode is COMPLEMENTARY or PREFERABLY
  @retval false  any other mode
*/
inline bool check_eits_collection_allowed(THD *thd)
{
  switch (get_use_stat_tables_mode(thd)) {
  case COMPLEMENTARY:
  case PREFERABLY:
    return true;
  default:
    return false;
  }
}
/*
  Tell whether the session's use_stat_tables mode is one of the
  "preferably" modes.

  @retval true   mode is PREFERABLY or PREFERABLY_FOR_QUERIES
  @retval false  any other mode
*/
inline bool check_eits_preferred(THD *thd)
{
  switch (get_use_stat_tables_mode(thd)) {
  case PREFERABLY:
  case PREFERABLY_FOR_QUERIES:
    return true;
  default:
    return false;
  }
}
/*
  Declaration only; the definition is not in this header.
  NOTE(review): judging by the name, reads statistics for the tables in
  'tables' only when that is required - the exact condition and the meaning
  of the int result must be checked in the definition.
*/
int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables);
MDEV-29693 ANALYZE TABLE still flushes table definition cache when engine-independent statistics is used This commits enables reloading of engine-independent statistics without flushing the table from table definition cache. This is achieved by allowing multiple version of the TABLE_STATISTICS_CB object and having independent pointers to it in TABLE and TABLE_SHARE. The TABLE_STATISTICS_CB object have reference pointers and are freed when no one is pointing to it anymore. TABLE's TABLE_STATISTICS_CB pointer is updated to use the TABLE_SHARE's pointer when read_statistics_for_tables() is called at the beginning of a query. Main changes: - read_statistics_for_table() will allocate an new TABLE_STATISTICS_CB object. - All get_stat_values() functions has a new parameter that tells where collected data should be stored. get_stat_values() are not using the table_field object anymore to store data. - All get_stat_values() functions returns 1 if they found any data in the statistics tables. Other things: - Fixed INSERT DELAYED to not read statistics tables. - Removed Statistics_state from TABLE_STATISTICS_CB as this is not needed anymore as wer are not changing TABLE_SHARE->stats_cb while calculating or loading statistics. - Store values used with store_from_statistical_minmax_field() in TABLE_STATISTICS_CB::mem_root. This allowed me to remove the function delete_stat_values_for_table_share(). - Field_blob::store_from_statistical_minmax_field() is implemented but is not normally used as we do not yet support EIS statistics for blobs. For example Field_blob::update_min() and Field_blob::update_max() are not implemented. Note that the function can be called if there is an concurrent "ALTER TABLE MODIFY field BLOB" running because of a bug in ALTER TABLE where it deletes entries from column_stats before it has an exclusive lock on the table. - Use result of field->val_str(&val) as a pointer to the result instead of val (safetly fix). 
- Allocate memory for collected statistics in THD::mem_root, not in in TABLE::mem_root. This could cause the TABLE object to grow if a ANALYZE TABLE was run many times on the same table. This was done in allocate_statistics_for_table(), create_min_max_statistical_fields_for_table() and create_min_max_statistical_fields_for_table_share(). - Store in TABLE_STATISTICS_CB::stats_available which statistics was found in the statistics tables. - Removed index_table from class Index_prefix_calc as it was not used. - Added TABLE_SHARE::LOCK_statistics to ensure we don't load EITS in parallel. First thread will load it, others will reuse the loaded data. - Eliminate read_histograms_for_table(). The loading happens within read_statistics_for_tables() if histograms are needed. One downside is that if we have read statistics without histograms before and someone requires histograms, we have to read all statistics again (once) from the statistics tables. A smaller downside is the need to call alloc_root() for each individual histogram. Before we could allocate all the space for histograms with a single alloc_root. - Fixed bug in MyISAM and Aria where they did not properly notice that table had changed after analyze table. This was not a problem before this patch as then the MyISAM and Aria tables where flushed as part of ANALYZE table which did hide this issue. - Fixed a bug in ANALYZE table where table->records could be seen as 0 in collect_statistics_for_table(). The effect of this unlikely bug was that a full table scan could be done even if analyze_sample_percentage was not set to 1. - Changed multiple mallocs in a row to use multi_alloc_root(). - Added a mutex protection in update_statistics_for_table() to ensure that several tables are not updating the statistics at the same time. 
Some of the changes in sql_statistics.cc are based on a patch from Oleg Smirnov <olernov@gmail.com> Co-authored-by: Oleg Smirnov <olernov@gmail.com> Co-authored-by: Vicentiu Ciorbaru <cvicentiu@gmail.com> Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-05 01:08:05 +03:00
/*
  Declarations only; the definitions are not in this header.
  NOTE(review): 'force_reload' presumably requests re-reading statistics
  that have been loaded already - confirm against the definition.
  NOTE(review): the meaning of the int results (0 on success?) is not
  visible here - TODO confirm.
*/
int read_statistics_for_tables(THD *thd, TABLE_LIST *tables,
bool force_reload);
int collect_statistics_for_table(THD *thd, TABLE *table);
MDEV-31957 Concurrent ALTER and ANALYZE collecting statistics can result in stale statistical data Example of what causes the problem: T1: ANALYZE TABLE starts to collect statistics T2: ALTER TABLE starts by deleting statistics for all changed fields, then creates a temp table and copies data to it. T1: ANALYZE ends and writes to the statistics tables. T2: ALTER TABLE renames temp table in place of the old table. Now the statistics from analyze matches the old deleted tables. Fixed by waiting to delete old statistics until ALTER TABLE is the only one using the old table and ensure that rename of columns can handle swapping of column names. rename_columns_in_stat_table() (former rename_column_in_stat_tables()) now takes a list of columns to rename. It uses the following algorithm to update column_stats to be able to handle circular renames - While there are columns to be renamed and it is the first loop or last rename loop did change something. - Loop over all columns to be renamed - Change column name in column_stat - If fail because of duplicate key - If this is first change attempt for this column - Change column name to a temporary column name - If there was a conflicting row, replace it with the current row. else - Remove entry from column list - Loop over all remaining columns in the list - Remove the conflicting row - Change column from temporary name to final name in column_stat Other things: - Don't flush tables for every operation. Only flush when all updates are done. - Rename of columns was not handled in case of ALGORITHM=copy (old bug). - Fixed that we do not collect statistics for hidden hash columns used by UNIQUE constraint on long values. - Fixed that we do not collect statistics for blob columns referred by generated virtual columns. This was achieved by storing the fields for which we want to have statistics in table->has_value_set instead of in table->read_set. - Rename of indexes was not handled for persistent statistics. 
- This is now handled similar as rename of columns. Renamed columns are now stored in 'rename_stat_indexes' and handled in Alter_info::delete_statistics() together with drooped indexes. - ALTER TABLE .. ADD INDEX may instead of creating a new index rename an existing generated foreign key index. This was not reflected in the index_stats table because this was handled in mysql_prepare_create_table instead instead of in the mysql_alter() code. Fixed by adding a call in mysql_prepare_create_table() to drop the changed index. I also had to change the code that 'marked the index' to be ignored with code that would not destroy the original index name. Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-18 18:35:02 +03:00
/*
  Declarations only; the definitions are not in this header.
  NOTE(review): 'stat_fields' is presumably the set of fields for which
  statistics are wanted - confirm against the definition.
  NOTE(review): the meaning of the int results is not visible here.
*/
int alloc_statistics_for_table(THD *thd, TABLE *table, MY_BITMAP *stat_fields);
void free_statistics_for_table(TABLE *table);
int update_statistics_for_table(THD *thd, TABLE *table);
MDEV-31957 Concurrent ALTER and ANALYZE collecting statistics can result in stale statistical data Example of what causes the problem: T1: ANALYZE TABLE starts to collect statistics T2: ALTER TABLE starts by deleting statistics for all changed fields, then creates a temp table and copies data to it. T1: ANALYZE ends and writes to the statistics tables. T2: ALTER TABLE renames temp table in place of the old table. Now the statistics from analyze matches the old deleted tables. Fixed by waiting to delete old statistics until ALTER TABLE is the only one using the old table and ensure that rename of columns can handle swapping of column names. rename_columns_in_stat_table() (former rename_column_in_stat_tables()) now takes a list of columns to rename. It uses the following algorithm to update column_stats to be able to handle circular renames - While there are columns to be renamed and it is the first loop or last rename loop did change something. - Loop over all columns to be renamed - Change column name in column_stat - If fail because of duplicate key - If this is first change attempt for this column - Change column name to a temporary column name - If there was a conflicting row, replace it with the current row. else - Remove entry from column list - Loop over all remaining columns in the list - Remove the conflicting row - Change column from temporary name to final name in column_stat Other things: - Don't flush tables for every operation. Only flush when all updates are done. - Rename of columns was not handled in case of ALGORITHM=copy (old bug). - Fixed that we do not collect statistics for hidden hash columns used by UNIQUE constraint on long values. - Fixed that we do not collect statistics for blob columns referred by generated virtual columns. This was achieved by storing the fields for which we want to have statistics in table->has_value_set instead of in table->read_set. - Rename of indexes was not handled for persistent statistics. 
- This is now handled similar as rename of columns. Renamed columns are now stored in 'rename_stat_indexes' and handled in Alter_info::delete_statistics() together with drooped indexes. - ALTER TABLE .. ADD INDEX may instead of creating a new index rename an existing generated foreign key index. This was not reflected in the index_stats table because this was handled in mysql_prepare_create_table instead instead of in the mysql_alter() code. Fixed by adding a call in mysql_prepare_create_table() to drop the changed index. I also had to change the code that 'marked the index' to be ignored with code that would not destroy the original index name. Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-18 18:35:02 +03:00
/*
  Declarations only; the definitions are not in this header.
  The table is identified by name (db, tab) in the first function and by
  an open TABLE object in the other two.
  NOTE(review): 'ext_prefixes_only' presumably restricts the deletion to
  the extended index prefixes - confirm against the definition.
  NOTE(review): the meaning of the int results is not visible here.
*/
int delete_statistics_for_table(THD *thd, const LEX_CSTRING *db,
const LEX_CSTRING *tab);
int delete_statistics_for_column(THD *thd, TABLE *tab, Field *col);
int delete_statistics_for_index(THD *thd, TABLE *tab, KEY *key_info,
bool ext_prefixes_only);
MDEV-31957 Concurrent ALTER and ANALYZE collecting statistics can result in stale statistical data Example of what causes the problem: T1: ANALYZE TABLE starts to collect statistics T2: ALTER TABLE starts by deleting statistics for all changed fields, then creates a temp table and copies data to it. T1: ANALYZE ends and writes to the statistics tables. T2: ALTER TABLE renames temp table in place of the old table. Now the statistics from analyze matches the old deleted tables. Fixed by waiting to delete old statistics until ALTER TABLE is the only one using the old table and ensure that rename of columns can handle swapping of column names. rename_columns_in_stat_table() (former rename_column_in_stat_tables()) now takes a list of columns to rename. It uses the following algorithm to update column_stats to be able to handle circular renames - While there are columns to be renamed and it is the first loop or last rename loop did change something. - Loop over all columns to be renamed - Change column name in column_stat - If fail because of duplicate key - If this is first change attempt for this column - Change column name to a temporary column name - If there was a conflicting row, replace it with the current row. else - Remove entry from column list - Loop over all remaining columns in the list - Remove the conflicting row - Change column from temporary name to final name in column_stat Other things: - Don't flush tables for every operation. Only flush when all updates are done. - Rename of columns was not handled in case of ALGORITHM=copy (old bug). - Fixed that we do not collect statistics for hidden hash columns used by UNIQUE constraint on long values. - Fixed that we do not collect statistics for blob columns referred by generated virtual columns. This was achieved by storing the fields for which we want to have statistics in table->has_value_set instead of in table->read_set. - Rename of indexes was not handled for persistent statistics. 
- This is now handled similar as rename of columns. Renamed columns are now stored in 'rename_stat_indexes' and handled in Alter_info::delete_statistics() together with drooped indexes. - ALTER TABLE .. ADD INDEX may instead of creating a new index rename an existing generated foreign key index. This was not reflected in the index_stats table because this was handled in mysql_prepare_create_table instead instead of in the mysql_alter() code. Fixed by adding a call in mysql_prepare_create_table() to drop the changed index. I also had to change the code that 'marked the index' to be ignored with code that would not destroy the original index name. Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-18 18:35:02 +03:00
/*
  Declarations only; the definitions are not in this header.
  rename_table_in_stat_tables() takes the old (db, tab) and the new
  (new_db, new_tab) names; the column and index variants take a list of
  rename descriptors declared in Alter_info.
  NOTE(review): the meaning of the int results is not visible here.
*/
int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db,
const LEX_CSTRING *tab,
const LEX_CSTRING *new_db,
const LEX_CSTRING *new_tab);
int rename_columns_in_stat_table(THD *thd, TABLE *tab,
List<Alter_info::RENAME_COLUMN_STAT_PARAMS> *fields);
int rename_indexes_in_stat_table(THD *thd, TABLE *tab,
List<Alter_info::RENAME_INDEX_STAT_PARAMS> *indexes);
void set_statistics_for_table(THD *thd, TABLE *table);
2013-03-11 07:44:24 -07:00
/*
  Declarations only; the definitions are not in this header.
  NOTE(review): min_endp / max_endp look like the lower and upper range
  endpoints and range_flag like the range's flags - confirm whether NULL
  endpoints are accepted and what unit the returned double is in.
*/
double get_column_avg_frequency(Field * field);
double get_column_range_cardinality(Field *field,
key_range *min_endp,
key_range *max_endp,
uint range_flag);
MDEV-31340 Remove MY_COLLATION_HANDLER::strcasecmp() This patch also fixes: MDEV-33050 Build-in schemas like oracle_schema are accent insensitive MDEV-33084 LASTVAL(t1) and LASTVAL(T1) do not work well with lower-case-table-names=0 MDEV-33085 Tables T1 and t1 do not work well with ENGINE=CSV and lower-case-table-names=0 MDEV-33086 SHOW OPEN TABLES IN DB1 -- is case insensitive with lower-case-table-names=0 MDEV-33088 Cannot create triggers in the database `MYSQL` MDEV-33103 LOCK TABLE t1 AS t2 -- alias is not case sensitive with lower-case-table-names=0 MDEV-33109 DROP DATABASE MYSQL -- does not drop SP with lower-case-table-names=0 MDEV-33110 HANDLER commands are case insensitive with lower-case-table-names=0 MDEV-33119 User is case insensitive in INFORMATION_SCHEMA.VIEWS MDEV-33120 System log table names are case insensitive with lower-cast-table-names=0 - Removing the virtual function strnncoll() from MY_COLLATION_HANDLER - Adding a wrapper function CHARSET_INFO::streq(), to compare two strings for equality. For now it calls strnncoll() internally. In the future it will turn into a virtual function. - Adding new accent sensitive case insensitive collations: - utf8mb4_general1400_as_ci - utf8mb3_general1400_as_ci They implement accent sensitive case insensitive comparison. The weight of a character is equal to the code point of its upper case variant. These collations use Unicode-14.0.0 casefolding data. The result of my_charset_utf8mb3_general1400_as_ci.strcoll() is very close to the former my_charset_utf8mb3_general_ci.strcasecmp() There is only a difference in a couple dozen rare characters, because: - the switch from "tolower" to "toupper" comparison, to make utf8mb3_general1400_as_ci closer to utf8mb3_general_ci - the switch from Unicode-3.0.0 to Unicode-14.0.0 This difference should be tolarable. See the list of affected characters in the MDEV description. Note, utf8mb4_general1400_as_ci correctly handles non-BMP characters! 
Unlike utf8mb4_general_ci, it does not treat all BMP characters as equal. - Adding classes representing names of the file based database objects: Lex_ident_db Lex_ident_table Lex_ident_trigger Their comparison collation depends on the underlying file system case sensitivity and on --lower-case-table-names and can be either my_charset_bin or my_charset_utf8mb3_general1400_as_ci. - Adding classes representing names of other database objects, whose names have case insensitive comparison style, using my_charset_utf8mb3_general1400_as_ci: Lex_ident_column Lex_ident_sys_var Lex_ident_user_var Lex_ident_sp_var Lex_ident_ps Lex_ident_i_s_table Lex_ident_window Lex_ident_func Lex_ident_partition Lex_ident_with_element Lex_ident_rpl_filter Lex_ident_master_info Lex_ident_host Lex_ident_locale Lex_ident_plugin Lex_ident_engine Lex_ident_server Lex_ident_savepoint Lex_ident_charset engine_option_value::Name - All the mentioned Lex_ident_xxx classes implement a method streq(): if (ident1.streq(ident2)) do_equal(); This method works as a wrapper for CHARSET_INFO::streq(). - Changing a lot of "LEX_CSTRING name" to "Lex_ident_xxx name" in class members and in function/method parameters. - Replacing all calls like system_charset_info->coll->strcasecmp(ident1, ident2) to ident1.streq(ident2) - Taking advantage of the c++11 user defined literal operator for LEX_CSTRING (see m_strings.h) and Lex_ident_xxx (see lex_ident.h) data types. Use example: const Lex_ident_column primary_key_name= "PRIMARY"_Lex_ident_column; is now a shorter version of: const Lex_ident_column primary_key_name= Lex_ident_column({STRING_WITH_LEN("PRIMARY")});
2023-04-26 15:27:01 +04:00
/*
  Declarations only; the definitions are not in this header.
  NOTE(review): is_stat_table() presumably tells whether (db, table) names
  one of the statistical tables; is_eits_usable() presumably tells whether
  engine-independent statistics can be used for 'field' - confirm both.
*/
bool is_stat_table(const Lex_ident_db &db, const Lex_ident_table &table);
bool is_eits_usable(Field* field);
2013-03-11 07:44:24 -07:00
2021-08-31 13:39:39 +03:00
class Histogram_builder;
/*
Common base for all histograms
*/
class Histogram_base :public Sql_alloc
2013-03-25 23:48:29 -07:00
{
public:
Histogram_base() {}
virtual ~Histogram_base()= default;
virtual bool parse(MEM_ROOT *mem_root,
const char *db_name, const char *table_name,
Field *field, const char *hist_data,
size_t hist_data_len)= 0;
virtual void serialize(Field *to_field)= 0;
virtual Histogram_type get_type()=0;
virtual uint get_width()=0;
/*
The creation-time workflow is:
* create a histogram
* init_for_collection()
* create_builder()
* feed the data to the builder
* serialize();
*/
2021-08-27 22:28:59 +03:00
virtual void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
ulonglong size)=0;
virtual Histogram_builder *create_builder(Field *col, uint col_len,
ha_rows rows)=0;
/*
This function checks that histograms should be usable only when
1) the level of optimizer_use_condition_selectivity > 3
*/
bool is_usable(THD *thd)
{
return thd->variables.optimizer_use_condition_selectivity > 3;
}
2021-08-27 22:28:59 +03:00
virtual double point_selectivity(Field *field, key_range *endpoint,
2022-01-19 18:02:40 +03:00
double avg_sel)=0;
virtual double range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp, double avg_sel)=0;
/*
Legacy: return the size of the histogram on disk.
This will be stored in mysql.column_stats.hist_size column.
The value is not really needed as one can look at
LENGTH(mysql.column_stats.histogram) directly.
*/
virtual uint get_size()=0;
};
2021-08-27 22:28:59 +03:00
/*
A Height-balanced histogram that stores numeric fractions
*/
class Histogram_binary final : public Histogram_base
{
private:
Histogram_type type;
size_t size; /* Size of values array, in bytes */
uchar *values;
uint prec_factor()
{
switch (type) {
case SINGLE_PREC_HB:
return ((uint) (1 << 8) - 1);
case DOUBLE_PREC_HB:
return ((uint) (1 << 16) - 1);
default:
DBUG_ASSERT(0);
}
return 1;
}
2013-03-25 23:48:29 -07:00
2021-08-31 00:53:09 +03:00
public:
Histogram_binary(Histogram_type type_arg) : type(type_arg)
{}
2021-08-31 00:53:09 +03:00
uint get_width() override
{
switch (type) {
case SINGLE_PREC_HB:
return (uint) size;
2021-08-31 00:53:09 +03:00
case DOUBLE_PREC_HB:
return (uint) (size / 2);
2021-08-31 00:53:09 +03:00
default:
DBUG_ASSERT(0);
}
return 0;
}
private:
uint get_value(uint i)
{
DBUG_ASSERT(i < get_width());
switch (type) {
case SINGLE_PREC_HB:
return (uint) (((uint8 *) values)[i]);
case DOUBLE_PREC_HB:
return (uint) uint2korr(values + i * 2);
default:
DBUG_ASSERT(0);
}
2013-04-13 02:36:30 -07:00
return 0;
}
/* Find the bucket which value 'pos' falls into. */
2013-03-25 23:48:29 -07:00
uint find_bucket(double pos, bool first)
{
size_t val= (size_t) (pos * prec_factor());
2013-03-25 23:48:29 -07:00
int lp= 0;
int rp= get_width() - 1;
int d= get_width() / 2;
uint i= lp + d;
for ( ; d; d= (rp - lp) / 2, i= lp + d)
2013-03-25 23:48:29 -07:00
{
if (val == get_value(i))
2013-03-25 23:48:29 -07:00
break;
if (val < get_value(i))
2013-03-25 23:48:29 -07:00
rp= i;
else if (val > get_value(i + 1))
2013-03-25 23:48:29 -07:00
lp= i + 1;
else
break;
}
if (val > get_value(i) && i < (get_width() - 1))
i++;
if (val == get_value(i))
2013-03-25 23:48:29 -07:00
{
if (first)
{
while(i && val == get_value(i - 1))
2013-03-25 23:48:29 -07:00
i--;
}
else
{
while(i + 1 < get_width() && val == get_value(i + 1))
2013-03-25 23:48:29 -07:00
i++;
}
}
return i;
}
public:
uint get_size() override {return (uint)size;}
2021-08-31 00:53:09 +03:00
Histogram_type get_type() override { return type; }
bool parse(MEM_ROOT *mem_root, const char*, const char*, Field*,
const char *hist_data, size_t hist_data_len) override;
2021-08-31 00:53:09 +03:00
void serialize(Field *to_field) override;
void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
ulonglong size) override;
2021-08-31 13:39:39 +03:00
Histogram_builder *create_builder(Field *col, uint col_len,
ha_rows rows) override;
2021-08-31 00:53:09 +03:00
2013-03-25 23:48:29 -07:00
void set_value(uint i, double val)
{
switch (type) {
case SINGLE_PREC_HB:
((uint8 *) values)[i]= (uint8) (val * prec_factor());
return;
case DOUBLE_PREC_HB:
int2store(values + i * 2, val * prec_factor());
return;
default:
DBUG_ASSERT(0);
return;
}
2013-03-25 23:48:29 -07:00
}
void set_prev_value(uint i)
{
switch (type) {
case SINGLE_PREC_HB:
((uint8 *) values)[i]= ((uint8 *) values)[i-1];
return;
case DOUBLE_PREC_HB:
int2store(values + i * 2, uint2korr(values + i * 2 - 2));
return;
default:
DBUG_ASSERT(0);
return;
}
}
2013-03-25 23:48:29 -07:00
double range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp, double avg_sel) override;
2021-08-27 22:28:59 +03:00
/*
Estimate selectivity of "col=const" using a histogram
*/
2021-08-27 22:28:59 +03:00
double point_selectivity(Field *field, key_range *endpoint,
2022-01-19 18:02:40 +03:00
double avg_sel) override;
2013-03-25 23:48:29 -07:00
};
2021-08-27 22:28:59 +03:00
/*
This is used to collect the the basic statistics from a Unique object:
- count of values
- count of distinct values
- count of distinct values that have occurred only once
2021-08-27 22:28:59 +03:00
*/
class Basic_stats_collector
{
ulonglong count; /* number of values retrieved */
ulonglong count_distinct; /* number of distinct values retrieved */
/* number of distinct values that occurred only once */
ulonglong count_distinct_single_occurence;
public:
Basic_stats_collector()
{
count= 0;
count_distinct= 0;
count_distinct_single_occurence= 0;
2021-08-27 22:28:59 +03:00
}
ulonglong get_count_distinct() const { return count_distinct; }
ulonglong get_count_single_occurence() const
{
return count_distinct_single_occurence;
}
ulonglong get_count() const { return count; }
void next(void *elem, element_count elem_cnt)
2021-08-27 22:28:59 +03:00
{
count_distinct++;
if (elem_cnt == 1)
count_distinct_single_occurence++;
count+= elem_cnt;
2021-08-27 22:28:59 +03:00
}
};
/*
  Histogram_builder is the abstract helper used to build a histogram for
  a column.

  The constructor is protected: builders are not meant to be created
  directly but obtained through Histogram_base::create_builder(...).
*/
class Histogram_builder: public Sql_alloc
{
protected:
  /* table field for which the histogram is built */
  Field *column;
  /* size of this field */
  uint col_length;
  /* number of records the histogram is built for */
  ha_rows records;

  Histogram_builder(Field *col, uint col_len, ha_rows rows)
    : column(col),
      col_length(col_len),
      records(rows)
  {
  }

public:
  /* The basic counters are collected alongside the histogram */
  Basic_stats_collector counters;

  /* Feed one value (elem) that occurred elem_cnt times */
  virtual int next(void *elem, element_count elem_cnt)= 0;

  /* Called after the last value has been fed */
  virtual void finalize()= 0;

  virtual ~Histogram_builder() {}
};
2013-03-25 23:48:29 -07:00
class Column_statistics;
class Index_statistics;
/* Statistical data on a table */
class Table_statistics
{
public:
my_bool cardinality_is_null; /* TRUE if the cardinality is unknown */
uint columns; /* Number of columns in table */
ha_rows cardinality; /* Number of rows in the table */
uchar *min_max_record_buffers; /* Record buffers for min/max values */
Column_statistics *column_stats; /* Array of statistical data for columns */
Index_statistics *index_stats; /* Array of statistical data for indexes */
/* Array of records per key for index prefixes */
ulonglong *idx_avg_frequency;
MDEV-29693 ANALYZE TABLE still flushes table definition cache when engine-independent statistics is used This commits enables reloading of engine-independent statistics without flushing the table from table definition cache. This is achieved by allowing multiple version of the TABLE_STATISTICS_CB object and having independent pointers to it in TABLE and TABLE_SHARE. The TABLE_STATISTICS_CB object have reference pointers and are freed when no one is pointing to it anymore. TABLE's TABLE_STATISTICS_CB pointer is updated to use the TABLE_SHARE's pointer when read_statistics_for_tables() is called at the beginning of a query. Main changes: - read_statistics_for_table() will allocate an new TABLE_STATISTICS_CB object. - All get_stat_values() functions has a new parameter that tells where collected data should be stored. get_stat_values() are not using the table_field object anymore to store data. - All get_stat_values() functions returns 1 if they found any data in the statistics tables. Other things: - Fixed INSERT DELAYED to not read statistics tables. - Removed Statistics_state from TABLE_STATISTICS_CB as this is not needed anymore as wer are not changing TABLE_SHARE->stats_cb while calculating or loading statistics. - Store values used with store_from_statistical_minmax_field() in TABLE_STATISTICS_CB::mem_root. This allowed me to remove the function delete_stat_values_for_table_share(). - Field_blob::store_from_statistical_minmax_field() is implemented but is not normally used as we do not yet support EIS statistics for blobs. For example Field_blob::update_min() and Field_blob::update_max() are not implemented. Note that the function can be called if there is an concurrent "ALTER TABLE MODIFY field BLOB" running because of a bug in ALTER TABLE where it deletes entries from column_stats before it has an exclusive lock on the table. - Use result of field->val_str(&val) as a pointer to the result instead of val (safetly fix). 
- Allocate memory for collected statistics in THD::mem_root, not in in TABLE::mem_root. This could cause the TABLE object to grow if a ANALYZE TABLE was run many times on the same table. This was done in allocate_statistics_for_table(), create_min_max_statistical_fields_for_table() and create_min_max_statistical_fields_for_table_share(). - Store in TABLE_STATISTICS_CB::stats_available which statistics was found in the statistics tables. - Removed index_table from class Index_prefix_calc as it was not used. - Added TABLE_SHARE::LOCK_statistics to ensure we don't load EITS in parallel. First thread will load it, others will reuse the loaded data. - Eliminate read_histograms_for_table(). The loading happens within read_statistics_for_tables() if histograms are needed. One downside is that if we have read statistics without histograms before and someone requires histograms, we have to read all statistics again (once) from the statistics tables. A smaller downside is the need to call alloc_root() for each individual histogram. Before we could allocate all the space for histograms with a single alloc_root. - Fixed bug in MyISAM and Aria where they did not properly notice that table had changed after analyze table. This was not a problem before this patch as then the MyISAM and Aria tables where flushed as part of ANALYZE table which did hide this issue. - Fixed a bug in ANALYZE table where table->records could be seen as 0 in collect_statistics_for_table(). The effect of this unlikely bug was that a full table scan could be done even if analyze_sample_percentage was not set to 1. - Changed multiple mallocs in a row to use multi_alloc_root(). - Added a mutex protection in update_statistics_for_table() to ensure that several tables are not updating the statistics at the same time. 
Some of the changes in sql_statistics.cc are based on a patch from Oleg Smirnov <olernov@gmail.com> Co-authored-by: Oleg Smirnov <olernov@gmail.com> Co-authored-by: Vicentiu Ciorbaru <cvicentiu@gmail.com> Reviewer: Sergei Petrunia <sergey@mariadb.com>
2023-08-05 01:08:05 +03:00
uchar *histograms; /* Sequence of histograms */
};
/*
  Statistical data on a column

  Note: objects of this class may be "empty", where they have almost all fields
  as zeros, for example, get_avg_frequency() will return 0.

  objects are allocated in alloc_statistics_for_table[_share].
*/
class Column_statistics :public Sql_alloc
{

private:
  static const uint Scale_factor_nulls_ratio= 100000;
  static const uint Scale_factor_avg_length= 100000;
  static const uint Scale_factor_avg_frequency= 100000;

public:
  /* The histogram object is deleted together with the statistics */
  ~Column_statistics()
  {
    delete histogram;
  }

  /*
    Bitmap of the statistical characteristics of the column, indexed by
    the enum_column_stat_col values. A set bit means that the
    characteristic is NULL, i.e. has not been provided.
  */
  uint32 column_stat_nulls;

  /* For the below two, see comments in get_column_range_cardinality() */
  /* Minimum value for the column */
  Field *min_value;
  /* Maximum value for the column */
  Field *max_value;

private:

  /*
    The ratio Z/N multiplied by the scale factor Scale_factor_nulls_ratio,
    where
      N is the total number of rows,
      Z is the number of nulls in the column
  */
  ulong nulls_ratio;

  /*
    Average number of bytes occupied by the representation of a
    value of the column in memory buffers such as join buffer
    multiplied by the scale factor Scale_factor_avg_length.
    CHAR values are stripped of trailing spaces.
    Flexible values are stripped of their length prefixes.
  */
  ulonglong avg_length;

  /*
    The ratio N/D multiplied by the scale factor Scale_factor_avg_frequency,
    where
       N is the number of rows with not null value in the column,
       D the number of distinct values among them
  */
  ulonglong avg_frequency;

public:
  Histogram_base *histogram;
  bool histogram_exists;

  /*
    The bitmap with the bits COLUMN_STAT_MIN_VALUE .. COLUMN_STAT_HISTOGRAM
    set, i.e. the state where none of the statistical values is provided.
  */
  uint32 no_values_provided_bitmap()
  {
    return
     ((1 << (COLUMN_STAT_HISTOGRAM-COLUMN_STAT_COLUMN_NAME))-1) <<
      (COLUMN_STAT_COLUMN_NAME+1);
  }

  /* Mark all statistical values as not provided */
  void set_all_nulls()
  {
    column_stat_nulls= no_values_provided_bitmap();
  }

  /* Mark the value of the statistical column stat_field_no as provided */
  void set_not_null(uint stat_field_no)
  {
    column_stat_nulls&= ~(1 << stat_field_no);
  }

  /* Mark the value of the statistical column stat_field_no as not provided */
  void set_null(uint stat_field_no)
  {
    column_stat_nulls|= (1 << stat_field_no);
  }

  /* TRUE if the value of the statistical column stat_field_no is not provided */
  bool is_null(uint stat_field_no)
  {
    return MY_TEST(column_stat_nulls & (1 << stat_field_no));
  }

  /* The getters below convert the scaled integers back to doubles */
  double get_nulls_ratio()
  {
    return (double) nulls_ratio /  Scale_factor_nulls_ratio;
  }

  double get_avg_length()
  {
    return (double) avg_length / Scale_factor_avg_length;
  }

  double get_avg_frequency()
  {
    return (double) avg_frequency / Scale_factor_avg_frequency;
  }

  /*
    The setters below store val multiplied by the scale factor; the
    fractional part of the product is dropped by the integer cast.
  */
  void set_nulls_ratio (double val)
  {
    nulls_ratio= (ulong) (val * Scale_factor_nulls_ratio);
  }

  void set_avg_length (double val)
  {
    avg_length= (ulonglong) (val * Scale_factor_avg_length);
  }

  void set_avg_frequency (double val)
  {
    avg_frequency= (ulonglong) (val * Scale_factor_avg_frequency);
  }

  /* TRUE if both the minimum and the maximum value are provided */
  bool min_max_values_are_provided()
  {
    return !is_null(COLUMN_STAT_MIN_VALUE) &&
      !is_null(COLUMN_STAT_MAX_VALUE);
  }

  /*
    This function checks whether the values for the fields of the statistical
    tables that were NULL by DEFAULT for a column have changed or not.

    @retval
    TRUE: Statistics are not present for a column
    FALSE: Statistics are present for a column
  */
  bool no_stat_values_provided()
  {
    return (column_stat_nulls == no_values_provided_bitmap());
  }
};
/* Statistical data on an index prefixes */
class Index_statistics
{
private:
static const uint Scale_factor_avg_frequency= 100000;
/*
The k-th element of this array contains the ratio N/D
multiplied by the scale factor Scale_factor_avg_frequency,
where N is the number of index entries without nulls
in the first k components, and D is the number of distinct
k-component prefixes among them
*/
ulonglong *avg_frequency;
public:
void init_avg_frequency(ulonglong *ptr) { avg_frequency= ptr; }
bool avg_frequency_is_inited() { return avg_frequency != NULL; }
double get_avg_frequency(uint i)
{
return (double) avg_frequency[i] / Scale_factor_avg_frequency;
}
void set_avg_frequency(uint i, double val)
{
avg_frequency[i]= (ulonglong) (val * Scale_factor_avg_frequency);
}
};
#endif /* SQL_STATISTICS_H */