mirror of
https://github.com/MariaDB/server.git
synced 2025-01-29 02:05:57 +01:00
Introduce analyze_sample_percentage variable
The variable controls the amount of sampling analyze table performs. If ANALYZE table with histogram collection is too slow, one can reduce the time taken by setting analyze_sample_percentage to a lower value of the total number of rows. Setting it to 0 will use a formula to compute how many rows to sample: The number of rows collected is capped to a minimum of 50000 and increases logarithmically with a coffecient of 4096. The coffecient is chosen so that we expect an error of less than 3% in our estimations according to the paper: "Random Sampling for Histogram Construction: How much is enough?” – Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998. The drawback of sampling is that avg_frequency number is computed imprecisely and will yeild a smaller number than the real one.
This commit is contained in:
parent
47f15ea73c
commit
f0773b7842
8 changed files with 265 additions and 10 deletions
|
@ -15,6 +15,10 @@ The following specify which files/extra groups are read (specified before remain
|
|||
--alter-algorithm[=name]
|
||||
Specify the alter table algorithm. One of: DEFAULT, COPY,
|
||||
INPLACE, NOCOPY, INSTANT
|
||||
--analyze-sample-percentage=#
|
||||
Percentage of rows from the table ANALYZE TABLE will
|
||||
sample to collect table statistics. Set to 0 to let
|
||||
MariaDB decide what percentage of rows to sample.
|
||||
-a, --ansi Use ANSI SQL syntax instead of MySQL syntax. This mode
|
||||
will also set transaction isolation level 'serializable'.
|
||||
--auto-increment-increment[=#]
|
||||
|
@ -1385,6 +1389,7 @@ The following specify which files/extra groups are read (specified before remain
|
|||
Variables (--variable-name=value)
|
||||
allow-suspicious-udfs FALSE
|
||||
alter-algorithm DEFAULT
|
||||
analyze-sample-percentage 100
|
||||
auto-increment-increment 1
|
||||
auto-increment-offset 1
|
||||
autocommit TRUE
|
||||
|
|
|
@ -1760,3 +1760,107 @@ DROP TABLE t1;
|
|||
# End of 10.2 tests
|
||||
#
|
||||
set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
|
||||
#
|
||||
# Start of 10.4 tests
|
||||
#
|
||||
#
|
||||
# Test analyze_sample_percentage system variable.
|
||||
#
|
||||
set @save_use_stat_tables=@@use_stat_tables;
|
||||
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
|
||||
set session rand_seed1=42;
|
||||
set session rand_seed2=62;
|
||||
set use_stat_tables=PREFERABLY;
|
||||
set histogram_size=10;
|
||||
CREATE TABLE t1 (id int);
|
||||
INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1);
|
||||
INSERT INTO t1 (id) SELECT id FROM t1;
|
||||
INSERT INTO t1 SELECT id+1 FROM t1;
|
||||
INSERT INTO t1 SELECT id+2 FROM t1;
|
||||
INSERT INTO t1 SELECT id+4 FROM t1;
|
||||
INSERT INTO t1 SELECT id+8 FROM t1;
|
||||
INSERT INTO t1 SELECT id+16 FROM t1;
|
||||
INSERT INTO t1 SELECT id+32 FROM t1;
|
||||
INSERT INTO t1 SELECT id+64 FROM t1;
|
||||
INSERT INTO t1 SELECT id+128 FROM t1;
|
||||
INSERT INTO t1 SELECT id+256 FROM t1;
|
||||
INSERT INTO t1 SELECT id+512 FROM t1;
|
||||
INSERT INTO t1 SELECT id+1024 FROM t1;
|
||||
INSERT INTO t1 SELECT id+2048 FROM t1;
|
||||
INSERT INTO t1 SELECT id+4096 FROM t1;
|
||||
INSERT INTO t1 SELECT id+9192 FROM t1;
|
||||
#
|
||||
# This query will should show a full table scan analysis.
|
||||
#
|
||||
ANALYZE TABLE t1;
|
||||
Table Op Msg_type Msg_text
|
||||
test.t1 analyze status Engine-independent statistics collected
|
||||
test.t1 analyze status OK
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
|
||||
set analyze_sample_percentage=0.1;
|
||||
#
|
||||
# This query will show an innacurate avg_frequency value.
|
||||
#
|
||||
ANALYZE TABLE t1;
|
||||
Table Op Msg_type Msg_text
|
||||
test.t1 analyze status Engine-independent statistics collected
|
||||
test.t1 analyze status Table is already up to date
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||
t1 id 111 17026 0.0000 4.0000 1.0047 0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
|
||||
#
|
||||
# This query will show a better avg_frequency value.
|
||||
#
|
||||
set analyze_sample_percentage=25;
|
||||
ANALYZE TABLE t1;
|
||||
Table Op Msg_type Msg_text
|
||||
test.t1 analyze status Engine-independent statistics collected
|
||||
test.t1 analyze status Table is already up to date
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||
t1 id 1 17384 0.0000 4.0000 3.5736 0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
|
||||
set analyze_sample_percentage=0;
|
||||
#
|
||||
# Test self adjusting sampling level.
|
||||
#
|
||||
ANALYZE TABLE t1;
|
||||
Table Op Msg_type Msg_text
|
||||
test.t1 analyze status Engine-independent statistics collected
|
||||
test.t1 analyze status Table is already up to date
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||
t1 id 1 17384 0.0000 4.0000 7.4523 0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
|
||||
#
|
||||
# Test record estimation is working properly.
|
||||
#
|
||||
select count(*) from t1;
|
||||
count(*)
|
||||
229376
|
||||
explain select * from t1;
|
||||
id select_type table type possible_keys key key_len ref rows Extra
|
||||
1 SIMPLE t1 ALL NULL NULL NULL NULL 229060
|
||||
set analyze_sample_percentage=100;
|
||||
ANALYZE TABLE t1;
|
||||
Table Op Msg_type Msg_text
|
||||
test.t1 analyze status Engine-independent statistics collected
|
||||
test.t1 analyze status Table is already up to date
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
|
||||
explain select * from t1;
|
||||
id select_type table type possible_keys key key_len ref rows Extra
|
||||
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
|
||||
set use_stat_tables=@save_use_stat_tables;
|
||||
drop table t1;
|
||||
|
|
|
@ -901,3 +901,91 @@ DROP TABLE t1;
|
|||
--echo #
|
||||
set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
|
||||
|
||||
|
||||
|
||||
--echo #
|
||||
--echo # Start of 10.4 tests
|
||||
--echo #
|
||||
|
||||
--echo #
|
||||
--echo # Test analyze_sample_percentage system variable.
|
||||
--echo #
|
||||
set @save_use_stat_tables=@@use_stat_tables;
|
||||
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
|
||||
|
||||
set session rand_seed1=42;
|
||||
set session rand_seed2=62;
|
||||
|
||||
set use_stat_tables=PREFERABLY;
|
||||
set histogram_size=10;
|
||||
|
||||
CREATE TABLE t1 (id int);
|
||||
INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1);
|
||||
INSERT INTO t1 (id) SELECT id FROM t1;
|
||||
INSERT INTO t1 SELECT id+1 FROM t1;
|
||||
INSERT INTO t1 SELECT id+2 FROM t1;
|
||||
INSERT INTO t1 SELECT id+4 FROM t1;
|
||||
INSERT INTO t1 SELECT id+8 FROM t1;
|
||||
INSERT INTO t1 SELECT id+16 FROM t1;
|
||||
INSERT INTO t1 SELECT id+32 FROM t1;
|
||||
INSERT INTO t1 SELECT id+64 FROM t1;
|
||||
INSERT INTO t1 SELECT id+128 FROM t1;
|
||||
INSERT INTO t1 SELECT id+256 FROM t1;
|
||||
INSERT INTO t1 SELECT id+512 FROM t1;
|
||||
INSERT INTO t1 SELECT id+1024 FROM t1;
|
||||
INSERT INTO t1 SELECT id+2048 FROM t1;
|
||||
INSERT INTO t1 SELECT id+4096 FROM t1;
|
||||
INSERT INTO t1 SELECT id+9192 FROM t1;
|
||||
|
||||
--echo #
|
||||
--echo # This query will should show a full table scan analysis.
|
||||
--echo #
|
||||
ANALYZE TABLE t1;
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
|
||||
set analyze_sample_percentage=0.1;
|
||||
|
||||
--echo #
|
||||
--echo # This query will show an innacurate avg_frequency value.
|
||||
--echo #
|
||||
ANALYZE TABLE t1;
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
|
||||
--echo #
|
||||
--echo # This query will show a better avg_frequency value.
|
||||
--echo #
|
||||
set analyze_sample_percentage=25;
|
||||
ANALYZE TABLE t1;
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
|
||||
|
||||
set analyze_sample_percentage=0;
|
||||
--echo #
|
||||
--echo # Test self adjusting sampling level.
|
||||
--echo #
|
||||
ANALYZE TABLE t1;
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
-- echo #
|
||||
-- echo # Test record estimation is working properly.
|
||||
-- echo #
|
||||
select count(*) from t1;
|
||||
explain select * from t1;
|
||||
|
||||
set analyze_sample_percentage=100;
|
||||
ANALYZE TABLE t1;
|
||||
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
|
||||
DECODE_HISTOGRAM(hist_type, histogram)
|
||||
from mysql.column_stats;
|
||||
explain select * from t1;
|
||||
|
||||
set use_stat_tables=@save_use_stat_tables;
|
||||
|
||||
drop table t1;
|
||||
|
|
|
@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL
|
|||
ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT OPTIONAL
|
||||
VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE
|
||||
SESSION_VALUE 100.000000
|
||||
GLOBAL_VALUE 100.000000
|
||||
GLOBAL_VALUE_ORIGIN COMPILE-TIME
|
||||
DEFAULT_VALUE 100.000000
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE DOUBLE
|
||||
VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample.
|
||||
NUMERIC_MIN_VALUE 0
|
||||
NUMERIC_MAX_VALUE 100
|
||||
NUMERIC_BLOCK_SIZE NULL
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME AUTOCOMMIT
|
||||
SESSION_VALUE ON
|
||||
GLOBAL_VALUE ON
|
||||
|
|
|
@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL
|
|||
ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT OPTIONAL
|
||||
VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE
|
||||
SESSION_VALUE 100.000000
|
||||
GLOBAL_VALUE 100.000000
|
||||
GLOBAL_VALUE_ORIGIN COMPILE-TIME
|
||||
DEFAULT_VALUE 100.000000
|
||||
VARIABLE_SCOPE SESSION
|
||||
VARIABLE_TYPE DOUBLE
|
||||
VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample.
|
||||
NUMERIC_MIN_VALUE 0
|
||||
NUMERIC_MAX_VALUE 100
|
||||
NUMERIC_BLOCK_SIZE NULL
|
||||
ENUM_VALUE_LIST NULL
|
||||
READ_ONLY NO
|
||||
COMMAND_LINE_ARGUMENT REQUIRED
|
||||
VARIABLE_NAME AUTOCOMMIT
|
||||
SESSION_VALUE ON
|
||||
GLOBAL_VALUE ON
|
||||
|
|
|
@ -622,6 +622,7 @@ typedef struct system_variables
|
|||
ulong optimizer_selectivity_sampling_limit;
|
||||
ulong optimizer_use_condition_selectivity;
|
||||
ulong use_stat_tables;
|
||||
double sample_percentage;
|
||||
ulong histogram_size;
|
||||
ulong histogram_type;
|
||||
ulong preload_buff_size;
|
||||
|
|
|
@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
|
|||
Field *table_field;
|
||||
ha_rows rows= 0;
|
||||
handler *file=table->file;
|
||||
double sample_fraction= thd->variables.sample_percentage / 100;
|
||||
const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
|
||||
|
||||
DBUG_ENTER("collect_statistics_for_table");
|
||||
|
||||
table->collected_stats->cardinality_is_null= TRUE;
|
||||
table->collected_stats->cardinality= 0;
|
||||
|
||||
if (thd->variables.sample_percentage == 0)
|
||||
{
|
||||
if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
|
||||
{
|
||||
sample_fraction= 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
sample_fraction= std::fmin(
|
||||
(MIN_THRESHOLD_FOR_SAMPLING + 4096 *
|
||||
log(200 * file->records())) / file->records(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
for (field_ptr= table->field; *field_ptr; field_ptr++)
|
||||
{
|
||||
table_field= *field_ptr;
|
||||
|
@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
|
|||
|
||||
/* Perform a full table scan to collect statistics on 'table's columns */
|
||||
if (!(rc= file->ha_rnd_init(TRUE)))
|
||||
{
|
||||
{
|
||||
DEBUG_SYNC(table->in_use, "statistics_collection_start");
|
||||
|
||||
while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE)
|
||||
|
@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
|
|||
if (rc)
|
||||
break;
|
||||
|
||||
for (field_ptr= table->field; *field_ptr; field_ptr++)
|
||||
if (thd_rnd(thd) <= sample_fraction)
|
||||
{
|
||||
table_field= *field_ptr;
|
||||
if (!bitmap_is_set(table->read_set, table_field->field_index))
|
||||
continue;
|
||||
if ((rc= table_field->collected_stats->add()))
|
||||
for (field_ptr= table->field; *field_ptr; field_ptr++)
|
||||
{
|
||||
table_field= *field_ptr;
|
||||
if (!bitmap_is_set(table->read_set, table_field->field_index))
|
||||
continue;
|
||||
if ((rc= table_field->collected_stats->add()))
|
||||
break;
|
||||
}
|
||||
if (rc)
|
||||
break;
|
||||
rows++;
|
||||
}
|
||||
if (rc)
|
||||
break;
|
||||
rows++;
|
||||
}
|
||||
file->ha_rnd_end();
|
||||
}
|
||||
|
@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
|
|||
if (!rc)
|
||||
{
|
||||
table->collected_stats->cardinality_is_null= FALSE;
|
||||
table->collected_stats->cardinality= rows;
|
||||
table->collected_stats->cardinality=
|
||||
static_cast<ha_rows>(rows / sample_fraction);
|
||||
}
|
||||
|
||||
bitmap_clear_all(table->write_set);
|
||||
|
|
|
@ -350,6 +350,15 @@ static Sys_var_long Sys_pfs_connect_attrs_size(
|
|||
|
||||
#endif /* WITH_PERFSCHEMA_STORAGE_ENGINE */
|
||||
|
||||
static Sys_var_double Sys_analyze_sample_percentage(
|
||||
"analyze_sample_percentage",
|
||||
"Percentage of rows from the table ANALYZE TABLE will sample "
|
||||
"to collect table statistics. Set to 0 to let MariaDB decide "
|
||||
"what percentage of rows to sample.",
|
||||
SESSION_VAR(sample_percentage),
|
||||
CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 100),
|
||||
DEFAULT(100));
|
||||
|
||||
static Sys_var_ulong Sys_auto_increment_increment(
|
||||
"auto_increment_increment",
|
||||
"Auto-increment columns are incremented by this",
|
||||
|
|
Loading…
Add table
Reference in a new issue