mariadb/storage/sequence/sequence.cc
Monty d9d0e78039 Add limits for how many IO operations a table access will do
This solves the current problem in the optimizer
- SELECT FROM big_table
  - SELECT from small_table where small_table.eq_ref_key=big_table.id

The old code assumed that each eq_ref access will cause an IO.
As the cost of IO is high, this dominated the cost for the later table
which caused the optimizer to prefer table scans + join cache over
index reads.

This patch fixes this issue by limiting the number of expected IO calls,
for rows and index separately, to the size of the table or index or
the number of accesses that we expect in a range for the index.

The major changes are:

- Adding a new structure ALL_READ_COST that is mainly used in
  best_access_path() to hold the costs parts of the cost we are
  calculating. This allows us to limit the number of IO when multiplying
  the cost with the previous row combinations.
- All storage engine cost functions are changed to return IO_AND_CPU_COST.
  The virtual cost functions should now return in IO_AND_CPU_COST.io
  the number of disk blocks that will be accessed instead of the cost
  of the access.
- We are not limiting the io_blocks for table or index scans as we
  assume that engines may not store these in the 'hot' part of the
  cache. Table and index scan also uses much less IO blocks than
  key accesses, so the original issue is not as critical with scans.

Other things:
  OPT_RANGE now holds a 'Cost_estimate cost' instead of a lot of different
  costs. All the old costs, like index_only_read, can be extracted
  from 'cost'.
- Added to the start of some functions 'handler *file= table->file'
  to shorten the code that is using the handler.
- handler->cost() is used to change a ALL_READ_COST or IO_AND_CPU_COST
  to 'cost in milliseconds'
- New functions:  handler::index_blocks() and handler::row_blocks()
  which are used to limit the IO.
- Added index_cost and row_cost to Cost_estimate and removed all not
  needed members.
- Removed cost coefficients from Cost_estimate as these don't make sense
  when costs (except IO_BLOCKS) are in milliseconds.
- Removed handler::avg_io_cost() and replaced it with DISK_READ_COST.
- Renamed best_range_rowid_filter_for_partial_join() to
  best_range_rowid_filter() as using the old name made rows too long.
- Changed all SJ_MATERIALIZATION_INFO 'Cost_estimate' variables to
  'double' as Cost_estimate power was not used for these and thus
  just caused storage and performance overhead.
- Changed cost_for_index_read() to use 'worst_seeks' to only limit
  IO, not number of table accesses. With this patch worst_seeks is
  probably not needed anymore, but I kept it around just in case.
- Applying cost for filter got to be much shorter and easier thanks
  to the API changes.
- Adjusted cost for fulltext keys in collaboration with Sergei Golubchik.
- Most test changes caused by this patch are that table scans are changed
  to use indexes.
- Added ha_seq::keyread_time() and ha_seq::key_scan_time() to make
  checking the number of potential IO blocks easier during debugging.
2023-02-02 23:57:30 +03:00

562 lines
15 KiB
C++

/*
Copyright (c) 2013 Monty Program Ab
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
*/
/*
a engine that auto-creates tables with rows filled with sequential values
*/
#include <my_config.h>
#include <ctype.h>
#include <mysql_version.h>
#include <item.h>
#include <item_sum.h>
#include <handler.h>
#include <table.h>
#include <field.h>
#include <sql_limit.h>
static handlerton *sequence_hton;

/*
  Shared, per-table state for all open handlers of one sequence table.
  Holds the parsed range and the table lock. Created lazily by
  ha_seq::get_share() and cached in the TABLE_SHARE.
*/
class Sequence_share : public Handler_share {
public:
const char *name;          // normalized table path (set in get_share())
THR_LOCK lock;             // table-level lock shared by all handler instances
ulonglong from, to, step;  // half-open range [from, to) with stride 'step'
bool reverse;              // true when the table name had from > to
Sequence_share(const char *name_arg, ulonglong from_arg, ulonglong to_arg,
ulonglong step_arg, bool reverse_arg):
name(name_arg), from(from_arg), to(to_arg), step(step_arg),
reverse(reverse_arg)
{
thr_lock_init(&lock);
}
~Sequence_share()
{
thr_lock_delete(&lock);
}
};
/*
  Handler for one open instance of a virtual sequence table.
  Rows are generated on the fly from the shared [from, to) range;
  nothing is ever stored on disk.
*/
class ha_seq final : public handler
{
private:
THR_LOCK_DATA lock;
Sequence_share *get_share();
ulonglong cur;  // scan cursor: the sequence value the scan is positioned at
public:
Sequence_share *seqs;
ha_seq(handlerton *hton, TABLE_SHARE *table_arg)
: handler(hton, table_arg), seqs(0) { }
ulonglong table_flags() const
{ return HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE; }
/* open/close/locking */
/* Tables are auto-discovered from their name; explicit CREATE is rejected */
int create(const char *name, TABLE *table_arg,
HA_CREATE_INFO *create_info)
{ return HA_ERR_WRONG_COMMAND; }
int open(const char *name, int mode, uint test_if_locked);
int close(void);
/* DROP always succeeds: there is nothing on disk to remove */
int delete_table(const char *name)
{
return 0;
}
THR_LOCK_DATA **store_lock(THD *, THR_LOCK_DATA **, enum thr_lock_type);
/* table scan */
int rnd_init(bool scan);
int rnd_next(unsigned char *buf);
void position(const uchar *record);
int rnd_pos(uchar *buf, uchar *pos);
int info(uint flag);
IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows,
ulonglong blocks) override
{
/* Avoids assert in total_cost() and makes DBUG_PRINT more consistent */
return {0,0};
}
IO_AND_CPU_COST scan_time()
{
/* Avoids assert in total_cost() and makes DBUG_PRINT more consistent */
return {0, 0};
}
/* indexes */
ulong index_flags(uint inx, uint part, bool all_parts) const
{ return HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
HA_READ_RANGE | HA_KEYREAD_ONLY; }
uint max_supported_keys() const { return 1; }
int index_read_map(uchar *buf, const uchar *key, key_part_map keypart_map,
enum ha_rkey_function find_flag);
int index_next(uchar *buf);
int index_prev(uchar *buf);
int index_first(uchar *buf);
int index_last(uchar *buf);
ha_rows records_in_range(uint inx, const key_range *start_key,
const key_range *end_key, page_range *pages);
private:
void set(uchar *buf);
/* Row count; exact because get_share() made 'to' a multiple of 'step'
   past 'from' (exclusive end) */
ulonglong nvalues() { return (seqs->to - seqs->from)/seqs->step; }
};
/*
  Register our lock in the statement's lock array.
  An unlocked handler is upgraded to TL_WRITE_ALLOW_WRITE so that
  concurrent statements never block each other on this virtual table.
*/
THR_LOCK_DATA **ha_seq::store_lock(THD *thd, THR_LOCK_DATA **to,
                                   enum thr_lock_type lock_type)
{
  const bool need_upgrade= (lock_type != TL_IGNORE &&
                            lock.type == TL_UNLOCK);
  if (need_upgrade)
    lock.type= TL_WRITE_ALLOW_WRITE;
  *to= &lock;
  return to + 1;
}
void ha_seq::set(unsigned char *buf)
{
MY_BITMAP *old_map = dbug_tmp_use_all_columns(table, &table->write_set);
my_ptrdiff_t offset = (my_ptrdiff_t) (buf - table->record[0]);
Field *field = table->field[0];
field->move_field_offset(offset);
field->store(cur, true);
field->move_field_offset(-offset);
dbug_tmp_restore_column_map(&table->write_set, old_map);
}
/* Start a full table scan at whichever end matches the scan direction */
int ha_seq::rnd_init(bool scan)
{
  if (seqs->reverse)
    cur= seqs->to;
  else
    cur= seqs->from;
  return 0;
}
/* A table scan is just an index scan in the share's preferred direction */
int ha_seq::rnd_next(unsigned char *buf)
{
  return seqs->reverse ? index_prev(buf) : index_next(buf);
}
/* Save the current cursor value as the row's position ('ref' is 8 bytes,
   see ref_length in open()) */
void ha_seq::position(const uchar *record)
{
*(ulonglong*)ref= cur;
}

/*
  Restore a saved position and fetch a row from there.
  NOTE(review): index_next() emits before advancing and index_prev()
  advances before emitting, so the row produced here is the scan
  successor of the saved cursor value — confirm this round-trips to
  the intended row for callers of position()/rnd_pos().
*/
int ha_seq::rnd_pos(uchar *buf, uchar *pos)
{
cur= *(ulonglong*)pos;
return rnd_next(buf);
}
/* Report statistics; the row count is the only dynamic value we maintain */
int ha_seq::info(uint flag)
{
  if ((flag & HA_STATUS_VARIABLE) != 0)
    stats.records= nvalues();
  return 0;
}
/*
  Position the cursor according to a key lookup and return the first
  matching row. The key is the 8-byte stored value of the 'seq' column.

  HA_READ_AFTER_KEY / HA_READ_KEY_OR_NEXT search forward,
  HA_READ_BEFORE_KEY / HA_READ_PREFIX_LAST_OR_PREV search backward,
  HA_READ_KEY_EXACT requires the key to be an exact sequence value.

  @return 0 on success, HA_ERR_KEY_NOT_FOUND when no row matches,
          HA_ERR_WRONG_COMMAND for unsupported search modes
*/
int ha_seq::index_read_map(uchar *buf, const uchar *key_arg,
key_part_map keypart_map,
enum ha_rkey_function find_flag)
{
ulonglong key= uint8korr(key_arg);
switch (find_flag) {
case HA_READ_AFTER_KEY:
key++;
// fall through
case HA_READ_KEY_OR_NEXT:
if (key <= seqs->from)
cur= seqs->from;
else
{
/* Round the key up to the next value of the sequence */
cur= (key - seqs->from + seqs->step - 1) / seqs->step * seqs->step + seqs->from;
if (cur >= seqs->to)
return HA_ERR_KEY_NOT_FOUND;
}
return index_next(buf);
case HA_READ_KEY_EXACT:
/* The key must be from + N*step and lie inside [from, to).
   The modulo is computed first, but any unsigned wrap-around for
   key < from is rejected by the range checks that follow. */
if ((key - seqs->from) % seqs->step != 0 || key < seqs->from || key >= seqs->to)
return HA_ERR_KEY_NOT_FOUND;
cur= key;
return index_next(buf);
case HA_READ_BEFORE_KEY:
key--;
// fall through
case HA_READ_PREFIX_LAST_OR_PREV:
if (key >= seqs->to)
cur= seqs->to;
else
{
if (key < seqs->from)
return HA_ERR_KEY_NOT_FOUND;
/* Round the key down to the previous value of the sequence */
cur= (key - seqs->from) / seqs->step * seqs->step + seqs->from;
}
return index_prev(buf);
default: return HA_ERR_WRONG_COMMAND;
}
}
/*
  Emit the row at 'cur', then step forward.
  'seqs->to' is the exclusive end of the range, so reaching it means
  the scan is exhausted.
*/
int ha_seq::index_next(uchar *buf)
{
if (cur == seqs->to)
return HA_ERR_END_OF_FILE;
set(buf);
cur+= seqs->step;
return 0;
}

/*
  Step backward, then emit the row — the mirror of index_next().
  Hitting 'seqs->from' means the previous call already produced the
  first row of the sequence.
*/
int ha_seq::index_prev(uchar *buf)
{
if (cur == seqs->from)
return HA_ERR_END_OF_FILE;
cur-= seqs->step;
set(buf);
return 0;
}
/* First row in index order: position at 'from' and read forward */
int ha_seq::index_first(uchar *buf)
{
cur= seqs->from;
return index_next(buf);
}

/* Last row in index order: position at the exclusive end and read backward */
int ha_seq::index_last(uchar *buf)
{
cur= seqs->to;
return index_prev(buf);
}
/*
  Compute (exactly) the number of sequence values in [min_key, max_key].
  A missing endpoint defaults to the corresponding table bound.

  NOTE(review): the key_range flags (open vs. closed endpoints) are
  ignored, so half-open ranges are counted as if inclusive — presumably
  acceptable for an optimizer estimate; verify against callers.
*/
ha_rows ha_seq::records_in_range(uint inx, const key_range *min_key,
const key_range *max_key,
page_range *pages)
{
ulonglong kmin= min_key ? uint8korr(min_key->key) : seqs->from;
ulonglong kmax= max_key ? uint8korr(max_key->key) : seqs->to - 1;
if (kmin >= seqs->to || kmax < seqs->from || kmin > kmax)
return 0;
/* Index of the last point <= kmax, minus index of the first point
   >= kmin, plus one */
return (kmax - seqs->from) / seqs->step -
(kmin - seqs->from + seqs->step - 1) / seqs->step + 1;
}
/*
  Open the table: attach the shared range description and register with
  the share's table lock.
  @return 0 on success, HA_ERR_OUT_OF_MEM if the share can't be created
*/
int ha_seq::open(const char *name, int mode, uint test_if_locked)
{
if (!(seqs= get_share()))
return HA_ERR_OUT_OF_MEM;
DBUG_ASSERT(my_strcasecmp(table_alias_charset, name, seqs->name) == 0);
/* position()/rnd_pos() store the cursor value itself as the row 'ref' */
ref_length= sizeof(cur);
thr_lock_data_init(&seqs->lock,&lock,NULL);
return 0;
}

/* Nothing to release here: the share is owned by the TABLE_SHARE */
int ha_seq::close(void)
{
return 0;
}
/* Factory used by the handlerton to create per-table handler objects */
static handler *create_handler(handlerton *hton, TABLE_SHARE *table,
MEM_ROOT *mem_root)
{
return new (mem_root) ha_seq(hton, table);
}
static bool parse_table_name(const char *name, size_t name_length,
ulonglong *from, ulonglong *to, ulonglong *step)
{
uint n0=0, n1= 0, n2= 0;
*step= 1;
// the table is discovered if its name matches the pattern of seq_1_to_10 or
// seq_1_to_10_step_3
sscanf(name, "seq_%llu_to_%n%llu%n_step_%llu%n",
from, &n0, to, &n1, step, &n2);
// I consider this a bug in sscanf() - when an unsigned number
// is requested, -5 should *not* be accepted. But is is :(
// hence the additional check below:
return
n0 == 0 || !isdigit(name[4]) || !isdigit(name[n0]) || // reject negative numbers
(n1 != name_length && n2 != name_length);
}
/*
  Get (or lazily create) the Sequence_share for this table.
  The first opener parses the range out of the table name, normalizes it
  and caches the share in the TABLE_SHARE via set_ha_share_ptr().

  @return the share, or NULL on out-of-memory
*/
Sequence_share *ha_seq::get_share()
{
Sequence_share *tmp_share;
lock_shared_ha_data();
if (!(tmp_share= static_cast<Sequence_share*>(get_ha_share_ptr())))
{
bool reverse;
ulonglong from, to, step;
/* The name was already validated during discovery, so the result of
   parse_table_name() can safely be ignored here */
parse_table_name(table_share->table_name.str,
table_share->table_name.length, &from, &to, &step);
if ((reverse = from > to))
{
/* A step larger than the span leaves a single value: from itself */
if (step > from - to)
to = from;
else
swap_variables(ulonglong, from, to);
/*
when keyread is allowed, optimizer will always prefer an index to a
table scan for our tables, and we'll never see the range reversed.
*/
table_share->keys_for_keyread.clear_all();
}
/* Normalize 'to' to the exclusive end: one step past the last value */
to= (to - from) / step * step + step + from;
tmp_share= new Sequence_share(table_share->normalized_path.str, from, to, step, reverse);
if (!tmp_share)
goto err;
set_ha_share_ptr(static_cast<Handler_share*>(tmp_share));
}
err:
unlock_shared_ha_data();
return tmp_share;
}
/*
  Auto-discover a table whose name matches the sequence pattern by
  materializing its fixed single-column schema from a CREATE statement.

  @return 0 on success, HA_ERR_NO_SUCH_TABLE when the name doesn't match,
          HA_WRONG_CREATE_OPTION for a zero step
*/
static int discover_table(handlerton *hton, THD *thd, TABLE_SHARE *share)
{
ulonglong from, to, step;
if (parse_table_name(share->table_name.str, share->table_name.length,
&from, &to, &step))
return HA_ERR_NO_SUCH_TABLE;
if (step == 0)
return HA_WRONG_CREATE_OPTION;
const char *sql="create table seq (seq bigint unsigned primary key)";
return share->init_from_sql_statement_string(thd, 0, sql, strlen(sql));
}
/* A sequence table "exists" exactly when its name parses as a sequence */
static int discover_table_existence(handlerton *hton, const char *db,
                                    const char *table_name)
{
  ulonglong from, to, step;
  /* parse_table_name() returns true on failure, so existence is its negation */
  bool parsed_ok= !parse_table_name(table_name, strlen(table_name),
                                    &from, &to, &step);
  return parsed_ok;
}
/* The engine is non-transactional: commit/rollback/savepoints are no-ops */
static int dummy_commit_rollback(handlerton *, THD *, bool) { return 0; }
static int dummy_savepoint(handlerton *, THD *, void *) { return 0; }
/*****************************************************************************
Example of a simple group by handler for queries like:
SELECT SUM(seq) from sequence_table;
This implementation supports SUM() and COUNT() on primary key.
*****************************************************************************/
/*
  Pushed-down aggregation: answers SUM(seq)/COUNT(seq) queries in closed
  form without scanning, producing a single result row in next_row().
*/
class ha_seq_group_by_handler: public group_by_handler
{
Select_limit_counters limit;  // copy of the query's LIMIT/OFFSET
List<Item> *fields;           // SELECT list (only SUM/COUNT items, verified
                              // by create_group_by_handler())
TABLE_LIST *table_list;       // the single sequence table in FROM
bool first_row;               // true until next_row() has produced its row
public:
ha_seq_group_by_handler(THD *thd_arg, List<Item> *fields_arg,
TABLE_LIST *table_list_arg,
Select_limit_counters *orig_lim)
: group_by_handler(thd_arg, sequence_hton), limit(orig_lim[0]),
fields(fields_arg), table_list(table_list_arg)
{
// Reset limit because we are handling it now
orig_lim->set_unlimited();
}
~ha_seq_group_by_handler() {}
int init_scan() { first_row= 1 ; return 0; }
int next_row();
int end_scan() { return 0; }
};
/*
  Offer a group_by_handler when the query is an aggregate-only SELECT
  over a single sequence table: every item must be SUM(seq), COUNT(seq)
  or COUNT(<constant>), with no joins, WHERE or GROUP BY.

  @return the handler, or NULL when the query doesn't qualify
*/
static group_by_handler *
create_group_by_handler(THD *thd, Query *query)
{
ha_seq_group_by_handler *handler;
Item *item;
List_iterator_fast<Item> it(*query->select);
/* check that only one table is used in FROM clause and no sub queries */
if (query->from->next_local != 0)
return 0;
/* check that there is no where clause and no group_by */
if (query->where != 0 || query->group_by != 0)
return 0;
/*
Check that all fields are sum(primary_key) or count(primary_key)
For more ways to work with the field list and sum functions, see
opt_sum.cc::opt_sum_query().
*/
while ((item= it++))
{
Item *arg0;
Field *field;
if (item->type() != Item::SUM_FUNC_ITEM ||
(((Item_sum*) item)->sum_func() != Item_sum::SUM_FUNC &&
((Item_sum*) item)->sum_func() != Item_sum::COUNT_FUNC))
return 0; // Not a SUM() function
arg0= ((Item_sum*) item)->get_arg(0);
if (arg0->type() != Item::FIELD_ITEM)
{
if ((((Item_sum*) item)->sum_func() == Item_sum::COUNT_FUNC) &&
arg0->basic_const_item())
continue; // Allow count(1)
return 0;
}
field= ((Item_field*) arg0)->field;
/*
Check that we are using the sequence table (the only table in the FROM
clause) and not an outer table.
*/
if (field->table != query->from->table)
return 0;
/* Check that we are using a SUM() on the primary key */
if (strcmp(field->field_name.str, "seq"))
return 0;
}
/* Create handler and return it */
handler= new ha_seq_group_by_handler(thd, query->select, query->from,
query->limit);
return handler;
}
/*
  Produce the single result row of the pushed-down aggregate query;
  each SUM()/COUNT() value is computed in closed form from the range.
  @return 0 on the first call, HA_ERR_END_OF_FILE afterwards (or when
          LIMIT 0 / a non-zero OFFSET suppresses the row)
*/
int ha_seq_group_by_handler::next_row()
{
List_iterator_fast<Item> it(*fields);
Item_sum *item_sum;
Sequence_share *seqs= ((ha_seq*) table_list->table->file)->seqs;
DBUG_ENTER("ha_seq_group_by_handler::next_row");
/*
Check if this is the first call to the function. If not, we have already
returned all data.
*/
if (!first_row ||
limit.get_offset_limit() > 0 ||
limit.get_select_limit() == 0)
DBUG_RETURN(HA_ERR_END_OF_FILE);
first_row= 0;
/* Pointer to first field in temporary table where we should store summary*/
Field **field_ptr= table->field;
ulonglong elements= (seqs->to - seqs->from + seqs->step - 1) / seqs->step;
while ((item_sum= (Item_sum*) it++))
{
Field *field= *(field_ptr++);
switch (item_sum->sum_func()) {
case Item_sum::COUNT_FUNC:
{
Item *arg0= ((Item_sum*) item_sum)->get_arg(0);
/* COUNT(NULL) is 0; COUNT(seq) or COUNT(<non-null const>) counts rows */
if (arg0->basic_const_item() && arg0->is_null())
field->store(0LL, 1);
else
field->store((longlong) elements, 1);
break;
}
case Item_sum::SUM_FUNC:
{
/* Calculate SUM(f, f+step, f+step*2 ... to) */
/* Arithmetic series: n*from + step*(0+1+...+(n-1)); unsigned math,
   so values wrap modulo 2^64 by design */
ulonglong sum;
sum= seqs->from * elements + seqs->step * (elements*elements-elements)/2;
field->store((longlong) sum, 1);
break;
}
default:
/* create_group_by_handler() only accepts SUM and COUNT */
DBUG_ASSERT(0);
}
field->set_notnull();
}
DBUG_RETURN(0);
}
/*
  Cost model for the optimizer: the engine never touches disk, and every
  per-key / per-row operation costs the same tiny constant.
*/
static void sequence_update_optimizer_costs(OPTIMIZER_COSTS *costs)
{
  /* No disk is ever involved */
  costs->disk_read_cost= 0;
  costs->disk_read_ratio= 0.0;

  const double op_cost= 0.0000062391530550;
  costs->key_next_find_cost= op_cost;
  costs->key_lookup_cost= op_cost;
  costs->key_copy_cost= op_cost;
  costs->row_next_find_cost= op_cost;
  costs->row_lookup_cost= op_cost;
  costs->row_copy_cost= op_cost;
}
/*****************************************************************************
Initialize the interface between the sequence engine and MariaDB
*****************************************************************************/
/*
  Handle DROP TABLE: nothing exists on disk, so just validate that the
  path names a sequence table.
  @return 0 on success, ENOENT when the name doesn't match the pattern
*/
static int drop_table(handlerton *hton, const char *path)
{
  const char *sep= strrchr(path, FN_LIBCHAR);
  ulonglong from, to, step;
  /*
    Guard against a path without a directory separator: strrchr() would
    return NULL and 'NULL + 1' is undefined behavior. In practice the
    server always passes a full path, but be defensive.
  */
  const char *name= sep ? sep + 1 : path;
  if (parse_table_name(name, strlen(name), &from, &to, &step))
    return ENOENT;
  return 0;
}
/* Plugin init: wire the sequence engine's callbacks into the handlerton */
static int init(void *p)
{
  handlerton *hton= (handlerton *)p;
  sequence_hton= hton;

  /* Table lifecycle and discovery */
  hton->create= create_handler;
  hton->drop_table= drop_table;
  hton->discover_table= discover_table;
  hton->discover_table_existence= discover_table_existence;

  /* Non-transactional engine: all transaction hooks are no-ops */
  hton->commit= dummy_commit_rollback;
  hton->rollback= dummy_commit_rollback;
  hton->savepoint_set= dummy_savepoint;
  hton->savepoint_rollback= dummy_savepoint;
  hton->savepoint_release= dummy_savepoint;

  /* Optimizer integration */
  hton->create_group_by= create_group_by_handler;
  hton->update_optimizer_costs= sequence_update_optimizer_costs;
  return 0;
}
/* Version handshake between this engine and the server's handler API */
static struct st_mysql_storage_engine descriptor =
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
maria_declare_plugin(sequence)
{
MYSQL_STORAGE_ENGINE_PLUGIN,
&descriptor,
"SEQUENCE",
"Sergei Golubchik",
"Generated tables filled with sequential values",
PLUGIN_LICENSE_GPL,
init,                             /* plugin init */
NULL,                             /* plugin deinit */
0x0100,                           /* version 1.0 */
NULL,                             /* no status variables */
NULL,                             /* no system variables */
"0.1",
MariaDB_PLUGIN_MATURITY_STABLE
}
maria_declare_plugin_end;