mariadb/innobase/row/row0sel.c
heikki@donna.mysql.fi 3abaff79ce row0sel.c Removes now trailing spaces from varchars
srv0start.c	Removes now trailing spaces from varchars
row0mysql.ic	Removes now trailing spaces from varchars
ha_innobase.cc	Removes now trailing spaces from varchars
2001-03-02 19:07:29 +02:00

2737 lines
66 KiB
C

/*******************************************************
Select
(c) 1997 Innobase Oy
Created 12/19/1997 Heikki Tuuri
*******************************************************/
#include "row0sel.h"
#ifdef UNIV_NONINL
#include "row0sel.ic"
#endif
#include "dict0dict.h"
#include "dict0boot.h"
#include "trx0undo.h"
#include "trx0trx.h"
#include "btr0btr.h"
#include "btr0cur.h"
#include "btr0sea.h"
#include "mach0data.h"
#include "que0que.h"
#include "row0upd.h"
#include "row0row.h"
#include "row0vers.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "eval0eval.h"
#include "pars0sym.h"
#include "pars0pars.h"
#include "row0mysql.h"
/* Maximum number of rows to prefetch; MySQL interface has another parameter */
#define SEL_MAX_N_PREFETCH 16
/* Number of rows fetched, after which to start prefetching; MySQL interface
has another parameter */
#define SEL_PREFETCH_LIMIT 1
/* When a select has accessed about this many pages, it returns control back
to que_run_threads: this is to allow canceling runaway queries */
#define SEL_COST_LIMIT 100
/* Flags for search shortcut */
#define SEL_FOUND 0
#define SEL_EXHAUSTED 1
#define SEL_RETRY 2
/*************************************************************************
Creates a select node struct. */
sel_node_t*
sel_node_create(
/*============*/
/* out, own: select node struct */
mem_heap_t* heap) /* in: memory heap where created */
{
sel_node_t* node;
node = mem_heap_alloc(heap, sizeof(sel_node_t));
node->common.type = QUE_NODE_SELECT;
node->state = SEL_NODE_OPEN;
node->select_will_do_update = FALSE;
node->latch_mode = BTR_SEARCH_LEAF;
node->plans = NULL;
return(node);
}
/*************************************************************************
Frees the memory private to a select node when a query graph is freed,
does not free the heap where the node was originally created. */
void
sel_node_free_private(
/*==================*/
sel_node_t* node) /* in: select node struct */
{
ulint i;
plan_t* plan;
if (node->plans != NULL) {
for (i = 0; i < node->n_tables; i++) {
plan = sel_node_get_nth_plan(node, i);
btr_pcur_close(&(plan->pcur));
btr_pcur_close(&(plan->clust_pcur));
if (plan->old_vers_heap) {
mem_heap_free(plan->old_vers_heap);
}
}
}
}
/*************************************************************************
Evaluates the values in a select list. If there are aggregate functions,
their argument value is added to the aggregate total. */
UNIV_INLINE
void
sel_eval_select_list(
/*=================*/
sel_node_t* node) /* in: select node */
{
que_node_t* exp;
exp = node->select_list;
while (exp) {
eval_exp(exp);
exp = que_node_get_next(exp);
}
}
/*************************************************************************
Assigns the values in the select list to the possible into-variables in
SELECT ... INTO ... */
UNIV_INLINE
void
sel_assign_into_var_values(
/*=======================*/
sym_node_t* var, /* in: first variable in a list of variables */
sel_node_t* node) /* in: select node */
{
que_node_t* exp;
if (var == NULL) {
return;
}
exp = node->select_list;
while (var) {
ut_ad(exp);
eval_node_copy_val(var->alias, exp);
exp = que_node_get_next(exp);
var = que_node_get_next(var);
}
}
/*************************************************************************
Resets the aggregate value totals in the select list of an aggregate type
query. */
UNIV_INLINE
void
sel_reset_aggregate_vals(
/*=====================*/
sel_node_t* node) /* in: select node */
{
func_node_t* func_node;
ut_ad(node->is_aggregate);
func_node = node->select_list;
while (func_node) {
eval_node_set_int_val(func_node, 0);
func_node = que_node_get_next(func_node);
}
node->aggregate_already_fetched = FALSE;
}
/*************************************************************************
Copies the input variable values when an explicit cursor is opened. */
UNIV_INLINE
void
row_sel_copy_input_variable_vals(
/*=============================*/
sel_node_t* node) /* in: select node */
{
sym_node_t* var;
var = UT_LIST_GET_FIRST(node->copy_variables);
while (var) {
eval_node_copy_val(var, var->alias);
var->indirection = NULL;
var = UT_LIST_GET_NEXT(col_var_list, var);
}
}
/*************************************************************************
Fetches the column values from a record. */
static
void
row_sel_fetch_columns(
/*==================*/
dict_index_t* index, /* in: record index */
rec_t* rec, /* in: record in a clustered or non-clustered
index */
sym_node_t* column) /* in: first column in a column list, or
NULL */
{
dfield_t* val;
ulint index_type;
ulint field_no;
byte* data;
ulint len;
if (index->type & DICT_CLUSTERED) {
index_type = SYM_CLUST_FIELD_NO;
} else {
index_type = SYM_SEC_FIELD_NO;
}
while (column) {
field_no = column->field_nos[index_type];
if (field_no != ULINT_UNDEFINED) {
data = rec_get_nth_field(rec, field_no, &len);
if (column->copy_val) {
eval_node_copy_and_alloc_val(column, data,
len);
} else {
val = que_node_get_val(column);
dfield_set_data(val, data, len);
}
}
column = UT_LIST_GET_NEXT(col_var_list, column);
}
}
/*************************************************************************
Allocates a prefetch buffer for a column when prefetch is first time done. */
static
void
sel_col_prefetch_buf_alloc(
/*=======================*/
sym_node_t* column) /* in: symbol table node for a column */
{
sel_buf_t* sel_buf;
ulint i;
ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
* sizeof(sel_buf_t));
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
sel_buf = column->prefetch_buf + i;
sel_buf->data = NULL;
sel_buf->val_buf_size = 0;
}
}
/*************************************************************************
Frees a prefetch buffer for a column, including the dynamically allocated
memory for data stored there. */
void
sel_col_prefetch_buf_free(
/*======================*/
sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */
{
sel_buf_t* sel_buf;
ulint i;
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
sel_buf = prefetch_buf + i;
if (sel_buf->val_buf_size > 0) {
mem_free(sel_buf->data);
}
}
}
/*************************************************************************
Pops the column values for a prefetched, cached row from the column prefetch
buffers and places them to the val fields in the column nodes. */
static
void
sel_pop_prefetched_row(
/*===================*/
plan_t* plan) /* in: plan node for a table */
{
sym_node_t* column;
sel_buf_t* sel_buf;
dfield_t* val;
byte* data;
ulint len;
ulint val_buf_size;
ut_ad(plan->n_rows_prefetched > 0);
column = UT_LIST_GET_FIRST(plan->columns);
while (column) {
val = que_node_get_val(column);
if (!column->copy_val) {
/* We did not really push any value for the
column */
ut_ad(!column->prefetch_buf);
ut_ad(que_node_get_val_buf_size(column) == 0);
#ifdef UNIV_DEBUG
dfield_set_data(val, NULL, 0);
#endif
goto next_col;
}
ut_ad(column->prefetch_buf);
sel_buf = column->prefetch_buf + plan->first_prefetched;
data = sel_buf->data;
len = sel_buf->len;
val_buf_size = sel_buf->val_buf_size;
/* We must keep track of the allocated memory for
column values to be able to free it later: therefore
we swap the values for sel_buf and val */
sel_buf->data = dfield_get_data(val);
sel_buf->len = dfield_get_len(val);
sel_buf->val_buf_size = que_node_get_val_buf_size(column);
dfield_set_data(val, data, len);
que_node_set_val_buf_size(column, val_buf_size);
next_col:
column = UT_LIST_GET_NEXT(col_var_list, column);
}
plan->n_rows_prefetched--;
plan->first_prefetched++;
}
/*************************************************************************
Pushes the column values for a prefetched, cached row to the column prefetch
buffers from the val fields in the column nodes. */
UNIV_INLINE
void
sel_push_prefetched_row(
/*====================*/
plan_t* plan) /* in: plan node for a table */
{
sym_node_t* column;
sel_buf_t* sel_buf;
dfield_t* val;
byte* data;
ulint len;
ulint pos;
ulint val_buf_size;
if (plan->n_rows_prefetched == 0) {
pos = 0;
plan->first_prefetched = 0;
} else {
pos = plan->n_rows_prefetched;
/* We have the convention that pushing new rows starts only
after the prefetch stack has been emptied: */
ut_ad(plan->first_prefetched == 0);
}
plan->n_rows_prefetched++;
ut_ad(pos < SEL_MAX_N_PREFETCH);
column = UT_LIST_GET_FIRST(plan->columns);
while (column) {
if (!column->copy_val) {
/* There is no sense to push pointers to database
page fields when we do not keep latch on the page! */
goto next_col;
}
if (!column->prefetch_buf) {
/* Allocate a new prefetch buffer */
sel_col_prefetch_buf_alloc(column);
}
sel_buf = column->prefetch_buf + pos;
val = que_node_get_val(column);
data = dfield_get_data(val);
len = dfield_get_len(val);
val_buf_size = que_node_get_val_buf_size(column);
/* We must keep track of the allocated memory for
column values to be able to free it later: therefore
we swap the values for sel_buf and val */
dfield_set_data(val, sel_buf->data, sel_buf->len);
que_node_set_val_buf_size(column, sel_buf->val_buf_size);
sel_buf->data = data;
sel_buf->len = len;
sel_buf->val_buf_size = val_buf_size;
next_col:
column = UT_LIST_GET_NEXT(col_var_list, column);
}
}
/*************************************************************************
Builds a previous version of a clustered index record for a consistent read */
static
ulint
row_sel_build_prev_vers(
/*====================*/
/* out: DB_SUCCESS or error code */
read_view_t* read_view, /* in: read view */
plan_t* plan, /* in: plan node for table */
rec_t* rec, /* in: record in a clustered index */
rec_t** old_vers, /* out: old version, or NULL if the
record does not exist in the view:
i.e., it was freshly inserted
afterwards */
mtr_t* mtr) /* in: mtr */
{
ulint err;
if (plan->old_vers_heap) {
mem_heap_empty(plan->old_vers_heap);
} else {
plan->old_vers_heap = mem_heap_create(512);
}
err = row_vers_build_for_consistent_read(rec, mtr, plan->index,
read_view, plan->old_vers_heap,
old_vers);
return(err);
}
/*************************************************************************
Tests the conditions which determine when the index segment we are searching
through has been exhausted. */
UNIV_INLINE
ibool
row_sel_test_end_conds(
/*===================*/
/* out: TRUE if row passed the tests */
plan_t* plan) /* in: plan for the table; the column values must
already have been retrieved and the right sides of
comparisons evaluated */
{
func_node_t* cond;
/* All conditions in end_conds are comparisons of a column to an
expression */
cond = UT_LIST_GET_FIRST(plan->end_conds);
while (cond) {
/* Evaluate the left side of the comparison, i.e., get the
column value if there is an indirection */
eval_sym(cond->args);
/* Do the comparison */
if (!eval_cmp(cond)) {
return(FALSE);
}
cond = UT_LIST_GET_NEXT(cond_list, cond);
}
return(TRUE);
}
/*************************************************************************
Tests the other conditions. */
UNIV_INLINE
ibool
row_sel_test_other_conds(
/*=====================*/
/* out: TRUE if row passed the tests */
plan_t* plan) /* in: plan for the table; the column values must
already have been retrieved */
{
func_node_t* cond;
cond = UT_LIST_GET_FIRST(plan->other_conds);
while (cond) {
eval_exp(cond);
if (!eval_node_get_ibool_val(cond)) {
return(FALSE);
}
cond = UT_LIST_GET_NEXT(cond_list, cond);
}
return(TRUE);
}
/*************************************************************************
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking. */
static
ulint
row_sel_get_clust_rec(
/*==================*/
/* out: DB_SUCCESS or error code */
sel_node_t* node, /* in: select_node */
plan_t* plan, /* in: plan node for table */
rec_t* rec, /* in: record in a non-clustered index */
que_thr_t* thr, /* in: query thread */
rec_t** out_rec,/* out: clustered record or an old version of
it, NULL if the old version did not exist
in the read view, i.e., it was a fresh
inserted version */
mtr_t* mtr) /* in: mtr used to get access to the
non-clustered record; the same mtr is used to
access the clustered index */
{
dict_index_t* index;
rec_t* clust_rec;
rec_t* old_vers;
ulint err;
row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec);
index = dict_table_get_first_index(plan->table);
btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
node->latch_mode, &(plan->clust_pcur),
0, mtr);
clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
ut_ad(page_rec_is_user_rec(clust_rec));
if (!node->read_view) {
/* Try to place a lock on the index record */
err = lock_clust_rec_read_check_and_lock(0, clust_rec, index,
node->row_lock_mode, thr);
if (err != DB_SUCCESS) {
return(err);
}
} else {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (!lock_clust_rec_cons_read_sees(clust_rec, index,
node->read_view)) {
err = row_sel_build_prev_vers(node->read_view, plan,
clust_rec, &old_vers, mtr);
if (err != DB_SUCCESS) {
return(err);
}
clust_rec = old_vers;
if (clust_rec == NULL) {
*out_rec = clust_rec;
return(DB_SUCCESS);
}
}
}
/* Fetch the columns needed in test conditions */
row_sel_fetch_columns(index, clust_rec,
UT_LIST_GET_FIRST(plan->columns));
*out_rec = clust_rec;
return(DB_SUCCESS);
}
/*************************************************************************
Sets a lock on a record. */
UNIV_INLINE
ulint
sel_set_rec_lock(
/*=============*/
/* out: DB_SUCCESS or error code */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index */
ulint mode, /* in: lock mode */
que_thr_t* thr) /* in: query thread */
{
ulint err;
if (index->type & DICT_CLUSTERED) {
err = lock_clust_rec_read_check_and_lock(0, rec, index, mode,
thr);
} else {
err = lock_sec_rec_read_check_and_lock(0, rec, index, mode,
thr);
}
return(err);
}
/*************************************************************************
Opens a pcur to a table index. */
static
void
row_sel_open_pcur(
/*==============*/
sel_node_t* node, /* in: select node */
plan_t* plan, /* in: table plan */
ibool search_latch_locked,
/* in: TRUE if the thread currently
has the search latch locked in
s-mode */
mtr_t* mtr) /* in: mtr */
{
dict_index_t* index;
func_node_t* cond;
que_node_t* exp;
ulint n_fields;
ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
ulint i;
if (search_latch_locked) {
has_search_latch = RW_S_LATCH;
}
index = plan->index;
/* Calculate the value of the search tuple: the exact match columns
get their expressions evaluated when we evaluate the right sides of
end_conds */
cond = UT_LIST_GET_FIRST(plan->end_conds);
while (cond) {
eval_exp(que_node_get_next(cond->args));
cond = UT_LIST_GET_NEXT(cond_list, cond);
}
if (plan->tuple) {
n_fields = dtuple_get_n_fields(plan->tuple);
if (plan->n_exact_match < n_fields) {
/* There is a non-exact match field which must be
evaluated separately */
eval_exp(plan->tuple_exps[n_fields - 1]);
}
for (i = 0; i < n_fields; i++) {
exp = plan->tuple_exps[i];
dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
que_node_get_val(exp));
}
/* Open pcur to the index */
btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
node->latch_mode, &(plan->pcur),
has_search_latch, mtr);
} else {
/* Open the cursor to the start or the end of the index
(FALSE: no init) */
btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode,
&(plan->pcur), FALSE, mtr);
}
ut_ad(plan->n_rows_prefetched == 0);
ut_ad(plan->n_rows_fetched == 0);
ut_ad(plan->cursor_at_end == FALSE);
plan->pcur_is_open = TRUE;
}
/*************************************************************************
Restores a stored pcur position to a table index. */
UNIV_INLINE
ibool
row_sel_restore_pcur_pos(
/*=====================*/
/* out: TRUE if the cursor should be moved to
the next record after we return from this
function (moved to the previous, in the case
of a descending cursor) without processing
again the current cursor record */
sel_node_t* node, /* in: select node */
plan_t* plan, /* in: table plan */
mtr_t* mtr) /* in: mtr */
{
ibool equal_position;
ulint relative_position;
ut_ad(!plan->cursor_at_end);
relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
equal_position = btr_pcur_restore_position(node->latch_mode,
&(plan->pcur), mtr);
/* If the cursor is traveling upwards, and relative_position is
(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
yet on the successor of the page infimum;
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
first record GREATER than the predecessor of a page supremum; we have
not yet processed the cursor record: no need to move the cursor to the
next record;
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
last record LESS or EQUAL to the old stored user record; (a) if
equal_position is FALSE, this means that the cursor is now on a record
less than the old user record, and we must move to the next record;
(b) if equal_position is TRUE, then if
plan->stored_cursor_rec_processed is TRUE, we must move to the next
record, else there is no need to move the cursor. */
if (plan->asc) {
if (relative_position == BTR_PCUR_ON) {
if (equal_position) {
return(plan->stored_cursor_rec_processed);
}
return(TRUE);
}
ut_ad(relative_position == BTR_PCUR_AFTER);
return(FALSE);
}
/* If the cursor is traveling downwards, and relative_position is
(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
the last record LESS than the successor of a page infimum; we have not
processed the cursor record: no need to move the cursor;
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
first record GREATER than the predecessor of a page supremum; we have
processed the cursor record: we should move the cursor to the previous
record;
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
last record LESS or EQUAL to the old stored user record; (a) if
equal_position is FALSE, this means that the cursor is now on a record
less than the old user record, and we need not move to the previous
record; (b) if equal_position is TRUE, then if
plan->stored_cursor_rec_processed is TRUE, we must move to the previous
record, else there is no need to move the cursor. */
if (relative_position == BTR_PCUR_BEFORE) {
return(FALSE);
}
if (relative_position == BTR_PCUR_ON) {
if (equal_position) {
return(plan->stored_cursor_rec_processed);
}
return(FALSE);
}
ut_ad(relative_position == BTR_PCUR_AFTER);
return(TRUE);
}
/*************************************************************************
Resets a plan cursor to a closed state. */
UNIV_INLINE
void
plan_reset_cursor(
/*==============*/
plan_t* plan) /* in: plan */
{
plan->pcur_is_open = FALSE;
plan->cursor_at_end = FALSE;
plan->n_rows_fetched = 0;
plan->n_rows_prefetched = 0;
}
/*************************************************************************
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the hash index if possible (not always). */
static
ulint
row_sel_try_search_shortcut(
/*========================*/
/* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
sel_node_t* node, /* in: select node for a consistent read */
plan_t* plan, /* in: plan for a unique search in clustered
index */
mtr_t* mtr) /* in: mtr */
{
dict_index_t* index;
rec_t* rec;
index = plan->index;
ut_ad(node->read_view);
ut_ad(plan->unique_search);
ut_ad(!plan->must_get_clust);
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
row_sel_open_pcur(node, plan, TRUE, mtr);
rec = btr_pcur_get_rec(&(plan->pcur));
if (!page_rec_is_user_rec(rec)) {
return(SEL_RETRY);
}
ut_ad(plan->mode == PAGE_CUR_GE);
/* As the cursor is now placed on a user record after a search with
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
fields in the user record matched to the search tuple */
if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
return(SEL_EXHAUSTED);
}
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (index->type & DICT_CLUSTERED) {
if (!lock_clust_rec_cons_read_sees(rec, index,
node->read_view)) {
return(SEL_RETRY);
}
} else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) {
return(SEL_RETRY);
}
/* Test deleted flag. Fetch the columns needed in test conditions. */
row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns));
if (rec_get_deleted_flag(rec)) {
return(SEL_EXHAUSTED);
}
/* Test the rest of search conditions */
if (!row_sel_test_other_conds(plan)) {
return(SEL_EXHAUSTED);
}
ut_ad(plan->pcur.latch_mode == node->latch_mode);
plan->n_rows_fetched++;
return(SEL_FOUND);
}
/*************************************************************************
Performs a select step. */
static
ulint
row_sel(
/*====*/
/* out: DB_SUCCESS or error code */
sel_node_t* node, /* in: select node */
que_thr_t* thr) /* in: query thread */
{
dict_index_t* index;
plan_t* plan;
mtr_t mtr;
ibool moved;
rec_t* rec;
rec_t* old_vers;
rec_t* clust_rec;
ibool search_latch_locked;
ibool consistent_read;
/* The following flag becomes TRUE when we are doing a
consistent read from a non-clustered index and we must look
at the clustered index to find out the previous delete mark
state of the non-clustered record: */
ibool cons_read_requires_clust_rec = FALSE;
ulint cost_counter = 0;
ibool cursor_just_opened;
ibool must_go_to_next;
ibool leaf_contains_updates = FALSE;
/* TRUE if select_will_do_update is
TRUE and the current clustered index
leaf page has been updated during
the current mtr: mtr must be committed
at the same time as the leaf x-latch
is released */
ibool mtr_has_extra_clust_latch = FALSE;
/* TRUE if the search was made using
a non-clustered index, and we had to
access the clustered record: now &mtr
contains a clustered index latch, and
&mtr must be committed before we move
to the next non-clustered record */
ulint found_flag;
ulint err;
ut_ad(thr->run_node == node);
search_latch_locked = FALSE;
if (node->read_view) {
/* In consistent reads, we try to do with the hash index and
not to use the buffer page get. This is to reduce memory bus
load resulting from semaphore operations. The search latch
will be s-locked when we access an index with a unique search
condition, but not locked when we access an index with a
less selective search condition. */
consistent_read = TRUE;
} else {
consistent_read = FALSE;
}
table_loop:
/* TABLE LOOP
----------
This is the outer major loop in calculating a join. We come here when
node->fetch_table changes, and after adding a row to aggregate totals
and, of course, when this function is called. */
ut_ad(leaf_contains_updates == FALSE);
ut_ad(mtr_has_extra_clust_latch == FALSE);
plan = sel_node_get_nth_plan(node, node->fetch_table);
index = plan->index;
if (plan->n_rows_prefetched > 0) {
sel_pop_prefetched_row(plan);
goto next_table_no_mtr;
}
if (plan->cursor_at_end) {
/* The cursor has already reached the result set end: no more
rows to process for this table cursor, as also the prefetch
stack was empty */
ut_ad(plan->pcur_is_open);
goto table_exhausted_no_mtr;
}
/* Open a cursor to index, or restore an open cursor position */
mtr_start(&mtr);
if (consistent_read && plan->unique_search && !plan->pcur_is_open
&& !plan->must_get_clust) {
if (!search_latch_locked) {
rw_lock_s_lock(&btr_search_latch);
search_latch_locked = TRUE;
} else if (btr_search_latch.writer_is_wait_ex) {
/* There is an x-latch request waiting: release the
s-latch for a moment; as an s-latch here is often
kept for some 10 searches before being released,
a waiting x-latch request would block other threads
from acquiring an s-latch for a long time, lowering
performance significantly in multiprocessors. */
rw_lock_s_unlock(&btr_search_latch);
rw_lock_s_lock(&btr_search_latch);
}
found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
if (found_flag == SEL_FOUND) {
goto next_table;
} else if (found_flag == SEL_EXHAUSTED) {
goto table_exhausted;
}
ut_ad(found_flag == SEL_RETRY);
plan_reset_cursor(plan);
mtr_commit(&mtr);
mtr_start(&mtr);
}
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
search_latch_locked = FALSE;
}
if (!plan->pcur_is_open) {
/* Evaluate the expressions to build the search tuple and
open the cursor */
row_sel_open_pcur(node, plan, search_latch_locked, &mtr);
cursor_just_opened = TRUE;
/* A new search was made: increment the cost counter */
cost_counter++;
} else {
/* Restore pcur position to the index */
must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr);
cursor_just_opened = FALSE;
if (must_go_to_next) {
/* We have already processed the cursor record: move
to the next */
goto next_rec;
}
}
rec_loop:
/* RECORD LOOP
-----------
In this loop we use pcur and try to fetch a qualifying row, and
also fill the prefetch buffer for this table if n_rows_fetched has
exceeded a threshold. While we are inside this loop, the following
holds:
(1) &mtr is started,
(2) pcur is positioned and open.
NOTE that if cursor_just_opened is TRUE here, it means that we came
to this point right after row_sel_open_pcur. */
ut_ad(mtr_has_extra_clust_latch == FALSE);
rec = btr_pcur_get_rec(&(plan->pcur));
/* PHASE 1: Set a lock if specified */
if (!node->asc && cursor_just_opened
&& (rec != page_get_supremum_rec(buf_frame_align(rec)))) {
/* When we open a cursor for a descending search, we must set
a next-key lock on the successor record: otherwise it would
be possible to insert new records next to the cursor position,
and it might be that these new records should appear in the
search result set, resulting in the phantom problem. */
if (!consistent_read) {
err = sel_set_rec_lock(page_rec_get_next(rec), index,
node->row_lock_mode, thr);
if (err != DB_SUCCESS) {
/* Note that in this case we will store in pcur
the PREDECESSOR of the record we are waiting
the lock for */
goto lock_wait_or_error;
}
}
}
if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
/* The infimum record on a page cannot be in the result set,
and neither can a record lock be placed on it: we skip such
a record. We also increment the cost counter as we may have
processed yet another page of index. */
cost_counter++;
goto next_rec;
}
if (!consistent_read) {
/* Try to place a lock on the index record */
err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
}
if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
/* A page supremum record cannot be in the result set: skip
it now when we have placed a possible lock on it */
goto next_rec;
}
ut_ad(page_rec_is_user_rec(rec));
if (cost_counter > SEL_COST_LIMIT) {
/* Now that we have placed the necessary locks, we can stop
for a while and store the cursor position; NOTE that if we
would store the cursor position BEFORE placing a record lock,
it might happen that the cursor would jump over some records
that another transaction could meanwhile insert adjacent to
the cursor: this would result in the phantom problem. */
goto stop_for_a_while;
}
/* PHASE 2: Check a mixed index mix id if needed */
if (plan->unique_search && cursor_just_opened) {
ut_ad(plan->mode == PAGE_CUR_GE);
/* As the cursor is now placed on a user record after a search
with the mode PAGE_CUR_GE, the up_match field in the cursor
tells how many fields in the user record matched to the search
tuple */
if (btr_pcur_get_up_match(&(plan->pcur))
< plan->n_exact_match) {
goto table_exhausted;
}
/* Ok, no need to test end_conds or mix id */
} else if (plan->mixed_index) {
/* We have to check if the record in a mixed cluster belongs
to this table */
if (!dict_is_mixed_table_rec(plan->table, rec)) {
goto next_rec;
}
}
/* We are ready to look at a possible new index entry in the result
set: the cursor is now placed on a user record */
/* PHASE 3: Get previous version in a consistent read */
if (consistent_read) {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (index->type & DICT_CLUSTERED) {
if (!lock_clust_rec_cons_read_sees(rec, index,
node->read_view)) {
err = row_sel_build_prev_vers(node->read_view,
plan, rec, &old_vers,
&mtr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
if (old_vers == NULL) {
row_sel_fetch_columns(index, rec,
UT_LIST_GET_FIRST(plan->columns));
if (!row_sel_test_end_conds(plan)) {
goto table_exhausted;
}
goto next_rec;
}
rec = old_vers;
}
} else if (!lock_sec_rec_cons_read_sees(rec, index,
node->read_view)) {
cons_read_requires_clust_rec = TRUE;
}
}
/* PHASE 4: Test search end conditions and deleted flag */
/* Fetch the columns needed in test conditions */
row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns));
/* Test the selection end conditions: these can only contain columns
which already are found in the index, even though the index might be
non-clustered */
if (plan->unique_search && cursor_just_opened) {
/* No test necessary: the test was already made above */
} else if (!row_sel_test_end_conds(plan)) {
goto table_exhausted;
}
if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) {
/* The record is delete marked: we can skip it if this is
not a consistent read which might see an earlier version
of a non-clustered index record */
if (plan->unique_search) {
goto table_exhausted;
}
goto next_rec;
}
/* PHASE 5: Get the clustered index record, if needed and if we did
not do the search using the clustered index */
if (plan->must_get_clust || cons_read_requires_clust_rec) {
/* It was a non-clustered index and we must fetch also the
clustered index record */
err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
&mtr);
mtr_has_extra_clust_latch = TRUE;
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
/* Retrieving the clustered record required a search:
increment the cost counter */
cost_counter++;
if (clust_rec == NULL) {
/* The record did not exist in the read view */
ut_ad(consistent_read);
goto next_rec;
}
if (rec_get_deleted_flag(clust_rec)) {
/* The record is delete marked: we can skip it */
goto next_rec;
}
if (node->can_get_updated) {
btr_pcur_store_position(&(plan->clust_pcur), &mtr);
}
}
/* PHASE 6: Test the rest of search conditions */
if (!row_sel_test_other_conds(plan)) {
if (plan->unique_search) {
goto table_exhausted;
}
goto next_rec;
}
/* PHASE 7: We found a new qualifying row for the current table; push
the row if prefetch is on, or move to the next table in the join */
plan->n_rows_fetched++;
ut_ad(plan->pcur.latch_mode == node->latch_mode);
if (node->select_will_do_update) {
/* This is a searched update and we can do the update in-place,
saving CPU time */
row_upd_in_place_in_select(node, thr, &mtr);
leaf_contains_updates = TRUE;
/* When the database is in the online backup mode, the number
of log records for a single mtr should be small: increment the
cost counter to ensure it */
cost_counter += 1 + (SEL_COST_LIMIT / 8);
if (plan->unique_search) {
goto table_exhausted;
}
goto next_rec;
}
if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
|| plan->unique_search || plan->no_prefetch) {
/* No prefetch in operation: go to the next table */
goto next_table;
}
sel_push_prefetched_row(plan);
if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
/* The prefetch buffer is now full */
sel_pop_prefetched_row(plan);
goto next_table;
}
next_rec:
ut_ad(!search_latch_locked);
if (mtr_has_extra_clust_latch) {
/* We must commit &mtr if we are moving to the next
non-clustered index record, because we could break the
latching order if we would access a different clustered
index page right away without releasing the previous. */
goto commit_mtr_for_a_while;
}
if (leaf_contains_updates
&& btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) {
/* We must commit &mtr if we are moving to a different page,
because we have done updates to the x-latched leaf page, and
the latch would be released in btr_pcur_move_to_next, without
&mtr getting committed there */
ut_ad(node->asc);
goto commit_mtr_for_a_while;
}
if (node->asc) {
moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
} else {
moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
}
if (!moved) {
goto table_exhausted;
}
cursor_just_opened = FALSE;
/* END OF RECORD LOOP
------------------ */
goto rec_loop;
next_table:
/* We found a record which satisfies the conditions: we can move to
the next table or return a row in the result set */
ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr));
if (plan->unique_search && !node->can_get_updated) {
plan->cursor_at_end = TRUE;
} else {
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = TRUE;
btr_pcur_store_position(&(plan->pcur), &mtr);
}
mtr_commit(&mtr);
leaf_contains_updates = FALSE;
mtr_has_extra_clust_latch = FALSE;
next_table_no_mtr:
/* If we use 'goto' to this label, it means that the row was popped
from the prefetched rows stack, and &mtr is already committed */
if (node->fetch_table + 1 == node->n_tables) {
sel_eval_select_list(node);
if (node->is_aggregate) {
goto table_loop;
}
sel_assign_into_var_values(node->into_list, node);
thr->run_node = que_node_get_parent(node);
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
}
return(DB_SUCCESS);
}
node->fetch_table++;
/* When we move to the next table, we first reset the plan cursor:
we do not care about resetting it when we backtrack from a table */
plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
goto table_loop;
table_exhausted:
/* The table cursor pcur reached the result set end: backtrack to the
previous table in the join if we do not have cached prefetched rows */
plan->cursor_at_end = TRUE;
mtr_commit(&mtr);
leaf_contains_updates = FALSE;
mtr_has_extra_clust_latch = FALSE;
if (plan->n_rows_prefetched > 0) {
/* The table became exhausted during a prefetch */
sel_pop_prefetched_row(plan);
goto next_table_no_mtr;
}
table_exhausted_no_mtr:
if (node->fetch_table == 0) {
if (node->is_aggregate && !node->aggregate_already_fetched) {
node->aggregate_already_fetched = TRUE;
sel_assign_into_var_values(node->into_list, node);
thr->run_node = que_node_get_parent(node);
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
}
return(DB_SUCCESS);
}
node->state = SEL_NODE_NO_MORE_ROWS;
thr->run_node = que_node_get_parent(node);
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
}
return(DB_SUCCESS);
}
node->fetch_table--;
goto table_loop;
stop_for_a_while:
/* Return control for a while to que_run_threads, so that runaway
queries can be canceled. NOTE that when we come here, we must, in a
locking read, have placed the necessary (possibly waiting request)
record lock on the cursor record or its successor: when we reposition
the cursor, this record lock guarantees that nobody can meanwhile have
inserted new records which should have appeared in the result set,
which would result in the phantom problem. */
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_commit(&mtr);
ut_ad(sync_thread_levels_empty_gen(TRUE));
return(DB_SUCCESS);
commit_mtr_for_a_while:
/* Stores the cursor position and commits &mtr; this is used if
&mtr may contain latches which would break the latching order if
&mtr would not be committed and the latches released. */
plan->stored_cursor_rec_processed = TRUE;
ut_ad(!search_latch_locked);
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_commit(&mtr);
leaf_contains_updates = FALSE;
mtr_has_extra_clust_latch = FALSE;
ut_ad(sync_thread_levels_empty_gen(TRUE));
goto table_loop;
lock_wait_or_error:
/* See the note at stop_for_a_while: the same holds for this case */
ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr)
|| !node->asc);
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_commit(&mtr);
ut_ad(sync_thread_levels_empty_gen(TRUE));
return(err);
}
/**************************************************************************
Performs a select step. This is a high-level function used in SQL execution
graphs. */
que_thr_t*
row_sel_step(
/*=========*/
/* out: query thread to run next or NULL */
que_thr_t* thr) /* in: query thread */
{
ulint i_lock_mode;
sym_node_t* table_node;
sel_node_t* node;
ulint err;
ut_ad(thr);
node = thr->run_node;
ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
/* If this is a new time this node is executed (or when execution
resumes after wait for a table intention lock), set intention locks
on the tables, or assign a read view */
if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
node->state = SEL_NODE_OPEN;
}
if (node->state == SEL_NODE_OPEN) {
/* It may be that the current session has not yet started
its transaction, or it has been committed: */
trx_start_if_not_started(thr_get_trx(thr));
plan_reset_cursor(sel_node_get_nth_plan(node, 0));
if (node->consistent_read) {
/* Assign a read view for the query */
node->read_view = trx_assign_read_view(
thr_get_trx(thr));
} else {
if (node->set_x_locks) {
i_lock_mode = LOCK_IX;
} else {
i_lock_mode = LOCK_IS;
}
table_node = node->table_list;
while (table_node) {
err = lock_table(0, table_node->table,
i_lock_mode, thr);
if (err != DB_SUCCESS) {
que_thr_handle_error(thr, DB_ERROR,
NULL, 0);
return(NULL);
}
table_node = que_node_get_next(table_node);
}
}
/* If this is an explicit cursor, copy stored procedure
variable values, so that the values cannot change between
fetches (currently, we copy them also for non-explicit
cursors) */
if (node->explicit_cursor &&
UT_LIST_GET_FIRST(node->copy_variables)) {
row_sel_copy_input_variable_vals(node);
}
node->state = SEL_NODE_FETCH;
node->fetch_table = 0;
if (node->is_aggregate) {
/* Reset the aggregate total values */
sel_reset_aggregate_vals(node);
}
}
err = row_sel(node, thr);
/* NOTE! if queries are parallelized, the following assignment may
have problems; the assignment should be made only if thr is the
only top-level thr in the graph: */
thr->graph->last_sel_node = node;
if (err == DB_SUCCESS) {
/* Ok: do nothing */
} else if (err == DB_LOCK_WAIT) {
return(NULL);
} else {
/* SQL error detected */
printf("SQL error %lu\n", err);
que_thr_handle_error(thr, DB_ERROR, NULL, 0);
return(NULL);
}
return(thr);
}
/**************************************************************************
Performs a fetch for a cursor. */
que_thr_t*
fetch_step(
/*=======*/
/* out: query thread to run next or NULL */
que_thr_t* thr) /* in: query thread */
{
sel_node_t* sel_node;
fetch_node_t* node;
ut_ad(thr);
node = thr->run_node;
sel_node = node->cursor_def;
ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
if (thr->prev_node != que_node_get_parent(node)) {
if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
sel_assign_into_var_values(node->into_list, sel_node);
}
thr->run_node = que_node_get_parent(node);
return(thr);
}
/* Make the fetch node the parent of the cursor definition for
the time of the fetch, so that execution knows to return to this
fetch node after a row has been selected or we know that there is
no row left */
sel_node->common.parent = node;
if (sel_node->state == SEL_NODE_CLOSED) {
/* SQL error detected */
printf("SQL error %lu\n", DB_ERROR);
que_thr_handle_error(thr, DB_ERROR, NULL, 0);
return(NULL);
}
thr->run_node = sel_node;
return(thr);
}
/***************************************************************
Prints a row in a select result. */
que_thr_t*
row_printf_step(
/*============*/
/* out: query thread to run next or NULL */
que_thr_t* thr) /* in: query thread */
{
row_printf_node_t* node;
sel_node_t* sel_node;
que_node_t* arg;
ut_ad(thr);
node = thr->run_node;
sel_node = node->sel_node;
ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
if (thr->prev_node == que_node_get_parent(node)) {
/* Reset the cursor */
sel_node->state = SEL_NODE_OPEN;
/* Fetch next row to print */
thr->run_node = sel_node;
return(thr);
}
if (sel_node->state != SEL_NODE_FETCH) {
ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
/* No more rows to print */
thr->run_node = que_node_get_parent(node);
return(thr);
}
arg = sel_node->select_list;
while (arg) {
dfield_print_also_hex(que_node_get_val(arg));
printf(" ::: ");
arg = que_node_get_next(arg);
}
printf("\n");
/* Fetch next row to print */
thr->run_node = sel_node;
return(thr);
}
/********************************************************************
Converts a key value stored in MySQL format to an Innobase dtuple.
The last field of the key value may be just a prefix of a fixed length
field: hence the parameter key_len. */
void
row_sel_convert_mysql_key_to_innobase(
/*==================================*/
dtuple_t* tuple, /* in: tuple where to build;
NOTE: we assume that the type info
in the tuple is already according
to index! */
byte* buf, /* in: buffer to use in field
conversions */
dict_index_t* index, /* in: index of the key value */
byte* key_ptr, /* in: MySQL key value */
ulint key_len) /* in: MySQL key value length */
{
dfield_t* dfield;
ulint offset;
ulint len;
byte* key_end;
ulint n_fields = 0;
UT_NOT_USED(index);
key_end = key_ptr + key_len;
/* Permit us to access any field in the tuple (ULINT_MAX): */
dtuple_set_n_fields(tuple, ULINT_MAX);
dfield = dtuple_get_nth_field(tuple, 0);
if (dfield_get_type(dfield)->mtype == DATA_SYS) {
/* A special case: we are looking for a position in a
generated clustered index: the first and the only
ordering column is ROW_ID */
ut_a(key_len == DATA_ROW_ID_LEN);
dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
dtuple_set_n_fields(tuple, 1);
return;
}
while (key_ptr < key_end) {
offset = 0;
len = dfield_get_type(dfield)->len;
n_fields++;
if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
/* The first byte in the field tells if this is
an SQL NULL value */
offset = 1;
if (*key_ptr != 0) {
dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
goto next_part;
}
}
row_mysql_store_col_in_innobase_format(
dfield, buf, key_ptr + offset, len,
dfield_get_type(dfield)->mtype,
dfield_get_type(dfield)->prtype
& DATA_UNSIGNED);
next_part:
key_ptr += (offset + len);
if (key_ptr > key_end) {
/* The last field in key was not a complete
field but a prefix of it */
ut_ad(dfield_get_len(dfield) != UNIV_SQL_NULL);
dfield_set_data(dfield, buf,
len - (ulint)(key_ptr - key_end));
}
buf += len;
dfield++;
}
/* We set the length of tuple to n_fields: we assume that
the memory area allocated for it is big enough (usually
bigger than n_fields). */
dtuple_set_n_fields(tuple, n_fields);
}
/******************************************************************
Stores the row id to the prebuilt struct. */
UNIV_INLINE
void
row_sel_store_row_id_to_prebuilt(
/*=============================*/
row_prebuilt_t* prebuilt, /* in: prebuilt */
rec_t* index_rec, /* in: record */
dict_index_t* index) /* in: index of the record */
{
byte* data;
ulint len;
data = rec_get_nth_field(index_rec,
dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
ut_a(len == DATA_ROW_ID_LEN);
ut_memcpy(prebuilt->row_id, data, len);
}
/******************************************************************
Stores a non-SQL-NULL field in the MySQL format. */
UNIV_INLINE
void
row_sel_field_store_in_mysql_format(
/*================================*/
byte* dest, /* in/out: buffer where to store; NOTE that BLOBs
are not in themselves stored here: the caller must
allocate and copy the BLOB into buffer before, and pass
the pointer to the BLOB in 'data' */
ulint col_len,/* in: MySQL column length */
byte* data, /* in: data to store */
ulint len, /* in: length of the data */
ulint type, /* in: data type */
ulint is_unsigned)/* in: != 0 if an unsigned integer type */
{
byte* ptr;
ut_ad(len != UNIV_SQL_NULL);
if (type == DATA_INT) {
/* Convert integer data from Innobase to a little-endian
format, sign bit restored to normal */
ptr = dest + len;
for (;;) {
ptr--;
*ptr = *data;
if (ptr == dest) {
break;
}
data++;
}
if (!is_unsigned) {
dest[len - 1] = (byte) (dest[len - 1] ^ 128);
}
ut_ad(col_len == len);
} else if (type == DATA_VARCHAR || type == DATA_VARMYSQL
|| type == DATA_BINARY) {
/* Store the length of the data to the first two bytes of
dest; does not do anything yet because MySQL has
no real vars! */
dest = row_mysql_store_var_len(dest, len);
ut_memcpy(dest, data, len);
/* Pad with trailing spaces */
memset(dest + len, ' ', col_len - len);
/* ut_ad(col_len >= len + 2); No real var implemented in
MySQL yet! */
} else if (type == DATA_BLOB) {
/* Store a pointer to the BLOB buffer to dest: the BLOB was
already copied to the buffer in row_sel_store_mysql_rec */
row_mysql_store_blob_ref(dest, col_len, data, len);
} else {
ut_memcpy(dest, data, len);
ut_ad(col_len == len);
}
}
/******************************************************************
Convert a row in the Innobase format to a row in the MySQL format.
Note that the template in prebuilt may advise us to copy only a few
columns to mysql_rec, other columns are left blank. All columns may not
be needed in the query. */
static
void
row_sel_store_mysql_rec(
/*====================*/
byte* mysql_rec, /* out: row in the MySQL format */
row_prebuilt_t* prebuilt, /* in: prebuilt struct */
rec_t* rec) /* in: Innobase record in the index
which was described in prebuilt's
template */
{
mysql_row_templ_t* templ;
byte* data;
ulint len;
byte* blob_buf;
ulint i;
ut_ad(prebuilt->mysql_template);
if (prebuilt->blob_heap != NULL) {
mem_heap_free(prebuilt->blob_heap);
prebuilt->blob_heap = NULL;
}
/* Mark all columns as not SQL NULL */
memset(mysql_rec, '\0', prebuilt->null_bitmap_len);
for (i = 0; i < prebuilt->n_template; i++) {
templ = prebuilt->mysql_template + i;
data = rec_get_nth_field(rec, templ->rec_field_no, &len);
if (len != UNIV_SQL_NULL) {
if (templ->type == DATA_BLOB) {
/* Copy the BLOB data to the BLOB
heap of prebuilt */
if (prebuilt->blob_heap == NULL) {
prebuilt->blob_heap =
mem_heap_create(len);
}
blob_buf = mem_heap_alloc(prebuilt->blob_heap,
len);
ut_memcpy(blob_buf, data, len);
data = blob_buf;
}
row_sel_field_store_in_mysql_format(
mysql_rec + templ->mysql_col_offset,
templ->mysql_col_len, data, len,
templ->type, templ->is_unsigned);
} else {
mysql_rec[templ->mysql_null_byte_offset] |=
(byte) (templ->mysql_null_bit_mask);
}
}
}
/*************************************************************************
Builds a previous version of a clustered index record for a consistent read */
static
ulint
row_sel_build_prev_vers_for_mysql(
/*==============================*/
/* out: DB_SUCCESS or error code */
read_view_t* read_view, /* in: read view */
dict_index_t* clust_index, /* in: clustered index */
row_prebuilt_t* prebuilt, /* in: prebuilt struct */
rec_t* rec, /* in: record in a clustered index */
rec_t** old_vers, /* out: old version, or NULL if the
record does not exist in the view:
i.e., it was freshly inserted
afterwards */
mtr_t* mtr) /* in: mtr */
{
ulint err;
if (prebuilt->old_vers_heap) {
mem_heap_empty(prebuilt->old_vers_heap);
} else {
prebuilt->old_vers_heap = mem_heap_create(200);
}
err = row_vers_build_for_consistent_read(rec, mtr, clust_index,
read_view, prebuilt->old_vers_heap,
old_vers);
return(err);
}
/*************************************************************************
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking. Used in the MySQL
interface. */
static
ulint
row_sel_get_clust_rec_for_mysql(
/*============================*/
/* out: DB_SUCCESS or error code */
row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */
dict_index_t* sec_index,/* in: secondary index where rec resides */
rec_t* rec, /* in: record in a non-clustered index */
que_thr_t* thr, /* in: query thread */
rec_t** out_rec,/* out: clustered record or an old version of
it, NULL if the old version did not exist
in the read view, i.e., it was a fresh
inserted version */
mtr_t* mtr) /* in: mtr used to get access to the
non-clustered record; the same mtr is used to
access the clustered index */
{
dict_index_t* clust_index;
rec_t* clust_rec;
rec_t* old_vers;
ulint err;
trx_t* trx;
*out_rec = NULL;
row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec);
clust_index = dict_table_get_first_index(sec_index->table);
btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
PAGE_CUR_LE, BTR_SEARCH_LEAF,
prebuilt->clust_pcur, 0, mtr);
clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
ut_ad(page_rec_is_user_rec(clust_rec));
if (prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a lock on the index record */
err = lock_clust_rec_read_check_and_lock(0, clust_rec,
clust_index,
prebuilt->select_lock_type, thr);
if (err != DB_SUCCESS) {
return(err);
}
} else {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
trx = thr_get_trx(thr);
if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index,
trx->read_view)) {
err = row_sel_build_prev_vers_for_mysql(
trx->read_view, clust_index,
prebuilt, clust_rec,
&old_vers, mtr);
if (err != DB_SUCCESS) {
return(err);
}
clust_rec = old_vers;
}
}
*out_rec = clust_rec;
if (prebuilt->select_lock_type == LOCK_X) {
/* We may use the cursor in update: store its position */
btr_pcur_store_position(prebuilt->clust_pcur, mtr);
}
return(DB_SUCCESS);
}
/************************************************************************
Restores cursor position after it has been stored. We have to take into
account that the record cursor was positioned on can have been deleted.
Then we may have to move the cursor one step up or down. */
static
ibool
sel_restore_position_for_mysql(
/*===========================*/
/* out: TRUE if we may need to
process the record the cursor is
now positioned on (i.e. we should
not go to the next record yet) */
ulint latch_mode, /* in: latch mode wished in
restoration */
btr_pcur_t* pcur, /* in: cursor whose position
has been stored */
ibool moves_up, /* in: TRUE if the cursor moves up
in the index */
mtr_t* mtr) /* in: mtr; CAUTION: may commit
mtr temporarily! */
{
ibool success;
ulint relative_position;
relative_position = pcur->rel_pos;
success = btr_pcur_restore_position(latch_mode, pcur, mtr);
if (relative_position == BTR_PCUR_ON) {
if (success) {
return(FALSE);
}
if (moves_up) {
btr_pcur_move_to_next(pcur, mtr);
return(TRUE);
}
return(TRUE);
}
if (relative_position == BTR_PCUR_AFTER) {
if (moves_up) {
return(TRUE);
}
if (btr_pcur_is_on_user_rec(pcur, mtr)) {
btr_pcur_move_to_prev(pcur, mtr);
}
return(TRUE);
}
ut_ad(relative_position == BTR_PCUR_BEFORE);
if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) {
btr_pcur_move_to_next(pcur, mtr);
}
return(TRUE);
}
/************************************************************************
Pops a cached row for MySQL from the fetch cache. */
UNIV_INLINE
void
row_sel_pop_cached_row_for_mysql(
/*=============================*/
byte* buf, /* in/out: buffer where to copy the
row */
row_prebuilt_t* prebuilt) /* in: prebuilt struct */
{
ut_ad(prebuilt->n_fetch_cached > 0);
ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first],
prebuilt->mysql_row_len);
prebuilt->n_fetch_cached--;
prebuilt->fetch_cache_first++;
if (prebuilt->n_fetch_cached == 0) {
prebuilt->fetch_cache_first = 0;
}
}
/************************************************************************
Pushes a row for MySQL to the fetch cache. */
UNIV_INLINE
void
row_sel_push_cache_row_for_mysql(
/*=============================*/
row_prebuilt_t* prebuilt, /* in: prebuilt struct */
rec_t* rec) /* in: record to push */
{
ulint i;
ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
if (prebuilt->fetch_cache[0] == NULL) {
/* Allocate memory for the fetch cache */
for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
prebuilt->fetch_cache[i] = mem_alloc(
prebuilt->mysql_row_len);
}
}
ut_ad(prebuilt->fetch_cache_first == 0);
row_sel_store_mysql_rec(
prebuilt->fetch_cache[prebuilt->n_fetch_cached],
prebuilt, rec);
prebuilt->n_fetch_cached++;
}
/************************************************************************
Searches for rows in the database. This is used in the interface to
MySQL. This function opens a cursor, and also implements fetch next
and fetch prev. NOTE that if we do a search with a full key value
from a unique index (ROW_SEL_EXACT), then we will not store the cursor
position and fetch next or fetch prev must not be tried to the cursor! */
ulint
row_search_for_mysql(
/*=================*/
/* out: DB_SUCCESS,
DB_RECORD_NOT_FOUND,
DB_END_OF_INDEX, or DB_DEADLOCK */
byte* buf, /* in/out: buffer for the fetched
row in the MySQL format */
ulint mode, /* in: search mode PAGE_CUR_L, ... */
row_prebuilt_t* prebuilt, /* in: prebuilt struct for the
table handle; this contains the info
of search_tuple, index; if search
tuple contains 0 fields then we
position the cursor at the start or
the end of the index, depending on
'mode' */
ulint match_mode, /* in: 0 or ROW_SEL_EXACT or
ROW_SEL_EXACT_PREFIX */
ulint direction) /* in: 0 or ROW_SEL_NEXT or
ROW_SEL_PREV; NOTE: if this is != 0,
then prebuilt must have a pcur
with stored position! In opening of a
cursor 'direction' should be 0. */
{
dict_index_t* index = prebuilt->index;
dtuple_t* search_tuple = prebuilt->search_tuple;
btr_pcur_t* pcur = prebuilt->pcur;
trx_t* trx = prebuilt->trx;
dict_index_t* clust_index;
que_thr_t* thr;
rec_t* rec;
rec_t* index_rec;
rec_t* clust_rec;
rec_t* old_vers;
ulint err;
ibool moved;
ibool cons_read_requires_clust_rec;
ibool was_lock_wait;
ulint ret;
ibool unique_search_from_clust_index = FALSE;
ibool mtr_has_extra_clust_latch = FALSE;
ibool moves_up = FALSE;
mtr_t mtr;
ut_ad(index && pcur && search_tuple);
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_ad(sync_thread_levels_empty_gen(FALSE));
if (direction == 0) {
prebuilt->n_rows_fetched = 0;
prebuilt->n_fetch_cached = 0;
prebuilt->fetch_cache_first = 0;
if (prebuilt->sel_graph == NULL) {
/* Build a dummy select query graph */
row_prebuild_sel_graph(prebuilt);
}
} else {
if (prebuilt->n_rows_fetched == 0) {
prebuilt->fetch_direction = direction;
}
if (direction != prebuilt->fetch_direction) {
if (prebuilt->n_fetch_cached > 0) {
ut_a(0);
/* TODO: scrollable cursor: restore cursor to
the place of the latest returned row,
or better: prevent caching for a scroll
cursor! */
}
prebuilt->n_rows_fetched = 0;
prebuilt->n_fetch_cached = 0;
prebuilt->fetch_cache_first = 0;
} else if (prebuilt->n_fetch_cached > 0) {
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
prebuilt->n_rows_fetched++;
return(DB_SUCCESS);
}
if (prebuilt->fetch_cache_first > 0
&& prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
/* The previous returned row was popped from the fetch
cache, but the cache was not full at the time of the
popping: no more rows can exist in the result set */
return(DB_RECORD_NOT_FOUND);
}
prebuilt->n_rows_fetched++;
if (prebuilt->n_rows_fetched > 1000000000) {
/* Prevent wrap-over */
prebuilt->n_rows_fetched = 500000000;
}
mode = pcur->search_mode;
}
if (match_mode == ROW_SEL_EXACT && index->type & DICT_UNIQUE
&& index->type & DICT_CLUSTERED
&& dtuple_get_n_fields(search_tuple)
== dict_index_get_n_unique(index)) {
if (direction == ROW_SEL_NEXT) {
/* MySQL sometimes seems to do fetch next even
if the search condition is unique; we do not store
pcur position in this case, so we cannot
restore cursor position, and must return
immediately */
return(DB_RECORD_NOT_FOUND);
}
ut_a(direction == 0); /* We cannot do fetch prev, as we have
not stored the cursor position */
mode = PAGE_CUR_GE;
unique_search_from_clust_index = TRUE;
}
/* Note that if the search mode was GE or G, then the cursor
naturally moves upward (in fetch next) in alphabetical order,
otherwise downward */
if (direction == 0) {
if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
moves_up = TRUE;
}
} else if (direction == ROW_SEL_NEXT) {
moves_up = TRUE;
}
mtr_start(&mtr);
thr = que_fork_get_first_thr(prebuilt->sel_graph);
que_thr_move_to_run_state_for_mysql(thr, trx);
clust_index = dict_table_get_first_index(index->table);
if (direction != 0) {
moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
moves_up, &mtr);
if (!moved) {
goto next_rec;
}
} else if (dtuple_get_n_fields(search_tuple) > 0) {
btr_pcur_open_with_no_init(index, search_tuple, mode,
BTR_SEARCH_LEAF,
pcur, 0, &mtr);
} else {
if (mode == PAGE_CUR_G) {
btr_pcur_open_at_index_side(TRUE, index,
BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
} else if (mode == PAGE_CUR_L) {
btr_pcur_open_at_index_side(FALSE, index,
BTR_SEARCH_LEAF, pcur, FALSE, &mtr);
}
}
if (!prebuilt->sql_stat_start) {
/* No need to set an intention lock or assign a read view */
} else if (prebuilt->select_lock_type == LOCK_NONE) {
/* This is a consistent read */
trx_start_if_not_started(trx);
/* Assign a read view for the query */
trx_assign_read_view(trx);
prebuilt->sql_stat_start = FALSE;
} else {
trx_start_if_not_started(trx);
if (prebuilt->select_lock_type == LOCK_S) {
err = lock_table(0, index->table, LOCK_IS, thr);
} else {
err = lock_table(0, index->table, LOCK_IX, thr);
}
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
prebuilt->sql_stat_start = FALSE;
}
/*-------------------------------------------------------------*/
rec_loop:
cons_read_requires_clust_rec = FALSE;
rec = btr_pcur_get_rec(pcur);
if (rec == page_get_infimum_rec(buf_frame_align(rec))) {
/* The infimum record on a page cannot be in the result set,
and neither can a record lock be placed on it: we skip such
a record. */
goto next_rec;
}
if (prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a lock on the index record */
err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type,
thr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
}
if (rec == page_get_supremum_rec(buf_frame_align(rec))) {
/* A page supremum record cannot be in the result set: skip
it now when we have placed a possible lock on it */
goto next_rec;
}
ut_ad(page_rec_is_user_rec(rec));
if (unique_search_from_clust_index && btr_pcur_get_up_match(pcur)
== dtuple_get_n_fields(search_tuple)) {
/* The record matches enough */
ut_ad(mode == PAGE_CUR_GE);
} else if (match_mode == ROW_SEL_EXACT) {
/* Test if the index record matches completely to search_tuple
in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
if (0 != cmp_dtuple_rec(search_tuple, rec)) {
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
goto normal_return;
}
} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) {
btr_pcur_store_position(pcur, &mtr);
ret = DB_RECORD_NOT_FOUND;
goto normal_return;
}
}
/* We are ready to look at a possible new index entry in the result
set: the cursor is now placed on a user record */
/* Get the right version of the row in a consistent read */
if (prebuilt->select_lock_type == LOCK_NONE) {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
cons_read_requires_clust_rec = FALSE;
if (index == clust_index) {
if (!lock_clust_rec_cons_read_sees(rec, index,
trx->read_view)) {
err = row_sel_build_prev_vers_for_mysql(
trx->read_view, clust_index,
prebuilt, rec,
&old_vers, &mtr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
if (old_vers == NULL) {
/* The row did not exist yet in
the read view */
goto next_rec;
}
rec = old_vers;
}
} else if (!lock_sec_rec_cons_read_sees(rec, index,
trx->read_view)) {
/* We are looking into a non-clustered index,
and to get the right version of the record we
have to look also into the clustered index: this
is necessary, because we can only get the undo
information via the clustered index record. */
cons_read_requires_clust_rec = TRUE;
}
}
if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) {
/* The record is delete marked: we can skip it if this is
not a consistent read which might see an earlier version
of a non-clustered index record */
goto next_rec;
}
/* Get the clustered index record if needed and if we did
not do the search using the clustered index */
index_rec = rec;
if (index != clust_index && (cons_read_requires_clust_rec
|| prebuilt->need_to_access_clustered)) {
/* It was a non-clustered index and we must fetch also the
clustered index record */
mtr_has_extra_clust_latch = TRUE;
err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
thr, &clust_rec, &mtr);
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
if (clust_rec == NULL) {
/* The record did not exist in the read view */
ut_ad(prebuilt->select_lock_type == LOCK_NONE);
goto next_rec;
}
if (rec_get_deleted_flag(clust_rec)) {
/* The record is delete marked: we can skip it */
goto next_rec;
}
rec = clust_rec;
}
/* We found a qualifying row */
if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD
&& !prebuilt->templ_contains_blob
&& prebuilt->select_lock_type == LOCK_NONE
&& !prebuilt->clust_index_was_generated) {
/* Inside an update, for example, we do not cache rows,
since we may use the cursor position to do the actual
update, that is why we require ...lock_type == LOCK_NONE */
row_sel_push_cache_row_for_mysql(prebuilt, rec);
if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
goto got_row;
}
goto next_rec;
} else {
row_sel_store_mysql_rec(buf, prebuilt, rec);
if (prebuilt->clust_index_was_generated) {
row_sel_store_row_id_to_prebuilt(prebuilt, index_rec,
index);
}
}
got_row:
/* TODO: should we in every case store the cursor position, even
if this is just a join, for example? */
if (!unique_search_from_clust_index
|| prebuilt->select_lock_type == LOCK_X) {
/* Inside an update always store the cursor position */
btr_pcur_store_position(pcur, &mtr);
}
ret = DB_SUCCESS;
goto normal_return;
/*-------------------------------------------------------------*/
next_rec:
if (mtr_has_extra_clust_latch) {
/* We must commit mtr if we are moving to the next
non-clustered index record, because we could break the
latching order if we would access a different clustered
index page right away without releasing the previous. */
btr_pcur_store_position(pcur, &mtr);
mtr_commit(&mtr);
mtr_has_extra_clust_latch = FALSE;
mtr_start(&mtr);
moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
moves_up, &mtr);
if (moved) {
goto rec_loop;
}
}
if (moves_up) {
moved = btr_pcur_move_to_next(pcur, &mtr);
} else {
moved = btr_pcur_move_to_prev(pcur, &mtr);
}
if (!moved) {
btr_pcur_store_position(pcur, &mtr);
if (match_mode != 0) {
ret = DB_RECORD_NOT_FOUND;
} else {
ret = DB_END_OF_INDEX;
}
goto normal_return;
}
goto rec_loop;
/*-------------------------------------------------------------*/
lock_wait_or_error:
btr_pcur_store_position(pcur, &mtr);
mtr_commit(&mtr);
mtr_has_extra_clust_latch = FALSE;
trx->error_state = err;
/* The following is a patch for MySQL */
que_thr_stop_for_mysql(thr);
was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
if (was_lock_wait) {
mtr_start(&mtr);
sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur,
moves_up, &mtr);
mode = pcur->search_mode;
goto rec_loop;
}
return(err);
normal_return:
que_thr_stop_for_mysql_no_error(thr, trx);
mtr_commit(&mtr);
if (prebuilt->n_fetch_cached > 0) {
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
ret = DB_SUCCESS;
}
return(ret);
}