mariadb/storage/heap/hp_create.c
Konstantin Osipov 33c57e2f9e WL#5419 "LOCK_open scalability: make tdc_refresh_version
an atomic counter"

Split the large LOCK_open section in open_table(). 
Do not call open_table_from_share() under LOCK_open.
Remove thd->version.

This fixes
Bug#50589 "Server hang on a query evaluated using a temporary 
table"
Bug#51557 "LOCK_open and kernel_mutex are not happy together"
Bug#49463 "LOCK_table and innodb are not nice when handler 
instances are created".

This patch has effect on storage engines that rely on
ha_open() PSEA method being called under LOCK_open.
In particular:

1) NDB is broken and left unfixed. NDB relies on LOCK_open
being kept as part of ha_open(), since it uses auto-discovery.
While previously the NDB open code was race-prone, now
it simply fails on asserts.

2) The HEAP engine had a race in ha_heap::open(): a share
for the same table could be added twice to the list of
shares, or a dangling reference to a share could be
stored in a HEAP handler. This patch aims to address this
problem by 'pinning' the newly created share in the
internal HEAP engine share list until at least one
handler instance is created using that share.


include/heap.h:
  Add members to HP_CREATE_INFO.
  Declare heap_release_share().
sql/lock.cc:
  Remove thd->version, use thd->open_tables->s->version instead.
sql/repl_failsafe.cc:
  Remove thd->version.
sql/sql_base.cc:
  - close_thread_table(): move handler cleanup code outside the critical section protected by LOCK_open.
  - remove thd->version
  - split the large critical section in open_table() that
  opens a new table from share and is protected by LOCK_open
  into 2 critical sections, thus reducing the critical path.
  - make check_if_table_exists() acquire LOCK_open internally.
  - use thd->open_tables->s->version instead of thd->refresh_version to make sure that all tables in
  thd->open_tables are in the same refresh series.
sql/sql_base.h:
  Add declaration for check_if_table_exists().
sql/sql_class.cc:
  Remove init_open_tables_state(), it's now equal to
  reset_open_tables_state().
sql/sql_class.h:
  Remove thd->version, THD::init_open_tables_state().
sql/sql_plugin.cc:
  Use table->m_needs_reopen to mark the table as stale
  rather than manipulating thd->version, which no longer exists.
sql/sql_udf.cc:
  Use table->m_needs_reopen to mark the table as stale
  rather than manipulating thd->version, which no longer exists.
sql/table.h:
  Remove an unused variable.
sql/tztime.cc:
  Use table->m_needs_reopen to mark the table as stale
  rather than manipulating thd->version, which no longer exists.
storage/heap/CMakeLists.txt:
  Add heap tests to cmake build files.
storage/heap/ha_heap.cc:
  Fix a race when ha_heap::ha_open() could insert two 
  HP_SHARE objects into the internal share list or store
  a dangling reference to a share in ha_heap instance,
  or wrongly set implicit_emptied.
storage/heap/hp_create.c:
  Optionally pin a newly created share in the list of shares
  by increasing its open_count. This is necessary to 
  make sure that a newly created share doesn't disappear while
  a HP_INFO object is being created to reference it.
storage/heap/hp_open.c:
  When adding a new HP_INFO object to the list of objects
  in the heap share, make sure the open_count is not increased
  twice.
storage/heap/hp_test1.c:
  Adjust the test to new function signatures.
storage/heap/hp_test2.c:
  Adjust the test to new function signatures.
2010-06-11 19:28:18 +04:00

311 lines
9.3 KiB
C

/* Copyright (C) 2000-2006 MySQL AB, 2008-2009 Sun Microsystems, Inc
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
#include "heapdef.h"
static int keys_compare(heap_rb_param *param, uchar *key1, uchar *key2);
static void init_block(HP_BLOCK *block,uint reclength,ulong min_records,
ulong max_records);
/*
  Create a heap table share, or reuse one that is already open.

  For a named table (create_info->internal_table is zero) the function
  takes THR_LOCK_heap and looks the share up with hp_find_named_heap().
  A share that is found but has open_count == 0 is freed and rebuilt;
  a share that is still open is returned as is (apart from the optional
  pin below).  Internal tables skip both the lookup and the mutex and
  always get a new share.

  Parameters:
    name               Table name; a my_strdup() copy is stored in the share.
    create_info        Key definitions, record length and row limits.  The
                       keydef array it points to is modified in place while
                       key lengths and flags are computed.  If pin_share is
                       set (named tables only) open_count is incremented
                       before the mutex is released.
    res                Out: the resulting share; written only on success.
    created_new_share  Out: non-zero when no reusable share was found.  It
                       is written before the allocation is attempted, so it
                       is also set on the failure path.

  Returns 0 on success, 1 if allocating the share or its name failed.
*/
int heap_create(const char *name, HP_CREATE_INFO *create_info,
HP_SHARE **res, my_bool *created_new_share)
{
uint i, j, key_segs, max_length, length;
HP_SHARE *share= 0;
HA_KEYSEG *keyseg;
HP_KEYDEF *keydef= create_info->keydef;
uint reclength= create_info->reclength;
uint keys= create_info->keys;
ulong min_records= create_info->min_records;
ulong max_records= create_info->max_records;
DBUG_ENTER("heap_create");
/*
  Named tables: hold THR_LOCK_heap from the lookup until the share is
  returned (or until the err label), so the lookup and the list insertion
  below happen under one lock acquisition.
*/
if (!create_info->internal_table)
{
mysql_mutex_lock(&THR_LOCK_heap);
share= hp_find_named_heap(name);
/* A share nobody has open is discarded and rebuilt from create_info */
if (share && share->open_count == 0)
{
hp_free(share);
share= 0;
}
}
*created_new_share= (share == NULL);
if (!share)
{
HP_KEYDEF *keyinfo;
DBUG_PRINT("info",("Initializing new table"));
/*
We have to store sometimes uchar* del_link in records,
so the record length should be at least sizeof(uchar*)
*/
set_if_bigger(reclength, sizeof (uchar*));
/*
  First pass over the caller's key definitions: compute each key's length
  and flags, and count the total number of key segments so that the share,
  its keydefs and all segments can be allocated as one chunk.
*/
for (i= key_segs= max_length= 0, keyinfo= keydef; i < keys; i++, keyinfo++)
{
bzero((char*) &keyinfo->block,sizeof(keyinfo->block));
bzero((char*) &keyinfo->rb_tree ,sizeof(keyinfo->rb_tree));
for (j= length= 0; j < keyinfo->keysegs; j++)
{
length+= keyinfo->seg[j].length;
/* A nullable segment takes one extra byte in the key */
if (keyinfo->seg[j].null_bit)
{
length++;
if (!(keyinfo->flag & HA_NULL_ARE_EQUAL))
keyinfo->flag|= HA_NULL_PART_KEY;
if (keyinfo->algorithm == HA_KEY_ALG_BTREE)
keyinfo->rb_tree.size_of_element++;
}
switch (keyinfo->seg[j].type) {
case HA_KEYTYPE_SHORT_INT:
case HA_KEYTYPE_LONG_INT:
case HA_KEYTYPE_FLOAT:
case HA_KEYTYPE_DOUBLE:
case HA_KEYTYPE_USHORT_INT:
case HA_KEYTYPE_ULONG_INT:
case HA_KEYTYPE_LONGLONG:
case HA_KEYTYPE_ULONGLONG:
case HA_KEYTYPE_INT24:
case HA_KEYTYPE_UINT24:
case HA_KEYTYPE_INT8:
keyinfo->seg[j].flag|= HA_SWAP_KEY;
break;
case HA_KEYTYPE_VARBINARY1:
/* Case-insensitiveness is handled in coll->hash_sort */
keyinfo->seg[j].type= HA_KEYTYPE_VARTEXT1;
/* fall_through */
case HA_KEYTYPE_VARTEXT1:
keyinfo->flag|= HA_VAR_LENGTH_KEY;
length+= 2;
/* Save number of bytes used to store length */
keyinfo->seg[j].bit_start= 1;
break;
case HA_KEYTYPE_VARBINARY2:
/* Case-insensitiveness is handled in coll->hash_sort */
/* fall_through */
case HA_KEYTYPE_VARTEXT2:
keyinfo->flag|= HA_VAR_LENGTH_KEY;
length+= 2;
/* Save number of bytes used to store length */
keyinfo->seg[j].bit_start= 2;
/*
Make future comparison simpler by only having to check for
one type
*/
keyinfo->seg[j].type= HA_KEYTYPE_VARTEXT1;
break;
default:
break;
}
}
keyinfo->length= length;
/* Track the largest key, including the per-element tree overhead */
length+= keyinfo->rb_tree.size_of_element +
((keyinfo->algorithm == HA_KEY_ALG_BTREE) ? sizeof(uchar*) : 0);
if (length > max_length)
max_length= length;
key_segs+= keyinfo->keysegs;
if (keyinfo->algorithm == HA_KEY_ALG_BTREE)
{
key_segs++; /* additional HA_KEYTYPE_END segment */
if (keyinfo->flag & HA_VAR_LENGTH_KEY)
keyinfo->get_key_length= hp_rb_var_key_length;
else if (keyinfo->flag & HA_NULL_PART_KEY)
keyinfo->get_key_length= hp_rb_null_key_length;
else
keyinfo->get_key_length= hp_rb_key_length;
}
}
/*
  One zero-filled allocation laid out as:
  HP_SHARE | keys * HP_KEYDEF | key_segs * HA_KEYSEG
*/
if (!(share= (HP_SHARE*) my_malloc((uint) sizeof(HP_SHARE)+
keys*sizeof(HP_KEYDEF)+
key_segs*sizeof(HA_KEYSEG),
MYF(MY_ZEROFILL))))
goto err;
share->keydef= (HP_KEYDEF*) (share + 1);
share->key_stat_version= 1;
keyseg= (HA_KEYSEG*) (share->keydef + keys);
init_block(&share->block, reclength + 1, min_records, max_records);
/* Fix keys */
/*
  Second pass: copy the prepared key definitions into the share and point
  each key's seg at its slice of the segment array inside the share.
*/
memcpy(share->keydef, keydef, (size_t) (sizeof(keydef[0]) * keys));
for (i= 0, keyinfo= share->keydef; i < keys; i++, keyinfo++)
{
keyinfo->seg= keyseg;
memcpy(keyseg, keydef[i].seg,
(size_t) (sizeof(keyseg[0]) * keydef[i].keysegs));
keyseg+= keydef[i].keysegs;
if (keydef[i].algorithm == HA_KEY_ALG_BTREE)
{
/* additional HA_KEYTYPE_END keyseg */
keyseg->type= HA_KEYTYPE_END;
keyseg->length= sizeof(uchar*);
keyseg->flag= 0;
keyseg->null_bit= 0;
keyseg++;
init_tree(&keyinfo->rb_tree, 0, 0, sizeof(uchar*),
(qsort_cmp2)keys_compare, 1, NULL, NULL);
keyinfo->delete_key= hp_rb_delete_key;
keyinfo->write_key= hp_rb_write_key;
}
else
{
/* Any other algorithm is stored as a hash key in HASH_INFO blocks */
init_block(&keyinfo->block, sizeof(HASH_INFO), min_records,
max_records);
keyinfo->delete_key= hp_delete_key;
keyinfo->write_key= hp_write_key;
keyinfo->hash_buckets= 0;
}
if ((keyinfo->flag & HA_AUTO_KEY) && create_info->with_auto_increment)
share->auto_key= i + 1;
}
share->min_records= min_records;
share->max_records= max_records;
share->max_table_size= create_info->max_table_size;
share->data_length= share->index_length= 0;
share->reclength= reclength;
share->blength= 1;
share->keys= keys;
share->max_key_length= max_length;
share->changed= 0;
/*
  NOTE(review): this unconditionally overwrites the auto_key value that
  the key loop above may have assigned -- confirm which one is intended.
*/
share->auto_key= create_info->auto_key;
share->auto_key_type= create_info->auto_key_type;
share->auto_increment= create_info->auto_increment;
/* Must be allocated separately for rename to work */
if (!(share->name= my_strdup(name,MYF(0))))
{
/* The share has not been linked into heap_share_list yet */
my_free((uchar*) share,MYF(0));
goto err;
}
#ifdef THREAD
thr_lock_init(&share->lock);
mysql_mutex_init(hp_key_mutex_HP_SHARE_intern_lock,
&share->intern_lock, MY_MUTEX_INIT_FAST);
#endif
/*
  Only named shares are linked into heap_share_list; open_list.data is
  left zero for internal tables, which are marked delete_on_close instead.
*/
if (!create_info->internal_table)
{
share->open_list.data= (void*) share;
heap_share_list= list_add(heap_share_list,&share->open_list);
}
else
share->delete_on_close= 1;
}
if (!create_info->internal_table)
{
/*
  Pin the share while THR_LOCK_heap is still held, so that its open_count
  is already non-zero when the mutex is released.
*/
if (create_info->pin_share)
++share->open_count;
mysql_mutex_unlock(&THR_LOCK_heap);
}
*res= share;
DBUG_RETURN(0);
err:
/* Release the mutex taken at the top for named tables */
if (!create_info->internal_table)
mysql_mutex_unlock(&THR_LOCK_heap);
DBUG_RETURN(1);
} /* heap_create */
/*
  Comparison callback for the red-black tree of a BTREE key.

  heap_create() passes this function to init_tree().  It forwards the two
  keys to ha_key_cmp() together with the segment list, key length and
  search flag carried in param, and returns ha_key_cmp()'s result.
*/
static int keys_compare(heap_rb_param *param, uchar *key1, uchar *key2)
{
  uint diff_pos[2];                     /* filled by ha_key_cmp(), unused */
  int cmp_result;

  cmp_result= ha_key_cmp(param->keyseg, key1, key2, param->key_length,
                         param->search_flag, diff_pos);
  return cmp_result;
}
/*
  Initialise the sizing fields of a HP_BLOCK.

  Parameters:
    block        Block descriptor to fill in.
    reclength    Length of one record; rounded up to a multiple of the
                 pointer size to give block->recbuffer.
    min_records  Lower bound on the expected number of records.
    max_records  Upper bound on the expected number of records; when both
                 bounds are 0, 1000 is assumed.

  records_in_block starts at a tenth of the expected record count (at
  least 10) and is then capped so that one block of records does not
  exceed my_default_record_cache_size minus the space taken by
  HP_MAX_LEVELS HP_PTRS structures.
*/
static void init_block(HP_BLOCK *block, uint reclength, ulong min_records,
                       ulong max_records)
{
  uint i,recbuffer,records_in_block;

  max_records= max(min_records,max_records);
  if (!max_records)
    max_records= 1000;                  /* As good a guess as anything */
  /* Round the record length up to the next multiple of the pointer size */
  recbuffer= (uint) (reclength + sizeof(uchar**) - 1) & ~(sizeof(uchar**) - 1);
  records_in_block= max_records / 10;
  if (records_in_block < 10 && max_records)
    records_in_block= 10;
  /*
    The product is computed in 64 bits.  As a uint * uint multiplication it
    can wrap around (e.g. 2^28 records of 16 bytes wrap to 0), which would
    let a block far larger than the cache-size cap pass this test.
  */
  if (!records_in_block ||
      (unsigned long long) records_in_block * recbuffer >
      (my_default_record_cache_size-sizeof(HP_PTRS)*HP_MAX_LEVELS))
    records_in_block= (my_default_record_cache_size - sizeof(HP_PTRS) *
                       HP_MAX_LEVELS) / recbuffer + 1;
  block->records_in_block= records_in_block;
  block->recbuffer= recbuffer;
  block->last_allocated= 0L;

  /*
    Level 0 holds one record, level 1 one block of records, and every
    higher level HP_PTRS_IN_NOD times what the level below it holds.
  */
  for (i= 0; i <= HP_MAX_LEVELS; i++)
    block->level_info[i].records_under_level=
      (!i ? 1 : i == 1 ? records_in_block :
       HP_PTRS_IN_NOD * block->level_info[i - 1].records_under_level);
}
/*
  Free a share now if nothing has it open, otherwise flag it so that it is
  deleted on close.

  Both callers in this file invoke it with THR_LOCK_heap held.
*/
static inline void heap_try_free(HP_SHARE *share)
{
  if (share->open_count != 0)
  {
    /* Still in use: only mark it */
    share->delete_on_close= 1;
    return;
  }
  hp_free(share);
}
int heap_delete_table(const char *name)
{
int result;
reg1 HP_SHARE *share;
DBUG_ENTER("heap_delete_table");
mysql_mutex_lock(&THR_LOCK_heap);
if ((share= hp_find_named_heap(name)))
{
heap_try_free(share);
result= 0;
}
else
{
result= my_errno=ENOENT;
}
mysql_mutex_unlock(&THR_LOCK_heap);
DBUG_RETURN(result);
}
/*
  Drop the table behind an open handler.

  Under THR_LOCK_heap, the handler's share (info->s) is passed to
  heap_try_free(): it is freed at once if its open_count is 0, otherwise
  it is marked delete_on_close.
*/
void heap_drop_table(HP_INFO *info)
{
  HP_SHARE *share;
  DBUG_ENTER("heap_drop_table");

  mysql_mutex_lock(&THR_LOCK_heap);
  share= info->s;
  heap_try_free(share);
  mysql_mutex_unlock(&THR_LOCK_heap);

  DBUG_VOID_RETURN;
}
/*
  Release a share and everything it owns.

  The share is unlinked from heap_share_list when open_list.data is set
  (heap_create() sets it only for non-internal tables), its blocks are
  removed with hp_clear(), the lock objects are destroyed when THREAD is
  defined, and finally the separately allocated name and the share itself
  are freed.  The share pointer is invalid after this call.
*/
void hp_free(HP_SHARE *share)
{
  /* Internal tables were never linked into the global share list */
  if (share->open_list.data != 0)
    heap_share_list= list_delete(heap_share_list, &share->open_list);

  /* Remove blocks from memory */
  hp_clear(share);

#ifdef THREAD
  thr_lock_delete(&share->lock);
  mysql_mutex_destroy(&share->intern_lock);
#endif

  my_free((uchar*) share->name, MYF(0));
  my_free((uchar*) share, MYF(0));
}