2007-01-18 20:38:14 +01:00
|
|
|
/* Copyright (C) 2007 Michael Widenius
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
2007-03-02 11:20:23 +01:00
|
|
|
the Free Software Foundation; version 2 of the License.
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
|
|
|
|
|
|
|
|
/*
|
|
|
|
Storage of records in block
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define LSN_SIZE 7
|
2007-04-19 12:18:56 +02:00
|
|
|
#define DIR_COUNT_SIZE 1 /* Stores number of rows on page */
|
2007-10-09 20:09:50 +02:00
|
|
|
#define DIR_FREE_SIZE 1 /* Pointer to first free dir entry */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define EMPTY_SPACE_SIZE 2 /* Stores empty space on page */
|
|
|
|
#define PAGE_TYPE_SIZE 1
|
2007-10-09 20:09:50 +02:00
|
|
|
#define PAGE_SUFFIX_SIZE 4 /* Bytes for checksum */
|
|
|
|
#define PAGE_HEADER_SIZE (LSN_SIZE + DIR_COUNT_SIZE + DIR_FREE_SIZE +\
|
|
|
|
EMPTY_SPACE_SIZE + PAGE_TYPE_SIZE)
|
2007-01-18 20:38:14 +01:00
|
|
|
#define PAGE_OVERHEAD_SIZE (PAGE_HEADER_SIZE + DIR_ENTRY_SIZE + \
|
|
|
|
PAGE_SUFFIX_SIZE)
|
|
|
|
#define BLOCK_RECORD_POINTER_SIZE 6
|
|
|
|
|
2007-11-28 20:38:30 +01:00
|
|
|
#define FULL_PAGE_SIZE(block_size) ((block_size) - LSN_SIZE - \
|
|
|
|
PAGE_TYPE_SIZE - PAGE_SUFFIX_SIZE)
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
#define ROW_EXTENT_PAGE_SIZE 5
|
|
|
|
#define ROW_EXTENT_COUNT_SIZE 2
|
2007-10-19 23:24:22 +02:00
|
|
|
#define SUB_RANGE_SIZE 2
|
|
|
|
#define BLOCK_FILLER_SIZE 2
|
2007-01-18 20:38:14 +01:00
|
|
|
#define ROW_EXTENT_SIZE (ROW_EXTENT_PAGE_SIZE + ROW_EXTENT_COUNT_SIZE)
|
|
|
|
#define TAIL_BIT 0x8000 /* Bit in page_count to signify tail */
|
2007-04-19 12:18:56 +02:00
|
|
|
/* Number of extents reserved MARIA_BITMAP_BLOCKS to store head part */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define ELEMENTS_RESERVED_FOR_MAIN_PART 4
|
2007-11-28 20:38:30 +01:00
|
|
|
/* This is just used to prealloc a dynamic array */
|
|
|
|
/* 1 MB; parenthesized so the macro expands safely inside larger expressions */
#define AVERAGE_BLOB_SIZE (1024L*1024L)
|
|
|
|
/* Number of pages to store continuous blob parts */
|
|
|
|
#define BLOB_SEGMENT_MIN_SIZE 128
|
|
|
|
|
2007-04-19 12:18:56 +02:00
|
|
|
/* Fields before 'row->null_field_lengths' used by find_where_to_split_row */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define EXTRA_LENGTH_FIELDS 3
|
|
|
|
|
2007-04-19 12:18:56 +02:00
|
|
|
/* Size for the different parts in the row header (and head page) */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define FLAG_SIZE 1
|
|
|
|
#define VERPTR_SIZE 7
|
|
|
|
#define DIR_ENTRY_SIZE 4
|
2007-04-19 12:18:56 +02:00
|
|
|
#define FIELD_OFFSET_SIZE 2 /* size of pointers to field starts */
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
/* Minimum header size needed for a new row */
|
|
|
|
#define BASE_ROW_HEADER_SIZE FLAG_SIZE
|
|
|
|
#define TRANS_ROW_EXTRA_HEADER_SIZE TRANSID_SIZE
|
|
|
|
|
2007-10-09 20:09:50 +02:00
|
|
|
#define PAGE_TYPE_MASK 7
|
2007-01-18 20:38:14 +01:00
|
|
|
/*
  Page types. Values are consecutive starting at 0; MAX_PAGE_TYPE is not a
  page type itself but equals the number of types listed before it.
*/
enum en_page_type
{
  UNALLOCATED_PAGE= 0,
  HEAD_PAGE= 1,
  TAIL_PAGE= 2,
  BLOB_PAGE= 3,
  MAX_PAGE_TYPE= 4
};
|
|
|
|
|
|
|
|
#define PAGE_TYPE_OFFSET LSN_SIZE
|
2007-10-09 20:09:50 +02:00
|
|
|
#define DIR_COUNT_OFFSET (LSN_SIZE+PAGE_TYPE_SIZE)
|
|
|
|
#define DIR_FREE_OFFSET (DIR_COUNT_OFFSET+DIR_COUNT_SIZE)
|
|
|
|
#define EMPTY_SPACE_OFFSET (DIR_FREE_OFFSET+DIR_FREE_SIZE)
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
#define PAGE_CAN_BE_COMPACTED 128 /* Bit in PAGE_TYPE */
|
|
|
|
|
2007-07-02 19:45:15 +02:00
|
|
|
/* Bits used for flag uchar (one byte, first in record) */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define ROW_FLAG_TRANSID 1
|
|
|
|
#define ROW_FLAG_VER_PTR 2
|
|
|
|
#define ROW_FLAG_DELETE_TRANSID 4
|
|
|
|
#define ROW_FLAG_NULLS_EXTENDED 8
|
|
|
|
#define ROW_FLAG_EXTENTS 128
|
|
|
|
#define ROW_FLAG_ALL (1+2+4+8+128)
|
|
|
|
|
2007-04-19 12:18:56 +02:00
|
|
|
/******** Variables that affect how data pages are utilized ********/
|
|
|
|
|
|
|
|
/* Minimum size of tail segment */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define MIN_TAIL_SIZE 32
|
|
|
|
|
2007-04-19 12:18:56 +02:00
|
|
|
/*
|
|
|
|
Fixed length part of Max possible header size; See row data structure
|
|
|
|
table in ma_blockrec.c.
|
|
|
|
*/
|
2007-01-18 20:38:14 +01:00
|
|
|
#define MAX_FIXED_HEADER_SIZE (FLAG_SIZE + 3 + ROW_EXTENT_SIZE + 3)
|
|
|
|
#define TRANS_MAX_FIXED_HEADER_SIZE (MAX_FIXED_HEADER_SIZE + \
|
2007-04-05 13:38:05 +02:00
|
|
|
TRANSID_SIZE + VERPTR_SIZE + \
|
2007-01-18 20:38:14 +01:00
|
|
|
TRANSID_SIZE)
|
|
|
|
|
2007-07-02 19:45:15 +02:00
|
|
|
/* We use 1 uchar in record header to store number of directory entries */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define MAX_ROWS_PER_PAGE 255
|
2007-10-09 20:09:50 +02:00
|
|
|
#define END_OF_DIR_FREE_LIST ((uchar) 255)
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
/* Bits for MARIA_BITMAP_BLOCKS->used */
|
2007-04-19 12:18:56 +02:00
|
|
|
/* We stored data on disk in the block */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define BLOCKUSED_USED 1
|
2007-04-19 12:18:56 +02:00
|
|
|
/* Bitmap on disk is block->org_bitmap_value ; Happens only on update */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define BLOCKUSED_USE_ORG_BITMAP 2
|
2007-04-19 12:18:56 +02:00
|
|
|
/* We stored tail data on disk for the block */
|
2007-01-18 20:38:14 +01:00
|
|
|
#define BLOCKUSED_TAIL 4
|
|
|
|
|
2007-04-19 12:18:56 +02:00
|
|
|
/******* Defines that affect allocation (density) of data *******/
|
2007-01-18 20:38:14 +01:00
|
|
|
|
2007-04-19 12:18:56 +02:00
|
|
|
/*
|
- WL#3239 "log CREATE TABLE in Maria"
- WL#3240 "log DROP TABLE in Maria"
- similarly, log RENAME TABLE, REPAIR/OPTIMIZE TABLE, and
DELETE no_WHERE_clause (== the DELETE which just truncates the files)
- create_rename_lsn added to MARIA_SHARE's state
- all these operations (except DROP TABLE) also update the table's
create_rename_lsn, which is needed for the correctness of
Recovery (see function comment of _ma_repair_write_log_record()
in ma_check.c)
- write a COMMIT record when transaction commits.
- don't log REDOs/UNDOs if this is an internal temporary table
like inside ALTER TABLE (I expect this to be a big win). There was
already no logging for user-created "CREATE TEMPORARY" tables.
- don't fsync files/directories if the table is not transactional
- in translog_write_record(), autogenerate a 2-byte-id for the table
and log the "id->name" pair (LOGREC_FILE_ID); log
LOGREC_LONG_TRANSACTION_ID; automatically store
the table's 2-byte-id in any log record.
- preparations for Checkpoint: translog_get_horizon(); pausing Checkpoint
when some dirty pages are unknown; capturing trn->rec_lsn,
trn->first_undo_lsn for Checkpoint and log's low-water-mark computing.
- assertions, comments.
storage/maria/Makefile.am:
more files to build
storage/maria/ha_maria.cc:
- logging a REPAIR log record if REPAIR/OPTIMIZE was successful.
- ha_maria::data_file_type does not have to be set in every info()
call, just do it once in open().
- if caller said that transactionality can be disabled (like if
caller is ALTER TABLE) i.e. thd->transaction.on==FALSE, then we
temporarily disable transactionality of the table in external_lock();
that will ensure that no REDOs/UNDOs are logged for this possibly
massive write operation (they are not needed, as if any write fails,
the table will be dropped). We re-enable in external_lock(F_UNLCK),
which in ALTER TABLE happens before the tmp table replaces the original
one (which is good, as thus the final table will have a REDO RENAME
and a correct create_rename_lsn).
- when we commit we also have to write a log record, so
trnman_commit_trn() calls become ma_commit() calls
- at end of engine's initialization, we are potentially entering a
multi-threaded dangerous world (clients are going to be accepted)
and so some assertions of mutex-owning become enforceable, for that
we set maria_multi_threaded=TRUE (see ma_control_file.c)
storage/maria/ha_maria.h:
new member ha_maria::save_transactional (see also ha_maria.cc)
storage/maria/ma_blockrec.c:
- fixing comments according to discussion with Monty
- if a table is transactional but temporarily non-transactional
(like in ALTER TABLE), we need to give a sensible LSN to the pages
(and, if we give 0, pagecache asserts).
- translog_write_record() now takes care of storing the share's
2-byte-id in the log record
storage/maria/ma_blockrec.h:
fixing comment according to discussion with Monty
storage/maria/ma_check.c:
When REPAIR/OPTIMIZE modify the data/index file, if this is a
transactional table, they must sync it; if they remove files or rename
files, they must sync the directory, so that everything is durable.
This is just applying to REPAIR/OPTIMIZE the logic already implemented
in CREATE/DROP/RENAME a few months ago.
Adding a function to write a LOGREC_REPAIR_TABLE at end of
REPAIR/OPTIMIZE (called only by ha_maria, not by maria_chk), and
to update the table's create_rename_lsn.
storage/maria/ma_close.c:
fix for a future bug
storage/maria/ma_control_file.c:
ensuring that if Maria is running in multi-threaded mode, anybody
wanting to write to the control file and update
last_checkpoint_lsn/last_logno owns the log's lock.
storage/maria/ma_control_file.h:
see ma_control_file.c
storage/maria/ma_create.c:
when creating a table:
- sync it and its directory only if this is a transactional table
and there is a log (no point in syncing in maria_chk)
- decouple the two uses of linkname/linkname_ptr (for index file and
for data file) into more variables, as we need to know all links
until the moment we write the LOGREC_CREATE_TABLE.
- set share.data_file_type early so that _ma_initialize_data_file()
knows it (Monty's bugfix so that a table always has at least a bitmap
page when it is created; so data-file is not 0 bytes anymore).
- log a LOGREC_CREATE_TABLE; it contains the bytes which we have
just written to the index file's header. Update table's
create_rename_lsn.
- syncing of kfile had been bugified in a previous merge, correcting
- syncing of dfile is now needed as it's not empty anymore
- in _ma_initialize_data_file(), use share's block_size and not the
global one. This is a gratuitous change, both variables are equal,
just that I find it more future-proof to use share-bound variable
rather than global one.
storage/maria/ma_delete_all.c:
log a LOGREC_DELETE_ALL record when doing ma_delete_all_rows();
update create_rename_lsn then.
storage/maria/ma_delete_table.c:
- logging LOGREC_DROP_TABLE; knowing if this is needed, requires
knowing if the table is transactional, which requires opening the
table.
- we need to sync directories only if the table is transactional
storage/maria/ma_extra.c:
questions
storage/maria/ma_init.c:
when maria_end() is called, engine is not multithreaded
storage/maria/ma_loghandler.c:
- translog_inited has to be visible to ma_create() (see how it is used
in ma_create())
- checkpoint record will be a single record, not three
- no REDO for TRUNCATE (TRUNCATE calls ma_create() internally so will
log a REDO_CREATE)
- adding REDO for DELETE no_WHERE_clause (fast DELETE of all rows by
truncating the files), REPAIR.
- MY_WAIT_IF_FULL to wait&retry if a log write hits a full disk
- in translog_write_record(), if MARIA_SHARE does not yet have a
2-byte-id, generate one for it and log LOGREC_FILE_ID; automatically
store this short id into log records.
- in translog_write_record(), if transaction has not logged its
long trid, log LOGREC_LONG_TRANSACTION_ID.
- For Checkpoint, we need to know the current end-of-log: adding
translog_get_horizon().
- For Control File, adding an assertion that the thread owns the
log's lock (control file is protected by this lock)
storage/maria/ma_loghandler.h:
Changes in log records (see ma_loghandler.c).
new prototypes, new functions.
storage/maria/ma_loghandler_lsn.h:
adding a type LSN_WITH_FLAGS especially for TRN::first_undo_lsn,
where the most significant byte is used for flags.
storage/maria/ma_open.c:
storing the create_rename_lsn in the index file's header (in the
state, precisely) and retrieving it from there.
storage/maria/ma_pagecache.c:
- my set_if_bigger was wrong, correcting it
- if the first_in_switch list is not empty, it means that
changed_blocks misses some dirty pages, so Checkpoint cannot run and
needs to wait. A variable missing_blocks_in_changed_list is added to
tell that (should it be named missing_blocks_in_changed_blocks?)
- pagecache_collect_changed_blocks_with_lsn() now also tells the
minimum rec_lsn (needed for low-water mark computation).
storage/maria/ma_pagecache.h:
see ma_pagecache.c
storage/maria/ma_panic.c:
comment
storage/maria/ma_range.c:
comment
storage/maria/ma_rename.c:
- logging LOGREC_RENAME_TABLE; knowing if this is needed, requires
knowing if the table is transactional, which requires opening the
table.
- update create_rename_lsn
- we need to sync directories only if the table is transactional
storage/maria/ma_static.c:
comment
storage/maria/ma_test_all.sh:
- tip for Valgrind-ing ma_test_all
- do "export maria_path=somepath" before calling ma_test_all,
if you want to run ma_test_all out of storage/maria (useful
to have parallel runs, like one normal and one Valgrind, they
must not use the same tables so need to run in different directories)
storage/maria/maria_def.h:
- state now contains, in memory and on disk, the create_rename_lsn
- share now contains a 2-byte-id
storage/maria/trnman.c:
preparations for Checkpoint: capture trn->rec_lsn, trn->first_undo_lsn;
minimum first_undo_lsn needed to know log's low-water-mark
storage/maria/trnman.h:
using most significant byte of first_undo_lsn to hold miscellaneous
flags, for now TRANSACTION_LOGGED_LONG_ID.
dummy_transaction_object is already declared in ma_static.c.
storage/maria/trnman_public.h:
dummy_transaction_object was declared in all files including
trnman_public.h, while in fact it's a single object.
new prototype
storage/maria/unittest/ma_test_loghandler-t.c:
update for new prototype
storage/maria/unittest/ma_test_loghandler_multigroup-t.c:
update for new prototype
storage/maria/unittest/ma_test_loghandler_multithread-t.c:
update for new prototype
storage/maria/unittest/ma_test_loghandler_pagecache-t.c:
update for new prototype
storage/maria/ma_commit.c:
function which wraps:
- writing a LOGREC_COMMIT record (==commit on disk)
- calling trnman_commit_trn() (=commit in memory)
storage/maria/ma_commit.h:
new header file
.tree-is-private:
this file is now needed to keep our tree private (don't push it
to public trees). When 5.1 is merged into mysql-maria, we can abandon
our maria-specific post-commit trigger; .tree_is_private will take
care of keeping commit mails private. Don't push this file to public
trees.
2007-06-22 14:49:37 +02:00
|
|
|
If the tail part (from the main block or a blob) would use more than 75 % of
|
2007-04-19 12:18:56 +02:00
|
|
|
the size of page, store the tail on a full page instead of a shared
|
|
|
|
tail page.
|
|
|
|
*/
|
2007-01-18 20:38:14 +01:00
|
|
|
#define MAX_TAIL_SIZE(block_size) ((block_size) *3 / 4)
|
|
|
|
|
This patch is a collection of patches from Sanja, Sergei and Monty.
Added logging and pinning of pages to block format.
Integration of transaction manager, log handler.
Better page cache integration
Split trnman.h into two files, so that we don't have to include my_atomic.h into C++ programs.
Renaming of structures, more comments, more debugging etc.
Fixed problem with small head block + long varchar.
Added extra argument to delete_record() and update_record() (needed for UNDO logging)
Small changes to interface of pagecache and log handler.
Change initialization of log_record_type_descriptors to not be depending on enum order.
Use array of LEX_STRING's to send data to log handler
Added 'dummy' transaction option to MARIA_INFO so that we can always assume 'trn' exists.
include/lf.h:
Interface fixes
Rename of structures
(Patch from Sergei via Sanja)
include/my_atomic.h:
More comments
include/my_global.h:
Added MY_ERRPTR
include/pagecache.h:
Added undo LSN when unlocking pages
mysql-test/r/maria.result:
Updated results
mysql-test/t/maria.test:
Added autocommit around lock tables
(Patch from Sanja)
mysys/lf_alloc-pin.c:
Post-review fixes, simple optimizations
More comments
Struct slot renames
Check amount of memory on stack
(Patch from Sergei)
mysys/lf_dynarray.c:
More comments
mysys/lf_hash.c:
More comments
After review fixes
(Patch from Sergei)
storage/maria/ha_maria.cc:
Split trnman.h into two files, so that we don't have to include my_atomic.h into the .cc program.
(Temporary fix to avoid bug in gcc)
Move out all deferencing of the transaction structure.
Transaction manager integrated (Patch from Sergei)
storage/maria/ha_maria.h:
Added prototype for start_stmt()
storage/maria/lockman.c:
Function call rename
storage/maria/ma_bitmap.c:
Mark deleted pages free from page cache
storage/maria/ma_blockrec.c:
Offset -> rownr
More debugging
Fixed problem with small head block + long varchar
Added logging of changed pages
Added logging of undo (Including only logging of changed fields in case of update)
Added pinning/unpinning of all changed pages
More comments
Added free_full_pages() as the same code was used in several places.
fill_rows_parts() renamed as fill_insert_undo_parts()
offset -> rownr
Added some optimization of not transactional tables
_ma_update_block_record() has new parameter, as we need original row to do efficient undo for update
storage/maria/ma_blockrec.h:
Added ROW_EXTENTS_ON_STACK
Changed prototype for update and delete of row
storage/maria/ma_check.c:
Added original row to delete_record() call
storage/maria/ma_control_file.h:
Added ifdefs for C++
storage/maria/ma_delete.c:
Added original row to delete_record() call
(Needed for efficent undo logging)
storage/maria/ma_dynrec.c:
Added extra argument to delete_record() and update_record()
Removed not used variable
storage/maria/ma_init.c:
Initialize log handler
storage/maria/ma_loghandler.c:
Removed not used variable
Change initialization of log_record_type_descriptors to not be depending on enum order
Use array of LEX_STRING's to send data to log handler
storage/maria/ma_loghandler.h:
New defines
Use array of LEX_STRING's to send data to log handler
storage/maria/ma_open.c:
Added 'dummy' transaction option to MARIA_INFO so that we can always assume 'trn' exists.
Store in MARIA_SHARE->page_type if pages will have up to date LSN's
storage/maria/ma_pagecache.c:
Don't decrease number of readers when using pagecache_write()/pagecache_read()
In pagecache_write() decrement request count if page was left pinned
Added pagecache_delete_pages()
Removed some casts
Make trace output consistent with rest of code
Simplify calling of DBUG_ASSERT(0)
Only update LSN if the LSN is bigger than what's already on the page
Added LSN parameter pagecache_unpin_page(), pagecache_unpin(), and pagecache_unlock()
(Part of patch from Sanja)
storage/maria/ma_static.c:
Added 'dummy' transaction option to MARIA_INFO so that we can always assume 'trn' exists.
Added default page cache
storage/maria/ma_statrec.c:
Added extra argument to delete_record() and update_record()
storage/maria/ma_test1.c:
Added option -T for transactions
storage/maria/ma_test2.c:
Added option -T for transactions
storage/maria/ma_test_all.sh:
Test with transactions
storage/maria/ma_update.c:
Changed prototype for update of row
storage/maria/maria_def.h:
Changed prototype for update & delete of row as block records need to access the old row
Store in MARIA_SHARE->page_type if pages will have up to date LSN's
Added MARIA_MAX_TREE_LEVELS to allow us to calculate the number of possible pinned pages we may need.
Removed not used 'empty_bits_buffer'
Added pointer to transaction object
Added array for pinned pages
Added log_row_parts array for logging of field data.
Added MARIA_PINNED_PAGE to store pinned pages
storage/maria/trnman.c:
Added accessor functions to transaction object
Added missing DBUG_RETURN()
More debugging
More comments
Changed // comment of code to #ifdef NOT_USED
Transaction manager integrated.
Post review fixes
Part of patch originally from Sergei
storage/maria/trnman.h:
Split trnman.h into two files, so that we don't have to include my_atomic.h into the .cc program.
(Temporary fix to avoid bug in gcc)
storage/maria/unittest/ma_pagecache_single.c:
Added missing argument
Added SKIP_BIG_TESTS
(Patch from Sanja)
storage/maria/unittest/ma_test_loghandler-t.c:
Test logging with new LEX_STRING parameter
(Patch from Sanja)
storage/maria/unittest/ma_test_loghandler_multigroup-t.c:
Test logging with new LEX_STRING parameter
(Patch from Sanja)
storage/maria/unittest/ma_test_loghandler_multithread-t.c:
Test logging with new LEX_STRING parameter
(Patch from Sanja)
storage/maria/unittest/ma_test_loghandler_pagecache-t.c:
Test logging with new LEX_STRING parameter
(Patch from Sanja)
storage/maria/unittest/trnman-t.c:
Stack overflow detection
(Patch from Sergei)
unittest/unit.pl:
Command-line options --big and --verbose
(Patch from Sergei)
unittest/mytap/tap.c:
Detect --big
(Patch from Sergei)
unittest/mytap/tap.h:
Skip_big_tests and SKIP_BIG_TESTS
(Patch from Sergei)
storage/maria/trnman_public.h:
New BitKeeper file ``storage/maria/trnman_public.h''
2007-05-29 19:13:56 +02:00
|
|
|
/* Don't allocate memory for too many row extents on the stack */
|
|
|
|
#define ROW_EXTENTS_ON_STACK 32
|
|
|
|
|
2007-01-18 20:38:14 +01:00
|
|
|
/* Functions to convert MARIA_RECORD_POS to/from page:offset */
|
|
|
|
|
2007-04-19 12:18:56 +02:00
|
|
|
/*
  Build a MARIA_RECORD_POS from a page number and a directory entry number.

  The page number is stored in the bits above the lowest 8; the directory
  entry occupies the lowest 8 bits, so it must not exceed 255 (asserted).
*/
static inline MARIA_RECORD_POS ma_recordpos(ulonglong page, uint dir_entry)
{
  ulonglong page_bits;
  DBUG_ASSERT(dir_entry <= 255);
  page_bits= page << 8;
  return (MARIA_RECORD_POS) (page_bits | dir_entry);
}
|
|
|
|
|
|
|
|
/*
  Extract the page part of a record position by discarding the lowest
  8 bits (which hold the directory entry number).
*/
static inline my_off_t ma_recordpos_to_page(MARIA_RECORD_POS record_pos)
{
  my_off_t page= record_pos >> 8;
  return page;
}
|
|
|
|
|
2007-09-11 00:58:15 +02:00
|
|
|
/*
  Extract the directory entry number, i.e. the lowest 8 bits, from a
  record position.
*/
static inline uint ma_recordpos_to_dir_entry(MARIA_RECORD_POS record_pos)
{
  return (uint) (record_pos & 0xFF);
}
|
|
|
|
|
2007-10-09 20:09:50 +02:00
|
|
|
/*
  Return a pointer to directory entry number 'pos' inside the page 'buff'.

  Entries are DIR_ENTRY_SIZE bytes each and are addressed backwards from
  the end of the page: entry 0 sits immediately before the PAGE_SUFFIX_SIZE
  bytes that end the block, entry 1 immediately before entry 0, and so on.
*/
static inline uchar *dir_entry_pos(uchar *buff, uint block_size, uint pos)
{
  uchar *page_end= buff + block_size;
  uchar *entries_end= page_end - DIR_ENTRY_SIZE * pos;
  return entries_end - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
}
|
|
|
|
|
2007-01-18 20:38:14 +01:00
|
|
|
/* ma_blockrec.c */
|
|
|
|
void _ma_init_block_record_data(void);
|
2007-04-19 12:18:56 +02:00
|
|
|
my_bool _ma_once_init_block_record(MARIA_SHARE *share, File dfile);
|
|
|
|
my_bool _ma_once_end_block_record(MARIA_SHARE *share);
|
|
|
|
my_bool _ma_init_block_record(MARIA_HA *info);
|
|
|
|
void _ma_end_block_record(MARIA_HA *info);
|
2007-10-19 23:24:22 +02:00
|
|
|
my_bool _ma_check_if_zero(uchar *pos, uint length);
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS pos,
|
2007-07-02 19:45:15 +02:00
|
|
|
const uchar *oldrec, const uchar *newrec);
|
|
|
|
my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record);
|
|
|
|
int _ma_read_block_record(MARIA_HA *info, uchar *record,
|
2007-01-18 20:38:14 +01:00
|
|
|
MARIA_RECORD_POS record_pos);
|
2007-07-02 19:45:15 +02:00
|
|
|
int _ma_read_block_record2(MARIA_HA *info, uchar *record,
|
|
|
|
uchar *data, uchar *end_of_data);
|
|
|
|
int _ma_scan_block_record(MARIA_HA *info, uchar *record,
|
2007-01-18 20:38:14 +01:00
|
|
|
MARIA_RECORD_POS, my_bool);
|
|
|
|
my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
|
2007-07-02 19:45:15 +02:00
|
|
|
const uchar *record, MARIA_RECORD_POS pos);
|
2007-01-18 20:38:14 +01:00
|
|
|
my_bool _ma_scan_init_block_record(MARIA_HA *info);
|
|
|
|
void _ma_scan_end_block_record(MARIA_HA *info);
|
Fixed bug in undo_key_delete; Caused crashed key files in recovery
Maria is now used for internal temporary tables in MySQL
Better usage of VARCHAR and long strings in temporary tables
Use packed fields if BLOCK_RECORD is used
null_bytes are not anymore stored in a separate field
New interface to remember and restore scan position
Fixed bugs in unique handling
Don't sync Maria temporary tables
Lock control file while it's used to stop several processes from using it
Changed value of MA_DONT_OVERWRITE_FILE as it collided with MY_SYNC_DIR
Split MY_DONT_WAIT into MY_NO_WAIT and MY_SHORT_WAIT (for my_lock())
Added MY_FORCE_LOCK
include/my_sys.h:
Changed value of MA_DONT_OVERWRITE_FILE as it collided with MY_SYNC_DIR
Split MY_DONT_WAIT into MY_NO_WAIT and MY_SHORT_WAIT (for my_lock())
Added MY_FORCE_LOCK
include/myisam.h:
Make MyISAM columndef compile time compatible with Maria
mysql-test/lib/mtr_process.pl:
Removed confusing warning (It's common that there is a lot of other files than pid files)
mysql-test/mysql-test-run.pl:
Added --sync-frm to speed up tests
mysql-test/r/maria-recovery.result:
Updated results from wrong push
mysql-test/suite/rpl/t/rpl_innodb_bug28430.test:
Marked test as --big
mysys/my_lock.c:
If MY_FORCE_LOCK is given, use locking even if my_disable_locking is given
If MY_NO_WAIT is given, return at once if lock is occupied
If MY_SHORT_WAIT is given, wait some time for lock before returning (This was called MY_DONT_WAIT before)
mysys/my_thr_init.c:
Fix that we don't give name to thread before it's properly initied
sql/handler.cc:
Added myisam.h
sql/handler.h:
Changes to use Maria for internal temporary tables
Removed not needed argument to restart_rnd_next()
Added function remember_rnd_pos()
sql/my_lock.c:
If MY_FORCE_LOCK is given, use locking even if my_disable_locking is given
If MY_NO_WAIT is given, return at once if lock is occupied
If MY_SHORT_WAIT is given, wait some time for lock before returning (This was called MY_DONT_WAIT before)
sql/mysql_priv.h:
Added maria_hton
sql/sql_class.h:
Changes to use Maria for internal temporary tables
sql/sql_select.cc:
Changes to use Maria for internal temporary tables
Temporary tables didn't properly switch to dynamic row format if long strings was used
Better usage of VARCHAR in temporary tables
Use new interface to restart scan in duplicate removal
sql/sql_select.h:
Changes to use Maria for internal temporary tables
sql/sql_show.cc:
Changes to use Maria for internal temporary tables
Removed all end space
sql/sql_table.cc:
Set HA_OPTION_PACK_RECORD if we are not using default or static record
sql/sql_union.cc:
If MY_FORCE_LOCK is given, use locking even if my_disable_locking is given
If MY_NO_WAIT is given, return at once if lock is occupied
If MY_SHORT_WAIT is given, wait some time for lock before returning (This was called MY_DONT_WAIT before)
sql/sql_update.cc:
If MY_FORCE_LOCK is given, use locking even if my_disable_locking is given
If MY_NO_WAIT is given, return at once if lock is occupied
If MY_SHORT_WAIT is given, wait some time for lock before returning (This was called MY_DONT_WAIT before)
storage/maria/ha_maria.cc:
Use packed fields
null_bytes are not anymore stored in a separate field
Changes to use Maria for internal temporary tables
Give warning if we try to do an ALTER TABLE to a unusable row format
storage/maria/ha_maria.h:
Allow Maria with block format to restart scanning at given position
storage/maria/ma_blockrec.c:
Added functions to remember and restore scan position
Allocate cur_row.extents so that we don't have to do a malloc on first read
Fixed bug when using packed row without packed strings
Removed unneeded calls to free_full_pages()
Fixed unlikely bug when using old bitmap to read head page and head page had gone away
Remember row position when doing undo of delete and update row (needed for undo of key delete)
storage/maria/ma_blockrec.h:
Added functions to remember and restore scan position
storage/maria/ma_close.c:
Don't sync temporary tables
storage/maria/ma_control_file.c:
Lock control file while it's used to stop several processes from using it
storage/maria/ma_create.c:
Fixed bug when using FIELD_NORMAL that was longer than FULL_PAGE_SIZE
Fixed bug that caused fields to not be ordered according to offset
Fixed bug in unique creation
storage/maria/ma_delete.c:
Don't write record reference when deleting key.
(Rowid is likely to be different when we undo this)
storage/maria/ma_dynrec.c:
Fixed core dump when comparing records (happened in unique handling)
storage/maria/ma_extra.c:
MY_DONT_WAIT -> MY_SHORT_WAIT
Removed TODO comment. (Was not relevant as all other instances are guaranteed to be closed when the code is executed)
Added DBUG_ASSERT() to prove above.
storage/maria/ma_key_recover.c:
CLR's for UNDO_ROW_DELETE and UNDO_ROW_UPDATE now include rowid for the row.
This was needed for undo_key_delete to work, as undo of delete row is likely to put row in a new position.
undo_delete_key now doesn't include row position
storage/maria/ma_open.c:
Added virtual functions for remembering and restoring scan position
Fixed wrong key search method when using multi-byte character sets (Bug#32705)
Store original column number in index file
NOTE: Index files are now incompatible with previous versions!
(Ok as we haven't yet made a public Maria release)
storage/maria/ma_recovery.c:
Set info->cur_row.lastpos when reading CLR's for UNDO_ROW_DELETE or UNDO_ROW_UPDATE
storage/maria/ma_scan.c:
Added default function to remember and restore scan position
storage/maria/maria_def.h:
Added virtual functions & variables to remember and restore scan position
Added MARIA_MAX_CONTROL_FILE_LOCK_RETRY
storage/myisam/ha_myisam.cc:
Fixed compiler errors as columdef->type is now an enum, not an integer
Added functions to remember and restore scan position
storage/myisam/ha_myisam.h:
Added functions to remember and restore scan position
storage/myisam/mi_check.c:
MY_DONT_WAIT -> MY_SHORT_WAIT
storage/myisam/mi_extra.c:
MY_DONT_WAIT -> MY_SHORT_WAIT
storage/myisam/mi_open.c:
MY_DONT_WAIT -> MY_SHORT_WAIT
storage/myisam/myisamdef.h:
MY_DONT_WAIT -> MY_SHORT_WAIT
2007-12-17 00:17:37 +01:00
|
|
|
int _ma_scan_remember_block_record(MARIA_HA *info,
|
|
|
|
MARIA_RECORD_POS *lastpos);
|
|
|
|
void _ma_scan_restore_block_record(MARIA_HA *info,
|
|
|
|
MARIA_RECORD_POS lastpos);
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
|
2007-07-02 19:45:15 +02:00
|
|
|
const uchar *record);
|
|
|
|
my_bool _ma_write_block_record(MARIA_HA *info, const uchar *record);
|
2007-01-18 20:38:14 +01:00
|
|
|
my_bool _ma_write_abort_block_record(MARIA_HA *info);
|
|
|
|
my_bool _ma_compare_block_record(register MARIA_HA *info,
|
2007-07-02 19:45:15 +02:00
|
|
|
register const uchar *record);
|
2007-01-18 20:38:14 +01:00
|
|
|
|
|
|
|
/* ma_bitmap.c */
|
|
|
|
my_bool _ma_bitmap_init(MARIA_SHARE *share, File file);
|
|
|
|
my_bool _ma_bitmap_end(MARIA_SHARE *share);
|
WL#3072 Maria recovery:
fix for bug: if a crash happened right after writing a REDO like this:
REDO - UNDO - REDO*, then recovery would ignore the last REDO* (ok),
rollback: REDO - UNDO - REDO* - REDO - CLR, and a next recovery would
thus execute REDO* instead of skipping it again. Recovery now logs
LOGREC_INCOMPLETE_GROUP when it meets REDO* for the first time,
to draw a boundary and ensure it is always skipped. Tested by hand.
Note: ma_test_all fails "maria_chk: error: Key 1 - Found too many records"
not due to this patch (failed before).
BitKeeper/triggers/post-commit:
no truncation of the commit mail, or how to review patches?
mysql-test/include/maria_verify_recovery.inc:
let caller choose the statement used to crash (sometimes we
want the crash to happen at special places)
mysql-test/t/maria-recovery.test:
user of maria_verify_recovery.inc now specifies statement which the
script should use for crashing.
storage/maria/ma_bitmap.c:
it's easier to search for all places using functions from the bitmap
module (like in ma_blockrec.c) if those exported functions all start
with "_ma_bitmap": renaming some of them.
Assertion that when we read a bitmap page, overwriting bitmap->map,
we are not losing information (i.e. bitmap->changed is false).
storage/maria/ma_blockrec.c:
update to new names. Adding code (disabled, protected by a #ifdef)
that I use to test certain crash scenarios (more to come).
storage/maria/ma_blockrec.h:
update to new names
storage/maria/ma_checkpoint.c:
update to new names
storage/maria/ma_extra.c:
update to new names
storage/maria/ma_loghandler.c:
new LOGREC_INCOMPLETE_GROUP
storage/maria/ma_loghandler.h:
new LOGREC_INCOMPLETE_GROUP
storage/maria/ma_recovery.c:
When at the end of the REDO phase we have identified some transactions
with incomplete REDO groups (REDOs without an UNDO or CLR_END),
for each of them we log LOGREC_INCOMPLETE_GROUP. This way, the
upcoming UNDO phase can write more records for such transaction,
a future recovery won't pair the incomplete group with the
CLR_END (as there is LOGREC_INCOMPLETE_GROUP to draw a boundary).
2007-12-10 23:26:53 +01:00
|
|
|
my_bool _ma_bitmap_flush(MARIA_SHARE *share);
|
WL#3072 - Maria recovery.
* fix for bitmap vs checkpoint bug which could lead to corrupted
tables in case of crashes at certain moments: a bitmap could be flushed
to disk even though it was inconsistent with the log (it could be
flushed before REDO-UNDO are written to the log). One bug remains, need
code from others. Tests added. Fix is to pin unflushable bitmap pages,
and let checkpoint wait for them to be flushable.
* fix for long_trid!=0 assertion failure at Recovery.
* less useless wakeups in the background flush|checkpoint thread.
* store global_trid_generator in checkpoint record.
mysql-test/r/maria-recovery.result:
result update
mysql-test/t/maria-recovery.test:
make it easier to locate subtests
storage/maria/ma_bitmap.c:
When we send a bitmap to the pagecache, if this bitmap is not in a
flushable state we keep it pinned and add it to a list, it will be
unpinned when the bitmap is flushable again.
A new function _ma_bitmap_flush_all() used by checkpoint.
A new function _ma_bitmap_flushable() used by block format to signal
when it starts modifying a bitmap and when it is done with it.
storage/maria/ma_blockrec.c:
When starting a row operation (insert/update/delete), mark that
the bitmap is not flushable (because for example INSERT is going
to over-allocate in the bitmap to prevent other threads from using
our data pages). If a checkpoint comes at this moment it will wait
for the bitmap to be flushable before flushing it.
When the operation ends, bitmap becomes flushable again; that
transition is done under the bitmap's mutex (needed for correct
synchro with a concurrent checkpoint); but for INSERT/UPDATE this
happens inside _ma_bitmap_release_unused() at a place where it already
has the mutex, so the only penalty (mutex adding) is in DELETE and UNDO
of INSERT. In case of errors after setting the bitmap unflushable,
we must always set it back to flushable or checkpoint would block.
Debug possibilities to force a sleep while the bitmap is over-allocated.
In case of error in get_head_or_tail() in allocate_and_write_block_record(),
we still need to unpin all pages.
Bugfix: _ma_apply_redo_insert_row_blobs() produced wrong
data_file_length.
storage/maria/ma_blockrec.h:
new bitmap calls.
storage/maria/ma_checkpoint.c:
filter_flush_indirect not needed anymore (flushing bitmap
pages happens in _ma_bitmap_flush_all() now). So
st_filter_param::is_data_file|pages_covered_by_bitmap not needed.
Other filter_flush* don't need to flush bitmap anymore.
Add debug possibility to flush all bitmap pages outside of a checkpoint,
to simulate pagecache LRU eviction.
When the background flush/checkpoint thread notices it has nothing
to flush, it now sleeps directly until the next potential checkpoint
moment instead of waking up every second.
When in checkpoint we decide to not store a table in the checkpoint record
(because it has logged no writes for example), we can also skip flushing
this table.
storage/maria/ma_commit.c:
comment is out-of-date
storage/maria/ma_key_recover.c:
comment fix
storage/maria/ma_loghandler.c:
comment is out-of-date
storage/maria/ma_open.c:
comment is out-of-date
storage/maria/ma_pagecache.c:
comment for bug to fix. And we don't take checkpoints at end of REDO
phase yet so can trust block->type.
storage/maria/ma_recovery.c:
Comments. Now-unneeded code for incomplete REDO-UNDO groups removed.
When we forget about an old transaction we must really forget
about it with bzero() (fixes the "long_trid!=0 assertion" recovery
bug). When we delete a row with maria_delete() we turn on
STATE_NOT_OPTIMIZED_ROWS so we do the same when we see a CLR_END
for an UNDO_ROW_INSERT or when we execute an UNDO_ROW_INSERT (in both
cases a row was deleted). Pick up max_long_trid from the checkpoint record.
storage/maria/maria_chk.c:
comment
storage/maria/maria_def.h:
MARIA_FILE_BITMAP gets new members: 'flushable', 'bitmap_cond' and
'pinned_pages'.
storage/maria/trnman.c:
I used to think that recovery only needs to know the maximum TrID
of the lists of active and committed transactions. But no, sometimes
both lists can even be empty and their TrID should not be reused.
So Checkpoint now saves global_trid_generator in the checkpoint record.
storage/maria/trnman_public.h:
macros to read/store a TrID
mysql-test/r/maria-recovery-bitmap.result:
result is ok. Without the code fix, we would get a corruption message
about the bitmap page in CHECK TABLE EXTENDED.
mysql-test/t/maria-recovery-bitmap-master.opt:
usual when we crash mysqld in tests
mysql-test/t/maria-recovery-bitmap.test:
test of recovery problems specific to the bitmap pages.
2007-12-14 16:14:12 +01:00
|
|
|
/*
  Flush the table's bitmap pages; this is the bitmap-flushing entry point
  used by checkpoint (checkpoint's own page filters no longer flush bitmap
  pages themselves).
  NOTE(review): presumably returns 0 on success and non-zero on error, as
  is conventional for my_bool functions -- confirm against ma_bitmap.c.
*/
my_bool _ma_bitmap_flush_all(MARIA_SHARE *share);
|
2007-11-28 20:38:30 +01:00
|
|
|
void _ma_bitmap_reset_cache(MARIA_SHARE *share);
|
2007-01-18 20:38:14 +01:00
|
|
|
my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
|
|
|
|
MARIA_BITMAP_BLOCKS *result_blocks);
|
|
|
|
my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks);
|
2007-07-02 19:45:15 +02:00
|
|
|
my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
|
2007-01-18 20:38:14 +01:00
|
|
|
uint count);
|
|
|
|
my_bool _ma_bitmap_set(MARIA_HA *info, ulonglong pos, my_bool head,
|
|
|
|
uint empty_space);
|
WL#3072 Maria recovery:
fix for bug: if a crash happened right after writing a REDO like this:
REDO - UNDO - REDO*, then recovery would ignore the last REDO* (ok),
rollback: REDO - UNDO - REDO* - REDO - CLR, and a next recovery would
thus execute REDO* instead of skipping it again. Recovery now logs
LOGREC_INCOMPLETE_GROUP when it meets REDO* for the first time,
to draw a boundary and ensure it is always skipped. Tested by hand.
Note: ma_test_all fails "maria_chk: error: Key 1 - Found too many records"
not due to this patch (failed before).
BitKeeper/triggers/post-commit:
no truncation of the commit mail, or how to review patches?
mysql-test/include/maria_verify_recovery.inc:
let caller choose the statement used to crash (sometimes we
want the crash to happen at special places)
mysql-test/t/maria-recovery.test:
user of maria_verify_recovery.inc now specifies statement which the
script should use for crashing.
storage/maria/ma_bitmap.c:
it's easier to search for all places using functions from the bitmap
module (like in ma_blockrec.c) if those exported functions all start
with "_ma_bitmap": renaming some of them.
Assertion that when we read a bitmap page, overwriting bitmap->map,
we are not losing information (i.e. bitmap->changed is false).
storage/maria/ma_blockrec.c:
update to new names. Adding code (disabled, protected by a #ifdef)
that I use to test certain crash scenarios (more to come).
storage/maria/ma_blockrec.h:
update to new names
storage/maria/ma_checkpoint.c:
update to new names
storage/maria/ma_extra.c:
update to new names
storage/maria/ma_loghandler.c:
new LOGREC_INCOMPLETE_GROUP
storage/maria/ma_loghandler.h:
new LOGREC_INCOMPLETE_GROUP
storage/maria/ma_recovery.c:
When at the end of the REDO phase we have identified some transactions
with incomplete REDO groups (REDOs without an UNDO or CLR_END),
for each of them we log LOGREC_INCOMPLETE_GROUP. This way, the
upcoming UNDO phase can write more records for such transaction,
a future recovery won't pair the incomplete group with the
CLR_END (as there is LOGREC_INCOMPLETE_GROUP to draw a boundary).
2007-12-10 23:26:53 +01:00
|
|
|
my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info,
|
|
|
|
MARIA_FILE_BITMAP *bitmap,
|
|
|
|
ulonglong page, uint page_count);
|
|
|
|
my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info,
|
|
|
|
MARIA_FILE_BITMAP *bitmap,
|
|
|
|
ulonglong page, uint page_count);
|
2007-01-18 20:38:14 +01:00
|
|
|
uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size);
|
|
|
|
my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *new_row,
|
|
|
|
ulonglong page, uint free_size,
|
|
|
|
MARIA_BITMAP_BLOCKS *result_blocks);
|
|
|
|
my_bool _ma_check_bitmap_data(MARIA_HA *info,
|
|
|
|
enum en_page_type page_type, ulonglong page,
|
|
|
|
uint empty_space, uint *bitmap_pattern);
|
|
|
|
my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
|
|
|
|
enum en_page_type page_type,
|
|
|
|
ulonglong page,
|
|
|
|
uint *bitmap_pattern);
|
2007-04-12 11:05:30 +02:00
|
|
|
void _ma_bitmap_delete_all(MARIA_SHARE *share);
|
WL#3072 - Maria recovery
Unit test for recovery: runs ma_test1 and ma_test2 (both only with
INSERTs and DELETEs; UPDATEs disabled as not handled by recovery)
then moves the tables elsewhere; recreates tables from the log, and
compares and fails if there is a difference. Passes now.
Most of maria_read_log.c moved to ma_recovery.c, as it will be re-used
for recovery-from-ha_maria.
Bugfixes of applying of REDO_INSERT, REDO_PURGE_ROW.
Applying of REDO_PURGE_BLOCKS, REDO_DELETE_ALL, REDO_DROP_TABLE,
UNDO_ROW_INSERT (in REDO phase only, i.e. just doing records++),
UNDO_ROW_DELETE, UNDO_ROW_PURGE.
Code cleanups.
Monty: please look for "QQ". Sanja: please look for "Sanja".
Future tasks: recovery of the bitmap (easy), recovery of the state
(make it idempotent), more REDOs (Monty to work on
REDO_UPDATE?), UNDO phase...
Pushing this cset as it looks safe, contains test and bugfixes which
will help Monty implement applying of REDO_UPDATE.
sql/handler.cc:
typo
storage/maria/Makefile.am:
Adding ma_test_recovery (which ma_test_all invokes, and which can
also be run alone). Most of maria_read_log.c moved to ma_recovery.c
storage/maria/ha_maria.cc:
comments
storage/maria/ma_bitmap.c:
fixing comments. 2 -> sizeof(maria_bitmap_marker).
Bitmap-related part of _ma_initialize_datafile() moves in bitmap module.
Now putting the "bm" signature when creating the first bitmap page
(it used to happen only at next open, but that
caused an annoying difference when testing Recovery if the original
run didn't open the table, and it looks more
logical like this: it goes to disk only with its signature correct);
see the "QQ" comment towards the _ma_initialize_data_file() call
in ma_create.c for more.
When reading a bitmap page, verify its signature (happens when normally
using the table or when CHECKing it; not when REPAIRing it).
storage/maria/ma_blockrec.c:
* no need to sync the data file if table is not transactional
* Comments, code cleanup (log-related data moved to log-related code
block, int5store->page_store).
* Store the table's short id into LOGREC_UNDO_ROW_PURGE, like we
do for other records (though this record will soon be replaced
with a CLR).
* If "page" is 1 it means the page which extends from byte
page*block_size+1 to (page+1)*block_size (byte number 1 being
the first byte of the file). The last byte of the file is
data_file_length (same convention).
A new page needs to be created if the last byte of the page is
beyond the last byte of the file, i.e.
(page+1)*block_size+1 > data_file_length, so we correct the test
(bug found when testing log applying for ma_test1 -M -T --skip-update).
* update the page's LSN when removing a row from it during
execution of a REDO_PURGE_ROW record (bug found when testing log
applying for ma_test1 -M -T --skip-update).
* applying of REDO_PURGE_BLOCKs (limited to a one-page range for now).
storage/maria/ma_blockrec.h:
new functions. maria_bitmap_marker does not need to be exported.
storage/maria/ma_close.c:
we can always flush the table's state when closing the last instance
of the table. And it is needed for maria_read_log (as it does
not use maria_lock_database()).
storage/maria/ma_control_file.c:
when in Recovery, some assertions should not be used.
storage/maria/ma_control_file.h:
double-inclusion safe
storage/maria/ma_create.c:
during recovery, don't log records. Comments.
Moving the creation of the first bitmap page to ma_bitmap.c
storage/maria/ma_delete_table.c:
during recovery, don't log records. Log the end-zero of the dropped
table's name, so that recovery can use the string in place without
extending it to fit an end zero.
storage/maria/ma_loghandler.c:
* inwrite_rec_hook also needs access to the MARIA_SHARE, like
prewrite_rec_hook. This will be needed to update
share->records_diff (in the upcoming patch "recovery of the state").
* LOG_DESC::record_ends_group changed to an enum.
* LOG_DESC for LOGREC_REDO_PURGE_BLOCKS and LOGREC_UNDO_ROW_PURGE
corrected
* Sanja please see the @todo LOG BUG
* avoiding DBUG_RETURN(func()) as it gives confusing debug traces.
storage/maria/ma_loghandler.h:
- log write hooks called while the log's lock is held (inwrite_rec_hook)
now need the MARIA_SHARE, like prewrite_rec_hook already had
- instead of a bool saying if this record's type ends groups or not,
we refine: it may not end a group, it may end a group, or it may
be a group in itself. Imagine that we had a physical write failure
to a table before we log the UNDO, we still end up in
external_lock(F_UNLCK) and then we log a COMMIT: we don't want
to consider this COMMIT as ending the group of REDOs (don't want
to execute those REDOs during Recovery), that's why we say "COMMIT
is a group in itself, it aborts any previous group". This also
gives one more sanity check in maria_read_log.
storage/maria/ma_recovery.c:
New Recovery code, replacing the old pseudocode.
Most of maria_read_log moved here.
Call-able from ha_maria, but not enabled yet.
Compared to the previous version of maria_read_log, some bugs have
been fixed, debugging output can go to stdout or a disk file (for now
it's useful for me, later it can be changed), execution of
REDO_DROP_TABLE, REDO_DELETE_ALL, REDO_PURGE_BLOCKS has been added. Duplicate code
has been factored into functions. We abort an unfinished group
of records if we see a record which is a group in itself (like COMMIT).
No need for maria_panic() after a bug (which caused tables to not
be closed) was fixed; if there is yet another bug I prefer to see it.
When opening a table for Recovery, set data_file_length
and key_file_length to their real physical value (these are the
easiest state members to restore :). Warn us if the last page
was truncated (but Recovery handles it).
MARIA_SHARE::state::state::records is now partly recovered (not
idempotent, but works if recreating tables from scratch).
When applying a REDO to a page, stamp it with the UNDO's LSN
(current_group_end_lsn), not with the REDO's LSN; it makes
the table more identical to the original table (easier to compare
the two tables in the end).
Big thing missing: some types of REDOs are not handled,
and the UNDO phase does not exist (missing functions to execute UNDOs
to actually rollback). So for now tests are only inserting/deleting
a few 100 rows, closing the table and seeing if the log is applied ok;
it works. UPDATE not handled.
storage/maria/ma_recovery.h:
new functions: ma_recover() for recovery from inside ha_maria;
_ma_apply_log() for maria_read_log (ma_recover() calls _ma_apply_log()).
Btw, we need to not use the word "recover" for REPAIR/maria_chk anymore.
storage/maria/ma_rename.c:
don't write log records during recovery
storage/maria/ma_test2.c:
- fail if maria_info() or other subtests find some wrong information
- new option -g to skip updates.
- init the translog before creating the table, so that log applying
can work.
- in "#if 0" you'll see some fixed bugs (will be removed).
storage/maria/ma_test_all.sh:
cleanup files. Test log applying.
storage/maria/maria_read_log.c:
most of the logic moves to ma_recovery.c to be shared between
maria_read_log and recovery-from-inside-mysqld.
See ma_recovery.c for additional changes made to the moved code.
storage/maria/ma_test_recovery:
unit test for Recovery. Tests insert and delete,
REDO_UPDATE not yet coded.
Script is called from ma_test_all. Can run standalone.
2007-07-26 11:56:21 +02:00
|
|
|
int _ma_bitmap_create_first(MARIA_SHARE *share);
|
2007-12-15 14:17:23 +01:00
|
|
|
/*
  Used by the block-record format to signal when it starts modifying a
  bitmap (the bitmap must then not be flushed) and when it is done with it
  (the bitmap becomes flushable again). A checkpoint arriving in between
  waits for the bitmap to become flushable before flushing it.
  NOTE(review): non_flushable_inc is presumably +1 when an operation starts
  and -1 when it ends -- confirm against the definition in ma_bitmap.c.
*/
void _ma_bitmap_flushable(MARIA_SHARE *share, int non_flushable_inc);
|
2007-10-19 23:24:22 +02:00
|
|
|
#ifndef DBUG_OFF
|
|
|
|
void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
|
|
|
|
ulonglong page);
|
|
|
|
#endif
|
|
|
|
|
2007-07-03 23:50:17 +02:00
|
|
|
uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
|
|
|
|
uint page_type,
|
2007-07-04 11:39:19 +02:00
|
|
|
const uchar *header,
|
|
|
|
const uchar *data,
|
2007-07-03 23:50:17 +02:00
|
|
|
size_t data_length);
|
|
|
|
uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
|
|
|
|
uint page_type,
|
2007-07-04 11:39:19 +02:00
|
|
|
const uchar *header);
|
2007-10-19 23:24:22 +02:00
|
|
|
uint _ma_apply_redo_free_blocks(MARIA_HA *info, LSN lsn,
|
|
|
|
const uchar *header);
|
|
|
|
uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
|
|
|
|
const uchar *header);
|
|
|
|
uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
|
|
|
|
LSN lsn, const uchar *header);
|
2007-09-04 09:53:52 +02:00
|
|
|
my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
|
|
|
|
const uchar *header);
|
2007-09-05 01:57:53 +02:00
|
|
|
my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
|
|
|
|
const uchar *header, size_t length);
|
|
|
|
my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
|
|
|
|
const uchar *header, size_t length);
|
WL#3072 - Maria recovery.
* Recovery of the table's live checksum (CREATE TABLE ... CHECKSUM=1)
is achieved in this patch. The table's live checksum
(info->s->state.state.checksum) is updated in inwrite_rec_hook's
under the log mutex when writing UNDO_ROW_INSERT|UPDATE|DELETE
and REDO_DELETE_ALL. The checksum variation caused by the operation
is stored in these UNDOs, so that the REDO phase, when it sees such
UNDOs, can update the live checksum if it is older (state.is_of_lsn is
lower) than the record. It is also used, as a nice add-on with no
cost, to do less row checksum computation during the UNDO phase
(as we have it in the record already).
Doing this work, it became pressing to move in-write hooks
(write_hook_for_redo() et al) to ma_blockrec.c.
The 'parts' argument of inwrite_rec_hook is unpredictable (it comes
mangled at this stage, for example by LSN compression) so it is
replaced by a 'void* hook_arg', which is used to pass down information,
currently only to write_hook_for_clr_end() (previous undo_lsn and
type of undone record).
* If from ha_maria, we print to stderr how many seconds (with one
fractional digit) the REDO phase took, same for UNDO phase and for
final table close. Just to give an indication for debugging and maybe
also for Support.
storage/maria/ha_maria.cc:
question for Monty
storage/maria/ma_blockrec.c:
* log in-write hooks (write_hook_for_redo() etc) move from
ma_loghandler.c to here; this is natural: the hooks are coupled
to their callers (functions in ma_blockrec.c).
* translog_write_record() now has a new argument "hook_arg";
using it to pass down to write_hook_for_clr_end() the transaction's
previous_undo_lsn and the type of the being undone record, and also
to pass down to all UNDOs the live checksum variation caused by the
operation.
* If table has live checksum, store in UNDO_ROW_INSERT|UPDATE|DELETE
and in CLR_END the checksum variation ("delta") caused by the
operation. For example if a DELETE caused the table's live checksum
to change from 123 to 456, we store in the UNDO_ROW_DELETE, in 4 bytes,
the value 333 (456-123).
* Instead of hard-coded "1" as length of the place where we store
the undone record's type in CLR_END, use a symbol CLR_TYPE_STORE_SIZE;
use macros clr_type_store and clr_type_korr.
* write_block_record() has a new parameter 'old_record_checksum'
which is the pre-computed checksum of old_record; that value is used
to update the table's live checksum when writing UNDO_ROW_UPDATE|CLR_END.
* In allocate_write_block_record(), if we are executing UNDO_ROW_DELETE
the row's checksum is already computed.
* _ma_update_block_record2() now expects the new row's checksum into
cur_row.checksum (was already true) and the old row's checksum into
new_row.checksum (that's new). Its two callers, maria_update() and
_ma_apply_undo_row_update(), honour this.
* When executing an UNDO_ROW_INSERT|UPDATE|DELETE in UNDO phase, pick
up the checksum delta from the log record. It is then used to update
the table's live checksum when writing CLR_END, and saves us a
computation of record.
storage/maria/ma_blockrec.h:
in-write hooks move from ma_loghandler.c
storage/maria/ma_check.c:
more straightforward size of buffer
storage/maria/ma_checkpoint.c:
<= is enough
storage/maria/ma_commit.c:
new prototype of translog_write_record()
storage/maria/ma_create.c:
new prototype of translog_write_record()
storage/maria/ma_delete.c:
The row's checksum must be computed before calling (*delete_record)(),
not after, because it must be known inside _ma_delete_block_record()
(to update the table's live checksum when writing UNDO_ROW_DELETE).
If deleting from a transactional table, live checksum was already updated
when writing UNDO_ROW_DELETE.
storage/maria/ma_delete_all.c:
@todo is now done (in ma_loghandler.c)
storage/maria/ma_delete_table.c:
new prototype of translog_write_record()
storage/maria/ma_loghandler.c:
* in-write hooks move to ma_blockrec.c.
* translog_write_record() gets a new argument 'hook_arg' which is
passed down to pre|inwrite_rec_hook. It is more useful than 'parts'
for those hooks, because when those hooks are called, 'parts' has
possibly been mangled (like with LSN compression) and is so
unpredictable.
* fix for compiler warning (unused buffer_start when compiling without
debug support)
* Because checksum delta is stored into UNDO_ROW_INSERT|UPDATE|DELETE
and CLR_END, but only if the table has live checksum, these records
are not PSEUDOFIXEDLENGTH anymore, they are now VARIABLE_LENGTH (their
length is X if no live checksum and X+4 otherwise).
* add an inwrite_rec_hook for UNDO_ROW_UPDATE, which updates the
table's live checksum. Update it also in hooks of UNDO_ROW_INSERT|
DELETE and REDO_DELETE_ALL and CLR_END.
* Bugfix: when reading a record in translog_read_record(), it happened
that "length" became negative, because the function assumed that
the record extended beyond the page's end, whereas it may be shorter.
storage/maria/ma_loghandler.h:
* Instead of hard-coded "1" and "4", use symbols and macros
to store/retrieve the type of record which the CLR_END corresponds
to, and the checksum variation caused by the operation which logs the
record
* translog_write_record() gets a new argument 'hook_arg' which is
passed down to pre|inwrite_rec_hook. It is more useful than 'parts'
for those hooks, because when those hooks are called, 'parts' has
possibly been mangled (like with LSN compression) and is so
unpredictable.
storage/maria/ma_open.c:
fix for "empty body in if() statement" (when compiling without safemutex)
storage/maria/ma_pagecache.c:
<= is enough
storage/maria/ma_recovery.c:
* print the time that each recovery phase (REDO/UNDO/flush) took;
this is enabled only when recovering from ha_maria. It is printed as
n seconds with a fractional part of one digit (like 123.4 seconds).
* In the REDO phase, update the table's live checksum by using
the checksum delta stored in UNDO_ROW_INSERT|DELETE|UPDATE and CLR_END.
Update it too when seeing REDO_DELETE_ALL.
* In the UNDO phase, when executing UNDO_ROW_INSERT, if the table does
not have live checksum then reading the record's header (as done by
the master loop of run_undo_phase()) is enough; otherwise we
do a translog_read_record() to have the checksum delta ready
for _ma_apply_undo_row_insert().
* When at the end of the REDO phase we notice that there is an unfinished
group of REDOs, don't assert in debug binaries, as I verified that it
can happen in real life (with kill -9)
* removing ' in #error as it confuses gcc3
storage/maria/ma_rename.c:
new prototype of translog_write_record()
storage/maria/ma_test_recovery.expected:
Change in output of ma_test_recovery: now all live checksums of
original tables equal those of tables recreated by the REDO phase
and those of tables fixed by the UNDO phase. I.e. recovery of
the live checksum looks like working (which was after all the only
goal of this changeset).
I checked by hand that it's not just all live checksums which are
now 0 and that's why they match. They are the old values like
3757530372. maria.test has hard-coded checksum values in its result
file so checks this too.
storage/maria/ma_update.c:
* It's useless to put up HA_STATE_CHANGED in 'key_changed',
as we put up HA_STATE_CHANGED in info->update anyway.
* We need to compute the old and new rows' checksum before calling
(*update_record)(), as checksum delta must be known when logging
UNDO_ROW_UPDATE which is done by _ma_update_block_record(). Note that
some functions change the 'newrec' record (at least _ma_check_unique()
does) so we cannot move the checksum computation too early in the
function.
storage/maria/ma_write.c:
If inserting into a transactional table, the live checksum was
already updated when writing UNDO_ROW_INSERT. The multiplication
is a trick to save an if().
storage/maria/unittest/ma_test_loghandler-t.c:
new prototype of translog_write_record()
storage/maria/unittest/ma_test_loghandler_first_lsn-t.c:
new prototype of translog_write_record()
storage/maria/unittest/ma_test_loghandler_max_lsn-t.c:
new prototype of translog_write_record()
storage/maria/unittest/ma_test_loghandler_multigroup-t.c:
new prototype of translog_write_record()
storage/maria/unittest/ma_test_loghandler_multithread-t.c:
new prototype of translog_write_record()
storage/maria/unittest/ma_test_loghandler_noflush-t.c:
new prototype of translog_write_record()
storage/maria/unittest/ma_test_loghandler_pagecache-t.c:
new prototype of translog_write_record()
storage/maria/unittest/ma_test_loghandler_purge-t.c:
new prototype of translog_write_record()
storage/myisam/sort.c:
fix for compiler warnings in pushbuild (write_merge_key* functions
didn't have their declaration match MARIA_HA::write_key).
2007-10-02 18:02:09 +02:00
|
|
|
|
|
|
|
my_bool write_hook_for_redo(enum translog_record_type type,
|
|
|
|
TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
|
|
|
|
void *hook_arg);
|
|
|
|
my_bool write_hook_for_undo(enum translog_record_type type,
|
|
|
|
TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
|
|
|
|
void *hook_arg);
|
|
|
|
my_bool write_hook_for_redo_delete_all(enum translog_record_type type,
|
|
|
|
TRN *trn, MARIA_HA *tbl_info,
|
|
|
|
LSN *lsn, void *hook_arg);
|
|
|
|
my_bool write_hook_for_undo_row_insert(enum translog_record_type type,
|
|
|
|
TRN *trn, MARIA_HA *tbl_info,
|
|
|
|
LSN *lsn, void *hook_arg);
|
|
|
|
my_bool write_hook_for_undo_row_delete(enum translog_record_type type,
|
|
|
|
TRN *trn, MARIA_HA *tbl_info,
|
|
|
|
LSN *lsn, void *hook_arg);
|
|
|
|
my_bool write_hook_for_undo_row_update(enum translog_record_type type,
|
|
|
|
TRN *trn, MARIA_HA *tbl_info,
|
|
|
|
LSN *lsn, void *hook_arg);
|
|
|
|
my_bool write_hook_for_file_id(enum translog_record_type type,
|
|
|
|
TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
|
|
|
|
void *hook_arg);
|