From 36e875f7a5a5782729b2982ac61a02000e00fed0 Mon Sep 17 00:00:00 2001 From: marko <> Date: Wed, 16 May 2007 09:23:53 +0000 Subject: [PATCH] branches/zip: Document and obey the rules for modifying the free bits in the insert buffer bitmap. ibuf_set_free_bits_func(): Never disable redo logging. ibuf_update_free_bits_zip(): Remove. btr_page_reorganize_low(), page_zip_reorganize(): Do not update the insert buffer bitmap. Instead, document that callers will have to take care of it, and adapt the callers. btr_compress(): On error, reset the insert buffer free bits. btr_cur_insert_if_possible(): Do not modify the insert buffer bitmap. btr_compress(), btr_cur_optimistic_insert(): On compressed pages, reset the insert buffer bitmap. Document why. btr_cur_update_alloc_zip(): Document why it is necessary and sufficient to reset the insert buffer free bits. btr_cur_update_in_place(), btr_cur_optimistic_update(), btr_cur_pessimistic_update(): Update the free bits in the same mini-transaction. Document that the mini-transaction must be committed before latching any further pages. Verify that this is the case in all execution paths. row_ins_sec_index_entry_by_modify(), row_ins_clust_index_entry_by_modify(), row_undo_mod_clust_low(): Because these functions call btr_cur_update_in_place(), btr_cur_optimistic_update(), or btr_cur_pessimistic_update(), document that the mini-transaction must be committed before latching any further pages. Verify that this is the case in all execution paths. --- btr/btr0btr.c | 55 ++++++++++++++++---- btr/btr0cur.c | 121 ++++++++++++++++++++++++++++--------------- ibuf/ibuf0ibuf.c | 11 +--- include/btr0btr.h | 6 ++- include/btr0cur.h | 9 ++-- include/ibuf0ibuf.h | 12 ----- include/ibuf0ibuf.ic | 27 ---------- include/page0zip.h | 6 ++- page/page0zip.c | 12 ++--- row/row0ins.c | 6 ++- row/row0umod.c | 3 +- 11 files changed, 150 insertions(+), 118 deletions(-) diff --git a/btr/btr0btr.c b/btr/btr0btr.c index 0b2350ce27e..d99c48dd4be 100644 --- a/btr/btr0btr.c +++ b/btr/btr0btr.c @@ -975,13 +975,6 @@ btr_page_reorganize_low( success = TRUE; } - /* On compressed pages, recompute the insert buffer free bits. */ - if (UNIV_LIKELY_NULL(page_zip) - && !dict_index_is_clust(index) && page_is_leaf(page)) { - - ibuf_update_free_bits_zip(page_zip_get_size(page_zip), block); - } - func_exit: #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, page)); @@ -995,7 +988,11 @@ func_exit: } /***************************************************************** -Reorganizes an index page. */ +Reorganizes an index page. +IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf +page of a non-clustered index, the caller must update the insert +buffer free bits in the same mini-transaction in such a way that the +modification will be redo-logged. */ ibool btr_page_reorganize( @@ -2400,6 +2397,13 @@ btr_compress( /* No space for merge */ err_exit: + /* We play it safe and reset the free bits. */ + if (zip_size + && page_is_leaf(merge_page) + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(merge_block); + } + mem_heap_free(heap); return(FALSE); } @@ -2523,11 +2527,40 @@ err_exit: mem_heap_free(heap); if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { - /* We have added new records to merge_page: - update its free bits */ + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + if (zip_size) { - ibuf_update_free_bits_zip(zip_size, merge_block); + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ ibuf_update_free_bits_if_full(merge_block, UNIV_PAGE_SIZE, ULINT_UNDEFINED); diff --git a/btr/btr0cur.c b/btr/btr0cur.c index 9b959210742..cbb12bebb4d 100644 --- a/btr/btr0cur.c +++ b/btr/btr0cur.c @@ -894,14 +894,6 @@ btr_cur_insert_if_possible( } } - if (buf_block_get_page_zip(block) - && !dict_index_is_clust(cursor->index) - && page_is_leaf(buf_block_get_frame(block))) { - /* Update the free bits in the insert buffer. */ - ibuf_update_free_bits_zip(buf_block_get_zip_size(block), - block); - } - return(rec); } @@ -1223,9 +1215,34 @@ fail_err: rec_size + PAGE_DIR_SLOT_SIZE, index->type); #endif if (!dict_index_is_clust(index) && UNIV_LIKELY(0 == level)) { - /* We have added a record to page: update its free bits */ + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. Since insert buffer bitmap + pages have a lower rank than B-tree pages, we must not + access other pages in the same mini-transaction after + accessing an insert buffer bitmap page. Because the + mini-transaction where this function is invoked may + access other pages, we must update the insert + buffer bitmap in a separate mini-transaction. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + if (zip_size) { - ibuf_update_free_bits_zip(zip_size, block); + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(block); } else { ibuf_update_free_bits_if_full( block, max_size, @@ -1658,10 +1675,21 @@ btr_cur_update_alloc_zip( return(FALSE); } + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the page_zip_compress() above did not reduce + the free space available on the page. */ + if (!page_zip_available(page_zip, dict_index_is_clust(index), length, 0)) { - if (!dict_index_is_clust(index)) { - /* No space on the page: reset the free bits. */ + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) + && page_is_leaf(buf_block_get_frame(block))) { ibuf_reset_free_bits(block); } return(FALSE); @@ -1686,7 +1714,8 @@ btr_cur_update_in_place( ulint cmpl_info,/* in: compiler info on secondary index updates */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ { dict_index_t* index; buf_block_t* block; @@ -1728,13 +1757,6 @@ btr_cur_update_in_place( thr, &roll_ptr); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - if (page_zip && !dict_index_is_clust(index) - && page_is_leaf(buf_block_get_frame(block))) { - /* Update the free bits in the insert buffer. */ - ibuf_update_free_bits_zip( - page_zip_get_size(page_zip), block); - } - if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -1773,8 +1795,8 @@ btr_cur_update_in_place( if (page_zip && !dict_index_is_clust(index) && page_is_leaf(buf_block_get_frame(block))) { /* Update the free bits in the insert buffer. */ - ibuf_update_free_bits_zip(buf_block_get_zip_size(block), - block); + ibuf_update_free_bits_low(buf_block_get_zip_size(block), + block, UNIV_PAGE_SIZE, mtr); } btr_cur_update_in_place_log(flags, rec, index, update, @@ -1820,7 +1842,8 @@ btr_cur_optimistic_update( ulint cmpl_info,/* in: compiler info on secondary index updates */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ { dict_index_t* index; page_cur_t* page_cursor; @@ -1870,6 +1893,7 @@ btr_cur_optimistic_update( } if (rec_offs_any_extern(offsets)) { +any_extern: /* Externally stored fields are treated in pessimistic update */ @@ -1880,11 +1904,7 @@ btr_cur_optimistic_update( for (i = 0; i < upd_get_n_fields(update); i++) { if (upd_get_nth_field(update, i)->extern_storage) { - /* Externally stored fields are treated in pessimistic - update */ - - mem_heap_free(heap); - return(DB_OVERFLOW); + goto any_extern; } } @@ -1905,9 +1925,8 @@ btr_cur_optimistic_update( if (UNIV_LIKELY_NULL(page_zip) && !btr_cur_update_alloc_zip(page_zip, block, index, new_rec_size, mtr)) { - mem_heap_free(heap); - - return(DB_ZIP_OVERFLOW); + err = DB_ZIP_OVERFLOW; + goto err_exit; } if (UNIV_UNLIKELY(new_rec_size @@ -1948,21 +1967,13 @@ btr_cur_optimistic_update( &roll_ptr); if (err != DB_SUCCESS) { err_exit: - if (page_zip && !dict_index_is_clust(index) - && page_is_leaf(page)) { - /* Update the free bits in the insert buffer. */ - ibuf_update_free_bits_zip( - buf_block_get_zip_size(block), block); - } - mem_heap_free(heap); - return(err); } /* Ok, we may do the replacement. Store on the page infimum the explicit locks on rec, before deleting rec (see the comment in - .._pessimistic_update). */ + btr_cur_pessimistic_update). */ lock_rec_store_on_page_infimum(block, rec); @@ -1985,6 +1996,13 @@ err_exit: rec = btr_cur_insert_if_possible(cursor, new_entry, NULL, 0, mtr); ut_a(rec); /* <- We calculated above the insert would fit */ + if (page_zip && !dict_index_is_clust(index) + && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_low(buf_block_get_zip_size(block), block, + UNIV_PAGE_SIZE, mtr); + } + if (!rec_get_deleted_flag(rec, page_is_comp(page))) { /* The new inserted record owns its possible externally stored fields */ @@ -2077,7 +2095,8 @@ btr_cur_pessimistic_update( ulint cmpl_info,/* in: compiler info on secondary index updates */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ { big_rec_t* big_rec_vec = NULL; big_rec_t* dummy_big_rec; @@ -2237,7 +2256,6 @@ btr_cur_pessimistic_update( rec = btr_cur_insert_if_possible(cursor, new_entry, ext_vect, n_ext_vect, mtr); - ut_a(rec || optim_err != DB_UNDERFLOW); if (rec) { lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), @@ -2255,8 +2273,24 @@ btr_cur_pessimistic_update( btr_cur_compress_if_useful(cursor, mtr); + if (page_zip && !dict_index_is_clust(index) + && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. */ + ibuf_update_free_bits_low( + buf_block_get_zip_size(block), block, + UNIV_PAGE_SIZE, mtr); + } + err = DB_SUCCESS; goto return_after_reservations; + } else { + ut_a(optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. */ + if (!dict_index_is_clust(index) + && page_is_leaf(page)) { + ibuf_reset_free_bits(block); + } } /* Was the record to be updated positioned as the first user @@ -2783,7 +2817,8 @@ btr_cur_optimistic_delete( ut_a(!page_zip || page_zip_validate(page_zip, page)); #endif /* UNIV_ZIP_DEBUG */ - if (!dict_index_is_clust(cursor->index)) { + if (!dict_index_is_clust(cursor->index) + && page_is_leaf(page)) { ibuf_update_free_bits_low(zip_size, block, max_ins_size, mtr); } diff --git a/ibuf/ibuf0ibuf.c b/ibuf/ibuf0ibuf.c index 2ecba16b3b9..18f0ea41b39 100644 --- a/ibuf/ibuf0ibuf.c +++ b/ibuf/ibuf0ibuf.c @@ -857,15 +857,6 @@ ibuf_set_free_bits_func( mtr_start(&mtr); - if (recv_recovery_is_on()) { - /* Do not write to the redo log, because there is - crash recovery in progress. Flushing the log would - require the possession of log_sys->mutex, which is - being held by the main thread. */ - - mtr_set_log_mode(&mtr, MTR_LOG_NONE); - } - space = buf_block_get_space(block); page_no = buf_block_get_page_no(block); zip_size = buf_block_get_zip_size(block); @@ -3307,7 +3298,7 @@ loop: if (UNIV_UNLIKELY(corruption_noticed)) { fputs("InnoDB: Discarding record\n ", stderr); rec_print_old(stderr, ibuf_rec); - fputs("\n from the insert buffer!\n\n", stderr); + fputs("\nInnoDB: from the insert buffer!\n\n", stderr); } else if (block) { /* Now we have at pcur a record which should be inserted to the index page; NOTE that the call below diff --git a/include/btr0btr.h b/include/btr0btr.h index be12e047736..ca9b2f3119f 100644 --- a/include/btr0btr.h +++ b/include/btr0btr.h @@ -230,7 +230,11 @@ btr_root_raise_and_insert( ulint n_ext, /* in: number of elements in vec */ mtr_t* mtr); /* in: mtr */ /***************************************************************** -Reorganizes an index page. */ +Reorganizes an index page. +IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf +page of a non-clustered index, the caller must update the insert +buffer free bits in the same mini-transaction in such a way that the +modification will be redo-logged. */ ibool btr_page_reorganize( diff --git a/include/btr0cur.h b/include/btr0cur.h index 27df859f0ba..af7ba19f076 100644 --- a/include/btr0cur.h +++ b/include/btr0cur.h @@ -224,7 +224,8 @@ btr_cur_update_in_place( ulint cmpl_info,/* in: compiler info on secondary index updates */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr); /* in: mtr; must be committed before + latching any further pages */ /***************************************************************** Tries to update a record on a page in an index tree. It is assumed that mtr holds an x-latch on the page. The operation does not succeed if there is too @@ -248,7 +249,8 @@ btr_cur_optimistic_update( ulint cmpl_info,/* in: compiler info on secondary index updates */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr); /* in: mtr; must be committed before + latching any further pages */ /***************************************************************** Performs an update of a record on a page of a tree. It is assumed that mtr holds an x-latch on the tree and on the cursor page. If the @@ -271,7 +273,8 @@ btr_cur_pessimistic_update( ulint cmpl_info,/* in: compiler info on secondary index updates */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr); /* in: mtr; must be committed before + latching any further pages */ /*************************************************************** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id diff --git a/include/ibuf0ibuf.h b/include/ibuf0ibuf.h index a6ccd1d23c0..f11d4d14a86 100644 --- a/include/ibuf0ibuf.h +++ b/include/ibuf0ibuf.h @@ -86,18 +86,6 @@ ibuf_update_free_bits_if_full( ulint increase);/* in: upper limit for the additional space used in the latest operation, if known, or ULINT_UNDEFINED */ -/**************************************************************************** -Updates the free bits of a compressed page in the ibuf bitmap. This -is done in a separate mini-transaction, hence this operation does not -restrict further work to only ibuf bitmap operations, which would -result if the latch to the bitmap page were kept. */ -UNIV_INLINE -void -ibuf_update_free_bits_zip( -/*======================*/ - ulint zip_size,/* in: compressed page size in bytes */ - buf_block_t* block); /* in: B-tree leaf page of a secondary - index */ /************************************************************************** Updates the free bits for the page to reflect the present state. Does this in the mtr given, which means that the latching order rules virtually diff --git a/include/ibuf0ibuf.ic b/include/ibuf0ibuf.ic index f12f3ded078..e33477084b1 100644 --- a/include/ibuf0ibuf.ic +++ b/include/ibuf0ibuf.ic @@ -279,30 +279,3 @@ ibuf_update_free_bits_if_full( ibuf_set_free_bits(block, after, before); } } - -/**************************************************************************** -Updates the free bits of a compressed page in the ibuf bitmap. This -is done in a separate mini-transaction, hence this operation does not -restrict further work to only ibuf bitmap operations, which would -result if the latch to the bitmap page were kept. */ -UNIV_INLINE -void -ibuf_update_free_bits_zip( -/*======================*/ - ulint zip_size,/* in: compressed page size in bytes */ - buf_block_t* block) /* in: B-tree leaf page of a secondary - index */ -{ - ulint free_bits = ibuf_index_page_calc_free(zip_size, block); - - if (free_bits == 0) { - /* We move the page to the front of the buffer pool LRU list: - the purpose of this is to prevent those pages to which we - cannot make inserts using the insert buffer from slipping - out of the buffer pool */ - - buf_page_make_young(&block->page); - } - - ibuf_set_free_bits(block, free_bits, ULINT_UNDEFINED); -} diff --git a/include/page0zip.h b/include/page0zip.h index b5513e870c0..c9d43228046 100644 --- a/include/page0zip.h +++ b/include/page0zip.h @@ -348,7 +348,11 @@ page_zip_write_header( Reorganize and compress a page. This is a low-level operation for compressed pages, to be used when page_zip_compress() fails. On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. -The function btr_page_reorganize() should be preferred whenever possible. */ +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. */ ibool page_zip_reorganize( diff --git a/page/page0zip.c b/page/page0zip.c index 7627111fd51..14ce6567a2c 100644 --- a/page/page0zip.c +++ b/page/page0zip.c @@ -19,7 +19,6 @@ Created June 2005 by Marko Makela #include "dict0dict.h" #include "btr0sea.h" #include "btr0cur.h" -#include "ibuf0ibuf.h" #include "page0types.h" #include "lock0lock.h" #include "log0recv.h" @@ -3821,7 +3820,11 @@ page_zip_write_header_log( Reorganize and compress a page. This is a low-level operation for compressed pages, to be used when page_zip_compress() fails. On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written. -The function btr_page_reorganize() should be preferred whenever possible. */ +The function btr_page_reorganize() should be preferred whenever possible. +IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a +non-clustered index, the caller must update the insert buffer free +bits in the same mini-transaction in such a way that the modification +will be redo-logged. */ ibool page_zip_reorganize( @@ -3885,11 +3888,6 @@ page_zip_reorganize( lock_move_reorganize_page(block, temp_block); btr_search_drop_page_hash_index(block); - if (!dict_index_is_clust(index) && page_is_leaf(page)) { - /* Recompute the insert buffer free bits. */ - ibuf_update_free_bits_zip(page_zip_get_size(page_zip), block); - } - buf_block_free(temp_block); return(TRUE); } diff --git a/row/row0ins.c b/row/row0ins.c index c0537625a59..452be3da60b 100644 --- a/row/row0ins.c +++ b/row/row0ins.c @@ -215,7 +215,8 @@ row_ins_sec_index_entry_by_modify( btr_cur_t* cursor, /* in: B-tree cursor */ const dtuple_t* entry, /* in: index entry to insert */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ { big_rec_t* dummy_big_rec; mem_heap_t* heap; @@ -292,7 +293,8 @@ row_ins_clust_index_entry_by_modify( externally stored fields in entry, or NULL */ ulint n_ext_vec,/* in: number of fields in ext_vec */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr) /* in: mtr; must be committed before + latching any further pages */ { rec_t* rec; upd_t* update; diff --git a/row/row0umod.c b/row/row0umod.c index 812cb5e9758..f6cb2522a35 100644 --- a/row/row0umod.c +++ b/row/row0umod.c @@ -82,7 +82,8 @@ row_undo_mod_clust_low( we may run out of file space */ undo_node_t* node, /* in: row undo node */ que_thr_t* thr, /* in: query thread */ - mtr_t* mtr, /* in: mtr */ + mtr_t* mtr, /* in: mtr; must be committed before + latching any further pages */ ulint mode) /* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { btr_pcur_t* pcur;