diff --git a/btr/btr0btr.c b/btr/btr0btr.c index c27fb73ff8d..dd248e6e555 100644 --- a/btr/btr0btr.c +++ b/btr/btr0btr.c @@ -14,6 +14,7 @@ Created 6/2/1994 Heikki Tuuri #include "fsp0fsp.h" #include "page0page.h" +#include "page0zip.h" #include "btr0cur.h" #include "btr0sea.h" #include "btr0pcur.h" @@ -105,8 +106,9 @@ static void btr_page_empty( /*===========*/ - page_t* page, /* in: page to be emptied */ - mtr_t* mtr); /* in: mtr */ + page_t* page, /* in: page to be emptied */ + page_zip_des_t* page_zip,/* out: compressed page, or NULL */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Returns TRUE if the insert fits on the appropriate half-page with the chosen split_rec. */ @@ -258,7 +260,7 @@ btr_page_create( { ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - page_create(page, mtr, + page_create(page, NULL, mtr, UT_LIST_GET_FIRST(tree->tree_indexes)->table->comp); buf_block_align(page)->check_index_page_at_flush = TRUE; @@ -662,6 +664,7 @@ btr_create( buf_frame_t* ibuf_hdr_frame; buf_frame_t* frame; page_t* page; + page_zip_des_t* page_zip; /* Create the two new segments (one, in the case of an ibuf tree) for the index tree; the segment headers are put on the allocated root page @@ -723,7 +726,7 @@ btr_create( } /* Create a new index page on the the allocated segment page */ - page = page_create(frame, mtr, comp); + page = page_create(frame, NULL, mtr, comp); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Set the index id of the page */ @@ -748,6 +751,14 @@ btr_create( ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE); + page_zip = buf_block_get_page_zip(buf_block_align(page)); + if (UNIV_LIKELY_NULL(page_zip)) { + if (UNIV_UNLIKELY(page_zip_compress(page_zip, page))) { + /* An empty page should always be compressible */ + ut_error; + } + } + return(page_no); } @@ -833,7 +844,8 @@ btr_page_reorganize_low( there cannot exist locks on the page, and a hash index should not be dropped: it cannot exist */ - page_t* page, /* in: page to be reorganized */ + page_t* page, /* in/out: page to be reorganized */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { @@ -846,7 +858,7 @@ btr_page_reorganize_low( ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); - ut_ad(!!page_is_comp(page) == index->table->comp); + ut_ad((ibool) !!page_is_comp(page) == index->table->comp); data_size1 = page_get_data_size(page); max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1); @@ -863,25 +875,34 @@ btr_page_reorganize_low( /* Copy the old page to temporary space */ buf_frame_copy(new_page, page); - if (!recovery) { + if (UNIV_LIKELY(!recovery)) { btr_search_drop_page_hash_index(page); } /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) 
is preserved intact */ - page_create(page, mtr, page_is_comp(page)); + page_create(page, NULL, mtr, page_is_comp(page)); buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the recreated page; do not copy the lock bits yet */ - page_copy_rec_list_end_no_locks(page, new_page, + page_copy_rec_list_end_no_locks(page, page_get_infimum_rec(new_page), index, mtr); /* Copy max trx id to recreated page */ page_set_max_trx_id(page, page_get_max_trx_id(new_page)); - if (!recovery) { + if (UNIV_LIKELY_NULL(page_zip)) { + if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page))) { + + /* Reorganizing a page should reduce entropy, + making the compressed page occupy less space. */ + ut_error; + } + } + + if (UNIV_LIKELY(!recovery)) { /* Update the record lock bitmaps */ lock_move_reorganize_page(page, new_page); } @@ -889,7 +910,8 @@ btr_page_reorganize_low( data_size2 = page_get_data_size(page); max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1); - if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) { + if (UNIV_UNLIKELY(data_size1 != data_size2) + || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) { buf_page_print(page); buf_page_print(new_page); fprintf(stderr, @@ -917,7 +939,9 @@ btr_page_reorganize( dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { - btr_page_reorganize_low(FALSE, page, index, mtr); + btr_page_reorganize_low(FALSE, page, + buf_block_get_page_zip(buf_block_align(page)), + index, mtr); } /*************************************************************** @@ -938,8 +962,10 @@ btr_parse_page_reorganize( /* The record is empty, except for the record initial part */ - if (page) { - btr_page_reorganize_low(TRUE, page, index, mtr); + if (UNIV_LIKELY(page != NULL)) { + page_zip_des_t* page_zip = buf_block_get_page_zip( + buf_block_align(page)); + btr_page_reorganize_low(TRUE, page, page_zip, index, mtr); } return(ptr); @@ -951,17 +977,20 @@ static void btr_page_empty( /*===========*/ - page_t* page, /* in: page to be emptied */ - mtr_t* mtr) /* in: mtr */ + page_t* page, /* in: page to be emptied */ + page_zip_des_t* page_zip,/* out: compressed page, or NULL */ + mtr_t* mtr) /* in: mtr */ { ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_zip || page_zip_validate(page_zip, page)); + btr_search_drop_page_hash_index(page); /* Recreate the page: note that global data on page (possible segment headers, next page-field, etc.) 
is preserved intact */ - page_create(page, mtr, page_is_comp(page)); + page_create(page, page_zip, mtr, page_is_comp(page)); buf_block_align(page)->check_index_page_at_flush = TRUE; } @@ -993,6 +1022,7 @@ btr_root_raise_and_insert( ulint level; rec_t* node_ptr_rec; page_cur_t* page_cursor; + page_zip_des_t* page_zip; root = btr_cur_get_page(cursor); tree = btr_cur_get_tree(cursor); @@ -1025,8 +1055,12 @@ btr_root_raise_and_insert( /* Move the records from root to the new page */ - page_move_rec_list_end(new_page, root, page_get_infimum_rec(root), - cursor->index, mtr); + page_zip = buf_block_get_page_zip(buf_block_align(new_page)); + + page_move_rec_list_end(new_page, page_zip, + page_get_infimum_rec(root), NULL, + cursor->index, mtr); + /* If this is a pessimistic insert which is actually done to perform a pessimistic update then we have stored the lock information of the record to be inserted on the infimum of the @@ -1046,7 +1080,7 @@ btr_root_raise_and_insert( node_ptr = dict_tree_build_node_ptr(tree, rec, new_page_no, heap, level); /* Reorganize the root to get free space */ - btr_page_reorganize(root, cursor->index, mtr); + btr_page_reorganize_low(FALSE, root, NULL, cursor->index, mtr); page_cursor = btr_cur_get_page_cur(cursor); @@ -1054,17 +1088,26 @@ btr_root_raise_and_insert( page_cur_set_before_first(root, page_cursor); - node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, - cursor->index, mtr); + node_ptr_rec = page_cur_tuple_insert(page_cursor, NULL, + node_ptr, cursor->index, mtr); ut_ad(node_ptr_rec); + page_zip = buf_block_get_page_zip(buf_block_align(root)); + /* The node pointer must be marked as the predefined minimum record, as there is no lower alphabetical limit to records in the leftmost node of a level: */ - btr_set_min_rec_mark(node_ptr_rec, page_is_comp(root), mtr); + btr_set_min_rec_mark(node_ptr_rec, NULL, mtr); + if (!UNIV_UNLIKELY(page_zip_compress(page_zip, root))) { + /* The root page should only contain the + node pointer to new_page at this point. + Thus, the data should fit. */ + ut_error; + } + /* Free the memory heap */ mem_heap_free(heap); @@ -1564,15 +1607,13 @@ btr_page_split_and_insert( mtr_t* mtr) /* in: mtr */ { dict_tree_t* tree; - page_t* page; - ulint page_no; byte direction; - ulint hint_page_no; - page_t* new_page; rec_t* split_rec; page_t* left_page; page_t* right_page; page_t* insert_page; + page_zip_des_t* left_page_zip; + page_zip_des_t* right_page_zip; page_cur_t* page_cursor; rec_t* first_rec; byte* buf = 0; /* remove warning */ @@ -1597,13 +1638,13 @@ func_start: ut_ad(rw_lock_own(dict_tree_get_lock(tree), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - page = btr_cur_get_page(cursor); + left_page = btr_cur_get_page(cursor); - ut_ad(mtr_memo_contains(mtr, buf_block_align(page), + ut_ad(mtr_memo_contains(mtr, buf_block_align(left_page), MTR_MEMO_PAGE_X_FIX)); - ut_ad(page_get_n_recs(page) >= 2); - - page_no = buf_frame_get_page_no(page); + ut_ad(page_get_n_recs(left_page) >= 2); + + left_page_zip = buf_block_get_page_zip(buf_block_align(left_page)); /* 1. 
Decide the split record; split_rec == NULL means that the tuple to be inserted should be the first record on the upper @@ -1611,26 +1652,24 @@ func_start: if (n_iterations > 0) { direction = FSP_UP; - hint_page_no = page_no + 1; split_rec = btr_page_get_sure_split_rec(cursor, tuple); } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { direction = FSP_UP; - hint_page_no = page_no + 1; } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) { direction = FSP_DOWN; - hint_page_no = page_no - 1; } else { direction = FSP_UP; - hint_page_no = page_no + 1; - split_rec = page_get_middle_rec(page); + split_rec = page_get_middle_rec(left_page); } /* 2. Allocate a new page to the tree */ - new_page = btr_page_alloc(tree, hint_page_no, direction, - btr_page_get_level(page, mtr), mtr); - btr_page_create(new_page, tree, mtr); + right_page = btr_page_alloc(tree, + buf_frame_get_page_no(left_page) + 1, + direction, + btr_page_get_level(left_page, mtr), mtr); + btr_page_create(right_page, tree, mtr); /* 3. Calculate the first record on the upper half-page, and the first record (move_limit) on original page which ends up on the @@ -1649,7 +1688,8 @@ func_start: /* 4. Do first the modifications in the tree structure */ - btr_attach_half_pages(tree, page, first_rec, new_page, direction, mtr); + btr_attach_half_pages(tree, left_page, first_rec, right_page, + direction, mtr); if (split_rec == NULL) { mem_free(buf); @@ -1667,34 +1707,31 @@ func_start: insert_will_fit = btr_page_insert_fits(cursor, split_rec, offsets, tuple, heap); } else { + mem_free(buf); insert_will_fit = btr_page_insert_fits(cursor, NULL, NULL, tuple, heap); } - if (insert_will_fit && (btr_page_get_level(page, mtr) == 0)) { + if (insert_will_fit && (btr_page_get_level(left_page, mtr) == 0)) { mtr_memo_release(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK); } /* 5. 
Move then the records to the new page */ - if (direction == FSP_DOWN) { -/* fputs("Split left\n", stderr); */ + right_page_zip = buf_block_get_page_zip(buf_block_align(right_page)); - page_move_rec_list_start(new_page, page, move_limit, - cursor->index, mtr); - left_page = new_page; - right_page = page; + page_move_rec_list_end(right_page, right_page_zip, + move_limit, left_page_zip, + cursor->index, mtr); + + if (UNIV_UNLIKELY(direction == FSP_DOWN)) { + fputs("Split left\n", stderr); /* TODO: coverage test */ lock_update_split_left(right_page, left_page); } else { /* fputs("Split right\n", stderr); */ - page_move_rec_list_end(new_page, page, move_limit, - cursor->index, mtr); - left_page = page; - right_page = new_page; - lock_update_split_right(right_page, left_page); } @@ -1722,9 +1759,12 @@ func_start: page_cur_search(insert_page, cursor->index, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); + rec = page_cur_tuple_insert(page_cursor, left_page_zip, + tuple, cursor->index, mtr); - if (rec != NULL) { + ut_ad(!left_page_zip || page_zip_validate(left_page_zip, left_page)); + + if (UNIV_LIKELY(rec != NULL)) { /* Insert fit on the page: update the free bits for the left and right pages in the same mtr */ @@ -1744,14 +1784,16 @@ func_start: page_cur_search(insert_page, cursor->index, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); + rec = page_cur_tuple_insert(page_cursor, left_page_zip, + tuple, cursor->index, mtr); + + if (UNIV_UNLIKELY(rec == NULL)) { - if (rec == NULL) { /* The insert did not fit on the page: loop back to the start of the function for a new split */ - /* We play safe and reset the free bits for new_page */ - ibuf_reset_free_bits(cursor->index, new_page); + /* We play safe and reset the free bits for right_page */ + ibuf_reset_free_bits(cursor->index, right_page); /* fprintf(stderr, "Split second round %lu\n", buf_frame_get_page_no(page)); */ @@ -1830,11 +1872,10 @@ void btr_set_min_rec_mark_log( /*=====================*/ rec_t* rec, /* in: record */ - ulint comp, /* nonzero=compact record format */ + byte type, /* in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */ mtr_t* mtr) /* in: mtr */ { - mlog_write_initial_log_record(rec, - comp ? MLOG_COMP_REC_MIN_MARK : MLOG_REC_MIN_MARK, mtr); + mlog_write_initial_log_record(rec, type, mtr); /* Write rec offset as a 2-byte ulint */ mlog_catenate_ulint(mtr, ut_align_offset(rec, UNIV_PAGE_SIZE), @@ -1863,11 +1904,14 @@ btr_parse_set_min_rec_mark( } if (page) { + page_zip_des_t* page_zip = buf_block_get_page_zip( + buf_block_align(page)); + ut_a(!page_is_comp(page) == !comp); rec = page + mach_read_from_2(ptr); - btr_set_min_rec_mark(rec, comp, mtr); + btr_set_min_rec_mark(rec, page_zip, mtr); } return(ptr + 2); @@ -1879,17 +1923,29 @@ Sets a record as the predefined minimum record. 
*/ void btr_set_min_rec_mark( /*=================*/ - rec_t* rec, /* in: record */ - ulint comp, /* in: nonzero=compact page format */ - mtr_t* mtr) /* in: mtr */ + rec_t* rec, /* in: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + mtr_t* mtr) /* in: mtr */ { ulint info_bits; - info_bits = rec_get_info_bits(rec, comp); + if (UNIV_LIKELY(page_rec_is_comp(rec))) { + info_bits = rec_get_info_bits(rec, TRUE); - rec_set_info_bits(rec, comp, info_bits | REC_INFO_MIN_REC_FLAG); + rec_set_info_bits_new(rec, page_zip, + info_bits | REC_INFO_MIN_REC_FLAG); - btr_set_min_rec_mark_log(rec, comp, mtr); + btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr); + } else { + ut_ad(!page_zip); + + info_bits = rec_get_info_bits(rec, FALSE); + + rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG); + + btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr); + } } /***************************************************************** @@ -1928,9 +1984,10 @@ btr_node_ptr_delete( If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. */ static -void +ibool btr_lift_page_up( /*=============*/ + /* out: TRUE on success */ dict_tree_t* tree, /* in: index tree */ page_t* page, /* in: page which is the only on its level; must not be empty: use @@ -1941,6 +1998,7 @@ btr_lift_page_up( page_t* father_page; ulint page_level; dict_index_t* index; + page_zip_des_t* father_page_zip; ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL); ut_ad(btr_page_get_next(page, mtr) == FIL_NULL); @@ -1948,22 +2006,39 @@ btr_lift_page_up( MTR_MEMO_PAGE_X_FIX)); father_page = buf_frame_align( btr_page_get_father_node_ptr(tree, page, mtr)); - + father_page_zip = buf_block_get_page_zip(buf_block_align(father_page)); + page_level = btr_page_get_level(page, mtr); index = UT_LIST_GET_FIRST(tree->tree_indexes); btr_search_drop_page_hash_index(page); /* Make the father empty */ - btr_page_empty(father_page, mtr); + btr_page_empty(father_page, NULL, mtr); /* Move records to the father */ - page_copy_rec_list_end(father_page, page, page_get_infimum_rec(page), - index, mtr); - lock_update_copy_and_discard(father_page, page); + if (!page_copy_rec_list_end(father_page, NULL, + page_get_infimum_rec(page), index, mtr)) { + ut_error; + } btr_page_set_level(father_page, page_level, mtr); + if (UNIV_LIKELY_NULL(father_page_zip)) { + if (UNIV_UNLIKELY(!page_zip_compress( + father_page_zip, father_page))) { + /* Restore the old page from temporary space */ + if (UNIV_UNLIKELY(!page_zip_decompress( + father_page_zip, father_page, mtr))) { + ut_error; /* probably memory corruption */ + } + + return(FALSE); + } + } + + lock_update_copy_and_discard(father_page, page); + /* Free the file page */ btr_page_free(tree, page, mtr); @@ -1971,6 +2046,8 @@ btr_lift_page_up( ibuf_reset_free_bits(index, father_page); ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(tree, father_page, mtr)); + + return(TRUE); } /***************************************************************** @@ -1981,12 +2058,12 @@ conditions, looks at the right brother. If the page is the only one on that level lifts the records of the page to the father page, thus reducing the tree height. It is assumed that mtr holds an x-latch on the tree and on the page. If cursor is on the leaf level, mtr must also hold x-latches to the -brothers, if they exist. 
NOTE: it is assumed that the caller has reserved -enough free extents so that the compression will always succeed if done! */ +brothers, if they exist. */ -void +ibool btr_compress( /*=========*/ + /* out: TRUE on success */ btr_cur_t* cursor, /* in: cursor on the page to merge or lift; the page must not be empty: in record delete use btr_discard_page if the page would become @@ -2001,20 +2078,16 @@ btr_compress( page_t* father_page; ibool is_left; page_t* page; - rec_t* orig_pred; - rec_t* orig_succ; rec_t* node_ptr; ulint data_size; ulint n_recs; ulint max_ins_size; ulint max_ins_size_reorg; ulint level; - ulint comp; page = btr_cur_get_page(cursor); tree = btr_cur_get_tree(cursor); - comp = page_is_comp(page); - ut_a((ibool)!!comp == cursor->index->table->comp); + ut_a((ibool)!!page_is_comp(page) == cursor->index->table->comp); ut_ad(mtr_memo_contains(mtr, dict_tree_get_lock(tree), MTR_MEMO_X_LOCK)); @@ -2030,34 +2103,33 @@ btr_compress( right_page_no); */ node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); - ut_ad(!comp || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); + ut_ad(!page_is_comp(page) + || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); father_page = buf_frame_align(node_ptr); - ut_a(comp == page_is_comp(father_page)); + ut_a(page_is_comp(page) == page_is_comp(father_page)); /* Decide the page to which we try to merge and which will inherit the locks */ - if (left_page_no != FIL_NULL) { + is_left = left_page_no != FIL_NULL; + + if (is_left) { - is_left = TRUE; merge_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); } else if (right_page_no != FIL_NULL) { - is_left = FALSE; merge_page = btr_page_get(space, right_page_no, RW_X_LATCH, mtr); } else { /* The page is the only one on the level, lift the records to the father */ - btr_lift_page_up(tree, page, mtr); - - return; + return(btr_lift_page_up(tree, page, mtr)); } n_recs = page_get_n_recs(page); data_size = page_get_data_size(page); - ut_a(page_is_comp(merge_page) == comp); + ut_a(page_is_comp(merge_page) == page_is_comp(page)); max_ins_size_reorg = page_get_max_insert_size_after_reorganize( merge_page, n_recs); @@ -2065,14 +2137,14 @@ btr_compress( /* No space for merge */ - return; + return(FALSE); } ut_ad(page_validate(merge_page, cursor->index)); max_ins_size = page_get_max_insert_size(merge_page, n_recs); - if (data_size > max_ins_size) { + if (UNIV_UNLIKELY(data_size > max_ins_size)) { /* We have to reorganize merge_page */ @@ -2083,13 +2155,14 @@ btr_compress( ut_ad(page_validate(merge_page, cursor->index)); ut_ad(page_get_max_insert_size(merge_page, n_recs) == max_ins_size_reorg); - } - if (data_size > max_ins_size) { + if (UNIV_UNLIKELY(data_size > max_ins_size)) { - /* Add fault tolerance, though this should never happen */ + /* Add fault tolerance, though this should + never happen */ - return; + return(FALSE); + } } btr_search_drop_page_hash_index(page); @@ -2118,17 +2191,27 @@ btr_compress( /* Move records to the merge page */ if (is_left) { - orig_pred = page_rec_get_prev( + rec_t* orig_pred = page_rec_get_prev( page_get_supremum_rec(merge_page)); - page_copy_rec_list_start(merge_page, page, - page_get_supremum_rec(page), cursor->index, mtr); + if (UNIV_UNLIKELY(!page_copy_rec_list_start( + merge_page, buf_block_get_page_zip( + buf_block_align(merge_page)), + page_get_supremum_rec(page), + cursor->index, mtr))) { + return(FALSE); + } lock_update_merge_left(merge_page, orig_pred, page); } else { - orig_succ = page_rec_get_next( + rec_t* orig_succ = page_rec_get_next( 
page_get_infimum_rec(merge_page)); - page_copy_rec_list_end(merge_page, page, - page_get_infimum_rec(page), cursor->index, mtr); + if (UNIV_UNLIKELY(!page_copy_rec_list_end( + merge_page, buf_block_get_page_zip( + buf_block_align(merge_page)), + page_get_infimum_rec(page), + cursor->index, mtr))) { + return(FALSE); + } lock_update_merge_right(orig_succ, page); } @@ -2143,6 +2226,7 @@ btr_compress( btr_page_free(tree, page, mtr); ut_ad(btr_check_node_ptr(tree, merge_page, mtr)); + return(TRUE); } /***************************************************************** @@ -2155,7 +2239,6 @@ btr_discard_only_page_on_level( page_t* page, /* in: page which is the only on its level */ mtr_t* mtr) /* in: mtr */ { - rec_t* node_ptr; page_t* father_page; ulint page_level; @@ -2165,8 +2248,8 @@ btr_discard_only_page_on_level( MTR_MEMO_PAGE_X_FIX)); btr_search_drop_page_hash_index(page); - node_ptr = btr_page_get_father_node_ptr(tree, page, mtr); - father_page = buf_frame_align(node_ptr); + father_page = buf_frame_align( + btr_page_get_father_node_ptr(tree, page, mtr)); page_level = btr_page_get_level(page, mtr); @@ -2177,10 +2260,13 @@ btr_discard_only_page_on_level( /* Free the file page */ btr_page_free(tree, page, mtr); - if (buf_frame_get_page_no(father_page) == dict_tree_get_page(tree)) { + if (UNIV_UNLIKELY(buf_frame_get_page_no(father_page) + == dict_tree_get_page(tree))) { /* The father is the root page */ - btr_page_empty(father_page, mtr); + btr_page_empty(father_page, + buf_block_get_page_zip(buf_block_align(father_page)), + mtr); /* We play safe and reset the free bits for the father */ ibuf_reset_free_bits(UT_LIST_GET_FIRST(tree->tree_indexes), @@ -2209,7 +2295,6 @@ btr_discard_page( ulint left_page_no; ulint right_page_no; page_t* merge_page; - ibool is_left; page_t* page; rec_t* node_ptr; @@ -2229,11 +2314,9 @@ btr_discard_page( right_page_no = btr_page_get_next(page, mtr); if (left_page_no != FIL_NULL) { - is_left = TRUE; merge_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); } else if (right_page_no != FIL_NULL) { - is_left = FALSE; merge_page = btr_page_get(space, right_page_no, RW_X_LATCH, mtr); } else { @@ -2249,12 +2332,21 @@ btr_discard_page( /* We have to mark the leftmost node pointer on the right side page as the predefined minimum record */ + page_zip_des_t* merge_page_zip; + merge_page_zip = buf_block_get_page_zip( + buf_block_align(merge_page)); + + if (UNIV_LIKELY_NULL(merge_page_zip) + && UNIV_UNLIKELY(!page_zip_alloc( + merge_page_zip, merge_page, 5))) { + ut_error; /* TODO: handle this gracefully */ + } node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page)); ut_ad(page_rec_is_user_rec(node_ptr)); - btr_set_min_rec_mark(node_ptr, page_is_comp(merge_page), mtr); + btr_set_min_rec_mark(node_ptr, merge_page_zip, mtr); } btr_node_ptr_delete(tree, page, mtr); @@ -2262,7 +2354,7 @@ btr_discard_page( /* Remove the page from the level list */ btr_level_list_remove(tree, page, mtr); - if (is_left) { + if (left_page_no != FIL_NULL) { lock_update_discard(page_get_supremum_rec(merge_page), page); } else { lock_update_discard(page_rec_get_next( diff --git a/btr/btr0cur.c b/btr/btr0cur.c index f81cce5b8e9..d6215736e3e 100644 --- a/btr/btr0cur.c +++ b/btr/btr0cur.c @@ -24,6 +24,7 @@ Created 10/16/1994 Heikki Tuuri #endif #include "page0page.h" +#include "page0zip.h" #include "rem0rec.h" #include "rem0cmp.h" #include "btr0btr.h" @@ -115,6 +116,35 @@ btr_rec_get_externally_stored_len( rec_t* rec, /* in: record */ const ulint* offsets);/* in: array returned by 
rec_get_offsets() */ +/********************************************************** +The following function is used to set the deleted bit of a record. */ +UNIV_INLINE +ibool +btr_rec_set_deleted_flag( +/*=====================*/ + /* out: TRUE on success; + FALSE on page_zip overflow */ + rec_t* rec, /* in/out: physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page (or NULL) */ + ulint flag) /* in: nonzero if delete marked */ +{ + if (page_rec_is_comp(rec)) { + if (UNIV_LIKELY_NULL(page_zip) + && UNIV_UNLIKELY(!page_zip_alloc(page_zip, + ut_align_down(rec, UNIV_PAGE_SIZE), 5))) { + rec_set_deleted_flag_new(rec, NULL, flag); + return(FALSE); + } + + rec_set_deleted_flag_new(rec, page_zip, flag); + } else { + ut_ad(!page_zip); + rec_set_deleted_flag_old(rec, flag); + } + + return(TRUE); +} + /*==================== B-TREE SEARCH =========================*/ /************************************************************************ @@ -405,19 +435,6 @@ btr_cur_search_to_nth_level( /* Loop and search until we arrive at the desired level */ for (;;) { - if ((height == 0) && (latch_mode <= BTR_MODIFY_LEAF)) { - - rw_latch = latch_mode; - - if (insert_planned && ibuf_should_try(index, - ignore_sec_unique)) { - - /* Try insert to the insert buffer if the - page is not in the buffer pool */ - - buf_mode = BUF_GET_IF_IN_POOL; - } - } retry_page_get: page = buf_page_get_gen(space, page_no, rw_latch, guess, buf_mode, @@ -460,7 +477,7 @@ retry_page_get: ut_ad(0 == ut_dulint_cmp(tree->id, btr_page_get_index_id(page))); - if (height == ULINT_UNDEFINED) { + if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) { /* We are in the root node */ height = btr_page_get_level(page, mtr); @@ -522,6 +539,21 @@ retry_page_get: ut_ad(height > 0); height--; + + if ((height == 0) && (latch_mode <= BTR_MODIFY_LEAF)) { + + rw_latch = latch_mode; + + if (insert_planned && ibuf_should_try(index, + ignore_sec_unique)) { + + /* Try insert to the insert buffer if the + page is not in the buffer pool */ + + buf_mode = BUF_GET_IF_IN_POOL; + } + } + guess = NULL; node_ptr = page_cur_get_rec(page_cursor); @@ -788,6 +820,7 @@ btr_cur_insert_if_possible( else NULL */ btr_cur_t* cursor, /* in: cursor on page after which to insert; cursor stays valid */ + page_zip_des_t* page_zip,/* in: compressed page of cursor */ dtuple_t* tuple, /* in: tuple to insert; the size info need not have been stored to tuple */ ibool* reorg, /* out: TRUE if reorganization occurred */ @@ -808,9 +841,10 @@ btr_cur_insert_if_possible( page_cursor = btr_cur_get_page_cur(cursor); /* Now, try the insert */ - rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr); + rec = page_cur_tuple_insert(page_cursor, page_zip, + tuple, cursor->index, mtr); - if (!rec) { + if (UNIV_UNLIKELY(!rec)) { /* If record did not fit, reorganize */ btr_page_reorganize(page, cursor->index, mtr); @@ -820,8 +854,8 @@ btr_cur_insert_if_possible( page_cur_search(page, cursor->index, tuple, PAGE_CUR_LE, page_cursor); - rec = page_cur_tuple_insert(page_cursor, tuple, - cursor->index, mtr); + rec = page_cur_tuple_insert(page_cursor, page_zip, + tuple, cursor->index, mtr); } return(rec); @@ -935,6 +969,7 @@ btr_cur_optimistic_insert( dict_index_t* index; page_cur_t* page_cursor; page_t* page; + page_zip_des_t* page_zip; ulint max_size; rec_t* dummy_rec; ulint level; @@ -1033,9 +1068,10 @@ calculate_sizes_again: reorg = FALSE; /* Now, try the insert */ + page_zip = buf_block_get_page_zip(buf_block_align(page)); - *rec = page_cur_insert_rec_low(page_cursor, entry, index, - 
NULL, NULL, mtr); + *rec = page_cur_insert_rec_low(page_cursor, page_zip, + entry, index, NULL, NULL, mtr); if (UNIV_UNLIKELY(!(*rec))) { /* If the record did not fit, reorganize */ btr_page_reorganize(page, index, mtr); @@ -1046,9 +1082,15 @@ calculate_sizes_again: page_cur_search(page, index, entry, PAGE_CUR_LE, page_cursor); - *rec = page_cur_tuple_insert(page_cursor, entry, index, mtr); + *rec = page_cur_tuple_insert(page_cursor, page_zip, + entry, index, mtr); if (UNIV_UNLIKELY(!*rec)) { + if (UNIV_LIKELY_NULL(page_zip)) { + /* Likely a compressed page overflow */ + return(DB_FAIL); + } + fputs("InnoDB: Error: cannot insert tuple ", stderr); dtuple_print(stderr, entry); fputs(" into ", stderr); @@ -1343,7 +1385,8 @@ btr_cur_parse_update_in_place( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ dict_index_t* index) /* in: index corresponding to page */ { ulint flags; @@ -1399,12 +1442,19 @@ btr_cur_parse_update_in_place( offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields_in_recovery(rec, offsets, + row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets, pos, trx_id, roll_ptr); } row_upd_rec_in_place(rec, offsets, update); + if (UNIV_LIKELY_NULL(page_zip)) { + btr_cur_unmark_extern_fields(rec, NULL, offsets); + + page_zip_write(page_zip, rec - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + } + func_exit: mem_heap_free(heap); @@ -1431,6 +1481,7 @@ btr_cur_update_in_place( { dict_index_t* index; buf_block_t* block; + page_zip_des_t* page_zip; ulint err; rec_t* rec; dulint roll_ptr = ut_dulint_zero; @@ -1465,8 +1516,12 @@ btr_cur_update_in_place( } block = buf_block_align(rec); - ut_ad(!!page_is_comp(buf_block_get_frame(block)) - == index->table->comp); + + page_zip = buf_block_get_page_zip(block); + if (UNIV_UNLIKELY(!page_zip_alloc(page_zip, buf_block_get_frame(block), + 4 + rec_offs_size(offsets)))) { + return(DB_OVERFLOW); + } if (block->is_hashed) { /* The function row_upd_changes_ord_field_binary works only @@ -1484,7 +1539,8 @@ btr_cur_update_in_place( } if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); + row_upd_rec_sys_fields(rec, NULL, + index, offsets, trx, roll_ptr); } /* FIXME: in a mixed tree, all records may not have enough ordering @@ -1506,9 +1562,22 @@ btr_cur_update_in_place( /* The new updated record owns its possible externally stored fields */ + if (UNIV_LIKELY_NULL(page_zip)) { + /* Do not log the btr_cur_unmark_extern_fields() + if the page is compressed. Do the operation in + crash recovery of MLOG_COMP_REC_UPDATE_IN_PLACE + in that case. 
*/ + mtr = NULL; + } + btr_cur_unmark_extern_fields(rec, mtr, offsets); } + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, rec - rec_offs_extra_size(offsets), + rec_offs_size(offsets)); + } + if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -1543,7 +1612,10 @@ btr_cur_optimistic_update( page_cur_t* page_cursor; ulint err; page_t* page; + page_zip_des_t* page_zip; + page_zip_des_t* page_zip_used; rec_t* rec; + rec_t* orig_rec; ulint max_size; ulint new_rec_size; ulint old_rec_size; @@ -1556,7 +1628,7 @@ btr_cur_optimistic_update( ulint* offsets; page = btr_cur_get_page(cursor); - rec = btr_cur_get_rec(cursor); + orig_rec = rec = btr_cur_get_rec(cursor); index = cursor->index; ut_ad(!!page_rec_is_comp(rec) == index->table->comp); @@ -1663,7 +1735,18 @@ btr_cur_optimistic_update( btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, index, offsets, mtr); + page_zip = buf_block_get_page_zip(buf_block_align(page)); + if (UNIV_LIKELY(!page_zip) + || UNIV_UNLIKELY(!page_zip_available(page_zip, 32))) { + /* If there is not enough space in the page + modification log, ignore the log and + try compressing the page afterwards. */ + page_zip_used = NULL; + } else { + page_zip_used = page_zip; + } + + page_cur_delete_rec(page_cursor, index, offsets, page_zip_used, mtr); page_cur_move_to_prev(page_cursor); @@ -1676,7 +1759,8 @@ btr_cur_optimistic_update( trx->id); } - rec = btr_cur_insert_if_possible(cursor, new_entry, &reorganized, mtr); + rec = btr_cur_insert_if_possible(cursor, page_zip_used, + new_entry, &reorganized, mtr); ut_a(rec); /* <- We calculated above the insert would fit */ @@ -1689,6 +1773,22 @@ btr_cur_optimistic_update( btr_cur_unmark_extern_fields(rec, mtr, offsets); } + if (UNIV_LIKELY_NULL(page_zip) && UNIV_UNLIKELY(!page_zip_used)) { + if (!page_zip_compress(page_zip, page)) { + + if (UNIV_UNLIKELY(!page_zip_decompress( + page_zip, page, mtr))) { + ut_error; + } + /* TODO: is this correct? 
*/ + lock_rec_restore_from_page_infimum(orig_rec, page); + + mem_heap_free(heap); + + return(DB_OVERFLOW); + } + } + /* Restore the old explicit lock state on the record */ lock_rec_restore_from_page_infimum(rec, page); @@ -1768,6 +1868,7 @@ btr_cur_pessimistic_update( big_rec_t* dummy_big_rec; dict_index_t* index; page_t* page; + page_zip_des_t* page_zip; dict_tree_t* tree; rec_t* rec; page_cur_t* page_cursor; @@ -1790,6 +1891,7 @@ btr_cur_pessimistic_update( *big_rec = NULL; page = btr_cur_get_page(cursor); + page_zip = buf_block_get_page_zip(buf_block_align(page)); rec = btr_cur_get_rec(cursor); index = cursor->index; tree = index->tree; @@ -1906,11 +2008,11 @@ btr_cur_pessimistic_update( btr_search_update_hash_on_delete(cursor); - page_cur_delete_rec(page_cursor, index, offsets, mtr); + page_cur_delete_rec(page_cursor, index, offsets, page_zip, mtr); page_cur_move_to_prev(page_cursor); - rec = btr_cur_insert_if_possible(cursor, new_entry, + rec = btr_cur_insert_if_possible(cursor, page_zip, new_entry, &dummy_reorganized, mtr); ut_a(rec || optim_err != DB_UNDERFLOW); @@ -2045,8 +2147,9 @@ btr_cur_parse_del_mark_set_clust_rec( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ - dict_index_t* index, /* in: index corresponding to page */ - page_t* page) /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dict_index_t* index) /* in: index corresponding to page */ { ulint flags; ulint val; @@ -2087,13 +2190,25 @@ btr_cur_parse_del_mark_set_clust_rec( if (page) { rec = page + offset; - + + /* We do not need to reserve btr_search_latch, as the page + is only being recovered, and there cannot be a hash index to + it. */ + + if (UNIV_UNLIKELY(!btr_rec_set_deleted_flag(rec, + page_zip, val))) { + /* page_zip overflow should have been detected + before writing MLOG_COMP_REC_CLUST_DELETE_MARK */ + ut_error; + } + if (!(flags & BTR_KEEP_SYS_FLAG)) { mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; *offsets_ = (sizeof offsets_) / sizeof *offsets_; - row_upd_rec_sys_fields_in_recovery(rec, + /* TODO: page_zip_write(whole record)? */ + row_upd_rec_sys_fields_in_recovery(rec, page_zip, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), pos, trx_id, roll_ptr); @@ -2101,12 +2216,6 @@ btr_cur_parse_del_mark_set_clust_rec( mem_heap_free(heap); } } - - /* We do not need to reserve btr_search_latch, as the page - is only being recovered, and there cannot be a hash index to - it. 
*/ - - rec_set_deleted_flag(rec, page_is_comp(page), val); } return(ptr); @@ -2134,6 +2243,7 @@ btr_cur_del_mark_set_clust_rec( dulint roll_ptr; ulint err; rec_t* rec; + page_zip_des_t* page_zip; trx_t* trx; mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; @@ -2155,15 +2265,28 @@ btr_cur_del_mark_set_clust_rec( ut_ad(index->type & DICT_CLUSTERED); ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + page_zip = buf_block_get_page_zip(buf_block_align(rec)); + if (UNIV_LIKELY_NULL(page_zip)) { + ulint size = 5; + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + size += 21;/* row_upd_rec_sys_fields() */ + } + + if (UNIV_UNLIKELY(!page_zip_alloc(page_zip, + ut_align_down(rec, UNIV_PAGE_SIZE), size))) { + + err = DB_OVERFLOW; + goto func_exit; + } + } + err = lock_clust_rec_modify_check_and_lock(flags, rec, index, offsets, thr); if (err != DB_SUCCESS) { - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(err); + goto func_exit; } err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr, @@ -2171,10 +2294,7 @@ btr_cur_del_mark_set_clust_rec( &roll_ptr); if (err != DB_SUCCESS) { - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(err); + goto func_exit; } block = buf_block_align(rec); @@ -2183,12 +2303,13 @@ btr_cur_del_mark_set_clust_rec( rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, rec_offs_comp(offsets), val); + btr_rec_set_deleted_flag(rec, page_zip, val); trx = thr_get_trx(thr); if (!(flags & BTR_KEEP_SYS_FLAG)) { - row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr); + row_upd_rec_sys_fields(rec, page_zip, + index, offsets, trx, roll_ptr); } if (block->is_hashed) { @@ -2197,10 +2318,12 @@ btr_cur_del_mark_set_clust_rec( btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx, roll_ptr, mtr); + +func_exit: if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } - return(DB_SUCCESS); + return(err); } /******************************************************************** @@ -2246,7 +2369,8 @@ btr_cur_parse_del_mark_set_sec_rec( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ - page_t* page) /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip)/* in/out: compressed page, or NULL */ { ulint val; ulint offset; @@ -2272,7 +2396,10 @@ btr_cur_parse_del_mark_set_sec_rec( is only being recovered, and there cannot be a hash index to it. */ - rec_set_deleted_flag(rec, page_is_comp(page), val); + if (!btr_rec_set_deleted_flag(rec, page_zip, val)) { + /* page_zip overflow should have been detected + before writing MLOG_COMP_REC_SEC_DELETE_MARK */ + } } return(ptr); @@ -2293,6 +2420,7 @@ btr_cur_del_mark_set_sec_rec( mtr_t* mtr) /* in: mtr */ { buf_block_t* block; + page_zip_des_t* page_zip; rec_t* rec; ulint err; @@ -2316,13 +2444,15 @@ btr_cur_del_mark_set_sec_rec( block = buf_block_align(rec); ut_ad(!!page_is_comp(buf_block_get_frame(block)) == cursor->index->table->comp); + page_zip = buf_block_get_page_zip(block); if (block->is_hashed) { rw_lock_x_lock(&btr_search_latch); } - rec_set_deleted_flag(rec, page_is_comp(buf_block_get_frame(block)), - val); + if (!btr_rec_set_deleted_flag(rec, page_zip, val)) { + ut_error; /* TODO */ + } if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); @@ -2344,40 +2474,16 @@ btr_cur_del_unmark_for_ibuf( mtr_t* mtr) /* in: mtr */ { /* We do not need to reserve btr_search_latch, as the page has just + been read to the buffer pool and there cannot be a hash index to it. 
*/ - rec_set_deleted_flag(rec, page_is_comp(buf_frame_align(rec)), FALSE); + btr_rec_set_deleted_flag(rec, NULL, FALSE); btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr); } /*==================== B-TREE RECORD REMOVE =========================*/ -/***************************************************************** -Tries to compress a page of the tree on the leaf level. It is assumed -that mtr holds an x-latch on the tree and on the cursor page. To avoid -deadlocks, mtr must also own x-latches to brothers of page, if those -brothers exist. NOTE: it is assumed that the caller has reserved enough -free extents so that the compression will always succeed if done! */ - -void -btr_cur_compress( -/*=============*/ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid */ - mtr_t* mtr) /* in: mtr */ -{ - ut_ad(mtr_memo_contains(mtr, - dict_tree_get_lock(btr_cur_get_tree(cursor)), - MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, buf_block_align( - btr_cur_get_page(cursor)), - MTR_MEMO_PAGE_X_FIX)); - ut_ad(btr_page_get_level(btr_cur_get_page(cursor), mtr) == 0); - - btr_compress(cursor, mtr); -} - /***************************************************************** Tries to compress a page of the tree if it seems useful. It is assumed that mtr holds an x-latch on the tree and on the cursor page. To avoid @@ -2403,9 +2509,7 @@ btr_cur_compress_if_useful( if (btr_cur_compress_recommendation(cursor, mtr)) { - btr_compress(cursor, mtr); - - return(TRUE); + return(btr_compress(cursor, mtr)); } return(FALSE); @@ -2454,17 +2558,41 @@ btr_cur_optimistic_delete( if (no_compress_needed) { + page_zip_des_t* page_zip; + page_zip_des_t* page_zip_used; + lock_update_delete(rec); btr_search_update_hash_on_delete(cursor); max_ins_size = page_get_max_insert_size_after_reorganize(page, 1); + page_zip = buf_block_get_page_zip( + buf_block_align(btr_cur_get_page(cursor))); + + if (UNIV_LIKELY(!page_zip) + || UNIV_UNLIKELY(!page_zip_available(page_zip, 32))) { + /* If there is not enough space in the page + modification log, ignore the log and + try compressing the page afterwards. */ + page_zip_used = NULL; + } else { + page_zip_used = page_zip; + } + page_cur_delete_rec(btr_cur_get_page_cur(cursor), - cursor->index, offsets, mtr); + cursor->index, offsets, + page_zip_used, mtr); ibuf_update_free_bits_low(cursor->index, page, max_ins_size, mtr); + + if (UNIV_LIKELY_NULL(page_zip) + && UNIV_UNLIKELY(!page_zip_used)) { + /* Reorganize the page to ensure that the + compression succeeds after deleting the record. */ + btr_page_reorganize(page, cursor->index, mtr); + } } if (UNIV_LIKELY_NULL(heap)) { @@ -2503,6 +2631,8 @@ btr_cur_pessimistic_delete( mtr_t* mtr) /* in: mtr */ { page_t* page; + page_zip_des_t* page_zip; + page_zip_des_t* page_zip_used; dict_tree_t* tree; rec_t* rec; dtuple_t* node_ptr; @@ -2546,7 +2676,7 @@ btr_cur_pessimistic_delete( /* Free externally stored fields if the record is neither a node pointer nor in two-byte format. - This avoids an unnecessary loop. */ + This condition avoids an unnecessary loop. */ if (page_is_comp(page) ? 
!rec_get_node_ptr_flag(rec) : !rec_get_1byte_offs_flag(rec)) { @@ -2569,6 +2699,14 @@ btr_cur_pessimistic_delete( goto return_after_reservations; } + page_zip = buf_block_get_page_zip(buf_block_align(page)); + if (UNIV_LIKELY(!page_zip) + || UNIV_UNLIKELY(!page_zip_available(page_zip, 32))) { + page_zip_used = NULL; + } else { + page_zip_used = page_zip; + } + lock_update_delete(rec); level = btr_page_get_level(page, mtr); @@ -2584,8 +2722,13 @@ btr_cur_pessimistic_delete( non-leaf level, we must mark the new leftmost node pointer as the predefined minimum record */ - btr_set_min_rec_mark(next_rec, page_is_comp(page), - mtr); + if (UNIV_LIKELY_NULL(page_zip_used) + && UNIV_UNLIKELY(!page_zip_available( + page_zip_used, 5 + 32))) { + page_zip_used = NULL; + } + + btr_set_min_rec_mark(next_rec, page_zip_used, mtr); } else { /* Otherwise, if we delete the leftmost node pointer on a page, we have to change the father node pointer @@ -2607,10 +2750,16 @@ btr_cur_pessimistic_delete( btr_search_update_hash_on_delete(cursor); page_cur_delete_rec(btr_cur_get_page_cur(cursor), cursor->index, - offsets, mtr); + offsets, page_zip_used, mtr); ut_ad(btr_check_node_ptr(tree, page, mtr)); + if (UNIV_LIKELY_NULL(page_zip) && UNIV_UNLIKELY(!page_zip_used)) { + /* Reorganize the page to ensure that the + compression succeeds after deleting the record. */ + btr_page_reorganize(page, cursor->index, mtr); + } + *err = DB_SUCCESS; return_after_reservations: @@ -3038,7 +3187,7 @@ btr_cur_set_ownership_of_extern_field( const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint i, /* in: field number */ ibool val, /* in: value to set */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr) /* in: mtr, or NULL if not logged */ { byte* data; ulint local_len; @@ -3057,9 +3206,13 @@ btr_cur_set_ownership_of_extern_field( } else { byte_val = byte_val | BTR_EXTERN_OWNER_FLAG; } - - mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val, + + if (UNIV_LIKELY(mtr != NULL)) { + mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val, MLOG_1BYTE, mtr); + } else { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + } } /*********************************************************************** @@ -3074,9 +3227,8 @@ btr_cur_mark_extern_inherited_fields( rec_t* rec, /* in: record in a clustered index */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ upd_t* update, /* in: update vector */ - mtr_t* mtr) /* in: mtr */ + mtr_t* mtr) /* in: mtr, or NULL if not logged */ { - ibool is_updated; ulint n; ulint j; ulint i; @@ -3089,22 +3241,22 @@ btr_cur_mark_extern_inherited_fields( if (rec_offs_nth_extern(offsets, i)) { /* Check it is not in updated fields */ - is_updated = FALSE; if (update) { for (j = 0; j < upd_get_n_fields(update); j++) { if (upd_get_nth_field(update, j) ->field_no == i) { - is_updated = TRUE; + + goto updated; } } } - if (!is_updated) { - btr_cur_set_ownership_of_extern_field(rec, - offsets, i, FALSE, mtr); - } + btr_cur_set_ownership_of_extern_field(rec, + offsets, i, FALSE, mtr); +updated: + ; } } } @@ -3176,7 +3328,7 @@ void btr_cur_unmark_extern_fields( /*=========================*/ rec_t* rec, /* in: record in a clustered index */ - mtr_t* mtr, /* in: mtr */ + mtr_t* mtr, /* in: mtr, or NULL if not logged */ const ulint* offsets)/* in: array returned by rec_get_offsets() */ { ulint n; @@ -3188,8 +3340,8 @@ btr_cur_unmark_extern_fields( for (i = 0; i < n; i++) { if (rec_offs_nth_extern(offsets, i)) { - btr_cur_set_ownership_of_extern_field(rec, offsets, i, - TRUE, 
mtr); + btr_cur_set_ownership_of_extern_field(rec, + offsets, i, TRUE, mtr); } } } @@ -3468,7 +3620,7 @@ btr_store_big_rec_extern_fields( rec_set_nth_field_extern_bit(rec, index, big_rec_vec->fields[i].field_no, - TRUE, &mtr); + &mtr); } prev_page_no = page_no; diff --git a/buf/buf0buf.c b/buf/buf0buf.c index 3a3b64dd51b..441a2b8ae55 100644 --- a/buf/buf0buf.c +++ b/buf/buf0buf.c @@ -37,6 +37,7 @@ Created 11/5/1995 Heikki Tuuri #include "log0log.h" #include "trx0undo.h" #include "srv0srv.h" +#include "page0zip.h" /* IMPLEMENTATION OF THE BUFFER POOL @@ -482,6 +483,8 @@ buf_block_init( block->n_pointers = 0; + page_zip_des_init(&block->page_zip); + rw_lock_create(&(block->lock)); ut_ad(rw_lock_validate(&(block->lock))); diff --git a/buf/buf0flu.c b/buf/buf0flu.c index e39d1ae0a71..312fd5608b2 100644 --- a/buf/buf0flu.c +++ b/buf/buf0flu.c @@ -264,9 +264,11 @@ buf_flush_buffered_writes(void) "InnoDB: before posting to the doublewrite buffer.\n"); } - if (block->check_index_page_at_flush - && !page_simple_validate(block->frame)) { - + if (!block->check_index_page_at_flush) { + } else if (page_is_comp(block->frame) + && UNIV_UNLIKELY(!page_simple_validate_new( + block->frame))) { +corrupted_page: buf_page_print(block->frame); ut_print_timestamp(stderr); @@ -278,6 +280,10 @@ buf_flush_buffered_writes(void) (ulong) block->offset, (ulong) block->space); ut_error; + } else if (UNIV_UNLIKELY(!page_simple_validate_old( + block->frame))) { + + goto corrupted_page; } } diff --git a/ibuf/ibuf0ibuf.c b/ibuf/ibuf0ibuf.c index d7fa48b6e66..7f48789d35a 100644 --- a/ibuf/ibuf0ibuf.c +++ b/ibuf/ibuf0ibuf.c @@ -2761,6 +2761,7 @@ ibuf_insert( ut_ad(dtuple_check_typed(entry)); ut_a(!(index->type & DICT_CLUSTERED)); + ut_a(!index->table->zip); if (rec_get_converted_size(index, entry) >= page_get_free_space_of_empty(index->table->comp) / 2) { @@ -2846,9 +2847,10 @@ ibuf_insert_to_index_page( btr_cur_del_unmark_for_ibuf(rec, mtr); } else { - rec = page_cur_tuple_insert(&page_cur, entry, index, mtr); + rec = page_cur_tuple_insert(&page_cur, NULL, + entry, index, mtr); - if (rec == NULL) { + if (UNIV_UNLIKELY(rec == NULL)) { /* If the record did not fit, reorganize */ btr_page_reorganize(page, index, mtr); @@ -2858,7 +2860,8 @@ ibuf_insert_to_index_page( /* This time the record must fit */ if (UNIV_UNLIKELY(!page_cur_tuple_insert( - &page_cur, entry, index, mtr))) { + &page_cur, NULL, + entry, index, mtr))) { ut_print_timestamp(stderr); diff --git a/include/btr0btr.h b/include/btr0btr.h index d28b0b129a1..62ef2c8295c 100644 --- a/include/btr0btr.h +++ b/include/btr0btr.h @@ -265,9 +265,10 @@ Sets a record as the predefined minimum record. */ void btr_set_min_rec_mark( /*=================*/ - rec_t* rec, /* in: record */ - ulint comp, /* in: nonzero=compact page format */ - mtr_t* mtr); /* in: mtr */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + mtr_t* mtr); /* in: mtr */ /***************************************************************** Deletes on the upper level the node pointer to a page. */ @@ -295,11 +296,12 @@ conditions, looks at the right brother. If the page is the only one on that level lifts the records of the page to the father page, thus reducing the tree height. It is assumed that mtr holds an x-latch on the tree and on the page. If cursor is on the leaf level, mtr must also hold x-latches to -the brothers, if they exist. 
NOTE: it is assumed that the caller has reserved -enough free extents so that the compression will always succeed if done! */ -void +the brothers, if they exist. */ + +ibool btr_compress( /*=========*/ + /* out: TRUE on success */ btr_cur_t* cursor, /* in: cursor on the page to merge or lift; the page must not be empty: in record delete use btr_discard_page if the page would become diff --git a/include/btr0cur.h b/include/btr0cur.h index 352d1739b6a..99f38a97a50 100644 --- a/include/btr0cur.h +++ b/include/btr0cur.h @@ -286,19 +286,6 @@ btr_cur_del_unmark_for_ibuf( rec_t* rec, /* in: record to delete unmark */ mtr_t* mtr); /* in: mtr */ /***************************************************************** -Tries to compress a page of the tree on the leaf level. It is assumed -that mtr holds an x-latch on the tree and on the cursor page. To avoid -deadlocks, mtr must also own x-latches to brothers of page, if those -brothers exist. NOTE: it is assumed that the caller has reserved enough -free extents so that the compression will always succeed if done! */ - -void -btr_cur_compress( -/*=============*/ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid */ - mtr_t* mtr); /* in: mtr */ -/***************************************************************** Tries to compress a page of the tree if it seems useful. It is assumed that mtr holds an x-latch on the tree and on the cursor page. To avoid deadlocks, mtr must also own x-latches to brothers of page, if those @@ -364,7 +351,8 @@ btr_cur_parse_update_in_place( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ dict_index_t* index); /* in: index corresponding to page */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a clustered @@ -376,8 +364,9 @@ btr_cur_parse_del_mark_set_clust_rec( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ - dict_index_t* index, /* in: index corresponding to page */ - page_t* page); /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + dict_index_t* index); /* in: index corresponding to page */ /******************************************************************** Parses the redo log record for delete marking or unmarking of a secondary index record. */ @@ -388,7 +377,8 @@ btr_cur_parse_del_mark_set_sec_rec( /* out: end of log record or NULL */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ - page_t* page); /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip);/* in/out: compressed page, or NULL */ /*********************************************************************** Estimates the number of rows in a given index range. 
*/ diff --git a/include/buf0buf.h b/include/buf0buf.h index ae8d0411c12..5705f2f89a7 100644 --- a/include/buf0buf.h +++ b/include/buf0buf.h @@ -31,6 +31,7 @@ Created 11/5/1995 Heikki Tuuri #include "hash0hash.h" #include "ut0byte.h" #include "os0proc.h" +#include "page0types.h" /* Flags for flush types */ #define BUF_FLUSH_LRU 1 @@ -612,6 +613,14 @@ buf_block_get_page_no( /*==================*/ /* out: page number */ buf_block_t* block); /* in: pointer to the control block */ +/************************************************************************* +Gets the compressed page descriptor of a block if applicable. */ +UNIV_INLINE +page_zip_des_t* +buf_block_get_page_zip( +/*===================*/ + /* out: compressed page descriptor, or NULL */ + buf_block_t* block); /* in: pointer to the control block */ /*********************************************************************** Gets the block to whose frame the pointer is pointing to. */ UNIV_INLINE @@ -850,6 +859,7 @@ struct buf_block_struct{ ulint curr_side; /* BTR_SEARCH_LEFT_SIDE or BTR_SEARCH_RIGHT_SIDE in hash indexing */ + page_zip_des_t page_zip; /* compressed page info */ /* 6. Debug fields */ #ifdef UNIV_SYNC_DEBUG rw_lock_t debug_latch; /* in the debug version, each thread diff --git a/include/buf0buf.ic b/include/buf0buf.ic index d949254d47d..ed5172fa0b0 100644 --- a/include/buf0buf.ic +++ b/include/buf0buf.ic @@ -191,6 +191,24 @@ buf_block_get_page_no( return(block->offset); } +/************************************************************************* +Gets the compressed page descriptor of a block if applicable. */ +UNIV_INLINE +page_zip_des_t* +buf_block_get_page_zip( +/*===================*/ + /* out: compressed page descriptor, or NULL */ + buf_block_t* block) /* in: pointer to the control block */ +{ + ut_ad(block); + + if (UNIV_LIKELY_NULL(block->page_zip.data)) { + return(&block->page_zip); + } + + return(NULL); +} + /*********************************************************************** Gets the block to whose frame the pointer is pointing to. */ UNIV_INLINE @@ -614,8 +632,6 @@ buf_page_release( RW_NO_LATCH */ mtr_t* mtr) /* in: mtr */ { - ulint buf_fix_count; - ut_ad(block); mutex_enter_fast(&(buf_pool->mutex)); @@ -631,8 +647,7 @@ buf_page_release( #ifdef UNIV_SYNC_DEBUG rw_lock_s_unlock(&(block->debug_latch)); #endif - buf_fix_count = block->buf_fix_count; - block->buf_fix_count = buf_fix_count - 1; + block->buf_fix_count--; mutex_exit(&(buf_pool->mutex)); diff --git a/include/dict0mem.h b/include/dict0mem.h index 7eec86d0bcb..8eca7296f00 100644 --- a/include/dict0mem.h +++ b/include/dict0mem.h @@ -329,6 +329,7 @@ struct dict_table_struct{ user calls DISCARD TABLESPACE on this table, and reset to FALSE in IMPORT TABLESPACE */ ibool comp; /* flag: TRUE=compact page format */ + ibool zip; /* flag: TRUE=compressed page format */ hash_node_t name_hash; /* hash chain node */ hash_node_t id_hash; /* hash chain node */ ulint n_def; /* number of columns defined so far */ diff --git a/include/ibuf0ibuf.ic b/include/ibuf0ibuf.ic index 68f7ce9c1d0..52ec9d90e30 100644 --- a/include/ibuf0ibuf.ic +++ b/include/ibuf0ibuf.ic @@ -88,6 +88,7 @@ ibuf_should_try( decide */ { if (!(index->type & DICT_CLUSTERED) + && !index->table->zip && (ignore_sec_unique || !(index->type & DICT_UNIQUE)) && ibuf->meter > IBUF_THRESHOLD) { diff --git a/include/mtr0mtr.h b/include/mtr0mtr.h index f44e813cf6b..04827008587 100644 --- a/include/mtr0mtr.h +++ b/include/mtr0mtr.h @@ -129,8 +129,11 @@ flag value must give the length also! 
*/ /* copy compact record list end to a new created index page */ #define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /* reorganize an index page */ +#define MLOG_COMP_DECOMPRESS ((byte)47) /* decompress a page + to undo a compressed page + overflow */ -#define MLOG_BIGGEST_TYPE ((byte)46) /* biggest value (used in +#define MLOG_BIGGEST_TYPE ((byte)47) /* biggest value (used in asserts) */ /******************************************************************* diff --git a/include/page0cur.h b/include/page0cur.h index b03302b0e77..a926bfd9a36 100644 --- a/include/page0cur.h +++ b/include/page0cur.h @@ -130,6 +130,8 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mini-transaction handle */ @@ -144,6 +146,8 @@ page_cur_rec_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ rec_t* rec, /* in: record to insert */ dict_index_t* index, /* in: record descriptor */ ulint* offsets,/* in: rec_get_offsets(rec, index) */ @@ -160,6 +164,8 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ dict_index_t* index, /* in: record descriptor */ rec_t* rec, /* in: pointer to a physical record or NULL */ @@ -173,7 +179,6 @@ void page_copy_rec_list_end_to_created_page( /*===================================*/ page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ rec_t* rec, /* in: first record to copy */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mtr */ @@ -184,9 +189,11 @@ next record after the deleted one. */ void page_cur_delete_rec( /*================*/ - page_cur_t* cursor, /* in: a page cursor */ + page_cur_t* cursor, /* in/out: a page cursor */ dict_index_t* index, /* in: record descriptor */ const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 32 bytes available, or NULL */ mtr_t* mtr); /* in: mini-transaction handle */ /******************************************************************** Searches the right position for a page cursor. */ @@ -245,7 +252,9 @@ page_cur_parse_insert_rec( byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /************************************************************** Parses a log record of copying a record list end to a new created page. 
*/ @@ -257,7 +266,8 @@ page_parse_copy_rec_list_to_created_page( byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /*************************************************************** Parses log record of a record delete on a page. */ @@ -269,7 +279,9 @@ page_cur_parse_delete_rec( byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 32 bytes available, or NULL */ mtr_t* mtr); /* in: mtr or NULL */ /* Index page cursor */ diff --git a/include/page0cur.ic b/include/page0cur.ic index f8346819e84..aac873d24d0 100644 --- a/include/page0cur.ic +++ b/include/page0cur.ic @@ -181,11 +181,14 @@ page_cur_tuple_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mini-transaction handle */ { - return(page_cur_insert_rec_low(cursor, tuple, index, NULL, NULL, mtr)); + return(page_cur_insert_rec_low(cursor, page_zip, tuple, + index, NULL, NULL, mtr)); } /*************************************************************** @@ -199,12 +202,14 @@ page_cur_rec_insert( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ rec_t* rec, /* in: record to insert */ dict_index_t* index, /* in: record descriptor */ ulint* offsets,/* in: rec_get_offsets(rec, index) */ mtr_t* mtr) /* in: mini-transaction handle */ { - return(page_cur_insert_rec_low(cursor, NULL, index, rec, - offsets, mtr)); + return(page_cur_insert_rec_low(cursor, page_zip, NULL, + index, rec, offsets, mtr)); } diff --git a/include/page0page.h b/include/page0page.h index c4ffa39d3ac..89121e50086 100644 --- a/include/page0page.h +++ b/include/page0page.h @@ -145,7 +145,7 @@ Sets the max trx id field value. */ void page_set_max_trx_id( /*================*/ - page_t* page, /* in: page */ + page_t* page, /* in/out: page */ dulint trx_id);/* in: transaction id */ /***************************************************************** Sets the max trx id field value if trx_id is bigger than the previous @@ -154,8 +154,8 @@ UNIV_INLINE void page_update_max_trx_id( /*===================*/ - page_t* page, /* in: page */ - dulint trx_id); /* in: transaction id */ + page_t* page, /* in/out: page */ + dulint trx_id);/* in: transaction id */ /***************************************************************** Reads the given header field. */ UNIV_INLINE @@ -170,9 +170,10 @@ UNIV_INLINE void page_header_set_field( /*==================*/ - page_t* page, /* in: page */ - ulint field, /* in: PAGE_N_DIR_SLOTS, ... */ - ulint val); /* in: value */ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint field, /* in: PAGE_N_DIR_SLOTS, ... 
*/ + ulint val); /* in: value */ /***************************************************************** Returns the pointer stored in the given header field. */ UNIV_INLINE @@ -188,9 +189,10 @@ UNIV_INLINE void page_header_set_ptr( /*================*/ - page_t* page, /* in: page */ - ulint field, /* in: PAGE_FREE, ... */ - byte* ptr); /* in: pointer or NULL*/ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in: compressed page, or NULL */ + ulint field, /* in/out: PAGE_FREE, ... */ + const byte* ptr); /* in: pointer or NULL*/ /***************************************************************** Resets the last insert info field in the page header. Writes to mlog about this operation. */ @@ -283,8 +285,9 @@ UNIV_INLINE void page_dir_set_n_heap( /*================*/ - page_t* page, /* in: index page */ - ulint n_heap);/* in: number of records */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n_heap);/* in: number of records */ /***************************************************************** Gets the number of dir slots in directory. */ UNIV_INLINE @@ -299,9 +302,9 @@ UNIV_INLINE void page_dir_set_n_slots( /*=================*/ - /* out: number of slots */ - page_t* page, /* in: index page */ - ulint n_slots);/* in: number of slots */ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n_slots);/* in: number of slots */ /***************************************************************** Gets pointer to nth directory slot. */ UNIV_INLINE @@ -349,9 +352,10 @@ UNIV_INLINE void page_dir_slot_set_n_owned( /*======================*/ - page_dir_slot_t* slot, /* in: directory slot */ - ulint n); /* in: number of records owned - by the slot */ + page_dir_slot_t*slot, /* in/out: directory slot */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint n); /* in: number of records owned by the slot */ /**************************************************************** Calculates the space reserved for directory slots of a given number of records. The exact value is a fraction number @@ -402,10 +406,12 @@ UNIV_INLINE void page_rec_set_next( /*==============*/ - rec_t* rec, /* in: pointer to record, must not be - page supremum */ - rec_t* next); /* in: pointer to next record, must not - be page infimum */ + rec_t* rec, /* in: pointer to record, + must not be page supremum */ + rec_t* next, /* in: pointer to next record, + must not be page infimum */ + page_zip_des_t* page_zip);/* in/out: compressed page with at least + 6 bytes available, or NULL */ /**************************************************************** Gets the pointer to the previous record. */ UNIV_INLINE @@ -513,6 +519,16 @@ page_get_free_space_of_empty( /* out: free space */ ulint comp) /* in: nonzero=compact page format */ __attribute__((const)); +/************************************************************** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + /* out: REC_N_NEW_EXTRA_BYTES + or REC_N_OLD_EXTRA_BYTES */ + const rec_t* rec); /* in: physical record */ /**************************************************************** Returns the sum of the sizes of the records in the record list excluding the infimum and supremum records. 
*/ @@ -530,7 +546,8 @@ page_mem_alloc( /*===========*/ /* out: pointer to start of allocated buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint need, /* in: number of bytes needed */ dict_index_t* index, /* in: record descriptor */ ulint* heap_no);/* out: this contains the heap number @@ -542,7 +559,9 @@ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 6 bytes available, or NULL */ rec_t* rec, /* in: pointer to the (origin of) record */ const ulint* offsets);/* in: array returned by rec_get_offsets() */ /************************************************************** @@ -554,17 +573,17 @@ page_create( /* out: pointer to the page */ buf_frame_t* frame, /* in: a buffer frame where the page is created */ + page_zip_des_t* page_zip, /* in/out: compressed page, or NULL */ mtr_t* mtr, /* in: mini-transaction handle */ ulint comp); /* in: nonzero=compact page format */ /***************************************************************** Differs from page_copy_rec_list_end, because this function does not -touch the lock table and max trx id on page. */ +touch the lock table and max trx id on page or compress the page. */ void page_copy_rec_list_end_no_locks( /*============================*/ page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr); /* in: mtr */ @@ -573,27 +592,31 @@ Copies records from page to new_page, from the given record onward, including that record. Infimum and supremum records are not copied. The records are copied to the start of the record list on new_page. */ -void +ibool page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ + /* out: TRUE on success */ + page_t* new_page, /* in/out: index page to copy to */ + page_zip_des_t* new_page_zip, /* in/out: compressed page, or NULL */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr) /* in: mtr */ + __attribute__((warn_unused_result, nonnull(1, 3, 4, 5))); /***************************************************************** Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. The records are copied to the end of the record list on new_page. */ -void +ibool page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ + /* out: TRUE on success */ + page_t* new_page, /* in/out: index page to copy to */ + page_zip_des_t* new_page_zip, /* in/out: compressed page, or NULL */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr) /* in: mtr */ + __attribute__((warn_unused_result, nonnull(1, 3, 4, 5))); /***************************************************************** Deletes records from a page from a given record onward, including that record. The infimum and supremum records are not deleted. */ @@ -601,26 +624,16 @@ The infimum and supremum records are not deleted. 
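Because page_copy_rec_list_end() and page_copy_rec_list_start() can now fail (for example when new_page_zip cannot absorb the copied records), callers must check the result; a minimal sketch with placeholder variables (new_page, new_page_zip, rec, index, mtr are assumed to be in scope):

	if (!page_copy_rec_list_end(new_page, new_page_zip,
				    rec, index, mtr)) {
		/* the records did not fit into the compressed page;
		the caller must recover, e.g. by reorganizing new_page
		or by choosing a different split point */
	}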
*/ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ + rec_t* rec, /* in: pointer to record on page */ dict_index_t* index, /* in: record descriptor */ ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED if not known */ ulint size, /* in: the sum of the sizes of the records in the end of the chain to delete, or ULINT_UNDEFINED if not known */ - mtr_t* mtr); /* in: mtr */ -/***************************************************************** -Deletes records from page, up to the given record, NOT including -that record. Infimum and supremum records are not deleted. */ - -void -page_delete_rec_list_start( -/*=======================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ - dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr); /* in: mtr */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull(1, 2, 6))); /***************************************************************** Moves record list end to another page. Moved records include split_rec. */ @@ -629,30 +642,25 @@ void page_move_rec_list_end( /*===================*/ page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ + page_zip_des_t* new_page_zip, /* in/out: compressed page of + new_page, or NULL */ rec_t* split_rec, /* in: first record to move */ + page_zip_des_t* page_zip, /* in/out: compressed page of + split_rec, or NULL */ dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr); /* in: mtr */ -/***************************************************************** -Moves record list start to another page. Moved records do not include -split_rec. */ - -void -page_move_rec_list_start( -/*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr); /* in: mtr */ + mtr_t* mtr) /* in: mtr */ + __attribute__((nonnull(1, 3, 5, 6))); /******************************************************************** Splits a directory slot which owns too many records. */ void page_dir_split_slot( /*================*/ - page_t* page, /* in: the index page in question */ - ulint slot_no); /* in: the directory slot */ + page_t* page, /* in: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 10 bytes available, or NULL */ + ulint slot_no)/* in: the directory slot */ + __attribute__((nonnull(1))); /***************************************************************** Tries to balance the given directory slot with too few records with the upper neighbor, so that there are at least the minimum number @@ -662,8 +670,11 @@ two slots. */ void page_dir_balance_slot( /*==================*/ - page_t* page, /* in: index page */ - ulint slot_no); /* in: the directory slot */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 15 bytes available, or NULL */ + ulint slot_no)/* in: the directory slot */ + __attribute__((nonnull(1))); /************************************************************** Parses a log record of a record list end or start deletion. */ @@ -766,10 +777,20 @@ know the index. This is also resilient so that this should never crash even if the page is total garbage. 
*/ ibool -page_simple_validate( -/*=================*/ +page_simple_validate_old( +/*=====================*/ /* out: TRUE if ok */ - page_t* page); /* in: index page */ + page_t* page); /* in: old-style index page */ +/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ + +ibool +page_simple_validate_new( +/*=====================*/ + /* out: TRUE if ok */ + page_t* page); /* in: new-style index page */ /******************************************************************* This function checks the consistency of an index page. */ diff --git a/include/page0page.ic b/include/page0page.ic index 655ff245aa8..2fae9ad8f1e 100644 --- a/include/page0page.ic +++ b/include/page0page.ic @@ -35,7 +35,7 @@ UNIV_INLINE void page_update_max_trx_id( /*===================*/ - page_t* page, /* in: page */ + page_t* page, /* in/out: page */ dulint trx_id) /* in: transaction id */ { ut_ad(page); @@ -67,9 +67,10 @@ UNIV_INLINE void page_header_set_field( /*==================*/ - page_t* page, /* in: page */ - ulint field, /* in: PAGE_LEVEL, ... */ - ulint val) /* in: value */ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint field, /* in: PAGE_N_DIR_SLOTS, ... */ + ulint val) /* in: value */ { ut_ad(page); ut_ad(field <= PAGE_N_RECS); @@ -77,6 +78,9 @@ page_header_set_field( ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE); mach_write_to_2(page + PAGE_HEADER + field, val); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write_header(page_zip, page + PAGE_HEADER + field, 2); + } } /***************************************************************** @@ -114,9 +118,10 @@ UNIV_INLINE void page_header_set_ptr( /*================*/ - page_t* page, /* in: page */ - ulint field, /* in: PAGE_FREE, ... */ - byte* ptr) /* in: pointer or NULL*/ + page_t* page, /* in: page */ + page_zip_des_t* page_zip,/* in: compressed page, or NULL */ + ulint field, /* in: PAGE_FREE, ... 
*/ + const byte* ptr) /* in: pointer or NULL*/ { ulint offs; @@ -133,7 +138,7 @@ page_header_set_ptr( ut_ad((field != PAGE_HEAP_TOP) || offs); - page_header_set_field(page, field, offs); + page_header_set_field(page, page_zip, field, offs); } /***************************************************************** @@ -413,11 +418,11 @@ UNIV_INLINE void page_dir_set_n_slots( /*=================*/ - /* out: number of slots */ - page_t* page, /* in: index page */ - ulint n_slots)/* in: number of slots */ + page_t* page, /* in/out: page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n_slots)/* in: number of slots */ { - page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots); + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots); } /***************************************************************** @@ -438,12 +443,13 @@ UNIV_INLINE void page_dir_set_n_heap( /*================*/ - page_t* page, /* in: index page */ - ulint n_heap) /* in: number of records */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ + ulint n_heap) /* in: number of records */ { ut_ad(n_heap < 0x8000); - page_header_set_field(page, PAGE_N_HEAP, n_heap | (0x8000 & + page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap | (0x8000 & page_header_get_field(page, PAGE_N_HEAP))); } @@ -520,7 +526,11 @@ page_dir_slot_get_n_owned( page_dir_slot_t* slot) /* in: page directory slot */ { rec_t* rec = page_dir_slot_get_rec(slot); - return(rec_get_n_owned(rec, page_rec_is_comp(rec))); + if (page_rec_is_comp(slot)) { + return(rec_get_n_owned_new(rec)); + } else { + return(rec_get_n_owned_old(rec)); + } } /******************************************************************* @@ -529,12 +539,18 @@ UNIV_INLINE void page_dir_slot_set_n_owned( /*======================*/ - page_dir_slot_t* slot, /* in: directory slot */ - ulint n) /* in: number of records owned - by the slot */ + page_dir_slot_t*slot, /* in/out: directory slot */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint n) /* in: number of records owned by the slot */ { rec_t* rec = page_dir_slot_get_rec(slot); - rec_set_n_owned(rec, page_rec_is_comp(rec), n); + if (page_rec_is_comp(slot)) { + rec_set_n_owned_new(rec, page_zip, n); + } else { + ut_ad(!page_zip); + rec_set_n_owned_old(rec, n); + } } /**************************************************************** @@ -597,26 +613,29 @@ UNIV_INLINE void page_rec_set_next( /*==============*/ - rec_t* rec, /* in: pointer to record, must not be page supremum */ - rec_t* next) /* in: pointer to next record, must not be page - infimum */ + rec_t* rec, /* in: pointer to record, + must not be page supremum */ + rec_t* next, /* in: pointer to next record, + must not be page infimum */ + page_zip_des_t* page_zip) /* in/out: compressed page with + at least 6 bytes available, or NULL */ { - page_t* page; ulint offs; ut_ad(page_rec_check(rec)); ut_ad(!page_rec_is_supremum(rec)); - page = ut_align_down(rec, UNIV_PAGE_SIZE); - if (next) { - ut_ad(!page_rec_is_infimum(next)); - ut_ad(page == ut_align_down(next, UNIV_PAGE_SIZE)); - offs = (ulint) (next - page); + ut_ad(!next || !page_rec_is_infimum(next)); + ut_ad(!next || ut_align_down(rec, UNIV_PAGE_SIZE) + == ut_align_down(next, UNIV_PAGE_SIZE)); + + offs = ut_align_offset(next, UNIV_PAGE_SIZE); + if (page_rec_is_comp(rec)) { + rec_set_next_offs_new(rec, page_zip, offs); } else { - offs = 0; + rec_set_next_offs_old(rec, offs); + ut_ad(!page_zip); } 
- - rec_set_next_offs(rec, page_is_comp(page), offs); } /**************************************************************** @@ -671,11 +690,11 @@ page_rec_find_owner_rec( ut_ad(page_rec_check(rec)); if (page_rec_is_comp(rec)) { - while (rec_get_n_owned(rec, TRUE) == 0) { + while (rec_get_n_owned_new(rec) == 0) { rec = page_rec_get_next(rec); } } else { - while (rec_get_n_owned(rec, FALSE) == 0) { + while (rec_get_n_owned_old(rec) == 0) { rec = page_rec_get_next(rec); } } @@ -683,6 +702,23 @@ page_rec_find_owner_rec( return(rec); } +/************************************************************** +Returns the base extra size of a physical record. This is the +size of the fixed header, independent of the record size. */ +UNIV_INLINE +ulint +page_rec_get_base_extra_size( +/*=========================*/ + /* out: REC_N_NEW_EXTRA_BYTES + or REC_N_OLD_EXTRA_BYTES */ + const rec_t* rec) /* in: physical record */ +{ +#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES +# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES" +#endif + return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec)); +} + /**************************************************************** Returns the sum of the sizes of the records in the record list, excluding the infimum and supremum records. */ @@ -805,7 +841,9 @@ UNIV_INLINE void page_mem_free( /*==========*/ - page_t* page, /* in: index page */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 6 bytes available, or NULL */ rec_t* rec, /* in: pointer to the (origin of) record */ const ulint* offsets)/* in: array returned by rec_get_offsets() */ { @@ -816,8 +854,8 @@ page_mem_free( ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec)); free = page_header_get_ptr(page, PAGE_FREE); - page_rec_set_next(rec, free); - page_header_set_ptr(page, PAGE_FREE, rec); + page_rec_set_next(rec, free, page_zip); + page_header_set_ptr(page, page_zip, PAGE_FREE, rec); #if 0 /* It's better not to destroy the user's data. */ @@ -827,11 +865,18 @@ page_mem_free( cannot be cleared, because page_mem_alloc() needs them in order to determine the size of the deleted record. */ memset(rec, 0, rec_offs_data_size(offsets)); + + /* If you enable this code, make sure that the callers of + page_mem_free() account for the increased usage of space. */ + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, page, rec, rec - page, + rec_offs_data_size(offsets)); + } #endif garbage = page_header_get_field(page, PAGE_GARBAGE); - page_header_set_field(page, PAGE_GARBAGE, + page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage + rec_offs_size(offsets)); } diff --git a/include/page0types.h b/include/page0types.h index 525a0366a6f..70437008b51 100644 --- a/include/page0types.h +++ b/include/page0types.h @@ -18,5 +18,71 @@ typedef byte page_t; typedef struct page_search_struct page_search_t; typedef struct page_cur_struct page_cur_t; +typedef byte page_zip_t; +typedef struct page_zip_des_struct page_zip_des_t; + +/* The following definitions would better belong to page0zip.h, +but we cannot include page0zip.h from rem0rec.ic, because +page0*.h includes rem0rec.h and may include rem0rec.ic. 
*/ + +/* Compressed page descriptor */ +struct page_zip_des_struct +{ + page_zip_t* data; /* compressed page data */ + ulint size; /* total size of compressed page */ + ulint m_start; /* start offset of modification log */ + ulint m_end; /* end offset of modification log */ +}; + +/************************************************************************** +Write data to the compressed page. The data must already be written to +the uncompressed page. */ + +void +page_zip_write( +/*===========*/ + page_zip_des_t* page_zip,/* out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ + __attribute__((nonnull)); + +/************************************************************************** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ + __attribute__((nonnull)); + + +/************************************************************************** +Write data to the uncompressed trailer portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_trailer( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ + __attribute__((nonnull)); + +#ifdef UNIV_DEBUG +/************************************************************************** +Determine if enough space is available in the modification log. */ + +ibool +page_zip_available_noninline( +/*=========================*/ + /* out: TRUE if enough space + is available */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ulint size) + __attribute__((warn_unused_result, nonnull, pure)); +#endif /* UNIV_DEBUG */ #endif diff --git a/include/page0zip.h b/include/page0zip.h new file mode 100644 index 00000000000..f2e89128c5a --- /dev/null +++ b/include/page0zip.h @@ -0,0 +1,183 @@ +/****************************************************** +Compressed page interface + +(c) 2005 Innobase Oy + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifndef page0zip_h +#define page0zip_h + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "mtr0types.h" +#include "page0types.h" + +/************************************************************************** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip); /* in/out: compressed page + descriptor */ + +/************************************************************************** +Compress a page. */ + +ibool +page_zip_compress( +/*==============*/ + /* out: TRUE on success, FALSE on failure; + page_zip will be left intact on failure. */ + page_zip_des_t* page_zip,/* out: compressed page */ + const page_t* page); /* in: uncompressed page */ + +/************************************************************************** +Decompress a page. 
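A minimal sketch of setting up a descriptor and compressing a page into a caller-supplied buffer. This is illustration only: in this patch the buffer pool keeps the descriptor in buf_block_struct::page_zip rather than allocating it like this, and the buffer size must be a power of two as checked by page_zip_simple_validate() later in the patch.

static ibool
page_zip_setup(
/*===========*/
					/* out: TRUE if page was compressed */
	page_zip_des_t*	page_zip,	/* out: descriptor to set up */
	page_zip_t*	buf,		/* in: buffer for the compressed data */
	ulint		size,		/* in: size of buf, a power of two */
	const page_t*	page)		/* in: uncompressed page */
{
	page_zip_des_init(page_zip);	/* clears data, size, m_start, m_end */

	page_zip->data = buf;
	page_zip->size = size;

	/* page_zip_compress() leaves the descriptor intact on failure */
	return(page_zip_compress(page_zip, page));
}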
*/ + +ibool +page_zip_decompress( +/*================*/ + /* out: TRUE on success, FALSE on failure */ + page_zip_des_t* page_zip,/* in: data, size; out: m_start, m_end */ + page_t* page, /* out: uncompressed page, may be trashed */ + mtr_t* mtr) /* in: mini-transaction handle, + or NULL if no logging is needed */ + __attribute__((warn_unused_result, nonnull(1, 2))); + +#ifdef UNIV_DEBUG +/************************************************************************** +Validate a compressed page descriptor. */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + /* out: TRUE if ok */ + const page_zip_des_t* page_zip); /* in: compressed page + descriptor */ + +/************************************************************************** +Check that the compressed and decompressed pages match. */ + +ibool +page_zip_validate( +/*==============*/ + const page_zip_des_t* page_zip,/* in: compressed page */ + const page_t* page); /* in: uncompressed page */ +#endif /* UNIV_DEBUG */ + +/************************************************************************** +Determine the encoded length of an integer in the modification log. */ +UNIV_INLINE +ulint +page_zip_ulint_size( +/*================*/ + /* out: length of the integer, in bytes */ + ulint num) /* in: the integer */ + __attribute__((const)); + +/************************************************************************** +Determine the size of a modification log entry. */ +UNIV_INLINE +ulint +page_zip_entry_size( +/*================*/ + /* out: length of the log entry, in bytes */ + ulint pos, /* in: offset of the uncompressed page */ + ulint length) /* in: length of the data */ + __attribute__((const)); + +/************************************************************************** +Ensure that enough space is available in the modification log. +If not, try to compress the page. */ +UNIV_INLINE +ibool +page_zip_alloc( +/*===========*/ + /* out: TRUE if enough space is available */ + page_zip_des_t* page_zip,/* in/out: compressed page; + will only be modified if compression is needed + and successful */ + const page_t* page, /* in: uncompressed page */ + ulint size) /* in: size of modification log entries */ + __attribute__((nonnull)); + +/************************************************************************** +Determine if enough space is available in the modification log. */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + /* out: TRUE if enough space + is available */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ulint size) /* in: requested size of + modification log entries */ + __attribute__((warn_unused_result, nonnull, pure)); + +#ifdef UNIV_DEBUG +/************************************************************************** +Determine if enough space is available in the modification log. */ + +ibool +page_zip_available_noninline( +/*=========================*/ + /* out: TRUE if enough space + is available */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ulint size) + __attribute__((warn_unused_result, nonnull, pure)); +#endif /* UNIV_DEBUG */ + +/************************************************************************** +Write data to the compressed portion of a page. The data must already +have been written to the uncompressed page. 
*/ + +void +page_zip_write( +/*===========*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ + __attribute__((nonnull)); + +/************************************************************************** +Write data to the uncompressed header portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ + __attribute__((nonnull)); + + +/************************************************************************** +Write data to the uncompressed trailer portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_trailer( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ + __attribute__((nonnull)); + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif + +#ifndef UNIV_NONINL +# include "page0zip.ic" +#endif + +#endif /* page0zip_h */ diff --git a/include/page0zip.ic b/include/page0zip.ic new file mode 100644 index 00000000000..ba720846ec5 --- /dev/null +++ b/include/page0zip.ic @@ -0,0 +1,224 @@ +/****************************************************** +Compressed page interface + +(c) 2005 Innobase Oy + +Created June 2005 by Marko Makela +*******************************************************/ + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE +#endif + +#include "page0zip.h" +#include "page0page.h" + +/* The format of compressed pages is as follows. + +The header and trailer of the uncompressed pages, including the page +directory in the trailer, are copied as is to the header and trailer +of the compressed page. Immediately preceding the page trailer, +we store a 32-bit checksum of the compressed data. + +The data between PAGE_DATA and the last page directory entry +will be written in compressed format, starting at offset PAGE_DATA. + +The compressed data stream may be followed by a modification log +covering the compressed portion of the page, as follows. + +MODIFICATION LOG ENTRY FORMAT +- length (1..2 bytes), not zero +- offset - PAGE_DATA (1..2 bytes) +- data bytes + +The length and the offset are stored in a variable-length format: +- 0xxxxxxxx : 0..127 +- 10xxxxxxx xxxxxxxx: 0..16383 +- 11xxxxxxx xxxxxxxx: reserved + +The end of the modification log is marked by length=0. */ + +/************************************************************************** +Initialize a compressed page descriptor. */ +UNIV_INLINE +void +page_zip_des_init( +/*==============*/ + page_zip_des_t* page_zip) /* in/out: compressed page + descriptor */ +{ + memset(page_zip, 0, sizeof *page_zip); +} + +/************************************************************************** +Determine the encoded length of an integer in the modification log. 
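As a worked illustration of the variable-length format described above (this encoder is hypothetical; the real one belongs in page0zip.c, which is not part of these headers): values below 128 occupy one byte with the high bit clear, and values below 16384 occupy two bytes whose top bits are 10, matching page_zip_ulint_size() below.

static byte*
page_zip_ulint_write(
/*=================*/
			/* out: pointer just past the written integer */
	byte*	ptr,	/* in/out: where to write */
	ulint	num)	/* in: integer to write; must be < 16384 */
{
	if (num < 128) {
		*ptr++ = (byte) num;			/* 0xxxxxxx */
	} else {
		ut_ad(num < 16384);

		*ptr++ = (byte) (0x80 | (num >> 8));	/* 10xxxxxx */
		*ptr++ = (byte) (num & 0xff);		/* xxxxxxxx */
	}

	return(ptr);
}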
*/ +UNIV_INLINE +ulint +page_zip_ulint_size( +/*================*/ + /* out: length of the integer, in bytes */ + ulint num) /* in: the integer */ +{ + if (num < 128) { /* 0xxxxxxx: 0..127 */ + return(1); + } + if (num < 16384) { /* 10xxxxxx xxxxxxxx: 0..16383 */ + return(2); + } + ut_error; + return(0); +} + +/************************************************************************** +Determine the size of a modification log entry. */ +UNIV_INLINE +ulint +page_zip_entry_size( +/*================*/ + /* out: length of the log entry, in bytes */ + ulint pos, /* in: offset of the uncompressed page */ + ulint length) /* in: length of the data */ +{ + ut_ad(pos >= PAGE_DATA); + ut_ad(pos + length <= UNIV_PAGE_SIZE - PAGE_DATA /* - trailer_len */); + return(page_zip_ulint_size(pos - PAGE_DATA) + + page_zip_ulint_size(length) + + length); +} + +#ifdef UNIV_DEBUG +/************************************************************************** +Validate a compressed page descriptor. */ +UNIV_INLINE +ibool +page_zip_simple_validate( +/*=====================*/ + /* out: TRUE if ok */ + const page_zip_des_t* page_zip)/* in: compressed page descriptor */ +{ + ut_ad(page_zip); + ut_ad(page_zip->data); + ut_ad(!(page_zip->size & (page_zip->size - 1))); + ut_ad(page_zip->size < UNIV_PAGE_SIZE); + ut_ad(page_zip->size > PAGE_DATA + PAGE_EMPTY_DIR_START); + ut_ad(page_zip->m_start <= page_zip->m_end); + ut_ad(page_zip->m_end < page_zip->size); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************************** +Ensure that enough space is available in the modification log. +If not, try to compress the page. */ +UNIV_INLINE +ibool +page_zip_alloc( +/*===========*/ + /* out: TRUE if enough space is available */ + page_zip_des_t* page_zip,/* in/out: compressed page; + will only be modified if compression is needed + and successful */ + const page_t* page, /* in: uncompressed page */ + ulint size) /* in: size of modification log entries */ +{ + ulint trailer_len = PAGE_DIR + PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots((page_t*) page_zip->data); + + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip->m_end + trailer_len < page_zip->size); + ut_ad(size >= 3); /* modification log entries are >= 1+1+1 bytes */ + ut_ad(size < page_zip->size); + + if (size < page_zip->size - page_zip->m_end - trailer_len) { + return(TRUE); + } + + if (page_zip->m_start == page_zip->m_end) { + /* The page has been freshly compressed, so + recompressing it will not help. */ + return(FALSE); + } + + return(page_zip_compress(page_zip, page)); +} + +/************************************************************************** +Determine if enough space is available in the modification log. */ +UNIV_INLINE +ibool +page_zip_available( +/*===============*/ + /* out: TRUE if enough space + is available */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ulint size) /* in: requested size of + modification log entries */ +{ + ulint trailer_len = PAGE_DIR + PAGE_DIR_SLOT_SIZE + * page_dir_get_n_slots((page_t*) page_zip->data); + + ut_ad(page_zip_simple_validate(page_zip)); + ut_ad(page_zip->m_end + trailer_len < page_zip->size); + ut_ad(size >= 3); /* modification log entries are >= 1+1+1 bytes */ + ut_ad(size < page_zip->size); + + return(UNIV_LIKELY( + size < page_zip->size - page_zip->m_end - trailer_len)); +} + +/************************************************************************** +Write data to the uncompressed header portion of a page. 
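A hedged usage sketch (the helper name page_zip_log_change is hypothetical) of the intended sequence: reserve space with page_zip_alloc(), sized by page_zip_entry_size(), and only then append the change to the modification log with page_zip_write(). If even recompression cannot make room, the caller has to fall back to reorganizing or splitting the page.

static ibool
page_zip_log_change(
/*================*/
					/* out: TRUE if the change was logged */
	page_zip_des_t*	page_zip,	/* in/out: compressed page */
	const page_t*	page,		/* in: uncompressed page */
	const byte*	rec,		/* in: first modified byte on page;
					the change must already have been
					applied to the uncompressed page */
	ulint		length)		/* in: number of modified bytes */
{
	ulint	size = page_zip_entry_size((ulint) (rec - page), length);

	if (!page_zip_alloc(page_zip, page, size)) {
		/* not even recompression freed enough log space */
		return(FALSE);
	}

	page_zip_write(page_zip, rec, length);

	return(TRUE);
}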
The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_header( +/*==================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ +{ + ulint pos; + + ut_ad(buf_block_get_page_zip(buf_block_align((byte*)str)) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + + pos = ut_align_offset(str, UNIV_PAGE_SIZE); + + ut_ad(pos < PAGE_DATA); + + memcpy(page_zip + pos, str, length); + + ut_ad(page_zip_validate(page_zip, str - pos)); +} + + +/************************************************************************** +Write data to the uncompressed trailer portion of a page. The data must +already have been written to the uncompressed page. */ +UNIV_INLINE +void +page_zip_write_trailer( +/*===================*/ + page_zip_des_t* page_zip,/* in/out: compressed page */ + const byte* str, /* in: address on the uncompressed page */ + ulint length) /* in: length of the data */ +{ + ulint pos; + + ut_ad(buf_block_get_page_zip(buf_block_align((byte*)str)) == page_zip); + ut_ad(page_zip_simple_validate(page_zip)); + + pos = ut_align_offset(str, UNIV_PAGE_SIZE); + + ut_ad(pos < PAGE_DATA);/* TODO */ + + memcpy(page_zip + pos/* TODO */, str, length); + + ut_ad(page_zip_validate(page_zip, str - pos)); +} + +#ifdef UNIV_MATERIALIZE +# undef UNIV_INLINE +# define UNIV_INLINE UNIV_INLINE_ORIGINAL +#endif diff --git a/include/rem0rec.h b/include/rem0rec.h index 69b397c9682..01ff4e04b9e 100644 --- a/include/rem0rec.h +++ b/include/rem0rec.h @@ -13,11 +13,10 @@ Created 5/30/1994 Heikki Tuuri #include "data0data.h" #include "rem0types.h" #include "mtr0types.h" +#include "page0types.h" /* Maximum values for various fields (for non-blob tuples) */ #define REC_MAX_N_FIELDS (1024 - 1) -#define REC_MAX_HEAP_NO (2 * 8192 - 1) -#define REC_MAX_N_OWNED (16 - 1) /* Flag denoting the predefined minimum record: this bit is ORed in the 4 info bits of a record */ @@ -41,6 +40,17 @@ offsets[] array, first passed to rec_get_offsets() */ #define REC_OFFS_NORMAL_SIZE 100 #define REC_OFFS_SMALL_SIZE 10 +/********************************************************** +The following function is used to get the pointer of the next chained record +on the same page. */ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + /* out: pointer to the next chained record, or + NULL if none */ + rec_t* rec, /* in: physical record */ + ulint comp); /* in: nonzero=compact page format */ /********************************************************** The following function is used to get the offset of the next chained record on the same page. */ @@ -54,15 +64,25 @@ rec_get_next_offs( ulint comp); /* in: nonzero=compact page format */ /********************************************************** The following function is used to set the next record offset field -of the record. */ +of an old-style record. */ UNIV_INLINE void -rec_set_next_offs( -/*==============*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /* in: old-style physical record */ ulint next); /* in: offset of the next record */ /********************************************************** +The following function is used to set the next record offset field +of a new-style record. 
*/ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 6 bytes available, or NULL */ + ulint next); /* in: offset of the next record */ +/********************************************************** The following function is used to get the number of fields in an old-style record. */ UNIV_INLINE @@ -82,26 +102,44 @@ rec_get_n_fields( rec_t* rec, /* in: physical record */ dict_index_t* index); /* in: record descriptor */ /********************************************************** -The following function is used to get the number of records -owned by the previous directory record. */ +The following function is used to get the number of records owned by the +previous directory record. */ UNIV_INLINE ulint -rec_get_n_owned( -/*============*/ +rec_get_n_owned_old( +/*================*/ /* out: number of owned records */ - rec_t* rec, /* in: physical record */ - ulint comp); /* in: nonzero=compact page format */ + rec_t* rec); /* in: old-style physical record */ /********************************************************** -The following function is used to set the number of owned -records. */ +The following function is used to set the number of owned records. */ UNIV_INLINE void -rec_set_n_owned( -/*============*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ +rec_set_n_owned_old( +/*================*/ + /* out: TRUE on success */ + rec_t* rec, /* in: old-style physical record */ ulint n_owned); /* in: the number of owned */ /********************************************************** +The following function is used to get the number of records owned by the +previous directory record. */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + /* out: number of owned records */ + rec_t* rec); /* in: new-style physical record */ +/********************************************************** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + /* out: TRUE on success */ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 5 bytes available, or NULL */ + ulint n_owned);/* in: the number of owned */ +/********************************************************** The following function is used to retrieve the info bits of a record. */ UNIV_INLINE @@ -115,12 +153,21 @@ rec_get_info_bits( The following function is used to set the info bits of a record. */ UNIV_INLINE void -rec_set_info_bits( -/*==============*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /* in: old-style physical record */ ulint bits); /* in: info bits */ /********************************************************** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint bits); /* in: info bits */ +/********************************************************** The following function retrieves the status bits of a new-style record. 
*/ UNIV_INLINE ulint @@ -135,8 +182,10 @@ UNIV_INLINE void rec_set_status( /*===========*/ - rec_t* rec, /* in: physical record */ - ulint bits); /* in: info bits */ + rec_t* rec, /* in/out: physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint bits); /* in: info bits */ /********************************************************** The following function is used to retrieve the info and status @@ -155,9 +204,10 @@ UNIV_INLINE void rec_set_info_and_status_bits( /*=========================*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ - ulint bits); /* in: info bits */ + rec_t* rec, /* in/out: compact physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint bits); /* in: info bits */ /********************************************************** The following function tells if record is delete marked. */ @@ -172,40 +222,67 @@ rec_get_deleted_flag( The following function is used to set the deleted bit. */ UNIV_INLINE void -rec_set_deleted_flag( -/*=================*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /* in: old-style physical record */ ulint flag); /* in: nonzero if delete marked */ /********************************************************** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint flag); /* in: nonzero if delete marked */ +/********************************************************** The following function tells if a new-style record is a node pointer. */ UNIV_INLINE ibool rec_get_node_ptr_flag( -/*=================*/ +/*==================*/ /* out: TRUE if node pointer */ rec_t* rec); /* in: physical record */ /********************************************************** The following function is used to get the order number -of the record in the heap of the index page. */ +of an old-style record in the heap of the index page. */ UNIV_INLINE ulint -rec_get_heap_no( -/*=============*/ +rec_get_heap_no_old( +/*================*/ /* out: heap order number */ - rec_t* rec, /* in: physical record */ - ulint comp); /* in: nonzero=compact page format */ + rec_t* rec); /* in: physical record */ /********************************************************** The following function is used to set the heap number -field in the record. */ +field in an old-style record. */ UNIV_INLINE void -rec_set_heap_no( -/*=============*/ +rec_set_heap_no_old( +/*================*/ rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ ulint heap_no);/* in: the heap number */ /********************************************************** +The following function is used to get the order number +of a new-style record in the heap of the index page. */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + /* out: heap order number */ + rec_t* rec); /* in: physical record */ +/********************************************************** +The following function is used to set the heap number +field in a new-style record. 
*/ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /* in/out: physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 6 bytes available, or NULL */ + ulint heap_no);/* in: the heap number */ +/********************************************************** The following function is used to test whether the data offsets in the record are stored in one-byte or two-byte format. */ UNIV_INLINE @@ -340,7 +417,7 @@ rec_offs_any_extern( /* out: TRUE if a field is stored externally */ const ulint* offsets);/* in: array returned by rec_get_offsets() */ /*************************************************************** -Sets the value of the ith field extern storage bit. */ +Sets the ith field extern storage bit. */ UNIV_INLINE void rec_set_nth_field_extern_bit( @@ -348,7 +425,6 @@ rec_set_nth_field_extern_bit( rec_t* rec, /* in: record */ dict_index_t* index, /* in: record descriptor */ ulint i, /* in: ith field */ - ibool val, /* in: value to set */ mtr_t* mtr); /* in: mtr holding an X-latch to the page where rec is, or NULL; in the NULL case we do not write to log about the change */ @@ -489,8 +565,8 @@ rec_fold( in an incomplete last field */ dulint tree_id); /* in: index tree id */ /************************************************************* -Builds a physical record out of a data tuple and stores it beginning from -address destination. */ +Builds a physical record out of a data tuple and +stores it into the given buffer. */ rec_t* rec_convert_dtuple_to_rec( diff --git a/include/rem0rec.ic b/include/rem0rec.ic index 9c24f385f4f..c42760c8b45 100644 --- a/include/rem0rec.ic +++ b/include/rem0rec.ic @@ -148,19 +148,18 @@ rec_set_nth_field_sql_null( ulint n); /* in: index of the field */ /*************************************************************** -Sets the value of the ith field extern storage bit of an old-style record. */ +Sets the ith field extern storage bit of an old-style record. */ void rec_set_nth_field_extern_bit_old( /*=============================*/ rec_t* rec, /* in: old-style record */ ulint i, /* in: ith field */ - ibool val, /* in: value to set */ mtr_t* mtr); /* in: mtr holding an X-latch to the page where rec is, or NULL; in the NULL case we do not write to log about the change */ /*************************************************************** -Sets the value of the ith field extern storage bit of a new-style record. */ +Sets the ith field extern storage bit of a new-style record. */ void rec_set_nth_field_extern_bit_new( @@ -168,7 +167,6 @@ rec_set_nth_field_extern_bit_new( rec_t* rec, /* in: record */ dict_index_t* index, /* in: record descriptor */ ulint ith, /* in: ith field */ - ibool val, /* in: value to set */ mtr_t* mtr); /* in: mtr holding an X-latch to the page where rec is, or NULL; in the NULL case we do not write to log about the change */ @@ -255,6 +253,55 @@ rec_set_bit_field_2( | (val << shift)); } +/********************************************************** +The following function is used to get the pointer of the next chained record +on the same page. 
*/ +UNIV_INLINE +rec_t* +rec_get_next_ptr( +/*=============*/ + /* out: pointer to the next chained record, or + NULL if none */ + rec_t* rec, /* in: physical record */ + ulint comp) /* in: nonzero=compact page format */ +{ + ulint field_value; + + ut_ad(REC_NEXT_MASK == 0xFFFFUL); + ut_ad(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (UNIV_UNLIKELY(field_value == 0)) { + + return(NULL); + } + + if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) { +#if UNIV_PAGE_SIZE <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, UNIV_PAGE_SIZE) + < UNIV_PAGE_SIZE); +#endif + return(rec + field_value); + } else { + ut_ad(field_value < UNIV_PAGE_SIZE); + + return(ut_align_down(rec, UNIV_PAGE_SIZE) + field_value); + } +} + /********************************************************** The following function is used to get the offset of the next chained record on the same page. */ @@ -274,7 +321,7 @@ rec_get_next_offs( field_value = mach_read_from_2(rec - REC_NEXT); - if (comp) { + if (UNIV_EXPECT(comp, REC_OFFS_COMPACT)) { #if UNIV_PAGE_SIZE <= 32768 /* Note that for 64 KiB pages, field_value can 'wrap around' and the debug assertion is not valid */ @@ -291,7 +338,7 @@ rec_get_next_offs( + ut_align_offset(rec, UNIV_PAGE_SIZE) < UNIV_PAGE_SIZE); #endif - if (field_value == 0) { + if (UNIV_UNLIKELY(field_value == 0)) { return(0); } @@ -305,39 +352,59 @@ rec_get_next_offs( } /********************************************************** -The following function is used to set the next record offset field of the -record. */ +The following function is used to set the next record offset field +of an old-style record. */ UNIV_INLINE void -rec_set_next_offs( -/*==============*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ - ulint next) /* in: offset of the next record, or 0 if none */ +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /* in: old-style physical record */ + ulint next) /* in: offset of the next record */ { ut_ad(rec); ut_ad(UNIV_PAGE_SIZE > next); - ut_ad(REC_NEXT_MASK == 0xFFFFUL); - ut_ad(REC_NEXT_SHIFT == 0); +#if REC_NEXT_MASK != 0xFFFFUL +# error "REC_NEXT_MASK != 0xFFFFUL" +#endif +#if REC_NEXT_SHIFT +# error "REC_NEXT_SHIFT != 0" +#endif - if (comp) { - ulint field_value; + mach_write_to_2(rec - REC_NEXT, next); +} - if (next) { - /* The following two statements calculate - next - offset_of_rec mod 64Ki, where mod is the modulo - as a non-negative number */ - - field_value = (ulint)((lint)next - - (lint)ut_align_offset(rec, UNIV_PAGE_SIZE)); - field_value &= REC_NEXT_MASK; - } else { - field_value = 0; - } +/********************************************************** +The following function is used to set the next record offset field +of a new-style record. 
*/ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 6 bytes available, or NULL */ + ulint next) /* in: offset of the next record */ +{ + ut_ad(rec); + ut_ad(UNIV_PAGE_SIZE > next); - mach_write_to_2(rec - REC_NEXT, field_value); + ulint field_value; + + if (UNIV_UNLIKELY(!next)) { + field_value = 0; } else { - mach_write_to_2(rec - REC_NEXT, next); + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint)((lint)next + - (lint)ut_align_offset(rec, UNIV_PAGE_SIZE)); + field_value &= REC_NEXT_MASK; + } + + mach_write_to_2(rec - REC_NEXT, field_value); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, rec - REC_NEXT, 2); } } @@ -436,42 +503,62 @@ The following function is used to get the number of records owned by the previous directory record. */ UNIV_INLINE ulint -rec_get_n_owned( -/*============*/ +rec_get_n_owned_old( +/*================*/ /* out: number of owned records */ - rec_t* rec, /* in: physical record */ - ulint comp) /* in: nonzero=compact page format */ + rec_t* rec) /* in: old-style physical record */ { - ulint ret; - - ut_ad(rec); - - ret = rec_get_bit_field_1(rec, - comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED, - REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); - ut_ad(ret <= REC_MAX_N_OWNED); - - return(ret); + return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); } /********************************************************** The following function is used to set the number of owned records. */ UNIV_INLINE void -rec_set_n_owned( -/*============*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ +rec_set_n_owned_old( +/*================*/ + /* out: TRUE on success */ + rec_t* rec, /* in: old-style physical record */ ulint n_owned) /* in: the number of owned */ { - ut_ad(rec); - ut_ad(n_owned <= REC_MAX_N_OWNED); - - rec_set_bit_field_1(rec, n_owned, - comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED, + rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); } +/********************************************************** +The following function is used to get the number of records owned by the +previous directory record. */ +UNIV_INLINE +ulint +rec_get_n_owned_new( +/*================*/ + /* out: number of owned records */ + rec_t* rec) /* in: new-style physical record */ +{ + return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT)); +} + +/********************************************************** +The following function is used to set the number of owned records. */ +UNIV_INLINE +void +rec_set_n_owned_new( +/*================*/ + /* out: TRUE on success */ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint n_owned)/* in: the number of owned */ +{ + rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED, + REC_N_OWNED_MASK, REC_N_OWNED_SHIFT); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, rec - REC_NEW_N_OWNED, 1); + } +} + /********************************************************** The following function is used to retrieve the info bits of a record. 
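A worked example of the relative next-record offsets computed above, added purely as illustration:

/* Example: a compact-format record stored at page offset 0x0150 whose
successor lives at offset 0x00F0.  rec_set_next_offs_new() stores

	field_value = (0x00F0 - 0x0150) mod 0x10000 = 0xFFA0,

which, read as a signed 16-bit value (see the assertion comment in
rec_get_next_ptr() above), is -0x60: the successor starts 0x60 bytes
before the record.  An old-style record would instead store the absolute
offset 0x00F0 via rec_set_next_offs_old(). */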
*/ UNIV_INLINE @@ -482,35 +569,40 @@ rec_get_info_bits( rec_t* rec, /* in: physical record */ ulint comp) /* in: nonzero=compact page format */ { - ulint ret; - - ut_ad(rec); - - ret = rec_get_bit_field_1(rec, + return(rec_get_bit_field_1(rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, - REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); - ut_ad((ret & ~REC_INFO_BITS_MASK) == 0); - - return(ret); + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT)); } /********************************************************** The following function is used to set the info bits of a record. */ UNIV_INLINE void -rec_set_info_bits( -/*==============*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ +rec_set_info_bits_old( +/*==================*/ + rec_t* rec, /* in: old-style physical record */ ulint bits) /* in: info bits */ { - ut_ad(rec); - ut_ad((bits & ~REC_INFO_BITS_MASK) == 0); - - rec_set_bit_field_1(rec, bits, - comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS, + rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS, REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); } +/********************************************************** +The following function is used to set the info bits of a record. */ +UNIV_INLINE +void +rec_set_info_bits_new( +/*==================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint bits) /* in: info bits */ +{ + rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS, + REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, rec - REC_NEW_INFO_BITS, 1); + } +} /********************************************************** The following function is used to set the status bits of a new-style record. */ @@ -518,14 +610,16 @@ UNIV_INLINE void rec_set_status( /*===========*/ - rec_t* rec, /* in: physical record */ - ulint bits) /* in: info bits */ + rec_t* rec, /* in/out: physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint bits) /* in: info bits */ { - ut_ad(rec); - ut_ad((bits & ~REC_NEW_STATUS_MASK) == 0); - rec_set_bit_field_1(rec, bits, REC_NEW_STATUS, REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, rec - REC_NEW_STATUS, 1); + } } /********************************************************** @@ -559,20 +653,17 @@ UNIV_INLINE void rec_set_info_and_status_bits( /*=========================*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ - ulint bits) /* in: info bits */ + rec_t* rec, /* in/out: physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint bits) /* in: info bits */ { #if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \ & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT) # error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap" #endif - if (comp) { - rec_set_status(rec, bits & REC_NEW_STATUS_MASK); - } else { - ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT))); - } - rec_set_info_bits(rec, comp, bits & ~REC_NEW_STATUS_MASK); + rec_set_status(rec, page_zip, bits & REC_NEW_STATUS_MASK); + rec_set_info_bits_new(rec, page_zip, bits & ~REC_NEW_STATUS_MASK); } /********************************************************** @@ -600,15 +691,14 @@ rec_get_deleted_flag( The following function is used to set the deleted bit. 
*/ UNIV_INLINE void -rec_set_deleted_flag( -/*=================*/ - rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ +rec_set_deleted_flag_old( +/*=====================*/ + rec_t* rec, /* in: old-style physical record */ ulint flag) /* in: nonzero if delete marked */ { ulint val; - val = rec_get_info_bits(rec, comp); + val = rec_get_info_bits(rec, FALSE); if (flag) { val |= REC_INFO_DELETED_FLAG; @@ -616,7 +706,31 @@ rec_set_deleted_flag( val &= ~REC_INFO_DELETED_FLAG; } - rec_set_info_bits(rec, comp, val); + rec_set_info_bits_old(rec, val); +} + +/********************************************************** +The following function is used to set the deleted bit. */ +UNIV_INLINE +void +rec_set_deleted_flag_new( +/*=====================*/ + rec_t* rec, /* in/out: new-style physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 5 bytes available, or NULL */ + ulint flag) /* in: nonzero if delete marked */ +{ + ulint val; + + val = rec_get_info_bits(rec, TRUE); + + if (flag) { + val |= REC_INFO_DELETED_FLAG; + } else { + val &= ~REC_INFO_DELETED_FLAG; + } + + rec_set_info_bits_new(rec, page_zip, val); } /********************************************************** @@ -624,7 +738,7 @@ The following function tells if a new-style record is a node pointer. */ UNIV_INLINE ibool rec_get_node_ptr_flag( -/*=================*/ +/*==================*/ /* out: TRUE if node pointer */ rec_t* rec) /* in: physical record */ { @@ -632,45 +746,66 @@ rec_get_node_ptr_flag( } /********************************************************** -The following function is used to get the order number of the record in the -heap of the index page. */ +The following function is used to get the order number +of an old-style record in the heap of the index page. */ UNIV_INLINE ulint -rec_get_heap_no( -/*=============*/ +rec_get_heap_no_old( +/*================*/ /* out: heap order number */ - rec_t* rec, /* in: physical record */ - ulint comp) /* in: nonzero=compact page format */ + rec_t* rec) /* in: physical record */ { - ulint ret; - - ut_ad(rec); - - ret = rec_get_bit_field_2(rec, - comp ? REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, - REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); - ut_ad(ret <= REC_MAX_HEAP_NO); - - return(ret); -} + return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} /********************************************************** -The following function is used to set the heap number field in the record. */ +The following function is used to set the heap number +field in an old-style record. */ UNIV_INLINE void -rec_set_heap_no( -/*=============*/ +rec_set_heap_no_old( +/*================*/ rec_t* rec, /* in: physical record */ - ulint comp, /* in: nonzero=compact page format */ ulint heap_no)/* in: the heap number */ { - ut_ad(heap_no <= REC_MAX_HEAP_NO); - - rec_set_bit_field_2(rec, heap_no, - comp ? REC_NEW_HEAP_NO : REC_OLD_HEAP_NO, + rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); } +/********************************************************** +The following function is used to get the order number +of a new-style record in the heap of the index page. 
*/ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + /* out: heap order number */ + rec_t* rec) /* in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/********************************************************** +The following function is used to set the heap number +field in a new-style record. */ +UNIV_INLINE +void +rec_set_heap_no_new( +/*================*/ + rec_t* rec, /* in/out: physical record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 6 bytes available, or NULL */ + ulint heap_no)/* in: the heap number */ +{ + rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT); + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, rec - REC_NEW_HEAP_NO, 2); + } +} + /********************************************************** The following function is used to test whether the data offsets in the record are stored in one-byte or two-byte format. */ @@ -1006,7 +1141,7 @@ rec_offs_any_extern( } /*************************************************************** -Sets the value of the ith field extern storage bit. */ +Sets the ith field extern storage bit. */ UNIV_INLINE void rec_set_nth_field_extern_bit( @@ -1014,15 +1149,14 @@ rec_set_nth_field_extern_bit( rec_t* rec, /* in: record */ dict_index_t* index, /* in: record descriptor */ ulint i, /* in: ith field */ - ibool val, /* in: value to set */ mtr_t* mtr) /* in: mtr holding an X-latch to the page where rec is, or NULL; in the NULL case we do not write to log about the change */ { if (UNIV_LIKELY(index->table->comp)) { - rec_set_nth_field_extern_bit_new(rec, index, i, val, mtr); + rec_set_nth_field_extern_bit_new(rec, index, i, mtr); } else { - rec_set_nth_field_extern_bit_old(rec, i, val, mtr); + rec_set_nth_field_extern_bit_old(rec, i, mtr); } } diff --git a/include/row0row.h b/include/row0row.h index 782973d8f5d..7083b14b966 100644 --- a/include/row0row.h +++ b/include/row0row.h @@ -45,7 +45,9 @@ UNIV_INLINE void row_set_rec_trx_id( /*===============*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 10 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint trx_id);/* in: value of the field */ @@ -55,7 +57,9 @@ UNIV_INLINE void row_set_rec_roll_ptr( /*=================*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 11 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr);/* in: value of the field */ diff --git a/include/row0row.ic b/include/row0row.ic index 85410beacf0..c56dd9a30f8 100644 --- a/include/row0row.ic +++ b/include/row0row.ic @@ -29,9 +29,10 @@ is slower than the specialized inline functions.
*/ void row_set_rec_sys_field( /*==================*/ - /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 10 or 11 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val); /* in: value to set */ @@ -94,7 +95,9 @@ UNIV_INLINE void row_set_rec_trx_id( /*===============*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 10 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint trx_id) /* in: value of the field */ @@ -107,10 +110,10 @@ row_set_rec_trx_id( offset = index->trx_id_offset; if (offset) { - trx_write_trx_id(rec + offset, trx_id); + trx_write_trx_id(rec + offset, page_zip, trx_id); } else { row_set_rec_sys_field(DATA_TRX_ID, - rec, index, offsets, trx_id); + rec, page_zip, index, offsets, trx_id); } } @@ -120,7 +123,9 @@ UNIV_INLINE void row_set_rec_roll_ptr( /*=================*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 11 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint roll_ptr)/* in: value of the field */ @@ -133,10 +138,11 @@ row_set_rec_roll_ptr( offset = index->trx_id_offset; if (offset) { - trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr); + trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, + page_zip, roll_ptr); } else { row_set_rec_sys_field(DATA_ROLL_PTR, - rec, index, offsets, roll_ptr); + rec, page_zip, index, offsets, roll_ptr); } } diff --git a/include/row0upd.h b/include/row0upd.h index 673e0511153..8ebbb4890fe 100644 --- a/include/row0upd.h +++ b/include/row0upd.h @@ -78,7 +78,9 @@ UNIV_INLINE void row_upd_rec_sys_fields( /*===================*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 21 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ @@ -276,7 +278,8 @@ recovery. */ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint pos, /* in: TRX_ID position in rec */ dulint trx_id, /* in: transaction id */ diff --git a/include/row0upd.ic b/include/row0upd.ic index acbb11aa1c7..1eb9bc4d232 100644 --- a/include/row0upd.ic +++ b/include/row0upd.ic @@ -11,6 +11,7 @@ Created 12/27/1996 Heikki Tuuri #include "trx0undo.h" #include "row0row.h" #include "btr0sea.h" +#include "page0zip.h" /************************************************************************* Creates an update vector object. 
*/ @@ -104,7 +105,9 @@ UNIV_INLINE void row_upd_rec_sys_fields( /*===================*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 21 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ trx_t* trx, /* in: transaction */ @@ -116,7 +119,8 @@ row_upd_rec_sys_fields( ut_ad(!buf_block_align(rec)->is_hashed || rw_lock_own(&btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ + ut_ad(!page_zip || page_zip_available(page_zip, 21)); - row_set_rec_trx_id(rec, index, offsets, trx->id); - row_set_rec_roll_ptr(rec, index, offsets, roll_ptr); + row_set_rec_trx_id(rec, page_zip, index, offsets, trx->id); + row_set_rec_roll_ptr(rec, page_zip, index, offsets, roll_ptr); } diff --git a/include/trx0sys.h b/include/trx0sys.h index 31e8607f8a0..2a31e63db35 100644 --- a/include/trx0sys.h +++ b/include/trx0sys.h @@ -23,6 +23,7 @@ Created 3/26/1996 Heikki Tuuri #include "fut0lst.h" #include "fsp0fsp.h" #include "read0types.h" +#include "page0types.h" /* In a MySQL replication slave, in crash recovery we store the master log file name and position here. We have successfully got the updates to InnoDB @@ -210,8 +211,10 @@ UNIV_INLINE void trx_write_trx_id( /*=============*/ - byte* ptr, /* in: pointer to memory where written */ - dulint id); /* in: id */ + byte* ptr, /* in: pointer to memory where written */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 10 bytes available, or NULL */ + dulint id); /* in: id */ /********************************************************************* Reads a trx id from an index page. In case that the id size changes in some future version, this function should be used instead of diff --git a/include/trx0sys.ic b/include/trx0sys.ic index 24610bef827..11bb0534c41 100644 --- a/include/trx0sys.ic +++ b/include/trx0sys.ic @@ -9,6 +9,7 @@ Created 3/26/1996 Heikki Tuuri #include "srv0srv.h" #include "trx0trx.h" #include "data0type.h" +#include "page0zip.h" /* The typedef for rseg slot in the file copy */ typedef byte trx_sysf_rseg_t; @@ -213,12 +214,18 @@ UNIV_INLINE void trx_write_trx_id( /*=============*/ - byte* ptr, /* in: pointer to memory where written */ - dulint id) /* in: id */ + byte* ptr, /* in: pointer to memory where written */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 10 bytes available, or NULL */ + dulint id) /* in: id */ { ut_ad(DATA_TRX_ID_LEN == 6); mach_write_to_6(ptr, id); + if (UNIV_LIKELY_NULL(page_zip)) { + ut_ad(page_zip_available(page_zip, 4 + DATA_TRX_ID_LEN)); + page_zip_write(page_zip, ptr, DATA_TRX_ID_LEN); + } } /********************************************************************* diff --git a/include/trx0undo.h b/include/trx0undo.h index bd7337e4f90..0453b25567e 100644 --- a/include/trx0undo.h +++ b/include/trx0undo.h @@ -55,6 +55,8 @@ void trx_write_roll_ptr( /*===============*/ byte* ptr, /* in: pointer to memory where written */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 11 bytes available, or NULL */ dulint roll_ptr); /* in: roll ptr */ /********************************************************************* Reads a roll ptr from an index page. 
In case that the roll ptr size diff --git a/include/trx0undo.ic b/include/trx0undo.ic index a04b234b495..2a1f539cee4 100644 --- a/include/trx0undo.ic +++ b/include/trx0undo.ic @@ -7,6 +7,7 @@ Created 3/26/1996 Heikki Tuuri *******************************************************/ #include "data0type.h" +#include "page0zip.h" /*************************************************************************** Builds a roll pointer dulint. */ @@ -87,12 +88,18 @@ UNIV_INLINE void trx_write_roll_ptr( /*===============*/ - byte* ptr, /* in: pointer to memory where written */ - dulint roll_ptr) /* in: roll ptr */ + byte* ptr, /* in: pointer to memory where written */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 11 bytes available, or NULL */ + dulint roll_ptr)/* in: roll ptr */ { ut_ad(DATA_ROLL_PTR_LEN == 7); mach_write_to_7(ptr, roll_ptr); + if (UNIV_LIKELY_NULL(page_zip)) { + ut_ad(page_zip_available(page_zip, 4 + DATA_ROLL_PTR_LEN)); + page_zip_write(page_zip, ptr, DATA_ROLL_PTR_LEN); + } } /********************************************************************* diff --git a/lock/lock0lock.c b/lock/lock0lock.c index 7844991613f..19f0808efdb 100644 --- a/lock/lock0lock.c +++ b/lock/lock0lock.c @@ -1289,25 +1289,17 @@ lock_t* lock_rec_get_next( /*==============*/ /* out: next lock, NULL if none exists */ - rec_t* rec, /* in: record on a page */ + ulint heap_no,/* in: heap number of the record */ lock_t* lock) /* in: lock */ { #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(lock_get_type(lock) == LOCK_REC); - if (page_rec_is_comp(rec)) { - do { - lock = lock_rec_get_next_on_page(lock); - } while (lock && !lock_rec_get_nth_bit(lock, - rec_get_heap_no(rec, TRUE))); - } else { - do { - lock = lock_rec_get_next_on_page(lock); - } while (lock && !lock_rec_get_nth_bit(lock, - rec_get_heap_no(rec, FALSE))); - } + do { + ut_ad(lock_get_type(lock) == LOCK_REC); + lock = lock_rec_get_next_on_page(lock); + } while (lock && !lock_rec_get_nth_bit(lock, heap_no)); return(lock); } @@ -1319,7 +1311,8 @@ lock_t* lock_rec_get_first( /*===============*/ /* out: first lock, NULL if none exists */ - rec_t* rec) /* in: record on a page */ + rec_t* rec, /* in: record on a page */ + ulint heap_no)/* in: heap number of the record */ { lock_t* lock; @@ -1329,8 +1322,6 @@ lock_rec_get_first( lock = lock_rec_get_first_on_page(rec); if (UNIV_LIKELY_NULL(lock)) { - ulint heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); - while (lock && !lock_rec_get_nth_bit(lock, heap_no)) { lock = lock_rec_get_next_on_page(lock); } @@ -1495,6 +1486,7 @@ lock_rec_has_expl( for a supremum record we regard this always a gap type request */ rec_t* rec, /* in: record */ + ulint heap_no,/* in: heap number of the record */ trx_t* trx) /* in: transaction */ { lock_t* lock; @@ -1506,7 +1498,7 @@ lock_rec_has_expl( || (precise_mode & LOCK_MODE_MASK) == LOCK_X); ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); - lock = lock_rec_get_first(rec); + lock = lock_rec_get_first(rec, heap_no); while (lock) { if (lock->trx == trx @@ -1524,7 +1516,7 @@ lock_rec_has_expl( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(heap_no, lock); } return(NULL); @@ -1543,6 +1535,7 @@ lock_rec_other_has_expl_req( ulint wait, /* in: LOCK_WAIT if also waiting locks are taken into account, or 0 if not */ rec_t* rec, /* in: record to look at */ + ulint heap_no,/* in: heap number of the record */ trx_t* trx) /* in: transaction, or NULL if requests by all transactions are taken into
account */ { @@ -1555,7 +1548,7 @@ lock_rec_other_has_expl_req( ut_ad(gap == 0 || gap == LOCK_GAP); ut_ad(wait == 0 || wait == LOCK_WAIT); - lock = lock_rec_get_first(rec); + lock = lock_rec_get_first(rec, heap_no); while (lock) { if (lock->trx != trx @@ -1567,7 +1560,7 @@ lock_rec_other_has_expl_req( return(lock); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(heap_no, lock); } return(NULL); @@ -1585,6 +1578,7 @@ lock_rec_other_has_conflicting( possibly ORed to LOCK_GAP or LOC_REC_NOT_GAP, LOCK_INSERT_INTENTION */ rec_t* rec, /* in: record to look at */ + ulint heap_no,/* in: heap number of the record */ trx_t* trx) /* in: our transaction */ { lock_t* lock; @@ -1592,16 +1586,30 @@ lock_rec_other_has_conflicting( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - lock = lock_rec_get_first(rec); + lock = lock_rec_get_first(rec, heap_no); - while (lock) { - if (lock_rec_has_to_wait(trx, mode, lock, - page_rec_is_supremum(rec))) { + if (UNIV_LIKELY_NULL(lock)) { + if (page_rec_is_supremum(rec)) { - return(lock); + do { + if (lock_rec_has_to_wait(trx, mode, lock, + TRUE)) { + return(lock); + } + + lock = lock_rec_get_next(heap_no, lock); + } while (lock); + } else { + + do { + if (lock_rec_has_to_wait(trx, mode, lock, + FALSE)) { + return(lock); + } + + lock = lock_rec_get_next(heap_no, lock); + } while (lock); } - - lock = lock_rec_get_next(rec, lock); } return(NULL); @@ -1617,19 +1625,14 @@ lock_rec_find_similar_on_page( /*==========================*/ /* out: lock or NULL */ ulint type_mode, /* in: lock type_mode field */ - rec_t* rec, /* in: record */ + ulint heap_no, /* in: heap number of the record */ + lock_t* lock, /* in: lock_rec_get_first_on_page() */ trx_t* trx) /* in: transaction */ { - lock_t* lock; - ulint heap_no; - #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); - lock = lock_rec_get_first_on_page(rec); - while (lock != NULL) { if (lock->trx == trx && lock->type_mode == type_mode @@ -1709,13 +1712,13 @@ lock_rec_create( ulint type_mode,/* in: lock mode and wait flag, type is ignored and replaced by LOCK_REC */ rec_t* rec, /* in: record on page */ + ulint heap_no,/* in: heap number of the record */ dict_index_t* index, /* in: index of record */ trx_t* trx) /* in: transaction */ { page_t* page; lock_t* lock; ulint page_no; - ulint heap_no; ulint space; ulint n_bits; ulint n_bytes; @@ -1727,9 +1730,8 @@ lock_rec_create( page = buf_frame_align(rec); space = buf_frame_get_space_id(page); page_no = buf_frame_get_page_no(page); - heap_no = rec_get_heap_no(rec, page_is_comp(page)); - ut_ad(!!page_is_comp(page) == index->table->comp); + ut_ad((ibool) !!page_is_comp(page) == index->table->comp); /* If rec is the supremum record, then we reset the gap and LOCK_REC_NOT_GAP bits, as all locks on the supremum are @@ -1806,6 +1808,7 @@ lock_rec_enqueue_waiting( { lock_t* lock; trx_t* trx; + ulint heap_no; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -1815,7 +1818,7 @@ lock_rec_enqueue_waiting( we do not enqueue a lock request if the query thread should be stopped anyway */ - if (que_thr_stop(thr)) { + if (UNIV_UNLIKELY(que_thr_stop(thr))) { ut_error; @@ -1824,7 +1827,7 @@ lock_rec_enqueue_waiting( trx = thr_get_trx(thr); - if (trx->dict_operation) { + if (UNIV_UNLIKELY(trx->dict_operation)) { ut_print_timestamp(stderr); fputs( " InnoDB: Error: a record lock wait happens in a dictionary operation!\n" @@ -1834,18 +1837,24 @@ lock_rec_enqueue_waiting( 
"InnoDB: Submit a detailed bug report to http://bugs.mysql.com\n", stderr); } - + + if (page_rec_is_comp(rec)) { + heap_no = rec_get_heap_no_new(rec); + } else { + heap_no = rec_get_heap_no_old(rec); + } + /* Enqueue the lock request that will wait to be granted */ - lock = lock_rec_create(type_mode | LOCK_WAIT, rec, index, trx); + lock = lock_rec_create(type_mode | LOCK_WAIT, rec, + heap_no, index, trx); /* Check if a deadlock occurs: if yes, remove the lock request and return an error code */ - if (lock_deadlock_occurs(lock, trx)) { + if (UNIV_UNLIKELY(lock_deadlock_occurs(lock, trx))) { lock_reset_lock_and_trx_wait(lock); - lock_rec_reset_nth_bit(lock, rec_get_heap_no(rec, - page_rec_is_comp(rec))); + lock_rec_reset_nth_bit(lock, heap_no); return(DB_DEADLOCK); } @@ -1891,25 +1900,24 @@ lock_rec_add_to_queue( ulint type_mode,/* in: lock mode, wait, gap etc. flags; type is ignored and replaced by LOCK_REC */ rec_t* rec, /* in: record on page */ + ulint heap_no,/* in: heap number of the record */ dict_index_t* index, /* in: index of record */ trx_t* trx) /* in: transaction */ { lock_t* lock; - lock_t* similar_lock = NULL; - ulint heap_no; - ibool somebody_waits = FALSE; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ + ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_S) || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, - rec, trx)); + rec, heap_no, trx)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_X) || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, - rec, trx)); + rec, heap_no, trx)); type_mode = type_mode | LOCK_REC; @@ -1929,32 +1937,37 @@ lock_rec_add_to_queue( /* Look for a waiting lock request on the same record or on a gap */ - heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); lock = lock_rec_get_first_on_page(rec); while (lock != NULL) { if (lock_get_wait(lock) && (lock_rec_get_nth_bit(lock, heap_no))) { - somebody_waits = TRUE; + goto somebody_waits; } lock = lock_rec_get_next_on_page(lock); } - /* Look for a similar record lock on the same page: if one is found - and there are no waiting lock requests, we can just set the bit */ + if (!(type_mode & LOCK_WAIT)) { - similar_lock = lock_rec_find_similar_on_page(type_mode, rec, trx); + /* Look for a similar record lock on the same page: + if one is found and there are no waiting lock requests, + we can just set the bit */ - if (similar_lock && !somebody_waits && !(type_mode & LOCK_WAIT)) { + lock = lock_rec_find_similar_on_page(type_mode, heap_no, + lock_rec_get_first_on_page(rec), trx); - lock_rec_set_nth_bit(similar_lock, heap_no); + if (lock) { - return(similar_lock); + lock_rec_set_nth_bit(lock, heap_no); + + return(lock); + } } - return(lock_rec_create(type_mode, rec, index, trx)); +somebody_waits: + return(lock_rec_create(type_mode, rec, heap_no, index, trx)); } /************************************************************************* @@ -1975,11 +1988,11 @@ lock_rec_lock_fast( ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ + ulint heap_no,/* in: heap number of record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ { lock_t* lock; - ulint heap_no; trx_t* trx; #ifdef UNIV_SYNC_DEBUG @@ -1995,15 +2008,13 @@ lock_rec_lock_fast( || mode - (LOCK_MODE_MASK & mode) == 0 || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); - heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); - 
lock = lock_rec_get_first_on_page(rec); trx = thr_get_trx(thr); if (lock == NULL) { if (!impl) { - lock_rec_create(mode, rec, index, trx); + lock_rec_create(mode, rec, heap_no, index, trx); if (srv_locks_unsafe_for_binlog) { trx_register_new_rec_lock(trx, index); @@ -2057,6 +2068,7 @@ lock_rec_lock_slow( ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ + ulint heap_no,/* in: heap number of record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ { @@ -2078,12 +2090,12 @@ lock_rec_lock_slow( trx = thr_get_trx(thr); - if (lock_rec_has_expl(mode, rec, trx)) { + if (lock_rec_has_expl(mode, rec, heap_no, trx)) { /* The trx already has a strong enough lock on rec: do nothing */ err = DB_SUCCESS; - } else if (lock_rec_other_has_conflicting(mode, rec, trx)) { + } else if (lock_rec_other_has_conflicting(mode, rec, heap_no, trx)) { /* If another transaction has a non-gap conflicting request in the queue, as this transaction does not have a lock strong @@ -2098,8 +2110,8 @@ lock_rec_lock_slow( if (!impl) { /* Set the requested lock on the record */ - lock_rec_add_to_queue(LOCK_REC | mode, rec, index, - trx); + lock_rec_add_to_queue(LOCK_REC | mode, rec, heap_no, + index, trx); if (srv_locks_unsafe_for_binlog) { trx_register_new_rec_lock(trx, index); } @@ -2133,6 +2145,7 @@ lock_rec_lock( que_thr_t* thr) /* in: query thread */ { ulint err; + ulint heap_no; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); @@ -2147,14 +2160,20 @@ lock_rec_lock( || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP || mode - (LOCK_MODE_MASK & mode) == 0); - if (lock_rec_lock_fast(impl, mode, rec, index, thr)) { + if (page_rec_is_comp(rec)) { + heap_no = rec_get_heap_no_new(rec); + } else { + heap_no = rec_get_heap_no_old(rec); + } + + if (lock_rec_lock_fast(impl, mode, rec, heap_no, index, thr)) { /* We try a simplified and faster subroutine for the most common cases */ err = DB_SUCCESS; } else { - err = lock_rec_lock_slow(impl, mode, rec, index, thr); + err = lock_rec_lock_slow(impl, mode, rec, heap_no, index, thr); } return(err); @@ -2405,9 +2424,13 @@ lock_rec_reset_and_release_wait( ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(rec, page_rec_is_comp(rec)); - - lock = lock_rec_get_first(rec); + if (page_rec_is_comp(rec)) { + heap_no = rec_get_heap_no_new(rec); + } else { + heap_no = rec_get_heap_no_old(rec); + } + + lock = lock_rec_get_first(rec, heap_no); while (lock != NULL) { if (lock_get_wait(lock)) { @@ -2416,7 +2439,7 @@ lock_rec_reset_and_release_wait( lock_rec_reset_nth_bit(lock, heap_no); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(heap_no, lock); } } @@ -2434,11 +2457,21 @@ lock_rec_inherit_to_gap( the locks on this record */ { lock_t* lock; + ulint heir_heap_no; + ulint heap_no; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - lock = lock_rec_get_first(rec); + if (page_rec_is_comp(rec)) { + heir_heap_no = rec_get_heap_no_new(heir); + heap_no = rec_get_heap_no_new(rec); + } else { + heir_heap_no = rec_get_heap_no_old(heir); + heap_no = rec_get_heap_no_old(rec); + } + + lock = lock_rec_get_first(rec, heap_no); /* If srv_locks_unsafe_for_binlog is TRUE, we do not want locks set by an UPDATE or a DELETE to be inherited as gap type locks. 
But we @@ -2452,10 +2485,11 @@ lock_rec_inherit_to_gap( lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock) | LOCK_GAP, - heir, lock->index, lock->trx); + heir, heir_heap_no, + lock->index, lock->trx); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(heap_no, lock); } } @@ -2472,11 +2506,21 @@ lock_rec_inherit_to_gap_if_gap_lock( the locks on this record */ { lock_t* lock; + ulint heir_heap_no; + ulint heap_no; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - lock = lock_rec_get_first(rec); + if (page_rec_is_comp(rec)) { + heir_heap_no = rec_get_heap_no_new(heir); + heap_no = rec_get_heap_no_new(rec); + } else { + heir_heap_no = rec_get_heap_no_old(heir); + heap_no = rec_get_heap_no_old(rec); + } + + lock = lock_rec_get_first(rec, heap_no); while (lock != NULL) { if (!lock_rec_get_insert_intention(lock) @@ -2485,10 +2529,11 @@ lock_rec_inherit_to_gap_if_gap_lock( lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock) | LOCK_GAP, - heir, lock->index, lock->trx); + heir, heir_heap_no, + lock->index, lock->trx); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(heap_no, lock); } } @@ -2505,37 +2550,44 @@ lock_rec_move( ulint comp) /* in: nonzero=compact page format */ { lock_t* lock; - ulint heap_no; + ulint receiver_heap_no; + ulint donator_heap_no; ulint type_mode; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&kernel_mutex)); #endif /* UNIV_SYNC_DEBUG */ - heap_no = rec_get_heap_no(donator, comp); - - lock = lock_rec_get_first(donator); + if (UNIV_LIKELY(comp)) { + receiver_heap_no = rec_get_heap_no_new(receiver); + donator_heap_no = rec_get_heap_no_new(donator); + } else { + receiver_heap_no = rec_get_heap_no_old(receiver); + donator_heap_no = rec_get_heap_no_old(donator); + } - ut_ad(lock_rec_get_first(receiver) == NULL); + lock = lock_rec_get_first(donator, donator_heap_no); + + ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL); while (lock != NULL) { type_mode = lock->type_mode; - lock_rec_reset_nth_bit(lock, heap_no); + lock_rec_reset_nth_bit(lock, donator_heap_no); - if (lock_get_wait(lock)) { + if (UNIV_UNLIKELY(lock_get_wait(lock))) { lock_reset_lock_and_trx_wait(lock); } /* Note that we FIRST reset the bit, and then set the lock: the function works also if donator == receiver */ - lock_rec_add_to_queue(type_mode, receiver, lock->index, - lock->trx); - lock = lock_rec_get_next(donator, lock); + lock_rec_add_to_queue(type_mode, receiver, receiver_heap_no, + lock->index, lock->trx); + lock = lock_rec_get_next(donator_heap_no, lock); } - ut_ad(lock_rec_get_first(donator) == NULL); + ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL); } /***************************************************************** @@ -2555,9 +2607,9 @@ lock_move_reorganize_page( page_cur_t cur1; page_cur_t cur2; ulint old_heap_no; + ulint new_heap_no; UT_LIST_BASE_NODE_T(lock_t) old_locks; mem_heap_t* heap = NULL; - rec_t* sup; ulint comp; lock_mutex_enter_kernel(); @@ -2595,8 +2647,6 @@ lock_move_reorganize_page( lock = lock_rec_get_next_on_page(lock); } - sup = page_get_supremum_rec(page); - lock = UT_LIST_GET_FIRST(old_locks); comp = page_is_comp(page); @@ -2617,8 +2667,17 @@ lock_move_reorganize_page( page_cur_get_rec(&cur2), rec_get_data_size_old( page_cur_get_rec(&cur2)))); - old_heap_no = rec_get_heap_no(page_cur_get_rec(&cur2), - comp); + if (UNIV_LIKELY(comp)) { + old_heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + new_heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + old_heap_no = 
rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + new_heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + } if (lock_rec_get_nth_bit(lock, old_heap_no)) { @@ -2627,9 +2686,10 @@ lock_move_reorganize_page( lock_rec_add_to_queue(lock->type_mode, page_cur_get_rec(&cur1), + new_heap_no, lock->index, lock->trx); - /* if ((page_cur_get_rec(&cur1) == sup) + /* if ((page_cur_is_after_last(&cur1)) && lock_get_wait(lock)) { fprintf(stderr, "---\n--\n!!!Lock reorg: supr type %lu\n", @@ -2637,7 +2697,7 @@ lock_move_reorganize_page( } */ } - if (page_cur_get_rec(&cur1) == sup) { + if (page_cur_is_after_last(&cur1)) { break; } @@ -2675,9 +2735,8 @@ lock_move_rec_list_end( page_cur_t cur1; page_cur_t cur2; ulint heap_no; - rec_t* sup; ulint type_mode; - ulint comp; + ut_ad(page_is_comp(page) == page_is_comp(new_page)); ut_ad(page == buf_frame_align(rec)); lock_mutex_enter_kernel(); @@ -2688,12 +2747,8 @@ lock_move_rec_list_end( table to the end of the hash chain, and lock_rec_add_to_queue does not reuse locks if there are waiters in the queue. */ - sup = page_get_supremum_rec(page); - lock = lock_rec_get_first_on_page(page); - comp = page_is_comp(page); - while (lock != NULL) { page_cur_position(rec, &cur1); @@ -2708,13 +2763,19 @@ lock_move_rec_list_end( /* Copy lock requests on user records to new page and reset the lock bits on the old */ - while (page_cur_get_rec(&cur1) != sup) { - ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), + while (!page_cur_is_after_last(&cur1)) { + ut_ad(page_is_comp(page) + || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), rec_get_data_size_old( page_cur_get_rec(&cur2)))); - heap_no = rec_get_heap_no(page_cur_get_rec(&cur1), - comp); + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + } if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2725,8 +2786,17 @@ lock_move_rec_list_end( lock_reset_lock_and_trx_wait(lock); } + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); + } + lock_rec_add_to_queue(type_mode, page_cur_get_rec(&cur2), + heap_no, lock->index, lock->trx); } @@ -2764,15 +2834,13 @@ lock_move_rec_list_start( page_cur_t cur2; ulint heap_no; ulint type_mode; - ulint comp; ut_a(new_page); lock_mutex_enter_kernel(); lock = lock_rec_get_first_on_page(page); - comp = page_is_comp(page); - ut_ad(comp == page_is_comp(new_page)); + ut_ad(page_is_comp(page) == page_is_comp(new_page)); ut_ad(page == buf_frame_align(rec)); while (lock != NULL) { @@ -2787,12 +2855,18 @@ lock_move_rec_list_start( reset the lock bits on the old */ while (page_cur_get_rec(&cur1) != rec) { - ut_ad(comp || 0 == ut_memcmp(page_cur_get_rec(&cur1), + ut_ad(page_is_comp(page) + || 0 == ut_memcmp(page_cur_get_rec(&cur1), page_cur_get_rec(&cur2), rec_get_data_size_old( page_cur_get_rec(&cur2)))); - heap_no = rec_get_heap_no(page_cur_get_rec(&cur1), - comp); + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur1)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur1)); + } if (lock_rec_get_nth_bit(lock, heap_no)) { type_mode = lock->type_mode; @@ -2803,8 +2877,17 @@ lock_move_rec_list_start( lock_reset_lock_and_trx_wait(lock); } + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new( + page_cur_get_rec(&cur2)); + } else { + heap_no = rec_get_heap_no_old( + page_cur_get_rec(&cur2)); 
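/* Editor's illustration, not part of this patch: a record lock covers a
whole page and keeps one bit per heap number in a bitmap stored after the
lock struct, which is why the routines above only need heap_no to move
lock bits from the old page records to the new ones.  A self-contained
sketch of the bit arithmetic, assuming the usual byte = n / 8,
bit = n % 8 layout: */

#include <stdio.h>

typedef unsigned long	ulint;

static void
bitmap_set_nth_bit(unsigned char* bitmap, ulint n)
{
	bitmap[n / 8] |= (unsigned char) (1U << (n % 8));
}

static void
bitmap_reset_nth_bit(unsigned char* bitmap, ulint n)
{
	bitmap[n / 8] &= (unsigned char) ~(1U << (n % 8));
}

static ulint
bitmap_get_nth_bit(const unsigned char* bitmap, ulint n)
{
	return((bitmap[n / 8] >> (n % 8)) & 1);
}

int
main(void)
{
	unsigned char	bitmap[16] = {0};
	ulint		heap_no = 5;	/* some user record on the page */

	bitmap_set_nth_bit(bitmap, heap_no);	/* grant a lock on it   */
	printf("bit %lu: %lu\n", heap_no, bitmap_get_nth_bit(bitmap, heap_no));

	bitmap_reset_nth_bit(bitmap, heap_no);	/* move the lock away   */
	printf("bit %lu: %lu\n", heap_no, bitmap_get_nth_bit(bitmap, heap_no));

	return(0);
}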
+ } + lock_rec_add_to_queue(type_mode, page_cur_get_rec(&cur2), + heap_no, lock->index, lock->trx); } @@ -2832,16 +2915,15 @@ lock_update_split_right( page_t* right_page, /* in: right page */ page_t* left_page) /* in: left page */ { - ulint comp; lock_mutex_enter_kernel(); - comp = page_is_comp(left_page); - ut_ad(comp == page_is_comp(right_page)); + ut_ad(page_is_comp(left_page) == page_is_comp(right_page)); /* Move the locks on the supremum of the left page to the supremum of the right page */ lock_rec_move(page_get_supremum_rec(right_page), - page_get_supremum_rec(left_page), comp); + page_get_supremum_rec(left_page), + page_is_comp(left_page)); /* Inherit the locks to the supremum of left page from the successor of the infimum on right page */ @@ -3196,7 +3278,7 @@ retry: goto retry; } - if (ret == LOCK_VICTIM_IS_START) { + if (UNIV_UNLIKELY(ret == LOCK_VICTIM_IS_START)) { if (lock_get_type(lock) & LOCK_TABLE) { table = lock->un_member.tab_lock.table; index = NULL; @@ -4403,6 +4485,7 @@ lock_rec_queue_validate( { trx_t* impl_trx; lock_t* lock; + ulint heap_no; ut_a(rec); ut_ad(rec_offs_validate(rec, index, offsets)); @@ -4410,9 +4493,15 @@ lock_rec_queue_validate( lock_mutex_enter_kernel(); + if (page_rec_is_comp(rec)) { + heap_no = rec_get_heap_no_new(rec); + } else { + heap_no = rec_get_heap_no_old(rec); + } + if (!page_rec_is_user_rec(rec)) { - lock = lock_rec_get_first(rec); + lock = lock_rec_get_first(rec, heap_no); while (lock) { ut_a(lock->trx->conc_state == TRX_ACTIVE @@ -4430,7 +4519,7 @@ lock_rec_queue_validate( ut_a(lock->index == index); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(heap_no, lock); } lock_mutex_exit_kernel(); @@ -4438,19 +4527,18 @@ lock_rec_queue_validate( return(TRUE); } - if (index && (index->type & DICT_CLUSTERED)) { + if (!index); + else if (index->type & DICT_CLUSTERED) { impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, - LOCK_WAIT, rec, impl_trx)) { + LOCK_WAIT, rec, heap_no, impl_trx)) { ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)); + heap_no, impl_trx)); } - } - - if (index && !(index->type & DICT_CLUSTERED)) { + } else { /* The kernel mutex may get released temporarily in the next function call: we have to release lock table mutex @@ -4460,14 +4548,14 @@ lock_rec_queue_validate( rec, index, offsets); if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, - LOCK_WAIT, rec, impl_trx)) { + LOCK_WAIT, rec, heap_no, impl_trx)) { ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, - rec, impl_trx)); + rec, heap_no, impl_trx)); } } - lock = lock_rec_get_first(rec); + lock = lock_rec_get_first(rec, heap_no); while (lock) { ut_a(lock->trx->conc_state == TRX_ACTIVE @@ -4488,15 +4576,15 @@ lock_rec_queue_validate( } else { mode = LOCK_S; } - ut_a(!lock_rec_other_has_expl_req(mode, - 0, 0, rec, lock->trx)); + ut_a(!lock_rec_other_has_expl_req(mode, 0, 0, + rec, heap_no, lock->trx)); } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { ut_a(lock_rec_has_to_wait_in_queue(lock)); } - lock = lock_rec_get_next(rec, lock); + lock = lock_rec_get_next(heap_no, lock); } lock_mutex_exit_kernel(); @@ -4704,6 +4792,7 @@ lock_rec_insert_check_and_lock( trx_t* trx; lock_t* lock; ulint err; + ulint next_rec_heap_no; if (flags & BTR_NO_LOCKING_FLAG) { @@ -4715,15 +4804,19 @@ lock_rec_insert_check_and_lock( trx = thr_get_trx(thr); next_rec = page_rec_get_next(rec); - *inherit = FALSE; - lock_mutex_enter_kernel(); ut_ad(lock_table_has(thr_get_trx(thr), 
index->table, LOCK_IX)); - lock = lock_rec_get_first(next_rec); + if (page_rec_is_comp(next_rec)) { + next_rec_heap_no = rec_get_heap_no_new(next_rec); + } else { + next_rec_heap_no = rec_get_heap_no_old(next_rec); + } - if (lock == NULL) { + lock = lock_rec_get_first(next_rec, next_rec_heap_no); + + if (UNIV_LIKELY(lock == NULL)) { /* We optimize CPU time usage in the simplest case */ lock_mutex_exit_kernel(); @@ -4735,6 +4828,8 @@ lock_rec_insert_check_and_lock( thr_get_trx(thr)->id); } + *inherit = FALSE; + return(DB_SUCCESS); } @@ -4751,7 +4846,8 @@ lock_rec_insert_check_and_lock( on the successor, which produced an unnecessary deadlock. */ if (lock_rec_other_has_conflicting(LOCK_X | LOCK_GAP - | LOCK_INSERT_INTENTION, next_rec, trx)) { + | LOCK_INSERT_INTENTION, + next_rec, next_rec_heap_no, trx)) { /* Note that we may get DB_SUCCESS also here! */ err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP @@ -4821,12 +4917,20 @@ lock_rec_convert_impl_to_expl( /* If the transaction has no explicit x-lock set on the record, set one for it */ + ulint heap_no; + + if (page_rec_is_comp(rec)) { + heap_no = rec_get_heap_no_new(rec); + } else { + heap_no = rec_get_heap_no_old(rec); + } + if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, - impl_trx)) { + heap_no, impl_trx)) { lock_rec_add_to_queue(LOCK_REC | LOCK_X - | LOCK_REC_NOT_GAP, rec, index, - impl_trx); + | LOCK_REC_NOT_GAP, rec, heap_no, + index, impl_trx); } } } diff --git a/log/log0recv.c b/log/log0recv.c index 42e854398ba..49a2343cd96 100644 --- a/log/log0recv.c +++ b/log/log0recv.c @@ -22,6 +22,7 @@ Created 9/20/1997 Heikki Tuuri #include "mtr0log.h" #include "page0page.h" #include "page0cur.h" +#include "page0zip.h" #include "btr0btr.h" #include "btr0cur.h" #include "ibuf0ibuf.h" @@ -753,9 +754,10 @@ recv_parse_or_apply_log_rec_body( byte type, /* in: type */ byte* ptr, /* in: pointer to a buffer */ byte* end_ptr,/* in: pointer to the buffer end */ - page_t* page, /* in: buffer page or NULL; if not NULL, then the log - record is applied to the page, and the log record + page_t* page, /* in/out: buffer page or NULL; if not NULL, then the + log record is applied to the page, and the log record should be complete then */ + page_zip_des_t* page_zip,/* in/out: compressed page or NULL */ mtr_t* mtr) /* in: mtr or NULL; should be non-NULL if and only if page is non-NULL */ { @@ -771,7 +773,7 @@ recv_parse_or_apply_log_rec_body( ut_a(!page || (ibool)!!page_is_comp(page)==index->table->comp); ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr, - index, page, mtr); + index, page, page_zip, mtr); } break; case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK: @@ -780,7 +782,7 @@ recv_parse_or_apply_log_rec_body( ut_a(!page || (ibool)!!page_is_comp(page)==index->table->comp); ptr = btr_cur_parse_del_mark_set_clust_rec(ptr, - end_ptr, index, page); + end_ptr, page, page_zip, index); } break; case MLOG_COMP_REC_SEC_DELETE_MARK: @@ -793,7 +795,8 @@ recv_parse_or_apply_log_rec_body( } /* Fall through */ case MLOG_REC_SEC_DELETE_MARK: - ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, page); + ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr, + page, page_zip); break; case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE: if (NULL != (ptr = mlog_parse_index(ptr, end_ptr, @@ -801,7 +804,7 @@ recv_parse_or_apply_log_rec_body( ut_a(!page || (ibool)!!page_is_comp(page)==index->table->comp); ptr = btr_cur_parse_update_in_place(ptr, end_ptr, - page, index); + page, page_zip, index); } break; case MLOG_LIST_END_DELETE: 
case MLOG_COMP_LIST_END_DELETE: @@ -821,7 +824,7 @@ recv_parse_or_apply_log_rec_body( ut_a(!page || (ibool)!!page_is_comp(page)==index->table->comp); ptr = page_parse_copy_rec_list_to_created_page(ptr, - end_ptr, index, page, mtr); + end_ptr, index, page, page_zip, mtr); } break; case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE: @@ -864,7 +867,7 @@ recv_parse_or_apply_log_rec_body( ut_a(!page || (ibool)!!page_is_comp(page)==index->table->comp); ptr = page_cur_parse_delete_rec(ptr, end_ptr, - index, page, mtr); + index, page, page_zip, mtr); } break; case MLOG_IBUF_BITMAP_INIT: @@ -882,6 +885,16 @@ recv_parse_or_apply_log_rec_body( ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, FALSE, ULINT_UNDEFINED); break; + case MLOG_COMP_DECOMPRESS: + if (page) { + ut_a(page_is_comp(page)); + ut_a(page_zip); + if (UNIV_UNLIKELY(!page_zip_decompress( + page_zip, page, NULL))) { + ut_error; + } + } + break; default: ptr = NULL; recv_sys->found_corrupt_log = TRUE; @@ -1089,6 +1102,7 @@ recv_recover_page( ulint page_no) /* in: page number */ { buf_block_t* block = NULL; + page_zip_des_t* page_zip = NULL; recv_addr_t* recv_addr; recv_t* recv; byte* buf; @@ -1133,6 +1147,7 @@ recv_recover_page( if (!recover_backup) { block = buf_block_align(page); + page_zip = buf_block_get_page_zip(block); if (just_read_in) { /* Move the ownership of the x-latch on the page to this OS @@ -1220,7 +1235,8 @@ recv_recover_page( #endif /* UNIV_DEBUG */ recv_parse_or_apply_log_rec_body(recv->type, buf, - buf + recv->len, page, &mtr); + buf + recv->len, + page, page_zip, &mtr); mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, ut_dulint_add(recv->start_lsn, @@ -1613,8 +1629,8 @@ recv_update_replicate( buf_page_dbg_add_level(replica, SYNC_NO_ORDER_CHECK); #endif /* UNIV_SYNC_DEBUG */ - ptr = recv_parse_or_apply_log_rec_body(type, body, end_ptr, replica, - &mtr); + ptr = recv_parse_or_apply_log_rec_body(type, body, end_ptr, + replica, NULL, &mtr); ut_a(ptr == end_ptr); /* Notify the buffer manager that the page has been updated */ @@ -1845,7 +1861,7 @@ recv_parse_log_rec( } new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr, - NULL, NULL); + NULL, NULL, NULL); if (UNIV_UNLIKELY(new_ptr == NULL)) { return(0); diff --git a/page/Makefile.am b/page/Makefile.am index 2e260787438..67dc20fc594 100644 --- a/page/Makefile.am +++ b/page/Makefile.am @@ -19,6 +19,6 @@ include ../include/Makefile.i noinst_LIBRARIES = libpage.a -libpage_a_SOURCES = page0page.c page0cur.c +libpage_a_SOURCES = page0page.c page0cur.c page0zip.c EXTRA_PROGRAMS = diff --git a/page/page0cur.c b/page/page0cur.c index d0b89e81787..6550bc93539 100644 --- a/page/page0cur.c +++ b/page/page0cur.c @@ -11,6 +11,7 @@ Created 10/4/1994 Heikki Tuuri #include "page0cur.ic" #endif +#include "page0zip.h" #include "rem0cmp.h" #include "mtr0log.h" #include "log0recv.h" @@ -483,7 +484,7 @@ page_cur_open_on_rnd_user_rec( ulint rnd; rec_t* rec; - if (page_get_n_recs(page) == 0) { + if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) { page_cur_position(page_get_infimum_rec(page), cursor); return; @@ -522,19 +523,14 @@ page_cur_insert_rec_write_log( ulint cur_rec_size; ulint extra_size; ulint cur_extra_size; - ulint min_rec_size; - byte* ins_ptr; - byte* cur_ptr; - ulint extra_info_yes; + const byte* ins_ptr; byte* log_ptr; - byte* log_end; + const byte* log_end; ulint i; - ulint comp; ut_a(rec_size < UNIV_PAGE_SIZE); ut_ad(buf_frame_align(insert_rec) == buf_frame_align(cursor_rec)); ut_ad(!page_rec_is_comp(insert_rec) == !index->table->comp); 
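/* Editor's illustration, not part of this patch: as reworked below, the
insert redo record stores the length of the record end segment (the bytes
that differ from the cursor record) shifted left by one; the low-order bit
flags whether the info bits and origin offset follow, replacing the
separate extra_info_yes variable.  A self-contained sketch of that
encoding, assuming this reading of page_cur_insert_rec_write_log() and
page_cur_parse_insert_rec(): */

#include <assert.h>
#include <stdio.h>

typedef unsigned long	ulint;

int
main(void)
{
	ulint	end_seg_len = 37;	/* bytes differing from cursor rec */
	ulint	extra_info = 1;		/* nonzero: extra info follows     */
	ulint	field;

	/* encode, as on the write side */
	field = 2 * end_seg_len + (extra_info ? 1 : 0);

	/* decode, as on the parse side */
	assert((field & 0x1UL) == extra_info);
	assert((field >> 1) == end_seg_len);

	printf("field %lu -> len %lu, extra info %lu\n",
	       field, field >> 1, field & 0x1UL);

	return(0);
}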
- comp = page_rec_is_comp(insert_rec); { mem_heap_t* heap = NULL; @@ -567,45 +563,55 @@ page_cur_insert_rec_write_log( i = 0; if (cur_extra_size == extra_size) { - min_rec_size = ut_min(cur_rec_size, rec_size); + ulint min_rec_size = ut_min(cur_rec_size, rec_size); - cur_ptr = cursor_rec - cur_extra_size; + const byte* cur_ptr = cursor_rec - cur_extra_size; /* Find out the first byte in insert_rec which differs from cursor_rec; skip the bytes in the record info */ - for (;;) { - if (i >= min_rec_size) { - - break; - } else if (*ins_ptr == *cur_ptr) { + do { + if (*ins_ptr == *cur_ptr) { i++; ins_ptr++; cur_ptr++; } else if ((i < extra_size) - && (i >= extra_size - (comp - ? REC_N_NEW_EXTRA_BYTES - : REC_N_OLD_EXTRA_BYTES))) { + && (i >= extra_size - + page_rec_get_base_extra_size( + insert_rec))) { i = extra_size; ins_ptr = insert_rec; cur_ptr = cursor_rec; } else { break; } - } + } while (i < min_rec_size); } if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) { - log_ptr = mlog_open_and_write_index(mtr, insert_rec, index, - comp - ? MLOG_COMP_REC_INSERT : MLOG_REC_INSERT, - 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (page_rec_is_comp(insert_rec)) { + log_ptr = mlog_open_and_write_index(mtr, insert_rec, + index, MLOG_COMP_REC_INSERT, + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } + } else { + log_ptr = mlog_open(mtr, 11 + + 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN); + if (UNIV_UNLIKELY(!log_ptr)) { + /* Logging in mtr is switched off + during crash recovery: in that case + mlog_open returns NULL */ + return; + } - if (!log_ptr) { - /* Logging in mtr is switched off during crash - recovery: in that case mlog_open returns NULL */ - return; + log_ptr = mlog_write_initial_log_record_fast( + insert_rec, MLOG_REC_INSERT, log_ptr, mtr); } log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; @@ -623,24 +629,33 @@ page_cur_insert_rec_write_log( log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN]; } - if ((rec_get_info_and_status_bits(insert_rec, comp) != - rec_get_info_and_status_bits(cursor_rec, comp)) - || (extra_size != cur_extra_size) - || (rec_size != cur_rec_size)) { + if (page_rec_is_comp(insert_rec)) { + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, TRUE) != + rec_get_info_and_status_bits(cursor_rec, TRUE))) { - extra_info_yes = 1; + goto need_extra_info; + } } else { - extra_info_yes = 0; + if (UNIV_UNLIKELY + (rec_get_info_and_status_bits(insert_rec, FALSE) != + rec_get_info_and_status_bits(cursor_rec, FALSE))) { + + goto need_extra_info; + } } - - /* Write the record end segment length and the extra info storage - flag */ - log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i) - + extra_info_yes); - if (extra_info_yes) { + + if (extra_size != cur_extra_size || rec_size != cur_rec_size) { +need_extra_info: + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, + 2 * (rec_size - i) + 1); + /* Write the info bits */ mach_write_to_1(log_ptr, - rec_get_info_and_status_bits(insert_rec, comp)); + rec_get_info_and_status_bits(insert_rec, + page_rec_is_comp(insert_rec))); log_ptr++; /* Write the record origin offset */ @@ -651,8 +666,12 @@ page_cur_insert_rec_write_log( ut_a(i < UNIV_PAGE_SIZE); ut_a(extra_size < UNIV_PAGE_SIZE); + } else { + /* Write the record end segment length + and the extra info storage flag */ + log_ptr += mach_write_compressed(log_ptr, 2 * 
(rec_size - i)); } - + /* Write to the log the inserted index record end segment which differs from the cursor record */ @@ -679,10 +698,11 @@ page_cur_parse_insert_rec( byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { - ulint extra_info_yes; ulint offset = 0; /* remove warning */ ulint origin_offset; ulint end_seg_len; @@ -725,16 +745,13 @@ page_cur_parse_insert_rec( return(NULL); } - extra_info_yes = end_seg_len & 0x1UL; - end_seg_len >>= 1; - - if (end_seg_len >= UNIV_PAGE_SIZE) { + if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) { recv_sys->found_corrupt_log = TRUE; return(NULL); } - if (extra_info_yes) { + if (end_seg_len & 0x1UL) { /* Read the info bits */ if (end_ptr < ptr + 1) { @@ -764,17 +781,18 @@ page_cur_parse_insert_rec( ut_a(mismatch_index < UNIV_PAGE_SIZE); } - if (end_ptr < ptr + end_seg_len) { + if (end_ptr < ptr + (end_seg_len >> 1)) { return(NULL); } if (page == NULL) { - return(ptr + end_seg_len); + return(ptr + (end_seg_len >> 1)); } - ut_ad(!!page_is_comp(page) == index->table->comp); + ut_ad((ibool) !!page_is_comp(page) == index->table->comp); + ut_ad(!page_zip || page_is_comp(page)); /* Read from the log the inserted index record end segment which differs from the cursor record */ @@ -788,12 +806,14 @@ page_cur_parse_insert_rec( offsets = rec_get_offsets(cursor_rec, index, offsets, ULINT_UNDEFINED, &heap); - if (extra_info_yes == 0) { + if (!(end_seg_len & 0x1UL)) { info_and_status_bits = rec_get_info_and_status_bits( cursor_rec, page_is_comp(page)); origin_offset = rec_offs_extra_size(offsets); - mismatch_index = rec_offs_size(offsets) - end_seg_len; + mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1); } + + end_seg_len >>= 1; if (mismatch_index + end_seg_len < sizeof buf1) { buf = buf1; @@ -803,7 +823,7 @@ page_cur_parse_insert_rec( /* Build the inserted record to buf */ - if (mismatch_index >= UNIV_PAGE_SIZE) { + if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) { fprintf(stderr, "Is short %lu, info_and_status_bits %lu, offset %lu, " "o_offset %lu\n" @@ -826,14 +846,24 @@ page_cur_parse_insert_rec( ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index); ut_memcpy(buf + mismatch_index, ptr, end_seg_len); - rec_set_info_and_status_bits(buf + origin_offset, page_is_comp(page), + if (page_is_comp(page)) { + rec_set_info_and_status_bits(buf + origin_offset, NULL, info_and_status_bits); + } else { + rec_set_info_bits_old(buf + origin_offset, + info_and_status_bits); + } page_cur_position(cursor_rec, &cursor); offsets = rec_get_offsets(buf + origin_offset, index, offsets, ULINT_UNDEFINED, &heap); - page_cur_rec_insert(&cursor, buf + origin_offset, index, offsets, mtr); + if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor, page_zip, + buf + origin_offset, index, offsets, mtr))) { + /* The redo log record should only have been written + after the write was successful. 
*/ + ut_error; + } if (buf != buf1) { @@ -859,6 +889,8 @@ page_cur_insert_rec_low( /* out: pointer to record if succeed, NULL otherwise */ page_cur_t* cursor, /* in: a page cursor */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 25 + rec_size bytes available, or NULL */ dtuple_t* tuple, /* in: pointer to a data tuple or NULL */ dict_index_t* index, /* in: record descriptor */ rec_t* rec, /* in: pointer to a physical record or NULL */ @@ -873,14 +905,7 @@ page_cur_insert_rec_low( ulint heap_no; /* heap number of the inserted record */ rec_t* current_rec; /* current record after which the new record is inserted */ - rec_t* next_rec; /* next record after current before - the insertion */ - ulint owner_slot; /* the slot which owns the - inserted record */ - rec_t* owner_rec; - ulint n_owned; mem_heap_t* heap = NULL; - ulint comp; ut_ad(cursor && mtr); ut_ad(tuple || rec); @@ -888,10 +913,9 @@ page_cur_insert_rec_low( ut_ad(rec || dtuple_check_typed(tuple)); page = page_cur_get_page(cursor); - comp = page_is_comp(page); - ut_ad(index->table->comp == !!comp); + ut_ad(index->table->comp == (ibool) !!page_is_comp(page)); - ut_ad(cursor->rec != page_get_supremum_rec(page)); + ut_ad(!page_rec_is_supremum(cursor->rec)); /* 1. Get the size of the physical record in the page */ if (tuple != NULL) { @@ -905,10 +929,20 @@ page_cur_insert_rec_low( rec_size = rec_offs_size(offsets); } - /* 2. Try to find suitable space from page memory management */ - insert_buf = page_mem_alloc(page, rec_size, index, &heap_no); + if (UNIV_LIKELY_NULL(page_zip)) { + if (UNIV_UNLIKELY(!page_zip_alloc( + page_zip, page, 25 + rec_size))) { - if (insert_buf == NULL) { + goto err_exit; + } + } + + /* 2. Try to find suitable space from page memory management */ + insert_buf = page_mem_alloc(page, page_zip, rec_size, + index, &heap_no); + + if (UNIV_UNLIKELY(insert_buf == NULL)) { +err_exit: if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -933,66 +967,95 @@ page_cur_insert_rec_low( /* 4. Insert the record in the linked list of records */ current_rec = cursor->rec; - ut_ad(!comp || rec_get_status(current_rec) <= REC_STATUS_INFIMUM); - ut_ad(!comp || rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + { + /* next record after current before the insertion */ + rec_t* next_rec = page_rec_get_next(current_rec); +#ifdef UNIV_DEBUG + if (page_is_comp(page)) { + ut_ad(rec_get_status(current_rec) + <= REC_STATUS_INFIMUM); + ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM); + ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM); + } +#endif + page_rec_set_next(insert_rec, next_rec, NULL); + page_rec_set_next(current_rec, insert_rec, page_zip); + } - next_rec = page_rec_get_next(current_rec); - ut_ad(!comp || rec_get_status(next_rec) != REC_STATUS_INFIMUM); - page_rec_set_next(insert_rec, next_rec); - page_rec_set_next(current_rec, insert_rec); - - page_header_set_field(page, PAGE_N_RECS, 1 + page_get_n_recs(page)); + page_header_set_field(page, page_zip, PAGE_N_RECS, + 1 + page_get_n_recs(page)); /* 5. Set the n_owned field in the inserted record to zero, and set the heap_no field */ - - rec_set_n_owned(insert_rec, comp, 0); - rec_set_heap_no(insert_rec, comp, heap_no); + if (page_is_comp(page)) { + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, NULL, heap_no); + } else { + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, heap_no); + } /* 6. 
Update the last insertion info in page header */ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT); - ut_ad(!last_insert || !comp + ut_ad(!last_insert || !page_is_comp(page) || rec_get_node_ptr_flag(last_insert) == rec_get_node_ptr_flag(insert_rec)); - if (last_insert == NULL) { - page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); - page_header_set_field(page, PAGE_N_DIRECTION, 0); + if (UNIV_UNLIKELY(last_insert == NULL)) { + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); } else if ((last_insert == current_rec) && (page_header_get_field(page, PAGE_DIRECTION) != PAGE_LEFT)) { - page_header_set_field(page, PAGE_DIRECTION, PAGE_RIGHT); - page_header_set_field(page, PAGE_N_DIRECTION, + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_RIGHT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, page_header_get_field(page, PAGE_N_DIRECTION) + 1); } else if ((page_rec_get_next(insert_rec) == last_insert) && (page_header_get_field(page, PAGE_DIRECTION) != PAGE_RIGHT)) { - page_header_set_field(page, PAGE_DIRECTION, PAGE_LEFT); - page_header_set_field(page, PAGE_N_DIRECTION, + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_LEFT); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, page_header_get_field(page, PAGE_N_DIRECTION) + 1); } else { - page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); - page_header_set_field(page, PAGE_N_DIRECTION, 0); + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); } - page_header_set_ptr(page, PAGE_LAST_INSERT, insert_rec); + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec); /* 7. It remains to update the owner record. */ - - owner_rec = page_rec_find_owner_rec(insert_rec); - n_owned = rec_get_n_owned(owner_rec, comp); - rec_set_n_owned(owner_rec, comp, n_owned + 1); + { + rec_t* owner_rec = page_rec_find_owner_rec(insert_rec); + ulint n_owned; + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(owner_rec); + rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1); + } else { + n_owned = rec_get_n_owned_old(owner_rec); + rec_set_n_owned_old(owner_rec, n_owned + 1); + } - /* 8. Now we have incremented the n_owned field of the owner - record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, - we have to split the corresponding directory slot in two. */ + /* 8. Now we have incremented the n_owned field of the owner + record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED, + we have to split the corresponding directory slot in two. */ - if (n_owned == PAGE_DIR_SLOT_MAX_N_OWNED) { - owner_slot = page_dir_find_owner_slot(owner_rec); - page_dir_split_slot(page, owner_slot); + if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) { + page_dir_split_slot(page, page_zip, + page_dir_find_owner_slot(owner_rec)); + } + } + + if (UNIV_LIKELY_NULL(page_zip)) { + page_zip_write(page_zip, + insert_rec - rec_offs_extra_size(offsets), + rec_size); } /* 9. 
Write log record of the insert */ @@ -1041,7 +1104,8 @@ page_parse_copy_rec_list_to_created_page( byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { byte* rec_end; @@ -1069,14 +1133,15 @@ page_parse_copy_rec_list_to_created_page( while (ptr < rec_end) { ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr, - index, page, mtr); + index, page, page_zip, mtr); } ut_a(ptr == rec_end); - page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); - page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); - page_header_set_field(page, PAGE_N_DIRECTION, 0); + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, page_zip, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0); return(rec_end); } @@ -1089,7 +1154,6 @@ void page_copy_rec_list_end_to_created_page( /*===================================*/ page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ rec_t* rec, /* in: first record to copy */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ @@ -1105,22 +1169,21 @@ page_copy_rec_list_end_to_created_page( ulint log_mode; byte* log_ptr; ulint log_data_len; - ulint comp = page_is_comp(page); mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; *offsets_ = (sizeof offsets_) / sizeof *offsets_; ut_ad(page_dir_get_n_heap(new_page) == 2); - ut_ad(page != new_page); - ut_ad(comp == page_is_comp(new_page)); + ut_ad(ut_align_down(rec, UNIV_PAGE_SIZE) != new_page); + ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page)); - if (rec == page_get_infimum_rec(page)) { + if (page_rec_is_infimum(rec)) { rec = page_rec_get_next(rec); } - if (rec == page_get_supremum_rec(page)) { + if (page_rec_is_supremum(rec)) { return; } @@ -1128,8 +1191,8 @@ page_copy_rec_list_end_to_created_page( #ifdef UNIV_DEBUG /* To pass the debug tests we have to set these dummy values in the debug version */ - page_dir_set_n_slots(new_page, UNIV_PAGE_SIZE / 2); - page_header_set_ptr(new_page, PAGE_HEAP_TOP, + page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, new_page + UNIV_PAGE_SIZE - 1); #endif @@ -1143,7 +1206,7 @@ page_copy_rec_list_end_to_created_page( log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS); prev_rec = page_get_infimum_rec(new_page); - if (comp) { + if (page_is_comp(new_page)) { heap_top = new_page + PAGE_NEW_SUPREMUM_END; } else { heap_top = new_page + PAGE_OLD_SUPREMUM_END; @@ -1152,43 +1215,52 @@ page_copy_rec_list_end_to_created_page( slot_index = 0; n_recs = 0; - /* should be do ... 
until, comment by Jani */ - while (rec != page_get_supremum_rec(page)) { + do { offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); insert_rec = rec_copy(heap_top, rec, offsets); - rec_set_next_offs(prev_rec, comp, insert_rec - new_page); + if (page_is_comp(new_page)) { + rec_set_next_offs_new(prev_rec, NULL, + ut_align_offset(insert_rec, UNIV_PAGE_SIZE)); - rec_set_n_owned(insert_rec, comp, 0); - rec_set_heap_no(insert_rec, comp, 2 + n_recs); + rec_set_n_owned_new(insert_rec, NULL, 0); + rec_set_heap_no_new(insert_rec, NULL, 2 + n_recs); + } else { + rec_set_next_offs_old(prev_rec, + ut_align_offset(insert_rec, UNIV_PAGE_SIZE)); - rec_size = rec_offs_size(offsets); - - heap_top = heap_top + rec_size; - - ut_ad(heap_top < new_page + UNIV_PAGE_SIZE); + rec_set_n_owned_old(insert_rec, 0); + rec_set_heap_no_old(insert_rec, 2 + n_recs); + } count++; n_recs++; - if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2) { + if (UNIV_UNLIKELY(count == + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) { slot_index++; slot = page_dir_get_nth_slot(new_page, slot_index); page_dir_slot_set_rec(slot, insert_rec); - page_dir_slot_set_n_owned(slot, count); + page_dir_slot_set_n_owned(slot, NULL, count); count = 0; } - + + rec_size = rec_offs_size(offsets); + + ut_ad(heap_top < new_page + UNIV_PAGE_SIZE); + + heap_top += rec_size; + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); prev_rec = insert_rec; rec = page_rec_get_next(rec); - } + } while (!page_rec_is_supremum(rec)); if ((slot_index > 0) && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 @@ -1202,7 +1274,7 @@ page_copy_rec_list_end_to_created_page( count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2; - page_dir_slot_set_n_owned(slot, 0); + page_dir_slot_set_n_owned(slot, NULL, 0); slot_index--; } @@ -1216,23 +1288,27 @@ page_copy_rec_list_end_to_created_page( ut_a(log_data_len < 100 * UNIV_PAGE_SIZE); mach_write_to_4(log_ptr, log_data_len); - - rec_set_next_offs(insert_rec, comp, - comp ? 
PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM); + + if (page_is_comp(new_page)) { + rec_set_next_offs_new(insert_rec, NULL, PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM); + } slot = page_dir_get_nth_slot(new_page, 1 + slot_index); page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page)); - page_dir_slot_set_n_owned(slot, count + 1); + page_dir_slot_set_n_owned(slot, NULL, count + 1); - page_dir_set_n_slots(new_page, 2 + slot_index); - page_header_set_ptr(new_page, PAGE_HEAP_TOP, heap_top); - page_dir_set_n_heap(new_page, 2 + n_recs); - page_header_set_field(new_page, PAGE_N_RECS, n_recs); + page_dir_set_n_slots(new_page, NULL, 2 + slot_index); + page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top); + page_dir_set_n_heap(new_page, NULL, 2 + n_recs); + page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs); - page_header_set_ptr(new_page, PAGE_LAST_INSERT, NULL); - page_header_set_field(new_page, PAGE_DIRECTION, PAGE_NO_DIRECTION); - page_header_set_field(new_page, PAGE_N_DIRECTION, 0); + page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(new_page, NULL, PAGE_DIRECTION, + PAGE_NO_DIRECTION); + page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0); /* Restore the log mode */ @@ -1251,7 +1327,7 @@ page_cur_delete_rec_write_log( { byte* log_ptr; - ut_ad(!!page_rec_is_comp(rec) == index->table->comp); + ut_ad((ibool) !!page_rec_is_comp(rec) == index->table->comp); log_ptr = mlog_open_and_write_index(mtr, rec, index, page_rec_is_comp(rec) @@ -1280,7 +1356,9 @@ page_cur_parse_delete_rec( byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ dict_index_t* index, /* in: record descriptor */ - page_t* page, /* in: page or NULL */ + page_t* page, /* in/out: page or NULL */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 32 bytes available, or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { ulint offset; @@ -1304,10 +1382,11 @@ page_cur_parse_delete_rec( *offsets_ = (sizeof offsets_) / sizeof *offsets_; page_cur_position(rec, &cursor); + ut_ad(!page_zip || page_is_comp(page)); page_cur_delete_rec(&cursor, index, rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap), mtr); + ULINT_UNDEFINED, &heap), page_zip, mtr); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -1323,9 +1402,11 @@ record after the deleted one. */ void page_cur_delete_rec( /*================*/ - page_cur_t* cursor, /* in: a page cursor */ + page_cur_t* cursor, /* in/out: a page cursor */ dict_index_t* index, /* in: record descriptor */ const ulint* offsets,/* in: rec_get_offsets(cursor->rec, index) */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 32 bytes available, or NULL */ mtr_t* mtr) /* in: mini-transaction handle */ { page_dir_slot_t* cur_dir_slot; @@ -1343,12 +1424,12 @@ page_cur_delete_rec( page = page_cur_get_page(cursor); current_rec = cursor->rec; ut_ad(rec_offs_validate(current_rec, index, offsets)); - ut_ad(!!page_is_comp(page) == index->table->comp); + ut_ad((ibool) !!page_is_comp(page) == index->table->comp); + ut_ad(!page_zip || page_zip_available(page_zip, 32)); /* The record must not be the supremum or infimum record. 
*/ - ut_ad(current_rec != page_get_supremum_rec(page)); - ut_ad(current_rec != page_get_infimum_rec(page)); - + ut_ad(page_rec_is_user_rec(current_rec)); + /* Save to local variables some data associated with current_rec */ cur_slot_no = page_dir_find_owner_slot(current_rec); cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no); @@ -1360,7 +1441,7 @@ page_cur_delete_rec( /* 1. Reset the last insert info in the page header and increment the modify clock for the frame */ - page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); + page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL); /* The page gets invalid for optimistic searches: increment the frame modify clock */ @@ -1388,8 +1469,8 @@ page_cur_delete_rec( /* 3. Remove the record from the linked list of records */ - page_rec_set_next(prev_rec, next_rec); - page_header_set_field(page, PAGE_N_RECS, + page_rec_set_next(prev_rec, next_rec, page_zip); + page_header_set_field(page, page_zip, PAGE_N_RECS, (ulint)(page_get_n_recs(page) - 1)); /* 4. If the deleted record is pointed to by a dir slot, update the @@ -1406,16 +1487,16 @@ page_cur_delete_rec( /* 5. Update the number of owned records of the slot */ - page_dir_slot_set_n_owned(cur_dir_slot, cur_n_owned - 1); + page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1); /* 6. Free the memory occupied by the record */ - page_mem_free(page, current_rec, offsets); + page_mem_free(page, page_zip, current_rec, offsets); /* 7. Now we have decremented the number of owned records of the slot. If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the slots. */ - if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) { - page_dir_balance_slot(page, cur_slot_no); + if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) { + page_dir_balance_slot(page, page_zip, cur_slot_no); } } diff --git a/page/page0page.c b/page/page0page.c index 7e09cdf073e..7b461126626 100644 --- a/page/page0page.c +++ b/page/page0page.c @@ -14,6 +14,7 @@ Created 2/2/1994 Heikki Tuuri #undef THIS_MODULE #include "page0cur.h" +#include "page0zip.h" #include "lock0lock.h" #include "fut0lst.h" #include "btr0sea.h" @@ -63,6 +64,18 @@ Assuming a page size of 8 kB, a typical index page of a secondary index contains 300 index entries, and the size of the page directory is 50 x 4 bytes = 200 bytes. */ +/***************************************************************** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. +This function is not to be used with compressed pages. */ +static +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /* in: record on page */ + dict_index_t* index, /* in: record descriptor */ + mtr_t* mtr); /* in: mtr */ + /******************************************************************* Looks for the directory slot which owns the given record. 
*/ @@ -85,14 +98,14 @@ page_dir_find_owner_slot( slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1); if (page_is_comp(page)) { - while (rec_get_n_owned(r, TRUE) == 0) { - r = page + rec_get_next_offs(r, TRUE); + while (rec_get_n_owned_new(r) == 0) { + r = rec_get_next_ptr(r, TRUE); ut_ad(r >= page + PAGE_NEW_SUPREMUM); ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); } } else { - while (rec_get_n_owned(r, FALSE) == 0) { - r = page + rec_get_next_offs(r, FALSE); + while (rec_get_n_owned_old(r) == 0) { + r = rec_get_next_ptr(r, FALSE); ut_ad(r >= page + PAGE_OLD_SUPREMUM); ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR)); } @@ -162,8 +175,11 @@ page_dir_slot_check( ut_a(page_rec_check(page_dir_slot_get_rec(slot))); - n_owned = rec_get_n_owned(page_dir_slot_get_rec(slot), - page_is_comp(page)); + if (page_is_comp(page)) { + n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot)); + } else { + n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot)); + } if (slot == page_dir_get_nth_slot(page, 0)) { ut_a(n_owned == 1); @@ -184,7 +200,7 @@ Sets the max trx id field value. */ void page_set_max_trx_id( /*================*/ - page_t* page, /* in: page */ + page_t* page, /* in/out: page */ dulint trx_id) /* in: transaction id */ { buf_block_t* block; @@ -201,7 +217,7 @@ page_set_max_trx_id( during a database recovery we assume that the max trx id of every page is the maximum trx id assigned before the crash. */ - mach_write_to_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID, trx_id); + mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id); if (block->is_hashed) { rw_lock_x_unlock(&btr_search_latch); @@ -216,7 +232,8 @@ page_mem_alloc( /*===========*/ /* out: pointer to start of allocated buffer, or NULL if allocation fails */ - page_t* page, /* in: index page */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ ulint need, /* in: number of bytes needed */ dict_index_t* index, /* in: record descriptor */ ulint* heap_no)/* out: this contains the heap number @@ -229,6 +246,7 @@ page_mem_alloc( ulint garbage; ut_ad(page && heap_no); + ut_ad(!page_zip || page_zip_validate(page_zip, page)); /* If there are records in the free list, look if the first is big enough */ @@ -245,16 +263,20 @@ page_mem_alloc( ULINT_UNDEFINED, &heap); if (rec_offs_size(offsets) >= need) { - page_header_set_ptr(page, PAGE_FREE, + page_header_set_ptr(page, page_zip, PAGE_FREE, page_rec_get_next(rec)); garbage = page_header_get_field(page, PAGE_GARBAGE); ut_ad(garbage >= need); - page_header_set_field(page, PAGE_GARBAGE, + page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need); - *heap_no = rec_get_heap_no(rec, page_is_comp(page)); + if (page_is_comp(page)) { + *heap_no = rec_get_heap_no_new(rec); + } else { + *heap_no = rec_get_heap_no_old(rec); + } block = rec_get_start(rec, offsets); if (UNIV_LIKELY_NULL(heap)) { @@ -275,10 +297,11 @@ page_mem_alloc( if (avl_space >= need) { block = page_header_get_ptr(page, PAGE_HEAP_TOP); - page_header_set_ptr(page, PAGE_HEAP_TOP, block + need); + page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP, + block + need); *heap_no = page_dir_get_n_heap(page); - page_dir_set_n_heap(page, 1 + *heap_no); + page_dir_set_n_heap(page, page_zip, 1 + *heap_no); return(block); } @@ -319,7 +342,8 @@ page_parse_create( /* The record is empty, except for the record initial part */ if (page) { - page_create(page, mtr, comp); + page_create(page, buf_block_get_page_zip( + buf_block_align(page)), mtr, comp); } return(ptr); @@ 
-331,11 +355,12 @@ The index page creation function. */ page_t* page_create( /*========*/ - /* out: pointer to the page */ - buf_frame_t* frame, /* in: a buffer frame where the page is - created */ - mtr_t* mtr, /* in: mini-transaction handle */ - ulint comp) /* in: nonzero=compact page format */ + /* out: pointer to the page */ + buf_frame_t* frame, /* in/out: a buffer frame where the + page is created */ + page_zip_des_t* page_zip, /* in/out: compressed page, or NULL */ + mtr_t* mtr, /* in: mini-transaction handle */ + ulint comp) /* in: nonzero=compact page format */ { page_dir_slot_t* slot; mem_heap_t* heap; @@ -348,7 +373,12 @@ page_create( dict_index_t* index; ulint* offsets; - index = comp ? srv_sys->dummy_ind2 : srv_sys->dummy_ind1; + if (UNIV_LIKELY(comp)) { + index = srv_sys->dummy_ind2; + } else { + index = srv_sys->dummy_ind1; + ut_ad(!page_zip); + } ut_ad(frame && mtr); ut_ad(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE @@ -385,11 +415,18 @@ page_create( infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); - ut_a(infimum_rec == - page + (comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + if (UNIV_LIKELY(comp)) { + ut_a(infimum_rec == page + PAGE_NEW_INFIMUM); + + rec_set_n_owned_new(infimum_rec, NULL, 1); + rec_set_heap_no_new(infimum_rec, NULL, 0); + } else { + ut_a(infimum_rec == page + PAGE_OLD_INFIMUM); + + rec_set_n_owned_old(infimum_rec, 1); + rec_set_heap_no_old(infimum_rec, 0); + } - rec_set_n_owned(infimum_rec, comp, 1); - rec_set_heap_no(infimum_rec, comp, 0); offsets = rec_get_offsets(infimum_rec, index, NULL, ULINT_UNDEFINED, &heap); @@ -407,11 +444,17 @@ page_create( supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple); - ut_a(supremum_rec == - page + (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)); + if (UNIV_LIKELY(comp)) { + ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM); - rec_set_n_owned(supremum_rec, comp, 1); - rec_set_heap_no(supremum_rec, comp, 1); + rec_set_n_owned_new(supremum_rec, NULL, 1); + rec_set_heap_no_new(supremum_rec, NULL, 1); + } else { + ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM); + + rec_set_n_owned_old(supremum_rec, 1); + rec_set_heap_no_old(supremum_rec, 1); + } offsets = rec_get_offsets(supremum_rec, index, offsets, ULINT_UNDEFINED, &heap); @@ -424,15 +467,15 @@ page_create( /* 4. INITIALIZE THE PAGE */ - page_header_set_field(page, PAGE_N_DIR_SLOTS, 2); - page_header_set_ptr(page, PAGE_HEAP_TOP, heap_top); - page_header_set_field(page, PAGE_N_HEAP, comp ? 0x8002 : 2); - page_header_set_ptr(page, PAGE_FREE, NULL); - page_header_set_field(page, PAGE_GARBAGE, 0); - page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); - page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION); - page_header_set_field(page, PAGE_N_DIRECTION, 0); - page_header_set_field(page, PAGE_N_RECS, 0); + page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2); + page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top); + page_header_set_field(page, NULL, PAGE_N_HEAP, comp ? 
0x8002 : 2); + page_header_set_ptr(page, NULL, PAGE_FREE, NULL); + page_header_set_field(page, NULL, PAGE_GARBAGE, 0); + page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL); + page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION); + page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0); + page_header_set_field(page, NULL, PAGE_N_RECS, 0); page_set_max_trx_id(page, ut_dulint_zero); memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START - (heap_top - page)); @@ -448,22 +491,36 @@ page_create( page_dir_slot_set_rec(slot, supremum_rec); /* Set the next pointers in infimum and supremum */ - - rec_set_next_offs(infimum_rec, comp, (ulint)(supremum_rec - page)); - rec_set_next_offs(supremum_rec, comp, 0); + + if (UNIV_LIKELY(comp)) { + rec_set_next_offs_new(infimum_rec, NULL, + (ulint)(supremum_rec - page)); + rec_set_next_offs_new(supremum_rec, NULL, 0); + } else { + rec_set_next_offs_old(infimum_rec, + (ulint)(supremum_rec - page)); + rec_set_next_offs_old(supremum_rec, 0); + } + + if (UNIV_LIKELY_NULL(page_zip)) { + if (!page_zip_compress(page_zip, page)) { + /* The compression of a newly created page + should always succeed. */ + ut_error; + } + } return(page); } /***************************************************************** Differs from page_copy_rec_list_end, because this function does not -touch the lock table and max trx id on page. */ +touch the lock table and max trx id on page or compress the page. */ void page_copy_rec_list_end_no_locks( /*============================*/ page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ @@ -484,7 +541,7 @@ page_copy_rec_list_end_no_locks( } ut_a((ibool)!!page_is_comp(new_page) == index->table->comp); - ut_a(page_is_comp(new_page) == page_is_comp(page)); + ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint) (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); @@ -493,7 +550,7 @@ page_copy_rec_list_end_no_locks( /* Copy records from the original page to the new page */ - sup = page_get_supremum_rec(page); + sup = page_get_supremum_rec(ut_align_down(rec, UNIV_PAGE_SIZE)); for (;;) { rec_t* cur1_rec = page_cur_get_rec(&cur1); @@ -502,20 +559,22 @@ page_copy_rec_list_end_no_locks( } offsets = rec_get_offsets(cur1_rec, index, offsets, ULINT_UNDEFINED, &heap); - if (UNIV_UNLIKELY(!page_cur_rec_insert(&cur2, cur1_rec, index, - offsets, mtr))) { + if (UNIV_UNLIKELY(!page_cur_rec_insert(&cur2, NULL, + cur1_rec, index, offsets, mtr))) { /* Track an assertion failure reported on the mailing list on June 18th, 2003 */ buf_page_print(new_page); - buf_page_print(page); + buf_page_print(ut_align_down(rec, UNIV_PAGE_SIZE)); ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: rec offset %lu, cur1 offset %lu, cur2 offset %lu\n", - (ulong)(rec - page), - (ulong)(page_cur_get_rec(&cur1) - page), - (ulong)(page_cur_get_rec(&cur2) - new_page)); + (ulong)ut_align_offset(rec, UNIV_PAGE_SIZE), + (ulong)ut_align_offset(page_cur_get_rec(&cur1), + UNIV_PAGE_SIZE), + (ulong)ut_align_offset(page_cur_get_rec(&cur2), + UNIV_PAGE_SIZE)); ut_error; } @@ -533,30 +592,50 @@ Copies records from page to new_page, from a given record onward, including that record. Infimum and supremum records are not copied. The records are copied to the start of the record list on new_page. 
*/ -void +ibool page_copy_rec_list_end( /*===================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ + /* out: TRUE on success */ + page_t* new_page, /* in/out: index page to copy to */ + page_zip_des_t* new_page_zip, /* in/out: compressed page, or NULL */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { + page_t* page; + + ut_ad(!new_page_zip || page_zip_validate(new_page_zip, new_page)); + if (page_dir_get_n_heap(new_page) == 2) { - page_copy_rec_list_end_to_created_page(new_page, page, rec, - index, mtr); + page_copy_rec_list_end_to_created_page( + new_page, rec, index, mtr); } else { - page_copy_rec_list_end_no_locks(new_page, page, rec, - index, mtr); + page_copy_rec_list_end_no_locks(new_page, rec, index, mtr); } + page = ut_align_down(rec, UNIV_PAGE_SIZE); + /* Update the lock table, MAX_TRX_ID, and possible hash index */ lock_move_rec_list_end(new_page, page, rec); page_update_max_trx_id(new_page, page_get_max_trx_id(page)); + if (UNIV_LIKELY_NULL(new_page_zip)) { + if (UNIV_UNLIKELY(!page_zip_compress(new_page_zip, + new_page))) { + + if (UNIV_UNLIKELY(!page_zip_decompress( + new_page_zip, new_page, mtr))) { + ut_error; + } + return(FALSE); + } + } + btr_search_move_or_delete_hash_entries(new_page, page, index); + + return(TRUE); } /***************************************************************** @@ -564,30 +643,33 @@ Copies records from page to new_page, up to the given record, NOT including that record. Infimum and supremum records are not copied. The records are copied to the end of the record list on new_page. */ -void +ibool page_copy_rec_list_start( /*=====================*/ - page_t* new_page, /* in: index page to copy to */ - page_t* page, /* in: index page */ + /* out: TRUE on success */ + page_t* new_page, /* in/out: index page to copy to */ + page_zip_des_t* new_page_zip, /* in/out: compressed page, or NULL */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { page_cur_t cur1; page_cur_t cur2; + page_t* page; rec_t* old_end; mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; *offsets_ = (sizeof offsets_) / sizeof *offsets_; - page_cur_set_before_first(page, &cur1); + if (page_rec_is_infimum(rec)) { - if (rec == page_cur_get_rec(&cur1)) { - - return; + return(TRUE); } + page = ut_align_down(rec, UNIV_PAGE_SIZE); + + page_cur_set_before_first(page, &cur1); page_cur_move_to_next(&cur1); page_cur_set_after_last(new_page, &cur2); @@ -601,25 +683,40 @@ page_copy_rec_list_start( rec_t* cur1_rec = page_cur_get_rec(&cur1); offsets = rec_get_offsets(cur1_rec, index, offsets, ULINT_UNDEFINED, &heap); - ins_rec = page_cur_rec_insert(&cur2, cur1_rec, index, - offsets, mtr); + ins_rec = page_cur_rec_insert(&cur2, NULL, + cur1_rec, index, offsets, mtr); ut_a(ins_rec); page_cur_move_to_next(&cur1); page_cur_move_to_next(&cur2); } - /* Update the lock table, MAX_TRX_ID, and possible hash index */ - - lock_move_rec_list_start(new_page, page, rec, old_end); - - page_update_max_trx_id(new_page, page_get_max_trx_id(page)); - - btr_search_move_or_delete_hash_entries(new_page, page, index); - if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } + + /* Update MAX_TRX_ID, the lock table, and possible hash index */ + + page_update_max_trx_id(new_page, page_get_max_trx_id(page)); + + if (UNIV_LIKELY_NULL(new_page_zip)) { + if (UNIV_UNLIKELY(!page_zip_compress(new_page_zip, + 
new_page))) { + + if (UNIV_UNLIKELY(!page_zip_decompress( + new_page_zip, new_page, mtr))) { + ut_error; + } + /* TODO: try btr_page_reorganize() */ + return(FALSE); + } + } + + lock_move_rec_list_start(new_page, page, rec, old_end); + + btr_search_move_or_delete_hash_entries(new_page, page, index); + + return(TRUE); } /************************************************************** @@ -665,7 +762,7 @@ page_parse_delete_rec_list( page_t* page, /* in: page or NULL */ mtr_t* mtr) /* in: mtr or NULL */ { - ulint offset; + ulint offset; ut_ad(type == MLOG_LIST_END_DELETE || type == MLOG_LIST_START_DELETE @@ -687,14 +784,17 @@ page_parse_delete_rec_list( return(ptr); } - ut_ad(!!page_is_comp(page) == index->table->comp); + ut_ad((ibool) !!page_is_comp(page) == index->table->comp); if (type == MLOG_LIST_END_DELETE || type == MLOG_COMP_LIST_END_DELETE) { - page_delete_rec_list_end(page, page + offset, index, - ULINT_UNDEFINED, ULINT_UNDEFINED, mtr); + page_delete_rec_list_end(page + offset, index, + ULINT_UNDEFINED, ULINT_UNDEFINED, + buf_block_get_page_zip( + buf_block_align(page)), mtr); } else { - page_delete_rec_list_start(page, page + offset, index, mtr); + ut_ad(!buf_block_get_page_zip(buf_block_align(page))); + page_delete_rec_list_start(page + offset, index, mtr); } return(ptr); @@ -707,56 +807,63 @@ The infimum and supremum records are not deleted. */ void page_delete_rec_list_end( /*=====================*/ - page_t* page, /* in: index page */ - rec_t* rec, /* in: record on page */ + rec_t* rec, /* in: pointer to record on page */ dict_index_t* index, /* in: record descriptor */ ulint n_recs, /* in: number of records to delete, or ULINT_UNDEFINED if not known */ ulint size, /* in: the sum of the sizes of the records in the end of the chain to delete, or ULINT_UNDEFINED if not known */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ mtr_t* mtr) /* in: mtr */ { page_dir_slot_t* slot; ulint slot_index; rec_t* last_rec; rec_t* prev_rec; - rec_t* free; rec_t* rec2; ulint count; ulint n_owned; - rec_t* sup; - ulint comp; + page_t* page; + page_zip_des_t* page_zip_temp; + ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); + ut_ad(!page_zip || page_rec_is_comp(rec)); + + if (page_rec_is_infimum(rec)) { + rec = page_rec_get_next(rec); + } + + if (page_rec_is_supremum(rec)) { + + return; + } + + if (page_zip && !page_zip_available(page_zip, 17)) { + /* Try compressing the page afterwards. */ + page_zip_temp = NULL; + } else { + page_zip_temp = page_zip; + } + /* Reset the last insert info in the page header and increment the modify clock for the frame */ - ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE); - page_header_set_ptr(page, PAGE_LAST_INSERT, NULL); + page = ut_align_down(rec, UNIV_PAGE_SIZE); + page_header_set_ptr(page, page_zip_temp, PAGE_LAST_INSERT, NULL); /* The page gets invalid for optimistic searches: increment the frame modify clock */ buf_frame_modify_clock_inc(page); - sup = page_get_supremum_rec(page); - - comp = page_is_comp(page); - if (page_rec_is_infimum_low(rec - page)) { - rec = page_rec_get_next(rec); - } + page_delete_rec_list_write_log(rec, index, page_is_comp(page) + ? MLOG_COMP_LIST_END_DELETE + : MLOG_LIST_END_DELETE, mtr); - page_delete_rec_list_write_log(rec, index, - comp ? 
MLOG_COMP_LIST_END_DELETE : MLOG_LIST_END_DELETE, mtr); - - if (rec == sup) { - - return; - } - prev_rec = page_rec_get_prev(rec); - last_rec = page_rec_get_prev(sup); + last_rec = page_rec_get_prev(page_get_supremum_rec(page)); if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) { mem_heap_t* heap = NULL; @@ -768,7 +875,7 @@ page_delete_rec_list_end( n_recs = 0; rec2 = rec; - while (rec2 != sup) { + do { ulint s; offsets = rec_get_offsets(rec2, index, offsets, ULINT_UNDEFINED, &heap); @@ -780,7 +887,7 @@ page_delete_rec_list_end( n_recs++; rec2 = page_rec_get_next(rec2); - } + } while (!page_rec_is_supremum(rec2)); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); @@ -795,50 +902,68 @@ page_delete_rec_list_end( rec2 = rec; count = 0; - - while (rec_get_n_owned(rec2, comp) == 0) { - count++; - rec2 = page_rec_get_next(rec2); + if (page_is_comp(page)) { + while (rec_get_n_owned_new(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, TRUE); + } + + ut_ad(rec_get_n_owned_new(rec2) > count); + + n_owned = rec_get_n_owned_new(rec2) - count; + } else { + while (rec_get_n_owned_old(rec2) == 0) { + count++; + + rec2 = rec_get_next_ptr(rec2, FALSE); + } + + ut_ad(rec_get_n_owned_old(rec2) > count); + + n_owned = rec_get_n_owned_old(rec2) - count; } - ut_ad(rec_get_n_owned(rec2, comp) - count > 0); - - n_owned = rec_get_n_owned(rec2, comp) - count; - slot_index = page_dir_find_owner_slot(rec2); slot = page_dir_get_nth_slot(page, slot_index); - page_dir_slot_set_rec(slot, sup); - page_dir_slot_set_n_owned(slot, n_owned); + page_dir_slot_set_rec(slot, page_get_supremum_rec(page)); + page_dir_slot_set_n_owned(slot, page_zip_temp, n_owned); - page_dir_set_n_slots(page, slot_index + 1); + page_dir_set_n_slots(page, page_zip_temp, slot_index + 1); /* Remove the record chain segment from the record chain */ - page_rec_set_next(prev_rec, page_get_supremum_rec(page)); + page_rec_set_next(prev_rec, page_get_supremum_rec(page), + page_zip_temp); /* Catenate the deleted chain segment to the page free list */ - free = page_header_get_ptr(page, PAGE_FREE); + page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE), + page_zip_temp); + page_header_set_ptr(page, page_zip_temp, PAGE_FREE, rec); - page_rec_set_next(last_rec, free); - page_header_set_ptr(page, PAGE_FREE, rec); - - page_header_set_field(page, PAGE_GARBAGE, + page_header_set_field(page, page_zip_temp, PAGE_GARBAGE, size + page_header_get_field(page, PAGE_GARBAGE)); - page_header_set_field(page, PAGE_N_RECS, + page_header_set_field(page, page_zip_temp, PAGE_N_RECS, (ulint)(page_get_n_recs(page) - n_recs)); + + if (UNIV_LIKELY_NULL(page_zip) && UNIV_UNLIKELY(!page_zip_temp)) { + /* Reorganize and compress the page. */ + /* TODO: coverage test. Is this allowed? */ + btr_page_reorganize(page, index, mtr); + } } /***************************************************************** Deletes records from page, up to the given record, NOT including -that record. Infimum and supremum records are not deleted. */ - +that record. Infimum and supremum records are not deleted. +This function is not to be used with compressed pages. 
*/ +static void page_delete_rec_list_start( /*=======================*/ - page_t* page, /* in: index page */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ @@ -849,11 +974,17 @@ page_delete_rec_list_start( ulint* offsets = offsets_; mem_heap_t* heap = NULL; byte type; + *offsets_ = (sizeof offsets_) / sizeof *offsets_; - ut_ad(!!page_is_comp(page) == index->table->comp); + ut_ad((ibool) !!page_rec_is_comp(rec) == index->table->comp); - if (page_is_comp(page)) { + if (page_rec_is_infimum(rec)) { + + return; + } + + if (page_rec_is_comp(rec)) { type = MLOG_COMP_LIST_START_DELETE; } else { type = MLOG_LIST_START_DELETE; @@ -861,13 +992,7 @@ page_delete_rec_list_start( page_delete_rec_list_write_log(rec, index, type, mtr); - page_cur_set_before_first(page, &cur1); - - if (rec == page_cur_get_rec(&cur1)) { - - return; - } - + page_cur_set_before_first(ut_align_down(rec, UNIV_PAGE_SIZE), &cur1); page_cur_move_to_next(&cur1); /* Individual deletes are not logged */ @@ -877,7 +1002,7 @@ page_delete_rec_list_start( while (page_cur_get_rec(&cur1) != rec) { offsets = rec_get_offsets(page_cur_get_rec(&cur1), index, offsets, ULINT_UNDEFINED, &heap); - page_cur_delete_rec(&cur1, index, offsets, mtr); + page_cur_delete_rec(&cur1, index, offsets, NULL, mtr); } if (UNIV_LIKELY_NULL(heap)) { @@ -897,8 +1022,11 @@ void page_move_rec_list_end( /*===================*/ page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ + page_zip_des_t* new_page_zip, /* in/out: compressed page of + new_page, or NULL */ rec_t* split_rec, /* in: first record to move */ + page_zip_des_t* page_zip, /* in/out: compressed page of + split_rec, or NULL */ dict_index_t* index, /* in: record descriptor */ mtr_t* mtr) /* in: mtr */ { @@ -910,33 +1038,22 @@ page_move_rec_list_end( old_data_size = page_get_data_size(new_page); old_n_recs = page_get_n_recs(new_page); - page_copy_rec_list_end(new_page, page, split_rec, index, mtr); + if (!page_copy_rec_list_end(new_page, new_page_zip, + split_rec, index, mtr)) { + /* This should always succeed, as new_page + is created from the scratch and receives a contiguous + part of the records from split_rec onwards */ + ut_error; + } new_data_size = page_get_data_size(new_page); new_n_recs = page_get_n_recs(new_page); ut_ad(new_data_size >= old_data_size); - page_delete_rec_list_end(page, split_rec, index, - new_n_recs - old_n_recs, new_data_size - old_data_size, mtr); -} - -/***************************************************************** -Moves record list start to another page. Moved records do not include -split_rec. */ - -void -page_move_rec_list_start( -/*=====================*/ - page_t* new_page, /* in: index page where to move */ - page_t* page, /* in: index page */ - rec_t* split_rec, /* in: first record not to move */ - dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr) /* in: mtr */ -{ - page_copy_rec_list_start(new_page, page, split_rec, index, mtr); - - page_delete_rec_list_start(page, split_rec, index, mtr); + page_delete_rec_list_end(split_rec, index, + new_n_recs - old_n_recs, new_data_size - old_data_size, + page_zip, mtr); } /*************************************************************************** @@ -967,50 +1084,46 @@ also n_owned fields in the records, so that the first slot after the deleted ones inherits the records of the deleted slots. 
*/ UNIV_INLINE void -page_dir_delete_slots( -/*==================*/ - page_t* page, /* in: the index page */ - ulint start, /* in: first slot to be deleted */ - ulint n) /* in: number of slots to delete (currently - only n == 1 allowed) */ +page_dir_delete_slot( +/*=================*/ + page_t* page, /* in/out: the index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 10 bytes available, or NULL */ + ulint slot_no)/* in: slot to be deleted */ { page_dir_slot_t* slot; + ulint n_owned; ulint i; - ulint sum_owned = 0; ulint n_slots; - rec_t* rec; - ut_ad(n == 1); - ut_ad(start > 0); - ut_ad(start + n < page_dir_get_n_slots(page)); + ut_ad(!page_zip || page_zip_available(page_zip, 10)); + ut_ad(!page_zip || page_is_comp(page)); + ut_ad(slot_no > 0); + ut_ad(slot_no < page_dir_get_n_slots(page)); n_slots = page_dir_get_n_slots(page); /* 1. Reset the n_owned fields of the slots to be deleted */ - for (i = start; i < start + n; i++) { - slot = page_dir_get_nth_slot(page, i); - sum_owned += page_dir_slot_get_n_owned(slot); - page_dir_slot_set_n_owned(slot, 0); - } + slot = page_dir_get_nth_slot(page, slot_no); + n_owned = page_dir_slot_get_n_owned(slot); + page_dir_slot_set_n_owned(slot, page_zip, 0); /* 2. Update the n_owned value of the first non-deleted slot */ - slot = page_dir_get_nth_slot(page, start + n); - page_dir_slot_set_n_owned(slot, - sum_owned + page_dir_slot_get_n_owned(slot)); + slot = page_dir_get_nth_slot(page, slot_no + 1); + page_dir_slot_set_n_owned(slot, page_zip, n_owned + + page_dir_slot_get_n_owned(slot)); - /* 3. Destroy start and other slots by copying slots */ - for (i = start + n; i < n_slots; i++) { - slot = page_dir_get_nth_slot(page, i); - rec = page_dir_slot_get_rec(slot); - - slot = page_dir_get_nth_slot(page, i - n); - page_dir_slot_set_rec(slot, rec); + /* 3. Destroy the slot by copying slots */ + for (i = slot_no + 1; i < n_slots; i++) { + rec_t* rec; + rec = page_dir_slot_get_rec(page_dir_get_nth_slot(page, i)); + page_dir_slot_set_rec(page_dir_get_nth_slot(page, i), rec); } /* 4. 
Update the page header */ - page_header_set_field(page, PAGE_N_DIR_SLOTS, n_slots - n); + page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1); } /****************************************************************** @@ -1021,10 +1134,12 @@ UNIV_INLINE void page_dir_add_slots( /*===============*/ - page_t* page, /* in: the index page */ - ulint start, /* in: the slot above which the new slots are added */ - ulint n) /* in: number of slots to add (currently only n == 1 - allowed) */ + page_t* page, /* in/out: the index page */ + page_zip_des_t* page_zip,/* in/out: comprssed page, or NULL */ + ulint start, /* in: the slot above which the new slots + are added */ + ulint n) /* in: number of slots to add + (currently only n == 1 allowed) */ { page_dir_slot_t* slot; ulint n_slots; @@ -1038,7 +1153,7 @@ page_dir_add_slots( ut_ad(start < n_slots - 1); /* Update the page header */ - page_dir_set_n_slots(page, n_slots + n); + page_dir_set_n_slots(page, page_zip, n_slots + n); /* Move slots up */ @@ -1050,6 +1165,13 @@ page_dir_add_slots( slot = page_dir_get_nth_slot(page, i + n); page_dir_slot_set_rec(slot, rec); } + + if (UNIV_LIKELY_NULL(page_zip)) { + /* TODO: test this */ + page_zip_write_trailer(page_zip, + page_dir_get_nth_slot(page, n_slots + n - 1), + (n_slots + n - start) * PAGE_DIR_SLOT_SIZE); + } } /******************************************************************** @@ -1058,8 +1180,10 @@ Splits a directory slot which owns too many records. */ void page_dir_split_slot( /*================*/ - page_t* page, /* in: the index page in question */ - ulint slot_no) /* in: the directory slot */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 10 bytes available, or NULL */ + ulint slot_no)/* in: the directory slot */ { rec_t* rec; page_dir_slot_t* new_slot; @@ -1069,6 +1193,8 @@ page_dir_split_slot( ulint n_owned; ut_ad(page); + ut_ad(!page_zip || page_zip_available(page_zip, 10)); + ut_ad(!page_zip || page_is_comp(page)); ut_ad(slot_no > 0); slot = page_dir_get_nth_slot(page, slot_no); @@ -1091,7 +1217,7 @@ page_dir_split_slot( /* 2. We add one directory slot immediately below the slot to be split. */ - page_dir_add_slots(page, slot_no - 1, 1); + page_dir_add_slots(page, page_zip, slot_no - 1, 1); /* The added slot is now number slot_no, and the old slot is now number slot_no + 1 */ @@ -1102,12 +1228,12 @@ page_dir_split_slot( /* 3. We store the appropriate values to the new slot. */ page_dir_slot_set_rec(new_slot, rec); - page_dir_slot_set_n_owned(new_slot, n_owned / 2); + page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2); /* 4. Finally, we update the number of records field of the original slot */ - page_dir_slot_set_n_owned(slot, n_owned - (n_owned / 2)); + page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2)); } /***************************************************************** @@ -1118,8 +1244,10 @@ the slot; this may result in the merging of two slots. 
*/ void page_dir_balance_slot( /*==================*/ - page_t* page, /* in: index page */ - ulint slot_no) /* in: the directory slot */ + page_t* page, /* in/out: index page */ + page_zip_des_t* page_zip,/* in/out: compressed page with + at least 15 bytes available, or NULL */ + ulint slot_no)/* in: the directory slot */ { page_dir_slot_t* slot; page_dir_slot_t* up_slot; @@ -1129,6 +1257,8 @@ page_dir_balance_slot( rec_t* new_rec; ut_ad(page); + ut_ad(!page_zip || page_zip_available(page_zip, 15)); + ut_ad(!page_zip || page_is_comp(page)); ut_ad(slot_no > 0); slot = page_dir_get_nth_slot(page, slot_no); @@ -1136,7 +1266,7 @@ page_dir_balance_slot( /* The last directory slot cannot be balanced with the upper neighbor, as there is none. */ - if (slot_no == page_dir_get_n_slots(page) - 1) { + if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) { return; } @@ -1157,17 +1287,25 @@ page_dir_balance_slot( /* In this case we can just transfer one record owned by the upper slot to the property of the lower slot */ old_rec = page_dir_slot_get_rec(slot); - new_rec = page_rec_get_next(old_rec); + + if (page_is_comp(page)) { + new_rec = rec_get_next_ptr(old_rec, TRUE); - rec_set_n_owned(old_rec, page_is_comp(page), 0); - rec_set_n_owned(new_rec, page_is_comp(page), n_owned + 1); + rec_set_n_owned_new(old_rec, page_zip, 0); + rec_set_n_owned_new(new_rec, page_zip, n_owned + 1); + page_dir_slot_set_rec(slot, new_rec); + } else { + new_rec = rec_get_next_ptr(old_rec, FALSE); - page_dir_slot_set_rec(slot, new_rec); - - page_dir_slot_set_n_owned(up_slot, up_n_owned -1); + rec_set_n_owned_old(old_rec, 0); + rec_set_n_owned_old(new_rec, n_owned + 1); + page_dir_slot_set_rec(slot, new_rec); + } + + page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned -1); } else { /* In this case we may merge the two slots */ - page_dir_delete_slots(page, slot_no, 1); + page_dir_delete_slot(page, page_zip, slot_no); } } @@ -1233,29 +1371,46 @@ page_rec_get_n_recs_before( rec_t* slot_rec; page_t* page; ulint i; - ulint comp; lint n = 0; ut_ad(page_rec_check(rec)); page = buf_frame_align(rec); - comp = page_is_comp(page); + if (page_is_comp(page)) { + while (rec_get_n_owned_new(rec) == 0) { - while (rec_get_n_owned(rec, comp) == 0) { + rec = rec_get_next_ptr(rec, TRUE); + n--; + } - rec = page_rec_get_next(rec); - n--; - } - - for (i = 0; ; i++) { - slot = page_dir_get_nth_slot(page, i); - slot_rec = page_dir_slot_get_rec(slot); + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); - n += rec_get_n_owned(slot_rec, comp); + n += rec_get_n_owned_new(slot_rec); - if (rec == slot_rec) { + if (rec == slot_rec) { - break; + break; + } + } + } else { + while (rec_get_n_owned_old(rec) == 0) { + + rec = rec_get_next_ptr(rec, FALSE); + n--; + } + + for (i = 0; ; i++) { + slot = page_dir_get_nth_slot(page, i); + slot_rec = page_dir_slot_get_rec(slot); + + n += rec_get_n_owned_old(slot_rec); + + if (rec == slot_rec) { + + break; + } } } @@ -1276,15 +1431,21 @@ page_rec_print( rec_t* rec, /* in: physical record */ const ulint* offsets)/* in: record descriptor */ { - ulint comp = page_is_comp(buf_frame_align(rec)); - - ut_a(!comp == !rec_offs_comp(offsets)); + ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); rec_print_new(stderr, rec, offsets); - fprintf(stderr, + if (page_rec_is_comp(rec)) { + fprintf(stderr, " n_owned: %lu; heap_no: %lu; next rec: %lu\n", - (ulong) rec_get_n_owned(rec, comp), - (ulong) rec_get_heap_no(rec, comp), - (ulong) rec_get_next_offs(rec, 
comp)); + (ulong) rec_get_n_owned_new(rec), + (ulong) rec_get_heap_no_new(rec), + (ulong) rec_get_next_offs(rec, TRUE)); + } else { + fprintf(stderr, + " n_owned: %lu; heap_no: %lu; next rec: %lu\n", + (ulong) rec_get_n_owned_old(rec), + (ulong) rec_get_heap_no_old(rec), + (ulong) rec_get_next_offs(rec, FALSE)); + } page_rec_check(rec); rec_validate(rec, offsets); @@ -1459,26 +1620,29 @@ page_rec_validate( ulint n_owned; ulint heap_no; page_t* page; - ulint comp; - page = buf_frame_align(rec); - comp = page_is_comp(page); - ut_a(!comp == !rec_offs_comp(offsets)); + page = ut_align_down(rec, UNIV_PAGE_SIZE); + ut_a(!page_is_comp(page) == !rec_offs_comp(offsets)); page_rec_check(rec); rec_validate(rec, offsets); - n_owned = rec_get_n_owned(rec, comp); - heap_no = rec_get_heap_no(rec, comp); + if (page_rec_is_comp(rec)) { + n_owned = rec_get_n_owned_new(rec); + heap_no = rec_get_heap_no_new(rec); + } else { + n_owned = rec_get_n_owned_old(rec); + heap_no = rec_get_heap_no_old(rec); + } - if (!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED)) { + if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) { fprintf(stderr, "InnoDB: Dir slot of rec %lu, n owned too big %lu\n", (ulong)(rec - page), (ulong) n_owned); return(FALSE); } - if (!(heap_no < page_dir_get_n_heap(page))) { + if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) { fprintf(stderr, "InnoDB: Heap no of rec %lu too big %lu %lu\n", (ulong)(rec - page), (ulong) heap_no, @@ -1526,10 +1690,10 @@ know the index. This is also resilient so that this should never crash even if the page is total garbage. */ ibool -page_simple_validate( -/*=================*/ +page_simple_validate_old( +/*=====================*/ /* out: TRUE if ok */ - page_t* page) /* in: index page */ + page_t* page) /* in: old-style index page */ { page_cur_t cur; page_dir_slot_t* slot; ulint slot_no; ulint n_slots; rec_t* rec; byte* rec_heap_top; ulint count; ulint own_count; ibool ret = FALSE; - ulint comp = page_is_comp(page); + + ut_a(!page_is_comp(page)); /* Check first that the record heap and the directory do not overlap. 
*/ n_slots = page_dir_get_n_slots(page); - if (n_slots > UNIV_PAGE_SIZE / 4) { + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { fprintf(stderr, "InnoDB: Nonsensical number %lu of page dir slots\n", (ulong) n_slots); @@ -1556,7 +1721,8 @@ page_simple_validate( rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); - if (rec_heap_top > page_dir_get_nth_slot(page, n_slots - 1)) { + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { fprintf(stderr, "InnoDB: Record heap and dir overlap on a page, heap top %lu, dir %lu\n", @@ -1579,7 +1745,7 @@ page_simple_validate( for (;;) { rec = (&cur)->rec; - if (rec > rec_heap_top) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { fprintf(stderr, "InnoDB: Record %lu is above rec heap top %lu\n", (ulong)(rec - page), (ulong)(rec_heap_top - page)); @@ -1587,20 +1753,21 @@ page_simple_validate( goto func_exit; } - if (rec_get_n_owned(rec, comp) != 0) { + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) { /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec, comp) != own_count) { + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) + != own_count)) { fprintf(stderr, "InnoDB: Wrong owned count %lu, %lu, rec %lu\n", - (ulong) rec_get_n_owned(rec, comp), + (ulong) rec_get_n_owned_old(rec), (ulong) own_count, (ulong)(rec - page)); goto func_exit; } - if (page_dir_slot_get_rec(slot) != rec) { + if (UNIV_UNLIKELY(page_dir_slot_get_rec(slot) != rec)) { fprintf(stderr, "InnoDB: Dir slot does not point to right rec %lu\n", (ulong)(rec - page)); @@ -1621,11 +1788,11 @@ page_simple_validate( break; } - if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA - || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { + if (UNIV_UNLIKELY(rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) { fprintf(stderr, "InnoDB: Next record offset nonsensical %lu for rec %lu\n", - (ulong) rec_get_next_offs(rec, comp), + (ulong) rec_get_next_offs(rec, FALSE), (ulong)(rec - page)); goto func_exit; @@ -1633,7 +1800,7 @@ page_simple_validate( count++; - if (count > UNIV_PAGE_SIZE) { + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { fprintf(stderr, "InnoDB: Page record list appears to be circular %lu\n", (ulong) count); @@ -1644,19 +1811,20 @@ page_simple_validate( own_count++; } - if (rec_get_n_owned(rec, comp) == 0) { + if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); goto func_exit; } - if (slot_no != n_slots - 1) { + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", (ulong) slot_no, (ulong) (n_slots - 1)); goto func_exit; } - if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) { + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + 2 + != count + 1)) { fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", (ulong) page_header_get_field(page, PAGE_N_RECS) + 2, (ulong) (count + 1)); @@ -1668,8 +1836,8 @@ page_simple_validate( rec = page_header_get_ptr(page, PAGE_FREE); while (rec != NULL) { - if (rec < page + FIL_PAGE_DATA - || rec >= page + UNIV_PAGE_SIZE) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { fprintf(stderr, "InnoDB: Free list record has a nonsensical offset %lu\n", (ulong)(rec - page)); @@ -1677,7 +1845,7 @@ page_simple_validate( goto func_exit; } - if (rec > rec_heap_top) { + if (UNIV_UNLIKELY(rec > rec_heap_top)) { fprintf(stderr, "InnoDB: Free list record %lu is above rec heap top %lu\n", (ulong)(rec - page), (ulong)(rec_heap_top - 
page)); @@ -1687,7 +1855,7 @@ page_simple_validate( count++; - if (count > UNIV_PAGE_SIZE) { + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { fprintf(stderr, "InnoDB: Page free list appears to be circular %lu\n", (ulong) count); @@ -1697,7 +1865,203 @@ page_simple_validate( rec = page_rec_get_next(rec); } - if (page_dir_get_n_heap(page) != count + 1) { + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { + + fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", + (ulong) page_dir_get_n_heap(page), + (ulong) (count + 1)); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ + +ibool +page_simple_validate_new( +/*=====================*/ + /* out: TRUE if ok */ + page_t* page) /* in: new-style index page */ +{ + page_cur_t cur; + page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + rec_t* rec; + byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + ut_a(page_is_comp(page)); + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) { + fprintf(stderr, + "InnoDB: Nonsensical number %lu of page dir slots\n", (ulong) n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (UNIV_UNLIKELY(rec_heap_top + > page_dir_get_nth_slot(page, n_slots - 1))) { + + fprintf(stderr, + "InnoDB: Record heap and dir overlap on a page, heap top %lu, dir %lu\n", + (ulong)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page), + (ulong)(page_dir_get_nth_slot(page, n_slots - 1) - page)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. 
*/ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + page_cur_set_before_first(page, &cur); + + for (;;) { + rec = (&cur)->rec; + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Record %lu is above rec heap top %lu\n", + (ulong)(rec - page), (ulong)(rec_heap_top - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + /* This is a record pointed to by a dir slot */ + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) + != own_count)) { + + fprintf(stderr, + "InnoDB: Wrong owned count %lu, %lu, rec %lu\n", + (ulong) rec_get_n_owned_new(rec), + (ulong) own_count, + (ulong)(rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(page_dir_slot_get_rec(slot) != rec)) { + fprintf(stderr, + "InnoDB: Dir slot does not point to right rec %lu\n", + (ulong)(rec - page)); + + goto func_exit; + } + + own_count = 0; + + if (!page_cur_is_after_last(&cur)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (UNIV_UNLIKELY(rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA + || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Next record offset nonsensical %lu for rec %lu\n", + (ulong) rec_get_next_offs(rec, TRUE), + (ulong)(rec - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page record list appears to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + page_cur_move_to_next(&cur); + own_count++; + } + + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n"); + + goto func_exit; + } + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { + fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n", + (ulong) slot_no, (ulong) (n_slots - 1)); + goto func_exit; + } + + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + 2 + != count + 1)) { + fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", + (ulong) page_header_get_field(page, PAGE_N_RECS) + 2, + (ulong) (count + 1)); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Free list record has a nonsensical offset %lu\n", + (ulong)(rec - page)); + + goto func_exit; + } + + if (UNIV_UNLIKELY(rec > rec_heap_top)) { + fprintf(stderr, + "InnoDB: Free list record %lu is above rec heap top %lu\n", + (ulong)(rec - page), (ulong)(rec_heap_top - page)); + + goto func_exit; + } + + count++; + + if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Page free list appears to be circular %lu\n", + (ulong) count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + } + + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n", (ulong) page_dir_get_n_heap(page), @@ -1728,7 +2092,6 @@ page_validate( page_cur_t cur; byte* buf; ulint count; - ulint own_count; ulint slot_no; ulint data_size; rec_t* rec; @@ -1737,16 +2100,22 @@ page_validate( ulint n_slots; ibool ret = FALSE; ulint i; - ulint comp = page_is_comp(page); ulint* offsets = NULL; ulint* old_offsets = NULL; - if ((ibool)!!comp != index->table->comp) { + if (UNIV_UNLIKELY((ibool) !!page_is_comp(page) + != index->table->comp)) { fputs("InnoDB: 'compact format' flag mismatch\n", stderr); goto func_exit2; } - 
if (!page_simple_validate(page)) { - goto func_exit2; + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(!page_simple_validate_new(page))) { + goto func_exit2; + } + } else { + if (UNIV_UNLIKELY(!page_simple_validate_old(page))) { + goto func_exit2; + } } heap = mem_heap_create(UNIV_PAGE_SIZE + 200); @@ -1762,8 +2131,8 @@ page_validate( n_slots = page_dir_get_n_slots(page); - if (!(page_header_get_ptr(page, PAGE_HEAP_TOP) <= - page_dir_get_nth_slot(page, n_slots - 1))) { + if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP) <= + page_dir_get_nth_slot(page, n_slots - 1)))) { fputs("InnoDB: Record heap and dir overlap on a page ", stderr); @@ -1779,7 +2148,6 @@ page_validate( it is consistent with the directory. */ count = 0; data_size = 0; - own_count = 1; slot_no = 0; slot = page_dir_get_nth_slot(page, slot_no); @@ -1790,22 +2158,23 @@ page_validate( offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); - if (comp && page_rec_is_user_rec(rec) - && rec_get_node_ptr_flag(rec) + if (page_is_comp(page) && page_rec_is_user_rec(rec) + && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec) != (ibool) - (btr_page_get_level_low(page) != 0)) { + (btr_page_get_level_low(page) != 0))) { fputs("InnoDB: node_ptr flag mismatch\n", stderr); goto func_exit; } - if (!page_rec_validate(rec, offsets)) { + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { goto func_exit; } /* Check that the records are in the ascending order */ - if ((count >= 2) && (!page_cur_is_after_last(&cur))) { - if (!(1 == cmp_rec_rec(rec, old_rec, - offsets, old_offsets, index))) { + if (UNIV_LIKELY(count >= 2) + && (!page_cur_is_after_last(&cur))) { + if (UNIV_UNLIKELY(!(1 == cmp_rec_rec(rec, old_rec, + offsets, old_offsets, index)))) { fprintf(stderr, "InnoDB: Records in wrong order on page %lu", (ulong) buf_frame_get_page_no(page)); @@ -1828,40 +2197,27 @@ page_validate( offs = rec_get_start(rec, offsets) - page; for (i = 0; i < rec_offs_size(offsets); i++) { - if (!buf[offs + i] == 0) { + if (UNIV_UNLIKELY(buf[offs + i]++)) { /* No other record may overlap this */ fputs("InnoDB: Record overlaps another\n", stderr); goto func_exit; } - - buf[offs + i] = 1; } - - if (rec_get_n_owned(rec, comp) != 0) { - /* This is a record pointed to by a dir slot */ - if (rec_get_n_owned(rec, comp) != own_count) { - fprintf(stderr, - "InnoDB: Wrong owned count %lu, %lu\n", - (ulong) rec_get_n_owned(rec, comp), - (ulong) own_count); - goto func_exit; - } - if (page_dir_slot_get_rec(slot) != rec) { - fputs( - "InnoDB: Dir slot does not point to right rec\n", - stderr); - goto func_exit; + if (page_is_comp(page)) { + if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { + + goto check_slot; } - + } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) { +check_slot: + /* This is a record pointed to by a dir slot */ page_dir_slot_check(slot); - own_count = 0; if (!page_cur_is_after_last(&cur)) { - slot_no++; - slot = page_dir_get_nth_slot(page, slot_no); + slot = page_dir_get_nth_slot(page, slot_no++); } } @@ -1869,17 +2225,8 @@ page_validate( break; } - if (rec_get_next_offs(rec, comp) < FIL_PAGE_DATA - || rec_get_next_offs(rec, comp) >= UNIV_PAGE_SIZE) { - fprintf(stderr, - "InnoDB: Next record offset wrong %lu\n", - (ulong) rec_get_next_offs(rec, comp)); - goto func_exit; - } - count++; page_cur_move_to_next(&cur); - own_count++; old_rec = rec; /* set old_offsets to offsets; recycle offsets */ { @@ -1888,26 +2235,33 @@ page_validate( offsets = offs; } } - - if (rec_get_n_owned(rec, comp) == 0) { + + if (page_is_comp(page)) { + if 
(UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) { + + goto n_owned_zero; + } + } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) { +n_owned_zero: fputs("InnoDB: n owned is zero\n", stderr); goto func_exit; } - - if (slot_no != n_slots - 1) { + + if (UNIV_UNLIKELY(slot_no != n_slots - 1)) { fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n", (ulong) slot_no, (ulong) (n_slots - 1)); goto func_exit; } - if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) { + if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS) + 2 + != count + 1)) { fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n", (ulong) page_header_get_field(page, PAGE_N_RECS) + 2, (ulong) (count + 1)); goto func_exit; } - if (data_size != page_get_data_size(page)) { + if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) { fprintf(stderr, "InnoDB: Summed data size %lu, returned by func %lu\n", (ulong) data_size, (ulong) page_get_data_size(page)); @@ -1920,7 +2274,7 @@ page_validate( while (rec != NULL) { offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); - if (!page_rec_validate(rec, offsets)) { + if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) { goto func_exit; } @@ -1930,19 +2284,17 @@ page_validate( for (i = 0; i < rec_offs_size(offsets); i++) { - if (buf[offs + i] != 0) { + if (UNIV_UNLIKELY(buf[offs + i]++)) { fputs( "InnoDB: Record overlaps another in free list\n", stderr); goto func_exit; } - - buf[offs + i] = 1; } rec = page_rec_get_next(rec); } - if (page_dir_get_n_heap(page) != count + 1) { + if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) { fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n", (ulong) page_dir_get_n_heap(page), (ulong) count + 1); @@ -1954,7 +2306,7 @@ page_validate( func_exit: mem_heap_free(heap); - if (ret == FALSE) { + if (UNIV_UNLIKELY(ret == FALSE)) { func_exit2: fprintf(stderr, "InnoDB: Apparent corruption in page %lu in ", (ulong) buf_frame_get_page_no(page)); @@ -1980,17 +2332,33 @@ page_find_rec_with_heap_no( page_cur_set_before_first(page, &cur); - for (;;) { - if (rec_get_heap_no(cur.rec, page_is_comp(page)) == heap_no) { + if (page_is_comp(page)) { + for (;;) { + if (rec_get_heap_no_new(cur.rec) == heap_no) { - return(cur.rec); + return(cur.rec); + } + + if (page_cur_is_after_last(&cur)) { + + return(NULL); + } + + page_cur_move_to_next(&cur); } + } else { + for (;;) { + if (rec_get_heap_no_old(cur.rec) == heap_no) { - if (page_cur_is_after_last(&cur)) { + return(cur.rec); + } - return(NULL); - } + if (page_cur_is_after_last(&cur)) { - page_cur_move_to_next(&cur); + return(NULL); + } + + page_cur_move_to_next(&cur); + } } } diff --git a/page/page0zip.c b/page/page0zip.c new file mode 100644 index 00000000000..15fd70f3fea --- /dev/null +++ b/page/page0zip.c @@ -0,0 +1,331 @@ +/****************************************************** +Compressed page interface + +(c) 2005 Innobase Oy + +Created June 2005 by Marko Makela +*******************************************************/ + +#define THIS_MODULE +#include "page0zip.h" +#ifdef UNIV_NONINL +# include "page0zip.ic" +#endif +#undef THIS_MODULE +#include "page0page.h" +#include "mtr0log.h" +#include "zlib.h" + +/************************************************************************** +Compress a page. */ + +ibool +page_zip_compress( +/*==============*/ + /* out: TRUE on success, FALSE on failure; + page_zip will be left intact on failure. 
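On success, page_zip->data is laid out as the uncompressed page header (the first PAGE_DATA bytes), followed by the deflate stream of the record heap, a zero-filled area reserved for the modification log, and finally the uncompressed page trailer (the page directory and everything after it), as the copy and memset steps below show.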
*/
+	page_zip_des_t*	page_zip,/* out: compressed page */
+	const page_t*	page)	/* in: uncompressed page */
+{
+	z_stream	c_stream;
+	int		err;
+	byte*		buf;
+	ulint		trailer_len;
+
+	ut_ad(page_zip_simple_validate(page_zip));
+#ifdef UNIV_DEBUG
+	if (page_is_comp((page_t*) page)) {
+		ut_ad(page_simple_validate_new((page_t*) page));
+	} else {
+		ut_ad(page_simple_validate_old((page_t*) page));
+	}
+#endif /* UNIV_DEBUG */
+
+	buf = mem_alloc(page_zip->size - PAGE_DATA);
+
+	/* Determine the length of the page trailer. */
+	trailer_len = page + UNIV_PAGE_SIZE
+		- page_dir_get_nth_slot((page_t*) page,
+			page_dir_get_n_slots((page_t*) page) - 1);
+	ut_ad(trailer_len < UNIV_PAGE_SIZE - PAGE_DATA);
+
+	/* Compress the data payload. */
+	c_stream.zalloc = (alloc_func) 0;
+	c_stream.zfree = (free_func) 0;
+	c_stream.opaque = (voidpf) 0;
+
+	err = deflateInit(&c_stream, Z_DEFAULT_COMPRESSION);
+	ut_a(err == Z_OK);
+
+	c_stream.next_out = buf;
+	c_stream.next_in = (void*) (page + PAGE_DATA);
+	c_stream.avail_out = page_zip->size - (PAGE_DATA - 1) - trailer_len;
+	c_stream.avail_in = page_header_get_field((page_t*) page,
+					PAGE_HEAP_TOP) - PAGE_DATA;
+
+	err = deflate(&c_stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		deflateEnd(&c_stream);
+		mem_free(buf);
+		return(FALSE);
+	}
+
+	err = deflateEnd(&c_stream);
+	ut_a(err == Z_OK);
+
+	ut_ad(c_stream.avail_in == page_header_get_field((page_t*) page,
+					PAGE_HEAP_TOP) - PAGE_DATA);
+	ut_ad(c_stream.avail_out == page_zip->size - (PAGE_DATA - 1)
+					- trailer_len);
+	ut_a(c_stream.total_in == (uLong) c_stream.avail_in);
+	ut_a(c_stream.total_out <= (uLong) c_stream.avail_out);
+
+	page_zip->m_end = page_zip->m_start = PAGE_DATA + c_stream.total_out;
+	/* Copy the page header */
+	memcpy(page_zip->data, page, PAGE_DATA);
+	/* Copy the compressed data */
+	memcpy(page_zip->data + PAGE_DATA, buf, c_stream.total_out);
+	/* Zero out the area reserved for the modification log */
+	memset(page_zip->data + PAGE_DATA + c_stream.total_out, 0,
+		page_zip->size - PAGE_DATA - trailer_len - c_stream.total_out);
+	/* Copy the page trailer */
+	memcpy(page_zip->data + page_zip->size - trailer_len,
+		page + UNIV_PAGE_SIZE - trailer_len, trailer_len);
+	mem_free(buf);
+	ut_ad(page_zip_validate(page_zip, page));
+	return(TRUE);
+}
+
+/**************************************************************************
+Read an integer from the modification log of the compressed page. */
+static
+ulint
+page_zip_ulint_read(
+/*================*/
+				/* out: length of the integer, in bytes;
+				zero on failure */
+	const byte*	src,	/* in: where to read */
+	ulint*		dest)	/* out: the decoded integer */
+{
+	ulint	num = (unsigned char) *src;
+	if (num < 128) {
+		*dest = num;	/* 0xxxxxxx: 0..127 */
+		return(1);
+	}
+	if (num < 192) {	/* 10xxxxxx xxxxxxxx: 0..16383 */
+		*dest = ((num << 8) & ~0x8000) | (unsigned char) src[1];
+		return(2);
+	}
+	*dest = ULINT_MAX;
+	return(0);	/* 11xxxxxx xxxxxxxx: reserved */
+}
+
+/**************************************************************************
+Write an integer to the modification log of the compressed page.
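Values 0..127 are encoded in one byte (0xxxxxxx) and values up to 16383 in two bytes (10xxxxxx xxxxxxxx, most significant byte first), mirroring page_zip_ulint_read() above; for example, 100 is written as the single byte 0x64, and 300 (0x012c) as the two bytes 0x81 0x2c.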
*/
+static
+ulint
+page_zip_ulint_write(
+/*=================*/
+				/* out: length of the integer, in bytes;
+				zero on failure */
+	byte*	dest,	/* in: where to write */
+	ulint	num)	/* in: integer to write */
+{
+	if (num < 128) {
+		*dest = num;	/* 0xxxxxxx: 0..127 */
+		return(1);
+	}
+	if (num < 16384) {	/* 10xxxxxx xxxxxxxx: 0..16383 */
+		dest[0] = num >> 8 | 0x80;
+		dest[1] = num;
+		return(2);
+	}
+	ut_error;
+	return(0);	/* 11xxxxxx xxxxxxxx: reserved */
+}
+
+/**************************************************************************
+Decompress a page. */
+
+ibool
+page_zip_decompress(
+/*================*/
+				/* out: TRUE on success, FALSE on failure */
+	page_zip_des_t*	page_zip,/* in: data, size; out: m_start, m_end */
+	page_t*		page,	/* out: uncompressed page, may be trashed */
+	mtr_t*		mtr)	/* in: mini-transaction handle,
+				or NULL if no logging is needed */
+{
+	z_stream	d_stream;
+	int		err;
+	ulint		trailer_len;
+
+	ut_ad(page_zip_simple_validate(page_zip));
+	trailer_len = PAGE_DIR
+		+ PAGE_DIR_SLOT_SIZE
+		* page_dir_get_n_slots((page_t*) page_zip->data);
+	ut_ad(trailer_len < page_zip->size - PAGE_DATA);
+	ut_ad(page_header_get_field((page_t*) page_zip->data, PAGE_HEAP_TOP)
+		<= UNIV_PAGE_SIZE - trailer_len);
+
+	d_stream.zalloc = (alloc_func) 0;
+	d_stream.zfree = (free_func) 0;
+	d_stream.opaque = (voidpf) 0;
+
+	err = inflateInit(&d_stream);
+	ut_a(err == Z_OK);
+
+	d_stream.next_in = page_zip->data + PAGE_DATA;
+	d_stream.next_out = page + PAGE_DATA;
+	d_stream.avail_in = page_zip->size - trailer_len - (PAGE_DATA - 1);
+	d_stream.avail_out = page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+		- PAGE_DATA;
+
+	err = inflate(&d_stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		inflateEnd(&d_stream);
+		return(FALSE);
+	}
+	err = inflateEnd(&d_stream);
+	ut_a(err == Z_OK);
+
+	ut_ad(d_stream.avail_in
+		== page_zip->size - trailer_len - (PAGE_DATA - 1));
+	ut_ad(d_stream.avail_out
+		== page_header_get_field(page_zip->data, PAGE_HEAP_TOP) - PAGE_DATA);
+	ut_a(d_stream.total_in <= (uLong) d_stream.avail_in);
+	ut_a(d_stream.total_out == (uLong) d_stream.avail_out);
+
+	page_zip->m_end = page_zip->m_start = PAGE_DATA + d_stream.total_in;
+	/* Copy the page header */
+	memcpy(page, page_zip->data, PAGE_DATA);
+	/* Copy the page trailer */
+	memcpy(page + UNIV_PAGE_SIZE - trailer_len,
+		page_zip->data + page_zip->size - trailer_len, trailer_len);
+	/* Apply the modification log.
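Each log entry consists of the length of the data, the offset of the data from PAGE_DATA (both in the variable-length format of page_zip_ulint_write()), and the data bytes themselves; the log is terminated by a zero length byte. For example, an entry that writes the two bytes 0x12 0x34 at offset PAGE_DATA + 300 of the uncompressed page is stored as 0x02, 0x81, 0x2c, 0x12, 0x34.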
*/
+	while (page_zip->data[page_zip->m_end]) {
+		ulint	ulint_len;
+		ulint	length, offset;
+		ulint_len = page_zip_ulint_read(page_zip->data + page_zip->m_end,
+				&length);
+		page_zip->m_end += ulint_len;
+		if (!ulint_len
+		    || page_zip->m_end + length >= page_zip->size - trailer_len) {
+			return(FALSE);
+		}
+		ut_a(length > 0 && length < UNIV_PAGE_SIZE - PAGE_DATA);
+
+		ulint_len = page_zip_ulint_read(page_zip->data + page_zip->m_end,
+				&offset);
+		page_zip->m_end += ulint_len;
+		if (!ulint_len
+		    || page_zip->m_end + length >= page_zip->size - trailer_len) {
+			return(FALSE);
+		}
+
+		offset += PAGE_DATA;
+		ut_a(offset + length < UNIV_PAGE_SIZE - trailer_len);
+
+		memcpy(page + offset, page_zip->data + page_zip->m_end, length);
+		page_zip->m_end += length;
+	}
+
+	ut_a(page_is_comp(page));
+	ut_ad(page_simple_validate_new(page));
+
+	if (UNIV_LIKELY_NULL(mtr)) {
+		byte*	log_ptr = mlog_open(mtr, 11);
+		if (log_ptr) {
+			log_ptr = mlog_write_initial_log_record_fast(
+					page, MLOG_COMP_DECOMPRESS,
+					log_ptr, mtr);
+			mlog_close(mtr, log_ptr);
+		}
+	}
+
+	return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/**************************************************************************
+Check that the compressed and decompressed pages match. */
+
+ibool
+page_zip_validate(
+/*==============*/
+	const page_zip_des_t*	page_zip,	/* in: compressed page */
+	const page_t*		page)		/* in: uncompressed page */
+{
+	page_zip_des_t	temp_page_zip = *page_zip;
+	page_t		temp_page[UNIV_PAGE_SIZE];
+
+	ut_ad(buf_block_get_page_zip(buf_block_align((byte*)page))
+		== page_zip);
+
+	return(page_zip_decompress(&temp_page_zip, temp_page, NULL)
+		&& !memcmp(page, temp_page, UNIV_PAGE_SIZE));
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************************
+Write data to the compressed portion of a page. The data must already
+have been written to the uncompressed page. */
+
+void
+page_zip_write(
+/*===========*/
+	page_zip_des_t*	page_zip,/* in/out: compressed page */
+	const byte*	str,	/* in: address on the uncompressed page */
+	ulint		length)	/* in: length of the data */
+{
+	ulint	pos = ut_align_offset(str, UNIV_PAGE_SIZE);
+#ifdef UNIV_DEBUG
+	ulint	trailer_len = PAGE_DIR
+		+ PAGE_DIR_SLOT_SIZE
+		* page_dir_get_n_slots((page_t*) page_zip->data);
+#endif /* UNIV_DEBUG */
+
+	ut_ad(buf_block_get_page_zip(buf_block_align((byte*)str)) == page_zip);
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(page_zip->m_start >= PAGE_DATA);
+	ut_ad(page_dir_get_n_slots(ut_align_down((byte*) str, UNIV_PAGE_SIZE))
+		== page_dir_get_n_slots((page_t*) page_zip->data));
+	ut_ad(!page_zip->data[page_zip->m_end]);
+
+	ut_ad(PAGE_DATA + trailer_len < page_zip->size);
+
+	ut_ad(pos >= PAGE_DATA);
+	ut_ad(pos + length <= UNIV_PAGE_SIZE - trailer_len);
+
+	pos -= PAGE_DATA;
+
+	ut_ad(page_zip_available(page_zip, page_zip_entry_size(pos, length)));
+
+	/* Append to the modification log. */
+	page_zip->m_end += page_zip_ulint_write(
+			page_zip->data + page_zip->m_end, length);
+	page_zip->m_end += page_zip_ulint_write(
+			page_zip->data + page_zip->m_end, pos);
+	memcpy(&page_zip->data[page_zip->m_end], str, length);
+	page_zip->m_end += length;
+	ut_ad(!page_zip->data[page_zip->m_end]);
+	ut_ad(page_zip->m_end < page_zip->size - trailer_len);
+	ut_ad(page_zip_validate(page_zip,
+			ut_align_down((byte*) str, UNIV_PAGE_SIZE)));
+}
+
+#ifdef UNIV_DEBUG
+/**************************************************************************
+Determine if enough space is available in the modification log.
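A non-inlined wrapper around page_zip_available(), compiled only under UNIV_DEBUG (useful, for example, when invoking the check from a debugger).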
*/ + +ibool +page_zip_available_noninline( +/*=========================*/ + /* out: TRUE if enough space + is available */ + const page_zip_des_t* page_zip,/* in: compressed page */ + ulint size) +{ + return(page_zip_available(page_zip, size)); +} +#endif /* UNIV_DEBUG */ diff --git a/rem/rem0rec.c b/rem/rem0rec.c index 9480c978755..9af3e80b488 100644 --- a/rem/rem0rec.c +++ b/rem/rem0rec.c @@ -537,14 +537,13 @@ rec_set_nth_field_null_bit( } /*************************************************************** -Sets the value of the ith field extern storage bit of an old-style record. */ +Sets the ith field extern storage bit of an old-style record. */ void rec_set_nth_field_extern_bit_old( /*=============================*/ rec_t* rec, /* in: old-style record */ ulint i, /* in: ith field */ - ibool val, /* in: value to set */ mtr_t* mtr) /* in: mtr holding an X-latch to the page where rec is, or NULL; in the NULL case we do not write to log about the change */ @@ -556,11 +555,7 @@ rec_set_nth_field_extern_bit_old( info = rec_2_get_field_end_info(rec, i); - if (val) { - info = info | REC_2BYTE_EXTERN_MASK; - } else { - info = info & ~REC_2BYTE_EXTERN_MASK; - } + info |= REC_2BYTE_EXTERN_MASK; if (mtr) { mlog_write_ulint(rec - REC_N_OLD_EXTRA_BYTES - 2 * (i + 1), @@ -571,7 +566,7 @@ rec_set_nth_field_extern_bit_old( } /*************************************************************** -Sets the value of the ith field extern storage bit of a new-style record. */ +Sets the ith field extern storage bit of a new-style record. */ void rec_set_nth_field_extern_bit_new( @@ -579,7 +574,6 @@ rec_set_nth_field_extern_bit_new( rec_t* rec, /* in: record */ dict_index_t* index, /* in: record descriptor */ ulint ith, /* in: ith field */ - ibool val, /* in: value to set */ mtr_t* mtr) /* in: mtr holding an X-latch to the page where rec is, or NULL; in the NULL case we do not write to log about the change */ @@ -632,11 +626,11 @@ rec_set_nth_field_extern_bit_new( ulint len = lens[1]; if (len & 0x80) { /* 1exxxxxx: 2-byte length */ if (i == ith) { - if (!val == !(len & 0x40)) { + if (len & 0x40) { return; /* no change */ } /* toggle the extern bit */ - len ^= 0x40; + len |= 0x40; if (mtr) { mlog_write_ulint(lens + 1, len, MLOG_1BYTE, mtr); @@ -677,12 +671,11 @@ rec_set_field_extern_bits( if (UNIV_LIKELY(index->table->comp)) { for (i = 0; i < n_fields; i++) { rec_set_nth_field_extern_bit_new(rec, index, vec[i], - TRUE, mtr); + mtr); } } else { for (i = 0; i < n_fields; i++) { - rec_set_nth_field_extern_bit_old(rec, vec[i], - TRUE, mtr); + rec_set_nth_field_extern_bit_old(rec, vec[i], mtr); } } } @@ -745,7 +738,7 @@ rec_convert_dtuple_to_rec_old( rec_set_n_fields_old(rec, n_fields); /* Set the info bits of the record */ - rec_set_info_bits(rec, FALSE, + rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); /* Store the data and the offsets */ @@ -835,8 +828,6 @@ rec_convert_dtuple_to_rec_new( ulint fixed_len; ulint null_mask = 1; const ulint n_fields = dtuple_get_n_fields(dtuple); - const ulint status = dtuple_get_info_bits(dtuple) - & REC_NEW_STATUS_MASK; ut_ad(index->table->comp); ut_ad(n_fields > 0); @@ -847,7 +838,8 @@ rec_convert_dtuple_to_rec_new( UNIV_PREFETCH_RW(rec - REC_N_NEW_EXTRA_BYTES - n_fields); UNIV_PREFETCH_RW(rec); - switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) { + switch (UNIV_EXPECT(dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK, + REC_STATUS_ORDINARY)) { case REC_STATUS_ORDINARY: ut_ad(n_fields <= dict_index_get_n_fields(index)); n_node_ptr_field = 
ULINT_UNDEFINED; @@ -862,7 +854,7 @@ rec_convert_dtuple_to_rec_new( n_node_ptr_field = ULINT_UNDEFINED; goto init; default: - ut_a(0); + ut_error; return(0); } @@ -912,10 +904,8 @@ init: memset (lens + 1, 0, nulls - lens); /* Set the info bits of the record */ - rec_set_status(rec, status); - - rec_set_info_bits(rec, TRUE, - dtuple_get_info_bits(dtuple) & REC_INFO_BITS_MASK); + rec_set_info_and_status_bits(rec, NULL, + dtuple_get_info_bits(dtuple)); /* Store the data and the offsets */ @@ -928,6 +918,7 @@ init: ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); ut_ad(len == 4); memcpy(end, dfield_get_data(field), len); + end += 4; break; } fixed_len = dict_index_get_nth_field(index, i)->fixed_len; diff --git a/row/row0ins.c b/row/row0ins.c index 5e833372299..307ab69a40b 100644 --- a/row/row0ins.c +++ b/row/row0ins.c @@ -2409,7 +2409,7 @@ row_ins_step( goto same_trx; } - trx_write_trx_id(node->trx_id_buf, trx->id); + trx_write_trx_id(node->trx_id_buf, NULL, trx->id); err = lock_table(0, node->table, LOCK_IX, thr); diff --git a/row/row0row.c b/row/row0row.c index 9a74397dc08..50bba7c0601 100644 --- a/row/row0row.c +++ b/row/row0row.c @@ -67,9 +67,10 @@ is slower than the specialized inline functions. */ void row_set_rec_sys_field( /*==================*/ - /* out: value of the field */ ulint type, /* in: DATA_TRX_ID or DATA_ROLL_PTR */ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page with at least + 10 or 11 bytes available, or NULL */ dict_index_t* index, /* in: clustered index */ const ulint* offsets,/* in: rec_get_offsets(rec, index) */ dulint val) /* in: value to set */ @@ -87,11 +88,11 @@ row_set_rec_sys_field( if (type == DATA_TRX_ID) { - trx_write_trx_id(field, val); + trx_write_trx_id(field, page_zip/* 10 bytes */, val); } else { ut_ad(type == DATA_ROLL_PTR); - trx_write_roll_ptr(field, val); + trx_write_roll_ptr(field, page_zip/* 11 bytes */, val); } } diff --git a/row/row0sel.c b/row/row0sel.c index 1b66f14f5d7..a9774beb526 100644 --- a/row/row0sel.c +++ b/row/row0sel.c @@ -2059,7 +2059,7 @@ row_sel_convert_mysql_key_to_innobase( dfield = dtuple_get_nth_field(tuple, 0); field = dict_index_get_nth_field(index, 0); - if (dfield_get_type(dfield)->mtype == DATA_SYS) { + if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) { /* A special case: we are looking for a position in the generated clustered index which InnoDB automatically added to a table with no primary key: the first and the only @@ -2077,8 +2077,9 @@ row_sel_convert_mysql_key_to_innobase( while (key_ptr < key_end) { - ut_a(dict_col_get_type(field->col)->mtype - == dfield_get_type(dfield)->mtype); + type = dfield_get_type(dfield)->mtype; + + ut_a(dict_col_get_type(field->col)->mtype == type); data_offset = 0; is_null = FALSE; @@ -2096,8 +2097,6 @@ row_sel_convert_mysql_key_to_innobase( } } - type = dfield_get_type(dfield)->mtype; - /* Calculate data length and data field total length */ if (type == DATA_BLOB) { @@ -2143,9 +2142,9 @@ row_sel_convert_mysql_key_to_innobase( data_field_len = data_offset + data_len; } - if (dtype_get_mysql_type(dfield_get_type(dfield)) - == DATA_MYSQL_TRUE_VARCHAR - && dfield_get_type(dfield)->mtype != DATA_INT) { + if (UNIV_UNLIKELY(dtype_get_mysql_type(dfield_get_type(dfield)) + == DATA_MYSQL_TRUE_VARCHAR) + && UNIV_LIKELY(type != DATA_INT)) { /* In a MySQL key value format, a true VARCHAR is always preceded by 2 bytes of a length field. 
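Those two length bytes are stored least significant byte first.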
dfield_get_type(dfield)->len returns the maximum @@ -2161,7 +2160,7 @@ row_sel_convert_mysql_key_to_innobase( /* Storing may use at most data_len bytes of buf */ - if (!is_null) { + if (UNIV_LIKELY(!is_null)) { row_mysql_store_col_in_innobase_format( dfield, buf, @@ -2174,7 +2173,7 @@ row_sel_convert_mysql_key_to_innobase( key_ptr += data_field_len; - if (key_ptr > key_end) { + if (UNIV_UNLIKELY(key_ptr > key_end)) { /* The last field in key was not a complete key field but a prefix of it. diff --git a/row/row0upd.c b/row/row0upd.c index ff1ad1dfd05..4d98462bfeb 100644 --- a/row/row0upd.c +++ b/row/row0upd.c @@ -301,7 +301,8 @@ recovery. */ void row_upd_rec_sys_fields_in_recovery( /*===============================*/ - rec_t* rec, /* in: record */ + rec_t* rec, /* in/out: record */ + page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */ const ulint* offsets,/* in: array returned by rec_get_offsets() */ ulint pos, /* in: TRX_ID position in rec */ dulint trx_id, /* in: transaction id */ @@ -312,11 +313,11 @@ row_upd_rec_sys_fields_in_recovery( field = rec_get_nth_field(rec, offsets, pos, &len); ut_ad(len == DATA_TRX_ID_LEN); - trx_write_trx_id(field, trx_id); + trx_write_trx_id(field, page_zip, trx_id); field = rec_get_nth_field(rec, offsets, pos + 1, &len); ut_ad(len == DATA_ROLL_PTR_LEN); - trx_write_roll_ptr(field, roll_ptr); + trx_write_roll_ptr(field, page_zip, roll_ptr); } /************************************************************************* @@ -345,10 +346,10 @@ row_upd_index_entry_sys_field( field = dfield_get_data(dfield); if (type == DATA_TRX_ID) { - trx_write_trx_id(field, val); + trx_write_trx_id(field, NULL, val); } else { ut_ad(type == DATA_ROLL_PTR); - trx_write_roll_ptr(field, val); + trx_write_roll_ptr(field, NULL, val); } } @@ -445,7 +446,11 @@ row_upd_rec_in_place( ut_ad(rec_offs_validate(rec, NULL, offsets)); - rec_set_info_bits(rec, rec_offs_comp(offsets), update->info_bits); + if (rec_offs_comp(offsets)) { + rec_set_info_bits_new(rec, NULL, update->info_bits); + } else { + rec_set_info_bits_old(rec, update->info_bits); + } n_fields = upd_get_n_fields(update); @@ -480,7 +485,7 @@ row_upd_write_sys_vals_to_log( log_ptr += mach_write_compressed(log_ptr, dict_index_get_sys_col_pos(index, DATA_TRX_ID)); - trx_write_roll_ptr(log_ptr, roll_ptr); + trx_write_roll_ptr(log_ptr, NULL, roll_ptr); log_ptr += DATA_ROLL_PTR_LEN; log_ptr += mach_dulint_write_compressed(log_ptr, trx->id); @@ -2040,5 +2045,11 @@ row_upd_in_place_in_select( err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur, node->update, node->cmpl_info, thr, mtr); + /* TODO: the above can fail if page_zip != NULL. + However, this function row_upd_in_place_in_select() is only invoked + when executing UPDATE statements of the built-in InnoDB SQL parser. + The built-in SQL is only used for InnoDB system tables, which + always are in the old, uncompressed format (ROW_FORMAT=REDUNDANT, + comp == FALSE, page_zip == NULL). 
*/ ut_ad(err == DB_SUCCESS); } diff --git a/trx/trx0rec.c b/trx/trx0rec.c index 3b7171e6038..20e6cfebfd6 100644 --- a/trx/trx0rec.c +++ b/trx/trx0rec.c @@ -807,7 +807,7 @@ trx_undo_update_rec_get_update( upd_field = upd_get_nth_field(update, n_fields); buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN); - trx_write_trx_id(buf, trx_id); + trx_write_trx_id(buf, NULL, trx_id); upd_field_set_field_no(upd_field, dict_index_get_sys_col_pos(index, DATA_TRX_ID), @@ -816,7 +816,7 @@ trx_undo_update_rec_get_update( upd_field = upd_get_nth_field(update, n_fields + 1); buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN); - trx_write_roll_ptr(buf, roll_ptr); + trx_write_roll_ptr(buf, NULL, roll_ptr); upd_field_set_field_no(upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR),