From f4c776f518bac3ea1ade5c1a49257e8fae8728db Mon Sep 17 00:00:00 2001 From: marko <> Date: Wed, 5 Apr 2006 13:41:12 +0000 Subject: [PATCH] branches/zip: Initial steps towards disk-based storage of compressed pages. dict_mem_table_create(): Account for DICT_TF_COMPRESSED in a debug assertion. btr_store_big_rec_extern_fields(), btr_free_externally_stored_field(), btr_copy_externally_stored_field(): Implement the disk format for compressed BLOB pages. btr_copy_externally_stored_field(): Improve error reporting and handling when decompressing BLOB pages. buf_flush_init_for_writing(), buf_page_is_corrupted(), buf_page_print(): Account for compressed BLOB pages (FIL_PAGE_TYPE_ZBLOB). buf_calc_zblob_page_checksum(): New function. --- btr/btr0cur.c | 93 ++++++++++++++++++++++++++++++++++------------- buf/buf0buf.c | 64 +++++++++++++++++++++++--------- buf/buf0flu.c | 19 ++++++++++ dict/dict0mem.c | 2 +- include/btr0btr.h | 3 +- include/buf0buf.h | 15 +++++++- include/fil0fil.h | 1 + page/page0zip.c | 3 +- trx/trx0sys.c | 12 +++++- 9 files changed, 163 insertions(+), 49 deletions(-) diff --git a/btr/btr0cur.c b/btr/btr0cur.c index 46deaeaa148..e6d177e5903 100644 --- a/btr/btr0cur.c +++ b/btr/btr0cur.c @@ -3575,9 +3575,6 @@ btr_store_big_rec_extern_fields( return(DB_OUT_OF_FILE_SPACE); } - mlog_write_ulint(page + FIL_PAGE_TYPE, - FIL_PAGE_TYPE_BLOB, MLOG_2BYTES, &mtr); - page_no = buf_frame_get_page_no(page); if (prev_page_no != FIL_NULL) { @@ -3592,7 +3589,7 @@ btr_store_big_rec_extern_fields( #endif /* UNIV_SYNC_DEBUG */ if (UNIV_LIKELY_NULL(page_zip)) { - next_ptr = prev_page; + next_ptr = prev_page + FIL_PAGE_NEXT; } else { next_ptr = prev_page + FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO; @@ -3605,8 +3602,13 @@ btr_store_big_rec_extern_fields( if (UNIV_LIKELY_NULL(page_zip)) { int err; - c_stream.next_out = page + 4; - c_stream.avail_out = UNIV_PAGE_SIZE - 4; + mach_write_to_4(page + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_ZBLOB); + + c_stream.next_out = page + + FIL_PAGE_FILE_FLUSH_LSN; + c_stream.avail_out = page_zip->size + - FIL_PAGE_FILE_FLUSH_LSN; err = deflate(&c_stream, Z_FINISH); ut_a(err == Z_OK || err == Z_STREAM_END); @@ -3614,12 +3616,14 @@ btr_store_big_rec_extern_fields( || c_stream.avail_out == 0); /* Write the "next BLOB page" pointer */ - mach_write_to_4(page, FIL_NULL); + mlog_write_ulint(page + FIL_PAGE_NEXT, + FIL_NULL, MLOG_4BYTES, &mtr); /* Zero out the unused part of the page. */ - memset(page + UNIV_PAGE_SIZE + memset(page + page_zip->size - c_stream.avail_out, 0, c_stream.avail_out); - mlog_log_string(page, UNIV_PAGE_SIZE, &mtr); + mlog_log_string(page + FIL_PAGE_TYPE, + page_zip->size - FIL_PAGE_TYPE, &mtr); if (err == Z_OK && prev_page_no != FIL_NULL) { @@ -3663,7 +3667,7 @@ btr_store_big_rec_extern_fields( mlog_write_ulint(field_ref + BTR_EXTERN_OFFSET, - FIL_PAGE_DATA, + FIL_PAGE_NEXT, MLOG_4BYTES, &mtr); } @@ -3680,6 +3684,10 @@ next_zip_page: break; } } else { + mlog_write_ulint(page + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_BLOB, + MLOG_2BYTES, &mtr); + if (extern_len > (UNIV_PAGE_SIZE - FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE @@ -3851,7 +3859,7 @@ btr_free_externally_stored_field( if (dict_table_is_zip(index->table)) { /* Note that page_zip will be NULL in row_purge_upd_exist_or_extern(). */ - next_page_no = mach_read_from_4(page); + next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT); btr_page_free_low(index->tree, page, space_id, page_no, 0, &mtr); @@ -4015,7 +4023,8 @@ btr_copy_externally_stored_field( byte* data, /* in: 'internally' stored part of the field containing also the reference to the external part */ - ibool zip, /* in: TRUE=compressed BLOB */ + ulint zip_size,/* in: nonzero=compressed BLOB page size, + zero for uncompressed BLOBs */ ulint local_len,/* in: length of data */ mem_heap_t* heap) /* in: mem heap */ { @@ -4055,7 +4064,7 @@ btr_copy_externally_stored_field( return(buf); } - if (UNIV_UNLIKELY(zip)) { + if (UNIV_UNLIKELY(zip_size)) { int err; d_stream.zalloc = (alloc_func) 0; d_stream.zfree = (free_func) 0; @@ -4076,12 +4085,36 @@ btr_copy_externally_stored_field( #ifdef UNIV_SYNC_DEBUG buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE); #endif /* UNIV_SYNC_DEBUG */ - if (UNIV_UNLIKELY(zip)) { + if (UNIV_UNLIKELY(zip_size)) { int err; - d_stream.next_in = page + 4; - d_stream.avail_in = UNIV_PAGE_SIZE - 4;/* TODO */ - page_no = mach_read_from_4(page); + if (UNIV_UNLIKELY(fil_page_get_type(page) + != FIL_PAGE_TYPE_ZBLOB)) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Unknown type %lu of compressed BLOB page %lu space %lu\n", + (ulong) fil_page_get_type(page), + (ulong) page_no, (ulong) space_id); + } + + page_no = mach_read_from_4(page + offset); + + if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) { + /* When the BLOB begins at page header, + the compressed data payload does not + immediately follow the next page pointer. */ + offset = FIL_PAGE_FILE_FLUSH_LSN; + } else { + offset += 4; + } + + d_stream.next_in = page + offset; + d_stream.avail_in = zip_size - offset; + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + offset = FIL_PAGE_NEXT; err = inflate(&d_stream, Z_NO_FLUSH); switch (err) { @@ -4095,16 +4128,23 @@ btr_copy_externally_stored_field( default: mtr_commit(&mtr); inflateEnd(&d_stream); - ut_error;/* TODO: report error */ +inflate_error: + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: inflate() of compressed BLOB page %lu space %lu returned %d\n", + (ulong) page_no, (ulong) space_id, + err); + *len = 0; return(buf); } if (page_no == FIL_NULL) { err = inflate(&d_stream, Z_FINISH); - /* TODO: report error instead of - assertion failure? */ - ut_a(err == Z_STREAM_END); - end_of_blob: + if (UNIV_UNLIKELY(err != Z_STREAM_END)) { + + goto inflate_error; + } +end_of_blob: mtr_commit(&mtr); ut_a(!d_stream.avail_out); @@ -4160,8 +4200,9 @@ btr_rec_copy_externally_stored_field( ulint* len, /* out: length of the field */ mem_heap_t* heap) /* in: mem heap */ { - ulint local_len; - byte* data; + ulint local_len; + byte* data; + page_zip_des_t* page_zip; ut_ad(rec_offs_validate(rec, NULL, offsets)); ut_a(rec_offs_nth_extern(offsets, no)); @@ -4177,7 +4218,9 @@ btr_rec_copy_externally_stored_field( data = rec_get_nth_field(rec, offsets, no, &local_len); + page_zip = buf_block_get_page_zip(buf_block_align(rec)); + return(btr_copy_externally_stored_field(len, data, - !!buf_block_get_page_zip(buf_block_align(rec)), + page_zip ? page_zip->size : 0, local_len, heap)); } diff --git a/buf/buf0buf.c b/buf/buf0buf.c index 55011e967c9..b7f16745c40 100644 --- a/buf/buf0buf.c +++ b/buf/buf0buf.c @@ -232,6 +232,22 @@ ibool buf_debug_prints = FALSE; /* If this is set TRUE, the program prints info whenever read-ahead or flush occurs */ #endif /* UNIV_DEBUG */ +/************************************************************************ +Calculates a compressed BLOB page checksum which is stored to the page +when it is written to a file. Note that we must be careful to calculate +the same value on 32-bit and 64-bit architectures. */ + +ulint +buf_calc_zblob_page_checksum( +/*=========================*/ + /* out: checksum */ + const byte* page, /* in: compressed BLOB page */ + ulint zip_size) /* in: size of the page, in bytes */ +{ + return(ut_fold_binary(page + FIL_PAGE_SPACE_OR_CHKSUM, + zip_size - FIL_PAGE_SPACE_OR_CHKSUM) & 0xFFFFFFFFUL); +} + /************************************************************************ Calculates a page checksum which is stored to the page when it is written to a file. Note that we must be careful to calculate the same value on @@ -293,18 +309,19 @@ ibool buf_page_is_corrupted( /*==================*/ /* out: TRUE if corrupted */ - byte* read_buf) /* in: a database page */ + byte* read_buf, /* in: a database page */ + ulint zip_size) /* in: size of compressed page; + 0 for uncompressed pages */ { - ulint checksum; - ulint old_checksum; ulint checksum_field; ulint old_checksum_field; #ifndef UNIV_HOTBACKUP dulint current_lsn; #endif - if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4) - != mach_read_from_4(read_buf + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { + if (UNIV_LIKELY(!zip_size) + && memcmp(read_buf + FIL_PAGE_LSN + 4, + read_buf + UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { /* Stored log sequence numbers at the start and the end of page do not match */ @@ -341,8 +358,16 @@ buf_page_is_corrupted( BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums disabled. Otherwise, skip checksum calculation and return FALSE */ - if (srv_use_checksums) { - old_checksum = buf_calc_page_old_checksum(read_buf); + if (UNIV_LIKELY(srv_use_checksums)) { + checksum_field = mach_read_from_4(read_buf + + FIL_PAGE_SPACE_OR_CHKSUM); + + if (UNIV_UNLIKELY(zip_size)) { + return(checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_zblob_page_checksum( + read_buf, zip_size)); + } old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM); @@ -357,21 +382,20 @@ buf_page_is_corrupted( if (old_checksum_field != mach_read_from_4(read_buf + FIL_PAGE_LSN) - && old_checksum_field != old_checksum - && old_checksum_field != BUF_NO_CHECKSUM_MAGIC) { + && old_checksum_field != BUF_NO_CHECKSUM_MAGIC + && old_checksum_field + != buf_calc_page_old_checksum(read_buf)) { return(TRUE); } - checksum = buf_calc_page_new_checksum(read_buf); - checksum_field = mach_read_from_4(read_buf + - FIL_PAGE_SPACE_OR_CHKSUM); - /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id (always equal to 0), to FIL_PAGE_SPACE_SPACE_OR_CHKSUM */ - if (checksum_field != 0 && checksum_field != checksum - && checksum_field != BUF_NO_CHECKSUM_MAGIC) { + if (checksum_field != 0 + && checksum_field != BUF_NO_CHECKSUM_MAGIC + && checksum_field + != buf_calc_page_new_checksum(read_buf)) { return(TRUE); } @@ -398,6 +422,7 @@ buf_page_print( ut_print_buf(stderr, read_buf, UNIV_PAGE_SIZE); fputs("InnoDB: End of page dump\n", stderr); + /* TODO: print zipped pages differently, esp. BLOB pages */ checksum = srv_use_checksums ? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC; old_checksum = srv_use_checksums ? @@ -489,6 +514,10 @@ buf_page_print( fputs("InnoDB: Page may be a BLOB page\n", stderr); break; + case FIL_PAGE_TYPE_ZBLOB: + fputs("InnoDB: Page may be a compressed BLOB page\n", + stderr); + break; } } @@ -1885,7 +1914,8 @@ buf_page_io_complete( /* From version 3.23.38 up we store the page checksum to the 4 first bytes of the page end lsn field */ - if (buf_page_is_corrupted(block->frame)) { + if (buf_page_is_corrupted(block->frame, + block->space ? 16384 : 0/* TODO */)) { fprintf(stderr, "InnoDB: Database page corruption on disk or a failed\n" "InnoDB: file read of page %lu.\n", (ulong) block->offset); diff --git a/buf/buf0flu.c b/buf/buf0flu.c index 33f0494e225..f36a1bd7038 100644 --- a/buf/buf0flu.c +++ b/buf/buf0flu.c @@ -455,6 +455,25 @@ buf_flush_init_for_writing( { page_zip_des_t* page_zip = page_zip_; + if (space/* TODO: space_is_zip */) { + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ZBLOB: + ut_ad(!page_zip); + mach_write_to_4(page + FIL_PAGE_OFFSET, page_no); + mach_write_to_4(page + FIL_PAGE_PREV, space); + mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? buf_calc_zblob_page_checksum( + page, 16384/* TODO */) + : BUF_NO_CHECKSUM_MAGIC); + return; + case FIL_PAGE_INDEX: + /* TODO: special handling */ + break; + } + } + /* Write the newest modification lsn to the page header and trailer */ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); diff --git a/dict/dict0mem.c b/dict/dict0mem.c index d9f0ad3d84e..d3ec869cfb3 100644 --- a/dict/dict0mem.c +++ b/dict/dict0mem.c @@ -42,7 +42,7 @@ dict_mem_table_create( mem_heap_t* heap; ut_ad(name); - ut_ad(!(flags & ~DICT_TF_COMPACT)); + ut_ad(!(flags & ~(DICT_TF_COMPACT | DICT_TF_COMPRESSED))); heap = mem_heap_create(DICT_HEAP_SIZE); diff --git a/include/btr0btr.h b/include/btr0btr.h index eccc35bafa1..b181b73fe57 100644 --- a/include/btr0btr.h +++ b/include/btr0btr.h @@ -207,8 +207,7 @@ btr_page_reorganize( /* out: TRUE on success, FALSE on failure */ page_t* page, /* in: page to be reorganized */ dict_index_t* index, /* in: record descriptor */ - mtr_t* mtr) /* in: mtr */ - __attribute__((nonnull, warn_unused_result)); + mtr_t* mtr); /* in: mtr */ /***************************************************************** Decides if the page should be split at the convergence point of inserts converging to left. */ diff --git a/include/buf0buf.h b/include/buf0buf.h index d2dc01b475f..2fbe024386e 100644 --- a/include/buf0buf.h +++ b/include/buf0buf.h @@ -390,6 +390,17 @@ buf_block_get_modify_clock( /* out: value */ buf_block_t* block); /* in: block */ /************************************************************************ +Calculates a compressed BLOB page checksum which is stored to the page +when it is written to a file. Note that we must be careful to calculate +the same value on 32-bit and 64-bit architectures. */ + +ulint +buf_calc_zblob_page_checksum( +/*=========================*/ + /* out: checksum */ + const byte* page, /* in: compressed BLOB page */ + ulint zip_size); /* in: size of the page, in bytes */ +/************************************************************************ Calculates a page checksum which is stored to the page when it is written to a file. Note that we must be careful to calculate the same value on 32-bit and 64-bit architectures. */ @@ -419,7 +430,9 @@ ibool buf_page_is_corrupted( /*==================*/ /* out: TRUE if corrupted */ - byte* read_buf); /* in: a database page */ + byte* read_buf, /* in: a database page */ + ulint zip_size); /* in: size of compressed page; + 0 for uncompressed pages */ /************************************************************************** Gets the page number of a pointer pointing within a buffer frame containing a file page. */ diff --git a/include/fil0fil.h b/include/fil0fil.h index 53bb496c190..f95def47a5a 100644 --- a/include/fil0fil.h +++ b/include/fil0fil.h @@ -105,6 +105,7 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_TYPE_FSP_HDR 8 /* File space header */ #define FIL_PAGE_TYPE_XDES 9 /* Extent descriptor page */ #define FIL_PAGE_TYPE_BLOB 10 /* Uncompressed BLOB page */ +#define FIL_PAGE_TYPE_ZBLOB 11 /* Compressed BLOB page */ /* Space types */ #define FIL_TABLESPACE 501 diff --git a/page/page0zip.c b/page/page0zip.c index 3b40b3f1556..da772054107 100644 --- a/page/page0zip.c +++ b/page/page0zip.c @@ -968,7 +968,8 @@ page_zip_fields_decode( return(NULL); } - table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, TRUE); + table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, + DICT_TF_COMPACT | DICT_TF_COMPRESSED); index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY", DICT_HDR_SPACE, 0, n); index->table = table; diff --git a/trx/trx0sys.c b/trx/trx0sys.c index bd1bceef7da..248869a817a 100644 --- a/trx/trx0sys.c +++ b/trx/trx0sys.c @@ -465,13 +465,21 @@ trx_sys_doublewrite_init_or_restore_pages( /* It is an unwritten doublewrite buffer page: do nothing */ } else { + ulint zip_size; /* Read in the actual page from the data files */ fil_io(OS_FILE_READ, TRUE, space_id, page_no, 0, UNIV_PAGE_SIZE, read_buf, NULL); /* Check if the page is corrupt */ - if (buf_page_is_corrupted(read_buf)) { + if (space_id && fil_page_get_type(read_buf) + == FIL_PAGE_TYPE_ZBLOB) { + zip_size = 16384; /* TODO */ + } else { + zip_size = 0; + } + + if (buf_page_is_corrupted(read_buf, zip_size)) { fprintf(stderr, "InnoDB: Warning: database page corruption or a failed\n" @@ -479,7 +487,7 @@ trx_sys_doublewrite_init_or_restore_pages( fprintf(stderr, "InnoDB: Trying to recover it from the doublewrite buffer.\n"); - if (buf_page_is_corrupted(page)) { + if (buf_page_is_corrupted(page, zip_size)) { fprintf(stderr, "InnoDB: Dump of the page:\n"); buf_page_print(read_buf);