diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index a2cd3253782..a88fceac6e6 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -7090,7 +7090,7 @@ static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr) mysql_mutex_lock(&buf_pool.mutex); if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold)) - if(!buf_LRU_free_page(bpage, all) && all && bpage->zip.data) + if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data) /* Attempt to deallocate the redundant copy of the uncompressed page if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */ buf_LRU_free_page(bpage, false); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 1e19ca713be..ebd3d13ec97 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -1108,13 +1108,15 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const break; } + const lsn_t lsn= block->page.oldest_modification(); + if (fsp_is_system_temporary(block->page.id().space())) { - ut_ad(block->page.oldest_modification() <= 1); + ut_ad(lsn == 0 || lsn == 2); break; } - if (!block->page.ready_for_replace()) + if (lsn > 1 || !block->page.can_relocate()) return block; break; @@ -1269,9 +1271,9 @@ void buf_pool_t::close() Only on aborted startup (with recovery) or with innodb_fast_shutdown=2 we may discard changes. */ ut_d(const lsn_t oldest= bpage->oldest_modification();) - ut_ad(!oldest || srv_is_being_started || - srv_fast_shutdown == 2 || - (oldest == 1 && fsp_is_system_temporary(bpage->id().space()))); + ut_ad(fsp_is_system_temporary(bpage->id().space()) + ? 
(oldest == 0 || oldest == 2) + : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2); if (bpage->state() != BUF_BLOCK_FILE_PAGE) buf_page_free_descriptor(bpage); @@ -1489,10 +1491,10 @@ inline bool buf_pool_t::withdraw_blocks() /* reserve free_list length */ if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { - buf_flush_lists( + buf_flush_LRU( std::max(withdraw_target - UT_LIST_GET_LEN(withdraw), - srv_LRU_scan_depth), 0); + srv_LRU_scan_depth)); buf_flush_wait_batch_end_acquiring_mutex(true); } @@ -2970,8 +2972,10 @@ re_evict: fix_block->fix(); mysql_mutex_unlock(&buf_pool.mutex); - buf_flush_lists(ULINT_UNDEFINED, LSN_MAX); + buf_flush_list(); buf_flush_wait_batch_end_acquiring_mutex(false); + while (buf_flush_list_space(space)); + os_aio_wait_until_no_pending_writes(); if (fix_block->page.buf_fix_count() == 1 && !fix_block->page.oldest_modification()) { @@ -4066,8 +4070,8 @@ void buf_pool_t::print() << UT_LIST_GET_LEN(flush_list) << ", n pending decompressions=" << n_pend_unzip << ", n pending reads=" << n_pend_reads - << ", n pending flush LRU=" << n_flush_LRU - << " list=" << n_flush_list + << ", n pending flush LRU=" << n_flush_LRU_ + << " list=" << n_flush_list_ << ", pages made young=" << stat.n_pages_made_young << ", not young=" << stat.n_pages_not_made_young << ", pages read=" << stat.n_pages_read @@ -4166,7 +4170,6 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) double time_elapsed; mysql_mutex_lock(&buf_pool.mutex); - mysql_mutex_lock(&buf_pool.flush_list_mutex); pool_info->pool_size = buf_pool.curr_size; @@ -4176,17 +4179,17 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free); + mysql_mutex_lock(&buf_pool.flush_list_mutex); pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); pool_info->n_pend_reads = buf_pool.n_pend_reads; - 
pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU; + pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_; - pool_info->n_pending_flush_list = buf_pool.n_flush_list; - - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + pool_info->n_pending_flush_list = buf_pool.n_flush_list_; current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index ba1c386687e..c512cf01579 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -669,6 +669,13 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast(frame))); } + const lsn_t lsn= mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + + static_cast(frame))); + ut_ad(lsn); + ut_ad(lsn >= bpage->oldest_modification()); + if (lsn > log_sys.get_flushed_lsn()) + log_write_up_to(lsn, true); e.request.node->space->io(e.request, bpage->physical_offset(), e_size, frame, bpage); } @@ -682,7 +689,6 @@ void buf_dblwr_t::flush_buffered_writes() { if (!is_initialised() || !srv_use_doublewrite_buf) { - os_aio_wait_until_no_pending_writes(); fil_flush_file_spaces(); return; } diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 47bf72837eb..75751324d68 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -66,8 +66,11 @@ static constexpr ulint buf_flush_lsn_scan_factor = 3; /** Average redo generation rate */ static lsn_t lsn_avg_rate = 0; -/** Target oldest_modification for the page cleaner; writes are protected by -buf_pool.flush_list_mutex */ +/** Target oldest_modification for the page cleaner background flushing; +writes are protected by buf_pool.flush_list_mutex */ +static Atomic_relaxed buf_flush_async_lsn; +/** Target oldest_modification for the page cleaner furious flushing; +writes are protected by buf_pool.flush_list_mutex */ static Atomic_relaxed 
buf_flush_sync_lsn; #ifdef UNIV_PFS_THREAD @@ -131,7 +134,7 @@ inline void buf_pool_t::page_cleaner_wakeup() double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); double pct_lwm= srv_max_dirty_pages_pct_lwm; - /* if pct_lwm != 0.0 means adpative flushing is enabled. + /* if pct_lwm != 0.0, adaptive flushing is enabled. signal buf page cleaner thread - if pct_lwm <= dirty_pct then it will invoke apdative flushing flow - if pct_lwm > dirty_pct then it will invoke idle flushing flow. @@ -165,53 +168,58 @@ inline void buf_pool_t::page_cleaner_wakeup() } } -/** Insert a modified block into the flush list. -@param[in,out] block modified block -@param[in] lsn oldest modification */ -void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn) -{ - mysql_mutex_assert_not_owner(&buf_pool.mutex); - mysql_mutex_assert_owner(&log_sys.flush_order_mutex); - ut_ad(lsn); - ut_ad(!fsp_is_system_temporary(block->page.id().space())); - - mysql_mutex_lock(&buf_pool.flush_list_mutex); - block->page.set_oldest_modification(lsn); - MEM_CHECK_DEFINED(block->page.zip.data - ? 
block->page.zip.data : block->frame, - block->physical_size()); - buf_pool.stat.flush_list_bytes += block->physical_size(); - ut_ad(buf_pool.stat.flush_list_bytes <= buf_pool.curr_pool_size); - - UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page); - ut_d(buf_flush_validate_skip()); - buf_pool.page_cleaner_wakeup(); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); -} - -/** Remove a block from buf_pool.flush_list */ -static void buf_flush_remove_low(buf_page_t *bpage) +inline void buf_pool_t::delete_from_flush_list_low(buf_page_t *bpage) { ut_ad(!fsp_is_system_temporary(bpage->id().space())); - mysql_mutex_assert_owner(&buf_pool.mutex); - mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); - ut_ad(!bpage->oldest_modification()); - buf_pool.flush_hp.adjust(bpage); - UT_LIST_REMOVE(buf_pool.flush_list, bpage); - buf_pool.stat.flush_list_bytes -= bpage->physical_size(); + mysql_mutex_assert_owner(&flush_list_mutex); + flush_hp.adjust(bpage); + UT_LIST_REMOVE(flush_list, bpage); +} + +/** Insert a modified block into the flush list. +@param block modified block +@param lsn start LSN of the mini-transaction that modified the block */ +void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn) +{ + mysql_mutex_assert_not_owner(&mutex); + mysql_mutex_assert_owner(&log_sys.flush_order_mutex); + ut_ad(lsn > 2); + ut_ad(!fsp_is_system_temporary(block->page.id().space())); + + mysql_mutex_lock(&flush_list_mutex); + if (ut_d(const lsn_t old=) block->page.oldest_modification()) + { + ut_ad(old == 1); + delete_from_flush_list_low(&block->page); + } + else + stat.flush_list_bytes+= block->physical_size(); + ut_ad(stat.flush_list_bytes <= curr_pool_size); + + block->page.set_oldest_modification(lsn); + MEM_CHECK_DEFINED(block->page.zip.data + ? 
block->page.zip.data : block->frame, + block->physical_size()); + UT_LIST_ADD_FIRST(flush_list, &block->page); + ut_d(buf_flush_validate_skip()); + page_cleaner_wakeup(); + mysql_mutex_unlock(&flush_list_mutex); +} + +/** Remove a block from flush_list. +@param bpage buffer pool page +@param clear whether to invoke buf_page_t::clear_oldest_modification() */ +void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear) +{ + if (clear) + bpage->clear_oldest_modification(); + delete_from_flush_list_low(bpage); + stat.flush_list_bytes-= bpage->physical_size(); #ifdef UNIV_DEBUG buf_flush_validate_skip(); #endif /* UNIV_DEBUG */ } -/** Remove a block from the flush list of modified blocks. -@param[in,out] bpage block to be removed from the flush list */ -static void buf_flush_remove(buf_page_t *bpage) -{ - bpage->clear_oldest_modification(); - buf_flush_remove_low(bpage); -} - /** Remove all dirty pages belonging to a given tablespace when we are deleting the data file of that tablespace. The pages still remain a part of LRU and are evicted from @@ -242,7 +250,7 @@ void buf_flush_remove_pages(ulint id) else if (bpage->io_fix() != BUF_IO_NONE) deferred= true; else - buf_flush_remove(bpage); + buf_pool.delete_from_flush_list(bpage); bpage= prev; } @@ -261,31 +269,6 @@ void buf_flush_remove_pages(ulint id) mysql_mutex_unlock(&buf_pool.mutex); } -/** Try to flush all the dirty pages that belong to a given tablespace. 
-@param id tablespace identifier -@return number dirty pages that there were for this tablespace */ -ulint buf_flush_dirty_pages(ulint id) -{ - ulint n= 0; - - mysql_mutex_lock(&buf_pool.flush_list_mutex); - - for (buf_page_t *bpage= UT_LIST_GET_FIRST(buf_pool.flush_list); bpage; - bpage= UT_LIST_GET_NEXT(list, bpage)) - { - ut_d(const auto s= bpage->state()); - ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE || - s == BUF_BLOCK_REMOVE_HASH); - ut_ad(bpage->oldest_modification()); - if (id == bpage->id().space()) - n++; - } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (n) - buf_flush_lists(srv_max_io_capacity, LSN_MAX); - return n; -} - /*******************************************************************//** Relocates a buffer control block on the flush_list. Note that it is assumed that the contents of bpage have already been @@ -309,31 +292,43 @@ buf_flush_relocate_on_flush_list( mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(!fsp_is_system_temporary(bpage->id().space())); - if (!bpage->oldest_modification()) { + const lsn_t lsn = bpage->oldest_modification(); + + if (!lsn) { return; } + ut_ad(lsn == 1 || lsn > 2); + mysql_mutex_lock(&buf_pool.flush_list_mutex); - /* FIXME: At this point we have both buf_pool and flush_list - mutexes. Theoretically removal of a block from flush list is - only covered by flush_list mutex but currently we do - have buf_pool mutex in buf_flush_remove() therefore this block - is guaranteed to be in the flush list. We need to check if - this will work without the assumption of block removing code - having the buf_pool mutex. */ - ut_ad(dpage->oldest_modification()); + /* FIXME: Can we avoid holding buf_pool.mutex here? */ + ut_ad(dpage->oldest_modification() == lsn); - /* Important that we adjust the hazard pointer before removing - the bpage from the flush list. 
*/ - buf_pool.flush_hp.adjust(bpage); + if (const lsn_t o_lsn = bpage->oldest_modification()) { + ut_ad(o_lsn == lsn); - bpage->clear_oldest_modification(); + /* Important that we adjust the hazard pointer before removing + the bpage from the flush list. */ + buf_pool.flush_hp.adjust(bpage); - prev = UT_LIST_GET_PREV(list, bpage); - UT_LIST_REMOVE(buf_pool.flush_list, bpage); + bpage->clear_oldest_modification(); - if (prev) { + prev = UT_LIST_GET_PREV(list, bpage); + UT_LIST_REMOVE(buf_pool.flush_list, bpage); + } else { + /* bpage was removed from buf_pool.flush_list + since we last checked, and before we acquired + buf_pool.flush_list_mutex. */ + dpage->list.prev = nullptr; + dpage->list.next = nullptr; + goto was_clean; + } + + if (lsn == 1) { +was_clean: + dpage->clear_oldest_modification(); + } else if (prev) { ut_ad(prev->oldest_modification()); UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage); } else { @@ -354,25 +349,24 @@ void buf_page_write_complete(const IORequest &request) buf_page_t *bpage= request.bpage; ut_ad(bpage); ut_ad(bpage->in_file()); + /* bpage->io_fix() can only be changed by buf_page_write_complete() + and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */ ut_ad(bpage->io_fix() == BUF_IO_WRITE); ut_ad(!buf_dblwr.is_inside(bpage->id())); - bool dblwr; + ut_ad(request.node->space->id == bpage->id().space()); + if (bpage->status == buf_page_t::INIT_ON_FLUSH) - { bpage->status= buf_page_t::NORMAL; - dblwr= false; - } else { ut_ad(bpage->status == buf_page_t::NORMAL); - dblwr= request.node->space->use_doublewrite(); + if (request.node->space->use_doublewrite()) + { + ut_ad(request.node->space != fil_system.temp_space); + buf_dblwr.write_completed(); + } } - /* We do not need protect io_fix here by mutex to read it because - this and buf_page_read_complete() are the only functions where we can - change the value from BUF_IO_READ or BUF_IO_WRITE to some other - value, and our code ensures that this is the only thread that handles 
- the i/o for this block. */ if (bpage->slot) { bpage->slot->release(); @@ -383,42 +377,35 @@ void buf_page_write_complete(const IORequest &request) buf_page_monitor(bpage, BUF_IO_WRITE); DBUG_PRINT("ib_buf", ("write page %u:%u", bpage->id().space(), bpage->id().page_no())); - ut_ad(request.is_LRU() ? buf_pool.n_flush_LRU : buf_pool.n_flush_list); const bool temp= fsp_is_system_temporary(bpage->id().space()); mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written++; + /* While we do not need any mutex for clearing oldest_modification + here, we hope that it will be in the same cache line with io_fix, + whose changes must be protected by buf_pool.mutex. */ + bpage->clear_oldest_modification(temp); + ut_ad(bpage->io_fix() == BUF_IO_WRITE); bpage->set_io_fix(BUF_IO_NONE); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - ut_ad(!temp || bpage->oldest_modification() == 1); - bpage->clear_oldest_modification(); - - if (!temp) - buf_flush_remove_low(bpage); - else - ut_ad(request.is_LRU()); - - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - - if (dblwr) - { - ut_ad(!fsp_is_system_temporary(bpage->id().space())); - buf_dblwr.write_completed(); - } if (bpage->state() == BUF_BLOCK_FILE_PAGE) reinterpret_cast(bpage)->lock.u_unlock(true); - buf_pool.stat.n_pages_written++; + if (request.is_LRU()) + buf_LRU_free_page(bpage, true); + else + ut_ad(!temp); if (request.is_LRU()) { - buf_LRU_free_page(bpage, true); - if (!--buf_pool.n_flush_LRU) + ut_ad(buf_pool.n_flush_LRU_); + if (!--buf_pool.n_flush_LRU_) pthread_cond_broadcast(&buf_pool.done_flush_LRU); } else { - if (!--buf_pool.n_flush_list) + ut_ad(buf_pool.n_flush_list_); + if (!--buf_pool.n_flush_list_) pthread_cond_broadcast(&buf_pool.done_flush_list); } @@ -773,35 +760,33 @@ not_compressed: return d; } -/** The following function deals with freed page during flushing. 
- i) Writing zeros to the file asynchronously if scrubbing is enabled - ii) Punch the hole to the file synchoronously if page_compressed is - enabled for the tablespace -This function also resets the IO_FIX to IO_NONE and making the -page status as NORMAL. It initiates the write to the file only after -releasing the page from flush list and its associated mutex. -@param[in,out] bpage freed buffer page */ -static void buf_release_freed_page(buf_page_t *bpage) +/** Free a page whose underlying file page has been freed. */ +inline void buf_pool_t::release_freed_page(buf_page_t *bpage) { ut_ad(bpage->in_file()); const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE; - mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&mutex); bpage->set_io_fix(BUF_IO_NONE); bpage->status= buf_page_t::NORMAL; - const bool temp= fsp_is_system_temporary(bpage->id().space()); - ut_ad(!temp || uncompressed); - ut_ad(!temp || bpage->oldest_modification() == 1); - mysql_mutex_lock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&flush_list_mutex); + if (fsp_is_system_temporary(bpage->id().space())) + { + ut_ad(uncompressed); + ut_ad(bpage->oldest_modification() == 2); + } + else + { + ut_ad(bpage->oldest_modification() > 2); + delete_from_flush_list(bpage, false); + } bpage->clear_oldest_modification(); - if (!temp) - buf_flush_remove_low(bpage); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_unlock(&flush_list_mutex); if (uncompressed) reinterpret_cast(bpage)->lock.u_unlock(true); buf_LRU_free_page(bpage, true); - mysql_mutex_unlock(&buf_pool.mutex); + mysql_mutex_unlock(&mutex); } /** Write a flushable page from buf_pool to a file. 
@@ -832,8 +817,22 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) } bpage->set_io_fix(BUF_IO_WRITE); - buf_flush_page_count++; - mysql_mutex_unlock(&buf_pool.mutex); + /* Because bpage->status can only be changed while buf_block_t + exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages + without first allocating the uncompressed page frame. Such + allocation cannot be completed due to our io_fix. So, bpage->status + is protected even if !rw_lock. */ + const auto status= bpage->status; + + if (status != buf_page_t::FREED) + { + if (lru) + buf_pool.n_flush_LRU_++; + else + buf_pool.n_flush_list_++; + buf_flush_page_count++; + } + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); /* We are holding rw_lock = buf_block_t::lock in SX mode except if @@ -852,38 +851,14 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) ut_ad(bpage->state() == (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE)); ut_ad(ULINT_UNDEFINED > - (lru ? buf_pool.n_flush_LRU : buf_pool.n_flush_list)); - - /* Because bpage->status can only be changed while buf_block_t - exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages - without first allocating the uncompressed page frame. Such - allocation cannot be completed due to our io_fix. So, bpage->status - is protected even if !rw_lock. */ - const auto status= bpage->status; + (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_)); + mysql_mutex_unlock(&buf_pool.mutex); buf_block_t *block= reinterpret_cast(bpage); page_t *frame= bpage->zip.data; - if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) - { - const lsn_t lsn= mach_read_from_8(my_assume_aligned<8> - (FIL_PAGE_LSN + - (frame ? 
frame : block->frame))); - ut_ad(lsn); - ut_ad(lsn >= bpage->oldest_modification()); - ut_ad(!srv_read_only_mode); - if (UNIV_UNLIKELY(lsn > log_sys.get_flushed_lsn())) - { - if (rw_lock) - rw_lock->u_unlock(true); - mysql_mutex_lock(&buf_pool.mutex); - bpage->set_io_fix(BUF_IO_NONE); - return false; - } - } - if (status == buf_page_t::FREED) - buf_release_freed_page(&block->page); + buf_pool.release_freed_page(&block->page); else { space->reacquire(); @@ -919,8 +894,8 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) /* innodb_checksum_algorithm=full_crc32 is not implemented for ROW_FORMAT=COMPRESSED pages. */ ut_ad(!frame); - page= buf_page_encrypt(space, bpage, page, &size); - buf_flush_init_for_writing(block, page, nullptr, true); + page= buf_page_encrypt(space, bpage, page, &size); + buf_flush_init_for_writing(block, page, nullptr, true); } else { @@ -938,13 +913,21 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) ut_ad(status == bpage->status); - if (lru) - buf_pool.n_flush_LRU++; - else - buf_pool.n_flush_list++; if (status != buf_page_t::NORMAL || !space->use_doublewrite()) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + (frame ? 
frame + : block->frame))); + ut_ad(lsn); + ut_ad(lsn >= bpage->oldest_modification()); + if (lsn > log_sys.get_flushed_lsn()) + log_write_up_to(lsn, true); + } space->io(IORequest(type, bpage), bpage->physical_offset(), size, frame, bpage); + } else buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size); } @@ -971,8 +954,10 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) /* We avoid flushing 'non-old' blocks in an LRU flush, because the flushed blocks are soon freed */ + if (lru && !bpage->is_old()) + return false; - return (!lru || bpage->is_old()) && bpage->ready_for_flush(); + return bpage->oldest_modification() > 1 && bpage->ready_for_flush(); } /** Check which neighbors of a page can be flushed from the buf_pool. @@ -1132,6 +1117,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, if (!lru || id == page_id || bpage->is_old()) { if (!buf_pool.watch_is_sentinel(*bpage) && + bpage->oldest_modification() > 1 && bpage->ready_for_flush() && buf_flush_page(bpage, lru, space)) { ++count; @@ -1244,7 +1230,7 @@ static void buf_flush_discard_page(buf_page_t *bpage) bpage->status= buf_page_t::NORMAL; mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_flush_remove(bpage); + buf_pool.delete_from_flush_list(bpage); mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (rw_lock) @@ -1275,20 +1261,20 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && n->flushed + n->evicted < max && UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && - UT_LIST_GET_LEN(buf_pool.free) < free_limit; - ++scanned, bpage= buf_pool.lru_hp.get()) + UT_LIST_GET_LEN(buf_pool.free) < free_limit; ++scanned) { buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); + const lsn_t oldest_modification= bpage->oldest_modification(); buf_pool.lru_hp.set(prev); - if (bpage->ready_for_replace()) + if (oldest_modification <= 1 && bpage->can_relocate()) { /* block is ready for 
eviction i.e., it is clean and is not IO-fixed or buffer fixed. */ if (buf_LRU_free_page(bpage, true)) ++n->evicted; } - else if (bpage->ready_for_flush()) + else if (oldest_modification > 1 && bpage->ready_for_flush()) { /* Block is ready for flush. Dispatch an IO request. The IO helper thread will put it on free list in IO completion routine. */ @@ -1331,6 +1317,7 @@ reacquire_mutex: else /* Can't evict or dispatch this block. Go to previous. */ ut_ad(buf_pool.lru_hp.is_hp(prev)); + bpage= buf_pool.lru_hp.get(); } buf_pool.lru_hp.set(nullptr); @@ -1393,68 +1380,80 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) mysql_mutex_lock(&buf_pool.flush_list_mutex); ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); - /* In order not to degenerate this scan to O(n*n) we attempt to - preserve pointer of previous block in the flush list. To do so we - declare it a hazard pointer. Any thread working on the flush list - must check the hazard pointer and if it is removing the same block - then it must reset it. */ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); - bpage && len && count < max_n; - bpage= buf_pool.flush_hp.get(), ++scanned, len--) + bpage && len && count < max_n; ++scanned, len--) { const lsn_t oldest_modification= bpage->oldest_modification(); if (oldest_modification >= lsn) break; - ut_ad(oldest_modification); + ut_ad(bpage->in_file()); buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + + if (oldest_modification == 1) + { + buf_pool.delete_from_flush_list(bpage); + skip: + bpage= prev; + continue; + } + + ut_ad(oldest_modification > 2); + ut_ad(bpage->in_file()); + + if (!bpage->ready_for_flush()) + goto skip; + + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: A concurrent execution of buf_flush_list_space() may + terminate this scan prematurely. 
The buf_pool.n_flush_list() + should prevent multiple threads from executing + buf_do_flush_list_batch() concurrently, + but buf_flush_list_space() is ignoring that. */ buf_pool.flush_hp.set(prev); mysql_mutex_unlock(&buf_pool.flush_list_mutex); - ut_ad(bpage->in_file()); - const bool flushed= bpage->ready_for_flush(); - - if (flushed) + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) { - const page_id_t page_id(bpage->id()); - const uint32_t space_id= page_id.space(); - if (!space || space->id != space_id) + if (last_space_id != space_id) { - if (last_space_id != space_id) - { - if (space) - space->release(); - space= buf_flush_space(space_id); - last_space_id= space_id; - } - else - ut_ad(!space); - } - else if (space->is_stopping()) - { - space->release(); - space= nullptr; + if (space) + space->release(); + space= buf_flush_space(space_id); + last_space_id= space_id; } + else + ut_ad(!space); + } + else if (space->is_stopping()) + { + space->release(); + space= nullptr; + } - if (!space) - buf_flush_discard_page(bpage); - else if (neighbors && space->is_rotational()) - { - mysql_mutex_unlock(&buf_pool.mutex); - count+= buf_flush_try_neighbors(space, page_id, neighbors == 1, - false, count, max_n); -reacquire_mutex: - mysql_mutex_lock(&buf_pool.mutex); - } - else if (buf_flush_page(bpage, false, space)) - { - ++count; - goto reacquire_mutex; - } + if (!space) + buf_flush_discard_page(bpage); + else if (neighbors && space->is_rotational()) + { + mysql_mutex_unlock(&buf_pool.mutex); + count+= buf_flush_try_neighbors(space, page_id, neighbors == 1, + false, count, max_n); + reacquire_mutex: + mysql_mutex_lock(&buf_pool.mutex); + } + else if (buf_flush_page(bpage, false, space)) + { + ++count; + goto reacquire_mutex; } mysql_mutex_lock(&buf_pool.flush_list_mutex); - ut_ad(flushed || buf_pool.flush_hp.is_hp(prev)); + bpage= buf_pool.flush_hp.get(); } buf_pool.flush_hp.set(nullptr); @@ -1481,7 
+1480,7 @@ reacquire_mutex: @param lru true=buf_pool.LRU; false=buf_pool.flush_list */ void buf_flush_wait_batch_end(bool lru) { - const auto &n_flush= lru ? buf_pool.n_flush_LRU : buf_pool.n_flush_list; + const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_; if (n_flush) { @@ -1497,85 +1496,193 @@ void buf_flush_wait_batch_end(bool lru) } } -/** Whether a background log flush is pending */ -static std::atomic_flag log_flush_pending; - -/** Advance log_sys.get_flushed_lsn() */ -static void log_flush(void *) -{ - /* Between batches, we try to prevent I/O stalls by these calls. - This should not be needed for correctness. */ - os_aio_wait_until_no_pending_writes(); - fil_flush_file_spaces(); - - /* Guarantee progress for buf_flush_lists(). */ - log_buffer_flush_to_disk(true); - log_flush_pending.clear(); -} - -static tpool::waitable_task log_flush_task(log_flush, nullptr, nullptr); - /** Write out dirty blocks from buf_pool.flush_list. @param max_n wished maximum mumber of blocks flushed -@param lsn buf_pool.get_oldest_modification(LSN_MAX) target (0=LRU flush) +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target @return the number of processed pages -@retval 0 if a batch of the same type (lsn==0 or lsn!=0) is already running */ -ulint buf_flush_lists(ulint max_n, lsn_t lsn) +@retval 0 if a buf_pool.flush_list batch is already running */ +ulint buf_flush_list(ulint max_n, lsn_t lsn) { - auto &n_flush= lsn ? 
buf_pool.n_flush_list : buf_pool.n_flush_LRU; + ut_ad(lsn); - if (n_flush) + if (buf_pool.n_flush_list()) return 0; - lsn_t flushed_lsn= log_sys.get_flushed_lsn(); - if (log_sys.get_lsn() > flushed_lsn) - { - log_flush_task.wait(); - flushed_lsn= log_sys.get_flushed_lsn(); - if (log_sys.get_lsn() > flushed_lsn && - !log_flush_pending.test_and_set()) - srv_thread_pool->submit_task(&log_flush_task); -#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG - if (UNIV_UNLIKELY(ibuf_debug)) - log_buffer_flush_to_disk(true); -#endif - } - - auto cond= lsn ? &buf_pool.done_flush_list : &buf_pool.done_flush_LRU; - mysql_mutex_lock(&buf_pool.mutex); - const bool running= n_flush != 0; + const bool running= buf_pool.n_flush_list_ != 0; /* FIXME: we are performing a dirty read of buf_pool.flush_list.count while not holding buf_pool.flush_list_mutex */ - if (running || (lsn && !UT_LIST_GET_LEN(buf_pool.flush_list))) + if (running || !UT_LIST_GET_LEN(buf_pool.flush_list)) { if (!running) - pthread_cond_broadcast(cond); + pthread_cond_broadcast(&buf_pool.done_flush_list); mysql_mutex_unlock(&buf_pool.mutex); return 0; } - n_flush++; - ulint n_flushed= lsn - ? buf_do_flush_list_batch(max_n, lsn) - : buf_do_LRU_batch(max_n); - - const auto n_flushing= --n_flush; + buf_pool.n_flush_list_++; + const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); + const ulint n_flushing= --buf_pool.n_flush_list_; buf_pool.try_LRU_scan= true; mysql_mutex_unlock(&buf_pool.mutex); if (!n_flushing) - pthread_cond_broadcast(cond); + pthread_cond_broadcast(&buf_pool.done_flush_list); buf_dblwr.flush_buffered_writes(); - DBUG_PRINT("ib_buf", ("%s completed, " ULINTPF " pages", - lsn ? "flush_list" : "LRU flush", n_flushed)); + DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed)); return n_flushed; } +/** Try to flush all the dirty pages that belong to a given tablespace. 
+@param space tablespace +@param n_flushed number of pages written +@return whether the flush for some pages might not have been initiated */ +bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) +{ + /* Entries of buf_pool.flush_list are matched against this tablespace identifier. */ const auto space_id= space->id; + ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND); + + bool may_have_skipped= false; + /* Upper limit on the number of successful buf_flush_page() calls in this invocation; once it reaches 0 the scan stops and true is returned. */ ulint max_n_flush= srv_io_capacity; + + mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + /* When space->acquire() fails, matching pages are handed to buf_flush_discard_page() instead of buf_flush_page(). */ bool acquired= space->acquire(); + buf_flush_freed_pages(space); + + /* Walk buf_pool.flush_list from its last element towards its first. */ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; ) + { + ut_d(const auto s= bpage->state()); + ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE || + s == BUF_BLOCK_REMOVE_HASH); + ut_ad(bpage->oldest_modification()); + ut_ad(bpage->in_file()); + + /* Fetch the predecessor first, because bpage may be removed from the list below. */ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + /* Pages of other tablespaces are skipped by the empty statement. */ if (bpage->id().space() != space_id); + /* oldest_modification()==1: only remove the block from buf_pool.flush_list; nothing is written. */ else if (bpage->oldest_modification() == 1) + buf_pool.delete_from_flush_list(bpage); + else if (!bpage->ready_for_flush()) + may_have_skipped= true; + else + { + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: Multiple executions of buf_flush_list_space() may be + interleaved, and also buf_do_flush_list_batch() may be running + concurrently. This may terminate our iteration prematurely, + leading us to return may_have_skipped=true.
*/ + buf_pool.flush_hp.set(prev); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (!acquired) + { + was_freed: + buf_flush_discard_page(bpage); + } + else + { + /* The tablespace is being stopped: give up our acquire() and discard this page; later matching pages take the !acquired branch. */ if (space->is_stopping()) + { + space->release(); + acquired= false; + goto was_freed; + } + if (!buf_flush_page(bpage, false, space)) + { + may_have_skipped= true; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + goto next_after_skip; + } + if (n_flushed) + ++*n_flushed; + if (!--max_n_flush) + { + /* Re-acquire both mutexes, because the code after the loop releases both. */ mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + may_have_skipped= true; + break; + } + /* NOTE(review): presumably buf_flush_page() released buf_pool.mutex on success, which is why it is re-acquired here; confirm against buf_flush_page(). */ mysql_mutex_lock(&buf_pool.mutex); + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + /* flush_hp no longer designates prev, so it was adjusted while flush_list_mutex was released. */ if (!buf_pool.flush_hp.is_hp(prev)) + may_have_skipped= true; + next_after_skip: + bpage= buf_pool.flush_hp.get(); + continue; + } + + bpage= prev; + } + + /* Note: this loop may have been executed concurrently with + buf_do_flush_list_batch() as well as other threads executing + buf_flush_list_space(). We should always return true from + buf_flush_list_space() if that should be the case; in + buf_do_flush_list_batch() we will simply perform less work. */ + + buf_pool.flush_hp.set(nullptr); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + buf_pool.try_LRU_scan= true; + + mysql_mutex_unlock(&buf_pool.mutex); + + if (acquired) + space->release(); + + /* FIL_TYPE_IMPORT tablespaces wait for pending asynchronous writes; all others call buf_dblwr.flush_buffered_writes(). */ if (space->purpose == FIL_TYPE_IMPORT) + os_aio_wait_until_no_pending_writes(); + else + buf_dblwr.flush_buffered_writes(); + + return may_have_skipped; +} + +/** Write out dirty blocks from buf_pool.LRU.
+@param max_n wished maximum number of blocks flushed +@return the number of processed pages +@retval 0 if a buf_pool.LRU batch is already running */ +ulint buf_flush_LRU(ulint max_n) +{ + /* Check without holding buf_pool.mutex; the counter is checked again below under the mutex. */ if (buf_pool.n_flush_LRU()) + return 0; + + log_buffer_flush_to_disk(true); + + mysql_mutex_lock(&buf_pool.mutex); + if (buf_pool.n_flush_LRU_) + { + mysql_mutex_unlock(&buf_pool.mutex); + return 0; + } + /* Register this call as the running buf_pool.LRU batch. */ buf_pool.n_flush_LRU_++; + + ulint n_flushed= buf_do_LRU_batch(max_n); + + const ulint n_flushing= --buf_pool.n_flush_LRU_; + + buf_pool.try_LRU_scan= true; + + mysql_mutex_unlock(&buf_pool.mutex); + + /* Wake up waiters on done_flush_LRU once no batch remains registered. */ if (!n_flushing) + pthread_cond_broadcast(&buf_pool.done_flush_LRU); + + buf_dblwr.flush_buffered_writes(); + + DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed)); + return n_flushed; +} /** Initiate a log checkpoint, discarding the start of the log. @param oldest_lsn the checkpoint LSN @@ -1709,7 +1816,7 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) do { mysql_mutex_unlock(&buf_pool.flush_list_mutex); - ulint n_pages= buf_flush_lists(srv_max_io_capacity, sync_lsn); + ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); buf_flush_wait_batch_end_acquiring_mutex(false); if (n_pages) { @@ -1749,12 +1856,21 @@ try_checkpoint: mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn)) + { + /* If the buffer pool was clean, no log write was guaranteed + to happen until now. There could be an outstanding FILE_CHECKPOINT + record from a previous fil_names_clear() call, which we must + write out before we can advance the checkpoint. */ + if (sync_lsn > log_sys.get_flushed_lsn()) + log_write_up_to(sync_lsn, true); log_checkpoint(); + } } -/** If innodb_flush_sync=ON, initiate a furious flush. -@param lsn buf_pool.get_oldest_modification(LSN_MAX) target */ -void buf_flush_ahead(lsn_t lsn) +/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@param furious true=furious flushing, false=limit to innodb_io_capacity */ +ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) { mysql_mutex_assert_not_owner(&log_sys.mutex); ut_ad(!srv_read_only_mode); @@ -1762,14 +1878,15 @@ void buf_flush_ahead(lsn_t lsn) if (recv_recovery_is_on()) recv_sys.apply(true); - if (buf_flush_sync_lsn < lsn) + Atomic_relaxed &limit= furious + ? buf_flush_sync_lsn : buf_flush_async_lsn; + + if (limit < lsn) { mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (buf_flush_sync_lsn < lsn) - { - buf_flush_sync_lsn= lsn; - pthread_cond_signal(&buf_pool.do_flush_list); - } + if (limit < lsn) + limit= lsn; + pthread_cond_signal(&buf_pool.do_flush_list); mysql_mutex_unlock(&buf_pool.flush_list_mutex); } } @@ -1777,7 +1894,7 @@ void buf_flush_ahead(lsn_t lsn) /** Wait for pending flushes to complete. */ void buf_flush_wait_batch_end_acquiring_mutex(bool lru) { - if (lru ? buf_pool.n_flush_LRU : buf_pool.n_flush_list) + if (lru ? 
buf_pool.n_flush_LRU() : buf_pool.n_flush_list()) { mysql_mutex_lock(&buf_pool.mutex); buf_flush_wait_batch_end(lru); @@ -1796,7 +1913,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) { mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (ulint n_flushed= buf_flush_lists(srv_max_io_capacity, lsn)) + if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn)) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, MONITOR_FLUSH_SYNC_COUNT, @@ -1844,6 +1961,8 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) if (measure >= target) buf_flush_sync_lsn= 0; + else if (measure >= buf_flush_async_lsn) + buf_flush_async_lsn= 0; /* wake up buf_flush_wait_flushed() */ pthread_cond_broadcast(&buf_pool.done_flush_list); @@ -1863,7 +1982,7 @@ static bool af_needed_for_redo(lsn_t oldest_lsn) { lsn_t age= (log_sys.get_lsn() - oldest_lsn); lsn_t af_lwm= static_cast(srv_adaptive_flushing_lwm * - static_cast(log_sys.log_capacity) / 100); + static_cast(log_sys.log_capacity) / 100); /* if age > af_lwm adaptive flushing is recommended */ return (age > af_lwm); @@ -2072,7 +2191,6 @@ furious_flush: else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) break; - /* If buf pager cleaner is idle and there is no work (either dirty pages are all flushed or adaptive flushing is not enabled) then opt for non-timed wait */ @@ -2086,6 +2204,7 @@ furious_flush: set_timespec(abstime, 1); + lsn_t soft_lsn_limit= buf_flush_async_lsn; lsn_limit= buf_flush_sync_lsn; if (UNIV_UNLIKELY(lsn_limit != 0)) @@ -2096,9 +2215,9 @@ furious_flush: else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) break; - const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0); - if (!dirty_blocks) + if (!oldest_lsn) { if (UNIV_UNLIKELY(lsn_limit != 0)) { @@ -2107,23 +2226,22 @@ furious_flush: pthread_cond_broadcast(&buf_pool.done_flush_list); } unemployed: + buf_flush_async_lsn= 0; 
buf_pool.page_cleaner_set_idle(true); continue; } + const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); + ut_ad(dirty_blocks); /* We perform dirty reads of the LRU+free list lengths here. Division by zero is not possible, because buf_pool.flush_list is guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ const double dirty_pct= double(dirty_blocks) * 100.0 / double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); - const lsn_t oldest_lsn= buf_pool.get_oldest_modified() - ->oldest_modification(); - ut_ad(oldest_lsn); - bool idle_flush= false; - if (lsn_limit); + if (lsn_limit || soft_lsn_limit); else if (af_needed_for_redo(oldest_lsn)); else if (srv_max_dirty_pages_pct_lwm != 0.0) { @@ -2148,23 +2266,28 @@ unemployed: goto unemployed; if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit) - buf_flush_sync_lsn= 0; + lsn_limit= buf_flush_sync_lsn= 0; + if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit) + soft_lsn_limit= buf_flush_async_lsn= 0; buf_pool.page_cleaner_set_idle(false); mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (!lsn_limit) + lsn_limit= soft_lsn_limit; + ulint n_flushed; if (UNIV_UNLIKELY(lsn_limit != 0)) { - n_flushed= buf_flush_lists(srv_max_io_capacity, lsn_limit); + n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit); /* wake up buf_flush_wait_flushed() */ pthread_cond_broadcast(&buf_pool.done_flush_list); goto try_checkpoint; } else if (idle_flush || !srv_adaptive_flushing) { - n_flushed= buf_flush_lists(srv_io_capacity, LSN_MAX); + n_flushed= buf_flush_list(srv_io_capacity); try_checkpoint: if (n_flushed) { @@ -2191,7 +2314,7 @@ do_checkpoint: { page_cleaner.flush_pass++; const ulint tm= ut_time_ms(); - last_pages= n_flushed= buf_flush_lists(n, LSN_MAX); + last_pages= n_flushed= buf_flush_list(n); page_cleaner.flush_time+= ut_time_ms() - tm; if (n_flushed) @@ -2203,7 +2326,7 @@ do_checkpoint: goto do_checkpoint; } } - else + else if (buf_flush_async_lsn <= 
oldest_lsn) { mysql_mutex_lock(&buf_pool.flush_list_mutex); goto unemployed; @@ -2235,8 +2358,6 @@ next: buf_flush_wait_batch_end_acquiring_mutex(false); } - log_flush_task.wait(); - mysql_mutex_lock(&buf_pool.flush_list_mutex); lsn_limit= buf_flush_sync_lsn; if (UNIV_UNLIKELY(lsn_limit != 0)) @@ -2259,6 +2380,7 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init() ut_ad(srv_operation == SRV_OPERATION_NORMAL || srv_operation == SRV_OPERATION_RESTORE || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + buf_flush_async_lsn= 0; buf_flush_sync_lsn= 0; buf_page_cleaner_is_active= true; std::thread(buf_flush_page_cleaner).detach(); @@ -2282,19 +2404,19 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool() service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, "Waiting to flush the buffer pool"); - while (buf_pool.n_flush_list || buf_flush_list_length()) + while (buf_pool.n_flush_list() || buf_flush_list_length()) { - buf_flush_lists(srv_max_io_capacity, LSN_MAX); + buf_flush_list(srv_max_io_capacity); timespec abstime; - if (buf_pool.n_flush_list) + if (buf_pool.n_flush_list()) { service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, "Waiting to flush " ULINTPF " pages", buf_flush_list_length()); set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); mysql_mutex_lock(&buf_pool.mutex); - while (buf_pool.n_flush_list) + while (buf_pool.n_flush_list_) my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, &abstime); mysql_mutex_unlock(&buf_pool.mutex); @@ -2302,7 +2424,6 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool() } ut_ad(!buf_pool.any_io_pending()); - log_flush_task.wait(); } /** Synchronously flush dirty blocks. 
@@ -2311,7 +2432,7 @@ void buf_flush_sync() { for (;;) { - const ulint n_flushed= buf_flush_lists(srv_max_io_capacity, LSN_MAX); + const ulint n_flushed= buf_flush_list(srv_max_io_capacity); buf_flush_wait_batch_end_acquiring_mutex(false); if (!n_flushed && !buf_flush_list_length()) return; @@ -2350,10 +2471,10 @@ static void buf_flush_validate_low() ut_d(const auto s= bpage->state()); ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE || s == BUF_BLOCK_REMOVE_HASH); - ut_ad(om > 0); + ut_ad(om == 1 || om > 2); bpage = UT_LIST_GET_NEXT(list, bpage); - ut_ad(!bpage || recv_recovery_is_on() + ut_ad(om == 1 || !bpage || recv_recovery_is_on() || om >= bpage->oldest_modification()); } } diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 02aaa76de8b..cb25973855e 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -108,7 +108,7 @@ uint buf_LRU_old_threshold_ms; /** Remove bpage from buf_pool.LRU and buf_pool.page_hash. -If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(), +If bpage->state() == BUF_BLOCK_ZIP_PAGE && bpage->oldest_modification() <= 1, the object will be freed. @param bpage buffer block @@ -242,8 +242,8 @@ static bool buf_LRU_free_from_common_LRU_list(ulint limit) buf_pool.lru_scan_itr.set(prev); const auto accessed = bpage->is_accessed(); - if (!bpage->oldest_modification() - && buf_LRU_free_page(bpage, true)) { + + if (buf_LRU_free_page(bpage, true)) { if (!accessed) { /* Keep track of pages that are evicted without ever being accessed. This gives us a measure of @@ -449,8 +449,8 @@ retry: #ifndef DBUG_OFF not_found: #endif + buf_flush_wait_batch_end(true); mysql_mutex_unlock(&buf_pool.mutex); - buf_flush_wait_batch_end_acquiring_mutex(true); if (n_iterations > 20 && !buf_lru_free_blocks_error_printed && srv_buf_pool_old_size == srv_buf_pool_size) { @@ -487,7 +487,7 @@ not_found: involved (particularly in case of ROW_FORMAT=COMPRESSED pages). 
We can do that in a separate patch sometime in future. */ - if (!buf_flush_lists(innodb_lru_flush_size, 0)) { + if (!buf_flush_LRU(innodb_lru_flush_size)) { MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); ++flush_failures; } @@ -801,20 +801,33 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) const ulint fold = id.fold(); page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold); hash_lock->write_lock(); + lsn_t oldest_modification = bpage->oldest_modification(); if (UNIV_UNLIKELY(!bpage->can_relocate())) { /* Do not free buffer fixed and I/O-fixed blocks. */ goto func_exit; } + if (oldest_modification == 1) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + oldest_modification = bpage->oldest_modification(); + if (oldest_modification) { + ut_ad(oldest_modification == 1); + buf_pool.delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!bpage->oldest_modification()); + oldest_modification = 0; + } + if (zip || !bpage->zip.data) { /* This would completely free the block. */ /* Do not completely free dirty blocks. */ - if (bpage->oldest_modification()) { + if (oldest_modification) { goto func_exit; } - } else if (bpage->oldest_modification() + } else if (oldest_modification && bpage->state() != BUF_BLOCK_FILE_PAGE) { func_exit: hash_lock->write_unlock(); diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 98bcc764b52..9c4eb741bad 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -40,11 +40,6 @@ Created 1/8/1996 Heikki Tuuri #include "sql_table.h" #include -#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG -/** Flag to control insert buffer debugging. 
*/ -extern uint ibuf_debug; -#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ - #include "btr0btr.h" #include "btr0cur.h" #include "btr0sea.h" diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc index 44268954d57..a0b4b45287f 100644 --- a/storage/innobase/fil/fil0crypt.cc +++ b/storage/innobase/fil/fil0crypt.cc @@ -1079,7 +1079,7 @@ func_exit: mtr.commit(); /* 4 - sync tablespace before publishing crypt data */ - while (buf_flush_dirty_pages(space->id)); + while (buf_flush_list_space(space)); /* 5 - publish crypt data */ mysql_mutex_lock(&fil_crypt_threads_mutex); @@ -1820,7 +1820,7 @@ fil_crypt_rotate_page( if (block->page.status == buf_page_t::FREED) { /* Do not modify freed pages to avoid an assertion failure on recovery.*/ - } else if (block->page.oldest_modification()) { + } else if (block->page.oldest_modification() > 1) { /* Do not unnecessarily touch pages that are already dirty. */ } else if (space->is_stopping()) { @@ -1970,14 +1970,7 @@ fil_crypt_flush_space( if (end_lsn > 0 && !space->is_stopping()) { ulint sum_pages = 0; const ulonglong start = my_interval_timer(); - do { - ulint n_dirty= buf_flush_dirty_pages(state->space->id); - if (!n_dirty) { - break; - } - sum_pages += n_dirty; - } while (!space->is_stopping()); - + while (buf_flush_list_space(space, &sum_pages)); if (sum_pages) { const ulonglong end = my_interval_timer(); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 3038750f451..6997978ea0a 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1642,9 +1642,7 @@ void fil_close_tablespace(ulint id) can no longer read more pages of this tablespace to buf_pool. Thus we can clean the tablespace out of buf_pool completely and permanently. */ - while (buf_flush_dirty_pages(id)); - /* Ensure that all asynchronous IO is completed. 
*/ - os_aio_wait_until_no_pending_writes(); + while (buf_flush_list_space(space)); ut_ad(space->is_stopping()); /* If it is a delete then also delete any generated files, otherwise diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 019859687b8..ab94684e982 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -987,9 +987,9 @@ static SHOW_VAR innodb_status_variables[]= { SHOW_SIZE_T}, {"os_log_written", &export_vars.innodb_os_log_written, SHOW_SIZE_T}, {"page_size", &srv_page_size, SHOW_ULONG}, - {"pages_created", &export_vars.innodb_pages_created, SHOW_SIZE_T}, - {"pages_read", &export_vars.innodb_pages_read, SHOW_SIZE_T}, - {"pages_written", &export_vars.innodb_pages_written, SHOW_SIZE_T}, + {"pages_created", &buf_pool.stat.n_pages_created, SHOW_SIZE_T}, + {"pages_read", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, + {"pages_written", &buf_pool.stat.n_pages_written, SHOW_SIZE_T}, {"row_lock_current_waits", &export_vars.innodb_row_lock_current_waits, SHOW_SIZE_T}, {"row_lock_time", &export_vars.innodb_row_lock_time, SHOW_LONGLONG}, diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index c42977b5eda..e0a546c11b7 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -672,6 +672,16 @@ private: /** Count of how manyfold this block is currently bufferfixed. */ Atomic_counter buf_fix_count_; + /** log sequence number of the START of the log entry written of the + oldest modification to this block which has not yet been written + to the data file; + + 0 if no modifications are pending; + 1 if no modifications are pending, but the block is in buf_pool.flush_list; + 2 if modifications are pending, but the block is not in buf_pool.flush_list + (because id().space() is the temporary tablespace). 
*/ + Atomic_counter oldest_modification_; + /** type of pending I/O operation; protected by buf_pool.mutex if in_LRU_list */ Atomic_relaxed io_fix_; @@ -721,12 +731,6 @@ public: or if state() is BUF_BLOCK_MEMORY or BUF_BLOCK_REMOVE_HASH. */ UT_LIST_NODE_T(buf_page_t) list; -private: - /** log sequence number of the START of the log entry written of the - oldest modification to this block which has not yet been written - to the data file; 0 if no modifications are pending. */ - Atomic_counter oldest_modification_; -public: /** @name LRU replacement algorithm fields. Protected by buf_pool.mutex. */ /* @{ */ @@ -841,12 +845,19 @@ public: inline void set_io_fix(buf_io_fix io_fix); inline void set_corrupt_id(); - /** @return the oldest modification */ + /** @return the log sequence number of the oldest pending modification + @retval 0 if the block is not in buf_pool.flush_list + @retval 1 if the block is in buf_pool.flush_list but not modified + @retval 2 if the block belongs to the temporary tablespace and + has unwritten changes */ lsn_t oldest_modification() const { return oldest_modification_; } /** Set oldest_modification when adding to buf_pool.flush_list */ inline void set_oldest_modification(lsn_t lsn); /** Clear oldest_modification when removing from buf_pool.flush_list */ inline void clear_oldest_modification(); + /** Note that a block is no longer dirty, while not removing + it from buf_pool.flush_list */ + inline void clear_oldest_modification(bool temporary); /** Notify that a page in a temporary tablespace has been modified. */ void set_temp_modified() @@ -854,7 +865,7 @@ public: ut_ad(fsp_is_system_temporary(id().space())); ut_ad(state() == BUF_BLOCK_FILE_PAGE); ut_ad(!oldest_modification()); - oldest_modification_= 1; + oldest_modification_= 2; } /** Prepare to release a file page to buf_pool.free. 
*/ @@ -1462,23 +1473,24 @@ public: inline buf_block_t *block_from_ahi(const byte *ptr) const; #endif /* BTR_CUR_HASH_ADAPT */ - /** @return the block that was made dirty the longest time ago */ - const buf_page_t *get_oldest_modified() const - { - mysql_mutex_assert_owner(&flush_list_mutex); - const buf_page_t *bpage= UT_LIST_GET_LAST(flush_list); - ut_ad(!bpage || !fsp_is_system_temporary(bpage->id().space())); - ut_ad(!bpage || bpage->oldest_modification()); - return bpage; - } - /** @return the smallest oldest_modification lsn for any page @retval empty_lsn if all modified persistent pages have been flushed */ - lsn_t get_oldest_modification(lsn_t empty_lsn) const + lsn_t get_oldest_modification(lsn_t empty_lsn) { - const buf_page_t *bpage= get_oldest_modified(); - return bpage ? bpage->oldest_modification() : empty_lsn; + mysql_mutex_assert_owner(&flush_list_mutex); + while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list)) + { + ut_ad(!fsp_is_system_temporary(bpage->id().space())); + lsn_t lsn= bpage->oldest_modification(); + if (lsn != 1) + { + ut_ad(lsn > 2); + return lsn; + } + delete_from_flush_list(bpage); + } + return empty_lsn; } /** Determine if a buffer block was created by chunk_t::create(). @@ -1692,15 +1704,18 @@ public: /** Buffer pool mutex */ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; - /** Number of pending LRU flush. */ - Atomic_counter n_flush_LRU; + /** Number of pending LRU flush; protected by mutex. */ + ulint n_flush_LRU_; /** broadcast when n_flush_LRU reaches 0; protected by mutex */ pthread_cond_t done_flush_LRU; - /** Number of pending flush_list flush. 
*/ - Atomic_counter n_flush_list; + /** Number of pending flush_list flush; protected by mutex */ + ulint n_flush_list_; /** broadcast when n_flush_list reaches 0; protected by mutex */ pthread_cond_t done_flush_list; + TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; } + TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; } + /** @name General fields */ /* @{ */ ulint curr_pool_size; /*!< Current pool size in bytes */ @@ -1875,8 +1890,8 @@ public: last_activity_count= activity_count; } - // n_flush_LRU + n_flush_list is approximately COUNT(io_fix()==BUF_IO_WRITE) - // in flush_list + // n_flush_LRU() + n_flush_list() + // is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list unsigned freed_page_clock;/*!< a sequence number used to count the number of buffer @@ -1961,13 +1976,35 @@ public: /** @return whether any I/O is pending */ bool any_io_pending() const { - return n_pend_reads || n_flush_LRU || n_flush_list; + return n_pend_reads || n_flush_LRU() || n_flush_list(); } /** @return total amount of pending I/O */ ulint io_pending() const { - return n_pend_reads + n_flush_LRU + n_flush_list; + return n_pend_reads + n_flush_LRU() + n_flush_list(); } + +private: + /** Remove a block from the flush list. */ + inline void delete_from_flush_list_low(buf_page_t *bpage); + /** Remove a block from flush_list. + @param bpage buffer pool page + @param clear whether to invoke buf_page_t::clear_oldest_modification() */ + void delete_from_flush_list(buf_page_t *bpage, bool clear); +public: + /** Remove a block from flush_list. + @param bpage buffer pool page */ + void delete_from_flush_list(buf_page_t *bpage) + { delete_from_flush_list(bpage, true); } + + /** Insert a modified block into the flush list. + @param block modified block + @param lsn start LSN of the mini-transaction that modified the block */ + void insert_into_flush_list(buf_block_t *block, lsn_t lsn); + + /** Free a page whose underlying file page has been freed. 
*/ + inline void release_freed_page(buf_page_t *bpage); + private: /** Temporary memory for page_compressed and encrypted I/O */ struct io_buf_t @@ -2080,7 +2117,7 @@ inline void buf_page_t::set_corrupt_id() switch (oldest_modification()) { case 0: break; - case 1: + case 2: ut_ad(fsp_is_system_temporary(id().space())); ut_d(oldest_modification_= 0); /* for buf_LRU_block_free_non_file_page() */ break; @@ -2106,7 +2143,7 @@ inline void buf_page_t::set_corrupt_id() inline void buf_page_t::set_oldest_modification(lsn_t lsn) { mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); - ut_ad(!oldest_modification()); + ut_ad(oldest_modification() <= 1); oldest_modification_= lsn; } @@ -2121,13 +2158,27 @@ inline void buf_page_t::clear_oldest_modification() oldest_modification_= 0; } +/** Note that a block is no longer dirty, while not removing +it from buf_pool.flush_list */ +inline void buf_page_t::clear_oldest_modification(bool temporary) +{ + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + ut_ad(temporary == fsp_is_system_temporary(id().space())); + ut_ad(io_fix_ == BUF_IO_WRITE); + ut_ad(temporary ? oldest_modification() == 2 : oldest_modification() > 2); + oldest_modification_= !temporary; +} + /** @return whether the block is modified and ready for flushing */ inline bool buf_page_t::ready_for_flush() const { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(in_LRU_list); ut_a(in_file()); - return oldest_modification() && io_fix_ == BUF_IO_NONE; + ut_ad(fsp_is_system_temporary(id().space()) + ? oldest_modification() == 2 + : oldest_modification() > 2); + return io_fix_ == BUF_IO_NONE; } /** @return whether the block can be relocated in memory. 
@@ -2204,7 +2255,7 @@ MEMORY: is not in free list, LRU list, or flush list, nor page hash table FILE_PAGE: space and offset are defined, is in page hash table if io_fix == BUF_IO_WRITE, - buf_pool.n_flush_LRU > 0 || buf_pool.n_flush_list > 0 + buf_pool.n_flush_LRU() || buf_pool.n_flush_list() (1) if buf_fix_count == 0, then is in LRU list, not in free list diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index 800c707bab9..eed2ecac703 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2020, MariaDB Corporation. +Copyright (c) 2014, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -56,12 +56,6 @@ the list as they age towards the tail of the LRU. @param id tablespace identifier */ void buf_flush_remove_pages(ulint id); -/** Try to flush all the dirty pages that belong to a given tablespace. -@param id tablespace identifier -@return number dirty pages that there were for this tablespace */ -ulint buf_flush_dirty_pages(ulint id) - MY_ATTRIBUTE((warn_unused_result)); - /*******************************************************************//** Relocates a buffer control block on the flush_list. Note that it is assumed that the contents of bpage has already been @@ -95,10 +89,23 @@ buf_flush_init_for_writing( /** Write out dirty blocks from buf_pool.flush_list. 
@param max_n wished maximum mumber of blocks flushed -@param lsn buf_pool.get_oldest_modification(LSN_MAX) target (0=LRU flush) +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target @return the number of processed pages -@retval 0 if a batch of the same type (lsn==0 or lsn!=0) is already running */ -ulint buf_flush_lists(ulint max_n, lsn_t lsn); +@retval 0 if a buf_pool.flush_list batch is already running */ +ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX); + +/** Try to flush dirty pages that belong to a given tablespace. +@param space tablespace +@param n_flushed number of pages written +@return whether the flush for some pages might not have been initiated */ +bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Write out dirty blocks from buf_pool.LRU. +@param max_n wished maximum number of blocks flushed +@return the number of processed pages +@retval 0 if a buf_pool.LRU batch is already running */ +ulint buf_flush_LRU(ulint max_n); /** Wait until a flush batch ends. @param lru true=buf_pool.LRU; false=buf_pool.flush_list */ @@ -106,9 +113,10 @@ void buf_flush_wait_batch_end(bool lru); /** Wait until all persistent pages are flushed up to a limit. @param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn); -/** If innodb_flush_sync=ON, initiate a furious flush. -@param lsn buf_pool.get_oldest_modification(LSN_MAX) target */ -void buf_flush_ahead(lsn_t lsn); +/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@param furious true=furious flushing, false=limit to innodb_io_capacity */ +ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious); /********************************************************************//** This function should be called at a mini-transaction commit, if a page was diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic index cd853fc05cd..b8a9b6d1f5d 100644 --- a/storage/innobase/include/buf0flu.ic +++ b/storage/innobase/include/buf0flu.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, 2020, MariaDB Corporation. +Copyright (c) 2019, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,17 +26,7 @@ Created 11/5/1995 Heikki Tuuri #include "assume_aligned.h" #include "buf0buf.h" -#include "mtr0mtr.h" #include "srv0srv.h" -#include "fsp0types.h" - -/********************************************************************//** -Inserts a modified block into the flush list. 
*/ -void -buf_flush_insert_into_flush_list( -/*=============================*/ - buf_block_t* block, /*!< in/out: block which is modified */ - lsn_t lsn); /*!< in: oldest modification */ /********************************************************************//** This function should be called at a mini-transaction commit, if a page was @@ -52,8 +42,7 @@ buf_flush_note_modification( lsn_t end_lsn) /*!< in: end lsn of the mtr that modified this block */ { - ut_ad(!srv_read_only_mode - || fsp_is_system_temporary(block->page.id().space())); + ut_ad(!srv_read_only_mode); ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count()); ut_ad(mach_read_from_8(block->frame + FIL_PAGE_LSN) <= end_lsn); @@ -65,12 +54,12 @@ buf_flush_note_modification( const lsn_t oldest_modification = block->page.oldest_modification(); - if (oldest_modification) { + if (oldest_modification > 1) { ut_ad(oldest_modification <= start_lsn); - } else if (!fsp_is_system_temporary(block->page.id().space())) { - buf_flush_insert_into_flush_list(block, start_lsn); - } else { + } else if (fsp_is_system_temporary(block->page.id().space())) { block->page.set_temp_modified(); + } else { + buf_pool.insert_into_flush_list(block, start_lsn); } srv_stats.buf_pool_write_requests.inc(); diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index fd865c496c4..e6f82f12b77 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -588,6 +588,17 @@ public: @return number of buffer count added by this mtr */ uint32_t get_fix_count(const buf_block_t *block) const; + /** type of page flushing is needed during commit() */ + enum page_flush_ahead + { + /** no need to trigger page cleaner */ + PAGE_FLUSH_NO= 0, + /** asynchronous flushing is needed */ + PAGE_FLUSH_ASYNC, + /** furious flushing is needed */ + PAGE_FLUSH_SYNC + }; + private: /** Log a write of a byte string to a page. 
@param block buffer page @@ -621,7 +632,7 @@ private: /** Append the redo log records to the redo log buffer. @param len number of bytes to write @return {start_lsn,flush_ahead} */ - inline std::pair finish_write(ulint len); + inline std::pair finish_write(ulint len); /** Release the resources */ inline void release_resources(); diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index d75d6f512e9..3896f2f6715 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2021, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -32,7 +32,7 @@ inline bool mtr_t::is_block_dirtied(const buf_block_t *block) { ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count()); - return !block->page.oldest_modification(); + return block->page.oldest_modification() <= 1; } /** diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 9b40d01547a..9d5c22b563b 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -1111,10 +1111,9 @@ void os_aio_free(); @retval DB_IO_ERROR on I/O error */ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n); -/** Waits until there are no pending writes in os_aio_write_array. There can -be other, synchronous, pending writes. */ -void -os_aio_wait_until_no_pending_writes(); +/** Wait until there are no pending asynchronous writes. +Only used on FLUSH TABLES...FOR EXPORT. */ +void os_aio_wait_until_no_pending_writes(); /** Wait until all pending asynchronous reads have completed. 
*/ void os_aio_wait_until_no_pending_reads(); diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 3c22d8823e2..0c32d5d686a 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -756,9 +756,6 @@ struct export_var_t{ ulint innodb_os_log_fsyncs; /*!< n_log_flushes */ ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */ ulint innodb_os_log_pending_fsyncs; /*!< n_pending_log_flushes */ - ulint innodb_pages_created; /*!< buf_pool.stat.n_pages_created */ - ulint innodb_pages_read; /*!< buf_pool.stat.n_pages_read*/ - ulint innodb_pages_written; /*!< buf_pool.stat.n_pages_written */ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */ int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index ea4c987397b..d7bb3ce886b 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -404,12 +404,12 @@ void mtr_t::commit() { ut_ad(!srv_read_only_mode || m_log_mode == MTR_LOG_NO_REDO); - std::pair lsns; + std::pair lsns; if (const ulint len= prepare_write()) lsns= finish_write(len); else - lsns= { m_commit_lsn, false }; + lsns= { m_commit_lsn, PAGE_FLUSH_NO }; if (m_made_dirty) mysql_mutex_lock(&log_sys.flush_order_mutex); @@ -449,8 +449,8 @@ void mtr_t::commit() m_memo.for_each_block_in_reverse(CIterate()); - if (lsns.second) - buf_flush_ahead(m_commit_lsn); + if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) + buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); if (m_made_dirty) srv_stats.log_write_requests.inc(); @@ -767,7 +767,7 @@ static void log_write_low(const void *str, size_t size) /** Close the log at mini-transaction commit. 
@return whether buffer pool flushing is needed */ -static bool log_close(lsn_t lsn) +static mtr_t::page_flush_ahead log_close(lsn_t lsn) { mysql_mutex_assert_owner(&log_sys.mutex); ut_ad(lsn == log_sys.get_lsn()); @@ -790,7 +790,9 @@ static bool log_close(lsn_t lsn) const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn; - if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity)) + if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity) && + /* silence message on create_log_file() after the log had been deleted */ + checkpoint_age != lsn) { time_t t= time(nullptr); if (!log_close_warned || difftime(t, log_close_warn_time) > 15) @@ -799,15 +801,17 @@ static bool log_close(lsn_t lsn) log_close_warn_time= t; ib::error() << "The age of the last checkpoint is " << checkpoint_age - << ", which exceeds the log capacity " - << log_sys.log_capacity << "."; + << ", which exceeds the log capacity " + << log_sys.log_capacity << "."; } } + else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_modified_age_async)) + return mtr_t::PAGE_FLUSH_NO; else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age)) - return false; + return mtr_t::PAGE_FLUSH_ASYNC; log_sys.set_check_flush_or_checkpoint(); - return true; + return mtr_t::PAGE_FLUSH_SYNC; } /** Write the block contents to the REDO log */ @@ -871,8 +875,8 @@ inline ulint mtr_t::prepare_write() /** Append the redo log records to the redo log buffer. 
@param len number of bytes to write -@return {start_lsn,flush_ahead_lsn} */ -inline std::pair mtr_t::finish_write(ulint len) +@return {start_lsn,flush_ahead} */ +inline std::pair mtr_t::finish_write(ulint len) { ut_ad(m_log_mode == MTR_LOG_ALL); mysql_mutex_assert_owner(&log_sys.mutex); @@ -888,19 +892,19 @@ inline std::pair mtr_t::finish_write(ulint len) m_commit_lsn = log_reserve_and_write_fast(front->begin(), len, &start_lsn); - if (m_commit_lsn) { - return std::make_pair(start_lsn, false); + if (!m_commit_lsn) { + goto piecewise; } + } else { +piecewise: + /* Open the database log for log_write_low */ + start_lsn = log_reserve_and_open(len); + mtr_write_log write_log; + m_log.for_each_block(write_log); + m_commit_lsn = log_sys.get_lsn(); } - - /* Open the database log for log_write_low */ - start_lsn = log_reserve_and_open(len); - - mtr_write_log write_log; - m_log.for_each_block(write_log); - m_commit_lsn = log_sys.get_lsn(); - bool flush = log_close(m_commit_lsn); - DBUG_EXECUTE_IF("ib_log_flush_ahead", flush=true;); + page_flush_ahead flush= log_close(m_commit_lsn); + DBUG_EXECUTE_IF("ib_log_flush_ahead", flush = PAGE_FLUSH_SYNC;); return std::make_pair(start_lsn, flush); } diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 6b8b11605af..53c0c07eda4 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -3775,8 +3775,8 @@ static void os_aio_wait_until_no_pending_writes_low() tpool::tpool_wait_end(); } -/** Waits until there are no pending writes. There can -be other, synchronous, pending writes. */ +/** Wait until there are no pending asynchronous writes. +Only used on FLUSH TABLES...FOR EXPORT. 
*/ void os_aio_wait_until_no_pending_writes() { os_aio_wait_until_no_pending_writes_low(); diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index 0d8b2007f07..00016761c91 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -4213,7 +4213,17 @@ row_import_for_mysql( /* Ensure that all pages dirtied during the IMPORT make it to disk. The only dirty pages generated should be from the pessimistic purge of delete marked records that couldn't be purged in Phase I. */ - while (buf_flush_dirty_pages(prebuilt->table->space_id)); + while (buf_flush_list_space(prebuilt->table->space)); + + for (ulint count = 0; prebuilt->table->space->referenced(); count++) { + /* Issue a warning every 10.24 seconds, starting after + 2.56 seconds */ + if ((count & 511) == 128) { + ib::warn() << "Waiting for flush to complete on " + << prebuilt->table->name; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } ib::info() << "Phase IV - Flush complete"; prebuilt->table->space->set_imported(); diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc index e14e8846ee4..74c7b9b35fc 100644 --- a/storage/innobase/row/row0quiesce.cc +++ b/storage/innobase/row/row0quiesce.cc @@ -536,7 +536,7 @@ row_quiesce_table_start( } } - while (buf_flush_dirty_pages(table->space_id)) { + while (buf_flush_list_space(table->space)) { if (trx_is_interrupted(trx)) { goto aborted; } diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 446314b9ab9..ce020a6b88d 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1092,12 +1092,6 @@ srv_export_innodb_status(void) export_vars.innodb_log_writes = srv_stats.log_writes; - export_vars.innodb_pages_created = buf_pool.stat.n_pages_created; - - export_vars.innodb_pages_read = buf_pool.stat.n_pages_read; - - export_vars.innodb_pages_written = buf_pool.stat.n_pages_written; - 
mysql_mutex_lock(&lock_sys.wait_mutex); export_vars.innodb_row_lock_waits = lock_sys.get_wait_cumulative(); diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 9e6c2741b99..2584e534fbc 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -672,7 +672,7 @@ not_free: mini-transaction commit and the server was killed, then discarding the to-be-trimmed pages without flushing would break crash recovery. So, we cannot avoid the write. */ - while (buf_flush_dirty_pages(space.id)); + while (buf_flush_list_space(&space)); log_free_check();