/***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA *****************************************************************************/ /**************************************************//** @file include/log0recv.h Recovery Created 9/20/1997 Heikki Tuuri *******************************************************/ #pragma once #include "ut0new.h" #include "buf0types.h" #include "log0log.h" #include "mtr0types.h" #include #include /** @return whether recovery is currently running. */ #define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on) /** Find the latest checkpoint in the log header. @param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 @return error code or DB_SUCCESS */ dberr_t recv_find_max_checkpoint(ulint* max_field) MY_ATTRIBUTE((nonnull, warn_unused_result)); ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) /** Apply any buffered redo log to a page that was just read from a data file. @param[in,out] space tablespace @param[in,out] bpage buffer pool page @return whether the page was recovered correctly */ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage); /** Start recovering from a redo log checkpoint. @param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN of first system tablespace page @return error code or DB_SUCCESS */ dberr_t recv_recovery_from_checkpoint_start( lsn_t flush_lsn); /** Whether to store redo log records in recv_sys.pages */ enum store_t { /** Do not store redo log records. */ STORE_NO, /** Store redo log records. */ STORE_YES, /** Store redo log records if the tablespace exists. */ STORE_IF_EXISTS }; /** Adds data from a new log block to the parsing buffer of recv_sys if recv_sys.parse_start_lsn is non-zero. @param[in] log_block log block to add @param[in] scanned_lsn lsn of how far we were able to find data in this log block @return true if more data added */ bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn); /** Moves the parsing buffer data left to the buffer start */ void recv_sys_justify_left_parsing_buf(); /** Report an operation to create, delete, or rename a file during backup. @param[in] space_id tablespace identifier @param[in] type file operation redo log type @param[in] name file name (not NUL-terminated) @param[in] len length of name, in bytes @param[in] new_name new file name (NULL if not rename) @param[in] new_len length of new_name, in bytes (0 if NULL) */ extern void (*log_file_op)(ulint space_id, int type, const byte* name, ulint len, const byte* new_name, ulint new_len); /** Report an operation which does undo log tablespace truncation during backup @param space_id undo tablespace identifier */ extern void (*undo_space_trunc)(uint32_t space_id); /** Report an operation which does INIT_PAGE for page0 during backup. @param space_id tablespace identifier */ extern void (*first_page_init)(ulint space_id); /** Stored redo log record */ struct log_rec_t { log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); } log_rec_t()= delete; log_rec_t(const log_rec_t&)= delete; log_rec_t &operator=(const log_rec_t&)= delete; /** next record */ log_rec_t *next; /** mtr_t::commit_lsn() of the mini-transaction */ const lsn_t lsn; }; struct recv_dblwr_t { /** Add a page frame to the doublewrite recovery buffer. */ void add(byte *page) { pages.push_front(page); } /** Validate the page. @param page_id page identifier @param page page contents @param space the tablespace of the page (not available for page 0) @param tmp_buf 2*srv_page_size for decrypting and decompressing any page_compressed or encrypted pages @return whether the page is valid */ bool validate_page(const page_id_t page_id, const byte *page, const fil_space_t *space, byte *tmp_buf); /** Find a doublewrite copy of a page. @param page_id page identifier @param space tablespace (not available for page_id.page_no()==0) @param tmp_buf 2*srv_page_size for decrypting and decompressing any page_compressed or encrypted pages @return page frame @retval NULL if no valid page for page_id was found */ byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL, byte *tmp_buf= NULL); typedef std::deque > list; /** Recovered doublewrite buffer page frames */ list pages; }; /** the recovery state and buffered records for a page */ struct page_recv_t { /** Recovery state; protected by recv_sys.mutex */ enum { /** not yet processed */ RECV_NOT_PROCESSED, /** not processed; the page will be reinitialized */ RECV_WILL_NOT_READ, /** page is being read */ RECV_BEING_READ, /** log records are being applied on the page */ RECV_BEING_PROCESSED } state= RECV_NOT_PROCESSED; /** Latest written byte offset when applying the log records. @see mtr_t::m_last_offset */ uint16_t last_offset= 1; /** log records for a page */ class recs_t { /** The first log record */ log_rec_t *head= nullptr; /** The last log record */ log_rec_t *tail= nullptr; friend struct page_recv_t; public: /** Append a redo log snippet for the page @param recs log snippet */ void append(log_rec_t* recs) { if (tail) tail->next= recs; else head= recs; tail= recs; } /** @return the last log snippet */ const log_rec_t* last() const { return tail; } /** @return the last log snippet */ log_rec_t* last() { return tail; } class iterator { log_rec_t *cur; public: iterator(log_rec_t* rec) : cur(rec) {} log_rec_t* operator*() const { return cur; } iterator &operator++() { cur= cur->next; return *this; } bool operator!=(const iterator& i) const { return cur != i.cur; } }; iterator begin() { return head; } iterator end() { return NULL; } bool empty() const { ut_ad(!head == !tail); return !head; } /** Clear and free the records; @see recv_sys_t::alloc() */ inline void clear(); } log; /** Trim old log records for a page. @param start_lsn oldest log sequence number to preserve @return whether all the log for the page was trimmed */ inline bool trim(lsn_t start_lsn); /** Ignore any earlier redo log records for this page. */ inline void will_not_read(); /** @return whether the log records for the page are being processed */ bool is_being_processed() const { return state == RECV_BEING_PROCESSED; } }; /** Recovery system data structure */ struct recv_sys_t { /** mutex protecting apply_log_recs and page_recv_t::state */ mysql_mutex_t mutex; private: /** condition variable for !apply_batch_on || pages.empty() || found_corrupt_log || found_corrupt_fs */ pthread_cond_t cond; /** whether recv_apply_hashed_log_recs() is running */ bool apply_batch_on; /** set when finding a corrupt log block or record, or there is a log parsing buffer overflow */ bool found_corrupt_log; /** set when an inconsistency with the file system contents is detected during log scan or apply */ bool found_corrupt_fs; public: /** whether we are applying redo log records during crash recovery */ bool recovery_on; /** whether recv_recover_page(), invoked from buf_page_t::read_complete(), should apply log records*/ bool apply_log_recs; byte* buf; /*!< buffer for parsing log records */ ulint len; /*!< amount of data in buf */ lsn_t parse_start_lsn; /*!< this is the lsn from which we were able to start parsing log records and adding them to pages; zero if a suitable start point not found yet */ lsn_t scanned_lsn; /*!< the log data has been scanned up to this lsn */ ulint scanned_checkpoint_no; /*!< the log data has been scanned up to this checkpoint number (lowest 4 bytes) */ ulint recovered_offset; /*!< start offset of non-parsed log records in buf */ lsn_t recovered_lsn; /*!< the log records have been parsed up to this lsn */ lsn_t mlog_checkpoint_lsn; /*!< the LSN of a FILE_CHECKPOINT record, or 0 if none was parsed */ /** the time when progress was last reported */ time_t progress_time; using map = std::map, ut_allocator>>; /** buffered records waiting to be applied to pages */ map pages; private: /** Process a record that indicates that a tablespace size is being shrunk. @param page_id first page that is not in the file @param lsn log sequence number of the shrink operation */ inline void trim(const page_id_t page_id, lsn_t lsn); /** Undo tablespaces for which truncate has been logged (indexed by page_id_t::space() - srv_undo_space_id_start) */ struct trunc { /** log sequence number of FILE_CREATE, or 0 if none */ lsn_t lsn; /** truncated size of the tablespace, or 0 if not truncated */ unsigned pages; } truncated_undo_spaces[127]; public: /** The contents of the doublewrite buffer */ recv_dblwr_t dblwr; /** Last added LSN to pages. */ lsn_t last_stored_lsn= 0; void read(os_offset_t offset, span buf); inline size_t files_size(); void close_files() { files.clear(); files.shrink_to_fit(); } private: /** Attempt to initialize a page based on redo log records. @param page_id page identifier @param p iterator pointing to page_id @param mtr mini-transaction @param b pre-allocated buffer pool block @return the recovered block @retval nullptr if the page cannot be initialized based on log records @retval -1 if the page cannot be recovered due to corruption */ inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p, mtr_t &mtr, buf_block_t *b); /** Attempt to initialize a page based on redo log records. @param page_id page identifier @return the recovered block @retval nullptr if the page cannot be initialized based on log records @retval -1 if the page cannot be recovered due to corruption */ buf_block_t *recover_low(const page_id_t page_id); /** All found log files (multiple ones are possible if we are upgrading from before MariaDB Server 10.5.1) */ std::vector files; void open_log_files_if_needed(); /** Base node of the redo block list. List elements are linked via buf_block_t::unzip_LRU. */ UT_LIST_BASE_NODE_T(buf_block_t) blocks; public: /** Check whether the number of read redo log blocks exceeds the maximum. Store last_stored_lsn if the recovery is not in the last phase. @param[in,out] store whether to store page operations @return whether the memory is exhausted */ inline bool is_memory_exhausted(store_t *store); /** Apply buffered log to persistent data pages. @param last_batch whether it is possible to write more redo log */ void apply(bool last_batch); #ifdef UNIV_DEBUG /** whether all redo log in the current batch has been applied */ bool after_apply= false; #endif /** Initialize the redo log recovery subsystem. */ void create(); /** Free most recovery data structures. */ void debug_free(); /** Clean up after create() */ void close(); bool is_initialised() const { return last_stored_lsn != 0; } /** Register a redo log snippet for a page. @param it page iterator @param start_lsn start LSN of the mini-transaction @param lsn @see mtr_t::commit_lsn() @param l redo log snippet @see log_t::FORMAT_10_5 @param len length of l, in bytes */ inline void add(map::iterator it, lsn_t start_lsn, lsn_t lsn, const byte *l, size_t len); /** Parse and register one mini-transaction in log_t::FORMAT_10_5. @param checkpoint_lsn the log sequence number of the latest checkpoint @param store whether to store the records @param apply whether to apply file-level log records @return whether FILE_CHECKPOINT record was seen the first time, or corruption was noticed */ bool parse(lsn_t checkpoint_lsn, store_t *store, bool apply); /** Clear a fully processed set of stored redo log records. */ inline void clear(); /** Determine whether redo log recovery progress should be reported. @param time the current time @return whether progress should be reported (the last report was at least 15 seconds ago) */ bool report(time_t time) { if (time - progress_time < 15) return false; progress_time= time; return true; } /** The alloc() memory alignment, in bytes */ static constexpr size_t ALIGNMENT= sizeof(size_t); /** Allocate memory for log_rec_t @param len allocation size, in bytes @return pointer to len bytes of memory (never NULL) */ inline void *alloc(size_t len); /** Free a redo log snippet. @param data buffer returned by alloc() */ inline void free(const void *data); /** Remove records for a corrupted page. This function should only be called when innodb_force_recovery is set. @param page_id corrupted page identifier */ ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id); /** Flag data file corruption during recovery. */ ATTRIBUTE_COLD void set_corrupt_fs(); /** Flag log file corruption during recovery. */ ATTRIBUTE_COLD void set_corrupt_log(); /** Possibly finish a recovery batch. */ inline void maybe_finish_batch(); /** @return whether data file corruption was found */ bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); } /** @return whether log file corruption was found */ bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); } /** Attempt to initialize a page based on redo log records. @param page_id page identifier @return the recovered block @retval nullptr if the page cannot be initialized based on log records @retval -1 if the page cannot be recovered due to corruption */ buf_block_t *recover(const page_id_t page_id) { return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr; } /** Try to recover a tablespace that was not readable earlier @param p iterator, initially pointing to page_id_t{space_id,0}; the records will be freed and the iterator advanced @param name tablespace file name @param free_block spare buffer block @return whether recovery failed */ bool recover_deferred(map::iterator &p, const std::string &name, buf_block_t *&free_block); }; /** The recovery system */ extern recv_sys_t recv_sys; /** If the following is TRUE, the buffer pool file pages must be invalidated after recovery and no ibuf operations are allowed; this will be set if recv_sys.pages becomes too full, and log records must be merged to file pages already before the recovery is finished: in this case no ibuf operations are allowed, as they could modify the pages read in the buffer pool before the pages have been recovered to the up-to-date state. TRUE means that recovery is running and no operations on the log files are allowed yet: the variable name is misleading. */ extern bool recv_no_ibuf_operations; /** TRUE when recv_init_crash_recovery() has been called. */ extern bool recv_needed_recovery; #ifdef UNIV_DEBUG /** TRUE if writing to the redo log (mtr_commit) is forbidden. Protected by log_sys.mutex. */ extern bool recv_no_log_write; #endif /* UNIV_DEBUG */ /** TRUE if buf_page_is_corrupted() should check if the log sequence number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by recv_recovery_from_checkpoint_start(). */ extern bool recv_lsn_checks_on; /** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many times! */ #define RECV_PARSING_BUF_SIZE (2U << 20) /** Size of block reads when the log groups are scanned forward to do a roll-forward */ #define RECV_SCAN_SIZE (4U << srv_page_size_shift)