mirror of
				https://github.com/MariaDB/server.git
				synced 2025-11-04 04:46:15 +01:00 
			
		
		
		
	Improve the performance of slave connect using B+-Tree indexes on each binlog file. The index allows fast lookup of a GTID position to the corresponding offset in the binlog file, as well as lookup of a position to find the corresponding GTID position. This eliminates a costly sequential scan of the starting binlog file to find the GTID starting position when a slave connects. This is especially costly if the binlog file is not cached in memory (IO cost), or if it is encrypted or a lot of slaves connect simultaneously (CPU cost). The size of the index files is generally less than 1% of the binlog data, so not expected to be an issue. Most of the work writing the index is done as a background task, in the binlog background thread. This minimises the performance impact on transaction commit. A simple global mutex is used to protect index reads and (background) index writes; this is fine as slave connect is a relatively infrequent operation. Here are the user-visible options and status variables. The feature is on by default and is expected to need no tuning or configuration for most users. binlog_gtid_index On by default. Can be used to disable the indexes for testing purposes. binlog_gtid_index_page_size (default 4096) Page size to use for the binlog GTID index. This is the size of the nodes in the B+-tree used internally in the index. A very small page-size (64 is the minimum) will be less efficient, but can be used to stress the BTree-code during testing. binlog_gtid_index_span_min (default 65536) Control sparseness of the binlog GTID index. If set to N, at most one index record will be added for every N bytes of binlog file written. 
This can be used to reduce the number of records in the index, at the cost only of having to scan a few more events in the binlog file before finding the target position. Two status variables are available to monitor the use of the GTID indexes: Binlog_gtid_index_hit Binlog_gtid_index_miss The "hit" status increments for each successful lookup in a GTID index. The "miss" increments when a lookup is not possible. This indicates that the index file is missing (eg. binlog written by old server version without GTID index support), or corrupt. Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
		
			
				
	
	
		
			521 lines
		
	
	
	
		
			20 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			521 lines
		
	
	
	
		
			20 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/*
 | 
						|
   Copyright (c) 2023 Kristian Nielsen <knielsen@knielsen-hq.org>
 | 
						|
 | 
						|
   This program is free software; you can redistribute it and/or modify
 | 
						|
   it under the terms of the GNU General Public License as published by
 | 
						|
   the Free Software Foundation; version 2 of the License.
 | 
						|
 | 
						|
   This program is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
   GNU General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU General Public License
 | 
						|
   along with this program; if not, write to the Free Software
 | 
						|
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 | 
						|
*/
 | 
						|
 | 
						|
#ifndef GTID_INDEX_H
 | 
						|
#define GTID_INDEX_H
 | 
						|
 | 
						|
#include "my_global.h"
 | 
						|
#include "mysqld.h"
 | 
						|
#include "mariadb.h"
 | 
						|
#include "rpl_gtid.h"
 | 
						|
 | 
						|
/*
 | 
						|
  This implements an on-disk index for each binlog file to speed up access to
 | 
						|
  the binlog at a specific offset or GTID position. This is primarily used when
 | 
						|
  a slave connects to the master, but also by users calling BINLOG_GTID_POS().
 | 
						|
 | 
						|
  A connecting slave first scans the binlog files to find the last one with an
 | 
						|
  initial GTID_LIST event that lies before the starting GTID position. Then a
 | 
						|
  sequential scan of the binlog file is done until the requested GTID position
 | 
						|
  is found.
 | 
						|
 | 
						|
  The binlog index conceptually extends this using index records corresponding
 | 
						|
  to different offsets within one binlog file. Each record functions as if it
 | 
						|
  was the initial GTID_LIST event of a new binlog file, allowing the
 | 
						|
  sequential scan to start from the corresponding position. By having
 | 
						|
  sufficiently many index records, the scan will be fast.
 | 
						|
 | 
						|
  The code that adds one record to the index is in two parts, a "sync" path
 | 
						|
  and an "async" path. The "sync" path in process_gtid_check_batch() does the
 | 
						|
  minimum amount of work which needs to run as part of transaction commit. The
 | 
						|
  actual writing of the index, in async_update(), can then be done as a
 | 
						|
  background task, minimizing the performance impact on transaction processing.
 | 
						|
  The "sync" and "async" paths each run single threaded, but can execute in
 | 
						|
  parallel with each other.
 | 
						|
 | 
						|
  The index file is written incrementally together with the binlog file.
 | 
						|
  However, no fsync() of the index file is needed while writing. A
 | 
						|
  partially written index left by a crashing server will be re-written during
 | 
						|
  binlog recovery. A reader is allowed to use the index as it is being written
 | 
						|
  (for the "hot" binlog file); such access is protected by mutex.
 | 
						|
 | 
						|
  In case of lost or corrupt index, fallback to full sequential scan is done
 | 
						|
  (so performance will be affected but not correct functionality).
 | 
						|
 | 
						|
  The index file is structured like a B+-tree. The index is append-only, so
 | 
						|
  also resembles a log-structured merge-tree, but with no merging of levels
 | 
						|
  needed as it covers a single fixed-size binlog file. This makes the building
 | 
						|
  of the tree relatively simple.
 | 
						|
 | 
						|
  Keys in the tree consist of a GTID state (corresponding to a GTID_LIST
 | 
						|
  event) and the associated binlog file offset. All keys (except the first key
 | 
						|
  in each level of the tree) are delta-compressed to save space, holding only
 | 
						|
  the (domain_id, server_id) pairs that differ from the previous record.
 | 
						|
 | 
						|
  The file is page-based. The first page contains the leftmost leaf node, and
 | 
						|
  the root node is at the end of the file. An incompletely written index file
 | 
						|
  can be detected by the last page in the file not being a root node page.
 | 
						|
  Nodes in the B+-tree usually fit in one page, but a node can be split across
 | 
						|
  multiple pages if GTID states are very large.
 | 
						|
 | 
						|
  Page format:
 | 
						|
 | 
						|
  The first page contains an extra file header:
 | 
						|
 | 
						|
    Offset  Size  Description
 | 
						|
       0      4   MAGIC header identifying the file as a binlog index
 | 
						|
       4      1   Major version number. A new major version of the file format
 | 
						|
                  is not readable by older server versions.
 | 
						|
       5      1   Minor version number. Formats differing only in minor version
 | 
						|
                  are backwards compatible and can be read by older servers.
 | 
						|
       6      2   Padding/unused.
 | 
						|
       8      4   Page size.
 | 
						|
 | 
						|
  Each page additionally contains this header:
 | 
						|
 | 
						|
    Offset  Size  Description
 | 
						|
       0      1   Flags
 | 
						|
       1      3   Padding/unused
 | 
						|
 | 
						|
  The last 4 bytes of each page is a 32-bit CRC.
 | 
						|
 | 
						|
  An interior node is a sequence of
 | 
						|
    <child ptr> <key> <child ptr> <key> ... <key> <child ptr>
 | 
						|
  while a leaf node has only keys.
 | 
						|
 | 
						|
  A child pointer is stored as 4 byte integer. The first page is 1, so that
 | 
						|
  0 can be used to denote "not present".
 | 
						|
 | 
						|
  Format of a key:
 | 
						|
 | 
						|
    Offset  Size  Description
 | 
						|
      0       4   Number of GTIDs in the key, plus 1. Or 0 for EOF.
 | 
						|
      4       4   Binlog file offset
 | 
						|
      8       4   Domain_id of first GTID
 | 
						|
     12       4   Server_id of first GTID
 | 
						|
     16       8   Seq_no of first GTID
 | 
						|
     ...          and so on for each GTID in the key.
 | 
						|
 | 
						|
  A node typically fits in one page. But if the GTID state is very big (or
 | 
						|
  the page size very small), multiple pages may be used. When a node is split,
 | 
						|
  it can be split after a child pointer or before or after a GTID, but not
 | 
						|
  elsewhere.
 | 
						|
 | 
						|
Here is an example GTID index with page_size=64 containing 3 records:
 | 
						|
  Offset  GTID state
 | 
						|
  0x11d   [empty]
 | 
						|
  0x20e   [0-1-1]
 | 
						|
  0x2ad   [0-1-2]
 | 
						|
The example contains 3 nodes, each stored in a single page. Two leaf nodes and
 | 
						|
one interior root node.
 | 
						|
 | 
						|
Page 1 (leaf node page with file header):
 | 
						|
  fe fe 0c 01              "magic" identifying the file as a binlog GTID index
 | 
						|
  01 00                    Major version 1, minor version 0
 | 
						|
  00 00                    Padding / currently unused
 | 
						|
  40 00 00 00              Page size (64 bytes in this example)
 | 
						|
  05                       Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node)
 | 
						|
  00 00 00                 Padding / currently unused
 | 
						|
Key 1:
 | 
						|
  01 00 00 00              <GTID_count + 1> = 1 (entry has zero GTIDs in it)
 | 
						|
  1d 01 00 00              Binlog file offset = 0x11d
 | 
						|
                           [Empty GTID state at the very start of the binlog]
 | 
						|
Key 2:
 | 
						|
  02 00 00 00              <GTID_count + 1> = 2 (entry has one GTID in it)
 | 
						|
  0e 02 00 00              Binlog file offset = 0x20e
 | 
						|
  00 00 00 00 01 00 00 00
 | 
						|
  01 00 00 00 00 00 00 00  GTID 0-1-1
 | 
						|
 | 
						|
  00 00 00 00              Zero denotes end-of-node
 | 
						|
  00 00 00 00 00 00 00 00  (Unused space in the page)
 | 
						|
  0e 4f ac 43              Checksum / CRC
 | 
						|
 | 
						|
Page 2 (leaf node):
 | 
						|
  05                       Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node)
 | 
						|
  00 00 00                 Unused
 | 
						|
Key 1:
 | 
						|
  02 00 00 00              <GTID_count + 1> = 2 (entry has one GTID in it)
 | 
						|
  ad 02 00 00              Binlog file offset = 0x2ad
 | 
						|
  00 00 00 00 01 00 00 00
 | 
						|
  02 00 00 00 00 00 00 00  GTID 0-1-2
 | 
						|
 | 
						|
  00 00 00 00              End-of-node
 | 
						|
  00 00 00 00 00 00 00 00
 | 
						|
  00 00 00 00 00 00 00 00
 | 
						|
  00 00 00 00 00 00 00 00
 | 
						|
  00 00 00 00              (Unused space in the page)
 | 
						|
  0c 4e c2 b9              CRC
 | 
						|
 | 
						|
Page 3 (root node):
 | 
						|
  0c                       PAGE_FLAG_ROOT | PAGE_FLAG_LAST (interior root node)
 | 
						|
  00 00 00                 Unused
 | 
						|
Child pointer:
 | 
						|
  01 00 00 00              Pointer to page 1
 | 
						|
Key for next child page:
 | 
						|
  02 00 00 00              <GTID_count + 1> = 2 (entry has one GTID in it)
 | 
						|
  ad 02 00 00              Binlog offset = 0x2ad
 | 
						|
  00 00 00 00 01 00 00 00
 | 
						|
  02 00 00 00 00 00 00 00  GTID 0-1-2
 | 
						|
Child pointer:
 | 
						|
  02 00 00 00              Pointer to page 2
 | 
						|
 | 
						|
  00 00 00 00              Zero denotes end-of-node
 | 
						|
  00 00 00 00 00 00 00 00
 | 
						|
  00 00 00 00 00 00 00 00
 | 
						|
  00 00 00 00              (Unused)
 | 
						|
  81 55 a3 c7              CRC
 | 
						|
 | 
						|
  Below is an example of the logical B-Tree structure of a larger GTID index
 | 
						|
  with a total of 12 keys.
 | 
						|
 | 
						|
  We use S0, S1, ..., S11 to denote a key, which consists of a GTID state (as
 | 
						|
  seen in @@binlog_gtid_state and GTID_LIST_EVENT) and the associated binlog
 | 
						|
  file offset. D1, D2, ..., D11 denote the same keys, but delta-compressed, so
 | 
						|
  that D1 stores only those GTIDs that are not the same as in S0.
 | 
						|
 | 
						|
  Pages are denoted by P1, P2, ..., P8. In the example, P1, P2, P3, P5, and P6
 | 
						|
  are leaf pages, the rest are interior node pages. P8 is the root node (the
 | 
						|
  root is always the last page in the index).
 | 
						|
 | 
						|
  The contents of each page is listed in square brackets [...]. So P1[S0 D1 D2]
 | 
						|
  is a leaf page with 3 keys, and P7[P5 <D10+D11> P6] is an interior node page
 | 
						|
  with one key <D10+D11> and two child-page pointers to P5 and P6. The
 | 
						|
  notation <D10+D11> denotes the delta-compression of key S11 relative to S9;
 | 
						|
  all GTIDs in S11 that are not present in S9. In the code, this is computed
 | 
						|
  by combining D10 and D11, hence the use of the notation "D10+D11" instead of
 | 
						|
  the equivalent "S11-S9".
 | 
						|
 | 
						|
  Here is the example B-Tree. It has 3 levels, with the leaf nodes at the top:
 | 
						|
 | 
						|
    P1[S0 D1 D2]   P2[D3 D4 D5]   P3[D6 D7 D8]   P5[D9 D10]   P6[D11]
 | 
						|
          P4[P1 <S3> P2 <D4+D5+D6> P3]            P7[P5 <D10+D11> P6]
 | 
						|
                                     P8[P4 <S9> P7]
 | 
						|
 | 
						|
  To find eg. S4, we start from the root P8. S4<S9, so we follow the left child
 | 
						|
  pointer to P4. S4>S3 and S4<S6 (S6=(S3+D4+D5+D6)), so we follow the child
 | 
						|
  pointer to leaf page P2.
 | 
						|
 | 
						|
  The index is written completely append-only; this is possible since keys are
 | 
						|
  always inserted in-order, at the end of the index. One page is kept in-memory
 | 
						|
  at each level of the B-Tree; when a new key no longer fits the page, it is
 | 
						|
  written out to disk and a new in-memory page is allocated for it.
 | 
						|
 | 
						|
  Here are the operations that occur while writing the index file from the
 | 
						|
  above example. The left column is each key added to the index as the
 | 
						|
  corresponding GTID is written into the binlog file (<EOF> is when the index
 | 
						|
  is closed at binlog rotation). The right column lists the operations performed,
 | 
						|
  as follows:
 | 
						|
    alloc(p)      Allocate page p
 | 
						|
    add_key(p,k)  Insert the key k into the page p
 | 
						|
    add_ptr(p,q)  Insert a pointer to child page q in parent page p
 | 
						|
    write(p)      Write out page p to disk at the end of the index file.
 | 
						|
 | 
						|
  GTID STATE   OPERATIONS
 | 
						|
    S0       alloc(P1) add_key(P1,S0)
 | 
						|
    D1         add_key(P1,D1)
 | 
						|
    D2         add_key(P1,D2)
 | 
						|
    D3       write(P1) alloc(P4) add_ptr(P4,P1)
 | 
						|
             alloc(P2) add_key(P2,D3) add_key(P4,S3)
 | 
						|
    D4         add_key(P2,D4)
 | 
						|
    D5         add_key(P2,D5)
 | 
						|
    D6       write(P2) add_ptr(P4,P2)
 | 
						|
             alloc(P3) add_key(P3,D6) add_key(P4,D4+D5+D6)
 | 
						|
    D7         add_key(P3,D7)
 | 
						|
    D8         add_key(P3,D8)
 | 
						|
    D9       write(P3) add_ptr(P4,P3)
 | 
						|
             alloc(P5) add_key(P5,D9)
 | 
						|
             write(P4) alloc(P8) add_ptr(P8,P4) alloc(P7) add_key(P8,S9)
 | 
						|
    D10        add_key(P5,D10)
 | 
						|
    D11      write(P5) add_ptr(P7,P5)
 | 
						|
             alloc(P6) add_key(P6,D11) add_key(P7,D10+D11)
 | 
						|
    <EOF>    write(P6) add_ptr(P7,P6)
 | 
						|
             write(P7) add_ptr(P8,P7)
 | 
						|
             write(P8)
 | 
						|
 | 
						|
  After adding each record to the index, there is exactly one partial page
 | 
						|
  allocated in-memory for each level present in the B-Tree; new pages being
 | 
						|
  allocated as old pages fill up and are written to disk.
 | 
						|
*/
 | 
						|
 | 
						|
 | 
						|
class Gtid_index_base
 | 
						|
{
 | 
						|
public:
 | 
						|
  /* +4 for ".idx" prefix. */
 | 
						|
  static constexpr size_t GTID_INDEX_FILENAME_MAX_SIZE= FN_REFLEN+4;
 | 
						|
 | 
						|
protected:
 | 
						|
  enum enum_page_flags {
 | 
						|
    /* Set for a leaf node page, cleared for an interior node page. */
 | 
						|
    PAGE_FLAG_IS_LEAF= 1,
 | 
						|
    /* This is a continuation page. */
 | 
						|
    PAGE_FLAG_IS_CONT= 2,
 | 
						|
    /* No continuation page follows (the last page in a group). */
 | 
						|
    PAGE_FLAG_LAST= 4,
 | 
						|
    /*
 | 
						|
      Flag set to mark the root node. (The root node is normally the last page
 | 
						|
      in the index file, but having an explicit flag allows us to detect a
 | 
						|
      partially written index file with the root node missing.
 | 
						|
    */
 | 
						|
    PAGE_FLAG_ROOT= 8,
 | 
						|
  };
 | 
						|
 | 
						|
  /*
 | 
						|
    Minor version increment represents a backwards-compatible format (can be
 | 
						|
    read by any server version that knows the format of the major version).
 | 
						|
    Major version increment means a server should not attempt to read from the
 | 
						|
    index.
 | 
						|
  */
 | 
						|
  static constexpr uchar GTID_INDEX_VERSION_MAJOR= 1;
 | 
						|
  static constexpr uchar GTID_INDEX_VERSION_MINOR= 0;
 | 
						|
  static constexpr size_t GTID_INDEX_FILE_HEADER_SIZE= 12;
 | 
						|
  static constexpr size_t GTID_INDEX_PAGE_HEADER_SIZE= 4;
 | 
						|
  static constexpr size_t CHECKSUM_LEN= 4;
 | 
						|
 | 
						|
#ifdef _MSC_VER
 | 
						|
/*
 | 
						|
  Flexible array member is part of C99, but it is not standard in C++.
 | 
						|
  All the compilers and platforms we support do support it, though.
 | 
						|
  Just we need to disable on Windows a warning about using a non-standard
 | 
						|
  C++ extension.
 | 
						|
*/
 | 
						|
#pragma warning(disable : 4200)
 | 
						|
#endif
 | 
						|
  struct Node_page
 | 
						|
  {
 | 
						|
    Node_page *next;
 | 
						|
    /* Pointer to allow to update the "flags" byte at page writeout. */
 | 
						|
    uchar *flag_ptr;
 | 
						|
    /* Flexible array member; will be allocated to opt_gtid_index_page_size. */
 | 
						|
    uchar page[];
 | 
						|
  };
 | 
						|
 | 
						|
  struct Index_node_base
 | 
						|
  {
 | 
						|
    Node_page *first_page;
 | 
						|
    Node_page *current_page;
 | 
						|
    /* The current_ptr is only valid if current_page != 0. */
 | 
						|
    uchar *current_ptr;
 | 
						|
 | 
						|
    Index_node_base();
 | 
						|
    ~Index_node_base();
 | 
						|
    void free_pages();
 | 
						|
    void reset();
 | 
						|
  };
 | 
						|
 | 
						|
public:
 | 
						|
  static void make_gtid_index_file_name(char *out_name, size_t bufsize,
 | 
						|
                                        const char *base_filename);
 | 
						|
 | 
						|
protected:
 | 
						|
  int update_gtid_state(rpl_binlog_state_base *state,
 | 
						|
                        const rpl_gtid *gtid_list, uint32 gtid_count);
 | 
						|
  Node_page *alloc_page();
 | 
						|
  rpl_gtid *gtid_list_buffer(uint32 count);
 | 
						|
  void build_index_filename(const char *filename);
 | 
						|
  virtual int give_error(const char *msg) = 0;
 | 
						|
 | 
						|
  /*
 | 
						|
    A buffer to hold a gtid_list temporarily.
 | 
						|
    Increased as needed to hold largest needed list.
 | 
						|
  */
 | 
						|
  rpl_gtid *gtid_buffer;
 | 
						|
  uint32 gtid_buffer_alloc;
 | 
						|
  size_t page_size;
 | 
						|
public:
 | 
						|
  char index_file_name[GTID_INDEX_FILENAME_MAX_SIZE];
 | 
						|
 | 
						|
protected:
 | 
						|
  Gtid_index_base();
 | 
						|
  virtual ~Gtid_index_base();
 | 
						|
};
 | 
						|
 | 
						|
 | 
						|
class Gtid_index_writer : public Gtid_index_base
 | 
						|
{
 | 
						|
private:
 | 
						|
  const my_off_t offset_min_threshold;
 | 
						|
 | 
						|
  struct Index_node : public Index_node_base
 | 
						|
  {
 | 
						|
    rpl_binlog_state_base state;
 | 
						|
    uint32 num_records;
 | 
						|
    uint32 level;
 | 
						|
    bool force_spill_page;
 | 
						|
 | 
						|
    Index_node(uint32 level_);
 | 
						|
    ~Index_node();
 | 
						|
    void reset();
 | 
						|
  };
 | 
						|
 | 
						|
public:
 | 
						|
  static void gtid_index_init();
 | 
						|
  static void gtid_index_cleanup();
 | 
						|
protected:
 | 
						|
  friend class Gtid_index_reader_hot;
 | 
						|
  static void lock_gtid_index() { mysql_mutex_lock(>id_index_mutex); }
 | 
						|
  static void unlock_gtid_index() { mysql_mutex_unlock(>id_index_mutex); }
 | 
						|
  static const Gtid_index_writer *find_hot_index(const char *file_name);
 | 
						|
 | 
						|
public:
 | 
						|
  Gtid_index_writer(const char *filename, uint32 offset,
 | 
						|
                    rpl_binlog_state_base *binlog_state,
 | 
						|
                    uint32 opt_page_size, my_off_t opt_span_min);
 | 
						|
  virtual ~Gtid_index_writer();
 | 
						|
  void process_gtid(uint32 offset, const rpl_gtid *gtid);
 | 
						|
  int process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
 | 
						|
                               rpl_gtid **out_gtid_list,
 | 
						|
                               uint32 *out_gtid_count);
 | 
						|
  int async_update(uint32 event_offset, rpl_gtid *gtid_list, uint32 gtid_count);
 | 
						|
  void close();
 | 
						|
 | 
						|
private:
 | 
						|
  void insert_in_hot_index();
 | 
						|
  void remove_from_hot_index();
 | 
						|
  uint32 write_current_node(uint32 level, bool is_root);
 | 
						|
  int reserve_space(Index_node *n, size_t bytes);
 | 
						|
  int do_write_record(uint32 level, uint32 event_offset,
 | 
						|
                      const rpl_gtid *gtid_list, uint32 gtid_count);
 | 
						|
  int add_child_ptr(uint32 level, my_off_t node_offset);
 | 
						|
  int write_record(uint32 event_offset, const rpl_gtid *gtid_list,
 | 
						|
                   uint32 gtid_count);
 | 
						|
  bool check_room(uint32 level, uint32 gtid_count);
 | 
						|
  int alloc_level_if_missing(uint32 level);
 | 
						|
  uchar *init_header(Node_page *page, bool is_leaf, bool is_first);
 | 
						|
  int give_error(const char *msg) override;
 | 
						|
 | 
						|
  static mysql_mutex_t gtid_index_mutex;
 | 
						|
  static Gtid_index_writer *hot_index_list;
 | 
						|
 | 
						|
  rpl_binlog_state_base pending_state;
 | 
						|
  /* Next pointer for the hot_index_list linked list. */
 | 
						|
  Gtid_index_writer *next_hot_index;
 | 
						|
  /* The currently being built index nodes, from leaf[0] to root[max_level]. */
 | 
						|
  Index_node **nodes;
 | 
						|
  my_off_t previous_offset;
 | 
						|
  uint32 max_level;
 | 
						|
 | 
						|
  File index_file;
 | 
						|
 | 
						|
  /*
 | 
						|
    This is set if we encounter an error (such as out-of-memory or I/O error).
 | 
						|
    Then we will no longer do any updates to the index, to prevent leaving a
 | 
						|
    corrupt index. This is not fatal; the partial index will work up to where
 | 
						|
    it got the error, and the code can fall-back to sequential scan of the
 | 
						|
    binlog.
 | 
						|
  */
 | 
						|
  bool error_state;
 | 
						|
  /* Flag to help put the file header at the start of the very first page. */
 | 
						|
  bool file_header_written;
 | 
						|
  /* Flag set while this object is visible in the "hot index" list. */
 | 
						|
  bool in_hot_index_list;
 | 
						|
};
 | 
						|
 | 
						|
 | 
						|
class Gtid_index_reader : public Gtid_index_base
 | 
						|
{
 | 
						|
public:
 | 
						|
  Gtid_index_reader();
 | 
						|
  virtual ~Gtid_index_reader();
 | 
						|
 | 
						|
  int open_index_file(const char *binlog_filename);
 | 
						|
  void close_index_file();
 | 
						|
  /*
 | 
						|
    The search functions take either a binlog offset or GTID position to search
 | 
						|
    for. They return:
 | 
						|
      0   for "not found" (searched position is earlier than start of index).
 | 
						|
      1   for "found"
 | 
						|
     -1   for error.
 | 
						|
    When found, the returned position is the last position in the index that
 | 
						|
    lies at or before the searched position. The offset of the returned
 | 
						|
    position is written to *out_offset. The number of GTIDs in the returned
 | 
						|
    GTID state is written to *out_gtid_count; the list of found GTIDs can be
 | 
						|
    accessed with search_gtid_list() and is valid only until next search or
 | 
						|
    freeing of the Gtid_index_reader object.
 | 
						|
  */
 | 
						|
  int search_offset(uint32 in_offset, uint32 *out_offset,
 | 
						|
                    uint32 *out_gtid_count);
 | 
						|
  int search_gtid_pos(slave_connection_state *in_gtid_pos, uint32 *out_offset,
 | 
						|
                      uint32 *out_gtid_count);
 | 
						|
  rpl_gtid *search_gtid_list();
 | 
						|
 | 
						|
protected:
 | 
						|
  int search_cmp_offset(uint32 offset, rpl_binlog_state_base *state);
 | 
						|
  int search_cmp_gtid_pos(uint32 offset, rpl_binlog_state_base *state);
 | 
						|
  virtual int do_index_search(uint32 *out_offset, uint32 *out_gtid_count);
 | 
						|
  int do_index_search_root(uint32 *out_offset, uint32 *out_gtid_count);
 | 
						|
  int do_index_search_leaf(bool current_state_updated,
 | 
						|
                           uint32 *out_offset, uint32 *out_gtid_count);
 | 
						|
  int next_page();
 | 
						|
  int find_bytes(uint32 num_bytes);
 | 
						|
  virtual int get_child_ptr(uint32 *out_child_ptr);
 | 
						|
  int get_offset_count(uint32 *out_offset, uint32 *out_gtid_count);
 | 
						|
  int get_gtid_list(rpl_gtid *out_gtid_list, uint32 count);
 | 
						|
  virtual int read_file_header();
 | 
						|
  int verify_checksum(Node_page *page);
 | 
						|
  Node_page *alloc_and_read_page();
 | 
						|
  virtual int read_root_node();
 | 
						|
  virtual int read_node(uint32 page_ptr);
 | 
						|
  int read_node_cold(uint32 page_ptr);
 | 
						|
  int give_error(const char *msg) override;
 | 
						|
 | 
						|
  rpl_binlog_state_base current_state;
 | 
						|
  rpl_binlog_state_base compare_state;
 | 
						|
  Index_node_base cold_node;
 | 
						|
  /* n points to either cold node or hot node in writer. */
 | 
						|
  Index_node_base *n;
 | 
						|
  int (Gtid_index_reader::* search_cmp_function)(uint32, rpl_binlog_state_base *);
 | 
						|
  slave_connection_state *in_search_gtid_pos;
 | 
						|
  Node_page *read_page;
 | 
						|
  uchar *read_ptr;
 | 
						|
  File index_file;
 | 
						|
  uint32 current_offset;
 | 
						|
  uint32 in_search_offset;
 | 
						|
  bool file_open;
 | 
						|
  bool index_valid;
 | 
						|
  bool has_root_node;
 | 
						|
  uchar version_major;
 | 
						|
  uchar version_minor;
 | 
						|
};
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
   Sub-class of Gtid_index_reader that can additionally access in-memory "hot"
 | 
						|
   pages of the index, which are partially filled pages of the current binlog
 | 
						|
   file, not yet written to disk.
 | 
						|
*/
 | 
						|
class Gtid_index_reader_hot : public Gtid_index_reader
 | 
						|
{
 | 
						|
public:
 | 
						|
  Gtid_index_reader_hot();
 | 
						|
  virtual ~Gtid_index_reader_hot() { }
 | 
						|
 | 
						|
private:
 | 
						|
  int do_index_search(uint32 *out_offset, uint32 *out_gtid_count) override;
 | 
						|
  int get_child_ptr(uint32 *out_child_ptr) override;
 | 
						|
  int read_file_header() override;
 | 
						|
  int read_root_node() override;
 | 
						|
  int read_node(uint32 page_ptr) override;
 | 
						|
  int read_node_hot();
 | 
						|
 | 
						|
  /* Pointer to the writer object, if we're reading a hot index. */
 | 
						|
  const Gtid_index_writer *hot_writer;
 | 
						|
  /* The level we are currently reading in the hot writer .*/
 | 
						|
  uint32 hot_level;
 | 
						|
};
 | 
						|
 | 
						|
#endif  /* GTID_INDEX_H */
 |