mirror of
				https://github.com/MariaDB/server.git
				synced 2025-10-26 16:38:11 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			1639 lines
		
	
	
	
		
			59 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1639 lines
		
	
	
	
		
			59 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|    Copyright (c) 2012,2013 Monty Program Ab
 | |
| 
 | |
|    This program is free software; you can redistribute it and/or modify
 | |
|    it under the terms of the GNU General Public License as published by
 | |
|    the Free Software Foundation; version 2 of the License.
 | |
| 
 | |
|    This program is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU General Public License for more details.
 | |
| 
 | |
|    You should have received a copy of the GNU General Public License
 | |
|    along with this program; if not, write to the Free Software
 | |
|    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
 | |
| #pragma once
 | |
| 
 | |
| /* C++ standard header files */
 | |
| #include <cstdlib>
 | |
| #include <algorithm>
 | |
| #include <atomic>
 | |
| #include <map>
 | |
| #include <mutex>
 | |
| #include <string>
 | |
| #include <unordered_map>
 | |
| #include <unordered_set>
 | |
| #include <utility>
 | |
| #include <vector>
 | |
| #include <array>
 | |
| 
 | |
| /* C standard header files */
 | |
| #ifndef _WIN32
 | |
| #include <arpa/inet.h>
 | |
| #endif
 | |
| 
 | |
| /* MyRocks header files */
 | |
| #include "./ha_rocksdb.h"
 | |
| #include "./properties_collector.h"
 | |
| #include "./rdb_buff.h"
 | |
| #include "./rdb_utils.h"
 | |
| 
 | |
| namespace myrocks {
 | |
| 
 | |
| class Rdb_dict_manager;
 | |
| class Rdb_key_def;
 | |
| class Rdb_field_packing;
 | |
| class Rdb_cf_manager;
 | |
| class Rdb_ddl_manager;
 | |
| 
 | |
| const uint32_t GTID_BUF_LEN = 60;
 | |
| 
 | |
| class Rdb_convert_to_record_key_decoder {
 | |
|  public:
 | |
|   Rdb_convert_to_record_key_decoder() = default;
 | |
|   Rdb_convert_to_record_key_decoder(
 | |
|       const Rdb_convert_to_record_key_decoder &decoder) = delete;
 | |
|   Rdb_convert_to_record_key_decoder &operator=(
 | |
|       const Rdb_convert_to_record_key_decoder &decoder) = delete;
 | |
|   static int decode(uchar *const buf, uint *offset, Rdb_field_packing *fpi,
 | |
|                     TABLE *table, Field *field, bool has_unpack_info,
 | |
|                     Rdb_string_reader *reader,
 | |
|                     Rdb_string_reader *unpack_reader);
 | |
|   static int skip(const Rdb_field_packing *fpi, const Field *field,
 | |
|                   Rdb_string_reader *reader, Rdb_string_reader *unpack_reader);
 | |
| 
 | |
|  private:
 | |
|   static int decode_field(Rdb_field_packing *fpi, Field *field,
 | |
|                           Rdb_string_reader *reader,
 | |
|                           const uchar *const default_value,
 | |
|                           Rdb_string_reader *unpack_reader);
 | |
| };
 | |
| 
 | |
| /*
 | |
|   @brief
 | |
|   Field packing context.
 | |
|   The idea is to ensure that a call to rdb_index_field_pack_t function
 | |
|   is followed by a call to rdb_make_unpack_info_t.
 | |
| 
 | |
|   @detail
 | |
|   For some datatypes, unpack_info is produced as a side effect of
 | |
|   rdb_index_field_pack_t function call.
 | |
|   For other datatypes, packing is just calling make_sort_key(), while
 | |
|   rdb_make_unpack_info_t is a custom function.
 | |
|   In order to accommodate both cases, we require both calls to be made and
 | |
|   unpack_info is passed as context data between the two.
 | |
| */
 | |
| class Rdb_pack_field_context {
 | |
|  public:
 | |
|   Rdb_pack_field_context(const Rdb_pack_field_context &) = delete;
 | |
|   Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete;
 | |
| 
 | |
|   explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg)
 | |
|       : writer(writer_arg) {}
 | |
| 
 | |
|   // NULL means we're not producing unpack_info.
 | |
|   Rdb_string_writer *writer;
 | |
| };
 | |
| 
 | |
| class Rdb_key_field_iterator {
 | |
|  private:
 | |
|   Rdb_field_packing *m_pack_info;
 | |
|   int m_iter_index;
 | |
|   int m_iter_end;
 | |
|   TABLE *m_table;
 | |
|   Rdb_string_reader *m_reader;
 | |
|   Rdb_string_reader *m_unp_reader;
 | |
|   uint m_curr_bitmap_pos;
 | |
|   const MY_BITMAP *m_covered_bitmap;
 | |
|   uchar *m_buf;
 | |
|   bool m_has_unpack_info;
 | |
|   const Rdb_key_def *m_key_def;
 | |
|   bool m_secondary_key;
 | |
|   bool m_hidden_pk_exists;
 | |
|   bool m_is_hidden_pk;
 | |
|   bool m_is_null;
 | |
|   Field *m_field;
 | |
|   uint m_offset;
 | |
|   Rdb_field_packing *m_fpi;
 | |
| 
 | |
|  public:
 | |
|   Rdb_key_field_iterator(const Rdb_key_field_iterator &) = delete;
 | |
|   Rdb_key_field_iterator &operator=(const Rdb_key_field_iterator &) = delete;
 | |
|   Rdb_key_field_iterator(const Rdb_key_def *key_def,
 | |
|                          Rdb_field_packing *pack_info,
 | |
|                          Rdb_string_reader *reader,
 | |
|                          Rdb_string_reader *unp_reader, TABLE *table,
 | |
|                          bool has_unpack_info, const MY_BITMAP *covered_bitmap,
 | |
|                          uchar *buf);
 | |
| 
 | |
|   int next();
 | |
|   bool has_next();
 | |
|   bool get_is_null() const;
 | |
|   Field *get_field() const;
 | |
|   int get_field_index() const;
 | |
|   void *get_dst() const;
 | |
| };
 | |
| 
 | |
| struct Rdb_collation_codec;
 | |
| struct Rdb_index_info;
 | |
| 
 | |
| /*
 | |
|   C-style "virtual table" allowing different handling of packing logic based
 | |
|   on the field type. See Rdb_field_packing::setup() implementation.
 | |
|   */
 | |
| using rdb_make_unpack_info_t = void (*)(const Rdb_collation_codec *codec,
 | |
|                                         const Field *field,
 | |
|                                         Rdb_pack_field_context *pack_ctx);
 | |
| using rdb_index_field_unpack_t = int (*)(Rdb_field_packing *fpi, Field *field,
 | |
|                                          uchar *field_ptr,
 | |
|                                          Rdb_string_reader *reader,
 | |
|                                          Rdb_string_reader *unpack_reader);
 | |
| using rdb_index_field_skip_t = int (*)(const Rdb_field_packing *fpi,
 | |
|                                        const Field *field,
 | |
|                                        Rdb_string_reader *reader);
 | |
| using rdb_index_field_pack_t = void (*)(Rdb_field_packing *fpi, Field *field,
 | |
|                                         uchar *buf, uchar **dst,
 | |
|                                         Rdb_pack_field_context *pack_ctx);
 | |
| 
 | |
| const uint RDB_INVALID_KEY_LEN = uint(-1);
 | |
| 
 | |
| /* How much one checksum occupies when stored in the record */
 | |
| const size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);
 | |
| 
 | |
| /*
 | |
|   How much the checksum data occupies in record, in total.
 | |
|   It is storing two checksums plus 1 tag-byte.
 | |
| */
 | |
| const size_t RDB_CHECKSUM_CHUNK_SIZE = 2 * RDB_CHECKSUM_SIZE + 1;
 | |
| 
 | |
| /*
 | |
|   Checksum data starts from CHECKSUM_DATA_TAG which is followed by two CRC32
 | |
|   checksums.
 | |
| */
 | |
| const char RDB_CHECKSUM_DATA_TAG = 0x01;
 | |
| 
 | |
| /*
 | |
|   Unpack data is variable length. The header is 1 tag-byte plus a two byte
 | |
|   length field. The length field includes the header as well.
 | |
| */
 | |
| const char RDB_UNPACK_DATA_TAG = 0x02;
 | |
| const size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
 | |
| const size_t RDB_UNPACK_HEADER_SIZE =
 | |
|     sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;
 | |
| 
 | |
| /*
 | |
|   This header format is 1 tag-byte plus a two byte length field plus a two byte
 | |
|   covered bitmap. The length field includes the header size.
 | |
| */
 | |
| const char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
 | |
| const size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
 | |
| const size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
 | |
| const size_t RDB_UNPACK_COVERED_HEADER_SIZE =
 | |
|     sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
 | |
|     RDB_COVERED_BITMAP_SIZE;
 | |
| 
 | |
| /*
 | |
|   Data dictionary index info field sizes.
 | |
| */
 | |
| const size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16);
 | |
| const size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar);
 | |
| const size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16);
 | |
| const size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32);
 | |
| const size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16);
 | |
| 
 | |
| // Possible return values for rdb_index_field_unpack_t functions.
 | |
| enum {
 | |
|   UNPACK_SUCCESS = 0,
 | |
|   UNPACK_FAILURE = 1,
 | |
| };
 | |
| 
 | |
| /*
 | |
|   An object of this class represents information about an index in an SQL
 | |
|   table. It provides services to encode and decode index tuples.
 | |
| 
 | |
|   Note: a table (as in, on-disk table) has a single Rdb_key_def object which
 | |
|   is shared across multiple TABLE* objects and may be used simultaneously from
 | |
|   different threads.
 | |
| 
 | |
|   There are several data encodings:
 | |
| 
 | |
|   === SQL LAYER ===
 | |
|   SQL layer uses two encodings:
 | |
| 
 | |
|   - "Table->record format". This is the format that is used for the data in
 | |
|      the record buffers, table->record[i]
 | |
| 
 | |
|   - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index
 | |
|     lookup functions, like handler::index_read_map().
 | |
| 
 | |
|   === Inside RocksDB ===
 | |
|   Primary Key is stored as a mapping:
 | |
| 
 | |
|     index_tuple -> StoredRecord
 | |
| 
 | |
|   StoredRecord is in Table->record format, except for blobs, which are stored
 | |
|   in-place. See ha_rocksdb::convert_record_to_storage_format for details.
 | |
| 
 | |
|   Secondary indexes are stored as one of two variants:
 | |
| 
 | |
|     index_tuple -> unpack_info
 | |
|     index_tuple -> empty_string
 | |
| 
 | |
|   index_tuple here is the form of key that can be compared with memcmp(), aka
 | |
|   "mem-comparable form".
 | |
| 
 | |
|   unpack_info is extra data that allows restoring the original value from its
 | |
|   mem-comparable form. It is present only if the index supports index-only
 | |
|   reads.
 | |
| */
 | |
| 
 | |
| class Rdb_key_def {
 | |
|  public:
 | |
|   /* Convert a key from KeyTupleFormat to mem-comparable form */
 | |
|   uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
 | |
|                         uchar *const packed_tuple, uchar *const record_buffer,
 | |
|                         const uchar *const key_tuple,
 | |
|                         const key_part_map &keypart_map) const;
 | |
| 
 | |
|   uchar *pack_field(Field *const field, Rdb_field_packing *pack_info,
 | |
|                     uchar *tuple, uchar *const packed_tuple,
 | |
|                     uchar *const pack_buffer,
 | |
|                     Rdb_string_writer *const unpack_info,
 | |
|                     uint *const n_null_fields) const;
 | |
|   /* Convert a key from Table->record format to mem-comparable form */
 | |
|   uint pack_record(const TABLE *const tbl, uchar *const pack_buffer,
 | |
|                    const uchar *const record, uchar *const packed_tuple,
 | |
|                    Rdb_string_writer *const unpack_info,
 | |
|                    const bool should_store_row_debug_checksums,
 | |
|                    const longlong hidden_pk_id = 0, uint n_key_parts = 0,
 | |
|                    uint *const n_null_fields = nullptr,
 | |
|                    const char *const ttl_bytes = nullptr) const;
 | |
|   /* Pack the hidden primary key into mem-comparable form. */
 | |
|   uint pack_hidden_pk(const longlong hidden_pk_id,
 | |
|                       uchar *const packed_tuple) const;
 | |
|   int unpack_record(TABLE *const table, uchar *const buf,
 | |
|                     const rocksdb::Slice *const packed_key,
 | |
|                     const rocksdb::Slice *const unpack_info,
 | |
|                     const bool verify_row_debug_checksums) const;
 | |
| 
 | |
|   static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info);
 | |
|   int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2,
 | |
|                    std::size_t *const column_index) const;
 | |
| 
 | |
|   size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const;
 | |
| 
 | |
|   /* Get the key that is the "infimum" for this index */
 | |
|   inline void get_infimum_key(uchar *const key, uint *const size) const {
 | |
|     rdb_netbuf_store_index(key, m_index_number);
 | |
|     *size = INDEX_NUMBER_SIZE;
 | |
|   }
 | |
| 
 | |
|   /* Get the key that is a "supremum" for this index */
 | |
|   inline void get_supremum_key(uchar *const key, uint *const size) const {
 | |
|     rdb_netbuf_store_index(key, m_index_number + 1);
 | |
|     *size = INDEX_NUMBER_SIZE;
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|     Get the first key that you need to position at to start iterating.
 | |
|     Stores into *key a "supremum" or "infimum" key value for the index.
 | |
|     @parameters key    OUT  Big Endian, value is m_index_number or
 | |
|                             m_index_number + 1
 | |
|     @parameters size   OUT  key size, value is INDEX_NUMBER_SIZE
 | |
|     @return Number of bytes in the key that are usable for bloom filter use.
 | |
|   */
 | |
|   inline int get_first_key(uchar *const key, uint *const size) const {
 | |
|     if (m_is_reverse_cf) {
 | |
|       get_supremum_key(key, size);
 | |
|       /* Find out how many bytes of infimum are the same as m_index_number */
 | |
|       uchar unmodified_key[INDEX_NUMBER_SIZE];
 | |
|       rdb_netbuf_store_index(unmodified_key, m_index_number);
 | |
|       int i;
 | |
|       for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
 | |
|         if (key[i] != unmodified_key[i]) {
 | |
|           break;
 | |
|         }
 | |
|       }
 | |
|       return i;
 | |
|     } else {
 | |
|       get_infimum_key(key, size);
 | |
|       // For infimum key, its value will be m_index_number
 | |
|       // Thus return its own size instead.
 | |
|       return INDEX_NUMBER_SIZE;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|     The same as get_first_key, but get the key for the last entry in the index
 | |
|     @parameters key    OUT  Big Endian, value is m_index_number or
 | |
|                             m_index_number + 1
 | |
|     @parameters size   OUT  key size, value is INDEX_NUMBER_SIZE
 | |
| 
 | |
|     @return Number of bytes in the key that are usable for bloom filter use.
 | |
|   */
 | |
|   inline int get_last_key(uchar *const key, uint *const size) const {
 | |
|     if (m_is_reverse_cf) {
 | |
|       get_infimum_key(key, size);
 | |
|       // For infimum key, its value will be m_index_number
 | |
|       // Thus return its own size instead.
 | |
|       return INDEX_NUMBER_SIZE;
 | |
|     } else {
 | |
|       get_supremum_key(key, size);
 | |
|       /* Find out how many bytes are the same as m_index_number */
 | |
|       uchar unmodified_key[INDEX_NUMBER_SIZE];
 | |
|       rdb_netbuf_store_index(unmodified_key, m_index_number);
 | |
|       int i;
 | |
|       for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
 | |
|         if (key[i] != unmodified_key[i]) {
 | |
|           break;
 | |
|         }
 | |
|       }
 | |
|       return i;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   /* Make a key that is right after the given key. */
 | |
|   static int successor(uchar *const packed_tuple, const uint len);
 | |
| 
 | |
|   /* Make a key that is right before the given key. */
 | |
|   static int predecessor(uchar *const packed_tuple, const uint len);
 | |
| 
 | |
|   /*
 | |
|     This can be used to compare prefixes.
 | |
|     if  X is a prefix of Y, then we consider that X = Y.
 | |
|   */
 | |
|   // b describes the lookup key, which can be a prefix of a.
 | |
|   // b might be outside of the index_number range, if successor() is called.
 | |
|   int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const {
 | |
|     DBUG_ASSERT(covers_key(a));
 | |
| 
 | |
|     return memcmp(a.data(), b.data(), std::min(a.size(), b.size()));
 | |
|   }
 | |
| 
 | |
|   /* Check if given mem-comparable key belongs to this index */
 | |
|   bool covers_key(const rocksdb::Slice &slice) const {
 | |
|     if (slice.size() < INDEX_NUMBER_SIZE) return false;
 | |
| 
 | |
|     if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE)) {
 | |
|       return false;
 | |
|     }
 | |
| 
 | |
|     return true;
 | |
|   }
 | |
| 
 | |
|   void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const;
 | |
| 
 | |
|   bool covers_lookup(const rocksdb::Slice *const unpack_info,
 | |
|                      const MY_BITMAP *const map) const;
 | |
| 
 | |
|   inline bool use_covered_bitmap_format() const {
 | |
|     return m_index_type == INDEX_TYPE_SECONDARY &&
 | |
|            m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3;
 | |
|   }
 | |
| 
 | |
|   /* Indicates that all key parts can be unpacked to cover a secondary lookup */
 | |
|   bool can_cover_lookup() const;
 | |
| 
 | |
|   /*
 | |
|     Return true if the passed mem-comparable key
 | |
|     - is from this index, and
 | |
|     - it matches the passed key prefix (the prefix is also in mem-comparable
 | |
|       form)
 | |
|   */
 | |
|   bool value_matches_prefix(const rocksdb::Slice &value,
 | |
|                             const rocksdb::Slice &prefix) const {
 | |
|     return covers_key(value) && !cmp_full_keys(value, prefix);
 | |
|   }
 | |
| 
 | |
|   uint32 get_keyno() const { return m_keyno; }
 | |
| 
 | |
|   uint32 get_index_number() const { return m_index_number; }
 | |
| 
 | |
|   GL_INDEX_ID get_gl_index_id() const {
 | |
|     const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number};
 | |
|     return gl_index_id;
 | |
|   }
 | |
| 
 | |
|   int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader,
 | |
|                            const uint part_num) const;
 | |
| 
 | |
|   /* Must only be called for secondary keys: */
 | |
|   uint get_primary_key_tuple(const TABLE *const tbl,
 | |
|                              const Rdb_key_def &pk_descr,
 | |
|                              const rocksdb::Slice *const key,
 | |
|                              uchar *const pk_buffer) const;
 | |
| 
 | |
|   uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key,
 | |
|                            uchar *sk_buffer, uint *n_null_fields) const;
 | |
| 
 | |
|   /* Return max length of mem-comparable form */
 | |
|   uint max_storage_fmt_length() const { return m_maxlength; }
 | |
| 
 | |
|   uint get_key_parts() const { return m_key_parts; }
 | |
| 
 | |
|   uint get_ttl_field_index() const { return m_ttl_field_index; }
 | |
| 
 | |
|   /*
 | |
|     Get a field object for key part #part_no
 | |
| 
 | |
|     @detail
 | |
|       SQL layer thinks unique secondary indexes and indexes in partitioned
 | |
|       tables are not "Extended" with Primary Key columns.
 | |
| 
 | |
|       Internally, we always extend all indexes with PK columns. This function
 | |
|       uses our definition of how the index is Extended.
 | |
|   */
 | |
|   inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const;
 | |
| 
 | |
|   const std::string &get_name() const { return m_name; }
 | |
| 
 | |
|   const rocksdb::SliceTransform *get_extractor() const {
 | |
|     return m_prefix_extractor.get();
 | |
|   }
 | |
| 
 | |
|   static size_t get_unpack_header_size(char tag);
 | |
| 
 | |
|   Rdb_key_def &operator=(const Rdb_key_def &) = delete;
 | |
|   Rdb_key_def(const Rdb_key_def &k);
 | |
|   Rdb_key_def(uint indexnr_arg, uint keyno_arg,
 | |
|               rocksdb::ColumnFamilyHandle *cf_handle_arg,
 | |
|               uint16_t index_dict_version_arg, uchar index_type_arg,
 | |
|               uint16_t kv_format_version_arg, bool is_reverse_cf_arg,
 | |
|               bool is_per_partition_cf, const char *name,
 | |
|               Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0,
 | |
|               uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0);
 | |
|   ~Rdb_key_def();
 | |
| 
 | |
|   enum {
 | |
|     INDEX_NUMBER_SIZE = 4,
 | |
|     VERSION_SIZE = 2,
 | |
|     CF_NUMBER_SIZE = 4,
 | |
|     CF_FLAG_SIZE = 4,
 | |
|     PACKED_SIZE = 4,  // one int
 | |
|   };
 | |
| 
 | |
|   // bit flags for combining bools when writing to disk
 | |
|   enum {
 | |
|     REVERSE_CF_FLAG = 1,
 | |
|     AUTO_CF_FLAG = 2,  // Deprecated
 | |
|     PER_PARTITION_CF_FLAG = 4,
 | |
|   };
 | |
| 
 | |
|   // bit flags which denote myrocks specific fields stored in the record
 | |
|   // currently only used for TTL.
 | |
|   enum INDEX_FLAG {
 | |
|     TTL_FLAG = 1 << 0,
 | |
| 
 | |
|     // MAX_FLAG marks where the actual record starts
 | |
|     // This flag always needs to be set to the last index flag enum.
 | |
|     MAX_FLAG = TTL_FLAG << 1,
 | |
|   };
 | |
| 
 | |
|   // Set of flags to ignore when comparing two CF-s and determining if
 | |
|   // they're same.
 | |
|   static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG;
 | |
| 
 | |
|   // Data dictionary types
 | |
|   enum DATA_DICT_TYPE {
 | |
|     DDL_ENTRY_INDEX_START_NUMBER = 1,
 | |
|     INDEX_INFO = 2,
 | |
|     CF_DEFINITION = 3,
 | |
|     BINLOG_INFO_INDEX_NUMBER = 4,
 | |
|     DDL_DROP_INDEX_ONGOING = 5,
 | |
|     INDEX_STATISTICS = 6,
 | |
|     MAX_INDEX_ID = 7,
 | |
|     DDL_CREATE_INDEX_ONGOING = 8,
 | |
|     AUTO_INC = 9,
 | |
|     // MariaDB: 10 through 12 are already taken in upstream
 | |
|     TABLE_VERSION = 20, // MariaDB: table version record
 | |
|     END_DICT_INDEX_ID = 255
 | |
|   };
 | |
| 
 | |
|   // Data dictionary schema version. Introduce newer versions
 | |
|   // if changing schema layout
 | |
|   enum {
 | |
|     DDL_ENTRY_INDEX_VERSION = 1,
 | |
|     CF_DEFINITION_VERSION = 1,
 | |
|     BINLOG_INFO_INDEX_NUMBER_VERSION = 1,
 | |
|     DDL_DROP_INDEX_ONGOING_VERSION = 1,
 | |
|     MAX_INDEX_ID_VERSION = 1,
 | |
|     DDL_CREATE_INDEX_ONGOING_VERSION = 1,
 | |
|     AUTO_INCREMENT_VERSION = 1,
 | |
|     // Version for index stats is stored in IndexStats struct
 | |
|   };
 | |
| 
 | |
|   // Index info version.  Introduce newer versions when changing the
 | |
|   // INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the
 | |
|   // latest version number.
 | |
|   enum {
 | |
|     INDEX_INFO_VERSION_INITIAL = 1,  // Obsolete
 | |
|     INDEX_INFO_VERSION_KV_FORMAT,
 | |
|     INDEX_INFO_VERSION_GLOBAL_ID,
 | |
|     // There is no change to data format in this version, but this version
 | |
|     // verifies KV format version, whereas previous versions do not. A version
 | |
|     // bump is needed to prevent older binaries from skipping the KV version
 | |
|     // check inadvertently.
 | |
|     INDEX_INFO_VERSION_VERIFY_KV_FORMAT,
 | |
|     // This changes the data format to include a 8 byte TTL duration for tables
 | |
|     INDEX_INFO_VERSION_TTL,
 | |
|     // This changes the data format to include a bitmap before the TTL duration
 | |
|     // which will indicate in the future whether TTL or other special fields
 | |
|     // are turned on or off.
 | |
|     INDEX_INFO_VERSION_FIELD_FLAGS,
 | |
|     // This normally points to the latest version (currently it does).
 | |
|     INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS,
 | |
|   };
 | |
| 
 | |
|   // MyRocks index types
 | |
|   enum {
 | |
|     INDEX_TYPE_PRIMARY = 1,
 | |
|     INDEX_TYPE_SECONDARY = 2,
 | |
|     INDEX_TYPE_HIDDEN_PRIMARY = 3,
 | |
|   };
 | |
| 
 | |
|   // Key/Value format version for each index type
 | |
|   enum {
 | |
|     PRIMARY_FORMAT_VERSION_INITIAL = 10,
 | |
|     // This change includes:
 | |
|     //  - For columns that can be unpacked with unpack_info, PK
 | |
|     //    stores the unpack_info.
 | |
|     //  - DECIMAL datatype is no longer stored in the row (because
 | |
|     //    it can be decoded from its mem-comparable form)
 | |
|     //  - VARCHAR-columns use endspace-padding.
 | |
|     PRIMARY_FORMAT_VERSION_UPDATE1 = 11,
 | |
|     // This change includes:
 | |
|     //  - Binary encoded variable length fields have a new format that avoids
 | |
|     //    an inefficiency where data that was a multiple of 8 bytes in length
 | |
|     //    had an extra 9 bytes of encoded data.
 | |
|     PRIMARY_FORMAT_VERSION_UPDATE2 = 12,
 | |
|     // This change includes support for TTL
 | |
|     //  - This means that when TTL is specified for the table an 8-byte TTL
 | |
|     //    field is prepended in front of each value.
 | |
|     PRIMARY_FORMAT_VERSION_TTL = 13,
 | |
|     PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL,
 | |
| 
 | |
|     SECONDARY_FORMAT_VERSION_INITIAL = 10,
 | |
|     // This changes the SK format to include unpack_info.
 | |
|     SECONDARY_FORMAT_VERSION_UPDATE1 = 11,
 | |
|     // This change includes:
 | |
|     //  - Binary encoded variable length fields have a new format that avoids
 | |
|     //    an inefficiency where data that was a multiple of 8 bytes in length
 | |
|     //    had an extra 9 bytes of encoded data.
 | |
|     SECONDARY_FORMAT_VERSION_UPDATE2 = 12,
 | |
|     // This change includes support for TTL
 | |
|     //  - This means that when TTL is specified for the table an 8-byte TTL
 | |
|     //    field is prepended in front of each value.
 | |
|     SECONDARY_FORMAT_VERSION_TTL = 13,
 | |
|     SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL,
 | |
|     // This change includes support for covering SK lookups for varchars.  A
 | |
|     // 2-byte bitmap is added after the tag-byte to unpack_info only for
 | |
|     // records which have covered varchar columns. Currently waiting before
 | |
|     // enabling in prod.
 | |
|     SECONDARY_FORMAT_VERSION_UPDATE3 = 65535,
 | |
|   };
 | |
| 
 | |
|   uint setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def);
 | |
| 
 | |
|   static uint extract_ttl_duration(const TABLE *const table_arg,
 | |
|                                    const Rdb_tbl_def *const tbl_def_arg,
 | |
|                                    uint64 *ttl_duration);
 | |
|   static uint extract_ttl_col(const TABLE *const table_arg,
 | |
|                               const Rdb_tbl_def *const tbl_def_arg,
 | |
|                               std::string *ttl_column, uint *ttl_field_index,
 | |
|                               bool skip_checks = false);
 | |
|   inline bool has_ttl() const { return m_ttl_duration > 0; }
 | |
| 
 | |
|   static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
 | |
|   static uint32 calculate_index_flag_offset(uint32 index_flags,
 | |
|                                             enum INDEX_FLAG flag,
 | |
|                                             uint *const field_length = nullptr);
 | |
|   void write_index_flag_field(Rdb_string_writer *const buf,
 | |
|                               const uchar *const val,
 | |
|                               enum INDEX_FLAG flag) const;
 | |
| 
 | |
|   static const std::string gen_qualifier_for_table(
 | |
|       const char *const qualifier, const std::string &partition_name = "");
 | |
|   static const std::string gen_cf_name_qualifier_for_partition(
 | |
|       const std::string &s);
 | |
|   static const std::string gen_ttl_duration_qualifier_for_partition(
 | |
|       const std::string &s);
 | |
|   static const std::string gen_ttl_col_qualifier_for_partition(
 | |
|       const std::string &s);
 | |
| 
 | |
|   static const std::string parse_comment_for_qualifier(
 | |
|       const std::string &comment, const TABLE *const table_arg,
 | |
|       const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
 | |
|       const char *const qualifier);
 | |
| 
 | |
|   rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle; }
 | |
| 
 | |
|   /* Check if keypart #kp can be unpacked from index tuple */
 | |
|   inline bool can_unpack(const uint kp) const;
 | |
|   /* Check if keypart #kp needs unpack info */
 | |
|   inline bool has_unpack_info(const uint kp) const;
 | |
| 
 | |
|   /* Check if given table has a hidden primary key */
 | |
|   static bool table_has_hidden_pk(const TABLE *const table);
 | |
| 
 | |
|   void report_checksum_mismatch(const bool is_key, const char *const data,
 | |
|                                 const size_t data_size) const;
 | |
| 
 | |
|   /* Check if index is at least pk_min if it is a PK,
 | |
|     or at least sk_min if SK.*/
 | |
|   bool index_format_min_check(const int pk_min, const int sk_min) const;
 | |
| 
 | |
|   static void pack_with_make_sort_key(
 | |
|       Rdb_field_packing *const fpi, Field *const field,
 | |
|       uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
 | |
|       Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static void pack_with_varchar_encoding(
 | |
|       Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
 | |
|       Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static void pack_with_varchar_space_pad(
 | |
|       Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
 | |
|       Rdb_pack_field_context *const pack_ctx);
 | |
| 
 | |
|   static int unpack_integer(Rdb_field_packing *const fpi, Field *const field,
 | |
|                             uchar *const to, Rdb_string_reader *const reader,
 | |
|                             Rdb_string_reader *const unp_reader
 | |
|                                 MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int unpack_double(
 | |
|       Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
 | |
|       Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
 | |
|       Rdb_string_reader *const reader,
 | |
|       Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int unpack_float(
 | |
|       Rdb_field_packing *const fpi,
 | |
|       Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
 | |
|       Rdb_string_reader *const reader,
 | |
|       Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int unpack_binary_str(Rdb_field_packing *const fpi, Field *const field,
 | |
|                                uchar *const to, Rdb_string_reader *const reader,
 | |
|                                Rdb_string_reader *const unp_reader
 | |
|                                    MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int unpack_binary_or_utf8_varchar(
 | |
|       Rdb_field_packing *const fpi, Field *const field, uchar *dst,
 | |
|       Rdb_string_reader *const reader,
 | |
|       Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int unpack_binary_or_utf8_varchar_space_pad(
 | |
|       Rdb_field_packing *const fpi, Field *const field, uchar *dst,
 | |
|       Rdb_string_reader *const reader, Rdb_string_reader *const unp_reader);
 | |
| 
 | |
|   static int unpack_newdate(
 | |
|       Rdb_field_packing *const fpi,
 | |
|       Field *const field MY_ATTRIBUTE((__unused__)), uchar *const field_ptr,
 | |
|       Rdb_string_reader *const reader,
 | |
|       Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int unpack_utf8_str(Rdb_field_packing *const fpi, Field *const field,
 | |
|                              uchar *dst, Rdb_string_reader *const reader,
 | |
|                              Rdb_string_reader *const unp_reader
 | |
|                                  MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int unpack_unknown_varchar(Rdb_field_packing *const fpi,
 | |
|                                     Field *const field, uchar *dst,
 | |
|                                     Rdb_string_reader *const reader,
 | |
|                                     Rdb_string_reader *const unp_reader);
 | |
| 
 | |
|   static int unpack_simple_varchar_space_pad(
 | |
|       Rdb_field_packing *const fpi, Field *const field, uchar *dst,
 | |
|       Rdb_string_reader *const reader, Rdb_string_reader *const unp_reader);
 | |
| 
 | |
|   static int unpack_simple(Rdb_field_packing *const fpi,
 | |
|                            Field *const field MY_ATTRIBUTE((__unused__)),
 | |
|                            uchar *const dst, Rdb_string_reader *const reader,
 | |
|                            Rdb_string_reader *const unp_reader);
 | |
| 
 | |
|   static int unpack_unknown(Rdb_field_packing *const fpi, Field *const field,
 | |
|                             uchar *const dst, Rdb_string_reader *const reader,
 | |
|                             Rdb_string_reader *const unp_reader);
 | |
| 
 | |
|   static int unpack_floating_point(uchar *const dst,
 | |
|                                    Rdb_string_reader *const reader,
 | |
|                                    const size_t size, const int exp_digit,
 | |
|                                    const uchar *const zero_pattern,
 | |
|                                    const uchar *const zero_val,
 | |
|                                    void (*swap_func)(uchar *, const uchar *));
 | |
| 
 | |
|   static void make_unpack_simple_varchar(
 | |
|       const Rdb_collation_codec *const codec, const Field *const field,
 | |
|       Rdb_pack_field_context *const pack_ctx);
 | |
| 
 | |
|   static void make_unpack_simple(const Rdb_collation_codec *const codec,
 | |
|                                  const Field *const field,
 | |
|                                  Rdb_pack_field_context *const pack_ctx);
 | |
| 
 | |
|   static void make_unpack_unknown(
 | |
|       const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
 | |
|       const Field *const field, Rdb_pack_field_context *const pack_ctx);
 | |
| 
 | |
|   static void make_unpack_unknown_varchar(
 | |
|       const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)),
 | |
|       const Field *const field, Rdb_pack_field_context *const pack_ctx);
 | |
| 
 | |
|   static void dummy_make_unpack_info(
 | |
|       const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
 | |
|       const Field *field MY_ATTRIBUTE((__unused__)),
 | |
|       Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__)));
 | |
| 
 | |
|   static int skip_max_length(const Rdb_field_packing *const fpi,
 | |
|                              const Field *const field
 | |
|                                  MY_ATTRIBUTE((__unused__)),
 | |
|                              Rdb_string_reader *const reader);
 | |
| 
 | |
|   static int skip_variable_length(const Rdb_field_packing *const fpi,
 | |
|                                   const Field *const field,
 | |
|                                   Rdb_string_reader *const reader);
 | |
| 
 | |
|   static int skip_variable_space_pad(const Rdb_field_packing *const fpi,
 | |
|                                      const Field *const field,
 | |
|                                      Rdb_string_reader *const reader);
 | |
| 
 | |
|   inline bool use_legacy_varbinary_format() const {
 | |
|     return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2,
 | |
|                                    SECONDARY_FORMAT_VERSION_UPDATE2);
 | |
|   }
 | |
| 
 | |
|   static inline bool is_unpack_data_tag(char c) {
 | |
|     return c == RDB_UNPACK_DATA_TAG || c == RDB_UNPACK_COVERED_DATA_TAG;
 | |
|   }
 | |
| 
 | |
|  private:
 | |
| #ifndef DBUG_OFF
 | |
|   inline bool is_storage_available(const int offset, const int needed) const {
 | |
|     const int storage_length = static_cast<int>(max_storage_fmt_length());
 | |
|     return (storage_length - offset) >= needed;
 | |
|   }
 | |
| #else
 | |
|   inline bool is_storage_available(const int &offset, const int &needed) const {
 | |
|     return 1;
 | |
|   }
 | |
| #endif  // DBUG_OFF
 | |
| 
 | |
|   /* Global number of this index (used as prefix in StorageFormat) */
 | |
|   const uint32 m_index_number;
 | |
| 
 | |
|   uchar m_index_number_storage_form[INDEX_NUMBER_SIZE];
 | |
| 
 | |
|   rocksdb::ColumnFamilyHandle *m_cf_handle;
 | |
| 
 | |
|   static void pack_legacy_variable_format(const uchar *src, size_t src_len,
 | |
|                                           uchar **dst);
 | |
| 
 | |
|   static void pack_variable_format(const uchar *src, size_t src_len,
 | |
|                                    uchar **dst);
 | |
| 
 | |
|   static uint calc_unpack_legacy_variable_format(uchar flag, bool *done);
 | |
| 
 | |
|   static uint calc_unpack_variable_format(uchar flag, bool *done);
 | |
| 
 | |
|  public:
 | |
|   uint16_t m_index_dict_version;
 | |
|   uchar m_index_type;
 | |
|   /* KV format version for the index id */
 | |
|   uint16_t m_kv_format_version;
 | |
|   /* If true, the column family stores data in the reverse order */
 | |
|   bool m_is_reverse_cf;
 | |
| 
 | |
|   /* If true, then column family is created per partition. */
 | |
|   bool m_is_per_partition_cf;
 | |
| 
 | |
|   std::string m_name;
 | |
|   mutable Rdb_index_stats m_stats;
 | |
| 
 | |
|   /*
 | |
|     Bitmap containing information about whether TTL or other special fields
 | |
|     are enabled for the given index.
 | |
|   */
 | |
|   uint32 m_index_flags_bitmap;
 | |
| 
 | |
|   /*
 | |
|     How much space in bytes the index flag fields occupy.
 | |
|   */
 | |
|   uint32 m_total_index_flags_length;
 | |
| 
 | |
|   /*
 | |
|     Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL)
 | |
|   */
 | |
|   uint32 m_ttl_rec_offset;
 | |
| 
 | |
|   /* Default TTL duration */
 | |
|   uint64 m_ttl_duration;
 | |
| 
 | |
|   /* TTL column (if defined by user, otherwise implicit TTL is used) */
 | |
|   std::string m_ttl_column;
 | |
| 
 | |
|  private:
 | |
|   /* Number of key parts in the primary key*/
 | |
|   uint m_pk_key_parts;
 | |
| 
 | |
|   /*
 | |
|      pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the
 | |
|      primary key.  Y==-1 means this column is not present in the primary key.
 | |
|   */
 | |
|   uint *m_pk_part_no;
 | |
| 
 | |
|   /* Array of index-part descriptors. */
 | |
|   Rdb_field_packing *m_pack_info;
 | |
| 
 | |
|   uint m_keyno; /* number of this index in the table */
 | |
| 
 | |
|   /*
 | |
|     Number of key parts in the index (including "index extension"). This is how
 | |
|     many elements are in the m_pack_info array.
 | |
|   */
 | |
|   uint m_key_parts;
 | |
| 
 | |
|   /*
 | |
|     If TTL column is part of the PK, offset of the column within pk.
 | |
|     Default is UINT_MAX to denote that TTL col is not part of PK.
 | |
|   */
 | |
|   uint m_ttl_pk_key_part_offset;
 | |
| 
 | |
|   /*
 | |
|     Index of the TTL column in table->s->fields, if it exists.
 | |
|     Default is UINT_MAX to denote that it does not exist.
 | |
|   */
 | |
|   uint m_ttl_field_index;
 | |
| 
 | |
|   /* Prefix extractor for the column family of the key definiton */
 | |
|   std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
 | |
| 
 | |
|   /* Maximum length of the mem-comparable form. */
 | |
|   uint m_maxlength;
 | |
| 
 | |
|   /* mutex to protect setup */
 | |
|   mysql_mutex_t m_mutex;
 | |
| };
 | |
| 
 | |
| // "Simple" collations (those specified in strings/ctype-simple.c) are simple
 | |
| // because their strnxfrm function maps one byte to one byte. However, the
 | |
| // mapping is not injective, so the inverse function will take in an extra
 | |
| // index parameter containing information to disambiguate what the original
 | |
| // character was.
 | |
| //
 | |
| // The m_enc* members are for encoding. Generally, we want encoding to be:
 | |
| //      src -> (dst, idx)
 | |
| //
 | |
| // Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us
 | |
| // idx.
 | |
| //
 | |
| // For the inverse, we have:
 | |
| //      (dst, idx) -> src
 | |
| //
 | |
| // We have m_dec_idx[idx][dst] = src to get our original character back.
 | |
| //
 | |
| struct Rdb_collation_codec {
 | |
|   const my_core::CHARSET_INFO *m_cs;
 | |
|   // The first element unpacks VARCHAR(n), the second one - CHAR(n).
 | |
|   std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func;
 | |
|   std::array<rdb_index_field_unpack_t, 2> m_unpack_func;
 | |
| 
 | |
|   std::array<uchar, 256> m_enc_idx;
 | |
|   std::array<uchar, 256> m_enc_size;
 | |
| 
 | |
|   std::array<uchar, 256> m_dec_size;
 | |
|   std::vector<std::array<uchar, 256>> m_dec_idx;
 | |
| };
 | |
| 
 | |
| extern mysql_mutex_t rdb_collation_data_mutex;
 | |
| extern mysql_mutex_t rdb_mem_cmp_space_mutex;
 | |
| extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
 | |
|     rdb_collation_data;
 | |
| 
 | |
| class Rdb_field_packing {
 | |
|  public:
 | |
|   Rdb_field_packing(const Rdb_field_packing &) = delete;
 | |
|   Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
 | |
|   Rdb_field_packing() = default;
 | |
| 
 | |
|   /* Length of mem-comparable image of the field, in bytes */
 | |
|   int m_max_image_len;
 | |
| 
 | |
|   /* Length of image in the unpack data */
 | |
|   int m_unpack_data_len;
 | |
|   int m_unpack_data_offset;
 | |
| 
 | |
|   bool m_maybe_null; /* TRUE <=> NULL-byte is stored */
 | |
| 
 | |
|   /*
 | |
|     Valid only for VARCHAR fields.
 | |
|   */
 | |
|   const CHARSET_INFO *m_varchar_charset;
 | |
|   bool m_use_legacy_varbinary_format;
 | |
| 
 | |
|   // (Valid when Variable Length Space Padded Encoding is used):
 | |
|   uint m_segment_size;  // size of segment used
 | |
| 
 | |
|   // number of bytes used to store number of trimmed (or added)
 | |
|   // spaces in the upack_info
 | |
|   bool m_unpack_info_uses_two_bytes;
 | |
| 
 | |
|   /*
 | |
|     True implies that an index-only read is always possible for this field.
 | |
|     False means an index-only read may be possible depending on the record and
 | |
|     field type.
 | |
|   */
 | |
|   bool m_covered;
 | |
| 
 | |
|   const std::vector<uchar> *space_xfrm;
 | |
|   size_t space_xfrm_len;
 | |
|   size_t space_mb_len;
 | |
| 
 | |
|   const Rdb_collation_codec *m_charset_codec;
 | |
| 
 | |
|   /*
 | |
|     @return TRUE: this field makes use of unpack_info.
 | |
|   */
 | |
|   bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }
 | |
| 
 | |
|   /* TRUE means unpack_info stores the original field value */
 | |
|   bool m_unpack_info_stores_value;
 | |
| 
 | |
|   rdb_index_field_pack_t m_pack_func;
 | |
|   rdb_make_unpack_info_t m_make_unpack_info_func;
 | |
| 
 | |
|   /*
 | |
|     This function takes
 | |
|     - mem-comparable form
 | |
|     - unpack_info data
 | |
|     and restores the original value.
 | |
|   */
 | |
|   rdb_index_field_unpack_t m_unpack_func;
 | |
| 
 | |
|   /*
 | |
|     This function skips over mem-comparable form.
 | |
|   */
 | |
|   rdb_index_field_skip_t m_skip_func;
 | |
| 
 | |
|  private:
 | |
|   /*
 | |
|     Location of the field in the table (key number and key part number).
 | |
| 
 | |
|     Note that this describes not the field, but rather a position of field in
 | |
|     the index. Consider an example:
 | |
| 
 | |
|       col1 VARCHAR (100),
 | |
|       INDEX idx1 (col1)),
 | |
|       INDEX idx2 (col1(10)),
 | |
| 
 | |
|     Here, idx2 has a special Field object that is set to describe a 10-char
 | |
|     prefix of col1.
 | |
| 
 | |
|     We must also store the keynr. It is needed for implicit "extended keys".
 | |
|     Every key in MyRocks needs to include PK columns.  Generally, SQL layer
 | |
|     includes PK columns as part of its "Extended Keys" feature, but sometimes
 | |
|     it does not (known examples are unique secondary indexes and partitioned
 | |
|     tables).
 | |
|     In that case, MyRocks's index descriptor has invisible suffix of PK
 | |
|     columns (and the point is that these columns are parts of PK, not parts
 | |
|     of the current index).
 | |
|   */
 | |
|   uint m_keynr;
 | |
|   uint m_key_part;
 | |
| 
 | |
|  public:
 | |
|   bool setup(const Rdb_key_def *const key_descr, const Field *const field,
 | |
|              const uint keynr_arg, const uint key_part_arg,
 | |
|              const uint16 key_length);
 | |
|   Field *get_field_in_table(const TABLE *const tbl) const;
 | |
|   void fill_hidden_pk_val(uchar **dst, const longlong hidden_pk_id) const;
 | |
| };
 | |
| 
 | |
| /*
 | |
|   Descriptor telling how to decode/encode a field to on-disk record storage
 | |
|   format. Not all information is in the structure yet, but eventually we
 | |
|   want to have as much as possible there to avoid virtual calls.
 | |
| 
 | |
|   For encoding/decoding of index tuples, see Rdb_key_def.
 | |
|   */
 | |
| class Rdb_field_encoder {
 | |
|  public:
 | |
|   Rdb_field_encoder(const Rdb_field_encoder &) = delete;
 | |
|   Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete;
 | |
|   /*
 | |
|     STORE_NONE is set when a column can be decoded solely from their
 | |
|     mem-comparable form.
 | |
|     STORE_SOME is set when a column can be decoded from their mem-comparable
 | |
|     form plus unpack_info.
 | |
|     STORE_ALL is set when a column cannot be decoded, so its original value
 | |
|     must be stored in the PK records.
 | |
|     */
 | |
|   enum STORAGE_TYPE {
 | |
|     STORE_NONE,
 | |
|     STORE_SOME,
 | |
|     STORE_ALL,
 | |
|   };
 | |
|   STORAGE_TYPE m_storage_type;
 | |
| 
 | |
|   uint m_null_offset;
 | |
|   uint16 m_field_index;
 | |
| 
 | |
|   uchar m_null_mask;  // 0 means the field cannot be null
 | |
| 
 | |
|   my_core::enum_field_types m_field_type;
 | |
| 
 | |
|   uint m_pack_length_in_rec;
 | |
| 
 | |
|   bool maybe_null() const { return m_null_mask != 0; }
 | |
| 
 | |
|   bool uses_variable_len_encoding() const {
 | |
|     return (m_field_type == MYSQL_TYPE_BLOB ||
 | |
|             m_field_type == MYSQL_TYPE_VARCHAR);
 | |
|   }
 | |
| };
 | |
| 
 | |
| inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table,
 | |
|                                                        uint part_no) const {
 | |
|   DBUG_ASSERT(part_no < get_key_parts());
 | |
|   return m_pack_info[part_no].get_field_in_table(table);
 | |
| }
 | |
| 
 | |
| inline bool Rdb_key_def::can_unpack(const uint kp) const {
 | |
|   DBUG_ASSERT(kp < m_key_parts);
 | |
|   return (m_pack_info[kp].m_unpack_func != nullptr);
 | |
| }
 | |
| 
 | |
| inline bool Rdb_key_def::has_unpack_info(const uint kp) const {
 | |
|   DBUG_ASSERT(kp < m_key_parts);
 | |
|   return m_pack_info[kp].uses_unpack_info();
 | |
| }
 | |
| 
 | |
| /*
 | |
|   A table definition. This is an entry in the mapping
 | |
| 
 | |
|     dbname.tablename -> {index_nr, index_nr, ... }
 | |
| 
 | |
|   There is only one Rdb_tbl_def object for a given table.
 | |
|   That's why we keep auto_increment value here, too.
 | |
| */
 | |
| 
 | |
| class Rdb_tbl_def {
 | |
|  private:
 | |
|   void check_if_is_mysql_system_table();
 | |
| 
 | |
|   /* Stores 'dbname.tablename' */
 | |
|   std::string m_dbname_tablename;
 | |
| 
 | |
|   /* Store the db name, table name, and partition name */
 | |
|   std::string m_dbname;
 | |
|   std::string m_tablename;
 | |
|   std::string m_partition;
 | |
| 
 | |
|   void set_name(const std::string &name);
 | |
| 
 | |
|  public:
 | |
|   Rdb_tbl_def(const Rdb_tbl_def &) = delete;
 | |
|   Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete;
 | |
| 
 | |
|   explicit Rdb_tbl_def(const std::string &name)
 | |
|       : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0),
 | |
|         m_update_time(0), m_create_time(CREATE_TIME_UNKNOWN) {
 | |
|     set_name(name);
 | |
|   }
 | |
| 
 | |
|   Rdb_tbl_def(const char *const name, const size_t len)
 | |
|       : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0),
 | |
|         m_update_time(0), m_create_time(CREATE_TIME_UNKNOWN) {
 | |
|     set_name(std::string(name, len));
 | |
|   }
 | |
| 
 | |
|   explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t pos = 0)
 | |
|       : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0),
 | |
|         m_update_time(0), m_create_time(CREATE_TIME_UNKNOWN) {
 | |
|     set_name(std::string(slice.data() + pos, slice.size() - pos));
 | |
|   }
 | |
| 
 | |
|   ~Rdb_tbl_def();
 | |
| 
 | |
|   void check_and_set_read_free_rpl_table();
 | |
| 
 | |
|   /* Number of indexes */
 | |
|   uint m_key_count;
 | |
| 
 | |
|   /* Array of index descriptors */
 | |
|   std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
 | |
| 
 | |
|   std::atomic<longlong> m_hidden_pk_val;
 | |
|   std::atomic<ulonglong> m_auto_incr_val;
 | |
| 
 | |
|   /* Is this a system table */
 | |
|   bool m_is_mysql_system_table;
 | |
| 
 | |
|   /* Is this table read free repl enabled */
 | |
|   std::atomic_bool m_is_read_free_rpl_table{false};
 | |
| 
 | |
|   bool put_dict(Rdb_dict_manager *const dict, rocksdb::WriteBatch *const batch,
 | |
|                 const rocksdb::Slice &key);
 | |
| 
 | |
|   const std::string &full_tablename() const { return m_dbname_tablename; }
 | |
|   const std::string &base_dbname() const { return m_dbname; }
 | |
|   const std::string &base_tablename() const { return m_tablename; }
 | |
|   const std::string &base_partition() const { return m_partition; }
 | |
|   GL_INDEX_ID get_autoincr_gl_index_id();
 | |
| 
 | |
|   time_t get_create_time();
 | |
|   std::atomic<time_t> m_update_time; // in-memory only value
 | |
| 
 | |
|  private:
 | |
|   const time_t CREATE_TIME_UNKNOWN= 1;
 | |
|   // CREATE_TIME_UNKNOWN means "didn't try to read, yet"
 | |
|   // 0 means "no data available"
 | |
|   std::atomic<time_t> m_create_time;
 | |
| };
 | |
| 
 | |
| /*
 | |
|   A thread-safe sequential number generator. Its performance is not a concern
 | |
|   hence it is ok to protect it by a mutex.
 | |
| */
 | |
| 
 | |
| class Rdb_seq_generator {
 | |
|   uint m_next_number = 0;
 | |
| 
 | |
|   mysql_mutex_t m_mutex;
 | |
| 
 | |
|  public:
 | |
|   Rdb_seq_generator(const Rdb_seq_generator &) = delete;
 | |
|   Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete;
 | |
|   Rdb_seq_generator() = default;
 | |
| 
 | |
|   void init(const uint initial_number) {
 | |
|     mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
 | |
|     m_next_number = initial_number;
 | |
|   }
 | |
| 
 | |
|   uint get_and_update_next_number(Rdb_dict_manager *const dict);
 | |
| 
 | |
|   void cleanup() { mysql_mutex_destroy(&m_mutex); }
 | |
| };
 | |
| 
 | |
| interface Rdb_tables_scanner {
 | |
|   virtual int add_table(Rdb_tbl_def * tdef) = 0;
 | |
|   virtual ~Rdb_tables_scanner() = default; /* Keep the compiler happy */
 | |
| };
 | |
| 
 | |
| /*
 | |
|   This contains a mapping of
 | |
| 
 | |
|      dbname.table_name -> array{Rdb_key_def}.
 | |
| 
 | |
|   objects are shared among all threads.
 | |
| */
 | |
| 
 | |
| class Rdb_ddl_manager {
 | |
|   Rdb_dict_manager *m_dict = nullptr;
 | |
| 
 | |
|   // Contains Rdb_tbl_def elements
 | |
|   std::unordered_map<std::string, Rdb_tbl_def *> m_ddl_map;
 | |
| 
 | |
|   // Maps index id to <table_name, index number>
 | |
|   std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;
 | |
| 
 | |
|   // Maps index id to key definitons not yet committed to data dictionary.
 | |
|   // This is mainly used to store key definitions during ALTER TABLE.
 | |
|   std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
 | |
|       m_index_num_to_uncommitted_keydef;
 | |
|   mysql_rwlock_t m_rwlock;
 | |
| 
 | |
|   Rdb_seq_generator m_sequence;
 | |
|   // A queue of table stats to write into data dictionary
 | |
|   // It is produced by event listener (ie compaction and flush threads)
 | |
|   // and consumed by the rocksdb background thread
 | |
|   std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;
 | |
| 
 | |
|   const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);
 | |
| 
 | |
|  public:
 | |
|   Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
 | |
|   Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
 | |
|   Rdb_ddl_manager() = default;
 | |
| 
 | |
|   /* Load the data dictionary from on-disk storage */
 | |
|   bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
 | |
|             const uint32_t validate_tables);
 | |
| 
 | |
|   void cleanup();
 | |
| 
 | |
|   Rdb_tbl_def *find(const std::string &table_name, const bool lock = true);
 | |
|   std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);
 | |
|   void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
 | |
|   void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
 | |
|                     const std::vector<Rdb_index_stats> &deleted_data =
 | |
|                         std::vector<Rdb_index_stats>());
 | |
|   void persist_stats(const bool sync = false);
 | |
| 
 | |
|   /* Modify the mapping and write it to on-disk storage */
 | |
|   int put_and_write(Rdb_tbl_def *const key_descr,
 | |
|                     rocksdb::WriteBatch *const batch);
 | |
|   void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
 | |
|               const bool lock = true);
 | |
|   bool rename(const std::string &from, const std::string &to,
 | |
|               rocksdb::WriteBatch *const batch);
 | |
| 
 | |
|   uint get_and_update_next_number(Rdb_dict_manager *const dict) {
 | |
|     return m_sequence.get_and_update_next_number(dict);
 | |
|   }
 | |
| 
 | |
|   const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);
 | |
| 
 | |
|   /* Walk the data dictionary */
 | |
|   int scan_for_tables(Rdb_tables_scanner *tables_scanner);
 | |
| 
 | |
|   void erase_index_num(const GL_INDEX_ID &gl_index_id);
 | |
|   void add_uncommitted_keydefs(
 | |
|       const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
 | |
|   void remove_uncommitted_keydefs(
 | |
|       const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
 | |
| 
 | |
|  private:
 | |
|   /* Put the data into in-memory table (only) */
 | |
|   int put(Rdb_tbl_def *const key_descr, const bool lock = true);
 | |
| 
 | |
|   /* Helper functions to be passed to my_core::HASH object */
 | |
|   static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length,
 | |
|                                    my_bool not_used MY_ATTRIBUTE((unused)));
 | |
|   static void free_hash_elem(void *const data);
 | |
| 
 | |
|   bool validate_schemas();
 | |
| 
 | |
|   bool validate_auto_incr();
 | |
| };
 | |
| 
 | |
| /*
 | |
|   Writing binlog information into RocksDB at commit(),
 | |
|   and retrieving binlog information at crash recovery.
 | |
|   commit() and recovery are always executed by at most single client
 | |
|   at the same time, so concurrency control is not needed.
 | |
| 
 | |
|   Binlog info is stored in RocksDB as the following.
 | |
|    key: BINLOG_INFO_INDEX_NUMBER
 | |
|    value: packed single row:
 | |
|      binlog_name_length (2 byte form)
 | |
|      binlog_name
 | |
|      binlog_position (4 byte form)
 | |
|      binlog_gtid_length (2 byte form)
 | |
|      binlog_gtid
 | |
| */
 | |
| class Rdb_binlog_manager {
 | |
|  public:
 | |
|   Rdb_binlog_manager(const Rdb_binlog_manager &) = delete;
 | |
|   Rdb_binlog_manager &operator=(const Rdb_binlog_manager &) = delete;
 | |
|   Rdb_binlog_manager() = default;
 | |
| 
 | |
|   bool init(Rdb_dict_manager *const dict);
 | |
|   void cleanup();
 | |
|   void update(const char *const binlog_name, const my_off_t binlog_pos,
 | |
|               rocksdb::WriteBatchBase *const batch);
 | |
|   bool read(char *const binlog_name, my_off_t *const binlog_pos,
 | |
|             char *const binlog_gtid) const;
 | |
|   void update_slave_gtid_info(const uint id, const char *const db,
 | |
|                               const char *const gtid,
 | |
|                               rocksdb::WriteBatchBase *const write_batch);
 | |
| 
 | |
|  private:
 | |
|   Rdb_dict_manager *m_dict = nullptr;
 | |
|   Rdb_buf_writer<Rdb_key_def::INDEX_NUMBER_SIZE> m_key_writer;
 | |
|   rocksdb::Slice m_key_slice;
 | |
| 
 | |
|   bool unpack_value(const uchar *const value, size_t value_size,
 | |
|                     char *const binlog_name,
 | |
|                     my_off_t *const binlog_pos, char *const binlog_gtid) const;
 | |
| 
 | |
|   std::atomic<Rdb_tbl_def *> m_slave_gtid_info_tbl;
 | |
| };
 | |
| 
 | |
| /*
 | |
|    Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its
 | |
|   internal data dictionary.
 | |
|    MyRocks stores the data dictionary in a dedicated system column family
 | |
|   named __system__. The system column family is used by MyRocks
 | |
|   internally only, and is not used by applications.
 | |
| 
 | |
|    Currently MyRocks has the following data dictionary data models.
 | |
| 
 | |
|   1. Table Name => internal index id mappings
 | |
|   key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename
 | |
|   value: version, {cf_id, index_id}*n_indexes_of_the_table
 | |
|   version is 2 bytes. cf_id and index_id are 4 bytes.
 | |
| 
 | |
|   2. internal cf_id, index id => index information
 | |
|   key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id
 | |
|   value: version, index_type, kv_format_version, index_flags, ttl_duration
 | |
|   index_type is 1 byte, version and kv_format_version are 2 bytes.
 | |
|   index_flags is 4 bytes.
 | |
|   ttl_duration is 8 bytes.
 | |
| 
 | |
|   3. CF id => CF flags
 | |
|   key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id
 | |
|   value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf}
 | |
|   cf_flags is 4 bytes in total.
 | |
| 
 | |
|   4. Binlog entry (updated at commit)
 | |
|   key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4)
 | |
|   value: version, {binlog_name,binlog_pos,binlog_gtid}
 | |
| 
 | |
|   5. Ongoing drop index entry
 | |
|   key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id
 | |
|   value: version
 | |
| 
 | |
|   6. index stats
 | |
|   key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id
 | |
|   value: version, {materialized PropertiesCollector::IndexStats}
 | |
| 
 | |
|   7. maximum index id
 | |
|   key: Rdb_key_def::MAX_INDEX_ID(0x7)
 | |
|   value: index_id
 | |
|   index_id is 4 bytes
 | |
| 
 | |
|   8. Ongoing create index entry
 | |
|   key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id
 | |
|   value: version
 | |
| 
 | |
|   9. auto_increment values
 | |
|   key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id
 | |
|   value: version, {max auto_increment so far}
 | |
|   max auto_increment is 8 bytes
 | |
| 
 | |
|   Data dictionary operations are atomic inside RocksDB. For example,
 | |
|   when creating a table with two indexes, it is necessary to call Put
 | |
|   three times. They have to be atomic. Rdb_dict_manager has a wrapper function
 | |
|   begin() and commit() to make it easier to do atomic operations.
 | |
| 
 | |
| */
 | |
| class Rdb_dict_manager {
 | |
|  private:
 | |
|   mysql_mutex_t m_mutex;
 | |
|   rocksdb::TransactionDB *m_db = nullptr;
 | |
|   rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr;
 | |
|   /* Utility to put INDEX_INFO and CF_DEFINITION */
 | |
| 
 | |
|   uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
 | |
|   rocksdb::Slice m_key_slice_max_index_id;
 | |
| 
 | |
|   static void dump_index_id(uchar *const netbuf,
 | |
|                             Rdb_key_def::DATA_DICT_TYPE dict_type,
 | |
|                             const GL_INDEX_ID &gl_index_id);
 | |
|   template <size_t T>
 | |
|   static void dump_index_id(Rdb_buf_writer<T> *buf_writer,
 | |
|                             Rdb_key_def::DATA_DICT_TYPE dict_type,
 | |
|                             const GL_INDEX_ID &gl_index_id) {
 | |
|     buf_writer->write_uint32(dict_type);
 | |
|     buf_writer->write_uint32(gl_index_id.cf_id);
 | |
|     buf_writer->write_uint32(gl_index_id.index_id);
 | |
|   }
 | |
| 
 | |
|   void delete_with_prefix(rocksdb::WriteBatch *const batch,
 | |
|                           Rdb_key_def::DATA_DICT_TYPE dict_type,
 | |
|                           const GL_INDEX_ID &gl_index_id) const;
 | |
|   /* Functions for fast DROP TABLE/INDEX */
 | |
|   void resume_drop_indexes() const;
 | |
|   void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
 | |
|                             const uint32 n_keys,
 | |
|                             const char *const log_action) const;
 | |
|   void log_start_drop_index(GL_INDEX_ID gl_index_id,
 | |
|                             const char *log_action) const;
 | |
| 
 | |
|  public:
 | |
|   Rdb_dict_manager(const Rdb_dict_manager &) = delete;
 | |
|   Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
 | |
|   Rdb_dict_manager() = default;
 | |
| 
 | |
|   bool init(rocksdb::TransactionDB *const rdb_dict,
 | |
|             Rdb_cf_manager *const cf_manager);
 | |
| 
 | |
|   inline void cleanup() { mysql_mutex_destroy(&m_mutex); }
 | |
| 
 | |
|   inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }
 | |
| 
 | |
|   inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }
 | |
| 
 | |
|   inline rocksdb::ColumnFamilyHandle *get_system_cf() const {
 | |
|     return m_system_cfh;
 | |
|   }
 | |
| 
 | |
|   /* Raw RocksDB operations */
 | |
|   std::unique_ptr<rocksdb::WriteBatch> begin() const;
 | |
|   int commit(rocksdb::WriteBatch *const batch, const bool sync = true) const;
 | |
|   rocksdb::Status get_value(const rocksdb::Slice &key,
 | |
|                             std::string *const value) const;
 | |
|   void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
 | |
|                const rocksdb::Slice &value) const;
 | |
|   void delete_key(rocksdb::WriteBatchBase *batch,
 | |
|                   const rocksdb::Slice &key) const;
 | |
|   rocksdb::Iterator *new_iterator() const;
 | |
| 
 | |
|   /* Internal Index id => CF */
 | |
|   void add_or_update_index_cf_mapping(
 | |
|       rocksdb::WriteBatch *batch,
 | |
|       struct Rdb_index_info *const index_info) const;
 | |
|   void delete_index_info(rocksdb::WriteBatch *batch,
 | |
|                          const GL_INDEX_ID &index_id) const;
 | |
|   bool get_index_info(const GL_INDEX_ID &gl_index_id,
 | |
|                       struct Rdb_index_info *const index_info) const;
 | |
| 
 | |
|   /* CF id => CF flags */
 | |
|   void add_cf_flags(rocksdb::WriteBatch *const batch, const uint cf_id,
 | |
|                     const uint cf_flags) const;
 | |
|   bool get_cf_flags(const uint cf_id, uint *const cf_flags) const;
 | |
| 
 | |
|   /* Functions for fast CREATE/DROP TABLE/INDEX */
 | |
|   void get_ongoing_index_operation(
 | |
|       std::unordered_set<GL_INDEX_ID> *gl_index_ids,
 | |
|       Rdb_key_def::DATA_DICT_TYPE dd_type) const;
 | |
|   bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
 | |
|                                   Rdb_key_def::DATA_DICT_TYPE dd_type) const;
 | |
|   void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
 | |
|                                      const GL_INDEX_ID &gl_index_id,
 | |
|                                      Rdb_key_def::DATA_DICT_TYPE dd_type) const;
 | |
|   void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
 | |
|                                    const GL_INDEX_ID &gl_index_id,
 | |
|                                    Rdb_key_def::DATA_DICT_TYPE dd_type) const;
 | |
|   bool is_drop_index_empty() const;
 | |
|   void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
 | |
|                       const uint32 n_keys,
 | |
|                       rocksdb::WriteBatch *const batch) const;
 | |
|   void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
 | |
|                       rocksdb::WriteBatch *const batch) const;
 | |
|   void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
 | |
|                         rocksdb::WriteBatch *const batch) const;
 | |
|   void finish_indexes_operation(
 | |
|       const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
 | |
|       Rdb_key_def::DATA_DICT_TYPE dd_type) const;
 | |
|   void rollback_ongoing_index_creation() const;
 | |
| 
 | |
|   inline void get_ongoing_drop_indexes(
 | |
|       std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
 | |
|     get_ongoing_index_operation(gl_index_ids,
 | |
|                                 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
 | |
|   }
 | |
|   inline void get_ongoing_create_indexes(
 | |
|       std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
 | |
|     get_ongoing_index_operation(gl_index_ids,
 | |
|                                 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
 | |
|   }
 | |
|   inline void start_drop_index(rocksdb::WriteBatch *wb,
 | |
|                                const GL_INDEX_ID &gl_index_id) const {
 | |
|     start_ongoing_index_operation(wb, gl_index_id,
 | |
|                                   Rdb_key_def::DDL_DROP_INDEX_ONGOING);
 | |
|   }
 | |
|   inline void start_create_index(rocksdb::WriteBatch *wb,
 | |
|                                  const GL_INDEX_ID &gl_index_id) const {
 | |
|     start_ongoing_index_operation(wb, gl_index_id,
 | |
|                                   Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
 | |
|   }
 | |
|   inline void finish_drop_indexes(
 | |
|       const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
 | |
|     finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
 | |
|   }
 | |
|   inline void finish_create_indexes(
 | |
|       const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
 | |
|     finish_indexes_operation(gl_index_ids,
 | |
|                              Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
 | |
|   }
 | |
|   inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
 | |
|     return is_index_operation_ongoing(gl_index_id,
 | |
|                                       Rdb_key_def::DDL_DROP_INDEX_ONGOING);
 | |
|   }
 | |
|   inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
 | |
|     return is_index_operation_ongoing(gl_index_id,
 | |
|                                       Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
 | |
|   }
 | |
| 
 | |
|   bool get_max_index_id(uint32_t *const index_id) const;
 | |
|   bool update_max_index_id(rocksdb::WriteBatch *const batch,
 | |
|                            const uint32_t index_id) const;
 | |
|   void add_stats(rocksdb::WriteBatch *const batch,
 | |
|                  const std::vector<Rdb_index_stats> &stats) const;
 | |
|   Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;
 | |
| 
 | |
|   rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
 | |
|                                     const GL_INDEX_ID &gl_index_id,
 | |
|                                     ulonglong val,
 | |
|                                     bool overwrite = false) const;
 | |
|   bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
 | |
|                          ulonglong *new_val) const;
 | |
| };
 | |
| 
 | |
| struct Rdb_index_info {
 | |
|   GL_INDEX_ID m_gl_index_id;
 | |
|   uint16_t m_index_dict_version = 0;
 | |
|   uchar m_index_type = 0;
 | |
|   uint16_t m_kv_version = 0;
 | |
|   uint32 m_index_flags = 0;
 | |
|   uint64 m_ttl_duration = 0;
 | |
| };
 | |
| 
 | |
| /*
 | |
|   @brief
 | |
|   Merge Operator for the auto_increment value in the system_cf
 | |
| 
 | |
|   @detail
 | |
|   This class implements the rocksdb Merge Operator for auto_increment values
 | |
|   that are stored to the data dictionary every transaction.
 | |
| 
 | |
|   The actual Merge function is triggered on compaction, memtable flushes, or
 | |
|   when get() is called on the same key.
 | |
| 
 | |
|  */
 | |
| class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator {
 | |
|  public:
 | |
|   /*
 | |
|     Updates the new value associated with a key to be the maximum of the
 | |
|     passed in value and the existing value.
 | |
| 
 | |
|     @param[IN]  key
 | |
|     @param[IN]  existing_value  existing value for a key; nullptr if nonexistent
 | |
|     key
 | |
|     @param[IN]  value
 | |
|     @param[OUT] new_value       new value after Merge
 | |
|     @param[IN]  logger
 | |
|   */
 | |
|   bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value,
 | |
|              const rocksdb::Slice &value, std::string *new_value,
 | |
|              rocksdb::Logger *logger) const override {
 | |
|     DBUG_ASSERT(new_value != nullptr);
 | |
| 
 | |
|     if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 ||
 | |
|         GetKeyType(key) != Rdb_key_def::AUTO_INC ||
 | |
|         value.size() !=
 | |
|             RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE ||
 | |
|         GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
 | |
|       abort();
 | |
|     }
 | |
| 
 | |
|     uint64_t merged_value = Deserialize(value);
 | |
| 
 | |
|     if (existing_value != nullptr) {
 | |
|       if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION +
 | |
|                                         ROCKSDB_SIZEOF_AUTOINC_VALUE ||
 | |
|           GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
 | |
|         abort();
 | |
|       }
 | |
| 
 | |
|       merged_value = std::max(merged_value, Deserialize(*existing_value));
 | |
|     }
 | |
|     Serialize(merged_value, new_value);
 | |
|     return true;
 | |
|   }
 | |
| 
 | |
|   const char *Name() const override { return "Rdb_system_merge_op"; }
 | |
| 
 | |
|  private:
 | |
|   /*
 | |
|     Serializes the integer data to the new_value buffer or the target buffer
 | |
|     the merge operator will update to
 | |
|    */
 | |
|   void Serialize(const uint64_t data, std::string *new_value) const {
 | |
|     uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION +
 | |
|                     ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0};
 | |
|     uchar *ptr = value_buf;
 | |
|     /* fill in the auto increment version */
 | |
|     rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION);
 | |
|     ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION;
 | |
|     /* fill in the auto increment value */
 | |
|     rdb_netbuf_store_uint64(ptr, data);
 | |
|     ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE;
 | |
|     new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|     Gets the value of auto_increment type in the data dictionary from the
 | |
|     value slice
 | |
| 
 | |
|     @Note Only to be used on data dictionary keys for the auto_increment type
 | |
|    */
 | |
|   uint64_t Deserialize(const rocksdb::Slice &s) const {
 | |
|     return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) +
 | |
|                                 RDB_SIZEOF_AUTO_INCREMENT_VERSION);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|     Gets the type of the key of the key in the data dictionary.
 | |
| 
 | |
|     @Note Only to be used on data dictionary keys for the auto_increment type
 | |
|    */
 | |
|   uint16_t GetKeyType(const rocksdb::Slice &s) const {
 | |
|     return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data()));
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|     Gets the version of the auto_increment value in the data dictionary.
 | |
| 
 | |
|     @Note Only to be used on data dictionary value for the auto_increment type
 | |
|    */
 | |
|   uint16_t GetVersion(const rocksdb::Slice &s) const {
 | |
|     return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data()));
 | |
|   }
 | |
| };
 | |
| 
 | |
| bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
 | |
| 
 | |
| }  // namespace myrocks
 | 
