mirror of
https://github.com/MariaDB/server.git
synced 2025-01-23 07:14:17 +01:00
015bc27f2f
This makes a big difference for space (46% smaller) and a small difference in time (5% faster), as measured by benchmark-test.

Before:
{{{
$ ./benchmark-test nodesize=1048576 keysize=8 valsize=8
Serial and random insertions of 1048576 per batch
serial 8.753964s 119783/s random 5.640094s 185915/s cumulative 14.394118s 145695/s
serial 9.381472s 111771/s random 7.325284s 143145/s cumulative 31.100944s 134861/s
serial 9.859233s 106355/s random 6.734307s 155707/s cumulative 47.694553s 131911/s
serial 11.069200s 94729/s random 6.885863s 152280/s cumulative 65.649695s 127778/s
Shutdown 4.636875s
Total time 70.286611s for 8388608 insertions = 119349/s
$ ls -l sinsert.brt
-rwxrwxr-x 1 bradley bradley 730344924 Jan 22 11:47 sinsert.brt
}}}

After:
{{{
$ ./benchmark-test nodesize=1048576 keysize=8 valsize=8
Serial and random insertions of 1048576 per batch
serial 8.521855s 123046/s random 5.730942s 182967/s cumulative 14.252861s 147139/s
serial 9.106047s 115152/s random 7.001765s 149759/s cumulative 30.360740s 138149/s
serial 9.543696s 109871/s random 6.651000s 157657/s cumulative 46.555503s 135139/s
serial 10.627035s 98671/s random 6.555884s 159944/s cumulative 63.738491s 131610/s
Shutdown 2.818513s
Total time 66.557042s for 8388608 insertions = 126036/s
$ ls -l sinsert.brt
-rwxrwxr-x 1 bradley bradley 396894480 Jan 22 11:45 sinsert.brt
}}}

git-svn-id: file:///svn/tokudb@1798 c7de825b-a66e-492c-adef-691d508d4ae1
241 lines
8.7 KiB
C
241 lines
8.7 KiB
C
#ifndef BRT_INTERNAL_H
|
|
#define BRT_INTERNAL_H
|
|
|
|
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
|
|
|
|
#include "cachetable.h"
|
|
#include "fifo.h"
|
|
#include "pma.h"
|
|
#include "brt.h"
|
|
#include "crc.h"
|
|
|
|
/* Fanout of the tree.  Defaults to 16; because of the #ifndef it can be
 * overridden by defining BRT_FANOUT before this header is included
 * (for example on the compiler command line). */
#ifndef BRT_FANOUT
#define BRT_FANOUT 16
#endif
enum { TREE_FANOUT = BRT_FANOUT };

/* Size-accounting constants, in bytes. */
enum { KEY_VALUE_OVERHEAD = 8 };   /* Must store the two lengths. */
enum { PMA_ITEM_OVERHEAD = 4 };    /* NOTE(review): presumably per-item bookkeeping in a PMA -- confirm. */
enum { BRT_CMD_OVERHEAD = 1 };     /* NOTE(review): presumably the byte that encodes the command type -- confirm. */

/* Default node size: 1 << 20 bytes (1 MiB). */
enum { BRT_DEFAULT_NODE_SIZE = 1 << 20 };
|
|
|
|
/* Node header record holding a single count.
 * NOTE(review): the name suggests this mirrors an on-disk layout; nothing in
 * this header uses it, so confirm against the serialization code. */
struct nodeheader_in_file {
    int n_in_buffer;   /* number of items in the buffer */
};
|
|
enum { BUFFER_HEADER_SIZE = (4 // height//
|
|
+ 4 // n_children
|
|
+ TREE_FANOUT * 8 // children
|
|
) };
|
|
|
|
/* Per-pivot information for a nonleaf node.
 *
 * For DUPSORT keys, the pivot keys are whole key-value pairs.
 * For nonduplicate and DUPSORT keys the ordering invariant is
 *
 *   child 0's keys <= pivotkey[0] < child 1's keys <= pivotkey[1] < ...
 *       pivotkey[N-1] < child N's keys <= pivotkey[N] ...
 */
struct brtnode_nonleaf_pivotinfo {
    struct kv_pair *pivotkey;
};
|
|
/* Per-child information kept in a nonleaf node.  Only the subtree fingerprint
 * is stored here at present; the remaining per-child fields are compiled out
 * with #if 0 and still live as parallel arrays in struct brtnode. */
struct brtnode_nonleaf_childinfo {
    u_int32_t    subtree_fingerprint;
#if 0
    DISKOFF      diskoff;
    FIFO         htable;
    unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */
    unsigned int n_cursors;
#endif
};
|
|
|
|
typedef struct brtnode *BRTNODE;
|
|
/* Internal nodes. */
|
|
struct brtnode {
|
|
enum typ_tag tag;
|
|
unsigned int nodesize;
|
|
unsigned int flags;
|
|
DISKOFF thisnodename; // The size of the node allocated on disk. Not all is necessarily in use.
|
|
LSN disk_lsn; // The LSN as of the most recent version on disk.
|
|
LSN log_lsn; // The LSN as of the most recent log write.
|
|
int layout_version; // What version of the data structure?
|
|
int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */
|
|
u_int32_t rand4fingerprint;
|
|
u_int32_t local_fingerprint; /* For leaves this is everything in the buffer. For nonleaves, this is everything in the buffers, but does not include child subtree fingerprints. */
|
|
int dirty;
|
|
union node {
|
|
struct nonleaf {
|
|
// Don't actually store the subree fingerprint in the in-memory data structure.
|
|
int n_children; /* if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced. */
|
|
unsigned int totalchildkeylens;
|
|
unsigned int n_bytes_in_buffers;
|
|
|
|
struct brtnode_nonleaf_childinfo childinfos[TREE_FANOUT+1]; /* One extra so we can grow */
|
|
|
|
#if 0
|
|
u_int32_t child_subtree_fingerprints[TREE_FANOUT+1];
|
|
#define BRTNODE_CHILD_SUBTREE_FINGERPRINTS(node,i) ((node)->u.n.child_subtree_fingerprints[i])
|
|
#else
|
|
#define BRTNODE_CHILD_SUBTREE_FINGERPRINTS(node,i) ((node)->u.n.childinfos[i].subtree_fingerprint)
|
|
#endif
|
|
|
|
//#define CHSTRUCT
|
|
#ifdef CHSTRUCT
|
|
struct brtnode_nonleaf_pivotinfo pivots[TREE_FANOUT]; /* One extra one so we can grow. */
|
|
#else
|
|
struct kv_pair *childkeys[TREE_FANOUT]; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
|
|
Note: It is possible that Child 1's keys are == to child 0's key's, so it is
|
|
not necessarily true that child 1's keys are > childkeys[0].
|
|
However, in the absense of duplicate keys, child 1's keys *are* > childkeys[0]. */
|
|
DISKOFF children[TREE_FANOUT+1]; /* unused if height==0 */ /* Note: The last element of these arrays is used only temporarily while splitting a node. */
|
|
#define BRTNODE_CHILD_DISKOFF(node,i) ((node)->u.n.children[i])
|
|
FIFO buffers[TREE_FANOUT+1];
|
|
unsigned int n_bytes_in_buffer[TREE_FANOUT+1]; /* how many bytes are in each buffer (including overheads) */
|
|
unsigned int n_cursors[TREE_FANOUT+1];
|
|
#endif
|
|
} n;
|
|
struct leaf {
|
|
PMA buffer;
|
|
unsigned int n_bytes_in_buffer; /* How many bytes to represent the PMA (including the per-key overheads, but not including the overheads for the node. */
|
|
} l;
|
|
} u;
|
|
};
|
|
|
|
/* Pivot flags (must fit in 8 bits).  Each value is a distinct bit, so the
 * flags can be OR-ed together. */
enum {
    BRT_PIVOT_TRUNC          = 4,
    BRT_PIVOT_FRONT_COMPRESS = 8,
};
|
|
|
|
struct brt_header {
|
|
int dirty;
|
|
unsigned int nodesize;
|
|
DISKOFF freelist;
|
|
DISKOFF unused_memory;
|
|
DISKOFF unnamed_root;
|
|
int n_named_roots; /* -1 if the only one is unnamed */
|
|
char **names;
|
|
DISKOFF *roots;
|
|
unsigned int flags;
|
|
};
|
|
|
|
/* Flag bits describing duplicate-key handling.  The values are distinct bits
 * and may be combined. */
enum brt_header_flags {
    TOKU_DB_DUP     = 1,
    TOKU_DB_DUPSORT = 2,
};
|
|
|
|
struct brt {
|
|
CACHEFILE cf;
|
|
char *database_name;
|
|
// The header is shared. It is also ephemeral.
|
|
struct brt_header *h;
|
|
|
|
BRT_CURSOR cursors_head, cursors_tail;
|
|
|
|
unsigned int nodesize;
|
|
unsigned int flags;
|
|
int (*compare_fun)(DB*,const DBT*,const DBT*);
|
|
int (*dup_compare)(DB*,const DBT*,const DBT*);
|
|
DB *db; // To pass to the compare fun
|
|
|
|
void *skey,*sval; /* Used for DBT return values. */
|
|
};
|
|
|
|
/* serialization code */
|
|
void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node);
|
|
int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int flags, int nodesize, int (*bt_compare)(DB *, const DBT*, const DBT*), int (*dup_compare)(DB *, const DBT *, const DBT *), DB *db, FILENUM filenum);
|
|
unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
|
|
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
|
|
|
|
void toku_verify_counts(BRTNODE);
|
|
|
|
int toku_serialize_brt_header_size (struct brt_header *h);
|
|
int toku_serialize_brt_header_to (int fd, struct brt_header *h);
|
|
int toku_serialize_brt_header_to_wbuf (struct wbuf *, struct brt_header *h);
|
|
int toku_deserialize_brtheader_from (int fd, DISKOFF off, struct brt_header **brth);
|
|
|
|
void toku_brtnode_free (BRTNODE *node);
|
|
|
|
/* Recognizable poison pointer value.  The 32-bit pattern is selected; flip
 * the #if to use the 64-bit pattern instead. */
#if 1
#define DEADBEEF ((void*)0xDEADBEEF)
#else
#define DEADBEEF ((void*)0xDEADBEEFDEADBEEF)
#endif
|
|
|
|
|
|
#define CURSOR_PATHLEN_LIMIT 32
|
|
struct brt_cursor {
|
|
BRT brt;
|
|
int path_len; /* -1 if the cursor points nowhere. */
|
|
BRTNODE path[CURSOR_PATHLEN_LIMIT]; /* Include the leaf (last). These are all pinned. */
|
|
int pathcnum[CURSOR_PATHLEN_LIMIT]; /* which child did we descend to from here? */
|
|
PMA_CURSOR pmacurs; /* The cursor into the leaf. NULL if the cursor doesn't exist. */
|
|
BRT_CURSOR prev,next;
|
|
int op; DBT *key; DBT *val; /* needed when flushing buffers */
|
|
};
|
|
|
|
/* print the cursor path */
|
|
void toku_brt_cursor_print(BRT_CURSOR cursor);
|
|
|
|
/* is the cursor path empty? */
|
|
static inline int toku_brt_cursor_path_empty(BRT_CURSOR cursor) {
|
|
return cursor->path_len == 0;
|
|
}
|
|
|
|
/*is the cursor path full? */
|
|
static inline int toku_brt_cursor_path_full(BRT_CURSOR cursor) {
|
|
return cursor->path_len == CURSOR_PATHLEN_LIMIT;
|
|
}
|
|
|
|
/* Is the cursor active?  Returns nonzero when the path holds at least one
 * node (path_len > 0); both 0 and the "points nowhere" value -1 yield 0. */
static inline int toku_brt_cursor_active(BRT_CURSOR cursor) {
    return cursor->path_len > 0;
}
|
|
|
|
/* brt has a new root. add the root to this cursor. */
|
|
void toku_brt_cursor_new_root(BRT_CURSOR cursor, BRT t, BRTNODE newroot, BRTNODE left, BRTNODE right);
|
|
|
|
/* a brt leaf has split. modify this cursor if it includes the old node in its path. */
|
|
void toku_brt_cursor_leaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE newright);
|
|
|
|
/* a brt internal node has expanded. modify this cursor if it includes the old node in its path. */
|
|
void toku_brt_cursor_nonleaf_expand(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, int childnum, BRTNODE left, BRTNODE right, struct kv_pair *splitk);
|
|
|
|
/* a brt internal node has split. modify this cursor if it includes the old node in its path. */
|
|
void toku_brt_cursor_nonleaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE left, BRTNODE right);
|
|
|
|
/* Kinds of command carried by a struct brt_cmd.  The numeric values are
 * spelled out explicitly.
 * NOTE(review): BRT_CMD_OVERHEAD is 1 byte, so these presumably have to fit
 * in a single byte when stored -- confirm. */
enum brt_cmd_type {
    BRT_NONE        = 0,
    BRT_INSERT      = 1,
    BRT_DELETE      = 2,
    BRT_DELETE_BOTH = 3,
};
|
|
|
|
struct brt_cmd {
|
|
enum brt_cmd_type type;
|
|
union {
|
|
/* insert or delete */
|
|
struct brt_cmd_insert_delete {
|
|
DBT *key;
|
|
DBT *val;
|
|
} id;
|
|
} u;
|
|
};
|
|
typedef struct brt_cmd BRT_CMD;
|
|
|
|
struct brtenv {
|
|
CACHETABLE ct;
|
|
TOKULOGGER logger;
|
|
long long checksum_number;
|
|
// SPINLOCK checkpointing;
|
|
};
|
|
|
|
extern cachetable_flush_func_t toku_brtnode_flush_callback, toku_brtheader_flush_callback;
|
|
extern cachetable_fetch_func_t toku_brtnode_fetch_callback, toku_brtheader_fetch_callback;
|
|
extern int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header);
|
|
extern int toku_unpin_brt_header (BRT brt);
|
|
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt);
|
|
|
|
/* A null BRTNODE constant.  Being `static` in a header, every translation unit that includes this file gets its own copy. */
static const BRTNODE null_brtnode=0;
|
|
|
|
extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen);
|
|
extern u_int32_t toku_calccrc32_cmd (int type, const void *key, int keylen, const void *val, int vallen);
|
|
extern u_int32_t toku_calccrc32_cmdstruct (BRT_CMD *cmd);
|
|
|
|
// How long is the pivot key?
|
|
unsigned int toku_brt_pivot_key_len (BRT, struct kv_pair *); // Given the tree
|
|
unsigned int toku_brtnode_pivot_key_len (BRTNODE, struct kv_pair *); // Given the node
|
|
|
|
#endif
|