mirror of https://github.com/MariaDB/server.git
synced 2025-02-01 11:31:51 +01:00
Merge remote-tracking branch 'origin/ft-index/46merge-a'
This commit is contained in commit a03b9926d7.
49 changed files with 5211 additions and 596 deletions
87 46.code-review Normal file
@@ -0,0 +1,87 @@
Notes during 2014-01-08 Leif/Yoni

- Should verify whether (dmt? omt? bndata?) crashes or returns an error on a failed verify.

DECISIONS:
- Replace dmt_functor with an implicit interface only. Instead of requiring the name to be dmt_functor<x> (for data type x), just pass the writer's class name into the dmt's template as a new parameter.
- Replace dmt_functor<default> with comments explaining the "interface".
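
As an illustration, the first decision is already visible in ft/bndata.h later in this diff, where the writer class is an ordinary template argument rather than a dmt_functor<> specialization:

    // The value type, a pointer type used for fetches, and the writer class
    // are all explicit template parameters of the dmt.
    typedef toku::dmt<klpair_struct, klpair_struct*, toku::klpair_dmtwriter> klpair_dmt_t;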

-==========================================-

See wiki:
https://github.com/Tokutek/ft-index/wiki/Improving-in-memory-query-performance---Design

ft/bndata.{cc,h}: The basement node was heavily modified to split the keys from the leafentries and to inline the keys.
bn_data::initialize_from_separate_keys_and_vals is effectively the deserializer.

The bn_data::omt_* functions (probably badly named) treat the basement node roughly as an omt of key+leafentry pairs.
There are many references to 'omt' that could be renamed to dmt if it's worth it.

util/dmt.{cc,h}: The new DMT structure.
Possible questions:
1. Should we merge dmt<> & omt<> (i.e., delete omt entirely)?
2. Should omt<> become a wrapper for dmt<>?
3. Should we just keep both around?
If we plan to do this for a while, should we get rid of any scaffolding that would make it easier to do 1 or 2?

The dmt is basically an omt with dynamically sized nodes/values.
There are two representations: an array of values, or a tree of nodes.
The high-level algorithm is basically the same for dmt and omt, except that in tree form the dmt tries not to move values around; instead, it moves the metadata in the nodes around.
Insertion into a dmt requires a writer (functor) that can report the value's size, since values are expected to be (at least potentially) dynamically sized; a sketch of this interface follows.
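
A minimal sketch of the implicit writer interface (modeled on toku::klpair_dmtwriter in ft/bndata.h below; the value type and class name here are placeholders, not part of the change):

    // A dmt writer describes one value: how big it is, how to serialize it,
    // and how to rebuild a writer from a value already stored in the dmt.
    class example_dmtwriter {
    public:
        size_t get_size(void) const;                     // bytes this value occupies in the dmt
        void write_to(example_value *const dest) const;  // serialize into dest (get_size() bytes)
        example_dmtwriter(uint32_t len, example_value *const src);  // rebuild from a stored value
    };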

The dmt does not revert to array form when rebalancing the root, but it CAN revert to array form when it prepares for serialization (if it notices that everything is fixed-length).

The dmt can also serialize and deserialize the set of values it represents. It saves no information about the dmt itself, just the values.

Some comments about what's in each file:

ft/CMakeLists.txt
    Adds dmt-wrapper (a test wrapper, nearly identical to ft/omt.cc, which is also a test wrapper).
ft/dmt-wrapper.cc/h
    Just like ft/omt.cc,h: a test wrapper for the dmt that implements a version of the old (non-templated) omt tests.
ft/ft-internal.h
    Additional engine status.
ft/ft-ops.cc/h
    Additional engine status.
    In ftnode_memory_size(), fixes a minor bug where we didn't count all the memory.
    Comments.
ft/ft_layout_version.h
    Updates the comment describing the version change.
    NOTE: We may need to add version 26 if 25 is sent to customers before this goes live. Adding 26 requires additional code changes (limited to a subset of the places where versions 24/25 are referred to).
ft/ft_node-serialize.cc
    Changes the calculation of a leaf node's size to include the basement-node header (see the header sketch below).
    Adds optimized serialization for basement nodes with fixed-length keys.
    Maintains the old method when not using fixed-length keys.
    rebalance_ftnode_leaf(): minor changes since keys/leafentries are now separated.
    deserialize_ftnode_partition(): minor changes, including passing the rbuf directly to the child function (so the ndone calculation is done by the child).
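
For reference, the basement-node header written by the optimized path (mirroring bn_data::serialize_header and HEADER_LENGTH in ft/bndata.{cc,h} below):

    // 14-byte basement-node header, written before the keys and leafentries:
    //   uint32_t key_data_size;        // total serialized size of all keys
    //   uint32_t val_data_size;        // total size of all leafentries
    //   uint32_t fixed_klpair_length;  // 0 unless all keys are the same length
    //   uint8_t  all_keys_same_length;
    //   uint8_t  keys_vals_separate;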
ft/memarena.cc
    Changes so that toku_memory_footprint is more accurate. (Not exactly related to this project.)
ft/rollback.cc
    Just uses the new memarena function for the memory footprint.
ft/tests/dmt-test.cc
    A "clone" of the old (non-templated) omt-test, ported to dmt.
    Basically not worth looking at, except to make sure it imports dmt instead of omt.
ft/tests/dmt-test2.cc
    New dmt tests.
    You might decide not enough new tests were implemented.
ft/tests/ft-serialize-benchmark.cc
    Minor improvements so that you can take an average over a number of runs.
ft/tests/ft-serialize-test.cc
    Just ported to the changed API.
ft/tests/test-pick-child-to-flush.cc
    The new basement-node headers reduce available memory; reduce the max size of the test appropriately.
ft/wbuf.h
    Added wbuf_nocrc_reserve_literal_bytes(): gives you a pointer to write into the wbuf, but notes that the memory was used.
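
A sketch of how such a reserve call behaves (the buf/ndone field names follow the wbuf usage elsewhere in this diff; this is an assumed shape, the real definition is in ft/wbuf.h):

    // Hand back a pointer to nbytes of buffer space and account for it in
    // ndone, so the caller can fill the bytes in afterwards.
    static inline uint8_t *wbuf_nocrc_reserve_literal_bytes(struct wbuf *wb, uint32_t nbytes) {
        uint8_t *dest = wb->buf + wb->ndone;  // assumes wbuf exposes buf and ndone
        wb->ndone += nbytes;
        return dest;
    }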
util/mempool.cc
    Made mempool allocations aligned to cache lines.
    Minor 'const' changes to help compilation.
    Some utility functions to get/give offsets.
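
Those get/give-offset utilities are used throughout ft/bndata.cc below; usage is symmetric (signatures inferred from the call sites in this diff):

    // Translate a pointer into the mempool into an offset from its base, and back.
    size_t off = toku_mempool_get_offset_from_pointer_and_base(&mp, le_ptr);
    void *le   = toku_mempool_get_pointer_from_base_and_offset(&mp, off);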

@@ -31,6 +31,7 @@ set(FT_SOURCES
   checkpoint
   compress
   dbufio
+  dmt-wrapper
   fifo
   ft
   ft-cachetable-wrappers
628 ft/bndata.cc
@@ -90,46 +90,197 @@ PATENT RIGHTS GRANT:
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

#include <bndata.h>
#include <ft-ops.h>

static uint32_t klpair_size(KLPAIR klpair){
    return sizeof(*klpair) + klpair->keylen + leafentry_memsize(get_le_from_klpair(klpair));
}

static uint32_t klpair_disksize(KLPAIR klpair){
    return sizeof(*klpair) + klpair->keylen + leafentry_disksize(get_le_from_klpair(klpair));
using namespace toku;
uint32_t bn_data::klpair_disksize(const uint32_t klpair_len, const klpair_struct *klpair) const {
    return sizeof(*klpair) + keylen_from_klpair_len(klpair_len) + leafentry_disksize(get_le_from_klpair(klpair));
}

void bn_data::init_zero() {
    toku_mempool_zero(&m_buffer_mempool);
    m_disksize_of_keys = 0;
}

void bn_data::initialize_empty() {
    toku_mempool_zero(&m_buffer_mempool);
    m_buffer.create_no_array();
    init_zero();
    m_buffer.create();
}

void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uint32_t data_size) {
void bn_data::add_key(uint32_t keylen) {
    m_disksize_of_keys += sizeof(keylen) + keylen;
}

void bn_data::add_keys(uint32_t n_keys, uint32_t combined_klpair_len) {
    invariant(n_keys * sizeof(uint32_t) <= combined_klpair_len);
    m_disksize_of_keys += combined_klpair_len;
}

void bn_data::remove_key(uint32_t keylen) {
    m_disksize_of_keys -= sizeof(keylen) + keylen;
}

// Deserialize from format optimized for keys being inlined.
// Currently only supports fixed-length keys.
void bn_data::initialize_from_separate_keys_and_vals(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version UU(),
                                                     uint32_t key_data_size, uint32_t val_data_size, bool all_keys_same_length,
                                                     uint32_t fixed_klpair_length) {
    paranoid_invariant(version >= FT_LAYOUT_VERSION_26); // Support was added @26
    uint32_t ndone_before = rb->ndone;
    init_zero();
    invariant(all_keys_same_length); // Until otherwise supported.
    bytevec keys_src;
    rbuf_literal_bytes(rb, &keys_src, key_data_size);
    //Generate dmt
    this->m_buffer.create_from_sorted_memory_of_fixed_size_elements(
        keys_src, num_entries, key_data_size, fixed_klpair_length);
    toku_mempool_construct(&this->m_buffer_mempool, val_data_size);

    bytevec vals_src;
    rbuf_literal_bytes(rb, &vals_src, val_data_size);

    if (num_entries > 0) {
        void *vals_dest = toku_mempool_malloc(&this->m_buffer_mempool, val_data_size, 1);
        paranoid_invariant_notnull(vals_dest);
        memcpy(vals_dest, vals_src, val_data_size);
    }

    add_keys(num_entries, num_entries * fixed_klpair_length);

    toku_note_deserialized_basement_node(all_keys_same_length);

    invariant(rb->ndone - ndone_before == data_size);
}

static int
wbufwriteleafentry(const void* key, const uint32_t keylen, const LEAFENTRY &le, const uint32_t UU(idx), struct wbuf * const wb) {
    // need to pack the leafentry as it was in versions
    // where the key was integrated into it (< 26)
    uint32_t begin_spot UU() = wb->ndone;
    uint32_t le_disk_size = leafentry_disksize(le);
    wbuf_nocrc_uint8_t(wb, le->type);
    wbuf_nocrc_uint32_t(wb, keylen);
    if (le->type == LE_CLEAN) {
        wbuf_nocrc_uint32_t(wb, le->u.clean.vallen);
        wbuf_nocrc_literal_bytes(wb, key, keylen);
        wbuf_nocrc_literal_bytes(wb, le->u.clean.val, le->u.clean.vallen);
    }
    else {
        paranoid_invariant(le->type == LE_MVCC);
        wbuf_nocrc_uint32_t(wb, le->u.mvcc.num_cxrs);
        wbuf_nocrc_uint8_t(wb, le->u.mvcc.num_pxrs);
        wbuf_nocrc_literal_bytes(wb, key, keylen);
        wbuf_nocrc_literal_bytes(wb, le->u.mvcc.xrs, le_disk_size - (1 + 4 + 1));
    }
    uint32_t end_spot UU() = wb->ndone;
    paranoid_invariant((end_spot - begin_spot) == keylen + sizeof(keylen) + le_disk_size);
    return 0;
}

void bn_data::serialize_to_wbuf(struct wbuf *const wb) {
    prepare_to_serialize();
    serialize_header(wb);
    if (m_buffer.value_length_is_fixed()) {
        serialize_rest(wb);
    } else {
        //
        // iterate over leafentries and place them into the buffer
        //
        iterate<struct wbuf, wbufwriteleafentry>(wb);
    }
}

// If we have fixed-length keys, we prepare the dmt and mempool.
// The mempool is prepared by removing any fragmented space and ordering leafentries in the same order as their keys.
void bn_data::prepare_to_serialize(void) {
    if (m_buffer.value_length_is_fixed()) {
        m_buffer.prepare_for_serialize();
        dmt_compress_kvspace(0, nullptr, true); // Gets it ready for easy serialization.
    }
}

void bn_data::serialize_header(struct wbuf *wb) const {
    bool fixed = m_buffer.value_length_is_fixed();

    //key_data_size
    wbuf_nocrc_uint(wb, m_disksize_of_keys);
    //val_data_size
    wbuf_nocrc_uint(wb, toku_mempool_get_used_size(&m_buffer_mempool));
    //fixed_klpair_length
    wbuf_nocrc_uint(wb, m_buffer.get_fixed_length());
    // all_keys_same_length
    wbuf_nocrc_uint8_t(wb, fixed);
    // keys_vals_separate
    wbuf_nocrc_uint8_t(wb, fixed);
}

void bn_data::serialize_rest(struct wbuf *wb) const {
    //Write keys
    invariant(m_buffer.value_length_is_fixed()); //Assumes prepare_to_serialize was called
    m_buffer.serialize_values(m_disksize_of_keys, wb);

    //Write leafentries
    //Just ran dmt_compress_kvspace so there is no fragmentation and also leafentries are in sorted order.
    paranoid_invariant(toku_mempool_get_frag_size(&m_buffer_mempool) == 0);
    uint32_t val_data_size = toku_mempool_get_used_size(&m_buffer_mempool);
    wbuf_nocrc_literal_bytes(wb, toku_mempool_get_base(&m_buffer_mempool), val_data_size);
}

// Deserialize from rbuf
void bn_data::deserialize_from_rbuf(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version) {
    uint32_t key_data_size = data_size; // overallocate if < version 26 (best guess that is guaranteed not too small)
    uint32_t val_data_size = data_size; // overallocate if < version 26 (best guess that is guaranteed not too small)

    bool all_keys_same_length = false;
    bool keys_vals_separate = false;
    uint32_t fixed_klpair_length = 0;

    // In version 25 and older there is no header. Skip reading header for old version.
    if (version >= FT_LAYOUT_VERSION_26) {
        uint32_t ndone_before = rb->ndone;
        key_data_size = rbuf_int(rb);
        val_data_size = rbuf_int(rb);
        fixed_klpair_length = rbuf_int(rb); // 0 if !all_keys_same_length
        all_keys_same_length = rbuf_char(rb);
        keys_vals_separate = rbuf_char(rb);
        invariant(all_keys_same_length == keys_vals_separate); // Until we support otherwise
        uint32_t header_size = rb->ndone - ndone_before;
        data_size -= header_size;
        invariant(header_size == HEADER_LENGTH);
        if (keys_vals_separate) {
            invariant(fixed_klpair_length >= sizeof(klpair_struct));
            initialize_from_separate_keys_and_vals(num_entries, rb, data_size, version,
                                                   key_data_size, val_data_size, all_keys_same_length,
                                                   fixed_klpair_length);
            return;
        }
    }
    // Version >= 26 and version 25 deserialization are now identical except that <= 25 might allocate too much memory.
    bytevec bytes;
    rbuf_literal_bytes(rb, &bytes, data_size);
    const unsigned char *CAST_FROM_VOIDP(buf, bytes);
    if (data_size == 0) {
        invariant_zero(num_entries);
    }
    KLPAIR *XMALLOC_N(num_entries, array); // create array of pointers to leafentries
    unsigned char *newmem = NULL;
    init_zero();
    klpair_dmt_t::builder dmt_builder;
    dmt_builder.create(num_entries, key_data_size);

    // TODO(leif): clean this up (#149)
    unsigned char *newmem = nullptr;
    // add same wiggle room that toku_mempool_construct would, 25% extra
    uint32_t allocated_bytes = data_size + data_size/4;
    CAST_FROM_VOIDP(newmem, toku_xmalloc(allocated_bytes));
    unsigned char* curr_src_pos = buf;
    uint32_t allocated_bytes_vals = val_data_size + val_data_size/4;
    CAST_FROM_VOIDP(newmem, toku_xmalloc(allocated_bytes_vals));
    const unsigned char* curr_src_pos = buf;
    unsigned char* curr_dest_pos = newmem;
    for (uint32_t i = 0; i < num_entries; i++) {
        KLPAIR curr_kl = (KLPAIR)curr_dest_pos;
        array[i] = curr_kl;

        uint8_t curr_type = curr_src_pos[0];
        curr_src_pos++;
        // first thing we do is lay out the key,
        // to do so, we must extract it from the leafentry
        // and write it in
        uint32_t keylen = 0;
        void* keyp = NULL;
        const void* keyp = nullptr;
        keylen = *(uint32_t *)curr_src_pos;
        curr_src_pos += sizeof(uint32_t);
        uint32_t clean_vallen = 0;
@@ -150,12 +301,10 @@ void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uin
            keyp = curr_src_pos;
            curr_src_pos += keylen;
        }
        // now that we have the keylen and the key, we can copy it
        // into the destination
        *(uint32_t *)curr_dest_pos = keylen;
        curr_dest_pos += sizeof(keylen);
        memcpy(curr_dest_pos, keyp, keylen);
        curr_dest_pos += keylen;
        uint32_t le_offset = curr_dest_pos - newmem;
        dmt_builder.append(klpair_dmtwriter(keylen, le_offset, keyp));
        add_key(keylen);

        // now curr_dest_pos is pointing to where the leafentry should be packed
        curr_dest_pos[0] = curr_type;
        curr_dest_pos++;
@@ -173,31 +322,46 @@ void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uin
            *(uint8_t *)curr_dest_pos = num_pxrs;
            curr_dest_pos += sizeof(num_pxrs);
            // now we need to pack the rest of the data
            uint32_t num_rest_bytes = leafentry_rest_memsize(num_pxrs, num_cxrs, curr_src_pos);
            uint32_t num_rest_bytes = leafentry_rest_memsize(num_pxrs, num_cxrs, const_cast<uint8_t*>(curr_src_pos));
            memcpy(curr_dest_pos, curr_src_pos, num_rest_bytes);
            curr_dest_pos += num_rest_bytes;
            curr_src_pos += num_rest_bytes;
        }
    }
    uint32_t num_bytes_read UU() = (uint32_t)(curr_src_pos - buf);
    paranoid_invariant( num_bytes_read == data_size);
    uint32_t num_bytes_written = curr_dest_pos - newmem;
    paranoid_invariant( num_bytes_written == data_size);
    toku_mempool_init(&m_buffer_mempool, newmem, (size_t)(num_bytes_written), allocated_bytes);
    dmt_builder.build(&this->m_buffer);
    toku_note_deserialized_basement_node(m_buffer.value_length_is_fixed());

    // destroy old omt that was created by toku_create_empty_bn(), so we can create a new one
    m_buffer.destroy();
    m_buffer.create_steal_sorted_array(&array, num_entries, num_entries);
#if TOKU_DEBUG_PARANOID
    uint32_t num_bytes_read = (uint32_t)(curr_src_pos - buf);
    paranoid_invariant( num_bytes_read == data_size);

    uint32_t num_bytes_written = curr_dest_pos - newmem + m_disksize_of_keys;
    paranoid_invariant( num_bytes_written == data_size);
#endif
    toku_mempool_init(&m_buffer_mempool, newmem, (size_t)(curr_dest_pos - newmem), allocated_bytes_vals);

    paranoid_invariant(get_disk_size() == data_size);
    // Versions older than 26 might have allocated too much memory. Try to shrink the mempool now that we
    // know how much memory we need.
    if (version < FT_LAYOUT_VERSION_26) {
        // Unnecessary after version 26
        // Reallocate smaller mempool to save memory
        invariant_zero(toku_mempool_get_frag_size(&m_buffer_mempool));
        toku_mempool_realloc_larger(&m_buffer_mempool, toku_mempool_get_used_size(&m_buffer_mempool));
    }
}

uint64_t bn_data::get_memory_size() {
    uint64_t retval = 0;
    //TODO: Maybe ask for memory_size instead of mempool_footprint (either this todo or the next)
    // include fragmentation overhead but do not include space in the
    // mempool that has not yet been allocated for leaf entries
    size_t poolsize = toku_mempool_footprint(&m_buffer_mempool);
    invariant(poolsize >= get_disk_size());
    retval += poolsize;
    // This one includes not-yet-allocated for nodes (just like old constant-key omt)
    //TODO: Maybe ask for mempool_footprint instead of memory_size.
    retval += m_buffer.memory_size();
    invariant(retval >= get_disk_size());
    return retval;
}
@@ -205,44 +369,53 @@ void bn_data::delete_leafentry (
    uint32_t idx,
    uint32_t keylen,
    uint32_t old_le_size
    )
    )
{
    remove_key(keylen);
    m_buffer.delete_at(idx);
    toku_mempool_mfree(&m_buffer_mempool, 0, old_le_size + keylen + sizeof(keylen)); // Must pass 0, since le is no good any more.
    toku_mempool_mfree(&m_buffer_mempool, nullptr, old_le_size);
}

/* mempool support */

struct omt_compressor_state {
struct dmt_compressor_state {
    struct mempool *new_kvspace;
    KLPAIR *newvals;
    class bn_data *bd;
};

static int move_it (const KLPAIR &klpair, const uint32_t idx, struct omt_compressor_state * const oc) {
    uint32_t size = klpair_size(klpair);
    KLPAIR CAST_FROM_VOIDP(newdata, toku_mempool_malloc(oc->new_kvspace, size, 1));
static int move_it (const uint32_t, klpair_struct *klpair, const uint32_t idx UU(), struct dmt_compressor_state * const oc) {
    LEAFENTRY old_le = oc->bd->get_le_from_klpair(klpair);
    uint32_t size = leafentry_memsize(old_le);
    void* newdata = toku_mempool_malloc(oc->new_kvspace, size, 1);
    paranoid_invariant_notnull(newdata); // we do this on a fresh mempool, so nothing bad should happen
    memcpy(newdata, klpair, size);
    oc->newvals[idx] = newdata;
    memcpy(newdata, old_le, size);
    klpair->le_offset = toku_mempool_get_offset_from_pointer_and_base(oc->new_kvspace, newdata);
    return 0;
}

// Compress things, and grow the mempool if needed.
void bn_data::omt_compress_kvspace(size_t added_size, void **maybe_free) {
    uint32_t total_size_needed = toku_mempool_get_used_space(&m_buffer_mempool) + added_size;
// Compress things, and grow or shrink the mempool if needed.
// May (always if force_compress) have a side effect of putting contents of mempool in sorted order.
void bn_data::dmt_compress_kvspace(size_t added_size, void **maybe_free, bool force_compress) {
    uint32_t total_size_needed = toku_mempool_get_used_size(&m_buffer_mempool) + added_size;
    // set the new mempool size to be twice of the space we actually need.
    // On top of the 25% that is padded within toku_mempool_construct (which we
    // should consider getting rid of), that should be good enough.

    // If there is no fragmentation, e.g. in serial inserts, we can just increase the size of the mempool
    // with a realloc. (force_compress means we NEED the side effect that all contents are put in sorted order).
    if (!force_compress && toku_mempool_get_frag_size(&m_buffer_mempool) == 0) {
        // Skip iterate, just realloc.
        toku_mempool_realloc_larger(&m_buffer_mempool, 2*total_size_needed);
        if (maybe_free) {
            *maybe_free = nullptr;
        }
        return;
    }
    struct mempool new_kvspace;
    toku_mempool_construct(&new_kvspace, 2*total_size_needed);
    uint32_t numvals = omt_size();
    KLPAIR *XMALLOC_N(numvals, newvals);
    struct omt_compressor_state oc = { &new_kvspace, newvals };

    m_buffer.iterate_on_range< decltype(oc), move_it >(0, omt_size(), &oc);

    m_buffer.destroy();
    m_buffer.create_steal_sorted_array(&newvals, numvals, numvals);
    size_t requested_size = force_compress ? total_size_needed : 2*total_size_needed;
    toku_mempool_construct(&new_kvspace, requested_size);
    struct dmt_compressor_state oc = { &new_kvspace, this};
    m_buffer.iterate_ptr< decltype(oc), move_it >(&oc);

    if (maybe_free) {
        *maybe_free = m_buffer_mempool.base;
@@ -254,120 +427,205 @@ void bn_data::omt_compress_kvspace(size_t added_size, void **maybe_free) {

// Effect: Allocate a new object of size SIZE in MP. If MP runs out of space, allocate new a new mempool space, and copy all the items
// from the OMT (which items refer to items in the old mempool) into the new mempool.
// If MAYBE_FREE is NULL then free the old mempool's space.
// If MAYBE_FREE is nullptr then free the old mempool's space.
// Otherwise, store the old mempool's space in maybe_free.
KLPAIR bn_data::mempool_malloc_from_omt(size_t size, void **maybe_free) {
LEAFENTRY bn_data::mempool_malloc_and_update_dmt(size_t size, void **maybe_free) {
    void *v = toku_mempool_malloc(&m_buffer_mempool, size, 1);
    if (v == NULL) {
        omt_compress_kvspace(size, maybe_free);
    if (v == nullptr) {
        dmt_compress_kvspace(size, maybe_free, false);
        v = toku_mempool_malloc(&m_buffer_mempool, size, 1);
        paranoid_invariant_notnull(v);
    }
    return (KLPAIR)v;
    return (LEAFENTRY)v;
}

//TODO: probably not free the "maybe_free" right away?
void bn_data::get_space_for_overwrite(
    uint32_t idx,
    const void* keyp,
    uint32_t keylen,
    const void* keyp UU(),
    uint32_t keylen UU(),
    uint32_t old_le_size,
    uint32_t new_size,
    LEAFENTRY* new_le_space
    LEAFENTRY* new_le_space,
    void **const maybe_free
    )
{
    void* maybe_free = nullptr;
    uint32_t size_alloc = new_size + keylen + sizeof(keylen);
    KLPAIR new_kl = mempool_malloc_from_omt(
        size_alloc,
        &maybe_free
        );
    uint32_t size_freed = old_le_size + keylen + sizeof(keylen);
    toku_mempool_mfree(&m_buffer_mempool, nullptr, size_freed); // Must pass nullptr, since le is no good any more.
    new_kl->keylen = keylen;
    memcpy(new_kl->key_le, keyp, keylen);
    m_buffer.set_at(new_kl, idx);
    *new_le_space = get_le_from_klpair(new_kl);
    // free at end, so that the keyp and keylen
    // passed in is still valid
    if (maybe_free) {
        toku_free(maybe_free);
    }
    *maybe_free = nullptr;
    LEAFENTRY new_le = mempool_malloc_and_update_dmt(new_size, maybe_free);
    toku_mempool_mfree(&m_buffer_mempool, nullptr, old_le_size);
    klpair_struct* klp = nullptr;
    uint32_t klpair_len;
    int r = m_buffer.fetch(idx, &klpair_len, &klp);
    invariant_zero(r);
    paranoid_invariant(klp!=nullptr);
    // Key never changes.
    paranoid_invariant(keylen_from_klpair_len(klpair_len) == keylen);
    paranoid_invariant(!memcmp(klp->key, keyp, keylen));

    size_t new_le_offset = toku_mempool_get_offset_from_pointer_and_base(&this->m_buffer_mempool, new_le);
    paranoid_invariant(new_le_offset <= UINT32_MAX - new_size); // Not using > 4GB
    klp->le_offset = new_le_offset;

    paranoid_invariant(new_le == get_le_from_klpair(klp));
    *new_le_space = new_le;
}

//TODO: probably not free the "maybe_free" right away?
void bn_data::get_space_for_insert(
    uint32_t idx,
    const void* keyp,
    uint32_t keylen,
    size_t size,
    LEAFENTRY* new_le_space
    LEAFENTRY* new_le_space,
    void **const maybe_free
    )
{
    void* maybe_free = nullptr;
    uint32_t size_alloc = size + keylen + sizeof(keylen);
    KLPAIR new_kl = mempool_malloc_from_omt(
        size_alloc,
        &maybe_free
        );
    new_kl->keylen = keylen;
    memcpy(new_kl->key_le, keyp, keylen);
    m_buffer.insert_at(new_kl, idx);
    *new_le_space = get_le_from_klpair(new_kl);
    // free at end, so that the keyp and keylen
    // passed in is still valid (you never know if
    // it was part of the old mempool, this is just
    // safer).
    if (maybe_free) {
        toku_free(maybe_free);
    }
    add_key(keylen);

    *maybe_free = nullptr;
    LEAFENTRY new_le = mempool_malloc_and_update_dmt(size, maybe_free);
    size_t new_le_offset = toku_mempool_get_offset_from_pointer_and_base(&this->m_buffer_mempool, new_le);

    klpair_dmtwriter kl(keylen, new_le_offset, keyp);
    m_buffer.insert_at(kl, idx);

    *new_le_space = new_le;
}

void bn_data::move_leafentries_to(
    BN_DATA dest_bd,
    uint32_t lbi, //lower bound inclusive
    uint32_t ube //upper bound exclusive
class split_klpairs_extra {
    bn_data *const m_left_bn;
    bn_data *const m_right_bn;
    klpair_dmt_t::builder *const m_left_builder;
    klpair_dmt_t::builder *const m_right_builder;
    struct mempool *const m_left_dest_mp;
    uint32_t m_split_at;

    struct mempool *left_dest_mp(void) const { return m_left_dest_mp; }
    struct mempool *right_dest_mp(void) const { return &m_right_bn->m_buffer_mempool; }

    void copy_klpair(const uint32_t klpair_len, const klpair_struct &klpair,
                     klpair_dmt_t::builder *const builder,
                     struct mempool *const dest_mp,
                     bn_data *const bn) {
        LEAFENTRY old_le = m_left_bn->get_le_from_klpair(&klpair);
        size_t le_size = leafentry_memsize(old_le);

        void *new_le = toku_mempool_malloc(dest_mp, le_size, 1);
        paranoid_invariant_notnull(new_le);
        memcpy(new_le, old_le, le_size);
        size_t le_offset = toku_mempool_get_offset_from_pointer_and_base(dest_mp, new_le);
        size_t keylen = keylen_from_klpair_len(klpair_len);
        builder->append(klpair_dmtwriter(keylen, le_offset, klpair.key));

        bn->add_key(keylen);
    }

    int move_leafentry(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx) {
        m_left_bn->remove_key(keylen_from_klpair_len(klpair_len));
        if (idx < m_split_at) {
            copy_klpair(klpair_len, klpair, m_left_builder, left_dest_mp(), m_left_bn);
        } else {
            copy_klpair(klpair_len, klpair, m_right_builder, right_dest_mp(), m_right_bn);
        }
        return 0;
    }

public:
    split_klpairs_extra(bn_data *const left_bn, bn_data *const right_bn,
                        klpair_dmt_t::builder *const left_builder,
                        klpair_dmt_t::builder *const right_builder,
                        struct mempool *const left_new_mp,
                        uint32_t split_at)
        : m_left_bn(left_bn),
          m_right_bn(right_bn),
          m_left_builder(left_builder),
          m_right_builder(right_builder),
          m_left_dest_mp(left_new_mp),
          m_split_at(split_at) {}
    static int cb(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx, split_klpairs_extra *const thisp) {
        return thisp->move_leafentry(klpair_len, klpair, idx);
    }
};

void bn_data::split_klpairs(
    bn_data* right_bd,
    uint32_t split_at //lower bound inclusive for right_bd
    )
//Effect: move leafentries in the range [lbi, ube) from this to src_omt to newly created dest_omt
{
    paranoid_invariant(lbi < ube);
    paranoid_invariant(ube <= omt_size());
    KLPAIR *XMALLOC_N(ube-lbi, newklpointers); // create new omt
    // We use move_leafentries_to during a split, and the split algorithm should never call this
    // if it's splitting on a boundary, so there must be some leafentries in the range to move.
    paranoid_invariant(split_at < num_klpairs());

    size_t mpsize = toku_mempool_get_used_space(&m_buffer_mempool); // overkill, but safe
    struct mempool *dest_mp = &dest_bd->m_buffer_mempool;
    struct mempool *src_mp = &m_buffer_mempool;
    toku_mempool_construct(dest_mp, mpsize);
    right_bd->init_zero();

    uint32_t i = 0;
    for (i = lbi; i < ube; i++) {
        KLPAIR curr_kl;
        m_buffer.fetch(i, &curr_kl);
    size_t mpsize = toku_mempool_get_used_size(&m_buffer_mempool); // overkill, but safe

        size_t kl_size = klpair_size(curr_kl);
        KLPAIR new_kl = NULL;
        CAST_FROM_VOIDP(new_kl, toku_mempool_malloc(dest_mp, kl_size, 1));
        memcpy(new_kl, curr_kl, kl_size);
        newklpointers[i-lbi] = new_kl;
        toku_mempool_mfree(src_mp, curr_kl, kl_size);
    }
    struct mempool new_left_mp;
    toku_mempool_construct(&new_left_mp, mpsize);

    dest_bd->m_buffer.create_steal_sorted_array(&newklpointers, ube-lbi, ube-lbi);
    // now remove the elements from src_omt
    for (i=ube-1; i >= lbi; i--) {
        m_buffer.delete_at(i);
    }
    struct mempool *right_mp = &right_bd->m_buffer_mempool;
    toku_mempool_construct(right_mp, mpsize);

    klpair_dmt_t::builder left_dmt_builder;
    left_dmt_builder.create(split_at, m_disksize_of_keys); // overkill, but safe (builder will realloc at the end)

    klpair_dmt_t::builder right_dmt_builder;
    right_dmt_builder.create(num_klpairs() - split_at, m_disksize_of_keys); // overkill, but safe (builder will realloc at the end)

    split_klpairs_extra extra(this, right_bd, &left_dmt_builder, &right_dmt_builder, &new_left_mp, split_at);

    int r = m_buffer.iterate<split_klpairs_extra, split_klpairs_extra::cb>(&extra);
    invariant_zero(r);

    m_buffer.destroy();
    toku_mempool_destroy(&m_buffer_mempool);

    m_buffer_mempool = new_left_mp;

    left_dmt_builder.build(&m_buffer);
    right_dmt_builder.build(&right_bd->m_buffer);

    // Potentially shrink memory pool for destination.
    // We overallocated ("overkill") above
    struct mempool *const left_mp = &m_buffer_mempool;
    paranoid_invariant_zero(toku_mempool_get_frag_size(left_mp));
    toku_mempool_realloc_larger(left_mp, toku_mempool_get_used_size(left_mp));
    paranoid_invariant_zero(toku_mempool_get_frag_size(right_mp));
    toku_mempool_realloc_larger(right_mp, toku_mempool_get_used_size(right_mp));
}

uint64_t bn_data::get_disk_size() {
    return toku_mempool_get_used_space(&m_buffer_mempool);
    return m_disksize_of_keys +
           toku_mempool_get_used_size(&m_buffer_mempool);
}

struct verify_le_in_mempool_state {
    size_t offset_limit;
    class bn_data *bd;
};

static int verify_le_in_mempool (const uint32_t, klpair_struct *klpair, const uint32_t idx UU(), struct verify_le_in_mempool_state * const state) {
    invariant(klpair->le_offset < state->offset_limit);

    LEAFENTRY le = state->bd->get_le_from_klpair(klpair);
    uint32_t size = leafentry_memsize(le);

    size_t end_offset = klpair->le_offset+size;

    invariant(end_offset <= state->offset_limit);
    return 0;
}

//This is a debug-only (paranoid) verification.
//Verifies the dmt is valid, and all leafentries are entirely in the mempool's memory.
void bn_data::verify_mempool(void) {
    // TODO: implement something
    //Verify the dmt itself <- paranoid and slow
    m_buffer.verify();

    verify_le_in_mempool_state state = { .offset_limit = toku_mempool_get_offset_limit(&m_buffer_mempool), .bd = this };
    //Verify every leafentry pointed to by the keys in the dmt are fully inside the mempool
    m_buffer.iterate_ptr< decltype(state), verify_le_in_mempool >(&state);
}

uint32_t bn_data::omt_size(void) const {
uint32_t bn_data::num_klpairs(void) const {
    return m_buffer.size();
}
@@ -375,40 +633,54 @@ void bn_data::destroy(void) {
    // The buffer may have been freed already, in some cases.
    m_buffer.destroy();
    toku_mempool_destroy(&m_buffer_mempool);
    m_disksize_of_keys = 0;
}

//TODO: Splitting key/val requires changing this
void bn_data::replace_contents_with_clone_of_sorted_array(
void bn_data::set_contents_as_clone_of_sorted_array(
    uint32_t num_les,
    const void** old_key_ptrs,
    uint32_t* old_keylens,
    LEAFENTRY* old_les,
    size_t *le_sizes,
    size_t mempool_size
    )
    LEAFENTRY* old_les,
    size_t *le_sizes,
    size_t total_key_size,
    size_t total_le_size
    )
{
    toku_mempool_construct(&m_buffer_mempool, mempool_size);
    KLPAIR *XMALLOC_N(num_les, le_array);
    for (uint32_t idx = 0; idx < num_les; idx++) {
        KLPAIR new_kl = (KLPAIR)toku_mempool_malloc(
            &m_buffer_mempool,
            le_sizes[idx] + old_keylens[idx] + sizeof(uint32_t),
            1); // point to new location
        new_kl->keylen = old_keylens[idx];
        memcpy(new_kl->key_le, old_key_ptrs[idx], new_kl->keylen);
        memcpy(get_le_from_klpair(new_kl), old_les[idx], le_sizes[idx]);
        CAST_FROM_VOIDP(le_array[idx], new_kl);
    }
    //TODO: Splitting key/val requires changing this; keys are stored in old omt.. cannot delete it yet?
    //Enforce "just created" invariant.
    paranoid_invariant_zero(m_disksize_of_keys);
    paranoid_invariant_zero(num_klpairs());
    paranoid_invariant_null(toku_mempool_get_base(&m_buffer_mempool));
    paranoid_invariant_zero(toku_mempool_get_size(&m_buffer_mempool));

    toku_mempool_construct(&m_buffer_mempool, total_le_size);
    m_buffer.destroy();
    m_buffer.create_steal_sorted_array(&le_array, num_les, num_les);
    m_disksize_of_keys = 0;

    klpair_dmt_t::builder dmt_builder;
    dmt_builder.create(num_les, total_key_size);

    for (uint32_t idx = 0; idx < num_les; idx++) {
        void* new_le = toku_mempool_malloc(&m_buffer_mempool, le_sizes[idx], 1);
        paranoid_invariant_notnull(new_le);
        memcpy(new_le, old_les[idx], le_sizes[idx]);
        size_t le_offset = toku_mempool_get_offset_from_pointer_and_base(&m_buffer_mempool, new_le);
        dmt_builder.append(klpair_dmtwriter(old_keylens[idx], le_offset, old_key_ptrs[idx]));
        add_key(old_keylens[idx]);
    }
    dmt_builder.build(&this->m_buffer);
}

LEAFENTRY bn_data::get_le_from_klpair(const klpair_struct *klpair) const {
    void * ptr = toku_mempool_get_pointer_from_base_and_offset(&this->m_buffer_mempool, klpair->le_offset);
    LEAFENTRY CAST_FROM_VOIDP(le, ptr);
    return le;
}

// get info about a single leafentry by index
int bn_data::fetch_le(uint32_t idx, LEAFENTRY *le) {
    KLPAIR klpair = NULL;
    int r = m_buffer.fetch(idx, &klpair);
    klpair_struct* klpair = nullptr;
    int r = m_buffer.fetch(idx, nullptr, &klpair);
    if (r == 0) {
        *le = get_le_from_klpair(klpair);
    }
@@ -416,59 +688,41 @@ int bn_data::fetch_le(uint32_t idx, LEAFENTRY *le) {
}

int bn_data::fetch_klpair(uint32_t idx, LEAFENTRY *le, uint32_t *len, void** key) {
    KLPAIR klpair = NULL;
    int r = m_buffer.fetch(idx, &klpair);
    klpair_struct* klpair = nullptr;
    uint32_t klpair_len;
    int r = m_buffer.fetch(idx, &klpair_len, &klpair);
    if (r == 0) {
        *len = klpair->keylen;
        *key = klpair->key_le;
        *len = keylen_from_klpair_len(klpair_len);
        *key = klpair->key;
        *le = get_le_from_klpair(klpair);
    }
    return r;
}

int bn_data::fetch_klpair_disksize(uint32_t idx, size_t *size) {
    KLPAIR klpair = NULL;
    int r = m_buffer.fetch(idx, &klpair);
    klpair_struct* klpair = nullptr;
    uint32_t klpair_len;
    int r = m_buffer.fetch(idx, &klpair_len, &klpair);
    if (r == 0) {
        *size = klpair_disksize(klpair);
        *size = klpair_disksize(klpair_len, klpair);
    }
    return r;
}

int bn_data::fetch_le_key_and_len(uint32_t idx, uint32_t *len, void** key) {
    KLPAIR klpair = NULL;
    int r = m_buffer.fetch(idx, &klpair);
int bn_data::fetch_key_and_len(uint32_t idx, uint32_t *len, void** key) {
    klpair_struct* klpair = nullptr;
    uint32_t klpair_len;
    int r = m_buffer.fetch(idx, &klpair_len, &klpair);
    if (r == 0) {
        *len = klpair->keylen;
        *key = klpair->key_le;
        *len = keylen_from_klpair_len(klpair_len);
        *key = klpair->key;
    }
    return r;
}

struct mp_pair {
    void* orig_base;
    void* new_base;
    klpair_omt_t* omt;
};

static int fix_mp_offset(const KLPAIR &klpair, const uint32_t idx, struct mp_pair * const p) {
    char* old_value = (char *) klpair;
    char *new_value = old_value - (char *)p->orig_base + (char *)p->new_base;
    p->omt->set_at((KLPAIR)new_value, idx);
    return 0;
}

void bn_data::clone(bn_data* orig_bn_data) {
    toku_mempool_clone(&orig_bn_data->m_buffer_mempool, &m_buffer_mempool);
    m_buffer.clone(orig_bn_data->m_buffer);
    struct mp_pair p;
    p.orig_base = toku_mempool_get_base(&orig_bn_data->m_buffer_mempool);
    p.new_base = toku_mempool_get_base(&m_buffer_mempool);
    p.omt = &m_buffer;

    int r = m_buffer.iterate_on_range<decltype(p), fix_mp_offset>(0, omt_size(), &p);
    invariant_zero(r);
    this->m_disksize_of_keys = orig_bn_data->m_disksize_of_keys;
}
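
To summarize the layout change in ft/bndata.{cc,h}: keys now live inline in the dmt, while leafentries stay in the mempool and are referenced by offset (a sketch assembled from klpair_struct and get_le_from_klpair in this diff):

    // Old klpair: keylen, then key and leafentry packed back-to-back.
    //   struct klpair_struct { uint32_t keylen; uint8_t key_le[0]; };
    // New klpair: key inlined in the dmt; the leafentry is found via an
    // offset into the basement node's mempool.
    //   struct klpair_struct { uint32_t le_offset; uint8_t key[0]; };
    //   LEAFENTRY le = (LEAFENTRY)((char *)mempool_base + klpair->le_offset);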
292 ft/bndata.h
@ -91,166 +91,296 @@ PATENT RIGHTS GRANT:
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <util/omt.h>
|
||||
#include "leafentry.h"
|
||||
#include <util/mempool.h>
|
||||
#include "wbuf.h"
|
||||
#include <util/dmt.h>
|
||||
#include "leafentry.h"
|
||||
|
||||
#if 0 //for implementation
|
||||
static int
|
||||
UU() verify_in_mempool(OMTVALUE lev, uint32_t UU(idx), void *mpv)
|
||||
{
|
||||
LEAFENTRY CAST_FROM_VOIDP(le, lev);
|
||||
struct mempool *CAST_FROM_VOIDP(mp, mpv);
|
||||
int r = toku_mempool_inrange(mp, le, leafentry_memsize(le));
|
||||
lazy_assert(r);
|
||||
return 0;
|
||||
}
|
||||
toku_omt_iterate(bn->buffer, verify_in_mempool, &bn->buffer_mempool);
|
||||
|
||||
#endif
|
||||
|
||||
// Key/leafentry pair stored in a dmt. The key is inlined, the offset (in leafentry mempool) is stored for the leafentry.
|
||||
struct klpair_struct {
|
||||
uint32_t keylen;
|
||||
uint8_t key_le[0]; // key, followed by le
|
||||
uint32_t le_offset; //Offset of leafentry (in leafentry mempool)
|
||||
uint8_t key[0]; // key, followed by le
|
||||
};
|
||||
|
||||
typedef struct klpair_struct *KLPAIR;
|
||||
|
||||
static inline LEAFENTRY get_le_from_klpair(KLPAIR klpair){
|
||||
uint32_t keylen = klpair->keylen;
|
||||
LEAFENTRY le = (LEAFENTRY)(klpair->key_le + keylen);
|
||||
return le;
|
||||
static constexpr uint32_t keylen_from_klpair_len(const uint32_t klpair_len) {
|
||||
return klpair_len - __builtin_offsetof(klpair_struct, key);
|
||||
}
|
||||
|
||||
template<typename omtcmp_t,
|
||||
int (*h)(const DBT &, const omtcmp_t &)>
|
||||
static int wrappy_fun_find(const KLPAIR &klpair, const omtcmp_t &extra) {
|
||||
//TODO: kill this function when we split, and/or use toku_fill_dbt
|
||||
|
||||
static_assert(__builtin_offsetof(klpair_struct, key) == 1*sizeof(uint32_t), "klpair alignment issues");
|
||||
static_assert(__builtin_offsetof(klpair_struct, key) == sizeof(klpair_struct), "klpair size issues");
|
||||
|
||||
// A wrapper for the heaviside function provided to dmt->find*.
|
||||
// Needed because the heaviside functions provided to bndata do not know about the internal types.
|
||||
// Alternative to this wrapper is to expose accessor functions and rewrite all the external heaviside functions.
|
||||
template<typename dmtcmp_t,
|
||||
int (*h)(const DBT &, const dmtcmp_t &)>
|
||||
static int klpair_find_wrapper(const uint32_t klpair_len, const klpair_struct &klpair, const dmtcmp_t &extra) {
|
||||
DBT kdbt;
|
||||
kdbt.data = klpair->key_le;
|
||||
kdbt.size = klpair->keylen;
|
||||
kdbt.data = const_cast<void*>(reinterpret_cast<const void*>(klpair.key));
|
||||
kdbt.size = keylen_from_klpair_len(klpair_len);
|
||||
return h(kdbt, extra);
|
||||
}
|
||||
|
||||
template<typename inner_iterate_extra_t>
|
||||
struct klpair_iterate_extra {
|
||||
public:
|
||||
inner_iterate_extra_t *inner;
|
||||
const class bn_data * bd;
|
||||
};
|
||||
|
||||
// A wrapper for the high-order function provided to dmt->iterate*
|
||||
// Needed because the heaviside functions provided to bndata do not know about the internal types.
|
||||
// Alternative to this wrapper is to expose accessor functions and rewrite all the external heaviside functions.
|
||||
template<typename iterate_extra_t,
|
||||
int (*h)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t idx, iterate_extra_t *const)>
|
||||
static int wrappy_fun_iterate(const KLPAIR &klpair, const uint32_t idx, iterate_extra_t *const extra) {
|
||||
uint32_t keylen = klpair->keylen;
|
||||
void* key = klpair->key_le;
|
||||
LEAFENTRY le = get_le_from_klpair(klpair);
|
||||
return h(key, keylen, le, idx, extra);
|
||||
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t idx, iterate_extra_t *const)>
|
||||
static int klpair_iterate_wrapper(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx, klpair_iterate_extra<iterate_extra_t> *const extra) {
|
||||
const void* key = &klpair.key;
|
||||
LEAFENTRY le = extra->bd->get_le_from_klpair(&klpair);
|
||||
return f(key, keylen_from_klpair_len(klpair_len), le, idx, extra->inner);
|
||||
}
|
||||
|
||||
typedef toku::omt<KLPAIR> klpair_omt_t;
|
||||
|
||||
namespace toku {
|
||||
// dmt writer for klpair_struct
|
||||
class klpair_dmtwriter {
|
||||
public:
|
||||
// Return the size needed for the klpair_struct that this dmtwriter represents
|
||||
size_t get_size(void) const {
|
||||
return sizeof(klpair_struct) + this->keylen;
|
||||
}
|
||||
// Write the klpair_struct this dmtwriter represents to a destination
|
||||
void write_to(klpair_struct *const dest) const {
|
||||
dest->le_offset = this->le_offset;
|
||||
memcpy(dest->key, this->keyp, this->keylen);
|
||||
}
|
||||
|
||||
klpair_dmtwriter(uint32_t _keylen, uint32_t _le_offset, const void* _keyp)
|
||||
: keylen(_keylen), le_offset(_le_offset), keyp(_keyp) {}
|
||||
klpair_dmtwriter(const uint32_t klpair_len, klpair_struct *const src)
|
||||
: keylen(keylen_from_klpair_len(klpair_len)), le_offset(src->le_offset), keyp(src->key) {}
|
||||
private:
|
||||
const uint32_t keylen;
|
||||
const uint32_t le_offset;
|
||||
const void* keyp;
|
||||
};
|
||||
}
|
||||
|
||||
typedef toku::dmt<klpair_struct, klpair_struct*, toku::klpair_dmtwriter> klpair_dmt_t;
|
||||
// This class stores the data associated with a basement node
|
||||
class bn_data {
|
||||
public:
|
||||
// Initialize an empty bn_data _without_ a dmt backing.
|
||||
// Externally only used for deserialization.
|
||||
void init_zero(void);
|
||||
|
||||
// Initialize an empty bn_data _with_ a dmt
|
||||
void initialize_empty(void);
|
||||
void initialize_from_data(uint32_t num_entries, unsigned char *buf, uint32_t data_size);
|
||||
// globals
|
||||
|
||||
// Deserialize a bn_data from rbuf.
|
||||
// This is the entry point for deserialization.
|
||||
void deserialize_from_rbuf(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version);
|
||||
|
||||
// Retrieve the memory footprint of this basement node.
|
||||
// May over or under count: see Tokutek/ft-index#136
|
||||
// Also see dmt's implementation.
|
||||
uint64_t get_memory_size(void);
|
||||
|
||||
// Get the serialized size of this basement node.
|
||||
uint64_t get_disk_size(void);
|
||||
|
||||
// Perform (paranoid) verification that all leafentries are fully contained within the mempool
|
||||
void verify_mempool(void);
|
||||
|
||||
// Interact with "omt"
|
||||
uint32_t omt_size(void) const;
|
||||
// size() of key dmt
|
||||
uint32_t num_klpairs(void) const;
|
||||
|
||||
// iterate() on key dmt (and associated leafentries)
|
||||
template<typename iterate_extra_t,
|
||||
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t, iterate_extra_t *const)>
|
||||
int omt_iterate(iterate_extra_t *const iterate_extra) const {
|
||||
return omt_iterate_on_range<iterate_extra_t, f>(0, omt_size(), iterate_extra);
|
||||
int iterate(iterate_extra_t *const iterate_extra) const {
|
||||
return iterate_on_range<iterate_extra_t, f>(0, num_klpairs(), iterate_extra);
|
||||
}
|
||||
|
||||
// iterate_on_range() on key dmt (and associated leafentries)
|
||||
template<typename iterate_extra_t,
|
||||
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t, iterate_extra_t *const)>
|
||||
int omt_iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
|
||||
return m_buffer.iterate_on_range< iterate_extra_t, wrappy_fun_iterate<iterate_extra_t, f> >(left, right, iterate_extra);
|
||||
int iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
|
||||
klpair_iterate_extra<iterate_extra_t> klpair_extra = { iterate_extra, this };
|
||||
return m_buffer.iterate_on_range< klpair_iterate_extra<iterate_extra_t>, klpair_iterate_wrapper<iterate_extra_t, f> >(left, right, &klpair_extra);
|
||||
}
|
||||
|
||||
template<typename omtcmp_t,
|
||||
int (*h)(const DBT &, const omtcmp_t &)>
|
||||
int find_zero(const omtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
|
||||
KLPAIR klpair = NULL;
|
||||
int r = m_buffer.find_zero< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, &klpair, idxp);
|
||||
// find_zero() on key dmt
|
||||
template<typename dmtcmp_t,
|
||||
int (*h)(const DBT &, const dmtcmp_t &)>
|
||||
int find_zero(const dmtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
|
||||
klpair_struct* klpair = nullptr;
|
||||
uint32_t klpair_len;
|
||||
int r = m_buffer.find_zero< dmtcmp_t, klpair_find_wrapper<dmtcmp_t, h> >(extra, &klpair_len, &klpair, idxp);
|
||||
if (r == 0) {
|
||||
if (value) {
|
||||
*value = get_le_from_klpair(klpair);
|
||||
}
|
||||
if (key) {
|
||||
paranoid_invariant(keylen != NULL);
|
||||
*key = klpair->key_le;
|
||||
*keylen = klpair->keylen;
|
||||
paranoid_invariant_notnull(keylen);
|
||||
*key = klpair->key;
|
||||
*keylen = keylen_from_klpair_len(klpair_len);
|
||||
}
|
||||
else {
|
||||
paranoid_invariant(keylen == NULL);
|
||||
paranoid_invariant_null(keylen);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename omtcmp_t,
|
||||
int (*h)(const DBT &, const omtcmp_t &)>
|
||||
int find(const omtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
|
||||
KLPAIR klpair = NULL;
|
||||
int r = m_buffer.find< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, direction, &klpair, idxp);
|
||||
// find() on key dmt (and associated leafentries)
|
||||
template<typename dmtcmp_t,
|
||||
int (*h)(const DBT &, const dmtcmp_t &)>
|
||||
int find(const dmtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
|
||||
klpair_struct* klpair = nullptr;
|
||||
uint32_t klpair_len;
|
||||
int r = m_buffer.find< dmtcmp_t, klpair_find_wrapper<dmtcmp_t, h> >(extra, direction, &klpair_len, &klpair, idxp);
|
||||
if (r == 0) {
|
||||
if (value) {
|
||||
*value = get_le_from_klpair(klpair);
|
||||
}
|
||||
if (key) {
|
||||
paranoid_invariant(keylen != NULL);
|
||||
*key = klpair->key_le;
|
||||
*keylen = klpair->keylen;
|
||||
paranoid_invariant_notnull(keylen);
|
||||
*key = klpair->key;
|
||||
*keylen = keylen_from_klpair_len(klpair_len);
|
||||
}
|
||||
else {
|
||||
paranoid_invariant(keylen == NULL);
|
||||
paranoid_invariant_null(keylen);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// get info about a single leafentry by index
|
||||
// Fetch leafentry by index
|
||||
__attribute__((__nonnull__))
|
||||
int fetch_le(uint32_t idx, LEAFENTRY *le);
|
||||
// Fetch (leafentry, key, keylen) by index
|
||||
__attribute__((__nonnull__))
|
||||
int fetch_klpair(uint32_t idx, LEAFENTRY *le, uint32_t *len, void** key);
|
||||
// Fetch (serialized size of leafentry, key, and keylen) by index
|
||||
__attribute__((__nonnull__))
|
||||
int fetch_klpair_disksize(uint32_t idx, size_t *size);
|
||||
int fetch_le_key_and_len(uint32_t idx, uint32_t *len, void** key);
|
||||
// Fetch (key, keylen) by index
|
||||
__attribute__((__nonnull__))
|
||||
int fetch_key_and_len(uint32_t idx, uint32_t *len, void** key);
|
||||
|
||||
// Interact with another bn_data
|
||||
void move_leafentries_to(BN_DATA dest_bd,
|
||||
uint32_t lbi, //lower bound inclusive
|
||||
uint32_t ube //upper bound exclusive
|
||||
);
|
||||
// Move leafentries (and associated key/keylens) from this basement node to dest_bd
|
||||
// Moves indexes [lbi-ube)
|
||||
__attribute__((__nonnull__))
|
||||
void split_klpairs(bn_data* dest_bd, uint32_t first_index_for_dest);
|
||||
|
||||
// Destroy this basement node and free memory.
|
||||
void destroy(void);
|
||||
|
||||
// Replaces contents, into brand new mempool.
|
||||
// Returns old mempool base, expects caller to free it.
|
||||
void replace_contents_with_clone_of_sorted_array(
|
||||
// Uses sorted array as input for this basement node.
|
||||
// Expects this to be a basement node just initialized with initialize_empty()
|
||||
void set_contents_as_clone_of_sorted_array(
|
||||
uint32_t num_les,
|
||||
const void** old_key_ptrs,
|
||||
uint32_t* old_keylens,
|
||||
LEAFENTRY* old_les,
|
||||
size_t *le_sizes,
|
||||
size_t mempool_size
|
||||
size_t total_key_size,
|
||||
size_t total_le_size
|
||||
);
|
||||
|
||||
// Make this basement node a clone of orig_bn_data.
|
||||
// orig_bn_data still owns all its memory (dmt, mempool)
|
||||
// this basement node will have a new dmt, mempool containing same data.
|
||||
void clone(bn_data* orig_bn_data);
|
||||
|
||||
// Delete klpair index idx with provided keylen and old leafentry with size old_le_size
|
||||
void delete_leafentry (
|
||||
uint32_t idx,
|
||||
uint32_t keylen,
|
||||
uint32_t old_le_size
|
||||
);
|
||||
void get_space_for_overwrite(uint32_t idx, const void* keyp, uint32_t keylen, uint32_t old_size, uint32_t new_size, LEAFENTRY* new_le_space);
|
||||
void get_space_for_insert(uint32_t idx, const void* keyp, uint32_t keylen, size_t size, LEAFENTRY* new_le_space);
|
||||
private:
|
||||
// Private functions
|
||||
KLPAIR mempool_malloc_from_omt(size_t size, void **maybe_free);
|
||||
void omt_compress_kvspace(size_t added_size, void **maybe_free);
|
||||
|
||||
klpair_omt_t m_buffer; // pointers to individual leaf entries
|
||||
// Allocates space in the mempool to store a new leafentry.
|
||||
        // This may require reorganizing the mempool and updating the dmt.
        __attribute__((__nonnull__))
        void get_space_for_overwrite(uint32_t idx, const void* keyp, uint32_t keylen, uint32_t old_size, uint32_t new_size, LEAFENTRY* new_le_space, void **const maybe_free);

        // Allocates space in the mempool to store a new leafentry
        // and inserts a new key into the dmt.
        // This may require reorganizing the mempool and updating the dmt.
        __attribute__((__nonnull__))
        void get_space_for_insert(uint32_t idx, const void* keyp, uint32_t keylen, size_t size, LEAFENTRY* new_le_space, void **const maybe_free);

        // Gets a leafentry given a klpair from this basement node.
        LEAFENTRY get_le_from_klpair(const klpair_struct *klpair) const;

        void serialize_to_wbuf(struct wbuf *const wb);

        // Prepares this basement node for serialization.
        // Must be called before serializing this basement node.
        // Between calling prepare_to_serialize and actually serializing, the basement node may not be modified.
        void prepare_to_serialize(void);

        // Serialize the basement node header to a wbuf.
        // Requires prepare_to_serialize() to have been called first.
        void serialize_header(struct wbuf *wb) const;

        // Serialize all keys and leafentries to a wbuf.
        // Requires prepare_to_serialize() (and serialize_header()) to have been called first.
        // Currently only supported when all keys are fixed-length.
        void serialize_rest(struct wbuf *wb) const;

        static const uint32_t HEADER_LENGTH = 0
            + sizeof(uint32_t)  // key_data_size
            + sizeof(uint32_t)  // val_data_size
            + sizeof(uint32_t)  // fixed_key_length
            + sizeof(uint8_t)   // all_keys_same_length
            + sizeof(uint8_t)   // keys_vals_separate
            + 0;
    private:

        // split_klpairs_extra should be a local class in split_klpairs, but
        // the dmt template parameter for iterate needs linkage, so it has to be a
        // separate class. We still want it to be able to call e.g. add_key, hence
        // the friend declaration.
        friend class split_klpairs_extra;

        // Allocates space in the mempool.
        // If there is insufficient space, the mempool is enlarged and leafentries may be shuffled to reduce fragmentation.
        // If shuffling happens, the offsets stored in the dmt are updated.
        LEAFENTRY mempool_malloc_and_update_dmt(size_t size, void **maybe_free);

        // Change the size of the mempool to support what is already in it, plus added_size.
        // Possibly "compress" by shuffling leafentries around to reduce fragmentation to 0.
        // If fragmentation is already 0 and force_compress is not true, shuffling may be skipped.
        // If shuffling happens, leafentries will be stored in the mempool in sorted order.
        void dmt_compress_kvspace(size_t added_size, void **maybe_free, bool force_compress);

        // Note that a key was added (for maintaining the disk size of this basement node).
        void add_key(uint32_t keylen);

        // Note that multiple keys were added (for maintaining the disk size of this basement node).
        void add_keys(uint32_t n_keys, uint32_t combined_klpair_len);

        // Note that a key was removed (for maintaining the disk size of this basement node).
        void remove_key(uint32_t keylen);

        klpair_dmt_t m_buffer;            // pointers to individual leaf entries
        struct mempool m_buffer_mempool;  // storage for all leaf entries

        friend class bndata_bugfix_test;

        // Get the serialized size of a klpair.
        // As of Jan 14, 2014, the serialized size of a klpair is independent of whether this basement node has fixed-length keys.
        uint32_t klpair_disksize(const uint32_t klpair_len, const klpair_struct *klpair) const;

        // The disk/memory size of all keys. (The size of memory for the leafentries is maintained by m_buffer_mempool.)
        size_t m_disksize_of_keys;

        // Deserialize this basement node from rbuf.
        // All keys come first, followed by all leafentries (both in sorted order).
        void initialize_from_separate_keys_and_vals(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version,
                                                    uint32_t key_data_size, uint32_t val_data_size, bool all_keys_same_length,
                                                    uint32_t fixed_klpair_length);
    };
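Taken together, prepare_to_serialize/serialize_header/serialize_rest plus HEADER_LENGTH pin down the on-disk basement header. As a reading aid, here is a minimal sketch of what serialize_header presumably emits, field for field; the m_* member names are invented stand-ins, and only the field order and widths come from HEADER_LENGTH above:

    // Sketch only -- not the patch's implementation. Field order and widths
    // follow bn_data::HEADER_LENGTH; the m_* members are hypothetical names.
    void bn_data::serialize_header(struct wbuf *wb) const {
        wbuf_nocrc_uint(wb, m_key_data_size);            // key_data_size (4 bytes)
        wbuf_nocrc_uint(wb, m_val_data_size);            // val_data_size (4 bytes)
        wbuf_nocrc_uint(wb, m_fixed_key_length);         // fixed_key_length (4 bytes)
        wbuf_nocrc_uint8_t(wb, m_all_keys_same_length);  // all_keys_same_length (1 byte)
        wbuf_nocrc_uint8_t(wb, m_keys_vals_separate);    // keys_vals_separate (1 byte)
    }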
297
ft/dmt-wrapper.cc
Normal file

@@ -0,0 +1,297 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/* [Standard Tokutek COPYING CONDITIONS, COPYRIGHT, DISCLAIMER, UNIVERSITY
   PATENT, PATENT MARKING, and PATENT RIGHTS GRANT notice -- the same
   boilerplate header carried by every file in this tree.] */

#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <memory.h>
#include <string.h>
#include <db.h>

#include <util/mempool.h>
#include "dmt-wrapper.h"

int
toku_dmt_create_steal_sorted_array(DMT *dmtp, DMTVALUE **valuesp, uint32_t numvalues, uint32_t capacity) {
    //TODO: implement using create_steal_sorted_array when it exists
    (void) capacity;
    toku_dmt_create_from_sorted_array(dmtp, *valuesp, numvalues);
    toku_free(*valuesp);
    *valuesp = nullptr;

    // DMT XMALLOC(dmt);
    // dmt->create_steal_sorted_array(valuesp, numvalues, capacity);
    // *dmtp = dmt;
    return 0;
}

//TODO: Put all dmt API functions here.
int toku_dmt_create(DMT *dmtp) {
    DMT XMALLOC(dmt);
    dmt->create();
    *dmtp = dmt;
    return 0;
}

void toku_dmt_destroy(DMT *dmtp) {
    DMT dmt = *dmtp;
    dmt->destroy();
    toku_free(dmt);
    *dmtp = NULL;
}

uint32_t toku_dmt_size(DMT V) {
    return V->size();
}

int toku_dmt_create_from_sorted_array(DMT *dmtp, DMTVALUE *values, uint32_t numvalues) {
    //TODO: implement using create_from_sorted_array when it exists
    DMT XMALLOC(dmt);
    dmt->create();
    for (uint32_t i = 0; i < numvalues; i++) {
        toku_dmt_insert_at(dmt, values[i], i);
    }
    //dmt->create_from_sorted_array(values, numvalues);
    *dmtp = dmt;
    return 0;
}

int toku_dmt_insert_at(DMT dmt, DMTVALUE value, uint32_t index) {
    dmt_wrapper_internal::dmtvalue_writer functor(value);
    return dmt->insert_at(functor, index);
}

int toku_dmt_set_at(DMT dmt, DMTVALUE value, uint32_t index) {
    int r = dmt->delete_at(index);
    if (r != 0) return r;
    return toku_dmt_insert_at(dmt, value, index);
}

int toku_dmt_delete_at(DMT dmt, uint32_t index) {
    return dmt->delete_at(index);
}

int toku_dmt_fetch(DMT dmt, uint32_t i, DMTVALUE *v) {
    uint32_t size;
    return dmt->fetch(i, &size, v);
}

struct functor {
    int (*f)(DMTVALUE, uint32_t, void *);
    void *v;
};
static_assert(std::is_pod<functor>::value, "not POD");

int call_functor(const uint32_t size, const DMTVALUE &v, uint32_t idx, functor *const ftor);
int call_functor(const uint32_t size, const DMTVALUE &v, uint32_t idx, functor *const ftor) {
    invariant(size == sizeof(DMTVALUE));
    return ftor->f(const_cast<DMTVALUE>(v), idx, ftor->v);
}

int toku_dmt_iterate(DMT dmt, int (*f)(DMTVALUE, uint32_t, void*), void *v) {
    struct functor ftor = { .f = f, .v = v };
    return dmt->iterate<functor, call_functor>(&ftor);
}

int toku_dmt_iterate_on_range(DMT dmt, uint32_t left, uint32_t right, int (*f)(DMTVALUE, uint32_t, void*), void *v) {
    struct functor ftor = { .f = f, .v = v };
    return dmt->iterate_on_range<functor, call_functor>(left, right, &ftor);
}

struct heftor {
    int (*h)(DMTVALUE, void *v);
    void *v;
};
static_assert(std::is_pod<heftor>::value, "not POD");

int call_heftor(const uint32_t size, const DMTVALUE &v, const heftor &htor);
int call_heftor(const uint32_t size, const DMTVALUE &v, const heftor &htor) {
    invariant(size == sizeof(DMTVALUE));
    return htor.h(const_cast<DMTVALUE>(v), htor.v);
}

int toku_dmt_insert(DMT dmt, DMTVALUE value, int (*h)(DMTVALUE, void *v), void *v, uint32_t *index) {
    struct heftor htor = { .h = h, .v = v };
    dmt_wrapper_internal::dmtvalue_writer functor(value);
    return dmt->insert<heftor, call_heftor>(functor, htor, index);
}

int toku_dmt_find_zero(DMT V, int (*h)(DMTVALUE, void *extra), void *extra, DMTVALUE *value, uint32_t *index) {
    struct heftor htor = { .h = h, .v = extra };
    uint32_t ignore;
    return V->find_zero<heftor, call_heftor>(htor, &ignore, value, index);
}

int toku_dmt_find(DMT V, int (*h)(DMTVALUE, void *extra), void *extra, int direction, DMTVALUE *value, uint32_t *index) {
    struct heftor htor = { .h = h, .v = extra };
    uint32_t ignore;
    return V->find<heftor, call_heftor>(htor, direction, &ignore, value, index);
}

int toku_dmt_split_at(DMT dmt, DMT *newdmtp, uint32_t index) {
    //TODO: use real split_at when it exists
    if (index > dmt->size()) { return EINVAL; }
    DMT XMALLOC(newdmt);
    newdmt->create();
    int r;

    for (uint32_t i = index; i < dmt->size(); i++) {
        DMTVALUE v;
        r = toku_dmt_fetch(dmt, i, &v);
        invariant_zero(r);
        r = toku_dmt_insert_at(newdmt, v, i - index);
        invariant_zero(r);
    }
    if (dmt->size() > 0) {
        for (uint32_t i = dmt->size(); i > index; i--) {
            r = toku_dmt_delete_at(dmt, i - 1);
            invariant_zero(r);
        }
    }
    r = 0;

#if 0
    int r = dmt->split_at(newdmt, index);
#endif
    if (r != 0) {
        toku_free(newdmt);
    } else {
        *newdmtp = newdmt;
    }
    return r;
}

int toku_dmt_merge(DMT leftdmt, DMT rightdmt, DMT *newdmtp) {
    //TODO: use real merge when it exists
    DMT XMALLOC(newdmt);
    newdmt->create();
    int r;
    for (uint32_t i = 0; i < leftdmt->size(); i++) {
        DMTVALUE v;
        r = toku_dmt_fetch(leftdmt, i, &v);
        invariant_zero(r);
        r = toku_dmt_insert_at(newdmt, v, i);
        invariant_zero(r);
    }
    uint32_t offset = leftdmt->size();
    for (uint32_t i = 0; i < rightdmt->size(); i++) {
        DMTVALUE v;
        r = toku_dmt_fetch(rightdmt, i, &v);
        invariant_zero(r);
        r = toku_dmt_insert_at(newdmt, v, i + offset);
        invariant_zero(r);
    }
    leftdmt->destroy();
    rightdmt->destroy();

    // newdmt->merge(leftdmt, rightdmt);

    toku_free(leftdmt);
    toku_free(rightdmt);
    *newdmtp = newdmt;
    return 0;
}

int toku_dmt_clone_noptr(DMT *dest, DMT src) {
    DMT XMALLOC(dmt);
    dmt->clone(*src);
    *dest = dmt;
    return 0;
}

void toku_dmt_clear(DMT dmt) {
    dmt->clear();
}

size_t toku_dmt_memory_size(DMT dmt) {
    return dmt->memory_size();
}
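Since the wrapper exists so the old omt unit tests can drive the dmt unchanged, a quick usage sketch may help; everything below (the function, the pointer-encoded integers, the target value 6) is hypothetical, but it only calls the wrapper entry points defined in this file:

    // Heaviside whose signum is monotonically increasing over the sorted values.
    static int h_cmp(DMTVALUE v, void *extra) {
        intptr_t val = (intptr_t) v, target = (intptr_t) extra;
        return (val < target) ? -1 : (val > target) ? +1 : 0;
    }

    static void example_dmt_wrapper(void) {
        DMT t;
        int r = toku_dmt_create(&t);
        invariant_zero(r);
        for (uint32_t i = 0; i < 10; i++) {
            // append in sorted order: 0, 2, 4, ..., 18
            r = toku_dmt_insert_at(t, (DMTVALUE) (intptr_t) (2 * i), i);
            invariant_zero(r);
        }
        DMTVALUE found;
        uint32_t idx;
        r = toku_dmt_find_zero(t, h_cmp, (void *) (intptr_t) 6, &found, &idx);
        invariant_zero(r);     // 6 is present...
        invariant(idx == 3);   // ...at index 3
        toku_dmt_destroy(&t);  // frees the dmt and NULLs t
    }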
440
ft/dmt-wrapper.h
Normal file

@@ -0,0 +1,440 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#if !defined(TOKU_DMT_WRAPPER_H)
#define TOKU_DMT_WRAPPER_H

#ident "$Id$"
/* [Same standard Tokutek license and patent notice as in ft/dmt-wrapper.cc above.] */

#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// Dynamic Order Maintenance Tree (DMT)
//
// Maintains a collection of totally ordered values, where each value has an integer weight.
// The DMT is a mutable datatype.
//
// The Abstraction:
//
// A DMT is a vector of values, $V$, where $|V|$ is the length of the vector.
// The vector is numbered from $0$ to $|V|-1$.
// Each value has a weight. The weight of the $i$th element is denoted $w(V_i)$.
//
// We can create a new DMT, which is the empty vector.
//
// We can insert a new element $x$ into slot $i$, changing $V$ into $V'$, where
//  $|V'|=1+|V|$ and
//
//   V'_j = V_j       if $j<i$
//          x         if $j=i$
//          V_{j-1}   if $j>i$.
//
// We can specify $i$ using a kind of function instead of as an integer.
// Let $b$ be a function mapping from values to nonzero integers, such that
// the signum of $b$ is monotonically increasing.
// We can specify $i$ as the minimum integer such that $b(V_i)>0$.
//
// We look up a value using its index, or using a Heaviside function.
// For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
// When looking up values, we can look up
//  $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a special return code if no such value exists.)
//      (Rationale: Ordinarily we want $i$ to be unique. But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
//  $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an indication that no such value exists.)
//  $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an indication that no such value exists.)
//
// When looking up a value using a Heaviside function, we get the value and its index.
//
// We can also split a DMT into two DMTs, splitting the weight of the values evenly.
// Find a value $j$ such that the values to the left of $j$ have about the same total weight as the values to the right of $j$.
// The resulting two DMTs contain the values to the left of $j$ and the values to the right of $j$ respectively.
// All of the values from the original DMT go into one of the new DMTs.
// If the weights of the values don't split exactly evenly, then the implementation has the freedom to choose whether
// the new left DMT or the new right DMT is larger.
//
// Performance:
//  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
//  The memory required is O(|V|).
//
// The programming API:

//typedef struct value *DMTVALUE; // A slight improvement over using void*.
#include <util/dmt.h>

typedef void *DMTVALUE;

namespace dmt_wrapper_internal {
class dmtvalue_writer {
    public:
        size_t get_size(void) const {
            return sizeof(DMTVALUE);
        }
        void write_to(DMTVALUE *const dest) const {
            *dest = value;
        }

        dmtvalue_writer(DMTVALUE _value)
            : value(_value) {}
        dmtvalue_writer(const uint32_t size UU(), DMTVALUE *const src)
            : value(*src) {
            paranoid_invariant(size == sizeof(DMTVALUE));
        }
    private:
        const DMTVALUE value;
};
};


typedef toku::dmt<DMTVALUE, DMTVALUE, dmt_wrapper_internal::dmtvalue_writer> *DMT;
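dmtvalue_writer is the fixed-size degenerate case of the dmt's implicit writer interface: get_size, write_to, and a construct-from-stored-value constructor. For contrast, a hypothetical writer for a variable-length value -- the case the dmt actually exists for -- might look like this (bytestring and all its members are invented for illustration):

    // Illustration only; not part of the patch.
    struct bytestring {
        uint32_t len;
        char data[];  // stored inline in the dmt, so values can vary in size
    };

    class bytestring_writer {
       public:
        size_t get_size(void) const {
            return sizeof(bytestring) + src->len;
        }
        void write_to(bytestring *const dest) const {
            dest->len = src->len;
            memcpy(dest->data, src->data, src->len);
        }
        bytestring_writer(const bytestring *_src) : src(_src) {}
        bytestring_writer(const uint32_t size, bytestring *const s) : src(s) {
            paranoid_invariant(size == sizeof(*s) + s->len);
        }
       private:
        const bytestring *src;
    };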
int toku_dmt_create(DMT *dmtp);
// Effect: Create an empty DMT. Stores it in *dmtp.
// Requires: dmtp != NULL
// Returns:
//    0        success
//    ENOMEM   out of memory (and doesn't modify *dmtp)
// Performance: constant time.

int toku_dmt_create_from_sorted_array(DMT *dmtp, DMTVALUE *values, uint32_t numvalues);
// Effect: Create a DMT containing values. The number of values is in numvalues.
//    Stores the new DMT in *dmtp.
// Requires: dmtp != NULL
// Requires: values != NULL
// Requires: values is sorted
// Returns:
//    0        success
//    ENOMEM   out of memory (and doesn't modify *dmtp)
// Performance: time=O(numvalues)
// Rationale: Normally to insert N values takes O(N lg N) amortized time.
//    If the N values are known in advance, are sorted, and
//    the structure is empty, we can batch-insert them much faster.

int toku_dmt_create_steal_sorted_array(DMT *dmtp, DMTVALUE **valuesp, uint32_t numvalues, uint32_t steal_capacity);
// Effect: Create a DMT containing values. The number of values is in numvalues.
//    On success the DMT takes ownership of the *valuesp array, and sets valuesp=NULL.
// Requires: dmtp != NULL
// Requires: valuesp != NULL
// Requires: *valuesp is sorted
// Requires: *valuesp was allocated with toku_malloc
// Requires: Capacity of the *valuesp array is <= steal_capacity
// Requires: On success, *valuesp may not be accessed again by the caller.
// Returns:
//    0        success
//    ENOMEM   out of memory (and doesn't modify *dmtp)
//    EINVAL   *valuesp == NULL or numvalues > capacity
// Performance: time=O(1)
// Rationale: toku_dmt_create_from_sorted_array takes O(numvalues) time.
//    By taking ownership of the array, we save a malloc and a memcpy,
//    and possibly a free (if the caller is done with the array).
void toku_dmt_destroy(DMT *dmtp);
// Effect: Destroy a DMT, freeing all its memory.
//    Does not free the DMTVALUEs stored in the DMT.
//    Those values may be freed before or after calling toku_dmt_destroy.
//    Also sets *dmtp=NULL.
// Requires: dmtp != NULL
// Requires: *dmtp != NULL
// Rationale: The usage is to do something like
//        toku_dmt_destroy(&s->dmt);
//    and now s->dmt will have a NULL pointer instead of a dangling freed pointer.
// Rationale: Returns no values since free() cannot fail.
// Rationale: Does not free the DMTVALUEs to reduce complexity.
// Performance: time=O(toku_dmt_size(*dmtp))

uint32_t toku_dmt_size(DMT V);
// Effect: return |V|.
// Requires: V != NULL
// Performance: time=O(1)

int toku_dmt_iterate_on_range(DMT dmt, uint32_t left, uint32_t right, int (*f)(DMTVALUE, uint32_t, void*), void *v);
// Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
//    The second argument passed to f is the index of the value.
//    The third argument passed to f is v.
//    The indices run from 0 (inclusive) to toku_dmt_size(dmt) (exclusive).
//    We iterate only over [left,right).
//
// Requires: dmt != NULL
// Requires: left <= right
// Requires: f != NULL
// Returns:
//    If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_dmt_iterate_on_range.
//    If f always returns zero, then toku_dmt_iterate_on_range returns 0.
// Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.)
// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
// Rationale: Although the functional iterator requires defining another function (as opposed to a C++-style iterator), it is much easier to read.

int toku_dmt_iterate(DMT dmt, int (*f)(DMTVALUE, uint32_t, void*), void *v);
// Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
//    The second argument passed to f is the index of the value.
//    The third argument passed to f is v.
//    The indices run from 0 (inclusive) to toku_dmt_size(dmt) (exclusive).
// Requires: dmt != NULL
// Requires: f != NULL
// Returns:
//    If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_dmt_iterate.
//    If f always returns zero, then toku_dmt_iterate returns 0.
// Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.)
// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
// Rationale: Although the functional iterator requires defining another function (as opposed to a C++-style iterator), it is much easier to read.
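A small sketch of the iterator contract just described -- the callback's nonzero return is how a caller stops early (sum_cb and the pointer-encoded integers are hypothetical):

    static int sum_cb(DMTVALUE v, uint32_t idx, void *extra) {
        if (idx >= 100) {
            return -1;                            // nonzero stops the iteration and is returned
        }
        *(intptr_t *) extra += (intptr_t) v;      // values stored directly as pointer-sized ints
        return 0;
    }

    // Given a DMT t:
    //     intptr_t total = 0;
    //     int r = toku_dmt_iterate(t, sum_cb, &total);  // r is 0, or -1 if stopped early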
int toku_dmt_insert_at(DMT dmt, DMTVALUE value, uint32_t idx);
// Effect: Increases indexes of all items at slot >= idx by 1.
//    Inserts value into the position at idx.
//
// Returns:
//    0        success
//    EINVAL   if idx > toku_dmt_size(dmt)
//    ENOMEM
// On error, dmt is unchanged.
// Performance: time=O(\log N) amortized time.
// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.

int toku_dmt_set_at(DMT dmt, DMTVALUE value, uint32_t idx);
// Effect: Replaces the item at idx with value.
// Returns:
//    0        success
//    EINVAL   if idx >= toku_dmt_size(dmt)
// On error, dmt is unchanged.
// Performance: time=O(\log N)
// Rationale: The BRT needs to be able to replace a value with another copy of the same value (allocated in a different location).

int toku_dmt_insert(DMT dmt, DMTVALUE value, int (*h)(DMTVALUE, void *v), void *v, uint32_t *idx);
// Effect: Insert value into the DMT.
//    If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
//    Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
//    If no such i exists, then let i be |V|.
//    Then this has the same effect as
//        toku_dmt_insert_at(dmt, value, i);
//    If idx != NULL then i is stored in *idx.
// Requires: The signum of h must be monotonically increasing.
// Returns:
//    0            success
//    DB_KEYEXIST  the key is present (h was equal to zero for some value)
//    ENOMEM
// On nonzero return, dmt is unchanged.
// On nonzero non-DB_KEYEXIST return, *idx is unchanged.
// Performance: time=O(\log N) amortized.
// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.

int toku_dmt_delete_at(DMT dmt, uint32_t idx);
// Effect: Delete the item in slot idx.
//    Decreases indexes of all items at slot >= idx by 1.
// Returns
//    0        success
//    EINVAL   if idx >= toku_dmt_size(dmt)
// On error, dmt is unchanged.
// Rationale: To delete an item, first find its index using toku_dmt_find, then delete it.
// Performance: time=O(\log N) amortized.
int toku_dmt_fetch(DMT V, uint32_t i, DMTVALUE *v);
// Effect: Set *v = V_i.
// Requires: v != NULL
// Returns
//    0        success
//    EINVAL   if i >= toku_dmt_size(V)
// On nonzero return, *v is unchanged.
// Performance: time=O(\log N)

int toku_dmt_find_zero(DMT V, int (*h)(DMTVALUE, void *extra), void *extra, DMTVALUE *value, uint32_t *idx);
// Effect: Find the smallest i such that h(V_i, extra)>=0.
//    If there is such an i and h(V_i,extra)==0 then set *idx=i and return 0.
//    If there is such an i and h(V_i,extra)>0 then set *idx=i and return DB_NOTFOUND.
//    If there is no such i then set *idx=toku_dmt_size(V) and return DB_NOTFOUND.
// Requires: idx != NULL

int toku_dmt_find(DMT V, int (*h)(DMTVALUE, void *extra), void *extra, int direction, DMTVALUE *value, uint32_t *idx);
// Effect:
//    If direction > 0 then find the smallest i such that h(V_i,extra)>0.
//    If direction < 0 then find the largest i such that h(V_i,extra)<0.
//    (direction may not be equal to zero.)
//    If value != NULL then store V_i in *value.
//    If idx != NULL then store i in *idx.
// Requires: The signum of h is monotonically increasing.
// Returns
//    0            success
//    DB_NOTFOUND  no such value is found.
// On nonzero return, *value and *idx are unchanged.
// Performance: time=O(\log N)
// Rationale:
//    Here's how to use the find function to find various things:
//      find first value:          ( h(v)=+1,               direction=+1 )
//      find last value:           ( h(v)=-1,               direction=-1 )
//      find first X:              ( h(v)=(v< x) ? -1 : 1,  direction=+1 )
//      find last X:               ( h(v)=(v<=x) ? -1 : 1,  direction=-1 )
//      find X or successor to X:  ( same as find first X )
//
// Rationale: To help understand Heaviside functions and the behavior of find:
//    There are 7 kinds of Heaviside functions.
//    The signum of h must be monotonically increasing.
//    Given a function of the following form, A is the element
//    returned for direction>0, B is the element returned
//    for direction<0, C is the element returned for
//    direction==0 (see find_zero) with a return of 0, and D is the element
//    returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
//    If any of A, B, or C is not found, then asking for the
//    associated direction will return DB_NOTFOUND.
//    See find_zero for more information.
//
//    Let the following represent the signum of the Heaviside function:
//
//    -...-
//        A
//         D
//
//    +...+
//    B
//    D
//
//    0...0
//    C
//
//    -...-0...0
//        AC
//
//    0...0+...+
//    C    B
//
//    -...-+...+
//        AB
//         D
//
//    -...-0...0+...+
//        AC    B
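To make the table concrete, a hypothetical predecessor query -- "find the largest value strictly less than x", a close cousin of the "find last X" row (all names invented):

    // Signum over the sorted values looks like -...-+...+, so direction=-1
    // returns the largest value for which h is negative.
    static int h_lt(DMTVALUE v, void *extra) {
        return ((intptr_t) v < *(intptr_t *) extra) ? -1 : +1;
    }

    // Given a DMT t of sorted pointer-encoded integers:
    //     intptr_t x = 7;
    //     DMTVALUE pred;
    //     uint32_t idx;
    //     int r = toku_dmt_find(t, h_lt, &x, -1, &pred, &idx);
    //     // r == 0: pred is the largest value < 7; r == DB_NOTFOUND: none exists.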
int toku_dmt_split_at(DMT dmt, DMT *newdmt, uint32_t idx);
// Effect: Create a new DMT, storing it in *newdmt.
//    The values to the right of idx (starting at idx) are moved to *newdmt.
// Requires: dmt != NULL
// Requires: newdmt != NULL
// Returns
//    0        success,
//    EINVAL   if idx > toku_dmt_size(dmt)
//    ENOMEM
// On nonzero return, dmt and *newdmt are unmodified.
// Performance: time=O(n)
// Rationale: We don't need a split-evenly operation. We need to split items so that their total sizes
//    are even, and other similar splitting criteria. It's easy to split evenly by calling toku_dmt_size() and dividing by two.

int toku_dmt_merge(DMT leftdmt, DMT rightdmt, DMT *newdmt);
// Effect: Appends leftdmt and rightdmt to produce a new dmt.
//    Sets *newdmt to the new dmt.
//    On success, leftdmt and rightdmt are destroyed.
// Returns 0 on success
//    ENOMEM on out of memory.
// On error, nothing is modified.
// Performance: time=O(n) is acceptable, but one can imagine implementations that are O(\log n) worst-case.

int toku_dmt_clone_noptr(DMT *dest, DMT src);
// Effect: Creates a copy of a dmt.
//    Sets *dest to the clone.
//    Each element is assumed to be stored directly in the dmt; that is, the DMTVALUEs are not pointers, they are data. Thus no extra memory allocation is required.
// Returns 0 on success
//    ENOMEM on out of memory.
// On error, nothing is modified.
// Performance: time between O(n) and O(n log n), depending how long it
//    takes to traverse src.

void toku_dmt_clear(DMT dmt);
// Effect: Set the tree to be empty.
// Note: Will not reallocate or resize any memory, since returning void precludes calling malloc.
// Performance: time=O(1)

size_t toku_dmt_memory_size(DMT dmt);
// Effect: Return the size (in bytes) of the dmt, as it resides in main memory. This does not include any of the DMTVALUEs.

#endif  /* #ifndef TOKU_DMT_WRAPPER_H */
@@ -689,16 +689,16 @@ ftleaf_get_split_loc(
     switch (split_mode) {
     case SPLIT_LEFT_HEAVY: {
         *num_left_bns = node->n_children;
-        *num_left_les = BLB_DATA(node, *num_left_bns - 1)->omt_size();
+        *num_left_les = BLB_DATA(node, *num_left_bns - 1)->num_klpairs();
         if (*num_left_les == 0) {
             *num_left_bns = node->n_children - 1;
-            *num_left_les = BLB_DATA(node, *num_left_bns - 1)->omt_size();
+            *num_left_les = BLB_DATA(node, *num_left_bns - 1)->num_klpairs();
         }
         goto exit;
     }
     case SPLIT_RIGHT_HEAVY: {
         *num_left_bns = 1;
-        *num_left_les = BLB_DATA(node, 0)->omt_size() ? 1 : 0;
+        *num_left_les = BLB_DATA(node, 0)->num_klpairs() ? 1 : 0;
         goto exit;
     }
     case SPLIT_EVENLY: {

@@ -707,8 +707,8 @@ ftleaf_get_split_loc(
         uint64_t sumlesizes = ftleaf_disk_size(node);
         uint32_t size_so_far = 0;
         for (int i = 0; i < node->n_children; i++) {
-            BN_DATA bd = BLB_DATA(node, i);
-            uint32_t n_leafentries = bd->omt_size();
+            bn_data* bd = BLB_DATA(node, i);
+            uint32_t n_leafentries = bd->num_klpairs();
             for (uint32_t j=0; j < n_leafentries; j++) {
                 size_t size_this_le;
                 int rr = bd->fetch_klpair_disksize(j, &size_this_le);

@@ -725,7 +725,7 @@ ftleaf_get_split_loc(
                 (*num_left_les)--;
             } else if (*num_left_bns > 1) {
                 (*num_left_bns)--;
-                *num_left_les = BLB_DATA(node, *num_left_bns - 1)->omt_size();
+                *num_left_les = BLB_DATA(node, *num_left_bns - 1)->num_klpairs();
             } else {
                 // we are trying to split a leaf with only one
                 // leafentry in it

@@ -754,7 +754,8 @@ move_leafentries(
 )
 //Effect: move leafentries in the range [lbi, upe) from src_omt to newly created dest_omt
 {
-    src_bn->data_buffer.move_leafentries_to(&dest_bn->data_buffer, lbi, ube);
+    invariant(ube == src_bn->data_buffer.num_klpairs());
+    src_bn->data_buffer.split_klpairs(&dest_bn->data_buffer, lbi);
 }

 static void ftnode_finalize_split(FTNODE node, FTNODE B, MSN max_msn_applied_to_node) {

@@ -851,7 +852,7 @@ ftleaf_split(
     ftleaf_get_split_loc(node, split_mode, &num_left_bns, &num_left_les);
     {
         // did we split right on the boundary between basement nodes?
-        const bool split_on_boundary = (num_left_les == 0) || (num_left_les == (int) BLB_DATA(node, num_left_bns - 1)->omt_size());
+        const bool split_on_boundary = (num_left_les == 0) || (num_left_les == (int) BLB_DATA(node, num_left_bns - 1)->num_klpairs());
         // Now we know where we are going to break it
         // the two nodes will have a total of n_children+1 basement nodes
         // and n_children-1 pivots

@@ -912,7 +913,7 @@ ftleaf_split(
             move_leafentries(BLB(B, curr_dest_bn_index),
                              BLB(node, curr_src_bn_index),
                              num_left_les, // first row to be moved to B
-                             BLB_DATA(node, curr_src_bn_index)->omt_size() // number of rows in basement to be split
+                             BLB_DATA(node, curr_src_bn_index)->num_klpairs() // number of rows in basement to be split
                              );
             BLB_MAX_MSN_APPLIED(B, curr_dest_bn_index) = BLB_MAX_MSN_APPLIED(node, curr_src_bn_index);
             curr_dest_bn_index++;

@@ -954,10 +955,10 @@ ftleaf_split(
             toku_destroy_dbt(&node->childkeys[num_left_bns - 1]);
         }
     } else if (splitk) {
-        BN_DATA bd = BLB_DATA(node, num_left_bns - 1);
+        bn_data* bd = BLB_DATA(node, num_left_bns - 1);
         uint32_t keylen;
         void *key;
-        int rr = bd->fetch_le_key_and_len(bd->omt_size() - 1, &keylen, &key);
+        int rr = bd->fetch_key_and_len(bd->num_klpairs() - 1, &keylen, &key);
         invariant_zero(rr);
         toku_memdup_dbt(splitk, key, keylen);
     }

@@ -1168,11 +1169,11 @@ merge_leaf_nodes(FTNODE a, FTNODE b)
     a->dirty = 1;
     b->dirty = 1;

-    BN_DATA a_last_bd = BLB_DATA(a, a->n_children-1);
+    bn_data* a_last_bd = BLB_DATA(a, a->n_children-1);
     // this bool states if the last basement node in a has any items or not
     // If it does, then it stays in the merge. If it does not, the last basement node
     // of a gets eliminated because we do not have a pivot to store for it (because it has no elements)
-    const bool a_has_tail = a_last_bd->omt_size() > 0;
+    const bool a_has_tail = a_last_bd->num_klpairs() > 0;

     // move each basement node from b to a
     // move the pivots, adding one of what used to be max(a)

@@ -1199,7 +1200,7 @@ merge_leaf_nodes(FTNODE a, FTNODE b)
     if (a_has_tail) {
         uint32_t keylen;
         void *key;
-        int rr = a_last_bd->fetch_le_key_and_len(a_last_bd->omt_size() - 1, &keylen, &key);
+        int rr = a_last_bd->fetch_key_and_len(a_last_bd->num_klpairs() - 1, &keylen, &key);
         invariant_zero(rr);
         toku_memdup_dbt(&a->childkeys[a->n_children-1], key, keylen);
         a->totalchildkeylens += keylen;
@@ -1178,6 +1178,8 @@ typedef enum {
     FT_PRO_NUM_STOP_LOCK_CHILD,
     FT_PRO_NUM_STOP_CHILD_INMEM,
     FT_PRO_NUM_DIDNT_WANT_PROMOTE,
+    FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize
+    FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize
     FT_STATUS_NUM_ROWS
 } ft_status_entry;

78
ft/ft-ops.cc
@@ -363,6 +363,8 @@ status_init(void)
     STATUS_INIT(FT_PRO_NUM_STOP_LOCK_CHILD, PROMOTION_STOPPED_CHILD_LOCKED_OR_NOT_IN_MEMORY, PARCOUNT, "promotion: stopped because the child was locked or not at all in memory", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
     STATUS_INIT(FT_PRO_NUM_STOP_CHILD_INMEM, PROMOTION_STOPPED_CHILD_NOT_FULLY_IN_MEMORY, PARCOUNT, "promotion: stopped because the child was not fully in memory", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
     STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
+    STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
+    STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);

     ft_status.initialized = true;
 }

@@ -389,6 +391,14 @@ toku_ft_get_status(FT_STATUS s) {
     } \
 } while (0)

+void toku_note_deserialized_basement_node(bool fixed_key_size) {
+    if (fixed_key_size) {
+        STATUS_INC(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, 1);
+    } else {
+        STATUS_INC(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, 1);
+    }
+}
+
 bool is_entire_node_in_memory(FTNODE node) {
     for (int i = 0; i < node->n_children; i++) {
         if(BP_STATE(node,i) != PT_AVAIL) {
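The matching increment site is not shown in this excerpt; presumably the basement-node deserializer calls the new hook once per basement, along these (hypothetical) lines -- only toku_note_deserialized_basement_node() itself is real here:

    // Hypothetical call site in the basement deserializer:
    static void after_basement_header_decoded(bool all_keys_same_length) {
        toku_note_deserialized_basement_node(all_keys_same_length);  // fixed vs. variable keysize counter
    }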
@@ -409,7 +419,7 @@ get_leaf_num_entries(FTNODE node) {
     int i;
     toku_assert_entire_node_in_memory(node);
     for ( i = 0; i < node->n_children; i++) {
-        result += BLB_DATA(node, i)->omt_size();
+        result += BLB_DATA(node, i)->num_klpairs();
     }
     return result;
 }

@@ -595,6 +605,7 @@ ftnode_memory_size (FTNODE node)
     int n_children = node->n_children;
     retval += sizeof(*node);
     retval += (n_children)*(sizeof(node->bp[0]));
+    retval += (n_children > 0 ? n_children-1 : 0)*(sizeof(node->childkeys[0]));
     retval += node->totalchildkeylens;

     // now calculate the sizes of the partitions

@@ -1708,8 +1719,8 @@ toku_ft_bn_apply_cmd_once (
     if (le) {
         oldsize = leafentry_memsize(le) + key_storage_size;
     }

-    // toku_le_apply_msg() may call mempool_malloc_from_omt() to allocate more space.
+    // toku_le_apply_msg() may call bn_data::mempool_malloc_and_update_dmt() to allocate more space.
     // That means le is guaranteed to not cause a sigsegv but it may point to a mempool that is
     // no longer in use.  We'll have to release the old mempool later.
     toku_le_apply_msg(

@@ -1722,6 +1733,8 @@ toku_ft_bn_apply_cmd_once (
         &new_le,
         &numbytes_delta
         );
+    // at this point, we cannot trust cmd->u.id.key to be valid.
+    // The dmt may have realloced its mempool and freed the one containing key.

     newsize = new_le ? (leafentry_memsize(new_le) + + key_storage_size) : 0;
     if (le && new_le) {

@@ -1897,7 +1910,7 @@ toku_ft_bn_apply_cmd (
     void* key = NULL;
     uint32_t keylen = 0;

-    uint32_t omt_size;
+    uint32_t num_klpairs;
     int r;
     struct cmd_leafval_heaviside_extra be = {compare_fun, desc, cmd->u.id.key};

@@ -1909,9 +1922,9 @@ toku_ft_bn_apply_cmd (
     case FT_INSERT: {
         uint32_t idx;
         if (doing_seqinsert) {
-            idx = bn->data_buffer.omt_size();
+            idx = bn->data_buffer.num_klpairs();
             DBT kdbt;
-            r = bn->data_buffer.fetch_le_key_and_len(idx-1, &kdbt.size, &kdbt.data);
+            r = bn->data_buffer.fetch_key_and_len(idx-1, &kdbt.size, &kdbt.data);
             if (r != 0) goto fz;
             int cmp = toku_cmd_leafval_heaviside(kdbt, be);
             if (cmp >= 0) goto fz;

@@ -1937,7 +1950,7 @@ toku_ft_bn_apply_cmd (
         // the leaf then it is sequential
         // window = min(32, number of leaf entries/16)
         {
-            uint32_t s = bn->data_buffer.omt_size();
+            uint32_t s = bn->data_buffer.num_klpairs();
             uint32_t w = s / 16;
             if (w == 0) w = 1;
             if (w > 32) w = 32;

@@ -1972,8 +1985,8 @@ toku_ft_bn_apply_cmd (
     case FT_COMMIT_BROADCAST_ALL:
     case FT_OPTIMIZE:
         // Apply to all leafentries
-        omt_size = bn->data_buffer.omt_size();
-        for (uint32_t idx = 0; idx < omt_size; ) {
+        num_klpairs = bn->data_buffer.num_klpairs();
+        for (uint32_t idx = 0; idx < num_klpairs; ) {
             DBT curr_keydbt;
             void* curr_keyp = NULL;
             uint32_t curr_keylen = 0;

@@ -1986,26 +1999,27 @@ toku_ft_bn_apply_cmd (
             int deleted = 0;
             if (!le_is_clean(storeddata)) { //If already clean, nothing to do.
                 toku_ft_bn_apply_cmd_once(bn, cmd, idx, storeddata, oldest_referenced_xid_known, gc_info, workdone, stats_to_update);
-                uint32_t new_omt_size = bn->data_buffer.omt_size();
-                if (new_omt_size != omt_size) {
-                    paranoid_invariant(new_omt_size+1 == omt_size);
+                // at this point, we cannot trust cmd->u.id.key to be valid.
+                uint32_t new_dmt_size = bn->data_buffer.num_klpairs();
+                if (new_dmt_size != num_klpairs) {
+                    paranoid_invariant(new_dmt_size+1 == num_klpairs);
                     //Item was deleted.
                     deleted = 1;
                 }
             }
             if (deleted)
-                omt_size--;
+                num_klpairs--;
             else
                 idx++;
         }
-        paranoid_invariant(bn->data_buffer.omt_size() == omt_size);
+        paranoid_invariant(bn->data_buffer.num_klpairs() == num_klpairs);

         break;
     case FT_COMMIT_BROADCAST_TXN:
     case FT_ABORT_BROADCAST_TXN:
         // Apply to all leafentries if txn is represented
-        omt_size = bn->data_buffer.omt_size();
-        for (uint32_t idx = 0; idx < omt_size; ) {
+        num_klpairs = bn->data_buffer.num_klpairs();
+        for (uint32_t idx = 0; idx < num_klpairs; ) {
             DBT curr_keydbt;
             void* curr_keyp = NULL;
             uint32_t curr_keylen = 0;

@@ -2018,19 +2032,19 @@ toku_ft_bn_apply_cmd (
             int deleted = 0;
             if (le_has_xids(storeddata, cmd->xids)) {
                 toku_ft_bn_apply_cmd_once(bn, cmd, idx, storeddata, oldest_referenced_xid_known, gc_info, workdone, stats_to_update);
-                uint32_t new_omt_size = bn->data_buffer.omt_size();
-                if (new_omt_size != omt_size) {
-                    paranoid_invariant(new_omt_size+1 == omt_size);
+                uint32_t new_dmt_size = bn->data_buffer.num_klpairs();
+                if (new_dmt_size != num_klpairs) {
+                    paranoid_invariant(new_dmt_size+1 == num_klpairs);
                     //Item was deleted.
                     deleted = 1;
                 }
             }
             if (deleted)
-                omt_size--;
+                num_klpairs--;
             else
                 idx++;
         }
-        paranoid_invariant(bn->data_buffer.omt_size() == omt_size);
+        paranoid_invariant(bn->data_buffer.num_klpairs() == num_klpairs);

         break;
     case FT_UPDATE: {
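Both broadcast loops above share a subtle shape: the cached pair count and the index advance in lockstep with deletions, because applying a message can delete the current pair and shift its successors down. Restated generically (Container, should_delete, and delete_at are illustrative names, not the patch's API):

    template <typename Container, typename Pred>
    static void delete_matching(Container &buf, Pred should_delete) {
        uint32_t n = buf.num_klpairs();
        for (uint32_t idx = 0; idx < n; ) {
            if (should_delete(buf, idx)) {
                buf.delete_at(idx);  // pair idx removed; idx now names its successor
                n--;                 // keep the cached count in sync
            } else {
                idx++;               // advance only when nothing was removed
            }
        }
        invariant(buf.num_klpairs() == n);
    }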
@@ -2059,7 +2073,7 @@ toku_ft_bn_apply_cmd (
             // apply to all leafentries.
             uint32_t idx = 0;
             uint32_t num_leafentries_before;
-            while (idx < (num_leafentries_before = bn->data_buffer.omt_size())) {
+            while (idx < (num_leafentries_before = bn->data_buffer.num_klpairs())) {
                 void* curr_key = nullptr;
                 uint32_t curr_keylen = 0;
                 r = bn->data_buffer.fetch_klpair(idx, &storeddata, &curr_keylen, &curr_key);

@@ -2077,7 +2091,7 @@ toku_ft_bn_apply_cmd (
                 r = do_update(update_fun, desc, bn, cmd, idx, storeddata, curr_key, curr_keylen, oldest_referenced_xid_known, gc_info, workdone, stats_to_update);
                 assert_zero(r);

-                if (num_leafentries_before == bn->data_buffer.omt_size()) {
+                if (num_leafentries_before == bn->data_buffer.num_klpairs()) {
                     // we didn't delete something, so increment the index.
                     idx++;
                 }

@@ -2390,7 +2404,7 @@ basement_node_gc_all_les(BASEMENTNODE bn,
     int r = 0;
     uint32_t index = 0;
     uint32_t num_leafentries_before;
-    while (index < (num_leafentries_before = bn->data_buffer.omt_size())) {
+    while (index < (num_leafentries_before = bn->data_buffer.num_klpairs())) {
         void* keyp = NULL;
         uint32_t keylen = 0;
         LEAFENTRY leaf_entry;

@@ -2409,7 +2423,7 @@ basement_node_gc_all_les(BASEMENTNODE bn,
             delta
             );
         // Check if the leaf entry was deleted or not.
-        if (num_leafentries_before == bn->data_buffer.omt_size()) {
+        if (num_leafentries_before == bn->data_buffer.num_klpairs()) {
             ++index;
         }
     }

@@ -4915,7 +4929,7 @@ ok: ;
     switch (search->direction) {
     case FT_SEARCH_LEFT:
         idx++;
-        if (idx >= bn->data_buffer.omt_size()) {
+        if (idx >= bn->data_buffer.num_klpairs()) {
             if (ftcursor->interrupt_cb && ftcursor->interrupt_cb(ftcursor->interrupt_cb_extra)) {
                 return TOKUDB_INTERRUPTED;
             }

@@ -5590,7 +5604,7 @@ ft_cursor_shortcut (
     int r = 0;
     // if we are searching towards the end, limit is last element
     // if we are searching towards the beginning, limit is the first element
-    uint32_t limit = (direction > 0) ? (bd->omt_size() - 1) : 0;
+    uint32_t limit = (direction > 0) ? (bd->num_klpairs() - 1) : 0;

     //Starting with the prev, find the first real (non-provdel) leafentry.
     while (index != limit) {

@@ -5881,7 +5895,7 @@ keysrange_in_leaf_partition (FT_HANDLE brt, FTNODE node,
     *less = idx_left;
     *equal_left = (r==0) ? 1 : 0;

-    uint32_t size = bn->data_buffer.omt_size();
+    uint32_t size = bn->data_buffer.num_klpairs();
     uint32_t idx_right = size;
     r = -1;
     if (single_basement && key_right) {

@@ -6141,7 +6155,7 @@ static int get_key_after_bytes_in_basementnode(FT ft, BASEMENTNODE bn, const DBT
         assert(r == 0 || r == DB_NOTFOUND);
     }
     struct get_key_after_bytes_iterate_extra iter_extra = {skip_len, skipped, callback, cb_extra};
-    r = bn->data_buffer.omt_iterate_on_range<get_key_after_bytes_iterate_extra, get_key_after_bytes_iterate>(idx_left, bn->data_buffer.omt_size(), &iter_extra);
+    r = bn->data_buffer.iterate_on_range<get_key_after_bytes_iterate_extra, get_key_after_bytes_iterate>(idx_left, bn->data_buffer.num_klpairs(), &iter_extra);

     // Invert the sense of r == 0 (meaning the iterate finished, which means we didn't find what we wanted)
     if (r == 1) {

@@ -6337,7 +6351,7 @@ toku_dump_ftnode (FILE *file, FT_HANDLE brt, BLOCKNUM blocknum, int depth, const
             });
         }
         else {
-            int size = BLB_DATA(node, i)->omt_size();
+            int size = BLB_DATA(node, i)->num_klpairs();
             if (0)
             for (int j=0; j<size; j++) {
                 LEAFENTRY le;

@@ -6517,9 +6531,9 @@ static bool is_empty_fast_iter (FT_HANDLE brt, FTNODE node) {
         }
         return 1;
     } else {
-        // leaf:  If the omt is empty, we are happy.
+        // leaf:  If the dmt is empty, we are happy.
         for (int i = 0; i < node->n_children; i++) {
-            if (BLB_DATA(node, i)->omt_size()) {
+            if (BLB_DATA(node, i)->num_klpairs()) {
                 return false;
             }
         }

@@ -351,6 +351,8 @@ int toku_ft_strerror_r(int error, char *buf, size_t buflen);

 extern bool garbage_collection_debug;

+void toku_note_deserialized_basement_node(bool fixed_key_size);
+
 // This is a poor place to put global options like these.
 void toku_ft_set_direct_io(bool direct_io_on);
 void toku_ft_set_compress_buffers_before_eviction(bool compress_buffers);

@@ -462,6 +462,7 @@ serialize_ft_min_size (uint32_t version) {
     size_t size = 0;

     switch(version) {
+    case FT_LAYOUT_VERSION_26:
     case FT_LAYOUT_VERSION_25:
     case FT_LAYOUT_VERSION_24:
     case FT_LAYOUT_VERSION_23:
@@ -152,7 +152,7 @@ verify_msg_in_child_buffer(FT_HANDLE brt, enum ft_msg_type type, MSN msn, byteve
 static DBT
 get_ith_key_dbt (BASEMENTNODE bn, int i) {
     DBT kdbt;
-    int r = bn->data_buffer.fetch_le_key_and_len(i, &kdbt.size, &kdbt.data);
+    int r = bn->data_buffer.fetch_key_and_len(i, &kdbt.size, &kdbt.data);
     invariant_zero(r); // this is a bad failure if it happens.
     return kdbt;
 }

@@ -424,7 +424,7 @@ toku_verify_ftnode_internal(FT_HANDLE brt,
         }
         else {
             BASEMENTNODE bn = BLB(node, i);
-            for (uint32_t j = 0; j < bn->data_buffer.omt_size(); j++) {
+            for (uint32_t j = 0; j < bn->data_buffer.num_klpairs(); j++) {
                 VERIFY_ASSERTION((rootmsn.msn >= this_msn.msn), 0, "leaf may have latest msn, but cannot be greater than root msn");
                 DBT kdbt = get_ith_key_dbt(bn, j);
                 if (curr_less_pivot) {
4
ft/ft.cc

@@ -1077,8 +1077,8 @@ garbage_helper(BLOCKNUM blocknum, int64_t UU(size), int64_t UU(address), void *e
         goto exit;
     }
     for (int i = 0; i < node->n_children; ++i) {
-        BN_DATA bd = BLB_DATA(node, i);
-        r = bd->omt_iterate<struct garbage_helper_extra, garbage_leafentry_helper>(info);
+        bn_data* bd = BLB_DATA(node, i);
+        r = bd->iterate<struct garbage_helper_extra, garbage_leafentry_helper>(info);
         if (r != 0) {
             goto exit;
         }
@@ -119,6 +119,7 @@ enum ft_layout_version_e {
     FT_LAYOUT_VERSION_23 = 23,  // Ming: Fix upgrade path #5902
     FT_LAYOUT_VERSION_24 = 24,  // Riddler: change logentries that log transactions to store TXNID_PAIRs instead of TXNIDs
     FT_LAYOUT_VERSION_25 = 25,  // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes.  same for xstillopen log entry
+    FT_LAYOUT_VERSION_26 = 26,  // Hojo: basements store key/vals separately on disk for fixed klpair length BNs
     FT_NEXT_VERSION,  // the version after the current version
     FT_LAYOUT_VERSION = FT_NEXT_VERSION-1,  // A hack so I don't have to change this line.
     FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13,  // Minimum version supported
@ -284,32 +284,7 @@ serialize_node_header(FTNODE node, FTNODE_DISK_DATA ndd, struct wbuf *wbuf) {
|
|||
invariant(wbuf->ndone == wbuf->size);
|
||||
}
|
||||
|
||||
static int
|
||||
wbufwriteleafentry(const void* key, const uint32_t keylen, const LEAFENTRY &le, const uint32_t UU(idx), struct wbuf * const wb) {
|
||||
// need to pack the leafentry as it was in versions
|
||||
// where the key was integrated into it
|
||||
uint32_t begin_spot UU() = wb->ndone;
|
||||
uint32_t le_disk_size = leafentry_disksize(le);
|
||||
wbuf_nocrc_uint8_t(wb, le->type);
|
||||
wbuf_nocrc_uint32_t(wb, keylen);
|
||||
if (le->type == LE_CLEAN) {
|
||||
wbuf_nocrc_uint32_t(wb, le->u.clean.vallen);
|
||||
wbuf_nocrc_literal_bytes(wb, key, keylen);
|
||||
wbuf_nocrc_literal_bytes(wb, le->u.clean.val, le->u.clean.vallen);
|
||||
}
|
||||
else {
|
||||
paranoid_invariant(le->type == LE_MVCC);
|
||||
wbuf_nocrc_uint32_t(wb, le->u.mvcc.num_cxrs);
|
||||
wbuf_nocrc_uint8_t(wb, le->u.mvcc.num_pxrs);
|
||||
wbuf_nocrc_literal_bytes(wb, key, keylen);
|
||||
wbuf_nocrc_literal_bytes(wb, le->u.mvcc.xrs, le_disk_size - (1 + 4 + 1));
|
||||
}
|
||||
uint32_t end_spot UU() = wb->ndone;
|
||||
paranoid_invariant((end_spot - begin_spot) == keylen + sizeof(keylen) + le_disk_size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
static uint32_t
|
||||
serialize_ftnode_partition_size (FTNODE node, int i)
|
||||
{
|
||||
uint32_t result = 0;
|
||||
|
@ -320,14 +295,14 @@ serialize_ftnode_partition_size (FTNODE node, int i)
|
|||
result += toku_bnc_nbytesinbuf(BNC(node, i));
|
||||
}
|
||||
else {
|
||||
result += 4; // n_entries in buffer table
|
||||
result += 4 + bn_data::HEADER_LENGTH; // n_entries in buffer table + basement header
|
||||
result += BLB_NBYTESINDATA(node, i);
|
||||
}
|
||||
result += 4; // checksum
|
||||
return result;
|
||||
}
|
||||
|
||||
#define FTNODE_PARTITION_OMT_LEAVES 0xaa
|
||||
#define FTNODE_PARTITION_DMT_LEAVES 0xaa
|
||||
#define FTNODE_PARTITION_FIFO_MSG 0xbb
|
||||
|
||||
static void
|
||||
|
@ -374,16 +349,13 @@ serialize_ftnode_partition(FTNODE node, int i, struct sub_block *sb) {
|
|||
serialize_nonleaf_childinfo(BNC(node, i), &wb);
|
||||
}
|
||||
else {
|
||||
unsigned char ch = FTNODE_PARTITION_OMT_LEAVES;
|
||||
BN_DATA bd = BLB_DATA(node, i);
|
||||
unsigned char ch = FTNODE_PARTITION_DMT_LEAVES;
|
||||
bn_data* bd = BLB_DATA(node, i);
|
||||
|
||||
wbuf_nocrc_char(&wb, ch);
|
||||
wbuf_nocrc_uint(&wb, bd->omt_size());
|
||||
wbuf_nocrc_uint(&wb, bd->num_klpairs());
|
||||
|
||||
//
|
||||
// iterate over leafentries and place them into the buffer
|
||||
//
|
||||
bd->omt_iterate<struct wbuf, wbufwriteleafentry>(&wb);
|
||||
bd->serialize_to_wbuf(&wb);
|
||||
}
|
||||
uint32_t end_to_end_checksum = x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
|
||||
wbuf_nocrc_int(&wb, end_to_end_checksum);
|
||||
|
@@ -546,7 +518,7 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
     // Count number of leaf entries in this leaf (num_le).
     uint32_t num_le = 0;
     for (uint32_t i = 0; i < num_orig_basements; i++) {
-        num_le += BLB_DATA(node, i)->omt_size();
+        num_le += BLB_DATA(node, i)->num_klpairs();
     }
 
     uint32_t num_alloc = num_le ? num_le : 1; // simplify logic below by always having at least one entry per array
@@ -571,10 +543,10 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
 
     uint32_t curr_le = 0;
     for (uint32_t i = 0; i < num_orig_basements; i++) {
-        BN_DATA bd = BLB_DATA(node, i);
+        bn_data* bd = BLB_DATA(node, i);
         struct array_info ai {.offset = curr_le, .le_array = leafpointers, .key_sizes_array = key_sizes, .key_ptr_array = key_pointers };
-        bd->omt_iterate<array_info, array_item>(&ai);
-        curr_le += bd->omt_size();
+        bd->iterate<array_info, array_item>(&ai);
+        curr_le += bd->num_klpairs();
     }
 
     // Create an array that will store indexes of new pivots.
@@ -592,9 +564,14 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
     // Create an array that will store the size of each basement.
     // This is the sum of the leaf sizes of all the leaves in that basement.
     // We don't know how many basements there will be, so we use num_le as the upper bound.
-    toku::scoped_malloc bn_sizes_buf(sizeof(size_t) * num_alloc);
-    size_t *bn_sizes = reinterpret_cast<size_t *>(bn_sizes_buf.get());
-    bn_sizes[0] = 0;
 
+    // Sum of all le sizes in a single basement
+    toku::scoped_calloc bn_le_sizes_buf(sizeof(size_t) * num_alloc);
+    size_t *bn_le_sizes = reinterpret_cast<size_t *>(bn_le_sizes_buf.get());
+
+    // Sum of all key sizes in a single basement
+    toku::scoped_calloc bn_key_sizes_buf(sizeof(size_t) * num_alloc);
+    size_t *bn_key_sizes = reinterpret_cast<size_t *>(bn_key_sizes_buf.get());
+
     // TODO 4050: All these arrays should be combined into a single array of some bn_info struct (pivot, msize, num_les).
     // Each entry is the number of leafentries in this basement. (Again, num_le is an overkill upper bound.)
@@ -611,7 +588,7 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
     for (uint32_t i = 0; i < num_le; i++) {
         uint32_t curr_le_size = leafentry_disksize((LEAFENTRY) leafpointers[i]);
         le_sizes[i] = curr_le_size;
-        if ((bn_size_so_far + curr_le_size > basementnodesize) && (num_le_in_curr_bn != 0)) {
+        if ((bn_size_so_far + curr_le_size + sizeof(uint32_t) + key_sizes[i] > basementnodesize) && (num_le_in_curr_bn != 0)) {
             // cap off the current basement node to end with the element before i
             new_pivots[curr_pivot] = i-1;
             curr_pivot++;
@@ -620,8 +597,9 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
         }
         num_le_in_curr_bn++;
         num_les_this_bn[curr_pivot] = num_le_in_curr_bn;
+        bn_le_sizes[curr_pivot] += curr_le_size;
+        bn_key_sizes[curr_pivot] += sizeof(uint32_t) + key_sizes[i]; // uint32_t le_offset
         bn_size_so_far += curr_le_size + sizeof(uint32_t) + key_sizes[i];
-        bn_sizes[curr_pivot] = bn_size_so_far;
     }
     // curr_pivot is now the total number of pivot keys in the leaf node
     int num_pivots = curr_pivot;
@@ -688,17 +666,15 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
         uint32_t num_les_to_copy = num_les_this_bn[i];
         invariant(num_les_to_copy == num_in_bn);
 
-        // construct mempool for this basement
-        size_t size_this_bn = bn_sizes[i];
-
-        BN_DATA bd = BLB_DATA(node, i);
-        bd->replace_contents_with_clone_of_sorted_array(
+        bn_data* bd = BLB_DATA(node, i);
+        bd->set_contents_as_clone_of_sorted_array(
             num_les_to_copy,
             &key_pointers[baseindex_this_bn],
             &key_sizes[baseindex_this_bn],
             &leafpointers[baseindex_this_bn],
             &le_sizes[baseindex_this_bn],
-            size_this_bn
+            bn_key_sizes[i], // Total key sizes
+            bn_le_sizes[i]   // total le sizes
         );
 
         BP_STATE(node,i) = PT_AVAIL;
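Review note: with keys stored apart from the leafentries, a klpair's cost is its leafentry bytes plus its key bytes plus a uint32_t offset slot, and the capping loop above charges all three against basementnodesize. The same packing rule in isolation, with hypothetical names:

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical: greedily split a sorted run of (keylen, lelen) pairs into
// basements of at most cap bytes, charging sizeof(uint32_t) per key for its
// offset slot, mirroring the if-condition in the loop above.
static std::vector<size_t>
plan_basements(const std::vector<std::pair<uint32_t, uint32_t>> &klpairs, size_t cap) {
    std::vector<size_t> last_index_per_bn;
    size_t size_so_far = 0;
    size_t count_in_bn = 0;
    for (size_t i = 0; i < klpairs.size(); i++) {
        size_t cost = klpairs[i].second + sizeof(uint32_t) + klpairs[i].first;
        if (size_so_far + cost > cap && count_in_bn != 0) {
            last_index_per_bn.push_back(i - 1); // cap off before element i
            size_so_far = 0;
            count_in_bn = 0;
        }
        size_so_far += cost;
        count_in_bn++;
    }
    return last_index_per_bn;
}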
@@ -1541,15 +1517,14 @@ deserialize_ftnode_partition(
         BP_WORKDONE(node, childnum) = 0;
     }
     else {
-        assert(ch == FTNODE_PARTITION_OMT_LEAVES);
+        assert(ch == FTNODE_PARTITION_DMT_LEAVES);
         BLB_SEQINSERT(node, childnum) = 0;
         uint32_t num_entries = rbuf_int(&rb);
         // we are now at the first byte of first leafentry
         data_size -= rb.ndone; // remaining bytes of leafentry data
 
         BASEMENTNODE bn = BLB(node, childnum);
-        bn->data_buffer.initialize_from_data(num_entries, &rb.buf[rb.ndone], data_size);
-        rb.ndone += data_size;
+        bn->data_buffer.deserialize_from_rbuf(num_entries, &rb, data_size, node->layout_version_read_from_disk);
     }
     assert(rb.ndone == rb.size);
 exit:
@@ -2086,13 +2061,18 @@ deserialize_and_upgrade_leaf_node(FTNODE node,
         assert_zero(r);
         // Copy the pointer value straight into the OMT
         LEAFENTRY new_le_in_bn = nullptr;
+        void *maybe_free;
         bn->data_buffer.get_space_for_insert(
             i,
             key,
             keylen,
             new_le_size,
-            &new_le_in_bn
+            &new_le_in_bn,
+            &maybe_free
             );
+        if (maybe_free) {
+            toku_free(maybe_free);
+        }
         memcpy(new_le_in_bn, new_le, new_le_size);
         toku_free(new_le);
     }
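Review note: get_space_for_insert now reports any storage it abandoned through the maybe_free out-parameter instead of freeing it itself, so a caller whose source bytes could live in the old block can finish copying before releasing it. A minimal sketch of that contract, with all names hypothetical:

#include <cstdlib>
#include <cstring>

// Hypothetical growable pool that relocates on growth but leaves the old
// block alive until the caller releases it.
struct toy_pool {
    char *base = nullptr;
    size_t used = 0, cap = 0;

    char *get_space(size_t n, void **maybe_free) {
        *maybe_free = nullptr;
        if (used + n > cap) {
            size_t newcap = 2 * (used + n);
            char *bigger = static_cast<char *>(malloc(newcap));
            if (used) {
                memcpy(bigger, base, used);
            }
            *maybe_free = base; // caller frees once old bytes are no longer needed
            base = bigger;
            cap = newcap;
        }
        char *slot = base + used;
        used += n;
        return slot;
    }
};

// Usage mirrors the diff: copy into the fresh slot first, then free.
static void pool_insert(toy_pool *p, const char *src, size_t n) {
    void *maybe_free;
    char *slot = p->get_space(n, &maybe_free);
    memcpy(slot, src, n);
    if (maybe_free) {
        free(maybe_free);
    }
}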
@@ -2101,8 +2081,7 @@ deserialize_and_upgrade_leaf_node(FTNODE node,
         if (has_end_to_end_checksum) {
             data_size -= sizeof(uint32_t);
         }
-        bn->data_buffer.initialize_from_data(n_in_buf, &rb->buf[rb->ndone], data_size);
-        rb->ndone += data_size;
+        bn->data_buffer.deserialize_from_rbuf(n_in_buf, rb, data_size, node->layout_version_read_from_disk);
     }
 
     // Whatever this is must be less than the MSNs of every message above
@@ -2917,7 +2917,7 @@ static void add_pair_to_leafnode (struct leaf_buf *lbuf, unsigned char *key, int
     // #3588 TODO just make a clean ule and append it to the omt
     // #3588 TODO can do the rebalancing here and avoid a lot of work later
     FTNODE leafnode = lbuf->node;
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
     DBT thekey = { .data = key, .size = (uint32_t) keylen };
     DBT theval = { .data = val, .size = (uint32_t) vallen };
     FT_MSG_S cmd = { .type = FT_INSERT,
@@ -234,7 +234,7 @@ typedef struct cachetable *CACHETABLE;
 typedef struct cachefile *CACHEFILE;
 typedef struct ctpair *PAIR;
 typedef class checkpointer *CHECKPOINTER;
-typedef class bn_data *BN_DATA;
+class bn_data;
 
 /* tree command types */
 enum ft_msg_type {
@@ -98,6 +98,7 @@ struct memarena {
     char *buf;
     size_t buf_used, buf_size;
     size_t size_of_other_bufs; // the buf_size of all the other bufs.
+    size_t footprint_of_other_bufs; // the footprint of all the other bufs.
     char **other_bufs;
     int n_other_bufs;
 };
@@ -108,6 +109,7 @@ MEMARENA memarena_create_presized (size_t initial_size) {
     result->buf_used = 0;
     result->other_bufs = NULL;
     result->size_of_other_bufs = 0;
+    result->footprint_of_other_bufs = 0;
     result->n_other_bufs = 0;
     XMALLOC_N(result->buf_size, result->buf);
     return result;
@@ -128,6 +130,7 @@ void memarena_clear (MEMARENA ma) {
     // But reuse the main buffer
     ma->buf_used = 0;
     ma->size_of_other_bufs = 0;
+    ma->footprint_of_other_bufs = 0;
 }
 
 static size_t
@@ -151,6 +154,7 @@ void* malloc_in_memarena (MEMARENA ma, size_t size) {
         ma->other_bufs[old_n]=ma->buf;
         ma->n_other_bufs = old_n+1;
         ma->size_of_other_bufs += ma->buf_size;
+        ma->footprint_of_other_bufs += toku_memory_footprint(ma->buf, ma->buf_used);
     }
     // Make a new one
     {
@@ -217,7 +221,9 @@ void memarena_move_buffers(MEMARENA dest, MEMARENA source) {
 #endif
 
     dest ->size_of_other_bufs += source->size_of_other_bufs + source->buf_size;
+    dest ->footprint_of_other_bufs += source->footprint_of_other_bufs + toku_memory_footprint(source->buf, source->buf_used);
     source->size_of_other_bufs = 0;
+    source->footprint_of_other_bufs = 0;
 
     assert(other_bufs);
     dest->other_bufs = other_bufs;
@@ -246,4 +252,12 @@ size_t
 memarena_total_size_in_use (MEMARENA m)
 {
     return m->size_of_other_bufs + m->buf_used;
 }
+
+size_t
+memarena_total_footprint (MEMARENA m)
+{
+    return m->footprint_of_other_bufs + toku_memory_footprint(m->buf, m->buf_used) +
+           sizeof(*m) +
+           m->n_other_bufs * sizeof(*m->other_bufs);
+}
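Review note: memarena_total_size_in_use counts bytes handed out, while the new memarena_total_footprint approximates what the allocator actually reserved, so rounding and bookkeeping overhead reach the cachetable's accounting. A toy illustration of why the two diverge, under an assumed round-up-to-16 allocator (the real code asks toku_memory_footprint):

#include <cstddef>
#include <cstdio>

// Hypothetical allocator model: usable size rounds up to the next 16 bytes.
static size_t rounded_footprint(size_t requested) {
    return (requested + 15) & ~static_cast<size_t>(15);
}

int main(void) {
    size_t in_use = 0, footprint = 0;
    const size_t requests[] = {7, 100, 33};
    for (size_t r : requests) {
        in_use += r;                       // what memarena_total_size_in_use sums
        footprint += rounded_footprint(r); // what memarena_total_footprint approximates
    }
    printf("in use: %zu, footprint: %zu\n", in_use, footprint); // 140 vs 176
    return 0;
}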
@@ -129,5 +129,6 @@ size_t memarena_total_memory_size (MEMARENA);
 
 size_t memarena_total_size_in_use (MEMARENA);
 
+size_t memarena_total_footprint (MEMARENA);
 
 #endif
@@ -146,7 +146,7 @@ PAIR_ATTR
 rollback_memory_size(ROLLBACK_LOG_NODE log) {
     size_t size = sizeof(*log);
     if (log->rollentry_arena) {
-        size += memarena_total_memory_size(log->rollentry_arena);
+        size += memarena_total_footprint(log->rollentry_arena);
     }
     return make_rollback_pair_attr(size);
 }
906
ft/tests/dmt-test.cc
Normal file
@@ -0,0 +1,906 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:

  This program is free software; you can redistribute it and/or modify
  it under the terms of version 2 of the GNU General Public License as
  published by the Free Software Foundation, and provided that the
  following conditions are met:

    * Redistributions of source code must retain this COPYING
      CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
      DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
      PATENT MARKING NOTICE (below), and the PATENT RIGHTS
      GRANT (below).

    * Redistributions in binary form must reproduce this COPYING
      CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
      DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
      PATENT MARKING NOTICE (below), and the PATENT RIGHTS
      GRANT (below) in the documentation and/or other materials
      provided with the distribution.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  02110-1301, USA.

COPYRIGHT NOTICE:

  TokuDB, Tokutek Fractal Tree Indexing Library.
  Copyright (C) 2007-2013 Tokutek, Inc.

DISCLAIMER:

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

UNIVERSITY PATENT NOTICE:

  The technology is licensed by the Massachusetts Institute of
  Technology, Rutgers State University of New Jersey, and the Research
  Foundation of State University of New York at Stony Brook under
  United States of America Serial No. 11/760379 and to the patents
  and/or patent applications resulting from it.

PATENT MARKING NOTICE:

  This software is covered by US Patent No. 8,185,551.
  This software is covered by US Patent No. 8,489,638.

PATENT RIGHTS GRANT:

  "THIS IMPLEMENTATION" means the copyrightable works distributed by
  Tokutek as part of the Fractal Tree project.

  "PATENT CLAIMS" means the claims of patents that are owned or
  licensable by Tokutek, both currently or in the future; and that in
  the absence of this license would be infringed by THIS
  IMPLEMENTATION or by using or running THIS IMPLEMENTATION.

  "PATENT CHALLENGE" shall mean a challenge to the validity,
  patentability, enforceability and/or non-infringement of any of the
  PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.

  Tokutek hereby grants to you, for the term and geographical scope of
  the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
  irrevocable (except as stated in this section) patent license to
  make, have made, use, offer to sell, sell, import, transfer, and
  otherwise run, modify, and propagate the contents of THIS
  IMPLEMENTATION, where such license applies only to the PATENT
  CLAIMS. This grant does not include claims that would be infringed
  only as a consequence of further modifications of THIS
  IMPLEMENTATION. If you or your agent or licensee institute or order
  or agree to the institution of patent litigation against any entity
  (including a cross-claim or counterclaim in a lawsuit) alleging that
  THIS IMPLEMENTATION constitutes direct or contributory patent
  infringement, or inducement of patent infringement, then any rights
  granted to you under this License shall terminate as of the date
  such litigation is filed. If you or your agent or exclusive
  licensee institute or order or agree to the institution of a PATENT
  CHALLENGE, then Tokutek may terminate any rights granted to you
  under this License.
*/

#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."

#include "test.h"
|
||||
|
||||
#include "dmt-wrapper.h"
|
||||
#include <util/dmt.h>
|
||||
|
||||
typedef DMTVALUE TESTVALUE;
|
||||
|
||||
static void
|
||||
parse_args (int argc, const char *argv[]) {
|
||||
const char *argv0=argv[0];
|
||||
while (argc>1) {
|
||||
int resultcode=0;
|
||||
if (strcmp(argv[1], "-v")==0) {
|
||||
verbose++;
|
||||
} else if (strcmp(argv[1], "-q")==0) {
|
||||
verbose = 0;
|
||||
} else if (strcmp(argv[1], "-h")==0) {
|
||||
do_usage:
|
||||
fprintf(stderr, "Usage:\n%s [-v|-h]\n", argv0);
|
||||
exit(resultcode);
|
||||
} else {
|
||||
resultcode=1;
|
||||
goto do_usage;
|
||||
}
|
||||
argc--;
|
||||
argv++;
|
||||
}
|
||||
}
|
||||
/* End ".h like" stuff. */
|
||||
|
||||
struct value {
|
||||
uint32_t number;
|
||||
};
|
||||
#define V(x) ((struct value *)(x))
|
||||
|
||||
enum rand_type {
|
||||
TEST_RANDOM,
|
||||
TEST_SORTED,
|
||||
TEST_IDENTITY
|
||||
};
|
||||
enum close_when_done {
|
||||
CLOSE_WHEN_DONE,
|
||||
KEEP_WHEN_DONE
|
||||
};
|
||||
enum create_type {
|
||||
STEAL_ARRAY,
|
||||
BATCH_INSERT,
|
||||
INSERT_AT,
|
||||
INSERT_AT_ALMOST_RANDOM,
|
||||
};
|
||||
|
||||
/* Globals */
|
||||
DMT global_dmt;
|
||||
TESTVALUE* values = NULL;
|
||||
struct value* nums = NULL;
|
||||
uint32_t length;
|
||||
|
||||
static void
|
||||
cleanup_globals (void) {
|
||||
assert(values);
|
||||
toku_free(values);
|
||||
values = NULL;
|
||||
assert(nums);
|
||||
toku_free(nums);
|
||||
nums = NULL;
|
||||
}
|
||||
|
||||
const unsigned int random_seed = 0xFEADACBA;
|
||||
|
||||
static void
|
||||
init_init_values (unsigned int seed, uint32_t num_elements) {
|
||||
srandom(seed);
|
||||
|
||||
cleanup_globals();
|
||||
|
||||
MALLOC_N(num_elements, values);
|
||||
assert(values);
|
||||
MALLOC_N(num_elements, nums);
|
||||
assert(nums);
|
||||
length = num_elements;
|
||||
}
|
||||
|
||||
static void
|
||||
init_identity_values (unsigned int seed, uint32_t num_elements) {
|
||||
uint32_t i;
|
||||
|
||||
init_init_values(seed, num_elements);
|
||||
|
||||
for (i = 0; i < length; i++) {
|
||||
nums[i].number = i;
|
||||
values[i] = (TESTVALUE)&nums[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
init_distinct_sorted_values (unsigned int seed, uint32_t num_elements) {
|
||||
uint32_t i;
|
||||
|
||||
init_init_values(seed, num_elements);
|
||||
|
||||
uint32_t number = 0;
|
||||
|
||||
for (i = 0; i < length; i++) {
|
||||
number += (uint32_t)(random() % 32) + 1;
|
||||
nums[i].number = number;
|
||||
values[i] = (TESTVALUE)&nums[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
init_distinct_random_values (unsigned int seed, uint32_t num_elements) {
|
||||
init_distinct_sorted_values(seed, num_elements);
|
||||
|
||||
uint32_t i;
|
||||
uint32_t choice;
|
||||
uint32_t choices;
|
||||
struct value temp;
|
||||
for (i = 0; i < length - 1; i++) {
|
||||
choices = length - i;
|
||||
choice = random() % choices;
|
||||
if (choice != i) {
|
||||
temp = nums[i];
|
||||
nums[i] = nums[choice];
|
||||
nums[choice] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
init_globals (void) {
|
||||
MALLOC_N(1, values);
|
||||
assert(values);
|
||||
MALLOC_N(1, nums);
|
||||
assert(nums);
|
||||
length = 1;
|
||||
}
|
||||
|
||||
static void
test_close (enum close_when_done do_close) {
    if (do_close == KEEP_WHEN_DONE) return;
    assert(do_close == CLOSE_WHEN_DONE);
    toku_dmt_destroy(&global_dmt);
    assert(global_dmt==NULL);
}

static void
test_create (enum close_when_done do_close) {
    int r;
    global_dmt = NULL;

    r = toku_dmt_create(&global_dmt);
    CKERR(r);
    assert(global_dmt!=NULL);
    test_close(do_close);
}

static void
test_create_size (enum close_when_done do_close) {
    test_create(KEEP_WHEN_DONE);
    assert(toku_dmt_size(global_dmt) == 0);
    test_close(do_close);
}

static void
test_create_insert_at_almost_random (enum close_when_done do_close) {
    uint32_t i;
    int r;
    uint32_t size = 0;

    test_create(KEEP_WHEN_DONE);
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
    CKERR2(r, EINVAL);
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
    CKERR2(r, EINVAL);
    for (i = 0; i < length/2; i++) {
        assert(size==toku_dmt_size(global_dmt));
        r = toku_dmt_insert_at(global_dmt, values[i], i);
        CKERR(r);
        assert(++size==toku_dmt_size(global_dmt));
        r = toku_dmt_insert_at(global_dmt, values[length-1-i], i+1);
        CKERR(r);
        assert(++size==toku_dmt_size(global_dmt));
    }
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
    CKERR2(r, EINVAL);
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
    CKERR2(r, EINVAL);
    assert(size==toku_dmt_size(global_dmt));
    test_close(do_close);
}

static void
test_create_insert_at_sequential (enum close_when_done do_close) {
    uint32_t i;
    int r;
    uint32_t size = 0;

    test_create(KEEP_WHEN_DONE);
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
    CKERR2(r, EINVAL);
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
    CKERR2(r, EINVAL);
    for (i = 0; i < length; i++) {
        assert(size==toku_dmt_size(global_dmt));
        r = toku_dmt_insert_at(global_dmt, values[i], i);
        CKERR(r);
        assert(++size==toku_dmt_size(global_dmt));
    }
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
    CKERR2(r, EINVAL);
    r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
    CKERR2(r, EINVAL);
    assert(size==toku_dmt_size(global_dmt));
    test_close(do_close);
}

static void
test_create_from_sorted_array (enum create_type create_choice, enum close_when_done do_close) {
    int r;
    global_dmt = NULL;

    if (create_choice == BATCH_INSERT) {
        r = toku_dmt_create_from_sorted_array(&global_dmt, values, length);
        CKERR(r);
    }
    else if (create_choice == STEAL_ARRAY) {
        TESTVALUE* MALLOC_N(length, values_copy);
        memcpy(values_copy, values, length*sizeof(*values));
        r = toku_dmt_create_steal_sorted_array(&global_dmt, &values_copy, length, length);
        CKERR(r);
        assert(values_copy==NULL);
    }
    else if (create_choice == INSERT_AT) {
        test_create_insert_at_sequential(KEEP_WHEN_DONE);
    }
    else if (create_choice == INSERT_AT_ALMOST_RANDOM) {
        test_create_insert_at_almost_random(KEEP_WHEN_DONE);
    }
    else assert(false);

    assert(global_dmt!=NULL);
    test_close(do_close);
}

static void
test_create_from_sorted_array_size (enum create_type create_choice, enum close_when_done do_close) {
    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
    assert(toku_dmt_size(global_dmt)==length);
    test_close(do_close);
}

static void
test_fetch_verify (DMT dmtree, TESTVALUE* val, uint32_t len ) {
    uint32_t i;
    int r;
    TESTVALUE v = (TESTVALUE)&i;
    TESTVALUE oldv = v;

    assert(len == toku_dmt_size(dmtree));
    for (i = 0; i < len; i++) {
        assert(oldv!=val[i]);
        v = NULL;
        r = toku_dmt_fetch(dmtree, i, &v);
        CKERR(r);
        assert(v != NULL);
        assert(v != oldv);
        assert(v == val[i]);
        assert(V(v)->number == V(val[i])->number);
        v = oldv;
    }

    for (i = len; i < len*2; i++) {
        v = oldv;
        r = toku_dmt_fetch(dmtree, i, &v);
        CKERR2(r, EINVAL);
        assert(v == oldv);
    }

}

static void
test_create_fetch_verify (enum create_type create_choice, enum close_when_done do_close) {
    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
    test_fetch_verify(global_dmt, values, length);
    test_close(do_close);
}

static int iterate_helper_error_return = 1;

static int
iterate_helper (TESTVALUE v, uint32_t idx, void* extra) {
    if (extra == NULL) return iterate_helper_error_return;
    TESTVALUE* vals = (TESTVALUE *)extra;
    assert(v != NULL);
    assert(v == vals[idx]);
    assert(V(v)->number == V(vals[idx])->number);
    return 0;
}

static void
test_iterate_verify (DMT dmtree, TESTVALUE* vals, uint32_t len) {
    int r;
    iterate_helper_error_return = 0;
    r = toku_dmt_iterate(dmtree, iterate_helper, (void*)vals);
    CKERR(r);
    iterate_helper_error_return = 0xFEEDABBA;
    r = toku_dmt_iterate(dmtree, iterate_helper, NULL);
    if (!len) {
        CKERR2(r, 0);
    }
    else {
        CKERR2(r, iterate_helper_error_return);
    }
}

static void
test_create_iterate_verify (enum create_type create_choice, enum close_when_done do_close) {
    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
    test_iterate_verify(global_dmt, values, length);
    test_close(do_close);
}


static void
permute_array (uint32_t* arr, uint32_t len) {
    //
    // create a permutation of 0...size-1
    //
    uint32_t i = 0;
    for (i = 0; i < len; i++) {
        arr[i] = i;
    }
    for (i = 0; i < len - 1; i++) {
        uint32_t choices = len - i;
        uint32_t choice = random() % choices;
        if (choice != i) {
            uint32_t temp = arr[i];
            arr[i] = arr[choice];
            arr[choice] = temp;
        }
    }
}

static void
test_create_set_at (enum create_type create_choice, enum close_when_done do_close) {
    uint32_t i = 0;

    struct value* old_nums = NULL;
    MALLOC_N(length, old_nums);
    assert(nums);

    uint32_t* perm = NULL;
    MALLOC_N(length, perm);
    assert(perm);

    TESTVALUE* old_values = NULL;
    MALLOC_N(length, old_values);
    assert(old_values);

    permute_array(perm, length);

    //
    // These are going to be the new values
    //
    for (i = 0; i < length; i++) {
        old_nums[i] = nums[i];
        old_values[i] = &old_nums[i];
        values[i] = &old_nums[i];
    }
    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
    int r;
    r = toku_dmt_set_at (global_dmt, values[0], length);
    CKERR2(r,EINVAL);
    r = toku_dmt_set_at (global_dmt, values[0], length+1);
    CKERR2(r,EINVAL);
    for (i = 0; i < length; i++) {
        uint32_t choice = perm[i];
        values[choice] = &nums[choice];
        nums[choice].number = (uint32_t)random();
        r = toku_dmt_set_at (global_dmt, values[choice], choice);
        CKERR(r);
        test_iterate_verify(global_dmt, values, length);
        test_fetch_verify(global_dmt, values, length);
    }
    r = toku_dmt_set_at (global_dmt, values[0], length);
    CKERR2(r,EINVAL);
    r = toku_dmt_set_at (global_dmt, values[0], length+1);
    CKERR2(r,EINVAL);

    toku_free(perm);
    toku_free(old_values);
    toku_free(old_nums);

    test_close(do_close);
}

static int
insert_helper (TESTVALUE value, void* extra_insert) {
    TESTVALUE to_insert = (DMTVALUE)extra_insert;
    assert(to_insert);

    if (V(value)->number < V(to_insert)->number) return -1;
    if (V(value)->number > V(to_insert)->number) return +1;
    return 0;
}

static void
test_create_insert (enum close_when_done do_close) {
    uint32_t i = 0;

    uint32_t* perm = NULL;
    MALLOC_N(length, perm);
    assert(perm);

    permute_array(perm, length);

    test_create(KEEP_WHEN_DONE);
    int r;
    uint32_t size = length;
    length = 0;
    while (length < size) {
        uint32_t choice = perm[length];
        TESTVALUE to_insert = &nums[choice];
        uint32_t idx = UINT32_MAX;

        assert(length==toku_dmt_size(global_dmt));
        r = toku_dmt_insert(global_dmt, to_insert, insert_helper, to_insert, &idx);
        CKERR(r);
        assert(idx <= length);
        if (idx > 0) {
            assert(V(to_insert)->number > V(values[idx-1])->number);
        }
        if (idx < length) {
            assert(V(to_insert)->number < V(values[idx])->number);
        }
        length++;
        assert(length==toku_dmt_size(global_dmt));
        /* Make room */
        for (i = length-1; i > idx; i--) {
            values[i] = values[i-1];
        }
        values[idx] = to_insert;
        test_fetch_verify(global_dmt, values, length);
        test_iterate_verify(global_dmt, values, length);

        idx = UINT32_MAX;
        r = toku_dmt_insert(global_dmt, to_insert, insert_helper, to_insert, &idx);
        CKERR2(r, DB_KEYEXIST);
        assert(idx < length);
        assert(V(values[idx])->number == V(to_insert)->number);
        assert(length==toku_dmt_size(global_dmt));

        test_iterate_verify(global_dmt, values, length);
        test_fetch_verify(global_dmt, values, length);
    }

    toku_free(perm);

    test_close(do_close);
}

static void
test_create_delete_at (enum create_type create_choice, enum close_when_done do_close) {
    uint32_t i = 0;
    int r = ENOSYS;
    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);

    assert(length == toku_dmt_size(global_dmt));
    r = toku_dmt_delete_at(global_dmt, length);
    CKERR2(r,EINVAL);
    assert(length == toku_dmt_size(global_dmt));
    r = toku_dmt_delete_at(global_dmt, length+1);
    CKERR2(r,EINVAL);
    while (length > 0) {
        assert(length == toku_dmt_size(global_dmt));
        uint32_t index_to_delete = random()%length;
        r = toku_dmt_delete_at(global_dmt, index_to_delete);
        CKERR(r);
        for (i = index_to_delete+1; i < length; i++) {
            values[i-1] = values[i];
        }
        length--;
        test_fetch_verify(global_dmt, values, length);
        test_iterate_verify(global_dmt, values, length);
    }
    assert(length == 0);
    assert(length == toku_dmt_size(global_dmt));
    r = toku_dmt_delete_at(global_dmt, length);
    CKERR2(r, EINVAL);
    assert(length == toku_dmt_size(global_dmt));
    r = toku_dmt_delete_at(global_dmt, length+1);
    CKERR2(r, EINVAL);
    test_close(do_close);
}

static void
test_split_merge (enum create_type create_choice, enum close_when_done do_close) {
    int r = ENOSYS;
    uint32_t i = 0;
    DMT left_split = NULL;
    DMT right_split = NULL;
    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);

    for (i = 0; i <= length; i++) {
        r = toku_dmt_split_at(global_dmt, &right_split, length+1);
        CKERR2(r,EINVAL);
        r = toku_dmt_split_at(global_dmt, &right_split, length+2);
        CKERR2(r,EINVAL);

        //
        // test successful split
        //
        r = toku_dmt_split_at(global_dmt, &right_split, i);
        CKERR(r);
        left_split = global_dmt;
        global_dmt = NULL;
        assert(toku_dmt_size(left_split) == i);
        assert(toku_dmt_size(right_split) == length - i);
        test_fetch_verify(left_split, values, i);
        test_iterate_verify(left_split, values, i);
        test_fetch_verify(right_split, &values[i], length - i);
        test_iterate_verify(right_split, &values[i], length - i);
        //
        // verify that new global_dmt's cannot do bad splits
        //
        r = toku_dmt_split_at(left_split, &global_dmt, i+1);
        CKERR2(r,EINVAL);
        assert(toku_dmt_size(left_split) == i);
        assert(toku_dmt_size(right_split) == length - i);
        r = toku_dmt_split_at(left_split, &global_dmt, i+2);
        CKERR2(r,EINVAL);
        assert(toku_dmt_size(left_split) == i);
        assert(toku_dmt_size(right_split) == length - i);
        r = toku_dmt_split_at(right_split, &global_dmt, length - i + 1);
        CKERR2(r,EINVAL);
        assert(toku_dmt_size(left_split) == i);
        assert(toku_dmt_size(right_split) == length - i);
        r = toku_dmt_split_at(right_split, &global_dmt, length - i + 1);
        CKERR2(r,EINVAL);
        assert(toku_dmt_size(left_split) == i);
        assert(toku_dmt_size(right_split) == length - i);

        //
        // test merge
        //
        r = toku_dmt_merge(left_split,right_split,&global_dmt);
        CKERR(r);
        left_split = NULL;
        right_split = NULL;
        assert(toku_dmt_size(global_dmt) == length);
        test_fetch_verify(global_dmt, values, length);
        test_iterate_verify(global_dmt, values, length);
    }
    test_close(do_close);
}


static void
init_values (enum rand_type rand_choice) {
    const uint32_t test_size = 100;
    if (rand_choice == TEST_RANDOM) {
        init_distinct_random_values(random_seed, test_size);
    }
    else if (rand_choice == TEST_SORTED) {
        init_distinct_sorted_values(random_seed, test_size);
    }
    else if (rand_choice == TEST_IDENTITY) {
        init_identity_values( random_seed, test_size);
    }
    else assert(false);
}

static void
test_create_array (enum create_type create_choice, enum rand_type rand_choice) {
    /* ********************************************************************** */
    init_values(rand_choice);
    test_create_from_sorted_array( create_choice, CLOSE_WHEN_DONE);
    test_create_from_sorted_array_size(create_choice, CLOSE_WHEN_DONE);
    /* ********************************************************************** */
    init_values(rand_choice);
    test_create_fetch_verify( create_choice, CLOSE_WHEN_DONE);
    /* ********************************************************************** */
    init_values(rand_choice);
    test_create_iterate_verify( create_choice, CLOSE_WHEN_DONE);
    /* ********************************************************************** */
    init_values(rand_choice);
    test_create_set_at( create_choice, CLOSE_WHEN_DONE);
    /* ********************************************************************** */
    init_values(rand_choice);
    test_create_delete_at( create_choice, CLOSE_WHEN_DONE);
    /* ********************************************************************** */
    init_values(rand_choice);
    test_create_insert( CLOSE_WHEN_DONE);
    /* ********************************************************************** */
    init_values(rand_choice);
    test_split_merge( create_choice, CLOSE_WHEN_DONE);
}

typedef struct {
    uint32_t first_zero;
    uint32_t first_pos;
} h_extra;


static int
test_heaviside (DMTVALUE v_dmt, void* x) {
    TESTVALUE v = (DMTVALUE) v_dmt;
    h_extra* extra = (h_extra*)x;
    assert(v && x);
    assert(extra->first_zero <= extra->first_pos);

    uint32_t value = V(v)->number;
    if (value < extra->first_zero) return -1;
    if (value < extra->first_pos) return 0;
    return 1;
}

static void
heavy_extra (h_extra* extra, uint32_t first_zero, uint32_t first_pos) {
    extra->first_zero = first_zero;
    extra->first_pos = first_pos;
}

static void
test_find_dir (int dir, void* extra, int (*h)(DMTVALUE, void*),
               int r_expect, bool idx_will_change, uint32_t idx_expect,
               uint32_t number_expect, bool UU(cursor_valid)) {
    uint32_t idx = UINT32_MAX;
    uint32_t old_idx = idx;
    TESTVALUE dmt_val;
    int r;

    dmt_val = NULL;

    /* Verify we can pass NULL value. */
    dmt_val = NULL;
    idx = old_idx;
    if (dir == 0) {
        r = toku_dmt_find_zero(global_dmt, h, extra, NULL, &idx);
    }
    else {
        r = toku_dmt_find( global_dmt, h, extra, dir, NULL, &idx);
    }
    CKERR2(r, r_expect);
    if (idx_will_change) {
        assert(idx == idx_expect);
    }
    else {
        assert(idx == old_idx);
    }
    assert(dmt_val == NULL);

    /* Verify we can pass NULL idx. */
    dmt_val = NULL;
    idx = old_idx;
    if (dir == 0) {
        r = toku_dmt_find_zero(global_dmt, h, extra, &dmt_val, 0);
    }
    else {
        r = toku_dmt_find( global_dmt, h, extra, dir, &dmt_val, 0);
    }
    CKERR2(r, r_expect);
    assert(idx == old_idx);
    if (r == DB_NOTFOUND) {
        assert(dmt_val == NULL);
    }
    else {
        assert(V(dmt_val)->number == number_expect);
    }

    /* Verify we can pass NULL both. */
    dmt_val = NULL;
    idx = old_idx;
    if (dir == 0) {
        r = toku_dmt_find_zero(global_dmt, h, extra, NULL, 0);
    }
    else {
        r = toku_dmt_find( global_dmt, h, extra, dir, NULL, 0);
    }
    CKERR2(r, r_expect);
    assert(idx == old_idx);
    assert(dmt_val == NULL);
}

static void
test_find (enum create_type create_choice, enum close_when_done do_close) {
    h_extra extra;
    init_identity_values(random_seed, 100);
    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);

    /*
        -...-
            A
    */
    heavy_extra(&extra, length, length);
    test_find_dir(-1, &extra, test_heaviside, 0, true, length-1, length-1, true);
    test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
    test_find_dir(0, &extra, test_heaviside, DB_NOTFOUND, true, length, length, false);


    /*
        +...+
        B
    */
    heavy_extra(&extra, 0, 0);
    test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
    test_find_dir(+1, &extra, test_heaviside, 0, true, 0, 0, true);
    test_find_dir(0, &extra, test_heaviside, DB_NOTFOUND, true, 0, 0, false);

    /*
        0...0
        C
    */
    heavy_extra(&extra, 0, length);
    test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
    test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
    test_find_dir(0, &extra, test_heaviside, 0, true, 0, 0, true);

    /*
        -...-0...0
            AC
    */
    heavy_extra(&extra, length/2, length);
    test_find_dir(-1, &extra, test_heaviside, 0, true, length/2-1, length/2-1, true);
    test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
    test_find_dir(0, &extra, test_heaviside, 0, true, length/2, length/2, true);

    /*
        0...0+...+
        C    B
    */
    heavy_extra(&extra, 0, length/2);
    test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
    test_find_dir(+1, &extra, test_heaviside, 0, true, length/2, length/2, true);
    test_find_dir(0, &extra, test_heaviside, 0, true, 0, 0, true);

    /*
        -...-+...+
            AB
    */
    heavy_extra(&extra, length/2, length/2);
    test_find_dir(-1, &extra, test_heaviside, 0, true, length/2-1, length/2-1, true);
    test_find_dir(+1, &extra, test_heaviside, 0, true, length/2, length/2, true);
    test_find_dir(0, &extra, test_heaviside, DB_NOTFOUND, true, length/2, length/2, false);

    /*
        -...-0...0+...+
            AC    B
    */
    heavy_extra(&extra, length/3, 2*length/3);
    test_find_dir(-1, &extra, test_heaviside, 0, true, length/3-1, length/3-1, true);
    test_find_dir(+1, &extra, test_heaviside, 0, true, 2*length/3, 2*length/3, true);
    test_find_dir(0, &extra, test_heaviside, 0, true, length/3, length/3, true);

    /* Cleanup */
    test_close(do_close);
}

static void
runtests_create_choice (enum create_type create_choice) {
    test_create_array(create_choice, TEST_SORTED);
    test_create_array(create_choice, TEST_RANDOM);
    test_create_array(create_choice, TEST_IDENTITY);
    test_find( create_choice, CLOSE_WHEN_DONE);
}

static void
test_clone(uint32_t nelts)
// Test that each clone operation gives the right data back. If nelts is
// zero, also tests that you still get a valid DMT back and that the way
// to deallocate it still works.
{
    DMT src = NULL, dest = NULL;
    int r;

    r = toku_dmt_create(&src);
    assert_zero(r);
    for (long i = 0; i < nelts; ++i) {
        r = toku_dmt_insert_at(src, (DMTVALUE) i, i);
        assert_zero(r);
    }

    r = toku_dmt_clone_noptr(&dest, src);
    assert_zero(r);
    assert(dest != NULL);
    assert(toku_dmt_size(dest) == nelts);
    for (long i = 0; i < nelts; ++i) {
        DMTVALUE v;
        long l;
        r = toku_dmt_fetch(dest, i, &v);
        assert_zero(r);
        l = (long) v;
        assert(l == i);
    }
    toku_dmt_destroy(&dest);
    toku_dmt_destroy(&src);
}

int
test_main(int argc, const char *argv[]) {
    parse_args(argc, argv);
    init_globals();
    test_create( CLOSE_WHEN_DONE);
    test_create_size( CLOSE_WHEN_DONE);
    runtests_create_choice(BATCH_INSERT);
    runtests_create_choice(STEAL_ARRAY);
    runtests_create_choice(INSERT_AT);
    runtests_create_choice(INSERT_AT_ALMOST_RANDOM);
    test_clone(0);
    test_clone(1);
    test_clone(1000);
    test_clone(10000);
    cleanup_globals();
    return 0;
}
373
ft/tests/dmt-test2.cc
Normal file
@@ -0,0 +1,373 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
|
||||
|
||||
#include "test.h"
|
||||
|
||||
#include <util/dmt.h>
|
||||
|
||||
static void
|
||||
parse_args (int argc, const char *argv[]) {
|
||||
const char *argv0=argv[0];
|
||||
while (argc>1) {
|
||||
int resultcode=0;
|
||||
if (strcmp(argv[1], "-v")==0) {
|
||||
verbose++;
|
||||
} else if (strcmp(argv[1], "-q")==0) {
|
||||
verbose = 0;
|
||||
} else if (strcmp(argv[1], "-h")==0) {
|
||||
do_usage:
|
||||
fprintf(stderr, "Usage:\n%s [-v|-h]\n", argv0);
|
||||
exit(resultcode);
|
||||
} else {
|
||||
resultcode=1;
|
||||
goto do_usage;
|
||||
}
|
||||
argc--;
|
||||
argv++;
|
||||
}
|
||||
}
|
||||
/* End ".h like" stuff. */
|
||||
|
||||
struct value {
|
||||
uint32_t number;
|
||||
};
|
||||
#define V(x) ((struct value *)(x))
|
||||
|
||||
|
||||
|
||||
const uint32_t MAXNUM = 1024;
|
||||
const uint32_t MAXLEN = 32;
|
||||
char data[MAXNUM][MAXLEN];
|
||||
|
||||
struct val_type {
|
||||
char c[MAXLEN];
|
||||
};
|
||||
|
||||
namespace toku {
|
||||
class vwriter {
|
||||
public:
|
||||
size_t get_size(void) const {
|
||||
size_t len = strlen(v.c);
|
||||
invariant(len < sizeof(val_type));
|
||||
return len + 1;
|
||||
}
|
||||
void write_to(val_type *const dest) const {
|
||||
strcpy(dest->c, v.c);
|
||||
}
|
||||
|
||||
vwriter(const char* c) {
|
||||
invariant(strlen(c) < sizeof(val_type));
|
||||
strcpy(v.c, c);
|
||||
}
|
||||
|
||||
vwriter(const uint32_t klpair_len, val_type *const src) {
|
||||
invariant(strlen(src->c) < sizeof(val_type));
|
||||
strcpy(v.c, src->c);
|
||||
invariant(klpair_len == get_size());
|
||||
}
|
||||
private:
|
||||
val_type v;
|
||||
};
|
||||
}
|
||||
|
||||
/* Globals */
|
||||
typedef toku::dmt<val_type, val_type*, toku::vwriter> vdmt;
|
||||
|
||||
const unsigned int random_seed = 0xFEADACBA;
|
||||
|
||||
///////////////
|
||||
|
||||
|
||||
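Review note: vwriter spells out the implicit interface a dmt writer class must satisfy: get_size(), write_to(dest), a constructor from the source value, and a constructor from the stored bytes. Assuming any class with those four members works, a writer for a fixed-width payload might look like this (entirely hypothetical, not part of the commit):

#include <cstddef>
#include <cstdint>
#include <cstring>

struct fixed_val {
    uint64_t x;
};

// Hypothetical writer for a fixed-width value, same implicit interface.
class fixed_writer {
  public:
    size_t get_size(void) const {
        return sizeof(fixed_val); // always the same width
    }
    void write_to(fixed_val *const dest) const {
        memcpy(dest, &v, sizeof v);
    }
    fixed_writer(uint64_t x) {
        v.x = x;
    }
    fixed_writer(const uint32_t klpair_len, fixed_val *const src) {
        memcpy(&v, src, sizeof v);
        (void) klpair_len; // a real writer would check klpair_len == get_size()
    }
  private:
    fixed_val v;
};

A dmt built over a writer like this should hit the fixed-length paths that test_builder_fixed exercises below.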
static void fail_one_verify(uint32_t len, uint32_t num, vdmt *v) {
    val_type* fetched_data;
    int count = 0;
    v->verify();
    for (uint32_t i = 0; i < num; i++) {
        uint32_t fetched_len;
        int r = v->fetch(i-count, &fetched_len, &fetched_data);
        if (r != 0 || fetched_len != len || strcmp(fetched_data->c, data[i])) {
            count++;
            continue;
        }
    }
    invariant(count == 1);
}

static void verify(uint32_t len, uint32_t num, vdmt *v) {
    v->verify();
    val_type* fetched_data;
    for (uint32_t i = 0; i < num; i++) {
        uint32_t fetched_len;
        int r = v->fetch(i, &fetched_len, &fetched_data);
        CKERR(r);
        invariant(fetched_len == len);
        invariant(!strcmp(fetched_data->c, data[i]));
    }
}


static void test_builder_fixed(uint32_t len, uint32_t num) {
    srandom(random_seed);
    assert(len > 1);
    assert(len <= MAXLEN);
    assert(num <= MAXNUM);
    for (uint32_t i = 0; i < num; i++) {
        for (uint32_t j = 0; j < len-1; j++) {
            data[i][j] = random() % 255 + 1; //This way it doesn't end up being 0 and thought of as NUL
        }
        data[i][len-1] = '\0'; //cap it
    }

    vdmt::builder builder;
    builder.create(num, num * len);

    for (uint32_t i = 0; i < num; i++) {
        vwriter vfun(data[i]);
        builder.append(vfun);
    }
    invariant(builder.value_length_is_fixed());
    vdmt v;
    builder.build(&v);
    invariant(v.value_length_is_fixed());
    invariant(v.get_fixed_length() == len);

    invariant(v.size() == num);

    verify(len, num, &v);

    for (uint32_t change = 0; change < num; change++) {
        vdmt v2;
        v2.clone(v);
        v2.delete_at(change);
        fail_one_verify(len, num, &v2);

        vwriter vfun(data[change]);
        v2.insert_at(vfun, change);
        verify(len, num, &v2);
        v2.destroy();
    }

    v.destroy();
}

static void test_builder_variable(uint32_t len, uint32_t len2, uint32_t num) {
    srandom(random_seed);
    assert(len > 1);
    assert(len <= MAXLEN);
    assert(num <= MAXNUM);
    assert(num > 3);
    uint32_t which2 = random() % num;
    for (uint32_t i = 0; i < num; i++) {
        uint32_t thislen = i == which2 ? len2 : len;
        for (uint32_t j = 0; j < thislen-1; j++) {
            data[i][j] = random() % 255 + 1; //This way it doesn't end up being 0 and thought of as NUL
        }
        data[i][thislen-1] = '\0'; //cap it
    }

    vdmt::builder builder;
    builder.create(num, (num-1) * len + len2);

    for (uint32_t i = 0; i < num; i++) {
        vwriter vfun(data[i]);
        builder.append(vfun);
    }
    invariant(!builder.value_length_is_fixed());
    vdmt v;
    builder.build(&v);
    invariant(!v.value_length_is_fixed());

    invariant(v.size() == num);

    val_type* fetched_data;
    for (uint32_t i = 0; i < num; i++) {
        uint32_t fetched_len;
        int r = v.fetch(i, &fetched_len, &fetched_data);
        CKERR(r);
        if (i == which2) {
            invariant(fetched_len == len2);
            invariant(!strcmp(fetched_data->c, data[i]));
        } else {
            invariant(fetched_len == len);
            invariant(!strcmp(fetched_data->c, data[i]));
        }
    }

    v.destroy();
}

static void test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(uint32_t len, uint32_t num) {
    srandom(random_seed);
    assert(len > 1);
    assert(len <= MAXLEN);
    assert(num <= MAXNUM);
    assert(num > 4);
    for (uint32_t i = 0; i < num; i++) {
        for (uint32_t j = 0; j < len-1; j++) {
            data[i][j] = random() % 255 + 1; //This way it doesn't end up being 0 and thought of as NUL
        }
        data[i][len-1] = '\0'; //cap it
    }

    char *flat = (char*)toku_xmalloc(len * num);
    char *p = flat;
    for (uint32_t i = 0; i < num; i++) {
        memcpy(p, data[i], len);
        p += len;
    }
    vdmt v;

    v.create_from_sorted_memory_of_fixed_size_elements(flat, num, len*num, len);
    invariant(v.value_length_is_fixed());
    invariant(v.get_fixed_length() == len);

    invariant(v.size() == num);

    val_type* fetched_data;
    for (uint32_t i = 0; i < num; i++) {
        uint32_t fetched_len;
        int r = v.fetch(i, &fetched_len, &fetched_data);
        CKERR(r);
        invariant(fetched_len == len);
        invariant(!strcmp(fetched_data->c, data[i]));
    }

    char *serialized_flat = (char*)toku_xmalloc(len*num);
    struct wbuf wb;
    wbuf_nocrc_init(&wb, serialized_flat, len*num);
    v.prepare_for_serialize();
    v.serialize_values(len*num, &wb);
    invariant(!memcmp(serialized_flat, flat, len*num));

    //Currently converting to dtree treats the entire thing as NOT fixed length.
    //Optional additional perf here.
    uint32_t which = (random() % (num-1)) + 1; // Not last, not first
    invariant(which > 0 && which < num-1);
    v.delete_at(which);

    memmove(flat + which*len, flat+(which+1)*len, (num-which-1) * len);
    v.prepare_for_serialize();
    wbuf_nocrc_init(&wb, serialized_flat, len*(num-1));
    v.serialize_values(len*(num-1), &wb);
    invariant(!memcmp(serialized_flat, flat, len*(num-1)));


    toku_free(flat);
    toku_free(serialized_flat);

    v.destroy();
}

int
test_main(int argc, const char *argv[]) {
    parse_args(argc, argv);
    // Do test with size divisible by 4 and not
    test_builder_fixed(4, 0);
    test_builder_fixed(5, 0);
    test_builder_fixed(4, 1);
    test_builder_fixed(5, 1);
    test_builder_fixed(4, 100);
    test_builder_fixed(5, 100);
    // Do test with zero, one, or both sizes divisible
    test_builder_variable(4, 8, 100);
    test_builder_variable(4, 5, 100);
    test_builder_variable(5, 8, 100);
    test_builder_variable(5, 10, 100);

    test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(4, 0);
    test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(5, 0);
    test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(4, 1);
    test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(5, 1);
    test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(4, 100);
    test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(5, 100);

    return 0;
}
@@ -115,13 +115,18 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keylen, const char
 {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
+    void *maybe_free = nullptr;
     bn->get_space_for_insert(
         idx,
         key,
         keylen,
         size_needed,
-        &r
+        &r,
+        &maybe_free
         );
+    if (maybe_free) {
+        toku_free(maybe_free);
+    }
     resource_assert(r);
     r->type = LE_CLEAN;
     r->u.clean.vallen = vallen;
@@ -105,13 +105,18 @@ le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int va
 {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
+    void *maybe_free = nullptr;
     bn->get_space_for_insert(
         idx,
         key,
         keylen,
         size_needed,
-        &r
+        &r,
+        &maybe_free
         );
+    if (maybe_free) {
+        toku_free(maybe_free);
+    }
     resource_assert(r);
     r->type = LE_CLEAN;
     r->u.clean.vallen = vallen;
@@ -127,7 +132,7 @@ long_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
 }
 
 static void
-test_serialize_leaf(int valsize, int nelts, double entropy) {
+test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
     // struct ft_handle source_ft;
     struct ftnode *sn, *dn;
@@ -214,32 +219,63 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
         assert(size == 100);
     }
 
+    struct timeval total_start;
+    struct timeval total_end;
+    total_start.tv_sec = total_start.tv_usec = 0;
+    total_end.tv_sec = total_end.tv_usec = 0;
     struct timeval t[2];
-    gettimeofday(&t[0], NULL);
     FTNODE_DISK_DATA ndd = NULL;
-    r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, brt->ft, false);
-    assert(r==0);
-    gettimeofday(&t[1], NULL);
+    for (int i = 0; i < ser_runs; i++) {
+        gettimeofday(&t[0], NULL);
+        ndd = NULL;
+        sn->dirty = 1;
+        r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, brt->ft, false);
+        assert(r==0);
+        gettimeofday(&t[1], NULL);
+        total_start.tv_sec += t[0].tv_sec;
+        total_start.tv_usec += t[0].tv_usec;
+        total_end.tv_sec += t[1].tv_sec;
+        total_end.tv_usec += t[1].tv_usec;
+        toku_free(ndd);
+    }
     double dt;
-    dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
-    printf("serialize leaf: %0.05lf\n", dt);
+    dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
+    dt *= 1000;
+    dt /= ser_runs;
+    printf("serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs);
+
+    //reset
+    total_start.tv_sec = total_start.tv_usec = 0;
+    total_end.tv_sec = total_end.tv_usec = 0;
 
     struct ftnode_fetch_extra bfe;
-    fill_bfe_for_full_read(&bfe, brt_h);
-    gettimeofday(&t[0], NULL);
-    FTNODE_DISK_DATA ndd2 = NULL;
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
-    assert(r==0);
-    gettimeofday(&t[1], NULL);
-    dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
-    printf("deserialize leaf: %0.05lf\n", dt);
-    printf("io time %lf decompress time %lf deserialize time %lf\n",
-        tokutime_to_seconds(bfe.io_time),
-        tokutime_to_seconds(bfe.decompress_time),
-        tokutime_to_seconds(bfe.deserialize_time)
+    for (int i = 0; i < deser_runs; i++) {
+        fill_bfe_for_full_read(&bfe, brt_h);
+        gettimeofday(&t[0], NULL);
+        FTNODE_DISK_DATA ndd2 = NULL;
+        r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
+        assert(r==0);
+        gettimeofday(&t[1], NULL);
+
+        total_start.tv_sec += t[0].tv_sec;
+        total_start.tv_usec += t[0].tv_usec;
+        total_end.tv_sec += t[1].tv_sec;
+        total_end.tv_usec += t[1].tv_usec;
+
+        toku_ftnode_free(&dn);
+        toku_free(ndd2);
+    }
+    dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
+    dt *= 1000;
+    dt /= deser_runs;
+    printf("deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
+    printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (average of %d runs)\n",
+        tokutime_to_seconds(bfe.io_time)*1000,
+        tokutime_to_seconds(bfe.decompress_time)*1000,
+        tokutime_to_seconds(bfe.deserialize_time)*1000,
+        deser_runs
     );
 
-    toku_ftnode_free(&dn);
     toku_ftnode_free(&sn);
 
     toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
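Editor's note: the timing code above accumulates all start and all end timevals separately and only converts at the end. A condensed sketch of the same arithmetic (USECS_PER_SEC and the accumulation scheme are the benchmark's own; the helper itself is illustrative):

    // Average wall-clock time per run, in milliseconds. Summing the timevals
    // first and subtracting once is equivalent to summing per-run deltas.
    static double average_elapsed_ms(struct timeval total_start,
                                     struct timeval total_end, int runs) {
        double dt = (total_end.tv_sec - total_start.tv_sec)
                  + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
        return dt * 1000 / runs;
    }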
@@ -247,14 +283,12 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
     toku_free(brt_h->h);
     toku_free(brt_h);
     toku_free(brt);
-    toku_free(ndd);
-    toku_free(ndd2);
 
     r = close(fd); assert(r != -1);
 }
 
 static void
-test_serialize_nonleaf(int valsize, int nelts, double entropy) {
+test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
     //    struct ft_handle source_ft;
     struct ftnode sn, *dn;
 
@@ -353,7 +387,8 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
     gettimeofday(&t[1], NULL);
     double dt;
     dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
-    printf("serialize nonleaf: %0.05lf\n", dt);
+    dt *= 1000;
+    printf("serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
 
     struct ftnode_fetch_extra bfe;
     fill_bfe_for_full_read(&bfe, brt_h);
@@ -363,11 +398,13 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
     assert(r==0);
     gettimeofday(&t[1], NULL);
     dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
-    printf("deserialize nonleaf: %0.05lf\n", dt);
-    printf("io time %lf decompress time %lf deserialize time %lf\n",
-        tokutime_to_seconds(bfe.io_time),
-        tokutime_to_seconds(bfe.decompress_time),
-        tokutime_to_seconds(bfe.deserialize_time)
+    dt *= 1000;
+    printf("deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
+    printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (IGNORED RUNS=%d)\n",
+        tokutime_to_seconds(bfe.io_time)*1000,
+        tokutime_to_seconds(bfe.decompress_time)*1000,
+        tokutime_to_seconds(bfe.deserialize_time)*1000,
+        deser_runs
     );
 
     toku_ftnode_free(&dn);
@@ -394,19 +431,32 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
 
 int
 test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
-    long valsize, nelts;
+    const int DEFAULT_RUNS = 5;
+    long valsize, nelts, ser_runs = DEFAULT_RUNS, deser_runs = DEFAULT_RUNS;
     double entropy = 0.3;
 
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s <valsize> <nelts>\n", argv[0]);
+    if (argc != 3 && argc != 5) {
+        fprintf(stderr, "Usage: %s <valsize> <nelts> [<serialize_runs> <deserialize_runs>]\n", argv[0]);
+        fprintf(stderr, "Default (and min) runs is %d\n", DEFAULT_RUNS);
         return 2;
     }
     valsize = strtol(argv[1], NULL, 0);
     nelts = strtol(argv[2], NULL, 0);
+    if (argc == 5) {
+        ser_runs = strtol(argv[3], NULL, 0);
+        deser_runs = strtol(argv[4], NULL, 0);
+    }
+
+    if (ser_runs <= 0) {
+        ser_runs = DEFAULT_RUNS;
+    }
+    if (deser_runs <= 0) {
+        deser_runs = DEFAULT_RUNS;
+    }
 
     initialize_dummymsn();
-    test_serialize_leaf(valsize, nelts, entropy);
-    test_serialize_nonleaf(valsize, nelts, entropy);
+    test_serialize_leaf(valsize, nelts, entropy, ser_runs, deser_runs);
+    test_serialize_nonleaf(valsize, nelts, entropy, ser_runs, deser_runs);
 
     return 0;
 }
@@ -98,51 +98,54 @@ PATENT RIGHTS GRANT:
 #endif
 
 static size_t
 calc_le_size(int keylen, int vallen) {
     return LE_CLEAN_MEMSIZE(vallen) + keylen + sizeof(uint32_t);
 }
 
-static void
+static size_t
 le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keysize, const char *val, int valsize)
 {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(valsize);
+    void *maybe_free = nullptr;
     bn->get_space_for_insert(
         idx,
         key,
         keysize,
         size_needed,
-        &r
+        &r,
+        &maybe_free
     );
+    if (maybe_free) {
+        toku_free(maybe_free);
+    }
     resource_assert(r);
     r->type = LE_CLEAN;
     r->u.clean.vallen = valsize;
     memcpy(r->u.clean.val, val, valsize);
+    return size_needed + keysize + sizeof(uint32_t);
 }
 
-static KLPAIR
-le_fastmalloc(struct mempool * mp, const char *key, int keylen, const char *val, int vallen)
-{
-    KLPAIR kl;
-    size_t le_size = calc_le_size(keylen, vallen);
-    CAST_FROM_VOIDP(kl, toku_mempool_malloc(mp, le_size, 1));
-    resource_assert(kl);
-    kl->keylen = keylen;
-    memcpy(kl->key_le, key, keylen);
-    LEAFENTRY le = get_le_from_klpair(kl);
-    le->type = LE_CLEAN;
-    le->u.clean.vallen = vallen;
-    memcpy(le->u.clean.val, val, vallen);
-    return kl;
-}
-
-static KLPAIR
-le_malloc(struct mempool * mp, const char *key, const char *val)
-{
-    int keylen = strlen(key) + 1;
-    int vallen = strlen(val) + 1;
-    return le_fastmalloc(mp, key, keylen, val, vallen);
-}
+class test_key_le_pair {
+    public:
+    uint32_t keylen;
+    char* keyp;
+    LEAFENTRY le;
+
+    test_key_le_pair() : keylen(), keyp(), le() {}
+    void init(const char *_keyp, const char *_val) {
+        init(_keyp, strlen(_keyp) + 1, _val, strlen(_val) + 1);
+    }
+    void init(const char * _keyp, uint32_t _keylen, const char*_val, uint32_t _vallen) {
+        keylen = _keylen;
+
+        CAST_FROM_VOIDP(le, toku_malloc(LE_CLEAN_MEMSIZE(_vallen)));
+        le->type = LE_CLEAN;
+        le->u.clean.vallen = _vallen;
+        memcpy(le->u.clean.val, _val, _vallen);
+
+        CAST_FROM_VOIDP(keyp, toku_xmemdup(_keyp, keylen));
+    }
+    ~test_key_le_pair() {
+        toku_free(le);
+        toku_free(keyp);
+    }
+};
 
 struct check_leafentries_struct {
     int nelts;
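Editor's note: a sketch of how the new test_key_le_pair is used by the rewritten checks below (the class is from the hunk above; the wrapper function here is illustrative). init() heap-allocates the LEAFENTRY with toku_malloc and duplicates the key, and the destructor frees both, which is what lets every "construct dummy mempool / destroy dummy mempool" pair disappear from the tests:

    static void example_test_key_le_pair_usage(void) {
        test_key_le_pair elt;
        elt.init("a", "aval");   // lengths include the terminating NUL
        assert(elt.le->type == LE_CLEAN);
        assert(memcmp(elt.le->u.clean.val, "aval", 5) == 0);
    }   // ~test_key_le_pair() frees elt.le and elt.keyp here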
@@ -290,7 +293,6 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
     BP_STATE(&sn,1) = PT_AVAIL;
     set_BLB(&sn, 0, toku_create_empty_bn());
     set_BLB(&sn, 1, toku_create_empty_bn());
-    KLPAIR elts[3];
     le_add_to_bn(BLB_DATA(&sn, 0), 0, "a", 2, "aval", 5);
     le_add_to_bn(BLB_DATA(&sn, 0), 1, "b", 2, "bval", 5);
     le_add_to_bn(BLB_DATA(&sn, 1), 0, "x", 2, "xval", 5);
@@ -346,11 +348,10 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
     {
         // Man, this is way too ugly. This entire test suite needs to be refactored.
         // Create a dummy mempool and put the leaves there. Ugh.
-        struct mempool dummy_mp;
-        toku_mempool_construct(&dummy_mp, 1024);
-        elts[0] = le_malloc(&dummy_mp, "a", "aval");
-        elts[1] = le_malloc(&dummy_mp, "b", "bval");
-        elts[2] = le_malloc(&dummy_mp, "x", "xval");
+        test_key_le_pair elts[3];
+        elts[0].init("a", "aval");
+        elts[1].init("b", "bval");
+        elts[2].init("x", "xval");
         const uint32_t npartitions = dn->n_children;
         assert(dn->totalchildkeylens==(2*(npartitions-1)));
         uint32_t last_i = 0;
@@ -361,22 +362,21 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
             if (bn > 0) {
                 assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
             }
-            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->omt_size(); i++) {
+            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
                 void* curr_key;
                 BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(elts[last_i])));
-                assert(memcmp(curr_le, get_le_from_klpair(elts[last_i]), leafentry_memsize(curr_le)) == 0);
+                assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le));
+                assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0);
                 if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->childkeys[bn].data, (char*)(elts[last_i]->key_le)) <= 0);
+                    assert(strcmp((char*)dn->childkeys[bn].data, elts[last_i].keyp) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
 
         }
-        toku_mempool_destroy(&dummy_mp);
         assert(last_i == 3);
     }
     toku_ftnode_free(&dn);
@@ -436,7 +436,7 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
         if (i < nrows-1) {
             uint32_t keylen;
             void* curr_key;
-            BLB_DATA(&sn, i)->fetch_le_key_and_len(0, &keylen, &curr_key);
+            BLB_DATA(&sn, i)->fetch_key_and_len(0, &keylen, &curr_key);
             toku_memdup_dbt(&sn.childkeys[i], curr_key, keylen);
         }
     }
@@ -485,18 +485,14 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
     {
         // Man, this is way too ugly. This entire test suite needs to be refactored.
         // Create a dummy mempool and put the leaves there. Ugh.
-        struct mempool dummy_mp;
-        size_t le_size = calc_le_size(keylens, vallens);
-        size_t mpsize = nrows * le_size;
-        toku_mempool_construct(&dummy_mp, mpsize);
-        KLPAIR les[nrows];
+        test_key_le_pair *les = new test_key_le_pair[nrows];
         {
             char key[keylens], val[vallens];
             key[keylens-1] = '\0';
             for (uint32_t i = 0; i < nrows; ++i) {
                 char c = 'a' + i;
                 memset(key, c, keylens-1);
-                les[i] = le_fastmalloc(&dummy_mp, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
+                les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val));
             }
         }
         const uint32_t npartitions = dn->n_children;
@@ -508,23 +504,23 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
             if (bn > 0) {
                 assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
             }
-            assert(BLB_DATA(dn, bn)->omt_size() > 0);
-            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->omt_size(); i++) {
+            assert(BLB_DATA(dn, bn)->num_klpairs() > 0);
+            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
                 void* curr_key;
                 BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(les[last_i])));
-                assert(memcmp(curr_le, get_le_from_klpair(les[last_i]), leafentry_memsize(curr_le)) == 0);
+                assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
+                assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
                 if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->childkeys[bn].data, (char*)(les[last_i]->key_le)) <= 0);
+                    assert(strcmp((char*)dn->childkeys[bn].data, les[last_i].keyp) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
         }
-        toku_mempool_destroy(&dummy_mp);
         assert(last_i == nrows);
+        delete[] les;
     }
 
     toku_ftnode_free(&dn);
@@ -552,7 +548,6 @@ static void
 test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
     int r;
     struct ftnode sn, *dn;
-    const int keylens = sizeof(int), vallens = sizeof(int);
     const uint32_t nrows = 196*1024;
     int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
 
@@ -566,17 +561,18 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
     sn.dirty = 1;
     sn.oldest_referenced_xid_known = TXNID_NONE;
 
-    MALLOC_N(sn.n_children, sn.bp);
-    MALLOC_N(sn.n_children-1, sn.childkeys);
+    XMALLOC_N(sn.n_children, sn.bp);
+    XMALLOC_N(sn.n_children-1, sn.childkeys);
     sn.totalchildkeylens = (sn.n_children-1)*sizeof(int);
     for (int i = 0; i < sn.n_children; ++i) {
         BP_STATE(&sn,i) = PT_AVAIL;
         set_BLB(&sn, i, toku_create_empty_bn());
     }
+    size_t total_size = 0;
     for (uint32_t i = 0; i < nrows; ++i) {
         uint32_t key = i;
         uint32_t val = i;
-        le_add_to_bn(BLB_DATA(&sn, 0), i, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
+        total_size += le_add_to_bn(BLB_DATA(&sn, 0), i, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
     }
 
     FT_HANDLE XMALLOC(brt);
@@ -624,15 +620,11 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
     {
         // Man, this is way too ugly. This entire test suite needs to be refactored.
        // Create a dummy mempool and put the leaves there. Ugh.
-        struct mempool dummy_mp;
-        size_t le_size = calc_le_size(keylens, vallens);
-        size_t mpsize = nrows * le_size;
-        toku_mempool_construct(&dummy_mp, mpsize);
-        KLPAIR les[nrows];
+        test_key_le_pair *les = new test_key_le_pair[nrows];
         {
             int key = 0, val = 0;
             for (uint32_t i = 0; i < nrows; ++i, key++, val++) {
-                les[i] = le_fastmalloc(&dummy_mp, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
+                les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val));
             }
         }
         const uint32_t npartitions = dn->n_children;
@@ -644,17 +636,17 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
             if (bn > 0) {
                 assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
             }
-            assert(BLB_DATA(dn, bn)->omt_size() > 0);
-            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->omt_size(); i++) {
+            assert(BLB_DATA(dn, bn)->num_klpairs() > 0);
+            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
                 void* curr_key;
                 BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(les[last_i])));
-                assert(memcmp(curr_le, get_le_from_klpair(les[last_i]), leafentry_memsize(curr_le)) == 0);
+                assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
+                assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
                 if (bn < npartitions-1) {
                     uint32_t *CAST_FROM_VOIDP(pivot, dn->childkeys[bn].data);
-                    void* tmp = les[last_i]->key_le;
+                    void* tmp = les[last_i].keyp;
                     uint32_t *CAST_FROM_VOIDP(item, tmp);
                     assert(*pivot >= *item);
                 }
|
@ -664,8 +656,8 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
|
|||
// don't check soft_copy_is_up_to_date or seqinsert
|
||||
assert(BLB_DATA(dn, bn)->get_disk_size() < 128*1024); // BN_MAX_SIZE, apt to change
|
||||
}
|
||||
toku_mempool_destroy(&dummy_mp);
|
||||
assert(last_i == nrows);
|
||||
delete[] les;
|
||||
}
|
||||
|
||||
toku_ftnode_free(&dn);
|
||||
|
@ -772,11 +764,7 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
|
|||
{
|
||||
// Man, this is way too ugly. This entire test suite needs to be refactored.
|
||||
// Create a dummy mempool and put the leaves there. Ugh.
|
||||
struct mempool dummy_mp;
|
||||
size_t le_size = calc_le_size(key_size, val_size);
|
||||
size_t mpsize = nrows * le_size;
|
||||
toku_mempool_construct(&dummy_mp, mpsize);
|
||||
KLPAIR les[nrows];
|
||||
test_key_le_pair *les = new test_key_le_pair[nrows];
|
||||
{
|
||||
char key[key_size], val[val_size];
|
||||
key[key_size-1] = '\0';
|
||||
|
@ -785,7 +773,7 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
|
|||
char c = 'a' + i;
|
||||
memset(key, c, key_size-1);
|
||||
memset(val, c, val_size-1);
|
||||
les[i] = le_fastmalloc(&dummy_mp, key, key_size, val, val_size);
|
||||
les[i].init(key, key_size, val, val_size);
|
||||
}
|
||||
}
|
||||
const uint32_t npartitions = dn->n_children;
|
||||
|
@@ -798,24 +786,24 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
             if (bn > 0) {
                 assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
             }
-            assert(BLB_DATA(dn, bn)->omt_size() > 0);
-            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->omt_size(); i++) {
+            assert(BLB_DATA(dn, bn)->num_klpairs() > 0);
+            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
                 void* curr_key;
                 BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(les[last_i])));
-                assert(memcmp(curr_le, get_le_from_klpair(les[last_i]), leafentry_memsize(curr_le)) == 0);
+                assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
+                assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
                 if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->childkeys[bn].data, (char*)(les[last_i]->key_le)) <= 0);
+                    assert(strcmp((char*)dn->childkeys[bn].data, (char*)(les[last_i].keyp)) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
             // don't check soft_copy_is_up_to_date or seqinsert
         }
-        toku_mempool_destroy(&dummy_mp);
         assert(last_i == 7);
+        delete[] les;
     }
 
     toku_ftnode_free(&dn);
@@ -871,7 +859,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
         set_BLB(&sn, i, toku_create_empty_bn());
         BLB_SEQINSERT(&sn, i) = 0;
     }
-    KLPAIR elts[3];
     le_add_to_bn(BLB_DATA(&sn, 1), 0, "a", 2, "aval", 5);
     le_add_to_bn(BLB_DATA(&sn, 3), 0, "b", 2, "bval", 5);
     le_add_to_bn(BLB_DATA(&sn, 5), 0, "x", 2, "xval", 5);
@@ -921,13 +908,13 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
     assert(dn->height == 0);
     assert(dn->n_children>0);
     {
+        test_key_le_pair elts[3];
+
         // Man, this is way too ugly. This entire test suite needs to be refactored.
        // Create a dummy mempool and put the leaves there. Ugh.
-        struct mempool dummy_mp;
-        toku_mempool_construct(&dummy_mp, 1024);
-        elts[0] = le_malloc(&dummy_mp, "a", "aval");
-        elts[1] = le_malloc(&dummy_mp, "b", "bval");
-        elts[2] = le_malloc(&dummy_mp, "x", "xval");
+        elts[0].init("a", "aval");
+        elts[1].init("b", "bval");
+        elts[2].init("x", "xval");
         const uint32_t npartitions = dn->n_children;
         assert(dn->totalchildkeylens==(2*(npartitions-1)));
         uint32_t last_i = 0;
@@ -937,22 +924,21 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
             if (bn > 0) {
                 assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
             }
-            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->omt_size(); i++) {
+            for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
                 void* curr_key;
                 BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(elts[last_i])));
-                assert(memcmp(curr_le, get_le_from_klpair(elts[last_i]), leafentry_memsize(curr_le)) == 0);
+                assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le));
+                assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0);
                 if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->childkeys[bn].data, (char*)(elts[last_i]->key_le)) <= 0);
+                    assert(strcmp((char*)dn->childkeys[bn].data, (char*)(elts[last_i].keyp)) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
 
         }
-        toku_mempool_destroy(&dummy_mp);
         assert(last_i == 3);
     }
     toku_ftnode_free(&dn);
@@ -1059,7 +1045,7 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b
         if (i > 0) {
             assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size);
         }
-        assert(BLB_DATA(dn, i)->omt_size() == 0);
+        assert(BLB_DATA(dn, i)->num_klpairs() == 0);
     }
     }
     toku_ftnode_free(&dn);
@@ -119,7 +119,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     DBT theval; toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     MSN msn = next_dummymsn();
 
@ -96,13 +96,18 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keysize, const cha
|
|||
{
|
||||
LEAFENTRY r = NULL;
|
||||
uint32_t size_needed = LE_CLEAN_MEMSIZE(valsize);
|
||||
void *maybe_free = nullptr;
|
||||
bn->get_space_for_insert(
|
||||
idx,
|
||||
key,
|
||||
keysize,
|
||||
size_needed,
|
||||
&r
|
||||
&r,
|
||||
&maybe_free
|
||||
);
|
||||
if (maybe_free) {
|
||||
toku_free(maybe_free);
|
||||
}
|
||||
resource_assert(r);
|
||||
r->type = LE_CLEAN;
|
||||
r->u.clean.vallen = valsize;
|
||||
|
@@ -113,14 +118,19 @@ static void
 le_overwrite(bn_data* bn, uint32_t idx, const char *key, int keysize, const char *val, int valsize) {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(valsize);
+    void *maybe_free = nullptr;
     bn->get_space_for_overwrite(
         idx,
         key,
         keysize,
         size_needed, // old_le_size
         size_needed,
-        &r
+        &r,
+        &maybe_free
     );
+    if (maybe_free) {
+        toku_free(maybe_free);
+    }
     resource_assert(r);
     r->type = LE_CLEAN;
     r->u.clean.vallen = valsize;
@@ -733,7 +733,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
 
     int total_messages = 0;
     for (i = 0; i < 8; ++i) {
-        total_messages += BLB_DATA(child, i)->omt_size();
+        total_messages += BLB_DATA(child, i)->num_klpairs();
     }
     assert(total_messages <= num_parent_messages + num_child_messages);
 
@@ -746,7 +746,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
     memset(parent_messages_present, 0, sizeof parent_messages_present);
     memset(child_messages_present, 0, sizeof child_messages_present);
     for (int j = 0; j < 8; ++j) {
-        uint32_t len = BLB_DATA(child, j)->omt_size();
+        uint32_t len = BLB_DATA(child, j)->num_klpairs();
         for (uint32_t idx = 0; idx < len; ++idx) {
             LEAFENTRY le;
             DBT keydbt, valdbt;
@@ -968,7 +968,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
 
     int total_messages = 0;
     for (i = 0; i < 8; ++i) {
-        total_messages += BLB_DATA(child, i)->omt_size();
+        total_messages += BLB_DATA(child, i)->num_klpairs();
     }
     assert(total_messages <= num_parent_messages + num_child_messages);
 
@@ -1144,10 +1144,10 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
     toku_ftnode_free(&parentnode);
 
     for (int j = 0; j < 8; ++j) {
-        BN_DATA first = BLB_DATA(child1, j);
-        BN_DATA second = BLB_DATA(child2, j);
-        uint32_t len = first->omt_size();
-        assert(len == second->omt_size());
+        bn_data* first = BLB_DATA(child1, j);
+        bn_data* second = BLB_DATA(child2, j);
+        uint32_t len = first->num_klpairs();
+        assert(len == second->num_klpairs());
         for (uint32_t idx = 0; idx < len; ++idx) {
             LEAFENTRY le1, le2;
             DBT key1dbt, val1dbt, key2dbt, val2dbt;
@@ -348,7 +348,7 @@ doit (int state) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 1);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 1);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
 
     toku_pin_ftnode_off_client_thread(
@@ -364,7 +364,7 @@ doit (int state) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 1);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 1);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
     }
     else if (state == ft_flush_aflter_merge || state == flt_flush_before_unpin_remove) {
@@ -381,7 +381,7 @@ doit (int state) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 2);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 2);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
     }
     else {
@@ -359,7 +359,7 @@ doit (int state) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 2);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 2);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
 
     toku_pin_ftnode_off_client_thread(
@@ -375,7 +375,7 @@ doit (int state) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 2);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 2);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
 
@@ -342,7 +342,7 @@ doit (bool after_split) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 1);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 1);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
 
     toku_pin_ftnode_off_client_thread(
@@ -358,7 +358,7 @@ doit (bool after_split) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 1);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 1);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
     }
     else {
@@ -375,7 +375,7 @@ doit (bool after_split) {
     assert(node->height == 0);
     assert(!node->dirty);
     assert(node->n_children == 1);
-    assert(BLB_DATA(node, 0)->omt_size() == 2);
+    assert(BLB_DATA(node, 0)->num_klpairs() == 2);
     toku_unpin_ftnode_off_client_thread(c_ft->ft, node);
     }
 
@@ -213,7 +213,7 @@ test_le_offsets (void) {
 static void
 test_ule_packs_to_nothing (ULE ule) {
     LEAFENTRY le;
-    int r = le_pack(ule, NULL, 0, NULL, 0, 0, &le);
+    int r = le_pack(ule, NULL, 0, NULL, 0, 0, &le, nullptr);
     assert(r==0);
     assert(le==NULL);
 }
@@ -319,7 +319,7 @@ test_le_pack_committed (void) {
 
     size_t memsize;
     LEAFENTRY le;
-    int r = le_pack(&ule, nullptr, 0, nullptr, 0, 0, &le);
+    int r = le_pack(&ule, nullptr, 0, nullptr, 0, 0, &le, nullptr);
     assert(r==0);
     assert(le!=NULL);
     memsize = le_memsize_from_ule(&ule);
@@ -329,7 +329,7 @@ test_le_pack_committed (void) {
     verify_ule_equal(&ule, &tmp_ule);
     LEAFENTRY tmp_le;
     size_t tmp_memsize;
-    r = le_pack(&tmp_ule, nullptr, 0, nullptr, 0, 0, &tmp_le);
+    r = le_pack(&tmp_ule, nullptr, 0, nullptr, 0, 0, &tmp_le, nullptr);
     tmp_memsize = le_memsize_from_ule(&tmp_ule);
     assert(r==0);
     assert(tmp_memsize == memsize);
@@ -377,7 +377,7 @@ test_le_pack_uncommitted (uint8_t committed_type, uint8_t prov_type, int num_pla
 
     size_t memsize;
     LEAFENTRY le;
-    int r = le_pack(&ule, nullptr, 0, nullptr, 0, 0, &le);
+    int r = le_pack(&ule, nullptr, 0, nullptr, 0, 0, &le, nullptr);
     assert(r==0);
     assert(le!=NULL);
     memsize = le_memsize_from_ule(&ule);
@@ -387,7 +387,7 @@ test_le_pack_uncommitted (uint8_t committed_type, uint8_t prov_type, int num_pla
     verify_ule_equal(&ule, &tmp_ule);
     LEAFENTRY tmp_le;
     size_t tmp_memsize;
-    r = le_pack(&tmp_ule, nullptr, 0, nullptr, 0, 0, &tmp_le);
+    r = le_pack(&tmp_ule, nullptr, 0, nullptr, 0, 0, &tmp_le, nullptr);
     tmp_memsize = le_memsize_from_ule(&tmp_ule);
     assert(r==0);
     assert(tmp_memsize == memsize);
@@ -448,7 +448,7 @@ test_le_apply(ULE ule_initial, FT_MSG msg, ULE ule_expected) {
     LEAFENTRY le_expected;
     LEAFENTRY le_result;
 
-    r = le_pack(ule_initial, nullptr, 0, nullptr, 0, 0, &le_initial);
+    r = le_pack(ule_initial, nullptr, 0, nullptr, 0, 0, &le_initial, nullptr);
     CKERR(r);
 
     size_t result_memsize = 0;
@@ -467,7 +467,7 @@ test_le_apply(ULE ule_initial, FT_MSG msg, ULE ule_expected) {
     }
 
     size_t expected_memsize = 0;
-    r = le_pack(ule_expected, nullptr, 0, nullptr, 0, 0, &le_expected);
+    r = le_pack(ule_expected, nullptr, 0, nullptr, 0, 0, &le_expected, nullptr);
     CKERR(r);
     if (le_expected) {
         expected_memsize = leafentry_memsize(le_expected);
@@ -749,7 +749,7 @@ test_le_apply_messages(void) {
 
 static bool ule_worth_running_garbage_collection(ULE ule, TXNID oldest_referenced_xid_known) {
     LEAFENTRY le;
-    int r = le_pack(ule, nullptr, 0, nullptr, 0, 0, &le); CKERR(r);
+    int r = le_pack(ule, nullptr, 0, nullptr, 0, 0, &le, nullptr); CKERR(r);
     invariant_notnull(le);
     bool worth_running = toku_le_worth_running_garbage_collection(le, oldest_referenced_xid_known);
     toku_free(le);
@@ -189,7 +189,7 @@ doit (void) {
     r = toku_testsetup_root(t, node_root);
     assert(r==0);
 
-    char filler[900];
+    char filler[900-2*bn_data::HEADER_LENGTH];
     memset(filler, 0, sizeof(filler));
     // now we insert filler data so that a merge does not happen
     r = toku_testsetup_insert_to_leaf (
@@ -119,13 +119,18 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keysize, const cha
 {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(valsize);
+    void *maybe_free = nullptr;
     bn->get_space_for_insert(
         idx,
         key,
         keysize,
         size_needed,
-        &r
+        &r,
+        &maybe_free
     );
+    if (maybe_free) {
+        toku_free(maybe_free);
+    }
     resource_assert(r);
     r->type = LE_CLEAN;
     r->u.clean.vallen = valsize;
@@ -122,7 +122,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     DBT theval; toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     MSN msn = next_dummymsn();
 

@@ -111,7 +111,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     DBT theval; toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     // apply an insert to the leaf node
     MSN msn = next_dummymsn();

@@ -112,7 +112,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     DBT theval; toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     // apply an insert to the leaf node
     MSN msn = next_dummymsn();

@@ -111,7 +111,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     DBT theval; toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     // apply an insert to the leaf node
     MSN msn = next_dummymsn();

@@ -112,7 +112,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     DBT theval; toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     // apply an insert to the leaf node
     MSN msn = next_dummymsn();

@@ -114,7 +114,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     // apply an insert to the leaf node
     MSN msn = next_dummymsn();

@@ -111,7 +111,7 @@ append_leaf(FTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen)
     DBT theval; toku_fill_dbt(&theval, val, vallen);
 
     // get an index that we can use to create a new leaf entry
-    uint32_t idx = BLB_DATA(leafnode, 0)->omt_size();
+    uint32_t idx = BLB_DATA(leafnode, 0)->num_klpairs();
 
     // apply an insert to the leaf node
     MSN msn = next_dummymsn();
@@ -315,9 +315,9 @@ dump_node (int f, BLOCKNUM blocknum, FT h) {
             }
         } else {
             printf(" n_bytes_in_buffer= %" PRIu64 "", BLB_DATA(n, i)->get_disk_size());
-            printf(" items_in_buffer=%u\n", BLB_DATA(n, i)->omt_size());
+            printf(" items_in_buffer=%u\n", BLB_DATA(n, i)->num_klpairs());
             if (dump_data) {
-                BLB_DATA(n, i)->omt_iterate<void, print_le>(NULL);
+                BLB_DATA(n, i)->iterate<void, print_le>(NULL);
             }
         }
     }
@@ -149,7 +149,8 @@ le_pack(ULE ule, // data to be packed into new leafentry
         void* keyp,
         uint32_t keylen,
         uint32_t old_le_size,
-        LEAFENTRY * const new_leafentry_p // this is what this function creates
+        LEAFENTRY * const new_leafentry_p, // this is what this function creates
+        void **const maybe_free
         );
 
 
68
ft/ule.cc
@@ -236,26 +236,27 @@ static inline size_t uxr_unpack_length_and_bit(UXR uxr, uint8_t *p);
 static inline size_t uxr_unpack_data(UXR uxr, uint8_t *p);
 
 static void get_space_for_le(
     bn_data* data_buffer,
     uint32_t idx,
     void* keyp,
     uint32_t keylen,
     uint32_t old_le_size,
     size_t size,
-    LEAFENTRY* new_le_space
-    )
+    LEAFENTRY* new_le_space,
+    void **const maybe_free
+    )
 {
-    if (data_buffer == NULL) {
+    if (data_buffer == nullptr) {
         CAST_FROM_VOIDP(*new_le_space, toku_xmalloc(size));
     }
     else {
         // this means we are overwriting something
         if (old_le_size > 0) {
-            data_buffer->get_space_for_overwrite(idx, keyp, keylen, old_le_size, size, new_le_space);
+            data_buffer->get_space_for_overwrite(idx, keyp, keylen, old_le_size, size, new_le_space, maybe_free);
         }
         // this means we are inserting something new
         else {
-            data_buffer->get_space_for_insert(idx, keyp, keylen, size, new_le_space);
+            data_buffer->get_space_for_insert(idx, keyp, keylen, size, new_le_space, maybe_free);
         }
     }
 }
@@ -470,23 +471,17 @@ toku_le_apply_msg(FT_MSG msg,
     int64_t newnumbytes = 0;
     uint64_t oldmemsize = 0;
     uint32_t keylen = ft_msg_get_keylen(msg);
-    LEAFENTRY copied_old_le = NULL;
-    size_t old_le_size = old_leafentry ? leafentry_memsize(old_leafentry) : 0;
-    toku::scoped_malloc copied_old_le_buf(old_le_size);
-    if (old_leafentry) {
-        CAST_FROM_VOIDP(copied_old_le, copied_old_le_buf.get());
-        memcpy(copied_old_le, old_leafentry, old_le_size);
-    }
 
     if (old_leafentry == NULL) {
         msg_init_empty_ule(&ule);
     } else {
         oldmemsize = leafentry_memsize(old_leafentry);
-        le_unpack(&ule, copied_old_le); // otherwise unpack leafentry
+        le_unpack(&ule, old_leafentry); // otherwise unpack leafentry
         oldnumbytes = ule_get_innermost_numbytes(&ule, keylen);
     }
     msg_modify_ule(&ule, msg); // modify unpacked leafentry
     ule_simple_garbage_collection(&ule, oldest_referenced_xid, gc_info);
+    void *maybe_free = nullptr;
     int rval = le_pack(
         &ule, // create packed leafentry
         data_buffer,
@@ -494,7 +489,8 @@ toku_le_apply_msg(FT_MSG msg,
         ft_msg_get_key(msg), // contract of this function is caller has this set, always
         keylen, // contract of this function is caller has this set, always
         oldmemsize,
-        new_leafentry_p
+        new_leafentry_p,
+        &maybe_free
         );
     invariant_zero(rval);
     if (*new_leafentry_p) {
@@ -502,6 +498,9 @@ toku_le_apply_msg(FT_MSG msg,
     }
     *numbytes_delta_p = newnumbytes - oldnumbytes;
     ule_cleanup(&ule);
+    if (maybe_free) {
+        toku_free(maybe_free);
+    }
 }
 
 bool toku_le_worth_running_garbage_collection(LEAFENTRY le, TXNID oldest_referenced_xid_known) {
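Editor's note: the hunks above encode an ordering contract worth spelling out (inferred from this diff alone). Previously toku_le_apply_msg memcpy'd the old leafentry into a scratch buffer before packing, because packing into the data_buffer could recycle the memory the unpacked ULE still pointed at. With the maybe_free parameter, le_pack defers that reuse: the old buffer is handed back instead of being overwritten in place, so the copy becomes unnecessary, and the free must happen only after packing completes. In sketch form (names abbreviated from the call sites above):

    void *maybe_free = nullptr;
    int r = le_pack(&ule, data_buffer, idx, keyp, keylen, old_size,
                    &new_le, &maybe_free);
    invariant_zero(r);
    if (maybe_free) {
        toku_free(maybe_free);   // safe now: nothing reads the old buffer anymore
    }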
@@ -557,15 +556,8 @@ toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
     ULE_S ule;
     int64_t oldnumbytes = 0;
     int64_t newnumbytes = 0;
-    LEAFENTRY copied_old_le = NULL;
-    size_t old_le_size = old_leaf_entry ? leafentry_memsize(old_leaf_entry) : 0;
-    toku::scoped_malloc copied_old_le_buf(old_le_size);
-    if (old_leaf_entry) {
-        CAST_FROM_VOIDP(copied_old_le, copied_old_le_buf.get());
-        memcpy(copied_old_le, old_leaf_entry, old_le_size);
-    }
 
-    le_unpack(&ule, copied_old_le);
+    le_unpack(&ule, old_leaf_entry);
 
     oldnumbytes = ule_get_innermost_numbytes(&ule, keylen);
     uint32_t old_mem_size = leafentry_memsize(old_leaf_entry);
@@ -580,6 +572,7 @@ toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
     ule_try_promote_provisional_outermost(&ule, oldest_possible_live_xid);
     ule_garbage_collect(&ule, snapshot_xids, referenced_xids, live_root_txns);
 
+    void *maybe_free = nullptr;
     int r = le_pack(
         &ule,
         data_buffer,
@@ -587,7 +580,8 @@ toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
         keyp,
         keylen,
         old_mem_size,
-        new_leaf_entry
+        new_leaf_entry,
+        &maybe_free
         );
     assert(r == 0);
     if (*new_leaf_entry) {
@@ -595,6 +589,9 @@ toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
     }
     *numbytes_delta_p = newnumbytes - oldnumbytes;
     ule_cleanup(&ule);
+    if (maybe_free) {
+        toku_free(maybe_free);
+    }
 }
 
 /////////////////////////////////////////////////////////////////////////////////
@@ -889,7 +886,7 @@ update_le_status(ULE ule, size_t memsize) {
 }
 
 // Purpose is to return a newly allocated leaf entry in packed format, or
 // return null if leaf entry should be destroyed (if no transaction records
 // are for inserts).
 // Transaction records in packed le are stored inner to outer (first xr is innermost),
 // with some information extracted out of the transaction records into the header.
@@ -901,7 +898,8 @@ le_pack(ULE ule, // data to be packed into new leafentry
         void* keyp,
         uint32_t keylen,
         uint32_t old_le_size,
-        LEAFENTRY * const new_leafentry_p // this is what this function creates
+        LEAFENTRY * const new_leafentry_p, // this is what this function creates
+        void **const maybe_free
         )
 {
     invariant(ule->num_cuxrs > 0);
@@ -927,10 +925,10 @@ le_pack(ULE ule, // data to be packed into new leafentry
         rval = 0;
         goto cleanup;
     }
-found_insert:;
+found_insert:
     memsize = le_memsize_from_ule(ule);
     LEAFENTRY new_leafentry;
-    get_space_for_le(data_buffer, idx, keyp, keylen, old_le_size, memsize, &new_leafentry);
+    get_space_for_le(data_buffer, idx, keyp, keylen, old_le_size, memsize, &new_leafentry, maybe_free);
 
     //p always points to first unused byte after leafentry we are packing
     uint8_t *p;
@@ -982,7 +980,7 @@ found_insert:;
     for (i = 0; i < ule->num_cuxrs; i++) {
         p += uxr_pack_length_and_bit(ule->uxrs + ule->num_cuxrs - 1 - i, p);
     }
 
     //pack interesting values inner to outer
     if (ule->num_puxrs!=0) {
         UXR innermost = ule->uxrs + ule->num_cuxrs + ule->num_puxrs - 1;
@@ -1020,7 +1018,7 @@ found_insert:;
     size_t bytes_written;
     bytes_written = (size_t)p - (size_t)new_leafentry;
     invariant(bytes_written == memsize);
 
 #if ULE_DEBUG
     if (omt) { //Disable recursive debugging.
         size_t memsize_verify = leafentry_memsize(new_leafentry);
@@ -2393,12 +2391,14 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry,
     // malloc instead of a mempool.  However after supporting upgrade,
     // we need to use mempools and the OMT.
     rval = le_pack(&ule, // create packed leafentry
-                   NULL,
+                   nullptr,
                    0, //only matters if we are passing in a bn_data
-                   NULL, //only matters if we are passing in a bn_data
+                   nullptr, //only matters if we are passing in a bn_data
                    0, //only matters if we are passing in a bn_data
                    0, //only matters if we are passing in a bn_data
-                   new_leafentry_p);
+                   new_leafentry_p,
+                   nullptr //only matters if we are passing in a bn_data
+                   );
     ule_cleanup(&ule);
     *new_leafentry_memorysize = leafentry_memsize(*new_leafentry_p);
     return rval;
@@ -187,6 +187,13 @@ static inline void wbuf_uint (struct wbuf *w, uint32_t i) {
     wbuf_int(w, (int32_t)i);
 }
 
+static inline uint8_t* wbuf_nocrc_reserve_literal_bytes(struct wbuf *w, uint32_t nbytes) {
+    assert(w->ndone + nbytes <= w->size);
+    uint8_t * dest = w->buf + w->ndone;
+    w->ndone += nbytes;
+    return dest;
+}
+
 static inline void wbuf_nocrc_literal_bytes(struct wbuf *w, bytevec bytes_bv, uint32_t nbytes) {
     const unsigned char *bytes = (const unsigned char *) bytes_bv;
 #if 0
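Editor's note: the new wbuf_nocrc_reserve_literal_bytes lets a serializer claim a region of the write buffer now and fill it in later, which is the shape the dmt's serialize path needs. A minimal usage sketch (wbuf_init is assumed to be the buffer initializer already present in this header; the function and sizes here are illustrative):

    static void example_reserve(void) {
        char backing[64];
        struct wbuf w;
        wbuf_init(&w, backing, sizeof backing);
        // claim 4 bytes; w.ndone advances past them immediately
        uint8_t *slot = wbuf_nocrc_reserve_literal_bytes(&w, 4);
        uint32_t n = 42;
        memcpy(slot, &n, sizeof n);   // fill the reserved region afterwards
    }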
1265
util/dmt.cc
Normal file
File diff suppressed because it is too large
728
util/dmt.h
Normal file
@@ -0,0 +1,728 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#pragma once

/*
COPYING CONDITIONS NOTICE:

  This program is free software; you can redistribute it and/or modify
  it under the terms of version 2 of the GNU General Public License as
  published by the Free Software Foundation, and provided that the
  following conditions are met:

      * Redistributions of source code must retain this COPYING
        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
        GRANT (below).

      * Redistributions in binary form must reproduce this COPYING
        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
        GRANT (below) in the documentation and/or other materials
        provided with the distribution.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  02110-1301, USA.

COPYRIGHT NOTICE:

  TokuDB, Tokutek Fractal Tree Indexing Library.
  Copyright (C) 2007-2013 Tokutek, Inc.

DISCLAIMER:

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

UNIVERSITY PATENT NOTICE:

  The technology is licensed by the Massachusetts Institute of
  Technology, Rutgers State University of New Jersey, and the Research
  Foundation of State University of New York at Stony Brook under
  United States of America Serial No. 11/760379 and to the patents
  and/or patent applications resulting from it.

PATENT MARKING NOTICE:

  This software is covered by US Patent No. 8,185,551.

PATENT RIGHTS GRANT:

  "THIS IMPLEMENTATION" means the copyrightable works distributed by
  Tokutek as part of the Fractal Tree project.

  "PATENT CLAIMS" means the claims of patents that are owned or
  licensable by Tokutek, both currently or in the future; and that in
  the absence of this license would be infringed by THIS
  IMPLEMENTATION or by using or running THIS IMPLEMENTATION.

  "PATENT CHALLENGE" shall mean a challenge to the validity,
  patentability, enforceability and/or non-infringement of any of the
  PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.

  Tokutek hereby grants to you, for the term and geographical scope of
  the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
  irrevocable (except as stated in this section) patent license to
  make, have made, use, offer to sell, sell, import, transfer, and
  otherwise run, modify, and propagate the contents of THIS
  IMPLEMENTATION, where such license applies only to the PATENT
  CLAIMS.  This grant does not include claims that would be infringed
  only as a consequence of further modifications of THIS
  IMPLEMENTATION.  If you or your agent or licensee institute or order
  or agree to the institution of patent litigation against any entity
  (including a cross-claim or counterclaim in a lawsuit) alleging that
  THIS IMPLEMENTATION constitutes direct or contributory patent
  infringement, or inducement of patent infringement, then any rights
  granted to you under this License shall terminate as of the date
  such litigation is filed.  If you or your agent or exclusive
  licensee institute or order or agree to the institution of a PATENT
  CHALLENGE, then Tokutek may terminate any rights granted to you
  under this License.
*/

#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

#include <stdint.h>
#include <memory.h>
#include <toku_portability.h>
#include <toku_race_tools.h>
#include "growable_array.h"
#include "../ft/wbuf.h"
#include <vector>

namespace toku {
typedef uint32_t node_offset;

/**
 * Dynamic Order Maintenance Tree (DMT)
 *
 * Maintains a collection of totally ordered values, where each value has weight 1.
 * A DMT supports variable sized values.
 * The DMT is a mutable datatype.
 *
 * The Abstraction:
 *
 * A DMT is a vector of values, $V$, where $|V|$ is the length of the vector.
 * The vector is numbered from $0$ to $|V|-1$.
 *
 * We can create a new DMT, which is the empty vector.
 *
 * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
 *  $|V'|=1+|V|$ and
 *
 *   V'_j = V_j       if $j<i$
 *          x         if $j=i$
 *          V_{j-1}   if $j>i$.
 *
 * We can specify $i$ using a kind of function instead of as an integer.
 * Let $b$ be a function mapping from values to nonzero integers, such that
 * the signum of $b$ is monotonically increasing.
 * We can specify $i$ as the minimum integer such that $b(V_i)>0$.
 *
 * We look up a value using its index, or using a Heaviside function.
 * For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
 * When looking up values, we can look up
 *  $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$.   (With a special return code if no such value exists.)
 *      (Rationale:  Ordinarily we want $i$ to be unique.  But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
 *  $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$.   (Or an indication that no such value exists.)
 *  $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$.   (Or an indication that no such value exists.)
 *
 * When looking up a value using a Heaviside function, we get the value and its index.
 *
 * Performance:
 *  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
 *  The memory required is O(|V|).
 *
 * Usage:
 *  The dmt is templated by three parameters:
 *   - dmtdata_t is what will be stored within the dmt. These could be pointers or real data types (ints, structs).
 *   - dmtdataout_t is what will be returned by find and related functions. By default, it is the same as dmtdata_t, but you can set it to (dmtdata_t *).
 *   - dmtwriter_t is a class that effectively handles (de)serialization between the value stored in the dmt and outside the dmt.
 *  To create a dmt which will store "TXNID"s, for example, it is a good idea to typedef the template:
 *   typedef dmt<TXNID, TXNID, txnid_writer_t> txnid_dmt_t;
 *  If you are storing structs (or you want to edit what is stored), you may want to be able to get a pointer to the data actually stored in the dmt (see find_zero). To do this, use the second template parameter:
 *   typedef dmt<struct foo, struct foo *, foo_writer_t> foo_dmt_t;
 */

namespace dmt_internal {

class subtree {
private:
    uint32_t m_index;
public:
    // The maximum mempool size for a dmt is 2**32-2
    static const uint32_t NODE_NULL = UINT32_MAX;
    inline void set_to_null(void) {
        m_index = NODE_NULL;
    }

    inline bool is_null(void) const {
        return NODE_NULL == this->get_offset();
    }

    inline node_offset get_offset(void) const {
        return m_index;
    }

    inline void set_offset(node_offset index) {
        paranoid_invariant(index != NODE_NULL);
        m_index = index;
    }
} __attribute__((__packed__,__aligned__(4)));

template<typename dmtdata_t>
class dmt_node_templated {
public:
    uint32_t weight;
    subtree left;
    subtree right;
    uint32_t value_length;
    dmtdata_t value;
} __attribute__((__aligned__(4))); //NOTE: we cannot use attribute packed or dmtdata_t will call copy constructors (dmtdata_t might not be packed by default)

}

using namespace toku::dmt_internal;

// Each data type used in a dmt requires a dmt_writer class (allows you to insert/etc with dynamic sized types).
// A dmt_writer can be thought of as a (de)serializer.
// There is no default implementation.
// A dmtwriter instance handles reading/writing 'dmtdata_t's to/from the dmt.
// The class must implement the following functions:
//      The size required in a dmt for the dmtdata_t represented:
//          size_t get_size(void) const;
//      Write the dmtdata_t to memory owned by a dmt:
//          void write_to(dmtdata_t *const dest) const;
//      Constructor (others are allowed, but this one is required):
//          dmtwriter(const uint32_t dmtdata_t_len, dmtdata_t *const src)

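// Editor's illustrative sketch -- not part of util/dmt.h. A minimal dmtwriter
// for a fixed-size type, implementing exactly the interface described above.
// TXNID and paranoid_invariant are existing toku names; the class name follows
// the typedef suggested in the usage notes and is otherwise hypothetical.
class txnid_writer_t {
    TXNID m_value;
public:
    txnid_writer_t(const uint32_t size, TXNID *const src)
        : m_value(*src) {
        paranoid_invariant(size == sizeof(TXNID));  // fixed-length values only
    }
    size_t get_size(void) const {
        return sizeof(TXNID);          // bytes this value occupies inside the dmt
    }
    void write_to(TXNID *const dest) const {
        *dest = m_value;               // copy into dmt-owned memory
    }
};
// Matching typedef from the usage notes:
//   typedef toku::dmt<TXNID, TXNID, txnid_writer_t> txnid_dmt_t;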
template<typename dmtdata_t,
|
||||
typename dmtdataout_t,
|
||||
typename dmtwriter_t
|
||||
>
|
||||
class dmt {
|
||||
private:
|
||||
typedef dmt_node_templated<dmtdata_t> dmt_node;
|
||||
|
||||
public:
|
||||
static const uint8_t ALIGNMENT = 4;
|
||||
|
||||
class builder {
|
||||
public:
|
||||
void append(const dmtwriter_t &value);
|
||||
|
||||
// Create a dmt builder to build a dmt that will have at most n_values values and use
|
||||
// at most n_value_bytes bytes in the mempool to store values (not counting node or alignment overhead).
|
||||
void create(uint32_t n_values, uint32_t n_value_bytes);
|
||||
|
||||
bool value_length_is_fixed(void);
|
||||
|
||||
// Constructs a dmt that contains everything that was append()ed to this builder.
|
||||
// Destroys this builder and frees associated memory.
|
||||
void build(dmt<dmtdata_t, dmtdataout_t, dmtwriter_t> *dest);
|
||||
private:
|
||||
uint32_t max_values;
|
||||
uint32_t max_value_bytes;
|
||||
node_offset *sorted_node_offsets;
|
||||
bool temp_valid;
|
||||
dmt<dmtdata_t, dmtdataout_t, dmtwriter_t> temp;
|
||||
};
|
||||
|
||||
/**
|
||||
* Effect: Create an empty DMT.
|
||||
* Performance: constant time.
|
||||
*/
|
||||
void create(void);
|
||||
|
||||
/**
|
||||
* Effect: Create a DMT containing values. The number of values is in numvalues.
|
||||
* Each value is of a fixed (at runtime) length.
|
||||
* mem contains the values in packed form (no alignment padding)
|
||||
* Caller retains ownership of mem.
|
||||
* Requires: this has not been created yet
|
||||
* Rationale: Normally to insert N values takes O(N lg N) amortized time.
|
||||
* If the N values are known in advance, are sorted, and
|
||||
* the structure is empty, we can batch insert them much faster.
|
||||
*/
|
||||
__attribute__((nonnull))
|
||||
void create_from_sorted_memory_of_fixed_size_elements(
|
||||
const void *mem,
|
||||
const uint32_t numvalues,
|
||||
const uint32_t mem_length,
|
||||
const uint32_t fixed_value_length);
|
||||

    /**
     * Effect: Creates a copy of a dmt.
     *  Creates this as the clone.
     *  Each element is copied directly.  If they are pointers, the underlying data is not duplicated.
     * Performance: O(memory) (essentially a memdup)
     *  The underlying structures are memcpy'd.  Only the values themselves are copied (shallow copy).
     */
    void clone(const dmt &src);

    /**
     * Effect: Set the tree to be empty.
     * Note: Will not reallocate or resize any memory.
     * Note: If this dmt had variable-sized elements, it will start tracking size again (until it sees values of two different sizes).
     * Performance: time=O(1)
     */
    void clear(void);

    /**
     * Effect: Destroy a DMT, freeing all its memory.
     *  If the values being stored are pointers, their underlying data is not freed.
     *  Those values may be freed before or after calling ::destroy().
     * Rationale: Returns no value since free() cannot fail.
     * Rationale: Does not free the underlying pointers, to reduce complexity and maintain the abstraction layer.
     * Performance: time=O(1)
     */
    void destroy(void);

    /**
     * Effect: return |this| (the number of values stored in this dmt).
     * Performance: time=O(1)
     */
    uint32_t size(void) const;

    /**
     * Effect: Serialize all values contained in this dmt into a packed form (no alignment padding).
     *  We serialize to wb.  expected_unpadded_memory is the size of memory reserved in the wbuf
     *  for serialization.  (We assert that serialization requires exactly the expected amount.)
     * Requires:
     *  ::prepare_for_serialize() has been called, and no non-const functions have been called since.
     *  This dmt has fixed-length values and is in array form.
     * Performance:
     *  O(memory)
     */
    void serialize_values(uint32_t expected_unpadded_memory, struct wbuf *wb) const;
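
    // Illustrative only: the serialization flow implied above, assuming this dmt
    // has fixed-length values and wb is an appropriately sized wbuf.  Computing
    // expected_unpadded_memory as size() * get_fixed_length() is an assumption.
    //
    //     d.prepare_for_serialize();  // may convert back to array form
    //     d.serialize_values(d.size() * d.get_fixed_length(), &wb);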

    /**
     * Effect: Insert value into the DMT.
     *  If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
     *  Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
     *  If no such i exists, then let i be |V|.
     *  Then this has the same effect as
     *   insert_at(tree, value, i);
     *  If idx!=NULL then i is stored in *idx.
     * Requires: The signum of h must be monotonically increasing.
     * Returns:
     *  0           success
     *  DB_KEYEXIST the key is present (h was equal to zero for some value)
     * On nonzero return, the dmt is unchanged.
     * Performance: time=O(\log N) amortized.
     * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
     */
    template<typename dmtcmp_t, int (*h)(const uint32_t size, const dmtdata_t &, const dmtcmp_t &)>
    int insert(const dmtwriter_t &value, const dmtcmp_t &v, uint32_t *const idx);
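
    // Illustrative only: a heaviside function comparing stored uint32_t values
    // against a search key, plus a keyed insert.  Its signum is monotonically
    // increasing over the dmt's sorted order, as required.
    //
    //     static int h(const uint32_t size, const uint32_t &stored, const uint32_t &key) {
    //         (void) size;  // every value is 4 bytes here
    //         return (stored < key) ? -1 : (stored > key) ? +1 : 0;
    //     }
    //
    //     uint32_t idx;
    //     int r = d.insert<uint32_t, h>(example_u32_writer(7), 7, &idx);
    //     // r == 0 on success; r == DB_KEYEXIST if 7 was already present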

    /**
     * Effect: Increases indexes of all items at slot >= idx by 1.
     *  Inserts value into the position at idx.
     * Returns:
     *  0      success
     *  EINVAL if idx > this->size()
     * On error, the dmt is unchanged.
     * Performance: time=O(\log N) amortized.
     * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
     */
    int insert_at(const dmtwriter_t &value, const uint32_t idx);

    /**
     * Effect: Delete the item in slot idx.
     *  Decreases indexes of all items at slot > idx by 1.
     * Returns:
     *  0      success
     *  EINVAL if idx >= this->size()
     * On error, the dmt is unchanged.
     * Rationale: To delete an item, first find its index using find or find_zero, then delete it.
     * Performance: time=O(\log N) amortized.
     */
    int delete_at(const uint32_t idx);
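
    // Illustrative only: positional editing with the calls above.
    //
    //     d.insert_at(example_u32_writer(42), d.size());  // append at the end
    //     d.delete_at(0);                                 // drop the first element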

    /**
     * Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
     *  The first argument passed to f is a ref-to-const of the value stored in the dmt.
     *  The second argument passed to f is the index of the value.
     *  The third argument passed to f is iterate_extra.
     *  The indices run from 0 (inclusive) to this->size() (exclusive).
     * Requires: f != NULL
     * Returns:
     *  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate.
     *  If f always returns zero, then iterate returns 0.
     * Requires: Don't modify the dmt while running.  (E.g., f may not insert or delete values from the dmt.)
     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
     * Rationale: Although the functional iterator requires defining another function (as opposed to a C++-style iterator), it is much easier to read.
     * Rationale: We may at some point use functors, but for now this is a smaller change from the old OMT.
     */
    template<typename iterate_extra_t,
             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
    int iterate(iterate_extra_t *const iterate_extra) const;
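
    // Illustrative only: summing all stored values with iterate().
    //
    //     struct sum_extra { uint64_t total; };
    //     static int sum_fn(const uint32_t size, const uint32_t &v,
    //                       const uint32_t idx, sum_extra *const e) {
    //         (void) size; (void) idx;
    //         e->total += v;
    //         return 0;  // returning nonzero would stop the iteration early
    //     }
    //
    //     sum_extra extra = { 0 };
    //     int r = d.iterate<sum_extra, sum_fn>(&extra);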

    /**
     * Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
     *  The first argument passed to f is a ref-to-const of the value stored in the dmt.
     *  The second argument passed to f is the index of the value.
     *  The third argument passed to f is iterate_extra.
     *  The indices run from 0 (inclusive) to this->size() (exclusive).
     *  We will iterate only over [left,right).
     *
     * Requires: left <= right
     * Requires: f != NULL
     * Returns:
     *  EINVAL if right > this->size()
     *  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate_on_range.
     *  If f always returns zero, then iterate_on_range returns 0.
     * Requires: Don't modify the dmt while running.  (E.g., f may not insert or delete values from the dmt.)
     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
     * Rationale: Although the functional iterator requires defining another function (as opposed to a C++-style iterator), it is much easier to read.
     */
    template<typename iterate_extra_t,
             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
    int iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const;

    // Attempt to verify that this dmt is well formed.  (Crashes/asserts/aborts if not well formed.)
    void verify(void) const;

    /**
     * Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
     *  The first argument passed to f is a pointer to the value stored in the dmt.
     *  The second argument passed to f is the index of the value.
     *  The third argument passed to f is iterate_extra.
     *  The indices run from 0 (inclusive) to this->size() (exclusive).
     * Requires: same as for iterate()
     * Returns: same as for iterate()
     * Performance: same as for iterate()
     * Rationale: In general, most iterators should use iterate(), since they should not modify the data stored in the dmt.  This function is for iterators which need to modify values (for example, free_items).
     * Rationale: We assume that if you are transforming the data in place, you want to do it to everything at once, so there is not yet an iterate_on_range_ptr (but there could be).
     */
    template<typename iterate_extra_t,
             int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
    void iterate_ptr(iterate_extra_t *const iterate_extra);

    /**
     * Effect: Set *value=V_idx.
     * Returns:
     *  0      success
     *  EINVAL if idx >= this->size()
     * On nonzero return, *value is unchanged.
     * Performance: time=O(\log N)
     */
    int fetch(const uint32_t idx, uint32_t *const value_size, dmtdataout_t *const value) const;

    /**
     * Effect: Find the smallest i such that h(V_i, extra)>=0.
     *  If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = V_i, and return 0.
     *  If there is such an i and h(V_i,extra)>0 then set *idxp=i and return DB_NOTFOUND.
     *  If there is no such i then set *idxp=this->size() and return DB_NOTFOUND.
     * Note: value is of type dmtdataout_t, which may be of type (dmtdata_t) or (dmtdata_t *) but is fixed by the instantiation.
     *  If it is the value type, then the value is copied out (even if the value type is a pointer to something else).
     *  If it is the pointer type, then *value is set to a pointer to the data within the dmt.
     *  This is determined by the type of the dmt as initially declared.
     *  If the dmt is declared as dmt<foo_t>, then foo_t's will be stored and foo_t's will be returned by find and related functions.
     *  If the dmt is declared as dmt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the stored items will be returned by find and related functions.
     * Rationale:
     *  Structs too small for malloc should be stored directly in the dmt.
     *  These structs may need to be edited as they exist inside the dmt, so we need a way to get a pointer within the dmt.
     *  Using separate functions for returning pointers and values increases code duplication and reduces type-checking.
     *  That also reduces the ability of the creator of a data structure to give advice to its future users.
     *  Slight overloading in this case seemed to provide a better API and better type checking.
     */
    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find_zero(const dmtcmp_t &extra, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const;
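
    // Illustrative only: a point lookup via find_zero, reusing the heaviside h
    // sketched next to insert() above.
    //
    //     uint32_t value_size, idx;
    //     uint32_t out;
    //     int r = d.find_zero<uint32_t, h>(7, &value_size, &out, &idx);
    //     // r == 0: found, out == 7; r == DB_NOTFOUND: idx is where 7 would go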

    /**
     * Effect:
     *  If direction > 0 then find the smallest i such that h(V_i,extra)>0.
     *  If direction < 0 then find the largest i such that h(V_i,extra)<0.
     *  (Direction may not be equal to zero.)
     *  If value!=NULL then store V_i in *value.
     *  If idxp!=NULL then store i in *idxp.
     * Requires: The signum of h is monotonically increasing.
     * Returns:
     *  0           success
     *  DB_NOTFOUND no such value is found
     * On nonzero return, *value and *idxp are unchanged.
     * Performance: time=O(\log N)
     * Rationale:
     *  Here's how to use the find function to find various things
     *  Cases for find:
     *   find first value:        ( h(v)=+1,               direction=+1 )
     *   find last value:         ( h(v)=-1,               direction=-1 )
     *   find first X:            ( h(v)=(v< x) ? -1 : 1,  direction=+1 )
     *   find last X:             ( h(v)=(v<=x) ? -1 : 1,  direction=-1 )
     *   find X or successor to X ( same as find first X )
     *
     * Rationale: To help understand heaviside functions and the behavior of find:
     *  There are 7 kinds of heaviside functions.
     *  The signum of h must be monotonically increasing.
     *  Given a function of the following form, A is the element
     *  returned for direction>0, B is the element returned
     *  for direction<0, C is the element returned for
     *  direction==0 (see find_zero) (with a return of 0), and D is the element
     *  returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
     *  If any of A, B, or C are not found, then asking for the
     *  associated direction will return DB_NOTFOUND.
     *  See find_zero for more information.
     *
     *  Let the following represent the signum of the heaviside function.
     *
     *  -...-
     *      A
     *       D
     *
     *  +...+
     *  B
     *  D
     *
     *  0...0
     *  C
     *
     *  -...-0...0
     *      AC
     *
     *  0...0+...+
     *  C    B
     *
     *  -...-+...+
     *      AB
     *       D
     *
     *  -...-0...0+...+
     *      AC    B
     */
    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find(const dmtcmp_t &extra, int direction, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const;
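
    // Illustrative only: the "find first value" case from the table above.
    //
    //     static int h_plus(const uint32_t size, const uint32_t &v, const int &unused) {
    //         (void) size; (void) v; (void) unused;
    //         return +1;  // h(v) = +1 everywhere
    //     }
    //
    //     uint32_t value_size, idx;
    //     uint32_t first;
    //     int r = d.find<int, h_plus>(0, +1, &value_size, &first, &idx);
    //     // r == 0 and first is the leftmost value, unless the dmt is empty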

    /**
     * Effect: Return the size (in bytes) of the dmt, as it resides in main memory.
     *  If the data stored are pointers, don't include the size of what they all point to.
     * //TODO(leif or yoni): (maybe rename and) return memory footprint instead of allocated size
     */
    size_t memory_size(void);

    // Returns whether all values in the dmt are known to be the same size.
    // Note:
    //  There are no false positives, but false negatives are allowed.
    //  A false negative can happen if this dmt had 2 (or more) values of different sizes,
    //  and then enough were deleted so that all the remaining ones are the same size.
    //  Once that happens, this dmt will never again return true for this function unless/until
    //  ::clear() is called.
    bool value_length_is_fixed(void) const;

    // If this dmt is empty, the return value is undefined.
    // Otherwise, if value_length_is_fixed() then it returns the fixed length,
    // else it returns 0.
    uint32_t get_fixed_length(void) const;

    // Preprocesses the dmt so that serialization can happen quickly.
    // After this call, serialize_values() can be called, but no other mutator function can be called in between.
    void prepare_for_serialize(void);

   private:
    // Do a bit of verification that subtree and nodes act like packed C structs and do not introduce unnecessary padding for alignment.
    ENSURE_POD(subtree);
    static_assert(ALIGNMENT > 0, "ALIGNMENT <= 0");
    static_assert((ALIGNMENT & (ALIGNMENT - 1)) == 0, "ALIGNMENT not a power of 2");
    static_assert(sizeof(dmt_node) - sizeof(dmtdata_t) == __builtin_offsetof(dmt_node, value), "value is not last field in node");
    static_assert(4 * sizeof(uint32_t) == __builtin_offsetof(dmt_node, value), "dmt_node is padded");
    static_assert(__builtin_offsetof(dmt_node, value) % ALIGNMENT == 0, "dmt_node requires padding for alignment");
    ENSURE_POD(dmt_node);

    struct dmt_array {
        uint32_t num_values;
    };

    struct dmt_tree {
        subtree root;
    };

    /*
     * Relationship between values_same_size, d.a.num_values, value_length, and is_array:
     *
     * In an empty dmt:
     *   is_array is true
     *   values_same_size is true
     *   value_length is undefined
     *   d.a.num_values is 0
     * In a non-empty array dmt:
     *   is_array is true
     *   values_same_size is true
     *   value_length is defined
     *   d.a.num_values > 0
     * In a non-empty tree dmt:
     *   is_array is false
     *   values_same_size is true iff all values have been the same size since the last time the dmt turned into a tree.
     *   value_length is defined iff values_same_size is true
     *   d.a.num_values is undefined (the memory is used for the tree)
     *
     * Note that in tree form, the dmt keeps track of whether all values are the same size until the first time they are not.
     * 'values_same_size' will not become true again (even if we change all values to be the same size)
     * until/unless the dmt becomes empty, at which point it becomes an array again.
     */
    bool values_same_size;
    uint32_t value_length;  // valid iff values_same_size is true.
    struct mempool mp;
    bool is_array;
    union {
        struct dmt_array a;
        struct dmt_tree t;
    } d;

    // Returns pad bytes per element (for alignment) or 0 if not fixed length.
    uint32_t get_fixed_length_alignment_overhead(void) const;

    void verify_internal(const subtree &subtree, std::vector<bool> *touched) const;

    // Retrieves the node for a given subtree.
    // Requires: !subtree.is_null()
    dmt_node & get_node(const subtree &subtree) const;

    // Retrieves the node at a given offset in the mempool.
    dmt_node & get_node(const node_offset offset) const;

    // Returns the weight of the subtree rooted at st.
    // If st.is_null(), returns 0.
    // Perf: O(1)
    uint32_t nweight(const subtree &st) const;

    // Allocates space for a node (in the mempool) and uses the dmtwriter to write the value into the node.
    node_offset node_malloc_and_set_value(const dmtwriter_t &value);

    // Uses the dmtwriter to write a value into node n.
    void node_set_value(dmt_node *n, const dmtwriter_t &value);

    // (mempool-)free the memory for a node.
    void node_free(const subtree &st);

    // Effect: Resizes the mempool (holding the array) if necessary to hold one more item of length this->value_length.
    // Requires:
    //  This dmt is in array form (and thus this->values_same_size).
    void maybe_resize_array_for_insert(void);

    // Effect: Converts a dmt from array form to tree form.
    // Perf: O(n)
    // Note: This does not clear the 'this->values_same_size' bit.
    void convert_to_tree(void);

    // Effect: Resizes the mempool holding a tree if necessary.  If value==nullptr then it may shrink if overallocated;
    //  otherwise, a resize only happens if there is not enough free space for an insert of value.
    void maybe_resize_tree(const dmtwriter_t * value);

    // Returns true if the tree rooted at st would need rebalancing after adding
    // leftmod to the left subtree and rightmod to the right subtree.
    bool will_need_rebalance(const subtree &st, const int leftmod, const int rightmod) const;

    __attribute__((nonnull))
    void insert_internal(subtree *const subtreep, const dmtwriter_t &value, const uint32_t idx, subtree **const rebalance_subtree);

    template<bool with_resize>
    int insert_at_array_end(const dmtwriter_t& value_in);

    dmtdata_t * alloc_array_value_end(void);

    dmtdata_t * get_array_value(const uint32_t idx) const;

    dmtdata_t * get_array_value_internal(const struct mempool *mempool, const uint32_t idx) const;

    void convert_from_array_to_tree(void);

    void convert_from_tree_to_array(void);

    __attribute__((nonnull(2,5)))
    void delete_internal(subtree *const subtreep, const uint32_t idx, subtree *const subtree_replace, subtree **const rebalance_subtree);

    template<typename iterate_extra_t,
             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
    int iterate_internal_array(const uint32_t left, const uint32_t right,
                               iterate_extra_t *const iterate_extra) const;

    template<typename iterate_extra_t,
             int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
    void iterate_ptr_internal(const uint32_t left, const uint32_t right,
                              const subtree &subtree, const uint32_t idx,
                              iterate_extra_t *const iterate_extra);

    template<typename iterate_extra_t,
             int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
    void iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
                                    iterate_extra_t *const iterate_extra);

    template<typename iterate_extra_t,
             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
    int iterate_internal(const uint32_t left, const uint32_t right,
                         const subtree &subtree, const uint32_t idx,
                         iterate_extra_t *const iterate_extra) const;

    void fetch_internal_array(const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const;

    void fetch_internal(const subtree &subtree, const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const;

    __attribute__((nonnull))
    void fill_array_with_subtree_offsets(node_offset *const array, const subtree &subtree) const;

    __attribute__((nonnull))
    void rebuild_subtree_from_offsets(subtree *const subtree, const node_offset *const offsets, const uint32_t numvalues);

    __attribute__((nonnull))
    void rebalance(subtree *const subtree);

    __attribute__((nonnull))
    static void copyout(uint32_t *const outlen, dmtdata_t *const out, const dmt_node *const n);

    __attribute__((nonnull))
    static void copyout(uint32_t *const outlen, dmtdata_t **const out, dmt_node *const n);

    __attribute__((nonnull))
    static void copyout(uint32_t *const outlen, dmtdata_t *const out, const uint32_t len, const dmtdata_t *const stored_value_ptr);

    __attribute__((nonnull))
    static void copyout(uint32_t *const outlen, dmtdata_t **const out, const uint32_t len, dmtdata_t *const stored_value_ptr);

    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find_internal_zero_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;

    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find_internal_zero(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;

    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find_internal_plus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;

    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find_internal_plus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;

    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find_internal_minus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;

    template<typename dmtcmp_t,
             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
    int find_internal_minus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;

    // Allocate memory for an array node_offset[num_idxs] from pre-allocated contiguous free space in the mempool.
    // If there is not enough space, returns nullptr.
    node_offset* alloc_temp_node_offsets(uint32_t num_idxs);

    // Returns the aligned size of x:
    //  if x % ALIGNMENT == 0, returns x;
    //  otherwise returns x + (ALIGNMENT - (x % ALIGNMENT)).
    uint32_t align(const uint32_t x) const;
};

} // namespace toku

// include the implementation here
#include "dmt.cc"

@@ -131,7 +131,7 @@ void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_
 void toku_mempool_construct(struct mempool *mp, size_t data_size) {
     if (data_size) {
         size_t mpsize = data_size + (data_size/4);  // allow 1/4 room for expansion (would be wasted if read-only)
-        mp->base = toku_xmalloc(mpsize);  // allocate buffer for mempool
+        mp->base = toku_xmalloc_aligned(64, mpsize);  // allocate buffer for mempool
         mp->size = mpsize;
         mp->free_offset = 0;  // address of first available memory for new data
         mp->frag_size = 0;  // all allocated space is now in use
@@ -142,6 +142,22 @@ void toku_mempool_construct(struct mempool *mp, size_t data_size) {
     }
 }
 
+void toku_mempool_reset(struct mempool *mp) {
+    mp->free_offset = 0;
+    mp->frag_size = 0;
+}
+
+void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size) {
+    invariant(data_size >= mp->free_offset);
+
+    size_t mpsize = data_size + (data_size/4);  // allow 1/4 room for expansion (would be wasted if read-only)
+    void* newmem = toku_xmalloc_aligned(64, mpsize);  // allocate new buffer for mempool
+    memcpy(newmem, mp->base, mp->free_offset);  // copy old info
+    toku_free(mp->base);
+    mp->base = newmem;
+    mp->size = mpsize;
+}
+
 
 void toku_mempool_destroy(struct mempool *mp) {
     // printf("mempool_destroy %p %p %lu %lu\n", mp, mp->base, mp->size, mp->frag_size);
@@ -150,27 +166,44 @@ void toku_mempool_destroy(struct mempool *mp) {
     toku_mempool_zero(mp);
 }
 
-void *toku_mempool_get_base(struct mempool *mp) {
+void *toku_mempool_get_base(const struct mempool *mp) {
     return mp->base;
 }
 
-size_t toku_mempool_get_size(struct mempool *mp) {
+void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset) {
+    return reinterpret_cast<void*>(reinterpret_cast<char*>(mp->base) + offset);
+}
+
+size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, const void* p) {
+    paranoid_invariant(p >= mp->base);
+    return reinterpret_cast<const char*>(p) - reinterpret_cast<const char*>(mp->base);
+}
+
+size_t toku_mempool_get_size(const struct mempool *mp) {
     return mp->size;
 }
 
-size_t toku_mempool_get_frag_size(struct mempool *mp) {
+size_t toku_mempool_get_frag_size(const struct mempool *mp) {
     return mp->frag_size;
 }
 
-size_t toku_mempool_get_used_space(struct mempool *mp) {
+size_t toku_mempool_get_used_size(const struct mempool *mp) {
     return mp->free_offset - mp->frag_size;
 }
 
-size_t toku_mempool_get_free_space(struct mempool *mp) {
+void* toku_mempool_get_next_free_ptr(const struct mempool *mp) {
+    return toku_mempool_get_pointer_from_base_and_offset(mp, mp->free_offset);
+}
+
+size_t toku_mempool_get_offset_limit(const struct mempool *mp) {
+    return mp->free_offset;
+}
+
+size_t toku_mempool_get_free_size(const struct mempool *mp) {
     return mp->size - mp->free_offset;
 }
 
-size_t toku_mempool_get_allocated_space(struct mempool *mp) {
+size_t toku_mempool_get_allocated_size(const struct mempool *mp) {
     return mp->free_offset;
 }
 
@@ -211,10 +244,10 @@ size_t toku_mempool_footprint(struct mempool *mp) {
     return rval;
 }
 
-void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp) {
+void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp) {
     new_mp->frag_size = orig_mp->frag_size;
     new_mp->free_offset = orig_mp->free_offset;
     new_mp->size = orig_mp->free_offset;  // only make the cloned mempool store what is needed
-    new_mp->base = toku_xmalloc(new_mp->size);
+    new_mp->base = toku_xmalloc_aligned(64, new_mp->size);
     memcpy(new_mp->base, orig_mp->base, new_mp->size);
 }

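(Reviewer sketch, illustrative only: round-tripping between pointers and offsets
with the new mempool helpers; error handling elided.)

    struct mempool mp;
    toku_mempool_construct(&mp, 1024);
    void *p = toku_mempool_malloc(&mp, 100, 4);
    size_t off = toku_mempool_get_offset_from_pointer_and_base(&mp, p);
    invariant(toku_mempool_get_pointer_from_base_and_offset(&mp, off) == p);
    toku_mempool_destroy(&mp);
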
@@ -123,26 +123,46 @@ void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_
  */
 void toku_mempool_construct(struct mempool *mp, size_t data_size);
 
+/* treat the mempool as if it has just been created; ignore any fragmentation and start allocating from the beginning again.
+ */
+void toku_mempool_reset(struct mempool *mp);
+
+/* reallocate a larger block of memory backing a constructed mempool
+ */
+void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size);
+
 /* destroy the memory pool */
 void toku_mempool_destroy(struct mempool *mp);
 
 /* get the base address of the memory pool */
-void *toku_mempool_get_base(struct mempool *mp);
+void *toku_mempool_get_base(const struct mempool *mp);
 
+/* get a pointer that is offset bytes past the base of the memory pool */
+void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset);
+
+/* get the offset from base of a pointer */
+size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, const void* p);
+
+/* get a pointer to the first free byte (if any) */
+void* toku_mempool_get_next_free_ptr(const struct mempool *mp);
+
+/* get the limit of valid offsets (anything later was not allocated) */
+size_t toku_mempool_get_offset_limit(const struct mempool *mp);
+
 /* get the size of the memory pool */
-size_t toku_mempool_get_size(struct mempool *mp);
+size_t toku_mempool_get_size(const struct mempool *mp);
 
 /* get the amount of fragmented (wasted) space in the memory pool */
-size_t toku_mempool_get_frag_size(struct mempool *mp);
+size_t toku_mempool_get_frag_size(const struct mempool *mp);
 
 /* get the amount of space that is holding useful data */
-size_t toku_mempool_get_used_space(struct mempool *mp);
+size_t toku_mempool_get_used_size(const struct mempool *mp);
 
 /* get the amount of space that is available for new data */
-size_t toku_mempool_get_free_space(struct mempool *mp);
+size_t toku_mempool_get_free_size(const struct mempool *mp);
 
 /* get the amount of space that has been allocated for use (wasted or not) */
-size_t toku_mempool_get_allocated_space(struct mempool *mp);
+size_t toku_mempool_get_allocated_size(const struct mempool *mp);
 
 /* allocate a chunk of memory from the memory pool suitably aligned */
 void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment);
@@ -160,6 +180,8 @@ static inline int toku_mempool_inrange(struct mempool *mp, void *vp, size_t size
 /* get memory footprint */
 size_t toku_mempool_footprint(struct mempool *mp);
 
-void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp);
+void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp);
 
 
 
 #endif // UTIL_MEMPOOL_H