refs #5418 merge promotion to main
git-svn-id: file:///svn/toku/tokudb@49697 c7de825b-a66e-492c-adef-691d508d4ae1
Parent: 609d0dca91
Commit: bf70bbb898

104 changed files with 1991 additions and 1000 deletions
@@ -8,6 +8,7 @@ list(APPEND CTEST_CUSTOM_MEMCHECK_IGNORE
ft/ft_loader-test-extractor-2
ft/ft_loader-test-extractor-3
ft/upgrade_test_simple
portability/test-cache-line-boundary-fails
portability/try-leak-lost
portability/try-leak-reachable
portability/try-leak-uninit

@@ -7,7 +7,12 @@ if(USE_BDB)
find_package(BDB REQUIRED)
endif()

find_package(Valgrind REQUIRED)
option(USE_VALGRIND "Build to run safely under valgrind (often slower)." ON)
if(USE_VALGRIND)
find_package(Valgrind REQUIRED)
endif()

option(TOKU_DEBUG_PARANOID "Enable paranoid asserts." ON)

include(CheckIncludeFiles)

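Note: the new TOKU_DEBUG_PARANOID CMake option is what gates the many assert() to paranoid_invariant() conversions seen later in this diff. The macro definition itself is not part of this hunk; as a rough, hypothetical sketch (names and placement assumed, not taken from this commit), the wiring could look like:

    // Hypothetical sketch only -- not the actual TokuDB/TokuFT definition.
    // When the build enables TOKU_DEBUG_PARANOID, paranoid_invariant checks
    // behave like assert; otherwise they compile away to nothing.
    #include <assert.h>
    #if defined(TOKU_DEBUG_PARANOID)
    #define paranoid_invariant(p)          assert(p)
    #define paranoid_invariant_notnull(p)  assert((p) != NULL)
    #else
    #define paranoid_invariant(p)          ((void) 0)
    #define paranoid_invariant_notnull(p)  ((void) 0)
    #endif

On the CMake side, the option would typically be forwarded to the compiler (for example with add_definitions(-DTOKU_DEBUG_PARANOID) inside an if (TOKU_DEBUG_PARANOID) block); that forwarding is not shown in this hunk.
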
@@ -100,6 +100,9 @@ endif ()

## this hits with optimized builds somewhere in ftleaf_split, we don't
## know why but we don't think it's a big deal
set_cflags_if_supported(
-Wno-error=strict-overflow
)
set_ldflags_if_supported(
-Wno-error=strict-overflow
)

@@ -122,11 +125,6 @@ else ()
set(CMAKE_SHARED_LINKER_FLAGS "-g -fuse-linker-plugin ${CMAKE_SHARED_LINKER_FLAGS}")
endif ()

option(USE_VALGRIND "Do not pass NVALGRIND to the compiler, because valgrind will be run on the generated executables." ON)
if (NOT USE_VALGRIND)
set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS_RELEASE NVALGRIND=1)
endif ()

## set warnings
set_cflags_if_supported(
-Wextra

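The USE_VALGRIND option above decides whether NVALGRIND is defined for release builds. That relies on a standard valgrind convention rather than anything defined in this diff: valgrind's client-request headers compile to no-ops when NVALGRIND is defined, so builds that will never run under valgrind skip the instrumentation cost. A minimal illustration (assumed usage, not code from this commit):

    /* Illustration only. With NVALGRIND defined, RUNNING_ON_VALGRIND is a
     * compile-time 0 and the include adds no runtime cost. */
    #include <valgrind/valgrind.h>

    static int example_running_under_valgrind(void) {
        return RUNNING_ON_VALGRIND;
    }
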
@@ -7,6 +7,7 @@
#include "tokudb_common_funcs.h"
#include <toku_pthread.h>
#include <toku_assert.h>
#include <portability/toku_atomic.h>
#include <db.h>
#include <errno.h>
#include <stdlib.h>

@@ -102,7 +103,7 @@ static void test_begin_commit(int _nqueries) {
r = c->c_get(c, &key, &val, DB_SET);
#endif
assert_zero(r);
(void) __sync_fetch_and_add(&set_count, 1);
(void) toku_sync_fetch_and_add(&set_count, 1);
r = c->c_close(c); assert_zero(r);
r = txn->commit(txn, 0); assert_zero(r);
}

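This hunk follows the commit-wide pattern of replacing raw GCC __sync_* builtins with toku_sync_* wrappers from the newly included portability/toku_atomic.h. The wrapper itself is not shown in this diff; a minimal sketch of the assumed shape (illustrative only, not the real header):

    // Hypothetical sketch of a portability wrapper, not the actual
    // portability/toku_atomic.h. Funneling atomics through one header lets
    // the portability layer add checking or alternate implementations
    // without touching every caller.
    template <typename T, typename U>
    static inline T toku_sync_fetch_and_add(T *addr, U diff) {
        return __sync_fetch_and_add(addr, diff);
    }

    template <typename T, typename U>
    static inline T toku_sync_fetch_and_sub(T *addr, U diff) {
        return __sync_fetch_and_sub(addr, diff);
    }
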
@@ -89,9 +89,9 @@ static inline void unlock_for_blocktable (BLOCK_TABLE bt);
static void
ft_set_dirty(FT ft, bool for_checkpoint){
toku_mutex_assert_locked(&ft->blocktable->mutex);
assert(ft->h->type == FT_CURRENT);
paranoid_invariant(ft->h->type == FT_CURRENT);
if (for_checkpoint) {
assert(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
paranoid_invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
ft->checkpoint_header->dirty = 1;
}
else {

@@ -134,10 +134,10 @@ toku_maybe_truncate_file_on_open(BLOCK_TABLE bt, int fd) {

static void
copy_translation(struct translation * dst, struct translation * src, enum translation_type newtype) {
assert(src->length_of_array >= src->smallest_never_used_blocknum.b); //verify invariant
assert(newtype==TRANSLATION_DEBUG ||
(src->type == TRANSLATION_CURRENT && newtype == TRANSLATION_INPROGRESS) ||
(src->type == TRANSLATION_CHECKPOINTED && newtype == TRANSLATION_CURRENT));
paranoid_invariant(src->length_of_array >= src->smallest_never_used_blocknum.b); //verify invariant
paranoid_invariant(newtype==TRANSLATION_DEBUG ||
(src->type == TRANSLATION_CURRENT && newtype == TRANSLATION_INPROGRESS) ||
(src->type == TRANSLATION_CHECKPOINTED && newtype == TRANSLATION_CURRENT));
dst->type = newtype;
dst->smallest_never_used_blocknum = src->smallest_never_used_blocknum;
dst->blocknum_freelist_head = src->blocknum_freelist_head;

@@ -175,7 +175,7 @@ maybe_optimize_translation(struct translation *t) {
//This is O(n) work, so do it only if you're already doing that.

BLOCKNUM b;
assert(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
paranoid_invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
//Calculate how large the free suffix is.
int64_t freed;
{

@@ -212,7 +212,7 @@ void
toku_block_translation_note_start_checkpoint_unlocked (BLOCK_TABLE bt) {
toku_mutex_assert_locked(&bt->mutex);
// Copy current translation to inprogress translation.
assert(bt->inprogress.block_translation == NULL);
paranoid_invariant(bt->inprogress.block_translation == NULL);
//We're going to do O(n) work to copy the translation, so we
//can afford to do O(n) work by optimizing the translation
maybe_optimize_translation(&bt->current);

@@ -229,7 +229,7 @@ toku_block_translation_note_start_checkpoint_unlocked (BLOCK_TABLE bt) {
void toku_block_translation_note_skipped_checkpoint (BLOCK_TABLE bt) {
//Purpose, alert block translation that the checkpoint was skipped, e.x. for a non-dirty header
lock_for_blocktable(bt);
assert(bt->inprogress.block_translation);
paranoid_invariant_notnull(bt->inprogress.block_translation);
bt->checkpoint_skipped = true;
unlock_for_blocktable(bt);
}

@@ -267,7 +267,7 @@ toku_block_translation_note_end_checkpoint (BLOCK_TABLE bt, int fd) {
// Free unused blocks
lock_for_blocktable(bt);
uint64_t allocated_limit_at_start = block_allocator_allocated_limit(bt->block_allocator);
assert(bt->inprogress.block_translation);
paranoid_invariant_notnull(bt->inprogress.block_translation);
if (bt->checkpoint_skipped || bt->checkpoint_failed) {
cleanup_failed_checkpoint(bt);
goto end;

@@ -299,25 +299,31 @@ end:
unlock_for_blocktable(bt);
}

__attribute__((nonnull,const))
static inline bool
is_valid_blocknum(struct translation *t, BLOCKNUM b) {
//Sanity check: Verify invariant
paranoid_invariant(t->length_of_array >= t->smallest_never_used_blocknum.b);
return b.b >= 0 && b.b < t->smallest_never_used_blocknum.b;
}

static inline void
verify_valid_blocknum (struct translation *t, BLOCKNUM b) {
assert(b.b >= 0);
assert(b.b < t->smallest_never_used_blocknum.b);
verify_valid_blocknum (struct translation *UU(t), BLOCKNUM UU(b)) {
paranoid_invariant(is_valid_blocknum(t, b));
}

__attribute__((nonnull,const))
static inline bool
is_valid_freeable_blocknum(struct translation *t, BLOCKNUM b) {
//Sanity check: Verify invariant
assert(t->length_of_array >= t->smallest_never_used_blocknum.b);
paranoid_invariant(t->length_of_array >= t->smallest_never_used_blocknum.b);
return b.b >= RESERVED_BLOCKNUMS && b.b < t->smallest_never_used_blocknum.b;
}

//Can be freed
static inline void
verify_valid_freeable_blocknum (struct translation *t, BLOCKNUM b) {
assert(b.b >= RESERVED_BLOCKNUMS);
assert(b.b < t->smallest_never_used_blocknum.b);

//Sanity check: Verify invariant
assert(t->length_of_array >= t->smallest_never_used_blocknum.b);
verify_valid_freeable_blocknum (struct translation *UU(t), BLOCKNUM UU(b)) {
paranoid_invariant(is_valid_freeable_blocknum(t, b));
}

static void

@@ -376,11 +382,9 @@ calculate_size_on_disk (struct translation *t) {
// We cannot free the disk space allocated to this blocknum if it is still in use by the given translation table.
static inline bool
translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair) {
bool r = (bool)
(t->block_translation &&
return (t->block_translation &&
b.b < t->smallest_never_used_blocknum.b &&
old_pair->u.diskoff == t->block_translation[b.b].u.diskoff);
return r;
}

static void

@@ -413,7 +417,7 @@ PRNTF("Freed", b.b, old_pair.size, old_pair.u.diskoff, bt);
PRNTF("New", b.b, t->block_translation[b.b].size, t->block_translation[b.b].u.diskoff, bt);
//Update inprogress btt if appropriate (if called because Pending bit is set).
if (for_checkpoint) {
assert(b.b < bt->inprogress.length_of_array);
paranoid_invariant(b.b < bt->inprogress.length_of_array);
bt->inprogress.block_translation[b.b] = t->block_translation[b.b];
}
}

@@ -449,17 +453,22 @@ toku_blocknum_realloc_on_disk (BLOCK_TABLE bt, BLOCKNUM b, DISKOFF size, DISKOFF
unlock_for_blocktable(bt);
}

__attribute__((nonnull,const))
static inline bool
pair_is_unallocated(struct block_translation_pair *pair) {
return pair->size == 0 && pair->u.diskoff == diskoff_unused;
}

// Purpose of this function is to figure out where to put the inprogress btt on disk, allocate space for it there.
static void
blocknum_alloc_translation_on_disk_unlocked (BLOCK_TABLE bt) {
toku_mutex_assert_locked(&bt->mutex);

struct translation *t = &bt->inprogress;
assert(t->block_translation);
paranoid_invariant_notnull(t->block_translation);
BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
struct block_translation_pair old_pair = t->block_translation[b.b];
//Each inprogress is allocated only once
assert(old_pair.size == 0 && old_pair.u.diskoff == diskoff_unused);
paranoid_invariant(pair_is_unallocated(&t->block_translation[b.b]));

//Allocate a new block
int64_t size = calculate_size_on_disk(t);

@@ -560,7 +569,7 @@ toku_allocate_blocknum_unlocked(BLOCK_TABLE bt, BLOCKNUM *res, FT ft) {
t->blocknum_freelist_head = next;
}
//Verify the blocknum is free
assert(t->block_translation[result.b].size == size_is_free);
paranoid_invariant(t->block_translation[result.b].size == size_is_free);
//blocknum is not free anymore
t->block_translation[result.b].u.diskoff = diskoff_unused;
t->block_translation[result.b].size = 0;

@@ -580,8 +589,7 @@ static void
free_blocknum_in_translation(struct translation *t, BLOCKNUM b)
{
verify_valid_freeable_blocknum(t, b);
struct block_translation_pair old_pair = t->block_translation[b.b];
assert(old_pair.size != size_is_free);
paranoid_invariant(t->block_translation[b.b].size != size_is_free);

PRNTF("free_blocknum", b.b, t->block_translation[b.b].size, t->block_translation[b.b].u.diskoff, bt);
t->block_translation[b.b].size = size_is_free;

@ -601,7 +609,7 @@ free_blocknum_unlocked(BLOCK_TABLE bt, BLOCKNUM *bp, FT ft, bool for_checkpoint)
|
|||
|
||||
free_blocknum_in_translation(&bt->current, b);
|
||||
if (for_checkpoint) {
|
||||
assert(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
|
||||
paranoid_invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
|
||||
free_blocknum_in_translation(&bt->inprogress, b);
|
||||
}
|
||||
|
||||
|
@ -616,7 +624,10 @@ PRNTF("free_blocknum_free", b.b, old_pair.size, old_pair.u.diskoff, bt);
|
|||
block_allocator_free_block(bt->block_allocator, old_pair.u.diskoff);
|
||||
}
|
||||
}
|
||||
else assert(old_pair.size==0 && old_pair.u.diskoff == diskoff_unused);
|
||||
else {
|
||||
paranoid_invariant(old_pair.size==0);
|
||||
paranoid_invariant(old_pair.u.diskoff == diskoff_unused);
|
||||
}
|
||||
ft_set_dirty(ft, for_checkpoint);
|
||||
}
|
||||
|
||||
|
@ -629,8 +640,8 @@ toku_free_blocknum(BLOCK_TABLE bt, BLOCKNUM *bp, FT ft, bool for_checkpoint) {
|
|||
|
||||
//Verify there are no free blocks.
|
||||
void
|
||||
toku_block_verify_no_free_blocknums(BLOCK_TABLE bt) {
|
||||
assert(bt->current.blocknum_freelist_head.b == freelist_null.b);
|
||||
toku_block_verify_no_free_blocknums(BLOCK_TABLE UU(bt)) {
|
||||
paranoid_invariant(bt->current.blocknum_freelist_head.b == freelist_null.b);
|
||||
}
|
||||
|
||||
// Frees blocknums that have a size of 0 and unused diskoff
|
||||
|
@ -652,31 +663,54 @@ toku_free_unused_blocknums(BLOCK_TABLE bt, BLOCKNUM root) {
|
|||
unlock_for_blocktable(bt);
|
||||
}
|
||||
|
||||
|
||||
//Verify there are no data blocks except root.
|
||||
void
|
||||
toku_block_verify_no_data_blocks_except_root(BLOCK_TABLE bt, BLOCKNUM root) {
|
||||
__attribute__((nonnull,const,unused))
|
||||
static inline bool
|
||||
no_data_blocks_except_root(BLOCK_TABLE bt, BLOCKNUM root) {
|
||||
bool ok = true;
|
||||
lock_for_blocktable(bt);
|
||||
assert(root.b >= RESERVED_BLOCKNUMS);
|
||||
int64_t smallest = bt->current.smallest_never_used_blocknum.b;
|
||||
for (int64_t i=RESERVED_BLOCKNUMS; i < smallest; i++) {
|
||||
if (root.b < RESERVED_BLOCKNUMS) {
|
||||
ok = false;
|
||||
goto cleanup;
|
||||
}
|
||||
int64_t i;
|
||||
for (i=RESERVED_BLOCKNUMS; i < smallest; i++) {
|
||||
if (i == root.b) {
|
||||
continue;
|
||||
}
|
||||
BLOCKNUM b = make_blocknum(i);
|
||||
assert(bt->current.block_translation[b.b].size == size_is_free);
|
||||
if (bt->current.block_translation[b.b].size != size_is_free) {
|
||||
ok = false;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
cleanup:
|
||||
unlock_for_blocktable(bt);
|
||||
return ok;
|
||||
}
|
||||
|
||||
//Verify there are no data blocks except root.
|
||||
// TODO(leif): This actually takes a lock, but I don't want to fix all the callers right now.
|
||||
void
|
||||
toku_block_verify_no_data_blocks_except_root(BLOCK_TABLE UU(bt), BLOCKNUM UU(root)) {
|
||||
paranoid_invariant(no_data_blocks_except_root(bt, root));
|
||||
}
|
||||
|
||||
__attribute__((nonnull,const,unused))
|
||||
static inline bool
|
||||
blocknum_allocated(BLOCK_TABLE bt, BLOCKNUM b) {
|
||||
lock_for_blocktable(bt);
|
||||
struct translation *t = &bt->current;
|
||||
verify_valid_blocknum(t, b);
|
||||
bool ok = t->block_translation[b.b].size != size_is_free;
|
||||
unlock_for_blocktable(bt);
|
||||
return ok;
|
||||
}
|
||||
|
||||
//Verify a blocknum is currently allocated.
|
||||
void
|
||||
toku_verify_blocknum_allocated(BLOCK_TABLE bt, BLOCKNUM b) {
|
||||
lock_for_blocktable(bt);
|
||||
struct translation *t = &bt->current;
|
||||
verify_valid_blocknum(t, b);
|
||||
assert(t->block_translation[b.b].size != size_is_free);
|
||||
unlock_for_blocktable(bt);
|
||||
toku_verify_blocknum_allocated(BLOCK_TABLE UU(bt), BLOCKNUM UU(b)) {
|
||||
paranoid_invariant(blocknum_allocated(bt, b));
|
||||
}
|
||||
|
||||
//Only used by toku_dump_translation table (debug info)
|
||||
|
@ -834,12 +868,12 @@ blocktable_note_translation (BLOCK_ALLOCATOR allocator, struct translation *t) {
|
|||
//See RESERVED_BLOCKNUMS
|
||||
|
||||
// Previously this added blocks one at a time. Now we make an array and pass it in so it can be sorted and merged. See #3218.
|
||||
struct block_allocator_blockpair *MALLOC_N(t->smallest_never_used_blocknum.b, pairs);
|
||||
struct block_allocator_blockpair *XMALLOC_N(t->smallest_never_used_blocknum.b, pairs);
|
||||
uint64_t n_pairs = 0;
|
||||
for (int64_t i=0; i<t->smallest_never_used_blocknum.b; i++) {
|
||||
struct block_translation_pair pair = t->block_translation[i];
|
||||
if (pair.size > 0) {
|
||||
assert(pair.u.diskoff != diskoff_unused);
|
||||
paranoid_invariant(pair.u.diskoff != diskoff_unused);
|
||||
int cur_pair = n_pairs++;
|
||||
pairs[cur_pair] = (struct block_allocator_blockpair) { .offset = (uint64_t) pair.u.diskoff,
|
||||
.size = (uint64_t) pair.size };
|
||||
|
@ -943,7 +977,7 @@ void
|
|||
toku_blocktable_internal_fragmentation (BLOCK_TABLE bt, int64_t *total_sizep, int64_t *used_sizep) {
|
||||
frag_extra info = {0,0};
|
||||
int r = toku_blocktable_iterate(bt, TRANSLATION_CHECKPOINTED, frag_helper, &info, false, true);
|
||||
assert(r==0);
|
||||
assert_zero(r);
|
||||
|
||||
if (total_sizep) *total_sizep = info.total_space;
|
||||
if (used_sizep) *used_sizep = info.used_space;
|
||||
|
|
|
@ -134,6 +134,7 @@ struct ctpair {
|
|||
CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback;
|
||||
CACHETABLE_CLEANER_CALLBACK cleaner_callback;
|
||||
CACHETABLE_CLONE_CALLBACK clone_callback;
|
||||
CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK checkpoint_complete_callback;
|
||||
void *write_extraargs;
|
||||
|
||||
// access to these fields are protected by disk_nb_mutex
|
||||
|
@ -384,7 +385,7 @@ public:
|
|||
uint64_t reserve_memory(double fraction);
|
||||
void release_reserved_memory(uint64_t reserved_memory);
|
||||
void run_eviction_thread();
|
||||
void do_partial_eviction(PAIR p);
|
||||
void do_partial_eviction(PAIR p, bool pair_mutex_held);
|
||||
void evict_pair(PAIR p, bool checkpoint_pending);
|
||||
void wait_for_cache_pressure_to_subside();
|
||||
void signal_eviction_thread();
|
||||
|
|
ft/cachetable.cc (151 changed lines)

@ -16,6 +16,7 @@
|
|||
#include "cachetable-internal.h"
|
||||
#include <memory.h>
|
||||
#include <toku_race_tools.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
#include <portability/toku_pthread.h>
|
||||
#include <portability/toku_time.h>
|
||||
#include <util/rwlock.h>
|
||||
|
@ -97,6 +98,10 @@ static inline void pair_unlock(PAIR p) {
|
|||
toku_mutex_unlock(p->mutex);
|
||||
}
|
||||
|
||||
bool toku_ctpair_is_write_locked(PAIR pair) {
|
||||
return pair->value_rwlock.writers() == 1;
|
||||
}
|
||||
|
||||
void
|
||||
toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS statp) {
|
||||
if (!ct_status.initialized) {
|
||||
|
@ -706,7 +711,7 @@ static void cachetable_evicter(void* extra) {
|
|||
static void cachetable_partial_eviction(void* extra) {
|
||||
PAIR p = (PAIR)extra;
|
||||
CACHEFILE cf = p->cachefile;
|
||||
p->ev->do_partial_eviction(p);
|
||||
p->ev->do_partial_eviction(p, false);
|
||||
bjm_remove_background_job(cf->bjm);
|
||||
}
|
||||
|
||||
|
@ -750,6 +755,7 @@ void pair_init(PAIR p,
|
|||
p->pe_est_callback = write_callback.pe_est_callback;
|
||||
p->cleaner_callback = write_callback.cleaner_callback;
|
||||
p->clone_callback = write_callback.clone_callback;
|
||||
p->checkpoint_complete_callback = write_callback.checkpoint_complete_callback;
|
||||
p->write_extraargs = write_callback.write_extraargs;
|
||||
|
||||
p->count = 0; // <CER> Is zero the correct init value?
|
||||
|
@ -915,6 +921,9 @@ checkpoint_cloned_pair_on_writer_thread(CACHETABLE ct, PAIR p) {
|
|||
static void
|
||||
write_locked_pair_for_checkpoint(CACHETABLE ct, PAIR p, bool checkpoint_pending)
|
||||
{
|
||||
if (checkpoint_pending && p->checkpoint_complete_callback) {
|
||||
p->checkpoint_complete_callback(p->value_data);
|
||||
}
|
||||
if (p->dirty && checkpoint_pending) {
|
||||
if (p->clone_callback) {
|
||||
pair_lock(p);
|
||||
|
@ -952,6 +961,9 @@ write_pair_for_checkpoint_thread (evictor* ev, PAIR p)
|
|||
// will be cheap. Also, much of the time we'll just be clearing
|
||||
// pending bits and that's definitely cheap. (see #5427)
|
||||
p->value_rwlock.write_lock(false);
|
||||
if (p->checkpoint_pending && p->checkpoint_complete_callback) {
|
||||
p->checkpoint_complete_callback(p->value_data);
|
||||
}
|
||||
if (p->dirty && p->checkpoint_pending) {
|
||||
if (p->clone_callback) {
|
||||
nb_mutex_lock(&p->disk_nb_mutex, p->mutex);
|
||||
|
@ -1726,73 +1738,100 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
|
|||
// For example, imagine that we can modify a bit in a dirty parent, or modify a bit in a clean child, then we should modify
|
||||
// the dirty parent (which will have to do I/O eventually anyway) rather than incur a full block write to modify one bit.
|
||||
// Similarly, if the checkpoint is actually pending, we don't want to block on it.
|
||||
int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, uint32_t fullhash, void**value) {
|
||||
int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, uint32_t fullhash, pair_lock_type lock_type, void**value) {
|
||||
CACHETABLE ct = cachefile->cachetable;
|
||||
int r = -1;
|
||||
ct->list.pair_lock_by_fullhash(fullhash);
|
||||
PAIR p = ct->list.find_pair(cachefile, key, fullhash);
|
||||
if (p && p->value_rwlock.try_write_lock(true)) {
|
||||
// we got the write lock fast, so continue
|
||||
ct->list.read_pending_cheap_lock();
|
||||
//
|
||||
// if pending a checkpoint, then we don't want to return
|
||||
// the value to the user, because we are responsible for
|
||||
// handling the checkpointing, which we do not want to do,
|
||||
// because it is expensive
|
||||
//
|
||||
if (!p->dirty || p->checkpoint_pending) {
|
||||
p->value_rwlock.write_unlock();
|
||||
r = -1;
|
||||
if (p) {
|
||||
const bool lock_is_expensive = (lock_type == PL_WRITE_EXPENSIVE);
|
||||
bool got_lock = false;
|
||||
switch (lock_type) {
|
||||
case PL_READ:
|
||||
if (p->value_rwlock.try_read_lock()) {
|
||||
got_lock = p->dirty;
|
||||
|
||||
if (!got_lock) {
|
||||
p->value_rwlock.read_unlock();
|
||||
}
|
||||
}
|
||||
break;
|
||||
case PL_WRITE_CHEAP:
|
||||
case PL_WRITE_EXPENSIVE:
|
||||
if (p->value_rwlock.try_write_lock(lock_is_expensive)) {
|
||||
// we got the lock fast, so continue
|
||||
ct->list.read_pending_cheap_lock();
|
||||
|
||||
// if pending a checkpoint, then we don't want to return
|
||||
// the value to the user, because we are responsible for
|
||||
// handling the checkpointing, which we do not want to do,
|
||||
// because it is expensive
|
||||
got_lock = p->dirty && !p->checkpoint_pending;
|
||||
|
||||
ct->list.read_pending_cheap_unlock();
|
||||
if (!got_lock) {
|
||||
p->value_rwlock.write_unlock();
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
else {
|
||||
if (got_lock) {
|
||||
pair_touch(p);
|
||||
*value = p->value_data;
|
||||
r = 0;
|
||||
}
|
||||
ct->list.read_pending_cheap_unlock();
|
||||
pair_unlock(p);
|
||||
}
|
||||
else {
|
||||
ct->list.pair_unlock_by_fullhash(fullhash);
|
||||
}
|
||||
ct->list.pair_unlock_by_fullhash(fullhash);
|
||||
return r;
|
||||
}
|
||||
|
||||
//Used by flusher threads to possibly pin child on client thread if pinning is cheap
|
||||
//Same as toku_cachetable_maybe_get_and_pin except that we don't care if the node is clean or dirty (return the node regardless).
|
||||
//All other conditions remain the same.
|
||||
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key, uint32_t fullhash, void**value) {
|
||||
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key, uint32_t fullhash, pair_lock_type lock_type, void**value) {
|
||||
CACHETABLE ct = cachefile->cachetable;
|
||||
int r = -1;
|
||||
ct->list.pair_lock_by_fullhash(fullhash);
|
||||
PAIR p = ct->list.find_pair(cachefile, key, fullhash);
|
||||
if (p && p->value_rwlock.try_write_lock(true)) {
|
||||
// got the write lock fast, so continue
|
||||
ct->list.read_pending_cheap_lock();
|
||||
//
|
||||
// if pending a checkpoint, then we don't want to return
|
||||
// the value to the user, because we are responsible for
|
||||
// handling the checkpointing, which we do not want to do,
|
||||
// because it is expensive
|
||||
//
|
||||
if (p->checkpoint_pending) {
|
||||
if (p->dirty) {
|
||||
p->value_rwlock.write_unlock();
|
||||
r = -1;
|
||||
if (p) {
|
||||
const bool lock_is_expensive = (lock_type == PL_WRITE_EXPENSIVE);
|
||||
bool got_lock = false;
|
||||
switch (lock_type) {
|
||||
case PL_READ:
|
||||
if (p->value_rwlock.try_read_lock()) {
|
||||
got_lock = true;
|
||||
} else if (!p->value_rwlock.read_lock_is_expensive()) {
|
||||
p->value_rwlock.write_lock(lock_is_expensive);
|
||||
got_lock = true;
|
||||
}
|
||||
else {
|
||||
p->checkpoint_pending = false;
|
||||
*value = p->value_data;
|
||||
r = 0;
|
||||
if (got_lock) {
|
||||
pair_touch(p);
|
||||
}
|
||||
pair_unlock(p);
|
||||
break;
|
||||
case PL_WRITE_CHEAP:
|
||||
case PL_WRITE_EXPENSIVE:
|
||||
if (p->value_rwlock.try_write_lock(lock_is_expensive)) {
|
||||
got_lock = true;
|
||||
} else if (!p->value_rwlock.write_lock_is_expensive()) {
|
||||
p->value_rwlock.write_lock(lock_is_expensive);
|
||||
got_lock = true;
|
||||
}
|
||||
if (got_lock) {
|
||||
pair_touch(p);
|
||||
}
|
||||
pair_unlock(p);
|
||||
if (got_lock) {
|
||||
bool checkpoint_pending = get_checkpoint_pending(p, &ct->list);
|
||||
write_locked_pair_for_checkpoint(ct, p, checkpoint_pending);
|
||||
}
|
||||
break;
|
||||
}
|
||||
else {
|
||||
if (got_lock) {
|
||||
*value = p->value_data;
|
||||
r = 0;
|
||||
}
|
||||
ct->list.read_pending_cheap_unlock();
|
||||
pair_unlock(p);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
ct->list.pair_unlock_by_fullhash(fullhash);
|
||||
}
|
||||
return r;
|
||||
|
@ -3524,7 +3563,7 @@ void evictor::change_pair_attr(PAIR_ATTR old_attr, PAIR_ATTR new_attr) {
|
|||
// the size of the cachetable.
|
||||
//
|
||||
void evictor::add_to_size_current(long size) {
|
||||
(void) __sync_fetch_and_add(&m_size_current, size);
|
||||
(void) toku_sync_fetch_and_add(&m_size_current, size);
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -3532,7 +3571,7 @@ void evictor::add_to_size_current(long size) {
|
|||
// approximation of the cachetable size.
|
||||
//
|
||||
void evictor::remove_from_size_current(long size) {
|
||||
(void) __sync_fetch_and_sub(&m_size_current, size);
|
||||
(void) toku_sync_fetch_and_sub(&m_size_current, size);
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -3543,7 +3582,7 @@ uint64_t evictor::reserve_memory(double fraction) {
|
|||
toku_mutex_lock(&m_ev_thread_lock);
|
||||
reserved_memory = fraction * (m_low_size_watermark - m_size_reserved);
|
||||
m_size_reserved += reserved_memory;
|
||||
(void) __sync_fetch_and_add(&m_size_current, reserved_memory);
|
||||
(void) toku_sync_fetch_and_add(&m_size_current, reserved_memory);
|
||||
this->signal_eviction_thread();
|
||||
toku_mutex_unlock(&m_ev_thread_lock);
|
||||
|
||||
|
@ -3557,7 +3596,7 @@ uint64_t evictor::reserve_memory(double fraction) {
|
|||
// TODO: (Zardosht) comment this function
|
||||
//
|
||||
void evictor::release_reserved_memory(uint64_t reserved_memory){
|
||||
(void) __sync_fetch_and_sub(&m_size_current, reserved_memory);
|
||||
(void) toku_sync_fetch_and_sub(&m_size_current, reserved_memory);
|
||||
toku_mutex_lock(&m_ev_thread_lock);
|
||||
m_size_reserved -= reserved_memory;
|
||||
// signal the eviction thread in order to possibly wake up sleeping clients
|
||||
|
@ -3710,7 +3749,6 @@ bool evictor::run_eviction_on_pair(PAIR curr_in_clock) {
|
|||
curr_in_clock->count--;
|
||||
// call the partial eviction callback
|
||||
curr_in_clock->value_rwlock.write_lock(true);
|
||||
pair_unlock(curr_in_clock);
|
||||
|
||||
void *value = curr_in_clock->value_data;
|
||||
void* disk_data = curr_in_clock->disk_data;
|
||||
|
@ -3726,13 +3764,15 @@ bool evictor::run_eviction_on_pair(PAIR curr_in_clock) {
|
|||
);
|
||||
if (cost == PE_CHEAP) {
|
||||
curr_in_clock->size_evicting_estimate = 0;
|
||||
this->do_partial_eviction(curr_in_clock);
|
||||
this->do_partial_eviction(curr_in_clock, true);
|
||||
bjm_remove_background_job(cf->bjm);
|
||||
pair_unlock(curr_in_clock);
|
||||
}
|
||||
else if (cost == PE_EXPENSIVE) {
|
||||
// only bother running an expensive partial eviction
|
||||
// if it is expected to free space
|
||||
if (bytes_freed_estimate > 0) {
|
||||
pair_unlock(curr_in_clock);
|
||||
curr_in_clock->size_evicting_estimate = bytes_freed_estimate;
|
||||
toku_mutex_lock(&m_ev_thread_lock);
|
||||
m_size_evicting += bytes_freed_estimate;
|
||||
|
@ -3744,7 +3784,6 @@ bool evictor::run_eviction_on_pair(PAIR curr_in_clock) {
|
|||
);
|
||||
}
|
||||
else {
|
||||
pair_lock(curr_in_clock);
|
||||
curr_in_clock->value_rwlock.write_unlock();
|
||||
pair_unlock(curr_in_clock);
|
||||
bjm_remove_background_job(cf->bjm);
|
||||
|
@ -3767,10 +3806,10 @@ exit:
|
|||
}
|
||||
|
||||
//
|
||||
// on entry, pair's mutex is not held, but pair is pinned
|
||||
// on entry and exit, pair's mutex is held if pair_mutex_held is true
|
||||
// on exit, PAIR is unpinned
|
||||
//
|
||||
void evictor::do_partial_eviction(PAIR p) {
|
||||
void evictor::do_partial_eviction(PAIR p, bool pair_mutex_held) {
|
||||
PAIR_ATTR new_attr;
|
||||
PAIR_ATTR old_attr = p->attr;
|
||||
|
||||
|
@ -3779,9 +3818,13 @@ void evictor::do_partial_eviction(PAIR p) {
|
|||
this->change_pair_attr(old_attr, new_attr);
|
||||
p->attr = new_attr;
|
||||
this->decrease_size_evicting(p->size_evicting_estimate);
|
||||
pair_lock(p);
|
||||
if (!pair_mutex_held) {
|
||||
pair_lock(p);
|
||||
}
|
||||
p->value_rwlock.write_unlock();
|
||||
pair_unlock(p);
|
||||
if (!pair_mutex_held) {
|
||||
pair_unlock(p);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
@ -173,12 +173,15 @@ typedef int (*CACHETABLE_CLEANER_CALLBACK)(void *ftnode_pv, BLOCKNUM blocknum, u
|
|||
|
||||
typedef void (*CACHETABLE_CLONE_CALLBACK)(void* value_data, void** cloned_value_data, PAIR_ATTR* new_attr, bool for_checkpoint, void* write_extraargs);
|
||||
|
||||
typedef void (*CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK)(void *value_data);
|
||||
|
||||
typedef struct {
|
||||
CACHETABLE_FLUSH_CALLBACK flush_callback;
|
||||
CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK pe_est_callback;
|
||||
CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback;
|
||||
CACHETABLE_CLEANER_CALLBACK cleaner_callback;
|
||||
CACHETABLE_CLONE_CALLBACK clone_callback;
|
||||
CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK checkpoint_complete_callback;
|
||||
void* write_extraargs; // parameter for flush_callback, pe_est_callback, pe_callback, and cleaner_callback
|
||||
} CACHETABLE_WRITE_CALLBACK;
|
||||
|
||||
|
@ -366,14 +369,14 @@ int toku_cachetable_get_and_pin_nonblocking (
|
|||
UNLOCKERS unlockers
|
||||
);
|
||||
|
||||
int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, uint32_t /*fullhash*/, void**);
|
||||
int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, uint32_t /*fullhash*/, pair_lock_type, void**);
|
||||
// Effect: Maybe get and pin a memory object.
|
||||
// This function is similar to the get_and_pin function except that it
|
||||
// will not attempt to fetch a memory object that is not in the cachetable or requires any kind of blocking to get it.
|
||||
// Returns: If the the item is already in memory, then return 0 and store it in the
|
||||
// void**. If the item is not in memory, then return a nonzero error number.
|
||||
|
||||
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE, CACHEKEY, uint32_t /*fullhash*/, void**);
|
||||
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE, CACHEKEY, uint32_t /*fullhash*/, pair_lock_type, void**);
|
||||
// Effect: Like maybe get and pin, but may pin a clean pair.
|
||||
|
||||
int toku_cachetable_unpin(CACHEFILE, PAIR, enum cachetable_dirty dirty, PAIR_ATTR size);
|
||||
|
@ -556,5 +559,10 @@ int toku_cleaner_thread(void *cleaner_v);
|
|||
// The default of 1M is too high for drd tests, so this is a mechanism to set a smaller number.
|
||||
void toku_pair_list_set_lock_size(uint32_t num_locks);
|
||||
|
||||
// Used by ft-ops.cc to figure out if it has the write lock on a pair.
|
||||
// Pretty hacky and not accurate enough, should be improved at the frwlock
|
||||
// layer.
|
||||
__attribute__((const,nonnull))
|
||||
bool toku_ctpair_is_write_locked(PAIR pair);
|
||||
|
||||
#endif /* CACHETABLE_H */
|
||||
|
|
|
@ -49,6 +49,7 @@
|
|||
#include "log-internal.h"
|
||||
#include "logger.h"
|
||||
#include "checkpoint.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// Engine status
|
||||
|
@ -173,7 +174,7 @@ checkpoint_safe_checkpoint_unlock(void) {
|
|||
void
|
||||
toku_multi_operation_client_lock(void) {
|
||||
if (locked_mo)
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_MO), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_MO), 1);
|
||||
toku_pthread_rwlock_rdlock(&multi_operation_lock);
|
||||
}
|
||||
|
||||
|
@ -185,7 +186,7 @@ toku_multi_operation_client_unlock(void) {
|
|||
void
|
||||
toku_checkpoint_safe_client_lock(void) {
|
||||
if (locked_cs)
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_CS), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_CS), 1);
|
||||
toku_pthread_rwlock_rdlock(&checkpoint_safe_lock);
|
||||
toku_multi_operation_client_lock();
|
||||
}
|
||||
|
@ -227,9 +228,9 @@ toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
|
|||
|
||||
assert(initialized);
|
||||
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAITERS_NOW), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(CP_WAITERS_NOW), 1);
|
||||
checkpoint_safe_checkpoint_lock();
|
||||
(void) __sync_fetch_and_sub(&STATUS_VALUE(CP_WAITERS_NOW), 1);
|
||||
(void) toku_sync_fetch_and_sub(&STATUS_VALUE(CP_WAITERS_NOW), 1);
|
||||
|
||||
if (STATUS_VALUE(CP_WAITERS_NOW) > STATUS_VALUE(CP_WAITERS_MAX))
|
||||
STATUS_VALUE(CP_WAITERS_MAX) = STATUS_VALUE(CP_WAITERS_NOW); // threadsafe, within checkpoint_safe lock
|
||||
|
|
ft/fifo.cc (14 changed lines)

@ -25,12 +25,23 @@ static void fifo_init(struct fifo *fifo) {
|
|||
fifo->memory_used = 0;
|
||||
}
|
||||
|
||||
__attribute__((const,nonnull))
|
||||
static int fifo_entry_size(struct fifo_entry *entry) {
|
||||
return sizeof (struct fifo_entry) + entry->keylen + entry->vallen
|
||||
+ xids_get_size(&entry->xids_s)
|
||||
- sizeof(XIDS_S); //Prevent double counting from fifo_entry+xids_get_size
|
||||
}
|
||||
|
||||
__attribute__((const,nonnull))
|
||||
size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd) {
|
||||
// This must stay in sync with fifo_entry_size because that's what we
|
||||
// really trust. But sometimes we only have an in-memory FT_MSG, not
|
||||
// a serialized fifo_entry so we have to fake it.
|
||||
return sizeof (struct fifo_entry) + cmd->u.id.key->size + cmd->u.id.val->size
|
||||
+ xids_get_size(cmd->xids)
|
||||
- sizeof(XIDS_S);
|
||||
}
|
||||
|
||||
int toku_fifo_create(FIFO *ptr) {
|
||||
struct fifo *XMALLOC(fifo);
|
||||
if (fifo == 0) return ENOMEM;
|
||||
|
@ -112,6 +123,9 @@ int toku_fifo_iterate_internal_next(FIFO fifo, int off) {
|
|||
struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off) {
|
||||
return (struct fifo_entry *)(fifo->memory + off);
|
||||
}
|
||||
size_t toku_fifo_internal_entry_memsize(struct fifo_entry *e) {
|
||||
return fifo_entry_size(e);
|
||||
}
|
||||
|
||||
void toku_fifo_iterate (FIFO fifo, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, void*), void *arg) {
|
||||
FIFO_ITERATE(fifo,
|
||||
|
|
ft/fifo.h (30 changed lines)

@ -68,26 +68,30 @@ unsigned long toku_fifo_memory_footprint(FIFO fifo); // return how much memory
|
|||
void toku_fifo_iterate(FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, void*), void*);
|
||||
|
||||
#define FIFO_ITERATE(fifo,keyvar,keylenvar,datavar,datalenvar,typevar,msnvar,xidsvar,is_freshvar,body) ({ \
|
||||
for (int fifo_iterate_off = toku_fifo_iterate_internal_start(fifo); \
|
||||
toku_fifo_iterate_internal_has_more(fifo, fifo_iterate_off); \
|
||||
fifo_iterate_off = toku_fifo_iterate_internal_next(fifo, fifo_iterate_off)) { \
|
||||
struct fifo_entry *e = toku_fifo_iterate_internal_get_entry(fifo, fifo_iterate_off); \
|
||||
ITEMLEN keylenvar = e->keylen; \
|
||||
ITEMLEN datalenvar = e->vallen; \
|
||||
enum ft_msg_type typevar = fifo_entry_get_msg_type(e); \
|
||||
MSN msnvar = e->msn; \
|
||||
XIDS xidsvar = &e->xids_s; \
|
||||
bytevec keyvar = xids_get_end_of_array(xidsvar); \
|
||||
bytevec datavar = (const uint8_t*)keyvar + e->keylen; \
|
||||
bool is_freshvar = e->is_fresh; \
|
||||
body; \
|
||||
for (int fifo_iterate_off = toku_fifo_iterate_internal_start(fifo); \
|
||||
toku_fifo_iterate_internal_has_more(fifo, fifo_iterate_off); \
|
||||
fifo_iterate_off = toku_fifo_iterate_internal_next(fifo, fifo_iterate_off)) { \
|
||||
struct fifo_entry *e = toku_fifo_iterate_internal_get_entry(fifo, fifo_iterate_off); \
|
||||
ITEMLEN keylenvar = e->keylen; \
|
||||
ITEMLEN datalenvar = e->vallen; \
|
||||
enum ft_msg_type typevar = fifo_entry_get_msg_type(e); \
|
||||
MSN msnvar = e->msn; \
|
||||
XIDS xidsvar = &e->xids_s; \
|
||||
bytevec keyvar = xids_get_end_of_array(xidsvar); \
|
||||
bytevec datavar = (const uint8_t*)keyvar + e->keylen; \
|
||||
bool is_freshvar = e->is_fresh; \
|
||||
body; \
|
||||
} })
|
||||
|
||||
#define FIFO_CURRENT_ENTRY_MEMSIZE toku_fifo_internal_entry_memsize(e)
|
||||
|
||||
// Internal functions for the iterator.
|
||||
int toku_fifo_iterate_internal_start(FIFO fifo);
|
||||
int toku_fifo_iterate_internal_has_more(FIFO fifo, int off);
|
||||
int toku_fifo_iterate_internal_next(FIFO fifo, int off);
|
||||
struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off);
|
||||
size_t toku_fifo_internal_entry_memsize(struct fifo_entry *e) __attribute__((const,nonnull));
|
||||
size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd) __attribute__((const,nonnull));
|
||||
|
||||
DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry);
|
||||
struct fifo_entry *toku_fifo_get_entry(FIFO fifo, int off);
|
||||
|
|
|
@ -147,7 +147,7 @@ try_again_for_write_lock:
|
|||
if (apply_ancestor_messages && node->height == 0) {
|
||||
needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(brt->ft, node, ancestors, bounds, &max_msn_in_path);
|
||||
if (needs_ancestors_messages && needed_lock_type == PL_READ) {
|
||||
toku_unpin_ftnode_read_only(brt, node);
|
||||
toku_unpin_ftnode_read_only(brt->ft, node);
|
||||
needed_lock_type = PL_WRITE_CHEAP;
|
||||
goto try_again_for_write_lock;
|
||||
}
|
||||
|
@ -296,14 +296,14 @@ toku_pin_ftnode_off_client_thread_batched(
|
|||
h, blocknum, fullhash, bfe, lock_type, num_dependent_nodes, dependent_nodes, node_p, true);
|
||||
}
|
||||
|
||||
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, FTNODE *nodep) {
|
||||
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pair_lock_type lock_type, FTNODE *nodep) {
|
||||
void *node_v;
|
||||
int r = toku_cachetable_maybe_get_and_pin_clean(ft->cf, blocknum, fullhash, &node_v);
|
||||
int r = toku_cachetable_maybe_get_and_pin_clean(ft->cf, blocknum, fullhash, lock_type, &node_v);
|
||||
if (r != 0) {
|
||||
goto cleanup;
|
||||
}
|
||||
CAST_FROM_VOIDP(*nodep, node_v);
|
||||
if ((*nodep)->height > 0) {
|
||||
if ((*nodep)->height > 0 && lock_type != PL_READ) {
|
||||
toku_move_ftnode_messages_to_stale(ft, *nodep);
|
||||
}
|
||||
cleanup:
|
||||
|
@ -331,14 +331,13 @@ toku_unpin_ftnode(FT ft, FTNODE node)
|
|||
}
|
||||
|
||||
void
|
||||
toku_unpin_ftnode_read_only(FT_HANDLE brt, FTNODE node)
|
||||
toku_unpin_ftnode_read_only(FT ft, FTNODE node)
|
||||
{
|
||||
int r = toku_cachetable_unpin(
|
||||
brt->ft->cf,
|
||||
ft->cf,
|
||||
node->ct_pair,
|
||||
(enum cachetable_dirty) node->dirty,
|
||||
make_invalid_pair_attr()
|
||||
);
|
||||
assert(r==0);
|
||||
}
|
||||
|
||||
|
|
|
@ -108,7 +108,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
|
|||
* This function may return a pinned ftnode to the caller, if pinning is cheap.
|
||||
* If the node is already locked, or is pending a checkpoint, the node is not pinned and -1 is returned.
|
||||
*/
|
||||
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, FTNODE *nodep);
|
||||
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pair_lock_type lock_type, FTNODE *nodep);
|
||||
|
||||
/**
|
||||
* Batched version of toku_pin_ftnode_off_client_thread, see cachetable
|
||||
|
@ -158,6 +158,6 @@ void
|
|||
toku_unpin_ftnode(FT h, FTNODE node);
|
||||
|
||||
void
|
||||
toku_unpin_ftnode_read_only(FT_HANDLE brt, FTNODE node);
|
||||
toku_unpin_ftnode_read_only(FT ft, FTNODE node);
|
||||
|
||||
#endif
|
||||
|
|
ft/ft-flusher.cc (332 changed lines)

@ -9,6 +9,8 @@
|
|||
#include <ft-flusher-internal.h>
|
||||
#include <ft-cachetable-wrappers.h>
|
||||
#include <ft.h>
|
||||
#include <toku_assert.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
/* Status is intended for display to humans to help understand system behavior.
|
||||
* It does not need to be perfectly thread-safe.
|
||||
|
@ -98,11 +100,13 @@ find_heaviest_child(FTNODE node)
|
|||
int i;
|
||||
|
||||
if (0) printf("%s:%d weights: %d", __FILE__, __LINE__, max_weight);
|
||||
assert(node->n_children>0);
|
||||
paranoid_invariant(node->n_children>0);
|
||||
for (i=1; i<node->n_children; i++) {
|
||||
#ifdef TOKU_DEBUG_PARANOID
|
||||
if (BP_WORKDONE(node,i)) {
|
||||
assert(toku_bnc_nbytesinbuf(BNC(node,i)) > 0);
|
||||
}
|
||||
#endif
|
||||
int this_weight = toku_bnc_nbytesinbuf(BNC(node,i)) + BP_WORKDONE(node,i);;
|
||||
if (0) printf(" %d", this_weight);
|
||||
if (max_weight < this_weight) {
|
||||
|
@ -180,7 +184,7 @@ pick_heaviest_child(FT UU(h),
|
|||
void* UU(extra))
|
||||
{
|
||||
int childnum = find_heaviest_child(parent);
|
||||
assert(toku_bnc_n_entries(BNC(parent, childnum))>0);
|
||||
paranoid_invariant(toku_bnc_n_entries(BNC(parent, childnum))>0);
|
||||
return childnum;
|
||||
}
|
||||
|
||||
|
@ -348,7 +352,7 @@ ctm_maybe_merge_child(struct flusher_advice *fa,
|
|||
void *extra)
|
||||
{
|
||||
if (child->height == 0) {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_COMPLETED), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_COMPLETED), 1);
|
||||
}
|
||||
default_merge_child(fa, h, parent, childnum, child, extra);
|
||||
}
|
||||
|
@ -366,7 +370,7 @@ ct_maybe_merge_child(struct flusher_advice *fa,
|
|||
}
|
||||
else {
|
||||
struct ctm_extra ctme;
|
||||
assert(parent->n_children > 1);
|
||||
paranoid_invariant(parent->n_children > 1);
|
||||
int pivot_to_save;
|
||||
//
|
||||
// we have two cases, one where the childnum
|
||||
|
@ -413,12 +417,12 @@ ct_maybe_merge_child(struct flusher_advice *fa,
|
|||
toku_assert_entire_node_in_memory(root_node);
|
||||
}
|
||||
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_STARTED), 1);
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_RUNNING), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_STARTED), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_RUNNING), 1);
|
||||
|
||||
flush_some_child(h, root_node, &new_fa);
|
||||
|
||||
(void) __sync_fetch_and_sub(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_RUNNING), 1);
|
||||
(void) toku_sync_fetch_and_sub(&STATUS_VALUE(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_RUNNING), 1);
|
||||
|
||||
toku_free(ctme.target_key.data);
|
||||
}
|
||||
|
@ -483,13 +487,14 @@ handle_split_of_child(
|
|||
DBT *splitk /* the data in the childsplitk is alloc'd and is consumed by this call. */
|
||||
)
|
||||
{
|
||||
assert(node->height>0);
|
||||
assert(0 <= childnum && childnum < node->n_children);
|
||||
paranoid_invariant(node->height>0);
|
||||
paranoid_invariant(0 <= childnum);
|
||||
paranoid_invariant(childnum < node->n_children);
|
||||
toku_assert_entire_node_in_memory(node);
|
||||
toku_assert_entire_node_in_memory(childa);
|
||||
toku_assert_entire_node_in_memory(childb);
|
||||
int old_count = toku_bnc_nbytesinbuf(BNC(node, childnum));
|
||||
assert(old_count==0);
|
||||
NONLEAF_CHILDINFO old_bnc = BNC(node, childnum);
|
||||
paranoid_invariant(toku_bnc_nbytesinbuf(old_bnc)==0);
|
||||
int cnum;
|
||||
WHEN_NOT_GCOV(
|
||||
if (toku_ft_debug_mode) {
|
||||
|
@ -515,13 +520,20 @@ handle_split_of_child(
|
|||
memset(&node->bp[childnum+1],0,sizeof(node->bp[0]));
|
||||
node->n_children++;
|
||||
|
||||
assert(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child
|
||||
paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child
|
||||
|
||||
BP_BLOCKNUM(node, childnum+1) = childb->thisnodename;
|
||||
BP_WORKDONE(node, childnum+1) = 0;
|
||||
BP_STATE(node,childnum+1) = PT_AVAIL;
|
||||
|
||||
set_BNC(node, childnum+1, toku_create_empty_nl());
|
||||
NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
|
||||
for (unsigned int i = 0; i < (sizeof new_bnc->flow) / (sizeof new_bnc->flow[0]); ++i) {
|
||||
// just split the flows in half for now, can't guess much better
|
||||
// at the moment
|
||||
new_bnc->flow[i] = old_bnc->flow[i] / 2;
|
||||
old_bnc->flow[i] = (old_bnc->flow[i] + 1) / 2;
|
||||
}
|
||||
set_BNC(node, childnum+1, new_bnc);
|
||||
|
||||
// Slide the keys over
|
||||
{
|
||||
|
@ -553,7 +565,7 @@ handle_split_of_child(
|
|||
}
|
||||
|
||||
static int
|
||||
verify_in_mempool(OMTVALUE lev, uint32_t UU(idx), void *mpv)
|
||||
UU() verify_in_mempool(OMTVALUE lev, uint32_t UU(idx), void *mpv)
|
||||
{
|
||||
LEAFENTRY CAST_FROM_VOIDP(le, lev);
|
||||
struct mempool *CAST_FROM_VOIDP(mp, mpv);
|
||||
|
@ -563,8 +575,9 @@ verify_in_mempool(OMTVALUE lev, uint32_t UU(idx), void *mpv)
|
|||
}
|
||||
|
||||
static void
|
||||
verify_all_in_mempool(FTNODE node)
|
||||
verify_all_in_mempool(FTNODE UU() node)
|
||||
{
|
||||
#ifdef TOKU_DEBUG_PARANOID
|
||||
if (node->height==0) {
|
||||
for (int i = 0; i < node->n_children; i++) {
|
||||
invariant(BP_STATE(node,i) == PT_AVAIL);
|
||||
|
@ -572,13 +585,14 @@ verify_all_in_mempool(FTNODE node)
|
|||
toku_omt_iterate(bn->buffer, verify_in_mempool, &bn->buffer_mempool);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
ftleaf_disk_size(FTNODE node)
|
||||
// Effect: get the disk size of a leafentry
|
||||
{
|
||||
assert(node->height == 0);
|
||||
paranoid_invariant(node->height == 0);
|
||||
toku_assert_entire_node_in_memory(node);
|
||||
uint64_t retval = 0;
|
||||
for (int i = 0; i < node->n_children; i++) {
|
||||
|
@ -587,8 +601,8 @@ ftleaf_disk_size(FTNODE node)
|
|||
for (uint32_t j=0; j < n_leafentries; j++) {
|
||||
OMTVALUE v;
|
||||
int r = toku_omt_fetch(curr_buffer, j, &v);
|
||||
LEAFENTRY CAST_FROM_VOIDP(curr_le, v);
|
||||
assert_zero(r);
|
||||
LEAFENTRY CAST_FROM_VOIDP(curr_le, v);
|
||||
retval += leafentry_disksize(curr_le);
|
||||
}
|
||||
}
|
||||
|
@ -598,47 +612,69 @@ ftleaf_disk_size(FTNODE node)
|
|||
static void
|
||||
ftleaf_get_split_loc(
|
||||
FTNODE node,
|
||||
uint64_t sumlesizes,
|
||||
int* bn_index, // which basement within leaf
|
||||
int* le_index // which key within basement
|
||||
enum split_mode split_mode,
|
||||
int *num_left_bns, // which basement within leaf
|
||||
int *num_left_les // which key within basement
|
||||
)
|
||||
// Effect: Find the location within a leaf node where we want to perform a split
|
||||
// bn_index is which basement node (which OMT) should be split.
|
||||
// le_index is index into OMT of the last key that should be on the left side of the split.
|
||||
// num_left_bns is how many basement nodes (which OMT) should be split to the left.
|
||||
// num_left_les is how many leafentries in OMT of the last bn should be on the left side of the split.
|
||||
{
|
||||
assert(node->height == 0);
|
||||
uint32_t size_so_far = 0;
|
||||
for (int i = 0; i < node->n_children; i++) {
|
||||
OMT curr_buffer = BLB_BUFFER(node, i);
|
||||
uint32_t n_leafentries = toku_omt_size(curr_buffer);
|
||||
for (uint32_t j=0; j < n_leafentries; j++) {
|
||||
OMTVALUE lev;
|
||||
int r = toku_omt_fetch(curr_buffer, j, &lev);
|
||||
assert_zero(r);
|
||||
LEAFENTRY CAST_FROM_VOIDP(curr_le, lev);
|
||||
size_so_far += leafentry_disksize(curr_le);
|
||||
if (size_so_far >= sumlesizes/2) {
|
||||
*bn_index = i;
|
||||
*le_index = j;
|
||||
if ((*bn_index == node->n_children - 1) &&
|
||||
((unsigned int) *le_index == n_leafentries - 1)) {
|
||||
// need to correct for when we're splitting after the
|
||||
// last element, that makes no sense
|
||||
if (*le_index > 0) {
|
||||
(*le_index)--;
|
||||
} else if (*bn_index > 0) {
|
||||
(*bn_index)--;
|
||||
*le_index = toku_omt_size(BLB_BUFFER(node, *bn_index)) - 1;
|
||||
} else {
|
||||
// we are trying to split a leaf with only one
|
||||
// leafentry in it
|
||||
abort();
|
||||
switch (split_mode) {
|
||||
case SPLIT_LEFT_HEAVY: {
|
||||
*num_left_bns = node->n_children;
|
||||
*num_left_les = toku_omt_size(BLB_BUFFER(node, *num_left_bns - 1));
|
||||
if (*num_left_les == 0) {
|
||||
*num_left_bns = node->n_children - 1;
|
||||
*num_left_les = toku_omt_size(BLB_BUFFER(node, *num_left_bns - 1));
|
||||
invariant(*num_left_les > 0);
|
||||
}
|
||||
goto exit;
|
||||
}
|
||||
case SPLIT_RIGHT_HEAVY: {
|
||||
*num_left_bns = 1;
|
||||
*num_left_les = 1;
|
||||
goto exit;
|
||||
}
|
||||
case SPLIT_EVENLY: {
|
||||
paranoid_invariant(node->height == 0);
|
||||
// TODO: (Zardosht) see if we can/should make this faster, we iterate over the rows twice
|
||||
uint64_t sumlesizes = ftleaf_disk_size(node);
|
||||
uint32_t size_so_far = 0;
|
||||
for (int i = 0; i < node->n_children; i++) {
|
||||
OMT curr_buffer = BLB_BUFFER(node, i);
|
||||
uint32_t n_leafentries = toku_omt_size(curr_buffer);
|
||||
for (uint32_t j=0; j < n_leafentries; j++) {
|
||||
OMTVALUE lev;
|
||||
int r = toku_omt_fetch(curr_buffer, j, &lev);
|
||||
assert_zero(r);
|
||||
LEAFENTRY CAST_FROM_VOIDP(curr_le, lev);
|
||||
size_so_far += leafentry_disksize(curr_le);
|
||||
if (size_so_far >= sumlesizes/2) {
|
||||
*num_left_bns = i + 1;
|
||||
*num_left_les = j + 1;
|
||||
if (*num_left_bns == node->n_children &&
|
||||
(unsigned int) *num_left_les == n_leafentries) {
|
||||
// need to correct for when we're splitting after the
|
||||
// last element, that makes no sense
|
||||
if (*num_left_les > 1) {
|
||||
(*num_left_les)--;
|
||||
} else if (*num_left_bns > 1) {
|
||||
(*num_left_bns)--;
|
||||
*num_left_les = toku_omt_size(BLB_BUFFER(node, *num_left_bns - 1));
|
||||
} else {
|
||||
// we are trying to split a leaf with only one
|
||||
// leafentry in it
|
||||
abort();
|
||||
}
|
||||
}
|
||||
goto exit;
|
||||
}
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
abort();
|
||||
exit:
|
||||
return;
|
||||
}
|
||||
|
@ -655,7 +691,7 @@ move_leafentries(
|
|||
)
|
||||
//Effect: move leafentries in the range [lbi, upe) from src_omt to newly created dest_omt
|
||||
{
|
||||
assert(lbi < ube);
|
||||
paranoid_invariant(lbi < ube);
|
||||
OMTVALUE *XMALLOC_N(ube-lbi, newleafpointers); // create new omt
|
||||
|
||||
size_t mpsize = toku_mempool_get_used_space(&src_bn->buffer_mempool); // overkill, but safe
|
||||
|
@ -692,6 +728,7 @@ ftleaf_split(
|
|||
FTNODE *nodeb,
|
||||
DBT *splitk,
|
||||
bool create_new_node,
|
||||
enum split_mode split_mode,
|
||||
uint32_t num_dependent_nodes,
|
||||
FTNODE* dependent_nodes)
|
||||
// Effect: Split a leaf node.
|
||||
|
@ -702,7 +739,7 @@ ftleaf_split(
|
|||
// splitk is the right-most key of nodea
|
||||
{
|
||||
|
||||
invariant(node->height == 0);
|
||||
paranoid_invariant(node->height == 0);
|
||||
STATUS_VALUE(FT_FLUSHER_SPLIT_LEAF)++;
|
||||
if (node->n_children) {
|
||||
// First move all the accumulated stat64info deltas into the first basement.
|
||||
|
@ -744,31 +781,20 @@ ftleaf_split(
|
|||
}
|
||||
|
||||
|
||||
assert(node->height==0);
|
||||
paranoid_invariant(node->height==0);
|
||||
toku_assert_entire_node_in_memory(node);
|
||||
verify_all_in_mempool(node);
|
||||
MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;
|
||||
|
||||
// variables that say where we will do the split. We do it in the basement node indexed at
|
||||
// at last_bn_on_left and at the index last_le_on_left_within_bn within that basement node.
|
||||
int last_bn_on_left = 0; // last_bn_on_left may or may not be fully included
|
||||
int last_le_on_left_within_bn = 0;
|
||||
// variables that say where we will do the split.
|
||||
// After the split, there will be num_left_bns basement nodes in the left node,
|
||||
// and the last basement node in the left node will have num_left_les leafentries.
|
||||
int num_left_bns;
|
||||
int num_left_les;
|
||||
ftleaf_get_split_loc(node, split_mode, &num_left_bns, &num_left_les);
|
||||
{
|
||||
{
|
||||
// TODO: (Zardosht) see if we can/should make this faster, we iterate over the rows twice
|
||||
uint64_t sumlesizes=0;
|
||||
sumlesizes = ftleaf_disk_size(node);
|
||||
// TODO: (Zardosht) #3537, figure out serial insertion optimization again later
|
||||
// split in half
|
||||
ftleaf_get_split_loc(
|
||||
node,
|
||||
sumlesizes,
|
||||
&last_bn_on_left,
|
||||
&last_le_on_left_within_bn
|
||||
);
|
||||
}
|
||||
// did we split right on the boundary between basement nodes?
|
||||
const bool split_on_boundary = (last_le_on_left_within_bn == (int) toku_omt_size(BLB_BUFFER(node, last_bn_on_left)) - 1);
|
||||
const bool split_on_boundary = (num_left_les == 0) || (num_left_les == (int) toku_omt_size(BLB_BUFFER(node, num_left_bns - 1)));
|
||||
// Now we know where we are going to break it
|
||||
// the two nodes will have a total of n_children+1 basement nodes
|
||||
// and n_children-1 pivots
|
||||
|
@ -781,8 +807,17 @@ ftleaf_split(
|
|||
// (if split_on_boundary is false) will be affected. All other mempools will remain intact. ???
|
||||
|
||||
//set up the basement nodes in the new node
|
||||
int num_children_in_node = last_bn_on_left + 1;
|
||||
int num_children_in_b = node->n_children - last_bn_on_left - (split_on_boundary ? 1 : 0);
|
||||
int num_children_in_node = num_left_bns;
|
||||
// In the SPLIT_RIGHT_HEAVY case, we need to add 1 back because
|
||||
// while it's not on the boundary, we do need node->n_children
|
||||
// children in B.
|
||||
int num_children_in_b = node->n_children - num_left_bns + (!split_on_boundary ? 1 : 0);
|
||||
if (num_children_in_b == 0) {
|
||||
// for uneven split, make sure we have at least 1 bn
|
||||
paranoid_invariant(split_mode == SPLIT_LEFT_HEAVY);
|
||||
num_children_in_b = 1;
|
||||
}
|
||||
paranoid_invariant(num_children_in_node > 0);
|
||||
if (create_new_node) {
|
||||
toku_initialize_empty_ftnode(
|
||||
B,
|
||||
|
@ -808,19 +843,19 @@ ftleaf_split(
|
|||
|
||||
// now move all the data
|
||||
|
||||
int curr_src_bn_index = last_bn_on_left;
|
||||
int curr_src_bn_index = num_left_bns - 1;
|
||||
int curr_dest_bn_index = 0;
|
||||
|
||||
// handle the move of a subset of data in last_bn_on_left from node to B
|
||||
if (!split_on_boundary) {
|
||||
BP_STATE(B,curr_dest_bn_index) = PT_AVAIL;
|
||||
uint32_t diff_size = 0;
|
||||
destroy_basement_node (BLB(B, curr_dest_bn_index)); // Destroy B's empty OMT, so I can rebuild it from an array
|
||||
destroy_basement_node(BLB(B, curr_dest_bn_index)); // Destroy B's empty OMT, so I can rebuild it from an array
|
||||
set_BNULL(B, curr_dest_bn_index);
|
||||
set_BLB(B, curr_dest_bn_index, toku_create_empty_bn_no_buffer());
|
||||
move_leafentries(BLB(B, curr_dest_bn_index),
|
||||
BLB(node, curr_src_bn_index),
|
||||
last_le_on_left_within_bn+1, // first row to be moved to B
|
||||
num_left_les, // first row to be moved to B
|
||||
toku_omt_size(BLB_BUFFER(node, curr_src_bn_index)), // number of rows in basement to be split
|
||||
&diff_size);
|
||||
BLB_MAX_MSN_APPLIED(B, curr_dest_bn_index) = BLB_MAX_MSN_APPLIED(node, curr_src_bn_index);
|
||||
|
@ -830,15 +865,20 @@ ftleaf_split(
|
|||
}
|
||||
curr_src_bn_index++;
|
||||
|
||||
invariant(B->n_children >= curr_dest_bn_index);
|
||||
invariant(node->n_children >= curr_src_bn_index);
|
||||
invariant(B->n_children - curr_dest_bn_index == node->n_children - curr_src_bn_index);
|
||||
paranoid_invariant(B->n_children >= curr_dest_bn_index);
|
||||
paranoid_invariant(node->n_children >= curr_src_bn_index);
|
||||
|
||||
// move the rest of the basement nodes
|
||||
for ( ; curr_src_bn_index < node->n_children; curr_src_bn_index++, curr_dest_bn_index++) {
|
||||
destroy_basement_node(BLB(B, curr_dest_bn_index));
|
||||
set_BNULL(B, curr_dest_bn_index);
|
||||
B->bp[curr_dest_bn_index] = node->bp[curr_src_bn_index];
|
||||
}
|
||||
if (curr_dest_bn_index < B->n_children) {
|
||||
// B already has an empty basement node here.
|
||||
BP_STATE(B, curr_dest_bn_index) = PT_AVAIL;
|
||||
}
|
||||
|
||||
node->n_children = num_children_in_node;
|
||||
|
||||
//
|
||||
|
@ -847,7 +887,7 @@ ftleaf_split(
|
|||
|
||||
// the child index in the original node that corresponds to the
|
||||
// first node in the right node of the split
|
||||
int base_index = (split_on_boundary ? last_bn_on_left + 1 : last_bn_on_left);
|
||||
int base_index = num_left_bns - (split_on_boundary ? 0 : 1);
|
||||
// make pivots in B
|
||||
for (int i=0; i < num_children_in_b-1; i++) {
|
||||
toku_copyref_dbt(&B->childkeys[i], node->childkeys[i+base_index]);
|
||||
|
@ -855,10 +895,10 @@ ftleaf_split(
|
|||
node->totalchildkeylens -= node->childkeys[i+base_index].size;
|
||||
toku_init_dbt(&node->childkeys[i+base_index]);
|
||||
}
|
||||
if (split_on_boundary) {
|
||||
if (split_on_boundary && split_mode != SPLIT_LEFT_HEAVY) {
|
||||
// destroy the extra childkey between the nodes, we'll
|
||||
// recreate it in splitk below
|
||||
toku_free(node->childkeys[last_bn_on_left].data);
|
||||
toku_free(node->childkeys[num_left_bns - 1].data);
|
||||
}
|
||||
REALLOC_N(num_children_in_node, node->bp);
|
||||
REALLOC_N(num_children_in_node-1, node->childkeys);
|
||||
|
@ -867,7 +907,7 @@ ftleaf_split(
|
|||
if (splitk) {
|
||||
memset(splitk, 0, sizeof *splitk);
|
||||
OMTVALUE lev;
|
||||
OMT buffer = BLB_BUFFER(node, last_bn_on_left);
|
||||
OMT buffer = BLB_BUFFER(node, num_left_bns - 1);
|
||||
int r = toku_omt_fetch(buffer, toku_omt_size(buffer) - 1, &lev);
|
||||
assert_zero(r); // that fetch should have worked.
|
||||
LEAFENTRY CAST_FROM_VOIDP(le, lev);
|
||||
|
@ -908,8 +948,8 @@ ft_nonleaf_split(
|
|||
int n_children_in_b = old_n_children-n_children_in_a;
|
||||
MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;
|
||||
FTNODE B;
|
||||
assert(node->height>0);
|
||||
assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */
|
||||
paranoid_invariant(node->height>0);
|
||||
paranoid_invariant(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */
|
||||
create_new_ftnode_with_dep_nodes(h, &B, node->height, n_children_in_b, num_dependent_nodes, dependent_nodes);
|
||||
{
|
||||
/* The first n_children_in_a go into node a.
|
||||
|
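A recurring change in this commit is downgrading assert() and invariant() calls on hot paths to paranoid_invariant(). The macro's definition is not part of this diff; a plausible minimal form, assuming a build-time paranoid-checks flag (the flag name below is an assumption, not taken from this diff), is:

// Assumed minimal shape of paranoid_invariant: a real check in paranoid
// debug builds, a no-op otherwise (sketch only).
#ifdef TOKU_DEBUG_PARANOID
#define paranoid_invariant(p) invariant(p)
#else
#define paranoid_invariant(p) ((void) 0)
#endif

Under that assumption, release builds skip these checks entirely, while the invariant() and assert_zero() calls left in place continue to fire in every build.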
@ -932,7 +972,7 @@ ft_nonleaf_split(
|
|||
|
||||
// Delete a child, removing the preceding pivot key. The child number must be > 0
|
||||
{
|
||||
assert(i>0);
|
||||
paranoid_invariant(i>0);
|
||||
if (i>n_children_in_a) {
|
||||
toku_copyref_dbt(&B->childkeys[targchild-1], node->childkeys[i-1]);
|
||||
B->totalchildkeylens += node->childkeys[i-1].size;
|
||||
|
@ -978,10 +1018,11 @@ ft_split_child(
|
|||
FTNODE node,
|
||||
int childnum,
|
||||
FTNODE child,
|
||||
enum split_mode split_mode,
|
||||
struct flusher_advice *fa)
|
||||
{
|
||||
assert(node->height>0);
|
||||
assert(toku_bnc_nbytesinbuf(BNC(node, childnum))==0); // require that the buffer for this child is empty
|
||||
paranoid_invariant(node->height>0);
|
||||
paranoid_invariant(toku_bnc_nbytesinbuf(BNC(node, childnum))==0); // require that the buffer for this child is empty
|
||||
FTNODE nodea, nodeb;
|
||||
DBT splitk;
|
||||
|
||||
|
@ -992,7 +1033,7 @@ ft_split_child(
|
|||
dep_nodes[0] = node;
|
||||
dep_nodes[1] = child;
|
||||
if (child->height==0) {
|
||||
ftleaf_split(h, child, &nodea, &nodeb, &splitk, true, 2, dep_nodes);
|
||||
ftleaf_split(h, child, &nodea, &nodeb, &splitk, true, split_mode, 2, dep_nodes);
|
||||
} else {
|
||||
ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes);
|
||||
}
|
||||
|
@ -1040,8 +1081,8 @@ flush_this_child(
|
|||
}
|
||||
bring_node_fully_into_memory(child, h);
|
||||
toku_assert_entire_node_in_memory(child);
|
||||
assert(node->height>0);
|
||||
assert(child->thisnodename.b!=0);
|
||||
paranoid_invariant(node->height>0);
|
||||
paranoid_invariant(child->thisnodename.b!=0);
|
||||
// VERIFY_NODE does not work off client thread as of now
|
||||
//VERIFY_NODE(t, child);
|
||||
node->dirty = 1;
|
||||
|
@ -1062,10 +1103,10 @@ merge_leaf_nodes(FTNODE a, FTNODE b)
|
|||
STATUS_VALUE(FT_FLUSHER_MERGE_LEAF)++;
|
||||
toku_assert_entire_node_in_memory(a);
|
||||
toku_assert_entire_node_in_memory(b);
|
||||
assert(a->height == 0);
|
||||
assert(b->height == 0);
|
||||
assert(a->n_children > 0);
|
||||
assert(b->n_children > 0);
|
||||
paranoid_invariant(a->height == 0);
|
||||
paranoid_invariant(b->height == 0);
|
||||
paranoid_invariant(a->n_children > 0);
|
||||
paranoid_invariant(b->n_children > 0);
|
||||
|
||||
// Mark nodes as dirty before moving basements from b to a.
|
||||
// This way, whatever deltas are accumulated in the basements are
|
||||
|
@ -1148,7 +1189,7 @@ static void balance_leaf_nodes(
|
|||
merge_leaf_nodes(a,b);
|
||||
// now split them
|
||||
// because we are not creating a new node, we can pass in no dependent nodes
|
||||
ftleaf_split(NULL, a, &a, &b, splitk, false, 0, NULL);
|
||||
ftleaf_split(NULL, a, &a, &b, splitk, false, SPLIT_EVENLY, 0, NULL);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -1202,7 +1243,7 @@ maybe_merge_pinned_nonleaf_nodes(
|
|||
{
|
||||
toku_assert_entire_node_in_memory(a);
|
||||
toku_assert_entire_node_in_memory(b);
|
||||
assert(parent_splitk->data);
|
||||
paranoid_invariant(parent_splitk->data);
|
||||
int old_n_children = a->n_children;
|
||||
int new_n_children = old_n_children + b->n_children;
|
||||
XREALLOC_N(new_n_children, a->bp);
|
||||
|
@ -1262,7 +1303,7 @@ maybe_merge_pinned_nodes(
|
|||
// splitk (OUT): If the two nodes did not get merged, the new pivot key between the two nodes.
|
||||
{
|
||||
MSN msn_max;
|
||||
assert(a->height == b->height);
|
||||
paranoid_invariant(a->height == b->height);
|
||||
toku_assert_entire_node_in_memory(parent);
|
||||
toku_assert_entire_node_in_memory(a);
|
||||
toku_assert_entire_node_in_memory(b);
|
||||
|
@ -1271,9 +1312,6 @@ maybe_merge_pinned_nodes(
|
|||
MSN msna = a->max_msn_applied_to_node_on_disk;
|
||||
MSN msnb = b->max_msn_applied_to_node_on_disk;
|
||||
msn_max = (msna.msn > msnb.msn) ? msna : msnb;
|
||||
if (a->height > 0) {
|
||||
invariant(msn_max.msn <= parent->max_msn_applied_to_node_on_disk.msn); // parent msn must be >= children's msn
|
||||
}
|
||||
}
|
||||
if (a->height == 0) {
|
||||
maybe_merge_pinned_leaf_nodes(a, b, parent_splitk, did_merge, did_rebalance, splitk, nodesize);
|
||||
|
@ -1312,7 +1350,7 @@ ft_merge_child(
|
|||
{
|
||||
// this function should not be called
|
||||
// if the child is not mergable
|
||||
assert(node->n_children > 1);
|
||||
paranoid_invariant(node->n_children > 1);
|
||||
toku_assert_entire_node_in_memory(node);
|
||||
|
||||
int childnuma,childnumb;
|
||||
|
@ -1323,11 +1361,11 @@ ft_merge_child(
|
|||
childnuma = childnum_to_merge;
|
||||
childnumb = childnum_to_merge+1;
|
||||
}
|
||||
assert(0 <= childnuma);
|
||||
assert(childnuma+1 == childnumb);
|
||||
assert(childnumb < node->n_children);
|
||||
paranoid_invariant(0 <= childnuma);
|
||||
paranoid_invariant(childnuma+1 == childnumb);
|
||||
paranoid_invariant(childnumb < node->n_children);
|
||||
|
||||
assert(node->height>0);
|
||||
paranoid_invariant(node->height>0);
|
||||
|
||||
// We suspect that at least one of the children is fusible, but they might not be.
|
||||
// for test
|
||||
|
@ -1371,22 +1409,27 @@ ft_merge_child(
|
|||
maybe_merge_pinned_nodes(node, &node->childkeys[childnuma], childa, childb, &did_merge, &did_rebalance, &splitk, h->h->nodesize);
|
||||
if (childa->height>0) {
|
||||
for (int i=0; i+1<childa->n_children; i++) {
|
||||
assert(childa->childkeys[i].data);
|
||||
paranoid_invariant(childa->childkeys[i].data);
|
||||
}
|
||||
}
|
||||
//toku_verify_estimates(t,childa);
|
||||
// the tree did react if a merge (did_merge) or rebalance (new split key) occurred
|
||||
*did_react = (bool)(did_merge || did_rebalance);
|
||||
if (did_merge) {
|
||||
assert(!splitk.data);
|
||||
paranoid_invariant(!splitk.data);
|
||||
} else {
|
||||
assert(splitk.data);
|
||||
paranoid_invariant(splitk.data);
|
||||
}
|
||||
|
||||
node->totalchildkeylens -= deleted_size; // The key was free()'d inside the maybe_merge_pinned_nodes.
|
||||
|
||||
if (did_merge) {
|
||||
destroy_nonleaf_childinfo(BNC(node, childnumb));
|
||||
NONLEAF_CHILDINFO remaining_bnc = BNC(node, childnuma);
|
||||
NONLEAF_CHILDINFO merged_bnc = BNC(node, childnumb);
|
||||
for (unsigned int i = 0; i < (sizeof remaining_bnc->flow) / (sizeof remaining_bnc->flow[0]); ++i) {
|
||||
remaining_bnc->flow[i] += merged_bnc->flow[i];
|
||||
}
|
||||
destroy_nonleaf_childinfo(merged_bnc);
|
||||
set_BNULL(node, childnumb);
|
||||
node->n_children--;
|
||||
memmove(&node->bp[childnumb],
|
||||
|
@ -1397,10 +1440,14 @@ ft_merge_child(
|
|||
&node->childkeys[childnuma+1],
|
||||
(node->n_children-childnumb)*sizeof(node->childkeys[0]));
|
||||
REALLOC_N(node->n_children-1, node->childkeys);
|
||||
assert(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b);
|
||||
paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b);
|
||||
childa->dirty = 1; // just to make sure
|
||||
childb->dirty = 1; // just to make sure
|
||||
} else {
|
||||
// flow will be inaccurate for a while, oh well. the children
|
||||
// are leaves in this case so it's not a huge deal (we're
|
||||
// pretty far down the tree)
|
||||
|
||||
// If we didn't merge the nodes, then we need the correct pivot.
|
||||
toku_copyref_dbt(&node->childkeys[childnuma], splitk);
|
||||
node->totalchildkeylens += node->childkeys[childnuma].size;
|
||||
|
@ -1421,13 +1468,13 @@ ft_merge_child(
|
|||
merge_remove_key_callback,
|
||||
h
|
||||
);
|
||||
assert(rrb==0);
|
||||
assert_zero(rrb);
|
||||
|
||||
// for test
|
||||
call_flusher_thread_callback(ft_flush_aflter_merge);
|
||||
|
||||
// unlock the parent
|
||||
assert(node->dirty);
|
||||
paranoid_invariant(node->dirty);
|
||||
toku_unpin_ftnode_off_client_thread(h, node);
|
||||
}
|
||||
else {
|
||||
|
@ -1435,7 +1482,7 @@ ft_merge_child(
|
|||
call_flusher_thread_callback(ft_flush_aflter_rebalance);
|
||||
|
||||
// unlock the parent
|
||||
assert(node->dirty);
|
||||
paranoid_invariant(node->dirty);
|
||||
toku_unpin_ftnode_off_client_thread(h, node);
|
||||
toku_unpin_ftnode_off_client_thread(h, childb);
|
||||
}
|
||||
|
@ -1463,7 +1510,7 @@ flush_some_child(
|
|||
{
|
||||
int dirtied = 0;
|
||||
NONLEAF_CHILDINFO bnc = NULL;
|
||||
assert(parent->height>0);
|
||||
paranoid_invariant(parent->height>0);
|
||||
toku_assert_entire_node_in_memory(parent);
|
||||
|
||||
// pick the child we want to flush to
|
||||
|
@ -1496,7 +1543,7 @@ flush_some_child(
|
|||
// the parent before finishing reading in the entire child node.
|
||||
bool may_child_be_reactive = may_node_be_reactive(child);
|
||||
|
||||
assert(child->thisnodename.b!=0);
|
||||
paranoid_invariant(child->thisnodename.b!=0);
|
||||
//VERIFY_NODE(brt, child);
|
||||
|
||||
// only do the following work if there is a flush to perform
|
||||
|
@ -1508,7 +1555,9 @@ flush_some_child(
|
|||
// detach buffer
|
||||
BP_WORKDONE(parent, childnum) = 0; // this buffer is drained, no work has been done by its contents
|
||||
bnc = BNC(parent, childnum);
|
||||
set_BNC(parent, childnum, toku_create_empty_nl());
|
||||
NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
|
||||
memcpy(new_bnc->flow, bnc->flow, sizeof bnc->flow);
|
||||
set_BNC(parent, childnum, new_bnc);
|
||||
}
|
||||
|
||||
//
|
||||
|
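The detach-and-replace step above (and the identical one later in flush_node_on_background_thread) swaps in a fresh buffer while preserving the per-checkpoint flow counters, so flush heuristics keep their history across the flush. A hedged sketch of the pattern as a standalone helper (the helper name is hypothetical; the diff inlines this at both call sites, and the node macros come from ft-internal.h):

#include <string.h>

// Sketch: detach a child's message buffer for flushing while carrying its
// flow statistics over to the fresh, empty replacement buffer.
static NONLEAF_CHILDINFO detach_nonleaf_buffer(FTNODE parent, int childnum) {
    BP_WORKDONE(parent, childnum) = 0;                   // buffer is drained, no work left from its contents
    NONLEAF_CHILDINFO bnc = BNC(parent, childnum);
    NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
    memcpy(new_bnc->flow, bnc->flow, sizeof bnc->flow);  // keep flow history on the new buffer
    set_BNC(parent, childnum, new_bnc);
    return bnc;  // returned to the caller, which flushes it into the child
}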
@ -1592,19 +1641,19 @@ flush_some_child(
|
|||
// it is the responsibility of `ft_split_child` to unlock nodes of
|
||||
// parent and child as it sees fit
|
||||
//
|
||||
assert(parent); // just make sure we have not accidentally unpinned parent
|
||||
ft_split_child(h, parent, childnum, child, fa);
|
||||
paranoid_invariant(parent); // just make sure we have not accidentally unpinned parent
|
||||
ft_split_child(h, parent, childnum, child, SPLIT_EVENLY, fa);
|
||||
}
|
||||
else if (child_re == RE_FUSIBLE) {
|
||||
//
|
||||
// it is the responsibility of `maybe_merge_child` to unlock nodes of
|
||||
// parent and child as it sees fit
|
||||
//
|
||||
assert(parent); // just make sure we have not accidentally unpinned parent
|
||||
paranoid_invariant(parent); // just make sure we have not accidentally unpinned parent
|
||||
fa->maybe_merge_child(fa, h, parent, childnum, child, fa->extra);
|
||||
}
|
||||
else {
|
||||
assert(false);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1657,7 +1706,7 @@ dummy_pick_heaviest_child(FT UU(h),
|
|||
FTNODE UU(parent),
|
||||
void* UU(extra))
|
||||
{
|
||||
assert(false);
|
||||
abort();
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -1665,7 +1714,8 @@ void toku_ft_split_child(
|
|||
FT ft,
|
||||
FTNODE node,
|
||||
int childnum,
|
||||
FTNODE child
|
||||
FTNODE child,
|
||||
enum split_mode split_mode
|
||||
)
|
||||
{
|
||||
struct flusher_advice fa;
|
||||
|
@ -1684,6 +1734,34 @@ void toku_ft_split_child(
|
|||
node,
|
||||
childnum, // childnum to split
|
||||
child,
|
||||
split_mode,
|
||||
&fa
|
||||
);
|
||||
}
|
||||
|
||||
void toku_ft_merge_child(
|
||||
FT ft,
|
||||
FTNODE node,
|
||||
int childnum
|
||||
)
|
||||
{
|
||||
struct flusher_advice fa;
|
||||
flusher_advice_init(
|
||||
&fa,
|
||||
dummy_pick_heaviest_child,
|
||||
dont_destroy_basement_nodes,
|
||||
never_recursively_flush,
|
||||
default_merge_child,
|
||||
dummy_update_status,
|
||||
default_pick_child_after_split,
|
||||
NULL
|
||||
);
|
||||
bool did_react;
|
||||
ft_merge_child(
|
||||
ft,
|
||||
node,
|
||||
childnum, // childnum to merge
|
||||
&did_react,
|
||||
&fa
|
||||
);
|
||||
}
|
||||
|
@ -1816,13 +1894,13 @@ flush_node_on_background_thread(FT h, FTNODE parent)
|
|||
// and pick the child we want to flush to
|
||||
//
|
||||
int childnum = find_heaviest_child(parent);
|
||||
assert(toku_bnc_n_entries(BNC(parent, childnum))>0);
|
||||
paranoid_invariant(toku_bnc_n_entries(BNC(parent, childnum))>0);
|
||||
//
|
||||
// see if we can pin the child
|
||||
//
|
||||
FTNODE child;
|
||||
uint32_t childfullhash = compute_child_fullhash(h->cf, parent, childnum);
|
||||
int r = toku_maybe_pin_ftnode_clean(h, BP_BLOCKNUM(parent, childnum), childfullhash, &child);
|
||||
int r = toku_maybe_pin_ftnode_clean(h, BP_BLOCKNUM(parent, childnum), childfullhash, PL_WRITE_EXPENSIVE, &child);
|
||||
if (r != 0) {
|
||||
// In this case, we could not lock the child, so just place the parent on the background thread
|
||||
// In the callback, we will use flush_some_child, which checks to
|
||||
|
@ -1846,7 +1924,9 @@ flush_node_on_background_thread(FT h, FTNODE parent)
|
|||
parent->dirty = 1;
|
||||
BP_WORKDONE(parent, childnum) = 0; // this buffer is drained, no work has been done by its contents
|
||||
NONLEAF_CHILDINFO bnc = BNC(parent, childnum);
|
||||
set_BNC(parent, childnum, toku_create_empty_nl());
|
||||
NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
|
||||
memcpy(new_bnc->flow, bnc->flow, sizeof bnc->flow);
|
||||
set_BNC(parent, childnum, new_bnc);
|
||||
|
||||
//
|
||||
// at this point, the buffer has been detached from the parent
|
||||
|
|
|
@ -89,6 +89,7 @@ ftleaf_split(
|
|||
FTNODE *nodeb,
|
||||
DBT *splitk,
|
||||
bool create_new_node,
|
||||
enum split_mode split_mode,
|
||||
uint32_t num_dependent_nodes,
|
||||
FTNODE* dependent_nodes
|
||||
);
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <ft-cachetable-wrappers.h>
|
||||
#include <ft-internal.h>
|
||||
#include <ft.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
// Member Description:
|
||||
// 1. highest_pivot_key - this is the key that corresponds to the
|
||||
|
@ -251,7 +252,7 @@ toku_ft_hot_optimize(FT_HANDLE brt,
|
|||
uint64_t loop_count = 0;
|
||||
MSN msn_at_start_of_hot = ZERO_MSN; // capture msn from root at
|
||||
// start of HOT operation
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(FT_HOT_NUM_STARTED), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(FT_HOT_NUM_STARTED), 1);
|
||||
|
||||
{
|
||||
toku_ft_note_hot_begin(brt);
|
||||
|
@ -353,9 +354,9 @@ toku_ft_hot_optimize(FT_HANDLE brt,
|
|||
}
|
||||
|
||||
if (success) {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(FT_HOT_NUM_COMPLETED), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(FT_HOT_NUM_COMPLETED), 1);
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(FT_HOT_NUM_ABORTED), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(FT_HOT_NUM_ABORTED), 1);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
|
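Another theme in this commit is replacing raw __sync_* builtins with toku_sync_* wrappers and including portability/toku_atomic.h where they are used. The wrapper definitions are not shown in this diff; a plausible minimal form (an assumption only, the real header may add race-detector annotations) is:

// Assumed shape of the portability wrappers used above (sketch only).
template <typename T, typename U>
static inline T toku_sync_fetch_and_add(T *ptr, U diff) {
    return __sync_fetch_and_add(ptr, diff);
}
template <typename T, typename U>
static inline T toku_sync_fetch_and_sub(T *ptr, U diff) {
    return __sync_fetch_and_sub(ptr, diff);
}
template <typename T, typename U>
static inline T toku_sync_sub_and_fetch(T *ptr, U diff) {
    return __sync_sub_and_fetch(ptr, diff);
}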
|
ft/ft-internal.h (119 changed lines)
|
@ -124,6 +124,7 @@ struct ftnode_nonleaf_childinfo {
|
|||
off_omt_t broadcast_list;
|
||||
marked_off_omt_t fresh_message_tree;
|
||||
off_omt_t stale_message_tree;
|
||||
uint64_t flow[2]; // current and last checkpoint
|
||||
};
|
||||
|
||||
unsigned int toku_bnc_nbytesinbuf(NONLEAF_CHILDINFO bnc);
|
||||
|
@ -133,6 +134,7 @@ long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc);
|
|||
void toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, DESCRIPTOR desc, ft_compare_func cmp);
|
||||
void toku_bnc_empty(NONLEAF_CHILDINFO bnc);
|
||||
void toku_bnc_flush_to_child(FT h, NONLEAF_CHILDINFO bnc, FTNODE child);
|
||||
bool toku_bnc_should_promote(FT ft, NONLEAF_CHILDINFO bnc) __attribute__((const, nonnull));
|
||||
bool toku_ft_nonleaf_is_gorged(FTNODE node, uint32_t nodesize);
|
||||
|
||||
|
||||
|
@ -152,13 +154,13 @@ struct ftnode_leaf_basement_node {
|
|||
STAT64INFO_S stat64_delta; // change in stat64 counters since basement was last written to disk
|
||||
};
|
||||
|
||||
enum __attribute__((__packed__)) pt_state { // declare this to be packed so that when used below it will only take 1 byte.
|
||||
enum pt_state { // declare this to be packed so that when used below it will only take 1 byte.
|
||||
PT_INVALID = 0,
|
||||
PT_ON_DISK = 1,
|
||||
PT_COMPRESSED = 2,
|
||||
PT_AVAIL = 3};
|
||||
|
||||
enum __attribute__((__packed__)) ftnode_child_tag {
|
||||
enum ftnode_child_tag {
|
||||
BCT_INVALID = 0,
|
||||
BCT_NULL,
|
||||
BCT_SUBBLOCK,
|
||||
|
@ -166,7 +168,7 @@ enum __attribute__((__packed__)) ftnode_child_tag {
|
|||
BCT_NONLEAF
|
||||
};
|
||||
|
||||
typedef struct __attribute__((__packed__)) ftnode_child_pointer {
|
||||
typedef struct ftnode_child_pointer {
|
||||
union {
|
||||
struct sub_block *subblock;
|
||||
struct ftnode_nonleaf_childinfo *nonleaf;
|
||||
|
@ -264,7 +266,13 @@ struct ftnode {
|
|||
// that have a read lock on an internal node may try to touch the clock
|
||||
// simultaneously
|
||||
//
|
||||
#define BP_TOUCH_CLOCK(node, i) ((void) __sync_val_compare_and_swap(&(node)->bp[i].clock_count, 0, 1))
|
||||
#define BP_TOUCH_CLOCK(node, i) do { \
|
||||
TOKU_VALGRIND_HG_DISABLE_CHECKING(&(node)->bp[i].clock_count, sizeof (node)->bp[i].clock_count); \
|
||||
TOKU_DRD_IGNORE_VAR((node)->bp[i].clock_count); \
|
||||
(node)->bp[i].clock_count = 1; \
|
||||
TOKU_DRD_STOP_IGNORING_VAR((node)->bp[i].clock_count); \
|
||||
TOKU_VALGRIND_HG_ENABLE_CHECKING(&(node)->bp[i].clock_count, sizeof (node)->bp[i].clock_count); \
|
||||
} while (0)
|
||||
#define BP_SWEEP_CLOCK(node, i) ((node)->bp[i].clock_count = 0)
|
||||
#define BP_SHOULD_EVICT(node, i) ((node)->bp[i].clock_count == 0)
|
||||
// not crazy about having these two here, one is for the case where we create new
|
||||
|
@ -275,47 +283,54 @@ struct ftnode {
|
|||
|
||||
// internal node macros
|
||||
static inline void set_BNULL(FTNODE node, int i) {
|
||||
assert(0<=i && i<node->n_children);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
node->bp[i].ptr.tag = BCT_NULL;
|
||||
}
|
||||
static inline bool is_BNULL (FTNODE node, int i) {
|
||||
assert(0<=i && i<node->n_children);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
return node->bp[i].ptr.tag == BCT_NULL;
|
||||
}
|
||||
static inline NONLEAF_CHILDINFO BNC(FTNODE node, int i) {
|
||||
assert(0<=i && i<node->n_children);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
FTNODE_CHILD_POINTER p = node->bp[i].ptr;
|
||||
assert(p.tag==BCT_NONLEAF);
|
||||
paranoid_invariant(p.tag==BCT_NONLEAF);
|
||||
return p.u.nonleaf;
|
||||
}
|
||||
static inline void set_BNC(FTNODE node, int i, NONLEAF_CHILDINFO nl) {
|
||||
assert(0<=i && i<node->n_children);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
|
||||
p->tag = BCT_NONLEAF;
|
||||
p->u.nonleaf = nl;
|
||||
}
|
||||
static inline BASEMENTNODE BLB(FTNODE node, int i) {
|
||||
assert(i<node->n_children);
|
||||
assert(0<=i);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
FTNODE_CHILD_POINTER p = node->bp[i].ptr;
|
||||
assert(p.tag==BCT_LEAF);
|
||||
paranoid_invariant(p.tag==BCT_LEAF);
|
||||
return p.u.leaf;
|
||||
}
|
||||
static inline void set_BLB(FTNODE node, int i, BASEMENTNODE bn) {
|
||||
assert(0<=i && i<node->n_children);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
|
||||
p->tag = BCT_LEAF;
|
||||
p->u.leaf = bn;
|
||||
}
|
||||
|
||||
static inline SUB_BLOCK BSB(FTNODE node, int i) {
|
||||
assert(0<=i && i<node->n_children);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
FTNODE_CHILD_POINTER p = node->bp[i].ptr;
|
||||
assert(p.tag==BCT_SUBBLOCK);
|
||||
paranoid_invariant(p.tag==BCT_SUBBLOCK);
|
||||
return p.u.subblock;
|
||||
}
|
||||
static inline void set_BSB(FTNODE node, int i, SUB_BLOCK sb) {
|
||||
assert(0<=i && i<node->n_children);
|
||||
paranoid_invariant(0<=i);
|
||||
paranoid_invariant(i<node->n_children);
|
||||
FTNODE_CHILD_POINTER *p = &node->bp[i].ptr;
|
||||
p->tag = BCT_SUBBLOCK;
|
||||
p->u.subblock = sb;
|
||||
|
@ -390,6 +405,9 @@ struct ft_header {
|
|||
// This is decremented from our current MIN_MSN so as not to clash
|
||||
// with any existing 'normal' MSN's.
|
||||
MSN highest_unused_msn_for_upgrade;
|
||||
// Largest MSN ever injected into the tree. Used to set the MSN for
|
||||
// messages as they get injected.
|
||||
MSN max_msn_in_ft;
|
||||
|
||||
// last time that a hot optimize operation was begun
|
||||
uint64_t time_of_last_optimize_begin;
|
||||
|
@ -605,13 +623,6 @@ void toku_ft_append_to_child_buffer(ft_compare_func compare_fun, DESCRIPTOR desc
|
|||
|
||||
STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
|
||||
|
||||
|
||||
#if 1
|
||||
#define DEADBEEF ((void*)0xDEADBEEF)
|
||||
#else
|
||||
#define DEADBEEF ((void*)0xDEADBEEFDEADBEEF)
|
||||
#endif
|
||||
|
||||
//#define SLOW
|
||||
#ifdef SLOW
|
||||
#define VERIFY_NODE(t,n) (toku_verify_or_set_counts(n), toku_verify_estimates(t,n))
|
||||
|
@ -629,6 +640,7 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
|
|||
void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h);
|
||||
void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
|
||||
extern void toku_ftnode_clone_callback(void* value_data, void** cloned_value_data, PAIR_ATTR* new_attr, bool for_checkpoint, void* write_extraargs);
|
||||
extern void toku_ftnode_checkpoint_complete_callback(void *value_data);
|
||||
extern void toku_ftnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *ftnode_v, void** UU(disk_data), void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, bool write_me, bool keep_me, bool for_checkpoint, bool is_clone);
|
||||
extern int toku_ftnode_fetch_callback (CACHEFILE cachefile, PAIR p, int fd, BLOCKNUM nodename, uint32_t fullhash, void **ftnode_pv, void** UU(disk_data), PAIR_ATTR *sizep, int*dirty, void*extraargs);
|
||||
extern void toku_ftnode_pe_est_callback(void* ftnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs);
|
||||
|
@ -643,7 +655,15 @@ void toku_ft_split_child(
|
|||
FT h,
|
||||
FTNODE node,
|
||||
int childnum,
|
||||
FTNODE child
|
||||
FTNODE child,
|
||||
enum split_mode split_mode
|
||||
);
|
||||
// Given pinned node, merge childnum with a neighbor and update node with
|
||||
// information about the change
|
||||
void toku_ft_merge_child(
|
||||
FT ft,
|
||||
FTNODE node,
|
||||
int childnum
|
||||
);
|
||||
static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) {
|
||||
CACHETABLE_WRITE_CALLBACK wc;
|
||||
|
@ -652,6 +672,7 @@ static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) {
|
|||
wc.pe_callback = toku_ftnode_pe_callback;
|
||||
wc.cleaner_callback = toku_ftnode_cleaner_callback;
|
||||
wc.clone_callback = toku_ftnode_clone_callback;
|
||||
wc.checkpoint_complete_callback = toku_ftnode_checkpoint_complete_callback;
|
||||
wc.write_extraargs = h;
|
||||
return wc;
|
||||
}
|
||||
|
@ -720,7 +741,7 @@ static inline void fill_bfe_for_subset_read(
|
|||
bool disable_prefetching
|
||||
)
|
||||
{
|
||||
invariant(h->h->type == FT_CURRENT);
|
||||
paranoid_invariant(h->h->type == FT_CURRENT);
|
||||
bfe->type = ftnode_fetch_subset;
|
||||
bfe->h = h;
|
||||
bfe->search = search;
|
||||
|
@ -739,7 +760,7 @@ static inline void fill_bfe_for_subset_read(
|
|||
// Currently used for stat64.
|
||||
//
|
||||
static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
|
||||
invariant(h->h->type == FT_CURRENT);
|
||||
paranoid_invariant(h->h->type == FT_CURRENT);
|
||||
bfe->type = ftnode_fetch_none;
|
||||
bfe->h = h;
|
||||
bfe->search = NULL;
|
||||
|
@ -752,7 +773,7 @@ static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
|
|||
}
|
||||
|
||||
static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) {
|
||||
assert(bfe->type == ftnode_fetch_prefetch);
|
||||
paranoid_invariant(bfe->type == ftnode_fetch_prefetch);
|
||||
if (bfe->range_lock_left_key != NULL) {
|
||||
toku_free(bfe->range_lock_left_key->data);
|
||||
toku_destroy_dbt(bfe->range_lock_left_key);
|
||||
|
@ -771,7 +792,7 @@ static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) {
|
|||
static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
|
||||
FT h,
|
||||
FT_CURSOR c) {
|
||||
invariant(h->h->type == FT_CURRENT);
|
||||
paranoid_invariant(h->h->type == FT_CURRENT);
|
||||
bfe->type = ftnode_fetch_prefetch;
|
||||
bfe->h = h;
|
||||
bfe->search = NULL;
|
||||
|
@ -779,13 +800,13 @@ static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
|
|||
const DBT *left = &c->range_lock_left_key;
|
||||
const DBT *right = &c->range_lock_right_key;
|
||||
if (left->data) {
|
||||
MALLOC(bfe->range_lock_left_key); resource_assert(bfe->range_lock_left_key);
|
||||
XMALLOC(bfe->range_lock_left_key);
|
||||
toku_fill_dbt(bfe->range_lock_left_key, toku_xmemdup(left->data, left->size), left->size);
|
||||
} else {
|
||||
bfe->range_lock_left_key = NULL;
|
||||
}
|
||||
if (right->data) {
|
||||
MALLOC(bfe->range_lock_right_key); resource_assert(bfe->range_lock_right_key);
|
||||
XMALLOC(bfe->range_lock_right_key);
|
||||
toku_fill_dbt(bfe->range_lock_right_key, toku_xmemdup(right->data, right->size), right->size);
|
||||
} else {
|
||||
bfe->range_lock_right_key = NULL;
|
||||
|
@ -815,6 +836,9 @@ bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancesto
|
|||
__attribute__((nonnull))
|
||||
void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied);
|
||||
|
||||
__attribute__((const,nonnull))
|
||||
size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd);
|
||||
|
||||
int
|
||||
toku_ft_search_which_child(
|
||||
DESCRIPTOR desc,
|
||||
|
@ -840,8 +864,8 @@ void toku_create_new_ftnode (FT_HANDLE t, FTNODE *result, int height, int n_chil
|
|||
void toku_initialize_empty_ftnode (FTNODE n, BLOCKNUM nodename, int height, int num_children,
|
||||
int layout_version, unsigned int flags);
|
||||
|
||||
unsigned int toku_ftnode_which_child(FTNODE node, const DBT *k,
|
||||
DESCRIPTOR desc, ft_compare_func cmp)
|
||||
int toku_ftnode_which_child(FTNODE node, const DBT *k,
|
||||
DESCRIPTOR desc, ft_compare_func cmp)
|
||||
__attribute__((__warn_unused_result__));
|
||||
|
||||
/**
|
||||
|
@ -854,10 +878,10 @@ unsigned int toku_ftnode_which_child(FTNODE node, const DBT *k,
|
|||
* If k is equal to some pivot, then we return the next (to the right)
|
||||
* childnum.
|
||||
*/
|
||||
unsigned int toku_ftnode_hot_next_child(FTNODE node,
|
||||
const DBT *k,
|
||||
DESCRIPTOR desc,
|
||||
ft_compare_func cmp);
|
||||
int toku_ftnode_hot_next_child(FTNODE node,
|
||||
const DBT *k,
|
||||
DESCRIPTOR desc,
|
||||
ft_compare_func cmp);
|
||||
|
||||
/* Stuff for testing */
|
||||
// toku_testsetup_initialize() must be called before any other test_setup_xxx() functions are called.
|
||||
|
@ -882,7 +906,7 @@ int toku_cmd_leafval_heaviside (OMTVALUE leafentry, void *extra)
|
|||
// toku_ft_root_put_cmd() accepts non-constant cmd because this is where we set the msn
|
||||
void toku_ft_root_put_cmd(FT h, FT_MSG_S * cmd);
|
||||
|
||||
void *mempool_malloc_from_omt(OMT omt, struct mempool *mp, size_t size, void **maybe_free);
|
||||
void *mempool_malloc_from_omt(OMT *omtp, struct mempool *mp, size_t size, void **maybe_free);
|
||||
// Effect: Allocate a new object of size SIZE in MP. If MP runs out of space, allocate new a new mempool space, and copy all the items
|
||||
// from the OMT (which items refer to items in the old mempool) into the new mempool.
|
||||
// If MAYBE_FREE is NULL then free the old mempool's space.
|
||||
|
@ -896,7 +920,7 @@ toku_get_node_for_verify(
|
|||
|
||||
int
|
||||
toku_verify_ftnode (FT_HANDLE brt,
|
||||
MSN rootmsn, MSN parentmsn,
|
||||
MSN rootmsn, MSN parentmsn, bool messages_exist_above,
|
||||
FTNODE node, int height,
|
||||
const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
|
||||
const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
|
||||
|
@ -978,6 +1002,19 @@ typedef enum {
|
|||
FT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE, // ... because they were between lc and rc
|
||||
FT_NUM_MSG_BUFFER_FETCHED_PREFETCH,
|
||||
FT_NUM_MSG_BUFFER_FETCHED_WRITE,
|
||||
FT_PRO_NUM_ROOT_SPLIT,
|
||||
FT_PRO_NUM_ROOT_H0_INJECT,
|
||||
FT_PRO_NUM_ROOT_H1_INJECT,
|
||||
FT_PRO_NUM_INJECT_DEPTH_0,
|
||||
FT_PRO_NUM_INJECT_DEPTH_1,
|
||||
FT_PRO_NUM_INJECT_DEPTH_2,
|
||||
FT_PRO_NUM_INJECT_DEPTH_3,
|
||||
FT_PRO_NUM_INJECT_DEPTH_GT3,
|
||||
FT_PRO_NUM_STOP_NONEMPTY_BUF,
|
||||
FT_PRO_NUM_STOP_H1,
|
||||
FT_PRO_NUM_STOP_LOCK_CHILD,
|
||||
FT_PRO_NUM_STOP_CHILD_INMEM,
|
||||
FT_PRO_NUM_DIDNT_WANT_PROMOTE,
|
||||
FT_STATUS_NUM_ROWS
|
||||
} ft_status_entry;
|
||||
|
||||
|
@ -1015,6 +1052,7 @@ toku_ft_leaf_apply_cmd (
|
|||
ft_update_func update_fun,
|
||||
DESCRIPTOR desc,
|
||||
FTNODE node,
|
||||
int target_childnum,
|
||||
FT_MSG cmd,
|
||||
uint64_t *workdone,
|
||||
STAT64INFO stats_to_update
|
||||
|
@ -1026,13 +1064,16 @@ toku_ft_node_put_cmd (
|
|||
ft_update_func update_fun,
|
||||
DESCRIPTOR desc,
|
||||
FTNODE node,
|
||||
int target_childnum,
|
||||
FT_MSG cmd,
|
||||
bool is_fresh,
|
||||
size_t flow_deltas[],
|
||||
STAT64INFO stats_to_update
|
||||
);
|
||||
|
||||
void toku_flusher_thread_set_callback(void (*callback_f)(int, void*), void* extra);
|
||||
|
||||
int toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h);
|
||||
int toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h) __attribute__((nonnull));
|
||||
int toku_upgrade_msn_from_root_to_header(int fd, FT h) __attribute__((nonnull));
|
||||
|
||||
#endif
|
||||
|
|
ft/ft-ops.cc (961 changed lines): diff suppressed because it is too large
|
@ -134,8 +134,8 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
|
|||
{
|
||||
int r;
|
||||
FT ft = NULL;
|
||||
invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
|
||||
invariant(version <= FT_LAYOUT_VERSION);
|
||||
paranoid_invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
|
||||
paranoid_invariant(version <= FT_LAYOUT_VERSION);
|
||||
// We already know:
|
||||
// we have an rbuf representing the header.
|
||||
// The checksum has been validated
|
||||
|
@ -290,6 +290,12 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
|
|||
}
|
||||
}
|
||||
|
||||
MSN max_msn_in_ft;
|
||||
max_msn_in_ft = ZERO_MSN; // We'll upgrade it from the root node later if necessary
|
||||
if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_21) {
|
||||
max_msn_in_ft = rbuf_msn(rb);
|
||||
}
|
||||
|
||||
(void) rbuf_int(rb); //Read in checksum and ignore (already verified).
|
||||
if (rb->ndone != rb->size) {
|
||||
fprintf(stderr, "Header size did not match contents.\n");
|
||||
|
@ -317,6 +323,7 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
|
|||
.basementnodesize = basementnodesize,
|
||||
.compression_method = compression_method,
|
||||
.highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade,
|
||||
.max_msn_in_ft = max_msn_in_ft,
|
||||
.time_of_last_optimize_begin = time_of_last_optimize_begin,
|
||||
.time_of_last_optimize_end = time_of_last_optimize_end,
|
||||
.count_of_optimize_in_progress = count_of_optimize_in_progress,
|
||||
|
@ -335,6 +342,12 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
|
|||
goto exit;
|
||||
}
|
||||
}
|
||||
if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_21) {
|
||||
r = toku_upgrade_msn_from_root_to_header(fd, ft);
|
||||
if (r != 0) {
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
invariant((uint32_t) ft->layout_version_read_from_disk == version);
|
||||
r = deserialize_descriptor_from(fd, ft->blocktable, &ft->descriptor, version);
|
||||
|
@ -366,10 +379,12 @@ serialize_ft_min_size (uint32_t version) {
|
|||
size_t size = 0;
|
||||
|
||||
switch(version) {
|
||||
case FT_LAYOUT_VERSION_21:
|
||||
size += sizeof(MSN); // max_msn_in_ft
|
||||
case FT_LAYOUT_VERSION_20:
|
||||
case FT_LAYOUT_VERSION_19:
|
||||
size += 1; // compression method
|
||||
size += sizeof(uint64_t); // highest_unused_msn_for_upgrade
|
||||
size += sizeof(MSN); // highest_unused_msn_for_upgrade
|
||||
case FT_LAYOUT_VERSION_18:
|
||||
size += sizeof(uint64_t); // time_of_last_optimize_begin
|
||||
size += sizeof(uint64_t); // time_of_last_optimize_end
|
||||
|
@ -412,7 +427,7 @@ serialize_ft_min_size (uint32_t version) {
|
|||
);
|
||||
break;
|
||||
default:
|
||||
lazy_assert(false);
|
||||
abort();
|
||||
}
|
||||
|
||||
lazy_assert(size <= BLOCK_ALLOCATOR_HEADER_RESERVE);
|
||||
|
@ -637,7 +652,7 @@ toku_deserialize_ft_from(int fd,
|
|||
version = version_1;
|
||||
}
|
||||
|
||||
invariant(rb);
|
||||
paranoid_invariant(rb);
|
||||
r = deserialize_ft_versioned(fd, rb, ft, version);
|
||||
|
||||
exit:
|
||||
|
@ -694,6 +709,7 @@ void toku_serialize_ft_to_wbuf (
|
|||
wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize);
|
||||
wbuf_char(wbuf, (unsigned char) h->compression_method);
|
||||
wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade);
|
||||
wbuf_MSN(wbuf, h->max_msn_in_ft);
|
||||
uint32_t checksum = x1764_finish(&wbuf->checksum);
|
||||
wbuf_int(wbuf, checksum);
|
||||
lazy_assert(wbuf->ndone == wbuf->size);
|
||||
|
|
|
@ -136,13 +136,16 @@ int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM blocknum, const char
|
|||
.u = { .id = { toku_fill_dbt(&keydbt, key, keylen),
|
||||
toku_fill_dbt(&valdbt, val, vallen) } } };
|
||||
|
||||
static size_t zero_flow_deltas[] = { 0, 0 };
|
||||
toku_ft_node_put_cmd (
|
||||
brt->ft->compare_fun,
|
||||
brt->ft->update_fun,
|
||||
&brt->ft->cmp_descriptor,
|
||||
node,
|
||||
-1,
|
||||
&cmd,
|
||||
true,
|
||||
zero_flow_deltas,
|
||||
NULL
|
||||
);
|
||||
|
||||
|
@ -215,6 +218,8 @@ int toku_testsetup_insert_to_nonleaf (FT_HANDLE brt, BLOCKNUM blocknum, enum ft_
|
|||
// using brt APIs.
|
||||
node->max_msn_applied_to_node_on_disk = msn;
|
||||
node->dirty = 1;
|
||||
// Also hack max_msn_in_ft
|
||||
brt->ft->h->max_msn_in_ft = msn;
|
||||
|
||||
toku_unpin_ftnode(brt->ft, node);
|
||||
return 0;
|
||||
|
|
|
@ -245,7 +245,7 @@ toku_get_node_for_verify(
|
|||
|
||||
static int
|
||||
toku_verify_ftnode_internal(FT_HANDLE brt,
|
||||
MSN rootmsn, MSN parentmsn,
|
||||
MSN rootmsn, MSN parentmsn, bool messages_exist_above,
|
||||
FTNODE node, int height,
|
||||
const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
|
||||
const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
|
||||
|
@ -258,16 +258,11 @@ toku_verify_ftnode_internal(FT_HANDLE brt,
|
|||
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
|
||||
toku_assert_entire_node_in_memory(node);
|
||||
this_msn = node->max_msn_applied_to_node_on_disk;
|
||||
if (rootmsn.msn == ZERO_MSN.msn) {
|
||||
assert(parentmsn.msn == ZERO_MSN.msn);
|
||||
rootmsn = this_msn;
|
||||
parentmsn = this_msn;
|
||||
}
|
||||
|
||||
if (height >= 0) {
|
||||
invariant(height == node->height); // this is a bad failure if wrong
|
||||
}
|
||||
if (node->height > 0) {
|
||||
if (node->height > 0 && messages_exist_above) {
|
||||
VERIFY_ASSERTION((parentmsn.msn >= this_msn.msn), 0, "node msn must be descending down tree, newest messages at top");
|
||||
}
|
||||
// Verify that all the pivot keys are in order.
|
||||
|
@ -390,7 +385,7 @@ done:
|
|||
// input is a pinned node, on exit, node is unpinned
|
||||
int
|
||||
toku_verify_ftnode (FT_HANDLE brt,
|
||||
MSN rootmsn, MSN parentmsn,
|
||||
MSN rootmsn, MSN parentmsn, bool messages_exist_above,
|
||||
FTNODE node, int height,
|
||||
const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
|
||||
const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
|
||||
|
@ -402,11 +397,6 @@ toku_verify_ftnode (FT_HANDLE brt,
|
|||
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
|
||||
toku_assert_entire_node_in_memory(node);
|
||||
this_msn = node->max_msn_applied_to_node_on_disk;
|
||||
if (rootmsn.msn == ZERO_MSN.msn) {
|
||||
assert(parentmsn.msn == ZERO_MSN.msn);
|
||||
rootmsn = this_msn;
|
||||
parentmsn = this_msn;
|
||||
}
|
||||
|
||||
int result = 0;
|
||||
int result2 = 0;
|
||||
|
@ -414,7 +404,7 @@ toku_verify_ftnode (FT_HANDLE brt,
|
|||
// Otherwise we'll just do the next call
|
||||
|
||||
result = toku_verify_ftnode_internal(
|
||||
brt, rootmsn, parentmsn, node, height, lesser_pivot, greatereq_pivot,
|
||||
brt, rootmsn, parentmsn, messages_exist_above, node, height, lesser_pivot, greatereq_pivot,
|
||||
verbose, keep_going_on_failure, false);
|
||||
if (result != 0 && (!keep_going_on_failure || result != TOKUDB_NEEDS_REPAIR)) goto done;
|
||||
}
|
||||
|
@ -422,7 +412,7 @@ toku_verify_ftnode (FT_HANDLE brt,
|
|||
toku_move_ftnode_messages_to_stale(brt->ft, node);
|
||||
}
|
||||
result2 = toku_verify_ftnode_internal(
|
||||
brt, rootmsn, parentmsn, node, height, lesser_pivot, greatereq_pivot,
|
||||
brt, rootmsn, parentmsn, messages_exist_above, node, height, lesser_pivot, greatereq_pivot,
|
||||
verbose, keep_going_on_failure, true);
|
||||
if (result == 0) {
|
||||
result = result2;
|
||||
|
@ -434,7 +424,7 @@ toku_verify_ftnode (FT_HANDLE brt,
|
|||
for (int i = 0; i < node->n_children; i++) {
|
||||
FTNODE child_node;
|
||||
toku_get_node_for_verify(BP_BLOCKNUM(node, i), brt, &child_node);
|
||||
int r = toku_verify_ftnode(brt, rootmsn, this_msn,
|
||||
int r = toku_verify_ftnode(brt, rootmsn, this_msn, messages_exist_above || toku_bnc_n_entries(BNC(node, i)) > 0,
|
||||
child_node, node->height-1,
|
||||
(i==0) ? lesser_pivot : &node->childkeys[i-1],
|
||||
(i==node->n_children-1) ? greatereq_pivot : &node->childkeys[i],
|
||||
|
@ -465,7 +455,7 @@ toku_verify_ft_with_progress (FT_HANDLE brt, int (*progress_callback)(void *extr
|
|||
toku_calculate_root_offset_pointer(brt->ft, &root_key, &root_hash);
|
||||
toku_get_node_for_verify(root_key, brt, &root_node);
|
||||
}
|
||||
int r = toku_verify_ftnode(brt, ZERO_MSN, ZERO_MSN, root_node, -1, NULL, NULL, progress_callback, progress_extra, 1, verbose, keep_on_going);
|
||||
int r = toku_verify_ftnode(brt, brt->ft->h->max_msn_in_ft, brt->ft->h->max_msn_in_ft, false, root_node, -1, NULL, NULL, progress_callback, progress_extra, 1, verbose, keep_on_going);
|
||||
if (r == 0) {
|
||||
toku_ft_lock(brt->ft);
|
||||
brt->ft->h->time_of_last_verification = time(NULL);
|
||||
|
@ -479,4 +469,3 @@ int
|
|||
toku_verify_ft (FT_HANDLE brt) {
|
||||
return toku_verify_ft_with_progress(brt, NULL, NULL, 0, 0);
|
||||
}
|
||||
|
||||
|
|
ft/ft.cc (10 changed lines)
|
@ -13,6 +13,7 @@
|
|||
|
||||
#include <memory.h>
|
||||
#include <toku_assert.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
void
|
||||
toku_ft_suppress_rollbacks(FT h, TOKUTXN txn) {
|
||||
|
@ -365,6 +366,7 @@ ft_header_create(FT_OPTIONS options, BLOCKNUM root_blocknum, TXNID root_xid_that
|
|||
.basementnodesize = options->basementnodesize,
|
||||
.compression_method = options->compression_method,
|
||||
.highest_unused_msn_for_upgrade = { .msn = (MIN_MSN.msn - 1) },
|
||||
.max_msn_in_ft = ZERO_MSN,
|
||||
.time_of_last_optimize_begin = 0,
|
||||
.time_of_last_optimize_end = 0,
|
||||
.count_of_optimize_in_progress = 0,
|
||||
|
@ -850,14 +852,14 @@ toku_ft_get_cmp_descriptor(FT_HANDLE ft_handle) {
|
|||
|
||||
void
|
||||
toku_ft_update_stats(STAT64INFO headerstats, STAT64INFO_S delta) {
|
||||
(void) __sync_fetch_and_add(&(headerstats->numrows), delta.numrows);
|
||||
(void) __sync_fetch_and_add(&(headerstats->numbytes), delta.numbytes);
|
||||
(void) toku_sync_fetch_and_add(&(headerstats->numrows), delta.numrows);
|
||||
(void) toku_sync_fetch_and_add(&(headerstats->numbytes), delta.numbytes);
|
||||
}
|
||||
|
||||
void
|
||||
toku_ft_decrease_stats(STAT64INFO headerstats, STAT64INFO_S delta) {
|
||||
(void) __sync_fetch_and_sub(&(headerstats->numrows), delta.numrows);
|
||||
(void) __sync_fetch_and_sub(&(headerstats->numbytes), delta.numbytes);
|
||||
(void) toku_sync_fetch_and_sub(&(headerstats->numrows), delta.numrows);
|
||||
(void) toku_sync_fetch_and_sub(&(headerstats->numbytes), delta.numbytes);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -28,6 +28,7 @@ enum ft_layout_version_e {
|
|||
FT_LAYOUT_VERSION_20 = 20, // Deadshot: Add compression method to log_fcreate,
|
||||
// mgr_last_xid after begin checkpoint,
|
||||
// last_xid to shutdown
|
||||
FT_LAYOUT_VERSION_21 = 21, // Ming: Add max_msn_in_ft to header
|
||||
FT_NEXT_VERSION, // the version after the current version
|
||||
FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line.
|
||||
FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include "ft-internal.h"
|
||||
#include "log-internal.h"
|
||||
#include <compress.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
#include <util/sort.h>
|
||||
#include <util/threadpool.h>
|
||||
|
||||
|
@ -198,7 +199,7 @@ serialize_node_header(FTNODE node, FTNODE_DISK_DATA ndd, struct wbuf *wbuf) {
|
|||
wbuf_nocrc_literal_bytes(wbuf, "tokuleaf", 8);
|
||||
else
|
||||
wbuf_nocrc_literal_bytes(wbuf, "tokunode", 8);
|
||||
invariant(node->layout_version == FT_LAYOUT_VERSION);
|
||||
paranoid_invariant(node->layout_version == FT_LAYOUT_VERSION);
|
||||
wbuf_nocrc_int(wbuf, node->layout_version);
|
||||
wbuf_nocrc_int(wbuf, node->layout_version_original);
|
||||
wbuf_nocrc_uint(wbuf, BUILD_ID);
|
||||
|
@ -226,7 +227,7 @@ static uint32_t
|
|||
serialize_ftnode_partition_size (FTNODE node, int i)
|
||||
{
|
||||
uint32_t result = 0;
|
||||
assert(node->bp[i].state == PT_AVAIL);
|
||||
paranoid_invariant(node->bp[i].state == PT_AVAIL);
|
||||
result++; // Byte that states what the partition is
|
||||
if (node->height > 0) {
|
||||
result += 4; // size of bytes in buffer table
|
||||
|
@ -253,7 +254,7 @@ serialize_nonleaf_childinfo(NONLEAF_CHILDINFO bnc, struct wbuf *wb)
|
|||
FIFO_ITERATE(
|
||||
bnc->buffer, key, keylen, data, datalen, type, msn, xids, is_fresh,
|
||||
{
|
||||
invariant((int)type>=0 && type<256);
|
||||
paranoid_invariant((int)type>=0 && type<256);
|
||||
wbuf_nocrc_char(wb, (unsigned char)type);
|
||||
wbuf_nocrc_char(wb, (unsigned char)is_fresh);
|
||||
wbuf_MSN(wb, msn);
|
||||
|
@ -382,7 +383,6 @@ static void serialize_ftnode_info(FTNODE node,
|
|||
assert(sb->uncompressed_ptr == NULL);
|
||||
sb->uncompressed_size = serialize_ftnode_info_size(node);
|
||||
sb->uncompressed_ptr = toku_xmalloc(sb->uncompressed_size);
|
||||
assert(sb->uncompressed_ptr);
|
||||
struct wbuf wb;
|
||||
wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size);
|
||||
|
||||
|
@ -956,7 +956,7 @@ deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf,
|
|||
dest = &broadcast_offsets[nbroadcast_offsets];
|
||||
nbroadcast_offsets++;
|
||||
} else {
|
||||
assert(false);
|
||||
abort();
|
||||
}
|
||||
} else {
|
||||
dest = NULL;
|
||||
|
@ -1079,6 +1079,7 @@ NONLEAF_CHILDINFO toku_create_empty_nl(void) {
|
|||
cn->fresh_message_tree.create();
|
||||
cn->stale_message_tree.create();
|
||||
cn->broadcast_list.create();
|
||||
memset(cn->flow, 0, sizeof cn->flow);
|
||||
return cn;
|
||||
}
|
||||
|
||||
|
@ -1089,6 +1090,7 @@ NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo) {
|
|||
cn->fresh_message_tree.create_no_array();
|
||||
cn->stale_message_tree.create_no_array();
|
||||
cn->broadcast_list.create_no_array();
|
||||
memset(cn->flow, 0, sizeof cn->flow);
|
||||
return cn;
|
||||
}
|
||||
|
||||
|
@ -1181,7 +1183,6 @@ read_and_decompress_sub_block(struct rbuf *rb, struct sub_block *sb)
|
|||
}
|
||||
|
||||
sb->uncompressed_ptr = toku_xmalloc(sb->uncompressed_size);
|
||||
assert(sb->uncompressed_ptr);
|
||||
|
||||
toku_decompress(
|
||||
(Bytef *) sb->uncompressed_ptr,
|
||||
|
@ -1198,9 +1199,8 @@ exit:
|
|||
void
|
||||
just_decompress_sub_block(struct sub_block *sb)
|
||||
{
|
||||
// <CER> TODO: Add assert thta the subblock was read in.
|
||||
// <CER> TODO: Add assert that the subblock was read in.
|
||||
sb->uncompressed_ptr = toku_xmalloc(sb->uncompressed_size);
|
||||
assert(sb->uncompressed_ptr);
|
||||
|
||||
toku_decompress(
|
||||
(Bytef *) sb->uncompressed_ptr,
|
||||
|
@ -1263,13 +1263,11 @@ deserialize_ftnode_info(
|
|||
// now the subtree_estimates
|
||||
|
||||
// n_children is now in the header, and the allocation of node->bp is in deserialize_ftnode_from_rbuf.
|
||||
assert(node->bp!=NULL); //
|
||||
|
||||
// now the pivots
|
||||
node->totalchildkeylens = 0;
|
||||
if (node->n_children > 1) {
|
||||
XMALLOC_N(node->n_children - 1, node->childkeys);
|
||||
assert(node->childkeys);
|
||||
for (int i=0; i < node->n_children-1; i++) {
|
||||
bytevec childkeyptr;
|
||||
unsigned int cklen;
|
||||
|
@ -1297,7 +1295,7 @@ deserialize_ftnode_info(
|
|||
// make sure that all the data was read
|
||||
if (data_size != rb.ndone) {
|
||||
dump_bad_block(rb.buf, rb.size);
|
||||
assert(false);
|
||||
abort();
|
||||
}
|
||||
exit:
|
||||
return r;
|
||||
|
@ -1326,7 +1324,6 @@ update_bfe_using_ftnode(FTNODE node, struct ftnode_fetch_extra *bfe)
|
|||
// we can possibly require is a single basement node
|
||||
// we find out what basement node the query cares about
|
||||
// and check if it is available
|
||||
assert(bfe->search);
|
||||
bfe->child_to_read = toku_ft_search_which_child(
|
||||
&bfe->h->cmp_descriptor,
|
||||
bfe->h->compare_fun,
|
||||
|
@ -1372,17 +1369,16 @@ setup_partitions_using_bfe(FTNODE node,
|
|||
case PT_AVAIL:
|
||||
setup_available_ftnode_partition(node, i);
|
||||
BP_TOUCH_CLOCK(node,i);
|
||||
continue;
|
||||
break;
|
||||
case PT_COMPRESSED:
|
||||
set_BSB(node, i, sub_block_creat());
|
||||
continue;
|
||||
break;
|
||||
case PT_ON_DISK:
|
||||
set_BNULL(node, i);
|
||||
continue;
|
||||
case PT_INVALID:
|
||||
break;
|
||||
case PT_INVALID:
|
||||
abort();
|
||||
}
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1616,7 +1612,6 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
|
|||
|
||||
// Now decompress the subblock
|
||||
sb_node_info.uncompressed_ptr = toku_xmalloc(sb_node_info.uncompressed_size);
|
||||
assert(sb_node_info.uncompressed_ptr);
|
||||
|
||||
toku_decompress(
|
||||
(Bytef *) sb_node_info.uncompressed_ptr,
|
||||
|
@ -1638,7 +1633,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
|
|||
// rbuf, so we might be able to store the compressed data for some
|
||||
// objects.
|
||||
// We can proceed to deserialize the individual subblocks.
|
||||
assert(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch);
|
||||
paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch);
|
||||
|
||||
// setup the memory of the partitions
|
||||
// for partitions being decompressed, create either FIFO or basement node
|
||||
|
@ -1648,7 +1643,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
|
|||
if (bfe->type != ftnode_fetch_none) {
|
||||
PAIR_ATTR attr;
|
||||
|
||||
r = toku_ftnode_pf_callback(node, *ndd, bfe, fd, &attr);
|
||||
r = toku_ftnode_pf_callback(node, *ndd, bfe, fd, &attr);
|
||||
if (r != 0) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
@ -1656,12 +1651,12 @@ r = toku_ftnode_pf_callback(node, *ndd, bfe, fd, &attr);
|
|||
// handle clock
|
||||
for (int i = 0; i < node->n_children; i++) {
|
||||
if (toku_bfe_wants_child_available(bfe, i)) {
|
||||
assert(BP_STATE(node,i) == PT_AVAIL);
|
||||
paranoid_invariant(BP_STATE(node,i) == PT_AVAIL);
|
||||
BP_TOUCH_CLOCK(node,i);
|
||||
}
|
||||
}
|
||||
*ftnode = node;
|
||||
r = 0; // TODO: Why do we do this???
|
||||
r = 0;
|
||||
|
||||
cleanup:
|
||||
if (r != 0) {
|
||||
|
@ -1795,7 +1790,7 @@ deserialize_and_upgrade_internal_node(FTNODE node,
|
|||
// of messages in the buffer.
|
||||
MSN lowest;
|
||||
uint64_t amount = n_in_this_buffer;
|
||||
lowest.msn = __sync_sub_and_fetch(&bfe->h->h->highest_unused_msn_for_upgrade.msn, amount);
|
||||
lowest.msn = toku_sync_sub_and_fetch(&bfe->h->h->highest_unused_msn_for_upgrade.msn, amount);
|
||||
if (highest_msn.msn == 0) {
|
||||
highest_msn.msn = lowest.msn + n_in_this_buffer;
|
||||
}
|
||||
|
@ -1821,7 +1816,7 @@ deserialize_and_upgrade_internal_node(FTNODE node,
|
|||
dest = &broadcast_offsets[nbroadcast_offsets];
|
||||
nbroadcast_offsets++;
|
||||
} else {
|
||||
assert(false);
|
||||
abort();
|
||||
}
|
||||
} else {
|
||||
dest = NULL;
|
||||
|
@ -1962,8 +1957,6 @@ deserialize_and_upgrade_leaf_node(FTNODE node,
|
|||
if (version <= FT_LAYOUT_VERSION_13) {
|
||||
// Create our mempool.
|
||||
toku_mempool_construct(&bn->buffer_mempool, 0);
|
||||
OMT omt = BLB_BUFFER(node, 0);
|
||||
struct mempool *mp = &BLB_BUFFER_MEMPOOL(node, 0);
|
||||
// Loop through
|
||||
for (int i = 0; i < n_in_buf; ++i) {
|
||||
LEAFENTRY_13 le = reinterpret_cast<LEAFENTRY_13>(&rb->buf[rb->ndone]);
|
||||
|
@ -1975,11 +1968,11 @@ deserialize_and_upgrade_leaf_node(FTNODE node,
|
|||
r = toku_le_upgrade_13_14(le,
|
||||
&new_le_size,
|
||||
&new_le,
|
||||
omt,
|
||||
mp);
|
||||
&bn->buffer,
|
||||
&bn->buffer_mempool);
|
||||
assert_zero(r);
|
||||
// Copy the pointer value straight into the OMT
|
||||
r = toku_omt_insert_at(omt, new_le, i);
|
||||
r = toku_omt_insert_at(bn->buffer, new_le, i);
|
||||
assert_zero(r);
|
||||
bn->n_bytes_in_buffer += new_le_size;
|
||||
}
|
||||
|
@ -2259,7 +2252,7 @@ deserialize_ftnode_from_rbuf(
|
|||
|
||||
// now that the node info has been deserialized, we can proceed to deserialize
|
||||
// the individual sub blocks
|
||||
assert(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch);
|
||||
paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch);
|
||||
|
||||
// setup the memory of the partitions
|
||||
// for partitions being decompressed, create either FIFO or basement node
|
||||
|
@ -2306,20 +2299,18 @@ deserialize_ftnode_from_rbuf(
|
|||
if (r != 0) {
|
||||
goto cleanup;
|
||||
}
|
||||
continue;
|
||||
break;
|
||||
case PT_COMPRESSED:
|
||||
// case where we leave the partition in the compressed state
|
||||
r = check_and_copy_compressed_sub_block_worker(curr_rbuf, curr_sb, node, i);
|
||||
if (r != 0) {
|
||||
goto cleanup;
|
||||
}
|
||||
continue;
|
||||
break;
|
||||
case PT_INVALID: // this is really bad
|
||||
case PT_ON_DISK: // it's supposed to be in memory.
|
||||
assert(0);
|
||||
continue;
|
||||
abort();
|
||||
}
|
||||
assert(0);
|
||||
}
|
||||
*ftnode = node;
|
||||
r = 0;
|
||||
|
@ -2745,7 +2736,8 @@ decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, s
|
|||
n_sub_blocks = toku_dtoh32(*(uint32_t*)(&raw_block[node_header_overhead]));
|
||||
|
||||
// verify the number of sub blocks
|
||||
invariant(0 <= n_sub_blocks && n_sub_blocks <= max_sub_blocks);
|
||||
invariant(0 <= n_sub_blocks);
|
||||
invariant(n_sub_blocks <= max_sub_blocks);
|
||||
|
||||
{ // verify the header checksum
|
||||
uint32_t header_length = node_header_overhead + sub_block_header_size(n_sub_blocks);
|
||||
|
@ -2799,7 +2791,6 @@ decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, s
|
|||
size = node_header_overhead + uncompressed_size;
|
||||
unsigned char *buf;
|
||||
XMALLOC_N(size, buf);
|
||||
lazy_assert(buf);
|
||||
rbuf_init(rb, buf, size);
|
||||
|
||||
// copy the uncompressed node header to the uncompressed buffer
|
||||
|
@ -2820,7 +2811,6 @@ decompress_from_raw_block_into_rbuf(uint8_t *raw_block, size_t raw_block_size, s
|
|||
dump_bad_block(raw_block, raw_block_size);
|
||||
goto exit;
|
||||
}
|
||||
lazy_assert_zero(r);
|
||||
|
||||
toku_trace("decompress done");
|
||||
|
||||
|
@ -2840,7 +2830,7 @@ decompress_from_raw_block_into_rbuf_versioned(uint32_t version, uint8_t *raw_blo
|
|||
r = decompress_from_raw_block_into_rbuf(raw_block, raw_block_size, rb, blocknum);
|
||||
break;
|
||||
default:
|
||||
lazy_assert(false);
|
||||
abort();
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
@ -2886,7 +2876,7 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
|
|||
fprintf(stderr,
|
||||
"Checksum failure while reading raw block in file %s.\n",
|
||||
toku_cachefile_fname_in_env(h->cf));
|
||||
assert(false);
|
||||
abort();
|
||||
} else {
|
||||
r = toku_db_badformat();
|
||||
goto cleanup;
|
||||
|
@ -2949,7 +2939,6 @@ cleanup:
|
|||
return r;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h)
|
||||
{
|
||||
|
@ -2962,7 +2951,7 @@ toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h)
|
|||
struct ftnode_fetch_extra bfe;
|
||||
fill_bfe_for_min_read(&bfe, h);
|
||||
r = deserialize_ftnode_from_fd(fd, h->h->root_blocknum, 0, &unused_node, &unused_ndd,
|
||||
&bfe, &h->h->on_disk_stats);
|
||||
&bfe, &h->h->on_disk_stats);
|
||||
h->in_memory_stats = h->h->on_disk_stats;
|
||||
|
||||
if (unused_node) {
|
||||
|
@ -2974,5 +2963,27 @@ toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h)
|
|||
return r;
|
||||
}
|
||||
|
||||
#undef UPGRADE_STATUS_VALUE
|
||||
int
|
||||
toku_upgrade_msn_from_root_to_header(int fd, FT h)
|
||||
{
|
||||
int r;
|
||||
// 21 was the first version with max_msn_in_ft in the header
|
||||
invariant(h->layout_version_read_from_disk <= FT_LAYOUT_VERSION_20);
|
||||
|
||||
FTNODE node;
|
||||
FTNODE_DISK_DATA ndd;
|
||||
struct ftnode_fetch_extra bfe;
|
||||
fill_bfe_for_min_read(&bfe, h);
|
||||
r = deserialize_ftnode_from_fd(fd, h->h->root_blocknum, 0, &node, &ndd, &bfe, nullptr);
|
||||
if (r != 0) {
|
||||
goto exit;
|
||||
}
|
||||
|
||||
h->h->max_msn_in_ft = node->max_msn_applied_to_node_on_disk;
|
||||
toku_ftnode_free(&node);
|
||||
toku_free(ndd);
|
||||
exit:
|
||||
return r;
|
||||
}
|
||||
|
||||
#undef UPGRADE_STATUS_VALUE
|
||||
|
|
|
@ -281,6 +281,10 @@ enum reactivity {
|
|||
RE_FISSIBLE
|
||||
};
|
||||
|
||||
enum split_mode {
|
||||
SPLIT_EVENLY,
|
||||
SPLIT_LEFT_HEAVY,
|
||||
SPLIT_RIGHT_HEAVY
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
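With the split_mode enum above, a split request now carries a direction. A hedged usage sketch follows: the SPLIT_EVENLY call mirrors the flusher change earlier in this diff, the second call only illustrates the new knob, and h, parent, node, childnum, child and fa are assumed to be in scope at the call sites.

// Ordinary flushing keeps splitting evenly, as before.
ft_split_child(h, parent, childnum, child, SPLIT_EVENLY, fa);

// A caller can also request an uneven split, for example when it expects
// further work to be skewed toward one end of the key range.
toku_ft_split_child(ft, node, childnum, child, SPLIT_RIGHT_HEAVY);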
@ -177,10 +177,10 @@ size_t
|
|||
leafentry_disksize_13(LEAFENTRY_13 le);
|
||||
int
|
||||
toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored data.
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY *new_leafentry_p,
|
||||
OMT omt,
|
||||
struct mempool *mp);
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY *new_leafentry_p,
|
||||
OMT *omtp,
|
||||
struct mempool *mp);
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ lfence: 12.9ns/loop (marginal cost= -0.1ns)
|
|||
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
enum { COUNT = 100000000 };
|
||||
|
||||
|
@ -67,8 +68,8 @@ static inline void sfence (void) {
|
|||
|
||||
int lock_for_lock_and_unlock;
|
||||
static inline void lock_and_unlock (void) {
|
||||
(void)__sync_lock_test_and_set(&lock_for_lock_and_unlock, 1);
|
||||
__sync_lock_release(&lock_for_lock_and_unlock);
|
||||
(void)toku_sync_lock_test_and_set(&lock_for_lock_and_unlock, 1);
|
||||
toku_sync_lock_release(&lock_for_lock_and_unlock);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include <stdio.h>
|
||||
#include <sys/time.h>
|
||||
#include <pthread.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
float tdiff (struct timeval *start, struct timeval *end) {
|
||||
return 1e6*(end->tv_sec-start->tv_sec) +(end->tv_usec - start->tv_usec);
|
||||
|
@ -71,13 +72,13 @@ fetch_and_add_i (volatile int *p, int incr)
|
|||
static inline int
|
||||
gcc_fetch_and_add_i (volatile int *p, int incr)
|
||||
{
|
||||
return __sync_fetch_and_add(p, incr);
|
||||
return toku_sync_fetch_and_add(p, incr);
|
||||
}
|
||||
|
||||
static inline long
|
||||
gcc_fetch_and_add_l (volatile long *p, long incr)
|
||||
{
|
||||
return __sync_fetch_and_add(p, incr);
|
||||
return toku_sync_fetch_and_add(p, incr);
|
||||
}
|
||||
|
||||
// Something wrong with the compiler for longs
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
#include <rdtsc.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
float tdiff (struct timeval *start, struct timeval *end) {
|
||||
return 1e6*(end->tv_sec-start->tv_sec) +(end->tv_usec - start->tv_usec);
|
||||
|
@ -135,12 +136,12 @@ int main(int argc __attribute__((unused)), char **argv)
|
|||
{
|
||||
static int lock_for_lock_and_unlock;
|
||||
t_start = rdtsc();
|
||||
(void)__sync_lock_test_and_set(&lock_for_lock_and_unlock, 1);
|
||||
(void)toku_sync_lock_test_and_set(&lock_for_lock_and_unlock, 1);
|
||||
t_end = rdtsc();
|
||||
printf("sync_lock_test_and_set took %llu clocks\n", t_end-t_start);
|
||||
|
||||
t_start = rdtsc();
|
||||
__sync_lock_release(&lock_for_lock_and_unlock);
|
||||
toku_sync_lock_release(&lock_for_lock_and_unlock);
|
||||
t_end = rdtsc();
|
||||
printf("sync_lock_release took %llu clocks\n", t_end-t_start);
|
||||
}
|
||||
|
@ -148,7 +149,7 @@ int main(int argc __attribute__((unused)), char **argv)
|
|||
|
||||
{
|
||||
t_start = rdtsc();
|
||||
(void)__sync_synchronize();
|
||||
(void)toku_sync_synchronize();
|
||||
t_end = rdtsc();
|
||||
printf("sync_synchornize took %llu clocks\n", t_end-t_start);
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_rollback_log(FT
|
|||
wc.pe_callback = toku_rollback_pe_callback;
|
||||
wc.cleaner_callback = toku_rollback_cleaner_callback;
|
||||
wc.clone_callback = toku_rollback_clone_callback;
|
||||
wc.checkpoint_complete_callback = nullptr;
|
||||
wc.write_extraargs = h;
|
||||
return wc;
|
||||
}
|
||||
|
|
|
@ -84,7 +84,7 @@ void rollback_empty_log_init(ROLLBACK_LOG_NODE log) {
|
|||
log->layout_version_read_from_disk = FT_LAYOUT_VERSION;
|
||||
log->dirty = true;
|
||||
log->sequence = 0;
|
||||
log->previous = {0};
|
||||
log->previous = make_blocknum(0);
|
||||
log->previous_hash = 0;
|
||||
log->oldest_logentry = NULL;
|
||||
log->newest_logentry = NULL;
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <unistd.h>
|
||||
#include "cachetable-test.h"
|
||||
#include "checkpoint.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static int N; // how many items in the table
|
||||
static CACHEFILE cf;
|
||||
|
@ -54,9 +55,9 @@ flush (
|
|||
int *CAST_FROM_VOIDP(v, value);
|
||||
if (*v!=expect_value) printf("got %d expect %d\n", *v, expect_value);
|
||||
assert(*v==expect_value);
|
||||
(void)__sync_fetch_and_add(&n_flush, 1);
|
||||
if (write_me) (void)__sync_fetch_and_add(&n_write_me, 1);
|
||||
if (keep_me) (void)__sync_fetch_and_add(&n_keep_me, 1);
|
||||
(void)toku_sync_fetch_and_add(&n_flush, 1);
|
||||
if (write_me) (void)toku_sync_fetch_and_add(&n_write_me, 1);
|
||||
if (keep_me) (void)toku_sync_fetch_and_add(&n_keep_me, 1);
|
||||
sleep_random();
|
||||
}
|
||||
|
||||
|
|
|
@ -106,7 +106,7 @@ static void cachetable_checkpoint_test(int n, enum cachetable_dirty dirty) {
|
|||
CACHEKEY key = make_blocknum(i);
|
||||
uint32_t hi = toku_cachetable_hash(f1, key);
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, hi, PL_WRITE_EXPENSIVE, &v);
|
||||
if (r != 0)
|
||||
continue;
|
||||
r = toku_test_cachetable_unpin(f1, key, hi, CACHETABLE_CLEAN, make_pair_attr(item_size));
|
||||
|
|
|
@ -193,6 +193,7 @@ void checkpointer_test::add_pairs(struct cachefile *cf,
|
|||
attr.cache_pressure_size = 0;
|
||||
attr.is_valid = true;
|
||||
CACHETABLE_WRITE_CALLBACK cb;
|
||||
ZERO_STRUCT(cb); // All nullptr
|
||||
|
||||
for (uint32_t i = k; i < count + k; ++i) {
|
||||
CACHEKEY key;
|
||||
|
@ -201,12 +202,12 @@ void checkpointer_test::add_pairs(struct cachefile *cf,
|
|||
pair_init(&(pairs[i]),
|
||||
cf,
|
||||
key,
|
||||
NULL,
|
||||
nullptr,
|
||||
attr,
|
||||
CACHETABLE_CLEAN,
|
||||
full_hash,
|
||||
cb,
|
||||
NULL,
|
||||
nullptr,
|
||||
m_cp.m_list);
|
||||
|
||||
m_cp.m_list->put(&pairs[i]);
|
||||
|
|
|
@ -27,7 +27,8 @@ cachetable_count_pinned_test (int n) {
|
|||
assert(toku_cachefile_count_pinned(f1, 0) == i);
|
||||
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
|
||||
assert(r == -1);
|
||||
assert(toku_cachefile_count_pinned(f1, 0) == i);
|
||||
|
||||
|
|
|
@ -43,12 +43,12 @@ test_cachetable_def_flush (int n) {
|
|||
uint32_t hi;
|
||||
void *v;
|
||||
hi = toku_cachetable_hash(f1, make_blocknum(i));
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
assert(r == 0 && v == (void *)(long)i);
|
||||
r = toku_test_cachetable_unpin(f1, make_blocknum(i), hi, CACHETABLE_CLEAN, make_pair_attr(1));
|
||||
assert(r == 0);
|
||||
hi = toku_cachetable_hash(f2, make_blocknum(i));
|
||||
r = toku_cachetable_maybe_get_and_pin(f2, make_blocknum(i), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f2, make_blocknum(i), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
assert(r == 0 && v == (void *)(long)i);
|
||||
r = toku_test_cachetable_unpin(f2, make_blocknum(i), hi, CACHETABLE_CLEAN, make_pair_attr(1));
|
||||
assert(r == 0);
|
||||
|
@ -63,10 +63,10 @@ test_cachetable_def_flush (int n) {
|
|||
uint32_t hi;
|
||||
void *v;
|
||||
hi = toku_cachetable_hash(f1, make_blocknum(i));
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
assert(r != 0);
|
||||
hi = toku_cachetable_hash(f2, make_blocknum(i));
|
||||
r = toku_cachetable_maybe_get_and_pin(f2, make_blocknum(i), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f2, make_blocknum(i), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
assert(r == 0);
|
||||
r = toku_test_cachetable_unpin(f2, make_blocknum(i), hi, CACHETABLE_CLEAN, make_pair_attr(1));
|
||||
assert(r == 0);
|
||||
|
|
|
@ -122,7 +122,7 @@ static void cachetable_prefetch_checkpoint_test(int n, enum cachetable_dirty dir
|
|||
CACHEKEY key = make_blocknum(i);
|
||||
uint32_t hi = toku_cachetable_hash(f1, key);
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, hi, PL_WRITE_EXPENSIVE, &v);
|
||||
if (r != 0)
|
||||
continue;
|
||||
r = toku_test_cachetable_unpin(f1, key, hi, CACHETABLE_CLEAN, make_pair_attr(item_size));
|
||||
|
|
|
@ -51,7 +51,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) {
|
|||
int i;
|
||||
for (i=1; i>=0; i++) {
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, fullhash, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, fullhash, PL_WRITE_EXPENSIVE, &v);
|
||||
if (r == 0) break;
|
||||
toku_pthread_yield();
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) {
|
|||
int i;
|
||||
for (i=1; i>=0; i++) {
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, fullhash, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, key, fullhash, PL_WRITE_EXPENSIVE, &v);
|
||||
if (r == 0) break;
|
||||
toku_pthread_yield();
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ cachetable_put_test (int n) {
|
|||
assert(toku_cachefile_count_pinned(f1, 0) == i);
|
||||
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
assert(r == -1);
|
||||
assert(toku_cachefile_count_pinned(f1, 0) == i);
|
||||
|
||||
|
|
|
@ -26,37 +26,37 @@ cachetable_test (void) {
|
|||
void* v1;
|
||||
long s1;
|
||||
// nothing in cachetable, so this should fail
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==-1);
|
||||
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL);
|
||||
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
|
||||
|
||||
// maybe_get_and_pin_clean should succeed, maybe_get_and_pin should fail
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==-1);
|
||||
r = toku_cachetable_maybe_get_and_pin_clean(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin_clean(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r == 0);
|
||||
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8));
|
||||
// maybe_get_and_pin_clean should succeed, maybe_get_and_pin should fail
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==0);
|
||||
// now these calls should fail because the node is already pinned, and therefore in use
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==-1);
|
||||
r = toku_cachetable_maybe_get_and_pin_clean(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin_clean(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==-1);
|
||||
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8));
|
||||
|
||||
// sanity check, this should still succeed, because the PAIR is dirty
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==0);
|
||||
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8));
|
||||
CHECKPOINTER cp = toku_cachetable_get_checkpointer(ct);
|
||||
toku_cachetable_begin_checkpoint(cp, NULL);
|
||||
// now these should fail, because the node should be pending a checkpoint
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==-1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, PL_WRITE_EXPENSIVE, &v1);
|
||||
assert(r==-1);
|
||||
toku_cachetable_end_checkpoint(
|
||||
cp,
|
||||
|
|
|
@ -105,7 +105,7 @@ static void test_nested_pin (void) {
|
|||
assert(i0==0);
|
||||
r = toku_test_cachetable_unpin(f, make_blocknum(1), f1hash, CACHETABLE_CLEAN, make_pair_attr(test_object_size));
|
||||
assert(r==0);
|
||||
r = toku_cachetable_maybe_get_and_pin(f, make_blocknum(1), f1hash, &vv2);
|
||||
r = toku_cachetable_maybe_get_and_pin(f, make_blocknum(1), f1hash, PL_WRITE_EXPENSIVE, &vv2);
|
||||
assert(r==0);
|
||||
assert(vv2==vv);
|
||||
r = toku_test_cachetable_unpin(f, make_blocknum(1), f1hash, CACHETABLE_CLEAN, make_pair_attr(test_object_size));
|
||||
|
|
|
@ -63,7 +63,7 @@ cachetable_unpin_and_remove_test (int n) {
|
|||
|
||||
// verify that k is removed
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(testkeys[i].b), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(testkeys[i].b), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
assert(r != 0);
|
||||
|
||||
testkeys[i] = testkeys[nkeys-1]; nkeys -= 1;
|
||||
|
|
|
@ -27,7 +27,7 @@ cachetable_unpin_test (int n) {
|
|||
assert(toku_cachefile_count_pinned(f1, 0) == i);
|
||||
|
||||
void *v;
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, &v);
|
||||
r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(i), hi, PL_WRITE_EXPENSIVE, &v);
|
||||
assert(r == -1);
|
||||
assert(toku_cachefile_count_pinned(f1, 0) == i);
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
|
||||
#include "includes.h"
|
||||
#include "test.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
|
||||
static int total_size;
|
||||
|
@ -25,7 +26,7 @@ flush (CACHEFILE f __attribute__((__unused__)),
|
|||
bool UU(is_clone)
|
||||
) {
|
||||
if (w) {
|
||||
int curr_size = __sync_fetch_and_sub(&total_size, 1);
|
||||
int curr_size = toku_sync_fetch_and_sub(&total_size, 1);
|
||||
assert(curr_size <= 200);
|
||||
usleep(500*1000);
|
||||
}
|
||||
|
@ -49,7 +50,7 @@ cachetable_test (void) {
|
|||
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
|
||||
wc.flush_callback = flush;
|
||||
toku_cachetable_put(f1, make_blocknum(i), i, NULL, make_pair_attr(1), wc, put_callback_nop);
|
||||
int curr_size = __sync_fetch_and_add(&total_size, 1);
|
||||
int curr_size = toku_sync_fetch_and_add(&total_size, 1);
|
||||
assert(curr_size <= test_limit + test_limit/2+1);
|
||||
r = toku_test_cachetable_unpin(f1, make_blocknum(i), i, CACHETABLE_DIRTY, make_pair_attr(4));
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#ifndef FTLOADER_ERROR_INJECTOR_H
|
||||
#define FTLOADER_ERROR_INJECTOR_H
|
||||
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static toku_mutex_t event_mutex = TOKU_MUTEX_INITIALIZER;
|
||||
static void lock_events(void) {
|
||||
|
@ -107,9 +108,9 @@ static void reset_my_malloc_counts(void) {
|
|||
|
||||
__attribute__((__unused__))
|
||||
static void *my_malloc(size_t n) {
|
||||
(void) __sync_fetch_and_add(&my_malloc_count, 1); // my_malloc_count++;
|
||||
(void) toku_sync_fetch_and_add(&my_malloc_count, 1); // my_malloc_count++;
|
||||
if (n >= my_big_malloc_limit) {
|
||||
(void) __sync_fetch_and_add(&my_big_malloc_count, 1); // my_big_malloc_count++;
|
||||
(void) toku_sync_fetch_and_add(&my_big_malloc_count, 1); // my_big_malloc_count++;
|
||||
if (do_malloc_errors) {
|
||||
if (event_add_and_fetch() == event_count_trigger) {
|
||||
event_hit();
|
||||
|
@ -125,9 +126,9 @@ static int do_realloc_errors = 0;
|
|||
|
||||
__attribute__((__unused__))
|
||||
static void *my_realloc(void *p, size_t n) {
|
||||
(void) __sync_fetch_and_add(&my_realloc_count, 1); // my_realloc_count++;
|
||||
(void) toku_sync_fetch_and_add(&my_realloc_count, 1); // my_realloc_count++;
|
||||
if (n >= my_big_malloc_limit) {
|
||||
(void) __sync_fetch_and_add(&my_big_realloc_count, 1); // my_big_realloc_count++;
|
||||
(void) toku_sync_fetch_and_add(&my_big_realloc_count, 1); // my_big_realloc_count++;
|
||||
if (do_realloc_errors) {
|
||||
if (event_add_and_fetch() == event_count_trigger) {
|
||||
event_hit();
|
||||
|
|
|
@ -82,8 +82,8 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) {
|
|||
struct ftstat64_s s;
|
||||
toku_ft_handle_stat64(t, null_txn, &s);
|
||||
|
||||
assert(0 < s.nkeys && s.nkeys < limit);
|
||||
assert(0 < s.dsize && s.dsize < limit * (9 + 9)); // keylen = 9, vallen = 9
|
||||
assert(0 < s.nkeys && s.nkeys <= limit);
|
||||
assert(0 < s.dsize && s.dsize <= limit * (9 + 9)); // keylen = 9, vallen = 9
|
||||
}
|
||||
|
||||
maybe_reopen(ms, limit);
|
||||
|
|
|
@ -138,7 +138,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
|
|||
// set the new root to point to the new tree
|
||||
toku_ft_set_new_root_blocknum(brt->ft, newroot->thisnodename);
|
||||
|
||||
newroot->max_msn_applied_to_node_on_disk = last_dummymsn(); // capture msn of last message injected into tree
|
||||
brt->ft->h->max_msn_in_ft = last_dummymsn(); // capture msn of last message injected into tree
|
||||
|
||||
// unpin the new root
|
||||
toku_unpin_ftnode(brt->ft, newroot);
|
||||
|
|
|
@ -45,10 +45,10 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
|
|||
|
||||
// apply an insert to the leaf node
|
||||
MSN msn = next_dummymsn();
|
||||
brt->ft->h->max_msn_in_ft = msn;
|
||||
FT_MSG_S cmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &theval }} };
|
||||
|
||||
uint64_t workdone=0;
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, &cmd, &workdone, NULL);
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd, nullptr, nullptr);
|
||||
{
|
||||
int r = toku_ft_lookup(brt, &thekey, lookup_checkf, &pair);
|
||||
assert(r==0);
|
||||
|
@ -56,8 +56,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
|
|||
}
|
||||
|
||||
FT_MSG_S badcmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval }} };
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, &badcmd, &workdone, NULL);
|
||||
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &badcmd, nullptr, nullptr);
|
||||
|
||||
// message should be rejected for duplicate msn, row should still have original val
|
||||
{
|
||||
|
@ -68,8 +67,9 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
|
|||
|
||||
// now verify that message with proper msn gets through
|
||||
msn = next_dummymsn();
|
||||
brt->ft->h->max_msn_in_ft = msn;
|
||||
FT_MSG_S cmd2 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &val2 }} };
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, &cmd2, &workdone, NULL);
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd2, nullptr, nullptr);
|
||||
|
||||
// message should be accepted, val should have new value
|
||||
{
|
||||
|
@ -81,7 +81,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
|
|||
// now verify that message with lesser (older) msn is rejected
|
||||
msn.msn = msn.msn - 10;
|
||||
FT_MSG_S cmd3 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval } }};
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, &cmd3, &workdone, NULL);
|
||||
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd3, nullptr, nullptr);
|
||||
|
||||
// message should be rejected, val should still have value in pair2
|
||||
{
|
||||
|
|
|
@ -580,7 +580,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
|
|||
if (make_leaf_up_to_date) {
|
||||
for (i = 0; i < num_parent_messages; ++i) {
|
||||
if (!parent_messages_is_fresh[i]) {
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, parent_messages[i], NULL, NULL);
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], NULL, NULL);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 8; ++i) {
|
||||
|
@ -803,7 +803,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
|
|||
for (i = 0; i < num_parent_messages; ++i) {
|
||||
if (dummy_cmp(NULL, parent_messages[i]->u.id.key, &childkeys[7]) <= 0 &&
|
||||
!parent_messages_is_fresh[i]) {
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, parent_messages[i], NULL, NULL);
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], NULL, NULL);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 8; ++i) {
|
||||
|
@ -995,8 +995,8 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
|
|||
if (make_leaf_up_to_date) {
|
||||
for (i = 0; i < num_parent_messages; ++i) {
|
||||
if (!parent_messages_is_fresh[i]) {
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child1, parent_messages[i], NULL, NULL);
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child2, parent_messages[i], NULL, NULL);
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child1, -1, parent_messages[i], NULL, NULL);
|
||||
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child2, -1, parent_messages[i], NULL, NULL);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 8; ++i) {
|
||||
|
|
|
@ -228,7 +228,8 @@ static UU() CACHETABLE_WRITE_CALLBACK def_write_callback(void* write_extraargs)
|
|||
wc.pe_callback = def_pe_callback;
|
||||
wc.cleaner_callback = def_cleaner_callback;
|
||||
wc.write_extraargs = write_extraargs;
|
||||
wc.clone_callback = NULL;
|
||||
wc.clone_callback = nullptr;
|
||||
wc.checkpoint_complete_callback = nullptr;
|
||||
return wc;
|
||||
}
|
||||
|
||||
|
|
|
@ -171,7 +171,7 @@ test_split_on_boundary(void)
|
|||
FTNODE nodea, nodeb;
|
||||
DBT splitk;
|
||||
// if we haven't done it right, we should hit the assert in the top of move_leafentries
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, 0, NULL);
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, SPLIT_EVENLY, 0, NULL);
|
||||
|
||||
verify_basement_node_msns(nodea, dummy_msn_3884);
|
||||
verify_basement_node_msns(nodeb, dummy_msn_3884);
|
||||
|
@ -244,7 +244,7 @@ test_split_with_everything_on_the_left(void)
|
|||
FTNODE nodea, nodeb;
|
||||
DBT splitk;
|
||||
// if we haven't done it right, we should hit the assert in the top of move_leafentries
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, 0, NULL);
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, SPLIT_EVENLY, 0, NULL);
|
||||
|
||||
toku_unpin_ftnode(brt->ft, nodeb);
|
||||
r = toku_close_ft_handle_nolsn(brt, NULL); assert(r == 0);
|
||||
|
@ -319,7 +319,7 @@ test_split_on_boundary_of_last_node(void)
|
|||
FTNODE nodea, nodeb;
|
||||
DBT splitk;
|
||||
// if we haven't done it right, we should hit the assert in the top of move_leafentries
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, 0, NULL);
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, SPLIT_EVENLY, 0, NULL);
|
||||
|
||||
toku_unpin_ftnode(brt->ft, nodeb);
|
||||
r = toku_close_ft_handle_nolsn(brt, NULL); assert(r == 0);
|
||||
|
@ -387,7 +387,7 @@ test_split_at_begin(void)
|
|||
FTNODE nodea, nodeb;
|
||||
DBT splitk;
|
||||
// if we haven't done it right, we should hit the assert in the top of move_leafentries
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, 0, NULL);
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, SPLIT_EVENLY, 0, NULL);
|
||||
|
||||
toku_unpin_ftnode(brt->ft, nodeb);
|
||||
r = toku_close_ft_handle_nolsn(brt, NULL); assert(r == 0);
|
||||
|
@ -451,7 +451,7 @@ test_split_at_end(void)
|
|||
FTNODE nodea, nodeb;
|
||||
DBT splitk;
|
||||
// if we haven't done it right, we should hit the assert in the top of move_leafentries
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, 0, NULL);
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, SPLIT_EVENLY, 0, NULL);
|
||||
|
||||
toku_unpin_ftnode(brt->ft, nodeb);
|
||||
r = toku_close_ft_handle_nolsn(brt, NULL); assert(r == 0);
|
||||
|
@ -505,7 +505,7 @@ test_split_odd_nodes(void)
|
|||
FTNODE nodea, nodeb;
|
||||
DBT splitk;
|
||||
// if we haven't done it right, we should hit the assert in the top of move_leafentries
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, 0, NULL);
|
||||
ftleaf_split(brt->ft, &sn, &nodea, &nodeb, &splitk, true, SPLIT_EVENLY, 0, NULL);
|
||||
|
||||
verify_basement_node_msns(nodea, dummy_msn_3884);
|
||||
verify_basement_node_msns(nodeb, dummy_msn_3884);
|
||||
|
|
|
@ -62,7 +62,7 @@ void le_unpack(ULE ule, LEAFENTRY le);
|
|||
int le_pack(ULE ule, // data to be packed into new leafentry
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY * const new_leafentry_p, // this is what this function creates
|
||||
OMT omt,
|
||||
OMT *omtp,
|
||||
struct mempool *mp,
|
||||
void **maybe_free);
|
ft/ule.cc (40 changed lines)
|
@ -154,11 +154,11 @@ static inline size_t uxr_unpack_length_and_bit(UXR uxr, uint8_t *p);
|
|||
static inline size_t uxr_unpack_data(UXR uxr, uint8_t *p);
|
||||
|
||||
static void *
|
||||
le_malloc(OMT omt, struct mempool *mp, size_t size, void **maybe_free)
|
||||
le_malloc(OMT *omtp, struct mempool *mp, size_t size, void **maybe_free)
|
||||
{
|
||||
void * rval;
|
||||
if (omt)
|
||||
rval = mempool_malloc_from_omt(omt, mp, size, maybe_free);
|
||||
if (omtp)
|
||||
rval = mempool_malloc_from_omt(omtp, mp, size, maybe_free);
|
||||
else
|
||||
rval = toku_xmalloc(size);
|
||||
resource_assert(rval);
|
||||
|
@ -319,12 +319,12 @@ done:;
|
|||
// As of October 2011, this function always returns 0.
|
||||
int
|
||||
apply_msg_to_leafentry(FT_MSG msg, // message to apply to leafentry
|
||||
LEAFENTRY old_leafentry, // NULL if there was no stored data.
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY *new_leafentry_p,
|
||||
OMT omt,
|
||||
struct mempool *mp,
|
||||
void **maybe_free,
|
||||
LEAFENTRY old_leafentry, // NULL if there was no stored data.
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY *new_leafentry_p,
|
||||
OMT *omtp,
|
||||
struct mempool *mp,
|
||||
void **maybe_free,
|
||||
int64_t * numbytes_delta_p) { // change in total size of key and val, not including any overhead
|
||||
ULE_S ule;
|
||||
int rval;
|
||||
|
@ -339,11 +339,11 @@ apply_msg_to_leafentry(FT_MSG msg, // message to apply to leafentry
|
|||
}
|
||||
msg_modify_ule(&ule, msg); // modify unpacked leafentry
|
||||
rval = le_pack(&ule, // create packed leafentry
|
||||
new_leafentry_memorysize,
|
||||
new_leafentry_p,
|
||||
omt,
|
||||
mp,
|
||||
maybe_free);
|
||||
new_leafentry_memorysize,
|
||||
new_leafentry_p,
|
||||
omtp,
|
||||
mp,
|
||||
maybe_free);
|
||||
if (new_leafentry_p)
|
||||
newnumbytes = ule_get_innermost_numbytes(&ule);
|
||||
*numbytes_delta_p = newnumbytes - oldnumbytes;
|
||||
|
@ -374,7 +374,7 @@ int
|
|||
garbage_collect_leafentry(LEAFENTRY old_leaf_entry,
|
||||
LEAFENTRY *new_leaf_entry,
|
||||
size_t *new_leaf_entry_memory_size,
|
||||
OMT omt,
|
||||
OMT *omtp,
|
||||
struct mempool *mp,
|
||||
void **maybe_free,
|
||||
const xid_omt_t &snapshot_xids,
|
||||
|
@ -387,7 +387,7 @@ garbage_collect_leafentry(LEAFENTRY old_leaf_entry,
|
|||
r = le_pack(&ule,
|
||||
new_leaf_entry_memory_size,
|
||||
new_leaf_entry,
|
||||
omt,
|
||||
omtp,
|
||||
mp,
|
||||
maybe_free);
|
||||
assert(r == 0);
|
||||
|
@ -713,7 +713,7 @@ int
|
|||
le_pack(ULE ule, // data to be packed into new leafentry
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY * const new_leafentry_p, // this is what this function creates
|
||||
OMT omt,
|
||||
OMT *omtp,
|
||||
struct mempool *mp,
|
||||
void **maybe_free)
|
||||
{
|
||||
|
@ -740,7 +740,7 @@ le_pack(ULE ule, // data to be packed into new leafen
|
|||
found_insert:;
|
||||
memsize = le_memsize_from_ule(ule);
|
||||
LEAFENTRY new_leafentry;
|
||||
CAST_FROM_VOIDP(new_leafentry, le_malloc(omt, mp, memsize, maybe_free));
|
||||
CAST_FROM_VOIDP(new_leafentry, le_malloc(omtp, mp, memsize, maybe_free));
|
||||
|
||||
//Universal data
|
||||
new_leafentry->keylen = toku_htod32(ule->keylen);
|
||||
|
@ -2293,7 +2293,7 @@ int
|
|||
toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry,
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY *new_leafentry_p,
|
||||
OMT omt,
|
||||
OMT *omtp,
|
||||
struct mempool *mp) {
|
||||
ULE_S ule;
|
||||
int rval;
|
||||
|
@ -2305,7 +2305,7 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry,
|
|||
rval = le_pack(&ule, // create packed leafentry
|
||||
new_leafentry_memorysize,
|
||||
new_leafentry_p,
|
||||
omt, mp, NULL);
|
||||
omtp, mp, NULL);
|
||||
ule_cleanup(&ule);
|
||||
return rval;
|
||||
}
|
||||
|
|
ft/ule.h (14 changed lines)
|
@ -53,18 +53,18 @@ void fast_msg_to_leafentry(
|
|||
LEAFENTRY *new_leafentry_p) ;
|
||||
|
||||
int apply_msg_to_leafentry(FT_MSG msg,
|
||||
LEAFENTRY old_leafentry, // NULL if there was no stored data.
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY *new_leafentry_p,
|
||||
OMT omt,
|
||||
struct mempool *mp,
|
||||
void **maybe_free,
|
||||
LEAFENTRY old_leafentry, // NULL if there was no stored data.
|
||||
size_t *new_leafentry_memorysize,
|
||||
LEAFENTRY *new_leafentry_p,
|
||||
OMT *omtp,
|
||||
struct mempool *mp,
|
||||
void **maybe_free,
|
||||
int64_t * numbytes_delta_p);
|
||||
|
||||
int garbage_collect_leafentry(LEAFENTRY old_leaf_entry,
|
||||
LEAFENTRY *new_leaf_entry,
|
||||
size_t *new_leaf_entry_memory_size,
|
||||
OMT omt,
|
||||
OMT *omtp,
|
||||
struct mempool *mp,
|
||||
void **maybe_free,
|
||||
const xid_omt_t &snapshot_xids,
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <errno.h>
|
||||
|
||||
#include "threadpool.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
// use gcc builtin fetch_and_add 0->no 1->yes
|
||||
#define DO_ATOMIC_FETCH_AND_ADD 0
|
||||
|
@ -61,7 +62,7 @@ void threadpool_maybe_add(THREADPOOL threadpool, void *(*f)(void *), void *arg)
|
|||
|
||||
void threadpool_set_thread_busy(THREADPOOL threadpool) {
|
||||
#if DO_ATOMIC_FETCH_AND_ADD
|
||||
(void) __sync_fetch_and_add(&threadpool->busy_threads, 1);
|
||||
(void) toku_sync_fetch_and_add(&threadpool->busy_threads, 1);
|
||||
#else
|
||||
threadpool->busy_threads++;
|
||||
#endif
|
||||
|
@ -69,7 +70,7 @@ void threadpool_set_thread_busy(THREADPOOL threadpool) {
|
|||
|
||||
void threadpool_set_thread_idle(THREADPOOL threadpool) {
|
||||
#if DO_ATOMIC_FETCH_AND_ADD
|
||||
(void) __sync_fetch_and_add(&threadpool->busy_threads, -1);
|
||||
(void) toku_sync_fetch_and_add(&threadpool->busy_threads, -1);
|
||||
#else
|
||||
threadpool->busy_threads--;
|
||||
#endif
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "memory.h"
|
||||
#include "toku_time.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static int toku_assert_on_write_enospc = 0;
|
||||
static const int toku_write_enospc_sleep = 1;
|
||||
|
@ -60,8 +61,8 @@ try_again_after_handling_write_error(int fd, size_t len, ssize_t r_write) {
|
|||
int out_of_disk_space = 1;
|
||||
assert(!out_of_disk_space); //Give an error message that might be useful if this is the only one that survives.
|
||||
} else {
|
||||
__sync_fetch_and_add(&toku_write_enospc_total, 1);
|
||||
__sync_fetch_and_add(&toku_write_enospc_current, 1);
|
||||
toku_sync_fetch_and_add(&toku_write_enospc_total, 1);
|
||||
toku_sync_fetch_and_add(&toku_write_enospc_current, 1);
|
||||
|
||||
time_t tnow = time(0);
|
||||
toku_write_enospc_last_time = tnow;
|
||||
|
@ -89,7 +90,7 @@ try_again_after_handling_write_error(int fd, size_t len, ssize_t r_write) {
|
|||
}
|
||||
sleep(toku_write_enospc_sleep);
|
||||
try_again = 1;
|
||||
__sync_fetch_and_sub(&toku_write_enospc_current, 1);
|
||||
toku_sync_fetch_and_sub(&toku_write_enospc_current, 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -347,9 +348,9 @@ static void file_fsync_internal (int fd, uint64_t *duration_p) {
|
|||
assert(get_error_errno() == EINTR);
|
||||
}
|
||||
}
|
||||
__sync_fetch_and_add(&toku_fsync_count, 1);
|
||||
toku_sync_fetch_and_add(&toku_fsync_count, 1);
|
||||
uint64_t duration = toku_current_time_usec() - tstart;
|
||||
__sync_fetch_and_add(&toku_fsync_time, duration);
|
||||
toku_sync_fetch_and_add(&toku_fsync_time, duration);
|
||||
if (duration_p) {
|
||||
*duration_p = duration;
|
||||
}
|
||||
|
@ -383,8 +384,8 @@ int toku_fsync_dir_by_name_without_accounting(const char *dir_name) {
|
|||
void toku_file_fsync(int fd) {
|
||||
uint64_t duration;
|
||||
file_fsync_internal (fd, &duration);
|
||||
__sync_fetch_and_add(&sched_fsync_count, 1);
|
||||
__sync_fetch_and_add(&sched_fsync_time, duration);
|
||||
toku_sync_fetch_and_add(&sched_fsync_count, 1);
|
||||
toku_sync_fetch_and_add(&sched_fsync_time, duration);
|
||||
}
|
||||
|
||||
// for real accounting
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <toku_race_tools.h>
|
||||
#include "memory.h"
|
||||
#include "toku_assert.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static malloc_fun_t t_malloc = 0;
|
||||
static malloc_fun_t t_xmalloc = 0;
|
||||
|
@ -120,7 +121,7 @@ set_max(uint64_t sum_used, uint64_t sum_freed) {
|
|||
do {
|
||||
old_max = status.max_in_use;
|
||||
} while (old_max < in_use &&
|
||||
!__sync_bool_compare_and_swap(&status.max_in_use, old_max, in_use));
|
||||
!toku_sync_bool_compare_and_swap(&status.max_in_use, old_max, in_use));
|
||||
}
|
||||
}
|
||||
|
||||
|
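set_max() above records a running maximum without taking a lock: snapshot the current maximum, then retry the compare-and-swap until either the snapshot is already large enough or the swap lands. The same pattern in isolation, written against the wrapper this commit introduces (the variable names here are illustrative, not the real status fields):

    #include <portability/toku_atomic.h>
    #include <stdint.h>

    static uint64_t max_in_use;   // stand-in for status.max_in_use

    static void note_usage(uint64_t in_use) {
        uint64_t old_max;
        do {
            old_max = max_in_use;             // racy read; the CAS below validates it
        } while (old_max < in_use &&
                 !toku_sync_bool_compare_and_swap(&max_in_use, old_max, in_use));
    }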
@ -150,13 +151,13 @@ toku_malloc(size_t size) {
|
|||
TOKU_ANNOTATE_NEW_MEMORY(p, size); // see #4671 and https://bugs.kde.org/show_bug.cgi?id=297147
|
||||
if (toku_memory_do_stats) {
|
||||
size_t used = my_malloc_usable_size(p);
|
||||
__sync_add_and_fetch(&status.malloc_count, 1);
|
||||
__sync_add_and_fetch(&status.requested,size);
|
||||
__sync_add_and_fetch(&status.used, used);
|
||||
toku_sync_add_and_fetch(&status.malloc_count, 1);
|
||||
toku_sync_add_and_fetch(&status.requested,size);
|
||||
toku_sync_add_and_fetch(&status.used, used);
|
||||
set_max(status.used, status.freed);
|
||||
}
|
||||
} else {
|
||||
__sync_add_and_fetch(&status.malloc_fail, 1);
|
||||
toku_sync_add_and_fetch(&status.malloc_fail, 1);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
@ -176,14 +177,14 @@ toku_realloc(void *p, size_t size) {
|
|||
if (q) {
|
||||
if (toku_memory_do_stats) {
|
||||
size_t used = my_malloc_usable_size(q);
|
||||
__sync_add_and_fetch(&status.realloc_count, 1);
|
||||
__sync_add_and_fetch(&status.requested, size);
|
||||
__sync_add_and_fetch(&status.used, used);
|
||||
__sync_add_and_fetch(&status.freed, used_orig);
|
||||
toku_sync_add_and_fetch(&status.realloc_count, 1);
|
||||
toku_sync_add_and_fetch(&status.requested, size);
|
||||
toku_sync_add_and_fetch(&status.used, used);
|
||||
toku_sync_add_and_fetch(&status.freed, used_orig);
|
||||
set_max(status.used, status.freed);
|
||||
}
|
||||
} else {
|
||||
__sync_add_and_fetch(&status.realloc_fail, 1);
|
||||
toku_sync_add_and_fetch(&status.realloc_fail, 1);
|
||||
}
|
||||
return q;
|
||||
}
|
||||
|
@ -205,8 +206,8 @@ toku_free(void *p) {
|
|||
if (p) {
|
||||
if (toku_memory_do_stats) {
|
||||
size_t used = my_malloc_usable_size(p);
|
||||
__sync_add_and_fetch(&status.free_count, 1);
|
||||
__sync_add_and_fetch(&status.freed, used);
|
||||
toku_sync_add_and_fetch(&status.free_count, 1);
|
||||
toku_sync_add_and_fetch(&status.freed, used);
|
||||
}
|
||||
if (t_free)
|
||||
t_free(p);
|
||||
|
@ -228,9 +229,9 @@ toku_xmalloc(size_t size) {
|
|||
TOKU_ANNOTATE_NEW_MEMORY(p, size); // see #4671 and https://bugs.kde.org/show_bug.cgi?id=297147
|
||||
if (toku_memory_do_stats) {
|
||||
size_t used = my_malloc_usable_size(p);
|
||||
__sync_add_and_fetch(&status.malloc_count, 1);
|
||||
__sync_add_and_fetch(&status.requested, size);
|
||||
__sync_add_and_fetch(&status.used, used);
|
||||
toku_sync_add_and_fetch(&status.malloc_count, 1);
|
||||
toku_sync_add_and_fetch(&status.requested, size);
|
||||
toku_sync_add_and_fetch(&status.used, used);
|
||||
set_max(status.used, status.freed);
|
||||
}
|
||||
return p;
|
||||
|
@ -252,10 +253,10 @@ toku_xrealloc(void *v, size_t size) {
|
|||
resource_assert(p);
|
||||
if (toku_memory_do_stats) {
|
||||
size_t used = my_malloc_usable_size(p);
|
||||
__sync_add_and_fetch(&status.realloc_count, 1);
|
||||
__sync_add_and_fetch(&status.requested, size);
|
||||
__sync_add_and_fetch(&status.used, used);
|
||||
__sync_add_and_fetch(&status.freed, used_orig);
|
||||
toku_sync_add_and_fetch(&status.realloc_count, 1);
|
||||
toku_sync_add_and_fetch(&status.requested, size);
|
||||
toku_sync_add_and_fetch(&status.used, used);
|
||||
toku_sync_add_and_fetch(&status.freed, used_orig);
|
||||
set_max(status.used, status.freed);
|
||||
}
|
||||
return p;
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
#include "toku_os.h"
|
||||
#include "toku_time.h"
|
||||
#include "memory.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
#include <util/partitioned_counter.h>
|
||||
|
||||
int
|
||||
|
|
|
@ -5,6 +5,7 @@ if(BUILD_TESTING)
|
|||
file(GLOB srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" *.cc)
|
||||
foreach(src ${srcs})
|
||||
get_filename_component(base ${src} NAME_WE)
|
||||
set_property(SOURCE ${src} APPEND PROPERTY COMPILE_DEFINITIONS "__SRCFILE__=\"${CMAKE_CURRENT_SOURCE_DIR}/${src}\"")
|
||||
list(APPEND tests ${base})
|
||||
endforeach(src)
|
||||
portability/tests/test-cache-line-boundary-fails.cc (new file, 88 lines)

@@ -0,0 +1,88 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

#include <config.h>
#include <memory.h>
#include <portability/toku_atomic.h>
#include "test.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>

int verbose = 0;

static const size_t cachelinesize = 64;

// cache line is 64 bytes
// nine 7-byte structs fill 63 bytes
// the tenth spans one byte of the first cache line and six of the next cache line
// we first SFAA the first 9 structs and ensure we don't crash, then we set a signal handler and SFAA the 10th and ensure we do crash

struct unpackedsevenbytestruct {
    uint32_t i;
    char pad[3];
};
struct __attribute__((packed)) packedsevenbytestruct {
    uint32_t i;
    char pad[3];
};

struct packedsevenbytestruct *psevenbytestructs;
static __attribute__((__noreturn__)) void catch_abort (int sig __attribute__((__unused__))) {
    toku_free(psevenbytestructs);
#ifdef TOKU_DEBUG_PARANOID
    exit(EXIT_SUCCESS); // with paranoid asserts, we expect to assert and reach this handler
#else
    exit(EXIT_FAILURE); // we should not have crashed without paranoid asserts
#endif
}

int test_main(int UU(argc), char *const argv[] UU()) {
    if (sizeof(unpackedsevenbytestruct) != 8) {
        exit(EXIT_FAILURE);
    }
    if (sizeof(packedsevenbytestruct) != 7) {
        exit(EXIT_FAILURE);
    }

    {
        struct unpackedsevenbytestruct *usevenbytestructs;
        int r = posix_memalign((void **) &usevenbytestructs, cachelinesize, sizeof(unpackedsevenbytestruct) * 10);
        if (r) {
            // this test is supposed to crash, so exiting cleanly is a failure
            perror("posix_memalign");
            exit(EXIT_FAILURE);
        }

        for (int idx = 0; idx < 10; ++idx) {
            usevenbytestructs[idx].i = idx + 1;
            (void) toku_sync_fetch_and_add(&usevenbytestructs[idx].i, 32U - idx);
        }
        toku_free(usevenbytestructs);
    }

    int r = posix_memalign((void **) &psevenbytestructs, cachelinesize, sizeof(packedsevenbytestruct) * 10);
    if (r) {
        // this test is supposed to crash, so exiting cleanly is a failure
        perror("posix_memalign");
        exit(EXIT_FAILURE);
    }

    for (int idx = 0; idx < 9; ++idx) {
        psevenbytestructs[idx].i = idx + 1;
        (void) toku_sync_fetch_and_add(&psevenbytestructs[idx].i, 32U - idx);
    }
    psevenbytestructs[9].i = 10;
    signal(SIGABRT, catch_abort);
    (void) toku_sync_fetch_and_add(&psevenbytestructs[9].i, 32U);

#ifdef TOKU_DEBUG_PARANOID
    exit(EXIT_FAILURE); // with paranoid asserts, we should already have crashed
#else
    exit(EXIT_SUCCESS); // without them, we should make it here
#endif
}
|
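As a standalone check of the arithmetic in the comment block above (assuming the 7-byte packed struct and 64-byte cache lines from the listing; this little program is illustrative and not part of the commit):

    #include <assert.h>
    #include <stddef.h>

    int main(void) {
        const size_t linesize = 64, width = 7;
        for (size_t i = 0; i < 9; i++) {
            // structs 0..8 end at byte 7*i + 6 <= 62, so each one stays inside cache line 0
            assert((i * width) / linesize == (i * width + width - 1) / linesize);
        }
        // struct 9 spans bytes 63..69: one byte of line 0 and six bytes of line 1,
        // which is exactly the access the paranoid boundary check is meant to catch
        assert((9 * width) / linesize != (9 * width + width - 1) / linesize);
        return 0;
    }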
@ -40,6 +40,7 @@
|
|||
#include <errno.h>
|
||||
#include <util/rwlock.h>
|
||||
#include <util/frwlock.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
#include "toku_fair_rwlock.h"
|
||||
#include <sys/types.h>
|
||||
|
||||
|
@ -138,8 +139,8 @@ void time_cas (void) {
|
|||
for (int t=0; t<T; t++) {
|
||||
gettimeofday(&start, NULL);
|
||||
for (int i=0; i<N; i++) {
|
||||
{ int r = __sync_val_compare_and_swap(&myval, 0, 1); assert(r==0); }
|
||||
{ int r = __sync_val_compare_and_swap(&myval, 1, 0); assert(r==1); }
|
||||
{ int r = toku_sync_val_compare_and_swap(&myval, 0, 1); assert(r==0); }
|
||||
{ int r = toku_sync_val_compare_and_swap(&myval, 1, 0); assert(r==1); }
|
||||
}
|
||||
gettimeofday(&end, NULL);
|
||||
double diff = 1e9*toku_tdiff(&end, &start)/N;
|
||||
|
@ -325,7 +326,7 @@ static int log_counter=0;
|
|||
|
||||
static void logit (int threadid, int loopid, char action) {
|
||||
//printf("%d %d %c\n", threadid, loopid, action);
|
||||
int my_log_counter = __sync_fetch_and_add(&log_counter, 1);
|
||||
int my_log_counter = toku_sync_fetch_and_add(&log_counter, 1);
|
||||
assert(my_log_counter<N_LOG_ENTRIES);
|
||||
actionlog[my_log_counter].threadid = threadid;
|
||||
actionlog[my_log_counter].loopid = loopid;
|
||||
|
|
|
@ -11,12 +11,12 @@
|
|||
#include "toku_portability.h"
|
||||
|
||||
int main(void) {
|
||||
int fd = toku_os_lock_file(__FILE__);
|
||||
int fd = toku_os_lock_file(__SRCFILE__);
|
||||
assert(fd != -1);
|
||||
pid_t pid = fork();
|
||||
assert(pid != -1);
|
||||
if (pid == 0) {
|
||||
int fd2 = toku_os_lock_file(__FILE__);
|
||||
int fd2 = toku_os_lock_file(__SRCFILE__);
|
||||
assert(fd2 == -1);
|
||||
return 0;
|
||||
} else {
|
||||
|
|
portability/toku_atomic.h (new file, 77 lines)

@@ -0,0 +1,77 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "Copyright (c) 2012 Tokutek Inc. All rights reserved."
#ident "$Id$"

#ifndef TOKU_ATOMIC_H
#define TOKU_ATOMIC_H

#include <config.h>
#include <toku_assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

__attribute__((const, always_inline))
static inline intptr_t which_cache_line(intptr_t addr) {
    static const size_t assumed_cache_line_size = 64;
    return addr / assumed_cache_line_size;
}
template <typename T> __attribute__((const, always_inline))
static inline bool crosses_boundary(T *addr, size_t width) {
    const intptr_t int_addr = reinterpret_cast<intptr_t>(addr);
    const intptr_t last_byte = int_addr + width - 1;
    return which_cache_line(int_addr) != which_cache_line(last_byte);
}

template <typename T, typename U> __attribute__((always_inline))
static inline T toku_sync_fetch_and_add(T *addr, U diff) {
    paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
    return __sync_fetch_and_add(addr, diff);
}
template <typename T, typename U> __attribute__((always_inline))
static inline T toku_sync_add_and_fetch(T *addr, U diff) {
    paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
    return __sync_add_and_fetch(addr, diff);
}
template <typename T, typename U> __attribute__((always_inline))
static inline T toku_sync_fetch_and_sub(T *addr, U diff) {
    paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
    return __sync_fetch_and_sub(addr, diff);
}
template <typename T, typename U> __attribute__((always_inline))
static inline T toku_sync_sub_and_fetch(T *addr, U diff) {
    paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
    return __sync_sub_and_fetch(addr, diff);
}
template <typename T, typename U, typename V> __attribute__((always_inline))
static inline T toku_sync_val_compare_and_swap(T *addr, U oldval, V newval) {
    paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
    return __sync_val_compare_and_swap(addr, oldval, newval);
}
template <typename T, typename U, typename V> __attribute__((always_inline))
static inline bool toku_sync_bool_compare_and_swap(T *addr, U oldval, V newval) {
    paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
    return __sync_bool_compare_and_swap(addr, oldval, newval);
}

// in case you include this but not toku_portability.h
#pragma GCC poison __sync_fetch_and_add
#pragma GCC poison __sync_fetch_and_sub
#pragma GCC poison __sync_fetch_and_or
#pragma GCC poison __sync_fetch_and_and
#pragma GCC poison __sync_fetch_and_xor
#pragma GCC poison __sync_fetch_and_nand
#pragma GCC poison __sync_add_and_fetch
#pragma GCC poison __sync_sub_and_fetch
#pragma GCC poison __sync_or_and_fetch
#pragma GCC poison __sync_and_and_fetch
#pragma GCC poison __sync_xor_and_fetch
#pragma GCC poison __sync_nand_and_fetch
#pragma GCC poison __sync_bool_compare_and_swap
#pragma GCC poison __sync_val_compare_and_swap
#pragma GCC poison __sync_synchronize
#pragma GCC poison __sync_lock_test_and_set
#pragma GCC poison __sync_release

#endif // TOKU_ATOMIC_H
|
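Once this header is in a translation unit, the raw __sync_* builtins are poisoned and call sites have to go through the toku_sync_* wrappers, which in paranoid builds assert that the operand does not straddle a cache line. A minimal sketch of the new call pattern, assuming a naturally aligned 8-byte counter so the check always passes:

    #include <portability/toku_atomic.h>

    static uint64_t hits;   // 8-byte aligned, so it can never cross a 64-byte line

    static void record_hit(void) {
        // was: (void) __sync_fetch_and_add(&hits, 1);  -- now rejected by the poison pragma
        (void) toku_sync_fetch_and_add(&hits, 1);       // asserts !crosses_boundary(&hits, 8) first
    }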
@ -13,6 +13,7 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
struct toku_fair_rwlock_waiter_state {
|
||||
char is_read;
|
||||
|
@ -80,7 +81,7 @@ static __thread int tid=-1;
|
|||
static int next_tid=0;
|
||||
static int get_tid (void) {
|
||||
if (tid==-1) {
|
||||
tid = __sync_fetch_and_add(&next_tid, 1);
|
||||
tid = toku_sync_fetch_and_add(&next_tid, 1);
|
||||
}
|
||||
return tid;
|
||||
}
|
||||
|
@ -108,10 +109,10 @@ int toku_fair_rwlock_rdlock_slow (toku_fair_rwlock_t *rwlock) {
|
|||
if (s_get_qcount(s)==0 && !s_get_wlock(s)) goto C2;
|
||||
else goto C3;
|
||||
C2:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_incr_rcount(s))) goto MU;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_incr_rcount(s))) goto MU;
|
||||
else goto R2;
|
||||
C3:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_incr_qcount(s))) goto E;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_incr_qcount(s))) goto E;
|
||||
else goto R2;
|
||||
E:
|
||||
// Put me into the queue.
|
||||
|
@ -145,7 +146,7 @@ int toku_fair_rwlock_rdlock_slow (toku_fair_rwlock_t *rwlock) {
|
|||
s = rwlock->state;
|
||||
goto C4;
|
||||
C4:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_incr_rcount(s_decr_qcount(s)))) goto MU;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_incr_rcount(s_decr_qcount(s)))) goto MU;
|
||||
else goto R4;
|
||||
MU:
|
||||
toku_mutex_unlock(&rwlock->mutex);
|
||||
|
@ -168,11 +169,11 @@ int toku_fair_rwlock_wrlock_slow (toku_fair_rwlock_t *rwlock) {
|
|||
if (s_get_qcount(s)==0 && !s_get_wlock(s) && s_get_rcount(s)==0) goto C2;
|
||||
else goto C3;
|
||||
C2:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_set_wlock(s))) goto MU;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_set_wlock(s))) goto MU;
|
||||
else goto R2;
|
||||
C3:
|
||||
L(C3);
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_incr_qcount(s))) goto E;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_incr_qcount(s))) goto E;
|
||||
else goto R2;
|
||||
E:
|
||||
LP(E, rwlock->state);
|
||||
|
@ -202,7 +203,7 @@ int toku_fair_rwlock_wrlock_slow (toku_fair_rwlock_t *rwlock) {
|
|||
assert(!s_get_wlock(s));
|
||||
goto C4;
|
||||
C4:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_set_wlock(s_decr_qcount(s)))) goto MU;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_set_wlock(s_decr_qcount(s)))) goto MU;
|
||||
else goto R4;
|
||||
MU:
|
||||
toku_mutex_unlock(&rwlock->mutex);
|
||||
|
@ -223,11 +224,11 @@ int toku_fair_rwlock_unlock_r_slow (toku_fair_rwlock_t *rwlock) {
|
|||
if (s_get_rcount(s)>1 || s_get_qcount(s)==0) goto C2;
|
||||
else goto C3;
|
||||
C2:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_decr_rcount(s))) goto MU;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_decr_rcount(s))) goto MU;
|
||||
else goto R2;
|
||||
C3:
|
||||
// rcount==1 and qcount>0
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_decr_rcount(s))) goto WN;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_decr_rcount(s))) goto WN;
|
||||
else goto R2;
|
||||
WN:
|
||||
LP(WN, rwlock->state);
|
||||
|
@ -253,10 +254,10 @@ int toku_fair_rwlock_unlock_w_slow (toku_fair_rwlock_t *rwlock) {
|
|||
if (s_get_qcount(s)==0) goto C2;
|
||||
else goto C3;
|
||||
C2:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_clear_wlock(s))) goto MU;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_clear_wlock(s))) goto MU;
|
||||
else goto R2;
|
||||
C3:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_clear_wlock(s))) goto WN;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_clear_wlock(s))) goto WN;
|
||||
else goto R2;
|
||||
WN:
|
||||
LP(WN, rwlock->state);
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
|
||||
|
||||
#include "toku_pthread.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
// Fair readers/writer locks. These are fair (meaning first-come first-served. No reader starvation, and no writer starvation). And they are
|
||||
// probably faster than the linux readers/writer locks (pthread_rwlock_t).
|
||||
|
@ -84,7 +85,7 @@ static inline int toku_fair_rwlock_rdlock (toku_fair_rwlock_t *rwlock) {
|
|||
//if (s_get_qcount(s)==0 && !s_get_wlock(s)) goto C1;
|
||||
else goto ML;
|
||||
C1:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_incr_rcount(s))) goto DONE;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_incr_rcount(s))) goto DONE;
|
||||
else goto START;
|
||||
DONE:
|
||||
return 0;
|
||||
|
@ -102,7 +103,7 @@ static inline int toku_fair_rwlock_wrlock (toku_fair_rwlock_t *rwlock) {
|
|||
if (s_get_qcount(s)==0 && !s_get_wlock(s) && s_get_rcount(s)==0) goto C1;
|
||||
else goto ML;
|
||||
C1:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_set_wlock(s))) goto DONE;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_set_wlock(s))) goto DONE;
|
||||
else goto START;
|
||||
DONE:
|
||||
return 0;
|
||||
|
@ -125,7 +126,7 @@ static inline int toku_fair_rwlock_unlock (toku_fair_rwlock_t *rwlock) {
|
|||
if (s_get_qcount(s)==0) goto wC1;
|
||||
else goto wML;
|
||||
wC1:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_clear_wlock(s))) goto wDONE;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_clear_wlock(s))) goto wDONE;
|
||||
else goto wSTART;
|
||||
wDONE:
|
||||
return 0;
|
||||
|
@ -140,7 +141,7 @@ static inline int toku_fair_rwlock_unlock (toku_fair_rwlock_t *rwlock) {
|
|||
if (s_get_rcount(s)>1 || s_get_qcount(s)==0) goto rC1;
|
||||
else goto rML;
|
||||
rC1:
|
||||
if (__sync_bool_compare_and_swap(&rwlock->state, s, s_decr_rcount(s))) goto rDONE;
|
||||
if (toku_sync_bool_compare_and_swap(&rwlock->state, s, s_decr_rcount(s))) goto rDONE;
|
||||
else goto rSTART;
|
||||
rDONE:
|
||||
return 0;
|
||||
|
|
|
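The rwlock fast paths above all share one shape: read the packed state word, decide which transition applies, and publish it with a single compare-and-swap, retrying from the top if another thread got there first. Reduced to its skeleton (the helper below is illustrative only, not the actual lock code):

    #include <portability/toku_atomic.h>
    #include <stdint.h>

    // illustrative: atomically set a flag bit in a shared state word, retrying on contention
    static inline void spin_set_bit(uint64_t *state, uint64_t bit) {
        while (true) {
            uint64_t s = *state;                                       // snapshot the current state
            if (toku_sync_bool_compare_and_swap(state, s, s | bit)) {  // publish only if unchanged
                return;
            }
            // another thread changed the word between the read and the CAS; go around again
        }
    }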
@ -507,7 +507,9 @@ indexer_ft_delete_committed(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, XIDS xi
|
|||
} else {
|
||||
result = toku_ydb_check_avail_fs_space(indexer->i->env);
|
||||
if (result == 0) {
|
||||
toku_multi_operation_client_lock();
|
||||
toku_ft_send_delete(db_struct_i(hotdb)->ft_handle, hotkey, xids);
|
||||
toku_multi_operation_client_unlock();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
@ -545,7 +547,9 @@ indexer_ft_insert_committed(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, DBT *ho
|
|||
} else {
|
||||
result = toku_ydb_check_avail_fs_space(indexer->i->env);
|
||||
if (result == 0) {
|
||||
toku_multi_operation_client_lock();
|
||||
toku_ft_send_insert(db_struct_i(hotdb)->ft_handle, hotkey, hotval, xids, FT_INSERT);
|
||||
toku_multi_operation_client_unlock();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include <ft/xids.h>
|
||||
#include <ft/log-internal.h>
|
||||
#include <ft/checkpoint.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
// Engine status
|
||||
|
@ -236,13 +237,13 @@ create_exit:
|
|||
|
||||
*indexerp = indexer;
|
||||
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_CREATE), 1);
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_CURRENT), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_CREATE), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_CURRENT), 1);
|
||||
if ( STATUS_VALUE(INDEXER_CURRENT) > STATUS_VALUE(INDEXER_MAX) )
|
||||
STATUS_VALUE(INDEXER_MAX) = STATUS_VALUE(INDEXER_CURRENT); // NOT WORTH A LOCK TO MAKE THREADSAFE), may be inaccurate
|
||||
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_CREATE_FAIL), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_CREATE_FAIL), 1);
|
||||
free_indexer(indexer);
|
||||
}
|
||||
|
||||
|
@ -472,9 +473,9 @@ build_index(DB_INDEXER *indexer) {
|
|||
// - unique checks?
|
||||
|
||||
if ( result == 0 ) {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_BUILD), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_BUILD), 1);
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_BUILD_FAIL), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_BUILD_FAIL), 1);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -484,7 +485,7 @@ build_index(DB_INDEXER *indexer) {
|
|||
static int
|
||||
close_indexer(DB_INDEXER *indexer) {
|
||||
int r = 0;
|
||||
(void) __sync_fetch_and_sub(&STATUS_VALUE(INDEXER_CURRENT), 1);
|
||||
(void) toku_sync_fetch_and_sub(&STATUS_VALUE(INDEXER_CURRENT), 1);
|
||||
|
||||
// Mark txn as needing a checkpoint.
|
||||
// (This will cause a checkpoint, which is necessary
|
||||
|
@ -499,9 +500,9 @@ close_indexer(DB_INDEXER *indexer) {
|
|||
free_indexer(indexer);
|
||||
|
||||
if ( r == 0 ) {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_CLOSE), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_CLOSE), 1);
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_CLOSE_FAIL), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_CLOSE_FAIL), 1);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
@ -509,8 +510,8 @@ close_indexer(DB_INDEXER *indexer) {
|
|||
// Clients must not operate on any of the hot dbs concurrently with abort
|
||||
static int
|
||||
abort_indexer(DB_INDEXER *indexer) {
|
||||
(void) __sync_fetch_and_sub(&STATUS_VALUE(INDEXER_CURRENT), 1);
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(INDEXER_ABORT), 1);
|
||||
(void) toku_sync_fetch_and_sub(&STATUS_VALUE(INDEXER_CURRENT), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(INDEXER_ABORT), 1);
|
||||
// Disassociate the indexer from the hot db and free_indexer
|
||||
disassociate_indexer_from_hot_dbs(indexer);
|
||||
free_indexer(indexer);
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include <ft/ft-internal.h>
|
||||
#include <ft/ft.h>
|
||||
#include "ydb_db.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
|
||||
#define lazy_assert(a) assert(a) // indicates code is incomplete
|
||||
|
@ -302,13 +303,13 @@ static int create_loader(DB_ENV *env,
|
|||
*blp = loader;
|
||||
create_exit:
|
||||
if (rval == 0) {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(LOADER_CREATE), 1);
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(LOADER_CURRENT), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CREATE), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CURRENT), 1);
|
||||
if (STATUS_VALUE(LOADER_CURRENT) > STATUS_VALUE(LOADER_MAX) )
|
||||
STATUS_VALUE(LOADER_MAX) = STATUS_VALUE(LOADER_CURRENT); // not worth a lock to make threadsafe, may be inaccurate
|
||||
}
|
||||
else {
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(LOADER_CREATE_FAIL), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CREATE_FAIL), 1);
|
||||
free_loader(loader);
|
||||
}
|
||||
return rval;
|
||||
|
@ -442,7 +443,7 @@ static void redirect_loader_to_empty_dictionaries(DB_LOADER *loader) {
|
|||
|
||||
int toku_loader_close(DB_LOADER *loader)
|
||||
{
|
||||
(void) __sync_fetch_and_sub(&STATUS_VALUE(LOADER_CURRENT), 1);
|
||||
(void) toku_sync_fetch_and_sub(&STATUS_VALUE(LOADER_CURRENT), 1);
|
||||
int r=0;
|
||||
if ( loader->i->err_errno != 0 ) {
|
||||
if ( loader->i->error_callback != NULL ) {
|
||||
|
@ -466,16 +467,16 @@ int toku_loader_close(DB_LOADER *loader)
|
|||
}
|
||||
free_loader(loader);
|
||||
if (r==0)
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(LOADER_CLOSE), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CLOSE), 1);
|
||||
else
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(LOADER_CLOSE_FAIL), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CLOSE_FAIL), 1);
|
||||
return r;
|
||||
}
|
||||
|
||||
int toku_loader_abort(DB_LOADER *loader)
|
||||
{
|
||||
(void) __sync_fetch_and_sub(&STATUS_VALUE(LOADER_CURRENT), 1);
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(LOADER_ABORT), 1);
|
||||
(void) toku_sync_fetch_and_sub(&STATUS_VALUE(LOADER_CURRENT), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_ABORT), 1);
|
||||
int r=0;
|
||||
if ( loader->i->err_errno != 0 ) {
|
||||
if ( loader->i->error_callback != NULL ) {
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <toku_stdint.h>
|
||||
#include <db.h>
|
||||
#include <toku_race_tools.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
/* TODO: Yoni should check that all asserts make sense instead of panic,
|
||||
and all early returns make sense instead of panic,
|
||||
|
@ -342,15 +343,15 @@ toku_ltm_set_max_lock_memory(toku_ltm* mgr, uint64_t lock_memory_limit) {
|
|||
static inline void
|
||||
ltm_incr_locks(toku_ltm* tree_mgr, uint32_t replace_locks) {
|
||||
assert(replace_locks <= tree_mgr->curr_locks);
|
||||
(void) __sync_fetch_and_sub(&tree_mgr->curr_locks, replace_locks);
|
||||
(void) __sync_fetch_and_add(&tree_mgr->curr_locks, 1);
|
||||
(void) toku_sync_fetch_and_sub(&tree_mgr->curr_locks, replace_locks);
|
||||
(void) toku_sync_fetch_and_add(&tree_mgr->curr_locks, 1);
|
||||
}
|
||||
|
||||
static inline void
|
||||
ltm_decr_locks(toku_ltm* tree_mgr, uint32_t locks) {
|
||||
assert(tree_mgr);
|
||||
assert(tree_mgr->curr_locks >= locks);
|
||||
(void) __sync_fetch_and_sub(&tree_mgr->curr_locks, locks);
|
||||
(void) toku_sync_fetch_and_sub(&tree_mgr->curr_locks, locks);
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -363,7 +364,7 @@ ltm_out_of_locks(toku_ltm *mgr) {
|
|||
|
||||
static void
|
||||
ltm_incr_lock_memory(toku_ltm *mgr, size_t s) {
|
||||
(void) __sync_add_and_fetch(&mgr->curr_lock_memory, s);
|
||||
(void) toku_sync_add_and_fetch(&mgr->curr_lock_memory, s);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -375,7 +376,7 @@ ltm_incr_lock_memory_callback(void *extra, size_t s) {
|
|||
static void
|
||||
ltm_decr_lock_memory(toku_ltm *mgr, size_t s) {
|
||||
assert(mgr->curr_lock_memory >= s);
|
||||
(void) __sync_sub_and_fetch(&mgr->curr_lock_memory, s);
|
||||
(void) toku_sync_sub_and_fetch(&mgr->curr_lock_memory, s);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include "test.h"
|
||||
#include "toku_pthread.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
struct test_seq {
|
||||
int state;
|
||||
|
@ -84,7 +85,7 @@ static void *run_locker_a(void *arg) {
|
|||
if (m_locked) {
|
||||
r = db_env->lock_put(db_env, &lock_a_m); assert(r == 0);
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
(void) toku_sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
if (verbose) printf("%s:%u m deadlock\n", __FUNCTION__, __LINE__);
|
||||
}
|
||||
|
||||
|
@ -123,7 +124,7 @@ static void *run_locker_b(void *arg) {
|
|||
if (l_locked) {
|
||||
r = db_env->lock_put(db_env, &lock_b_l); assert(r == 0);
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
(void) toku_sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
if (verbose) printf("%s:%u l deadlock\n", __FUNCTION__, __LINE__);
|
||||
}
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include "test.h"
|
||||
#include "toku_pthread.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
struct test_seq {
|
||||
int state;
|
||||
|
@ -84,7 +85,7 @@ static void *run_locker_a(void *arg) {
|
|||
if (m_locked) {
|
||||
r = db_env->lock_put(db_env, &lock_a_m); assert(r == 0);
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
(void) toku_sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
if (verbose) printf("%s:%u m deadlock\n", __FUNCTION__, __LINE__);
|
||||
}
|
||||
|
||||
|
@ -123,7 +124,7 @@ static void *run_locker_b(void *arg) {
|
|||
if (l_locked) {
|
||||
r = db_env->lock_put(db_env, &lock_b_l); assert(r == 0);
|
||||
} else {
|
||||
(void) __sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
(void) toku_sync_fetch_and_add(locker_args->deadlock_count, 1);
|
||||
if (verbose) printf("%s:%u l deadlock\n", __FUNCTION__, __LINE__);
|
||||
}
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "test.h"
|
||||
#include "toku_pthread.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
DB_ENV *env;
|
||||
DB *db;
|
||||
|
@ -38,7 +39,7 @@ static void *start_txns (void *e) {
|
|||
{ int chk_r = env->txn_begin(env, NULL, &txn, 0); CKERR(chk_r); }
|
||||
{ int chk_r = db->put(db, txn, &k, &k, 0); CKERR(chk_r); }
|
||||
{ int chk_r = txn->commit(txn, 0); CKERR(chk_r); }
|
||||
if (j==10) (void)__sync_fetch_and_add(&reader_start_count, 1);
|
||||
if (j==10) (void)toku_sync_fetch_and_add(&reader_start_count, 1);
|
||||
if (j%1000==999) { printf("."); fflush(stdout); }
|
||||
assert(j<1000); // Get upset if we manage to run this many transactions without the checkpoint thread
|
||||
}
|
||||
|
@ -52,7 +53,7 @@ static void start_checkpoints (void) {
|
|||
{ int chk_r = env->txn_checkpoint(env, 0, 0, 0); CKERR(chk_r); }
|
||||
if (verbose) printf("ck\n");
|
||||
sched_yield();
|
||||
(void)__sync_fetch_and_add(&writer_done_count, 1);
|
||||
(void)toku_sync_fetch_and_add(&writer_done_count, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
|
||||
#include "test.h"
|
||||
#include "toku_pthread.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static void write_row(DB *db, DB_TXN *txn, int k, int v, int expect_r) {
|
||||
DBT key; dbt_init(&key, &k, sizeof k);
|
||||
|
@ -52,7 +53,7 @@ static void *write_one_f(void *arg) {
|
|||
} else {
|
||||
r = txn->abort(txn); assert(r == 0);
|
||||
}
|
||||
(void) __sync_fetch_and_sub(&n_txns, 1);
|
||||
(void) toku_sync_fetch_and_sub(&n_txns, 1);
|
||||
|
||||
return arg;
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
#define DOERR(r) do { if (r!=0) { did_fail=1; fprintf(error_file, "%s:%d error %d (%s)\n", __FILE__, __LINE__, r, db_strerror(r)); }} while (0)
|
||||
|
||||
|
@ -143,7 +144,7 @@ static int fail_at = FAIL_NEVER;
|
|||
static ssize_t
|
||||
pwrite_counting_and_failing (int fd, const void *buf, size_t size, toku_off_t off)
|
||||
{
|
||||
int this_count = __sync_add_and_fetch(&write_count, 1);
|
||||
int this_count = toku_sync_add_and_fetch(&write_count, 1);
|
||||
if (this_count>fail_at) {
|
||||
if (verbose>1) { printf("Failure imminent at %d:\n", fail_at); fflush(stdout); }
|
||||
errno = ENOSPC;
|
||||
|
@ -156,7 +157,7 @@ pwrite_counting_and_failing (int fd, const void *buf, size_t size, toku_off_t of
|
|||
static ssize_t
|
||||
write_counting_and_failing (int fd, const void *buf, size_t size)
|
||||
{
|
||||
int this_count = __sync_add_and_fetch(&write_count, 1);
|
||||
int this_count = toku_sync_add_and_fetch(&write_count, 1);
|
||||
if (this_count>fail_at) {
|
||||
if (verbose>1) { printf("Failure imminent at %d:\n", fail_at); fflush(stdout); }
|
||||
errno = ENOSPC;
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
|
||||
#include "test.h"
|
||||
|
||||
static size_t max(size_t a, size_t b) {
|
||||
static inline size_t max(size_t a, size_t b) {
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ stress_table(DB_ENV* env, DB** dbp, struct cli_args *cli_args) {
|
|||
arg_init(&myargs[i], dbp, env, cli_args);
|
||||
myargs[i].operation = put_op;
|
||||
if (cli_args->serial_insert) {
|
||||
spe[i].current = cli_args->num_elements;
|
||||
myargs[i].operation_extra = &spe[i];
|
||||
}
|
||||
}
|
||||
81 src/tests/seqinsert.cc (new file)

@ -0,0 +1,81 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

#include "test.h"
#include <db.h>
#include <toku_portability.h>
#include <toku_os.h>
#include <memory.h>
#include <stdint.h>
#include <stdlib.h>

DB_TXN * const null_txn = nullptr;

const size_t nodesize = 128 << 10;
const size_t keysize = 8;
const size_t valsize = 92;
const size_t rowsize = keysize + valsize;
const int max_degree = 16;
const size_t numleaves = max_degree * 3; // want height 2, this should be good enough
const size_t numrows = (numleaves * nodesize + rowsize) / rowsize;

static void test_seqinsert(bool asc) {
    int r;
    r = system("rm -rf " ENVDIR);
    CKERR(r);
    r = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);
    CKERR(r);

    DB_ENV *env;
    r = db_env_create(&env, 0);
    CKERR(r);
    r = env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO);
    CKERR(r);

    DB *db;
    r = db_create(&db, env, 0);
    CKERR(r);
    r = db->set_pagesize(db, nodesize);
    CKERR(r);
    r = db->open(db, null_txn, "seqinsert", NULL, DB_BTREE, DB_CREATE, 0666);
    CKERR(r);

    {
        DB_TXN *txn;
        r = env->txn_begin(env, 0, &txn, 0);
        CKERR(r);

        char v[valsize];
        ZERO_ARRAY(v);
        uint64_t k;
        DBT key, val;
        dbt_init(&key, &k, sizeof k);
        dbt_init(&val, v, valsize);
        for (size_t i = 0; i < numrows; ++i) {
            k = toku_htod64(numrows + (asc ? i : -i));
            r = db->put(db, txn, &key, &val, 0);
            CKERR(r);
        }

        r = txn->commit(txn, 0);
        CKERR(r);
    }

    r = db->close(db, 0);
    CKERR(r);

    r = env->close(env, 0);
    CKERR(r);
}

int test_main(int argc, char * const argv[]) {
    default_parse_args(argc, argv);

    test_seqinsert(true);
    test_seqinsert(false);

    return 0;
}

@ -6,6 +6,7 @@
|
|||
#include <toku_pthread.h>
|
||||
#include "test.h"
|
||||
#include "threaded_stress_test_helpers.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
// set this to true for the recovery version of this stress test
|
||||
// the way this works is to include this header and set
|
||||
|
@ -95,7 +96,7 @@ lock_and_maybe_open_some_db(ARG arg) {
|
|||
int i = myrandom_r(arg->random_data) % num_buckets;
|
||||
open_ith_db(bucket->env, &bucket->db, i);
|
||||
bucket->is_open = true;
|
||||
assert(__sync_fetch_and_add(&open_buckets, 1) < num_buckets);
|
||||
assert(toku_sync_fetch_and_add(&open_buckets, 1) < num_buckets);
|
||||
verbose_printf("opened db %d in bucket %d\n", i, k);
|
||||
}
|
||||
return bucket;
|
||||
|
@ -113,7 +114,7 @@ unlock_and_maybe_close_db(struct db_bucket *bucket, ARG arg) {
|
|||
int r = db->close(db, 0);
|
||||
CKERR(r);
|
||||
bucket->is_open = false;
|
||||
int old_open_buckets = __sync_fetch_and_sub(&open_buckets, 1);
|
||||
int old_open_buckets = toku_sync_fetch_and_sub(&open_buckets, 1);
|
||||
assert(old_open_buckets > 0);
|
||||
verbose_printf("decided to close a bucket's db before unlocking\n");
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include <string.h>
|
||||
#include <toku_time.h>
|
||||
#include <toku_pthread.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static const int envflags = DB_INIT_MPOOL|DB_CREATE|DB_THREAD |DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_TXN|DB_PRIVATE;
|
||||
|
||||
|
@ -187,7 +188,7 @@ void do_threads (unsigned long long N, int do_nonlocal) {
|
|||
static volatile unsigned long long n_preads;
|
||||
|
||||
static ssize_t my_pread (int fd, void *buf, size_t count, off_t offset) {
|
||||
(void) __sync_fetch_and_add(&n_preads, 1);
|
||||
(void) toku_sync_fetch_and_add(&n_preads, 1);
|
||||
usleep(1000); // sleep for a millisecond
|
||||
return pread(fd, buf, count, offset);
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
|
||||
|
||||
#include "test.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static DB_ENV *env;
|
||||
static DB *db;
|
||||
|
@ -20,7 +21,7 @@ const int n_preads_limit = 1000;
|
|||
long n_preads = 0;
|
||||
|
||||
static ssize_t my_pread (int fd, void *buf, size_t count, off_t offset) {
|
||||
long n_read_so_far = __sync_fetch_and_add(&n_preads, 1);
|
||||
long n_read_so_far = toku_sync_fetch_and_add(&n_preads, 1);
|
||||
if (n_read_so_far > n_preads_limit) {
|
||||
if (verbose) fprintf(stderr, "Apparent infinite loop detected\n");
|
||||
abort();
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
|
||||
|
||||
#include "test.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static DB_ENV *env;
|
||||
static DB *db;
|
||||
|
@ -21,7 +22,7 @@ const int n_preads_limit = 1000;
|
|||
long n_preads = 0;
|
||||
|
||||
static ssize_t my_pread (int fd, void *buf, size_t count, off_t offset) {
|
||||
long n_read_so_far = __sync_fetch_and_add(&n_preads, 1);
|
||||
long n_read_so_far = toku_sync_fetch_and_add(&n_preads, 1);
|
||||
if (n_read_so_far > n_preads_limit) {
|
||||
if (verbose) fprintf(stderr, "Apparent infinite loop detected\n");
|
||||
abort();
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
|
||||
|
||||
#include "test.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static DB_ENV *env;
|
||||
static DB *db;
|
||||
|
@ -29,7 +30,7 @@ long n_preads = 0;
|
|||
static void insert(int i, DB_TXN *txn);
|
||||
|
||||
static ssize_t my_pread (int fd, void *buf, size_t count, off_t offset) {
|
||||
long n_read_so_far = __sync_fetch_and_add(&n_preads, 1);
|
||||
long n_read_so_far = toku_sync_fetch_and_add(&n_preads, 1);
|
||||
if (do_XX_on_pread==n_read_so_far && XX != NULL) {
|
||||
// we're supposed to do the XX operation now. Insert a row.
|
||||
printf("Did XX\n");
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "test.h"
|
||||
#include "toku_pthread.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static int my_compare (DB *db, const DBT *a, const DBT *b) {
|
||||
assert(db);
|
||||
|
@ -45,7 +46,7 @@ static void *startA (void *ignore __attribute__((__unused__))) {
|
|||
}
|
||||
{ int chk_r = txn->commit(txn, 0); CKERR(chk_r); }
|
||||
}
|
||||
int r __attribute__((__unused__)) = __sync_fetch_and_add(&done, 1);
|
||||
int r __attribute__((__unused__)) = toku_sync_fetch_and_add(&done, 1);
|
||||
return NULL;
|
||||
}
|
||||
static void change_descriptor (DB_TXN *txn, int i) {
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
|
||||
#include <memory.h>
|
||||
#include <toku_race_tools.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
#include <portability/toku_pthread.h>
|
||||
#include <portability/toku_random.h>
|
||||
#include <util/rwlock.h>
|
||||
|
@ -62,6 +63,7 @@ enum stress_lock_type {
|
|||
struct env_args {
|
||||
int node_size;
|
||||
int basement_node_size;
|
||||
int rollback_node_size;
|
||||
int checkpointing_period;
|
||||
int cleaner_period;
|
||||
int cleaner_iterations;
|
||||
|
@ -118,6 +120,7 @@ struct cli_args {
|
|||
bool unique_checks; // use uniqueness checking during insert. makes it slow.
|
||||
bool nosync; // do not fsync on txn commit. useful for testing in memory performance.
|
||||
bool nolog; // do not log. useful for testing in memory performance.
|
||||
bool disperse_keys; // spread the keys out during a load (by reversing the bits in the loop index) to make a wide tree we can spread out random inserts into
|
||||
};
|
||||
|
||||
struct arg {
|
||||
|
@ -1366,7 +1369,7 @@ static void *test_time(void *arg) {
|
|||
if (verbose) {
|
||||
printf("should now end test\n");
|
||||
}
|
||||
__sync_bool_compare_and_swap(&run_test, true, false); // make this atomic to make valgrind --tool=drd happy.
|
||||
toku_sync_bool_compare_and_swap(&run_test, true, false); // make this atomic to make valgrind --tool=drd happy.
|
||||
if (verbose) {
|
||||
printf("run_test %d\n", run_test);
|
||||
}
|
||||
|
@ -1468,6 +1471,7 @@ static int create_tables(DB_ENV **env_res, DB **db_res, int num_DBs,
|
|||
r = env->set_default_bt_compare(env, bt_compare); CKERR(r);
|
||||
r = env->set_lk_max_memory(env, env_args.lk_max_memory); CKERR(r);
|
||||
r = env->set_cachesize(env, env_args.cachetable_size / (1 << 30), env_args.cachetable_size % (1 << 30), 1); CKERR(r);
|
||||
r = env->set_lg_bsize(env, env_args.rollback_node_size); CKERR(r);
|
||||
if (env_args.generate_put_callback) {
|
||||
r = env->set_generate_row_callback_for_put(env, env_args.generate_put_callback);
|
||||
CKERR(r);
|
||||
|
@ -1540,16 +1544,38 @@ static int fill_table_from_fun(DB *db, int num_elements, int key_bufsz, int val_
    return r;
}

static void zero_element_callback(int idx, void *UU(extra), void *keyv, int *keysz, void *valv, int *valsz) {
static uint32_t breverse(uint32_t v)
// Effect: return the bits in i, reversed
// Notes: implementation taken from http://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious
// Rationale: just a hack to spread out the keys during loading, doesn't need to be fast but does need to be correct.
{
    uint32_t r = v; // r will be reversed bits of v; first get LSB of v
    int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end

    for (v >>= 1; v; v >>= 1) {
        r <<= 1;
        r |= v & 1;
        s--;
    }
    r <<= s; // shift when v's highest bits are zero
    return r;
}

static void zero_element_callback(int idx, void *extra, void *keyv, int *keysz, void *valv, int *valsz) {
    const bool *disperse_keys = static_cast<bool *>(extra);
    int *CAST_FROM_VOIDP(key, keyv);
    int *CAST_FROM_VOIDP(val, valv);
    *key = idx;
    if (*disperse_keys) {
        *key = static_cast<int>(breverse(idx));
    } else {
        *key = idx;
    }
    *val = 0;
    *keysz = sizeof(int);
    *valsz = sizeof(int);
}

static int fill_tables_with_zeroes(DB **dbs, int num_DBs, int num_elements, uint32_t key_size, uint32_t val_size) {
static int fill_tables_with_zeroes(DB **dbs, int num_DBs, int num_elements, uint32_t key_size, uint32_t val_size, bool disperse_keys) {
    for (int i = 0; i < num_DBs; i++) {
        assert(key_size >= sizeof(int));
        assert(val_size >= sizeof(int));

@ -1559,7 +1585,7 @@ static int fill_tables_with_zeroes(DB **dbs, int num_DBs, int num_elements, uint
            key_size,
            val_size,
            zero_element_callback,
            NULL
            &disperse_keys
        );
        CKERR(r);
    }
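As a quick sanity check of the bit-reversal trick above (not part of the diff, only an illustrative sketch with a renamed helper), reversing a 32-bit value twice gives back the original, and small load indexes map to widely spaced keys, which is exactly why it disperses sequentially generated keys across the tree:

    #include <assert.h>
    #include <stdint.h>
    #include <limits.h>

    // Same algorithm as the breverse() added in this diff, reproduced here
    // only to illustrate its behavior.
    static uint32_t breverse_sketch(uint32_t v) {
        uint32_t r = v;
        int s = sizeof(v) * CHAR_BIT - 1;
        for (v >>= 1; v; v >>= 1) {
            r <<= 1;
            r |= v & 1;
            s--;
        }
        return r << s;
    }

    int main(void) {
        assert(breverse_sketch(0) == 0);
        assert(breverse_sketch(1) == 0x80000000u);                    // index 1 lands in the upper half of the key space
        assert(breverse_sketch(breverse_sketch(12345u)) == 12345u);   // reversal is its own inverse
        return 0;
    }
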
@ -1604,6 +1630,7 @@ static int open_tables(DB_ENV **env_res, DB **db_res, int num_DBs,
|
|||
env->set_update(env, env_args.update_function);
|
||||
// set the cache size to 10MB
|
||||
r = env->set_cachesize(env, env_args.cachetable_size / (1 << 30), env_args.cachetable_size % (1 << 30), 1); CKERR(r);
|
||||
r = env->set_lg_bsize(env, env_args.rollback_node_size); CKERR(r);
|
||||
if (env_args.generate_put_callback) {
|
||||
r = env->set_generate_row_callback_for_put(env, env_args.generate_put_callback);
|
||||
CKERR(r);
|
||||
|
@ -1652,6 +1679,7 @@ static int close_tables(DB_ENV *env, DB** dbs, int num_DBs) {
|
|||
static const struct env_args DEFAULT_ENV_ARGS = {
|
||||
.node_size = 4096,
|
||||
.basement_node_size = 1024,
|
||||
.rollback_node_size = 4096,
|
||||
.checkpointing_period = 10,
|
||||
.cleaner_period = 1,
|
||||
.cleaner_iterations = 1,
|
||||
|
@ -1667,6 +1695,7 @@ static const struct env_args DEFAULT_ENV_ARGS = {
|
|||
static const struct env_args DEFAULT_PERF_ENV_ARGS = {
|
||||
.node_size = 4*1024*1024,
|
||||
.basement_node_size = 128*1024,
|
||||
.rollback_node_size = 4*1024*1024,
|
||||
.checkpointing_period = 60,
|
||||
.cleaner_period = 1,
|
||||
.cleaner_iterations = 5,
|
||||
|
@ -1716,6 +1745,7 @@ static struct cli_args UU() get_default_args(void) {
|
|||
.unique_checks = false,
|
||||
.nosync = false,
|
||||
.nolog = false,
|
||||
.disperse_keys = false,
|
||||
};
|
||||
return DEFAULT_ARGS;
|
||||
}
|
||||
|
@ -2057,6 +2087,7 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct
|
|||
INT32_ARG_NONNEG("--num_seconds", time_of_test, "s"),
|
||||
INT32_ARG_NONNEG("--node_size", env_args.node_size, " bytes"),
|
||||
INT32_ARG_NONNEG("--basement_node_size", env_args.basement_node_size, " bytes"),
|
||||
INT32_ARG_NONNEG("--rollback_node_size", env_args.rollback_node_size, " bytes"),
|
||||
INT32_ARG_NONNEG("--checkpointing_period", env_args.checkpointing_period, "s"),
|
||||
INT32_ARG_NONNEG("--cleaner_period", env_args.cleaner_period, "s"),
|
||||
INT32_ARG_NONNEG("--cleaner_iterations", env_args.cleaner_iterations, ""),
|
||||
|
@ -2093,6 +2124,7 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct
|
|||
BOOL_ARG("unique_checks", unique_checks),
|
||||
BOOL_ARG("nosync", nosync),
|
||||
BOOL_ARG("nolog", nolog),
|
||||
BOOL_ARG("disperse_keys", disperse_keys),
|
||||
|
||||
STRING_ARG("--envdir", env_args.envdir),
|
||||
LOCAL_STRING_ARG("--perf_format", perf_format_s, "human"),
|
||||
|
@ -2246,7 +2278,7 @@ UU() stress_test_main_with_cmp(struct cli_args *args, int (*bt_compare)(DB *, co
|
|||
bt_compare,
|
||||
args
|
||||
);
|
||||
{ int chk_r = fill_tables_with_zeroes(dbs, args->num_DBs, args->num_elements, args->key_size, args->val_size); CKERR(chk_r); }
|
||||
{ int chk_r = fill_tables_with_zeroes(dbs, args->num_DBs, args->num_elements, args->key_size, args->val_size, args->disperse_keys); CKERR(chk_r); }
|
||||
{ int chk_r = close_tables(env, dbs, args->num_DBs); CKERR(chk_r); }
|
||||
}
|
||||
if (!args->only_create) {
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "ydb_db.h"
|
||||
#include "ydb_write.h"
|
||||
#include <lock_tree/locktree.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static YDB_DB_LAYER_STATUS_S ydb_db_layer_status;
|
||||
#ifdef STATUS_VALUE
|
||||
|
@ -275,7 +276,7 @@ toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYP
|
|||
if (txn) {
|
||||
id = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn);
|
||||
} else {
|
||||
id = __sync_fetch_and_add(&nontransactional_open_id, 1);
|
||||
id = toku_sync_fetch_and_add(&nontransactional_open_id, 1);
|
||||
}
|
||||
create_iname_hint(dname, hint);
|
||||
iname = create_iname(db->dbenv, id, hint, NULL, -1); // allocated memory for iname
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "ydb_txn.h"
|
||||
#include <lock_tree/lth.h>
|
||||
#include <toku_race_tools.h>
|
||||
#include <portability/toku_atomic.h>
|
||||
#include "ft/txn_manager.h"
|
||||
|
||||
static int
|
||||
|
@ -42,7 +43,7 @@ toku_txn_release_locks(DB_TXN* txn) {
|
|||
|
||||
static void
|
||||
toku_txn_destroy(DB_TXN *txn) {
|
||||
int32_t open_txns = __sync_sub_and_fetch(&txn->mgrp->i->open_txns, 1);
|
||||
int32_t open_txns = toku_sync_sub_and_fetch(&txn->mgrp->i->open_txns, 1);
|
||||
invariant(open_txns >= 0);
|
||||
toku_txn_destroy_txn(db_txn_struct_i(txn)->tokutxn);
|
||||
toku_mutex_destroy(&db_txn_struct_i(txn)->txn_mutex);
|
||||
|
@ -468,7 +469,7 @@ toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, uint32_t flags) {
|
|||
}
|
||||
|
||||
toku_mutex_init(&db_txn_struct_i(result)->txn_mutex, NULL);
|
||||
(void) __sync_fetch_and_add(&env->i->open_txns, 1);
|
||||
(void) toku_sync_fetch_and_add(&env->i->open_txns, 1);
|
||||
|
||||
*txn = result;
|
||||
return 0;
|
||||
|
@ -489,7 +490,7 @@ void toku_keep_prepared_txn_callback (DB_ENV *env, TOKUTXN tokutxn) {
|
|||
toku_txn_set_container_db_txn(tokutxn, result);
|
||||
|
||||
toku_mutex_init(&db_txn_struct_i(result)->txn_mutex, NULL);
|
||||
(void) __sync_fetch_and_add(&env->i->open_txns, 1);
|
||||
(void) toku_sync_fetch_and_add(&env->i->open_txns, 1);
|
||||
}
|
||||
|
||||
// Test-only function
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include "ydb_row_lock.h"
|
||||
#include "ydb_write.h"
|
||||
#include "ydb_db.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
static YDB_WRITE_LAYER_STATUS_S ydb_write_layer_status;
|
||||
#ifdef STATUS_VALUE
|
||||
|
@ -204,10 +205,10 @@ toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_
|
|||
if (r == 0) {
|
||||
// helgrind flags a race on this status update. we increment it atomically to satisfy helgrind.
|
||||
// STATUS_VALUE(YDB_LAYER_NUM_INSERTS)++; // accountability
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(YDB_LAYER_NUM_INSERTS), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(YDB_LAYER_NUM_INSERTS), 1);
|
||||
} else {
|
||||
// STATUS_VALUE(YDB_LAYER_NUM_INSERTS_FAIL)++; // accountability
|
||||
(void) __sync_fetch_and_add(&STATUS_VALUE(YDB_LAYER_NUM_INSERTS_FAIL), 1);
|
||||
(void) toku_sync_fetch_and_add(&STATUS_VALUE(YDB_LAYER_NUM_INSERTS_FAIL), 1);
|
||||
}
|
||||
|
||||
return r;
|
||||
|
|
|
@ -9,6 +9,9 @@

#define TOKUDB_REVISION @CMAKE_TOKUDB_REVISION@

#cmakedefine TOKU_DEBUG_PARANOID 1
#cmakedefine USE_VALGRIND 1

#cmakedefine HAVE_ALLOCA_H 1
#cmakedefine HAVE_ARPA_INET_H 1
#cmakedefine HAVE_BYTESWAP_H 1

@ -80,6 +80,18 @@ extern void (*do_assert_hook)(void); // Set this to a function you want called a
#define resource_assert(a) assert(a) // indicates resource must be available, otherwise unrecoverable
#define resource_assert_zero(a) assert_zero(a) // indicates resource must be available, otherwise unrecoverable

#ifdef TOKU_DEBUG_PARANOID
#define paranoid_invariant(a) assert(a)
#define paranoid_invariant_null(a) assert_null(a)
#define paranoid_invariant_notnull(a) assert(a)
#define paranoid_invariant_zero(a) assert_zero(a)
#else
#define paranoid_invariant(a) ((void) 0)
#define paranoid_invariant_null(a) ((void) 0)
#define paranoid_invariant_notnull(a) ((void) 0)
#define paranoid_invariant_zero(a) ((void) 0)
#endif

static inline int
get_error_errno(void)
{
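A hedged illustration (not taken from the diff) of how the new TOKU_DEBUG_PARANOID switch is meant to behave: with the option on, the paranoid_invariant family compiles to real asserts; with it off, the checks compile away entirely, which is why so many hot-path asserts in this change are downgraded from assert/invariant to paranoid_invariant. The macro is hand-defined here only for the sketch; in the tree it would come from the generated config.h:

    #include <assert.h>
    #include <stddef.h>

    // Sketch: same shape as the macros added above.
    #ifdef TOKU_DEBUG_PARANOID
    #define paranoid_invariant(a) assert(a)
    #else
    #define paranoid_invariant(a) ((void) 0)
    #endif

    static int nth(const int *table, size_t n, size_t i) {
        paranoid_invariant(table != NULL); // checked only in paranoid builds
        paranoid_invariant(i < n);         // compiles to nothing otherwise
        return table[i];
    }

    int main(void) {
        int t[3] = {7, 8, 9};
        return nth(t, 3, 2) == 9 ? 0 : 1;
    }
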
@ -32,6 +32,9 @@

#endif

// include here, before they get deprecated
#include <toku_atomic.h>

#if TOKU_WINDOWS
// Windows

@ -210,6 +213,23 @@ extern void *realloc(void*, size_t) __THROW __attribute__((__deprecat
#pragma GCC poison BOOL
#pragma GCC poison FALSE
#pragma GCC poison TRUE
#pragma GCC poison __sync_fetch_and_add
#pragma GCC poison __sync_fetch_and_sub
#pragma GCC poison __sync_fetch_and_or
#pragma GCC poison __sync_fetch_and_and
#pragma GCC poison __sync_fetch_and_xor
#pragma GCC poison __sync_fetch_and_nand
#pragma GCC poison __sync_add_and_fetch
#pragma GCC poison __sync_sub_and_fetch
#pragma GCC poison __sync_or_and_fetch
#pragma GCC poison __sync_and_and_fetch
#pragma GCC poison __sync_xor_and_fetch
#pragma GCC poison __sync_nand_and_fetch
#pragma GCC poison __sync_bool_compare_and_swap
#pragma GCC poison __sync_val_compare_and_swap
#pragma GCC poison __sync_synchronize
#pragma GCC poison __sync_lock_test_and_set
#pragma GCC poison __sync_release
# endif
#endif

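The poison pragmas above forbid any further direct use of the GCC __sync builtins; callers are expected to go through the toku_sync_* names from portability/toku_atomic.h instead, which is the substitution made throughout this diff. A minimal sketch of what such wrappers presumably look like — the real header may add type checks or valgrind annotations, and the _sketch suffixes are only to mark these as illustrative:

    #include <stdint.h>

    // Hypothetical portability/toku_atomic.h-style wrappers; defined before the
    // builtins are poisoned so the rest of the code only sees the toku_ names.
    template <typename T, typename U>
    static inline T toku_sync_fetch_and_add_sketch(T *addr, U delta) {
        return __sync_fetch_and_add(addr, delta);
    }

    template <typename T, typename U>
    static inline T toku_sync_add_and_fetch_sketch(T *addr, U delta) {
        return __sync_add_and_fetch(addr, delta);
    }

    template <typename T, typename U, typename V>
    static inline bool toku_sync_bool_compare_and_swap_sketch(T *addr, U oldval, V newval) {
        return __sync_bool_compare_and_swap(addr, oldval, newval);
    }

Wrapping the builtins in one header keeps every atomic operation behind a single choke point, which is what makes the blanket rename in this commit mechanical.
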
@ -8,7 +8,7 @@

#include "config.h"

#if defined(__linux__) && !defined(NVALGRIND)
#if defined(__linux__) && defined(USE_VALGRIND)

# include <valgrind/helgrind.h>
# include <valgrind/drd.h>

@ -21,6 +21,7 @@

#else

# define NVALGRIND 1
# define TOKU_ANNOTATE_NEW_MEMORY(p, size) ((void) 0)
# define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void) 0)
# define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void) 0)

@ -8,7 +8,7 @@ namespace toku {
|
|||
|
||||
template<typename T>
|
||||
void circular_buffer<T>::init(T * const array, size_t cap) {
|
||||
invariant_notnull(array);
|
||||
paranoid_invariant_notnull(array);
|
||||
m_array = array;
|
||||
m_cap = cap;
|
||||
m_begin = 0;
|
||||
|
@ -23,9 +23,9 @@ namespace toku {
|
|||
template<typename T>
|
||||
void circular_buffer<T>::deinit(void) {
|
||||
lock();
|
||||
invariant(is_empty());
|
||||
invariant_zero(m_push_waiters);
|
||||
invariant_zero(m_pop_waiters);
|
||||
paranoid_invariant(is_empty());
|
||||
paranoid_invariant_zero(m_push_waiters);
|
||||
paranoid_invariant_zero(m_pop_waiters);
|
||||
unlock();
|
||||
toku_cond_destroy(&m_pop_cond);
|
||||
toku_cond_destroy(&m_push_cond);
|
||||
|
@ -67,15 +67,15 @@ namespace toku {
|
|||
template<typename T>
|
||||
T *circular_buffer<T>::get_addr(size_t idx) {
|
||||
toku_mutex_assert_locked(&m_lock);
|
||||
invariant(idx >= m_begin);
|
||||
invariant(idx < m_limit);
|
||||
paranoid_invariant(idx >= m_begin);
|
||||
paranoid_invariant(idx < m_limit);
|
||||
return &m_array[mod(idx, m_cap)];
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void circular_buffer<T>::push_and_maybe_signal_unlocked(const T &elt) {
|
||||
toku_mutex_assert_locked(&m_lock);
|
||||
invariant(!is_full());
|
||||
paranoid_invariant(!is_full());
|
||||
size_t location = m_limit++;
|
||||
*get_addr(location) = elt;
|
||||
if (m_pop_waiters > 0) {
|
||||
|
@ -110,7 +110,7 @@ namespace toku {
|
|||
template<typename T>
|
||||
bool circular_buffer<T>::timedpush(const T &elt, toku_timespec_t *abstime) {
|
||||
bool pushed = false;
|
||||
invariant_notnull(abstime);
|
||||
paranoid_invariant_notnull(abstime);
|
||||
lock();
|
||||
if (is_full()) {
|
||||
++m_push_waiters;
|
||||
|
@ -131,7 +131,7 @@ namespace toku {
|
|||
template<typename T>
|
||||
T circular_buffer<T>::pop_and_maybe_signal_unlocked(void) {
|
||||
toku_mutex_assert_locked(&m_lock);
|
||||
invariant(!is_empty());
|
||||
paranoid_invariant(!is_empty());
|
||||
T ret = *get_addr(m_begin);
|
||||
++m_begin;
|
||||
if (m_push_waiters > 0) {
|
||||
|
@ -156,7 +156,7 @@ namespace toku {
|
|||
template<typename T>
|
||||
bool circular_buffer<T>::trypop(T * const eltp) {
|
||||
bool popped = false;
|
||||
invariant_notnull(eltp);
|
||||
paranoid_invariant_notnull(eltp);
|
||||
lock();
|
||||
if (!is_empty() && m_pop_waiters == 0) {
|
||||
*eltp = pop_and_maybe_signal_unlocked();
|
||||
|
@ -169,8 +169,8 @@ namespace toku {
|
|||
template<typename T>
|
||||
bool circular_buffer<T>::timedpop(T * const eltp, toku_timespec_t *abstime) {
|
||||
bool popped = false;
|
||||
invariant_notnull(eltp);
|
||||
invariant_notnull(abstime);
|
||||
paranoid_invariant_notnull(eltp);
|
||||
paranoid_invariant_notnull(abstime);
|
||||
lock();
|
||||
if (is_empty()) {
|
||||
++m_pop_waiters;
|
||||
|
|
|
@ -38,19 +38,19 @@ inline bool frwlock::queue_is_empty(void) const {
|
|||
}
|
||||
|
||||
inline void frwlock::enq_item(queue_item *const item) {
|
||||
invariant_null(item->next);
|
||||
paranoid_invariant_null(item->next);
|
||||
if (m_wait_tail != nullptr) {
|
||||
m_wait_tail->next = item;
|
||||
} else {
|
||||
invariant_null(m_wait_head);
|
||||
paranoid_invariant_null(m_wait_head);
|
||||
m_wait_head = item;
|
||||
}
|
||||
m_wait_tail = item;
|
||||
}
|
||||
|
||||
inline toku_cond_t *frwlock::deq_item(void) {
|
||||
invariant_notnull(m_wait_head);
|
||||
invariant_notnull(m_wait_tail);
|
||||
paranoid_invariant_notnull(m_wait_head);
|
||||
paranoid_invariant_notnull(m_wait_tail);
|
||||
queue_item *item = m_wait_head;
|
||||
m_wait_head = m_wait_head->next;
|
||||
if (m_wait_tail == item) {
|
||||
|
@ -79,10 +79,10 @@ inline void frwlock::write_lock(bool expensive) {
|
|||
toku_cond_destroy(&cond);
|
||||
|
||||
// Now it's our turn.
|
||||
invariant(m_num_want_write > 0);
|
||||
invariant_zero(m_num_readers);
|
||||
invariant_zero(m_num_writers);
|
||||
invariant_zero(m_num_signaled_readers);
|
||||
paranoid_invariant(m_num_want_write > 0);
|
||||
paranoid_invariant_zero(m_num_readers);
|
||||
paranoid_invariant_zero(m_num_writers);
|
||||
paranoid_invariant_zero(m_num_signaled_readers);
|
||||
|
||||
// Not waiting anymore; grab the lock.
|
||||
--m_num_want_write;
|
||||
|
@ -99,8 +99,8 @@ inline bool frwlock::try_write_lock(bool expensive) {
|
|||
return false;
|
||||
}
|
||||
// No one holds the lock. Grant the write lock.
|
||||
invariant_zero(m_num_want_write);
|
||||
invariant_zero(m_num_want_read);
|
||||
paranoid_invariant_zero(m_num_want_write);
|
||||
paranoid_invariant_zero(m_num_want_read);
|
||||
m_num_writers = 1;
|
||||
m_current_writer_expensive = expensive;
|
||||
return true;
|
||||
|
@ -111,11 +111,11 @@ inline void frwlock::read_lock(void) {
|
|||
if (m_num_writers > 0 || m_num_want_write > 0) {
|
||||
if (!m_wait_read_is_in_queue) {
|
||||
// Throw the read cond_t onto the queue.
|
||||
invariant(m_num_signaled_readers == m_num_want_read);
|
||||
paranoid_invariant(m_num_signaled_readers == m_num_want_read);
|
||||
m_queue_item_read.next = nullptr;
|
||||
this->enq_item(&m_queue_item_read);
|
||||
m_wait_read_is_in_queue = true;
|
||||
invariant(!m_read_wait_expensive);
|
||||
paranoid_invariant(!m_read_wait_expensive);
|
||||
m_read_wait_expensive = (
|
||||
m_current_writer_expensive ||
|
||||
(m_num_expensive_want_write > 0)
|
||||
|
@ -127,9 +127,9 @@ inline void frwlock::read_lock(void) {
|
|||
toku_cond_wait(&m_wait_read, m_mutex);
|
||||
|
||||
// Now it's our turn.
|
||||
invariant_zero(m_num_writers);
|
||||
invariant(m_num_want_read > 0);
|
||||
invariant(m_num_signaled_readers > 0);
|
||||
paranoid_invariant_zero(m_num_writers);
|
||||
paranoid_invariant(m_num_want_read > 0);
|
||||
paranoid_invariant(m_num_signaled_readers > 0);
|
||||
|
||||
// Not waiting anymore; grab the lock.
|
||||
--m_num_want_read;
|
||||
|
@ -153,17 +153,17 @@ inline bool frwlock::try_read_lock(void) {
|
|||
inline void frwlock::maybe_signal_next_writer(void) {
|
||||
if (m_num_want_write > 0 && m_num_signaled_readers == 0 && m_num_readers == 0) {
|
||||
toku_cond_t *cond = this->deq_item();
|
||||
invariant(cond != &m_wait_read);
|
||||
paranoid_invariant(cond != &m_wait_read);
|
||||
// Grant write lock to waiting writer.
|
||||
invariant(m_num_want_write > 0);
|
||||
paranoid_invariant(m_num_want_write > 0);
|
||||
toku_cond_signal(cond);
|
||||
}
|
||||
}
|
||||
|
||||
inline void frwlock::read_unlock(void) {
|
||||
toku_mutex_assert_locked(m_mutex);
|
||||
invariant(m_num_writers == 0);
|
||||
invariant(m_num_readers > 0);
|
||||
paranoid_invariant(m_num_writers == 0);
|
||||
paranoid_invariant(m_num_readers > 0);
|
||||
--m_num_readers;
|
||||
this->maybe_signal_next_writer();
|
||||
}
|
||||
|
@ -180,18 +180,18 @@ inline bool frwlock::read_lock_is_expensive(void) {
|
|||
|
||||
|
||||
inline void frwlock::maybe_signal_or_broadcast_next(void) {
|
||||
invariant(m_num_signaled_readers == 0);
|
||||
paranoid_invariant(m_num_signaled_readers == 0);
|
||||
|
||||
if (this->queue_is_empty()) {
|
||||
invariant(m_num_want_write == 0);
|
||||
invariant(m_num_want_read == 0);
|
||||
paranoid_invariant(m_num_want_write == 0);
|
||||
paranoid_invariant(m_num_want_read == 0);
|
||||
return;
|
||||
}
|
||||
toku_cond_t *cond = this->deq_item();
|
||||
if (cond == &m_wait_read) {
|
||||
// Grant read locks to all waiting readers
|
||||
invariant(m_wait_read_is_in_queue);
|
||||
invariant(m_num_want_read > 0);
|
||||
paranoid_invariant(m_wait_read_is_in_queue);
|
||||
paranoid_invariant(m_num_want_read > 0);
|
||||
m_num_signaled_readers = m_num_want_read;
|
||||
m_wait_read_is_in_queue = false;
|
||||
m_read_wait_expensive = false;
|
||||
|
@ -199,14 +199,14 @@ inline void frwlock::maybe_signal_or_broadcast_next(void) {
|
|||
}
|
||||
else {
|
||||
// Grant write lock to waiting writer.
|
||||
invariant(m_num_want_write > 0);
|
||||
paranoid_invariant(m_num_want_write > 0);
|
||||
toku_cond_signal(cond);
|
||||
}
|
||||
}
|
||||
|
||||
inline void frwlock::write_unlock(void) {
|
||||
toku_mutex_assert_locked(m_mutex);
|
||||
invariant(m_num_writers == 1);
|
||||
paranoid_invariant(m_num_writers == 1);
|
||||
m_num_writers = 0;
|
||||
m_current_writer_expensive = false;
|
||||
this->maybe_signal_or_broadcast_next();
|
||||
|
|
|
@ -51,7 +51,7 @@ template<typename T> class GrowableArray {
|
|||
void store_unchecked (size_t i, T v)
|
||||
// Effect: Store v in the ith element. If i is out of range, the system asserts.
|
||||
{
|
||||
assert(i<m_size);
|
||||
paranoid_invariant(i<m_size);
|
||||
m_array[i]=v;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ void toku_mempool_zero(struct mempool *mp) {
|
|||
void toku_mempool_copy_construct(struct mempool *mp, const void * const data_source, const size_t data_size) {
|
||||
// printf("mempool_copy %p %p %lu\n", mp, data_source, data_size);
|
||||
if (data_size) {
|
||||
invariant(data_source);
|
||||
paranoid_invariant(data_source);
|
||||
toku_mempool_construct(mp, data_size);
|
||||
memcpy(mp->base, data_source, data_size);
|
||||
mp->free_offset = data_size; // address of first available memory for new data
|
||||
|
@ -49,8 +49,8 @@ void toku_mempool_copy_construct(struct mempool *mp, const void * const data_sou
|
|||
// TODO 4050 this is dirty, try to replace all uses of this
|
||||
void toku_mempool_init(struct mempool *mp, void *base, size_t size) {
|
||||
// printf("mempool_init %p %p %lu\n", mp, base, size);
|
||||
invariant(base != 0);
|
||||
invariant(size < (1U<<31)); // used to be assert(size >= 0), but changed to size_t so now let's make sure it's not more than 2GB...
|
||||
paranoid_invariant(base != 0);
|
||||
paranoid_invariant(size < (1U<<31)); // used to be assert(size >= 0), but changed to size_t so now let's make sure it's not more than 2GB...
|
||||
mp->base = base;
|
||||
mp->size = size;
|
||||
mp->free_offset = 0; // address of first available memory
|
||||
|
@ -106,10 +106,10 @@ size_t toku_mempool_get_allocated_space(struct mempool *mp) {
|
|||
}
|
||||
|
||||
void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) {
|
||||
invariant(size < (1U<<31));
|
||||
invariant(mp->size < (1U<<31));
|
||||
invariant(mp->free_offset < (1U<<31));
|
||||
assert(mp->free_offset <= mp->size);
|
||||
paranoid_invariant(size < (1U<<31));
|
||||
paranoid_invariant(mp->size < (1U<<31));
|
||||
paranoid_invariant(mp->free_offset < (1U<<31));
|
||||
paranoid_invariant(mp->free_offset <= mp->size);
|
||||
void *vp;
|
||||
size_t offset = (mp->free_offset + (alignment-1)) & ~(alignment-1);
|
||||
//printf("mempool_malloc size=%ld base=%p free_offset=%ld mp->size=%ld offset=%ld\n", size, mp->base, mp->free_offset, mp->size, offset);
|
||||
|
@ -119,18 +119,18 @@ void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) {
|
|||
vp = (char *)mp->base + offset;
|
||||
mp->free_offset = offset + size;
|
||||
}
|
||||
assert(mp->free_offset <= mp->size);
|
||||
assert(((long)vp & (alignment-1)) == 0);
|
||||
assert(vp == 0 || toku_mempool_inrange(mp, vp, size));
|
||||
paranoid_invariant(mp->free_offset <= mp->size);
|
||||
paranoid_invariant(((long)vp & (alignment-1)) == 0);
|
||||
paranoid_invariant(vp == 0 || toku_mempool_inrange(mp, vp, size));
|
||||
//printf("mempool returning %p\n", vp);
|
||||
return vp;
|
||||
}
|
||||
|
||||
// if vp is null then we are freeing something, but not specifying what. The data won't be freed until compression is done.
|
||||
void toku_mempool_mfree(struct mempool *mp, void *vp, size_t size) {
|
||||
if (vp) assert(toku_mempool_inrange(mp, vp, size));
|
||||
if (vp) { paranoid_invariant(toku_mempool_inrange(mp, vp, size)); }
|
||||
mp->frag_size += size;
|
||||
assert(mp->frag_size <= mp->size);
|
||||
paranoid_invariant(mp->frag_size <= mp->size);
|
||||
}
|
||||
|
||||
|
||||
|
|
60
util/omt.cc
60
util/omt.cc
|
@ -44,7 +44,7 @@ void omt<omtdata_t, omtdataout_t, supports_marks>::create_from_sorted_array(cons
|
|||
|
||||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::create_steal_sorted_array(omtdata_t **const values, const uint32_t numvalues, const uint32_t new_capacity) {
|
||||
invariant_notnull(values);
|
||||
paranoid_invariant_notnull(values);
|
||||
this->create_internal_no_array(new_capacity);
|
||||
this->d.a.num_values = numvalues;
|
||||
this->d.a.values = *values;
|
||||
|
@ -57,7 +57,7 @@ void omt<omtdata_t, omtdataout_t, supports_marks>::create_steal_sorted_array(omt
|
|||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::split_at(omt *const newomt, const uint32_t idx) {
|
||||
barf_if_marked(*this);
|
||||
invariant_notnull(newomt);
|
||||
paranoid_invariant_notnull(newomt);
|
||||
if (idx > this->size()) { return EINVAL; }
|
||||
this->convert_to_array();
|
||||
const uint32_t newsize = this->size() - idx;
|
||||
|
@ -73,8 +73,8 @@ int omt<omtdata_t, omtdataout_t, supports_marks>::split_at(omt *const newomt, co
|
|||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::merge(omt *const leftomt, omt *const rightomt) {
|
||||
barf_if_marked(*this);
|
||||
invariant_notnull(leftomt);
|
||||
invariant_notnull(rightomt);
|
||||
paranoid_invariant_notnull(leftomt);
|
||||
paranoid_invariant_notnull(rightomt);
|
||||
const uint32_t leftsize = leftomt->size();
|
||||
const uint32_t rightsize = rightomt->size();
|
||||
const uint32_t newsize = leftsize + rightsize;
|
||||
|
@ -106,7 +106,7 @@ void omt<omtdata_t, omtdataout_t, supports_marks>::merge(omt *const leftomt, omt
|
|||
}
|
||||
rightomt->destroy();
|
||||
this->d.a.num_values += rightsize;
|
||||
invariant(this->size() == newsize);
|
||||
paranoid_invariant(this->size() == newsize);
|
||||
if (supports_marks) {
|
||||
this->convert_to_tree();
|
||||
}
|
||||
|
@ -295,7 +295,7 @@ int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_and_mark_range(const u
|
|||
static_assert(supports_marks, "does not support marks");
|
||||
if (right > this->size()) { return EINVAL; }
|
||||
if (left == right) { return 0; }
|
||||
invariant(!this->is_array);
|
||||
paranoid_invariant(!this->is_array);
|
||||
return this->iterate_and_mark_range_internal<iterate_extra_t, f>(left, right, this->d.t.root, 0, iterate_extra);
|
||||
}
|
||||
|
||||
|
@ -305,7 +305,7 @@ template<typename iterate_extra_t,
|
|||
int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked(iterate_extra_t *const iterate_extra) const {
|
||||
static_assert(supports_marks, "does not support marks");
|
||||
invariant(!this->is_array);
|
||||
paranoid_invariant(!this->is_array);
|
||||
return this->iterate_over_marked_internal<iterate_extra_t, f>(this->d.t.root, 0, iterate_extra);
|
||||
}
|
||||
|
||||
|
@ -334,7 +334,7 @@ void omt<omtdata_t, omtdataout_t, supports_marks>::delete_all_marked(void) {
|
|||
if (!this->has_marks()) {
|
||||
return;
|
||||
}
|
||||
invariant(!this->is_array);
|
||||
paranoid_invariant(!this->is_array);
|
||||
GrowableArray<node_idx> marked_indexes;
|
||||
marked_indexes.init();
|
||||
|
||||
|
@ -353,7 +353,7 @@ void omt<omtdata_t, omtdataout_t, supports_marks>::delete_all_marked(void) {
|
|||
}
|
||||
|
||||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent_internal(const subtree &subtree, const bool allow_marks) const {
|
||||
uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent_internal(const subtree &subtree, const bool UU(allow_marks)) const {
|
||||
if (subtree.is_null()) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -361,14 +361,14 @@ uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent_i
|
|||
uint32_t num_marks = verify_marks_consistent_internal(node.left, node.get_marks_below());
|
||||
num_marks += verify_marks_consistent_internal(node.right, node.get_marks_below());
|
||||
if (node.get_marks_below()) {
|
||||
invariant(allow_marks);
|
||||
invariant(num_marks > 0);
|
||||
paranoid_invariant(allow_marks);
|
||||
paranoid_invariant(num_marks > 0);
|
||||
} else {
|
||||
// redundant with invariant below, but nice to have explicitly
|
||||
invariant(num_marks == 0);
|
||||
paranoid_invariant(num_marks == 0);
|
||||
}
|
||||
if (node.get_marked()) {
|
||||
invariant(allow_marks);
|
||||
paranoid_invariant(allow_marks);
|
||||
++num_marks;
|
||||
}
|
||||
return num_marks;
|
||||
|
@ -377,7 +377,7 @@ uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent_i
|
|||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent(void) const {
|
||||
static_assert(supports_marks, "does not support marks");
|
||||
invariant(!this->is_array);
|
||||
paranoid_invariant(!this->is_array);
|
||||
this->verify_marks_consistent_internal(this->d.t.root, true);
|
||||
}
|
||||
|
||||
|
@ -425,7 +425,7 @@ template<typename omtcmp_t,
|
|||
int omt<omtdata_t, omtdataout_t, supports_marks>::find(const omtcmp_t &extra, int direction, omtdataout_t *const value, uint32_t *const idxp) const {
|
||||
uint32_t tmp_index;
|
||||
uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
|
||||
invariant(direction != 0);
|
||||
paranoid_invariant(direction != 0);
|
||||
if (direction < 0) {
|
||||
if (this->is_array) {
|
||||
return this->find_internal_minus_array<omtcmp_t, h>(extra, value, child_idxp);
|
||||
|
@ -476,15 +476,15 @@ uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::nweight(const subtree &su
|
|||
|
||||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
typename omt<omtdata_t, omtdataout_t, supports_marks>::node_idx omt<omtdata_t, omtdataout_t, supports_marks>::node_malloc(void) {
|
||||
invariant(this->d.t.free_idx < this->capacity);
|
||||
paranoid_invariant(this->d.t.free_idx < this->capacity);
|
||||
omt_node &n = this->d.t.nodes[this->d.t.free_idx];
|
||||
n.clear_stolen_bits();
|
||||
return this->d.t.free_idx++;
|
||||
}
|
||||
|
||||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::node_free(const node_idx idx) {
|
||||
invariant(idx < this->capacity);
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::node_free(const node_idx UU(idx)) {
|
||||
paranoid_invariant(idx < this->capacity);
|
||||
}
|
||||
|
||||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
|
@ -604,7 +604,7 @@ bool omt<omtdata_t, omtdataout_t, supports_marks>::will_need_rebalance(const sub
|
|||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::insert_internal(subtree *const subtreep, const omtdata_t &value, const uint32_t idx, subtree **const rebalance_subtree) {
|
||||
if (subtreep->is_null()) {
|
||||
invariant_zero(idx);
|
||||
paranoid_invariant_zero(idx);
|
||||
const node_idx newidx = this->node_malloc();
|
||||
omt_node *const newnode = &this->d.t.nodes[newidx];
|
||||
newnode->weight = 1;
|
||||
|
@ -637,7 +637,7 @@ void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal_array(const o
|
|||
|
||||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal(const subtree &subtree, const omtdata_t &value, const uint32_t idx) {
|
||||
invariant(!subtree.is_null());
|
||||
paranoid_invariant(!subtree.is_null());
|
||||
omt_node &n = this->d.t.nodes[subtree.get_index()];
|
||||
const uint32_t leftweight = this->nweight(n.left);
|
||||
if (idx < leftweight) {
|
||||
|
@ -651,9 +651,9 @@ void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal(const subtree
|
|||
|
||||
template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
||||
void omt<omtdata_t, omtdataout_t, supports_marks>::delete_internal(subtree *const subtreep, const uint32_t idx, omt_node *const copyn, subtree **const rebalance_subtree) {
|
||||
invariant_notnull(subtreep);
|
||||
invariant_notnull(rebalance_subtree);
|
||||
invariant(!subtreep->is_null());
|
||||
paranoid_invariant_notnull(subtreep);
|
||||
paranoid_invariant_notnull(rebalance_subtree);
|
||||
paranoid_invariant(!subtreep->is_null());
|
||||
omt_node &n = this->d.t.nodes[subtreep->get_index()];
|
||||
const uint32_t leftweight = this->nweight(n.left);
|
||||
if (idx < leftweight) {
|
||||
|
@ -774,7 +774,7 @@ template<typename iterate_extra_t,
|
|||
int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_and_mark_range_internal(const uint32_t left, const uint32_t right,
|
||||
const subtree &subtree, const uint32_t idx,
|
||||
iterate_extra_t *const iterate_extra) {
|
||||
invariant(!subtree.is_null());
|
||||
paranoid_invariant(!subtree.is_null());
|
||||
int r;
|
||||
omt_node &n = this->d.t.nodes[subtree.get_index()];
|
||||
const uint32_t idx_root = idx + this->nweight(n.left);
|
||||
|
@ -925,7 +925,7 @@ template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
|||
template<typename omtcmp_t,
|
||||
int (*h)(const omtdata_t &, const omtcmp_t &)>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const {
|
||||
invariant_notnull(idxp);
|
||||
paranoid_invariant_notnull(idxp);
|
||||
uint32_t min = this->d.a.start_idx;
|
||||
uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
|
||||
uint32_t best_pos = subtree::NODE_NULL;
|
||||
|
@ -963,7 +963,7 @@ template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
|||
template<typename omtcmp_t,
|
||||
int (*h)(const omtdata_t &, const omtcmp_t &)>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero(const subtree &subtree, const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const {
|
||||
invariant_notnull(idxp);
|
||||
paranoid_invariant_notnull(idxp);
|
||||
if (subtree.is_null()) {
|
||||
*idxp = 0;
|
||||
return DB_NOTFOUND;
|
||||
|
@ -993,7 +993,7 @@ template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
|||
template<typename omtcmp_t,
|
||||
int (*h)(const omtdata_t &, const omtcmp_t &)>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const {
|
||||
invariant_notnull(idxp);
|
||||
paranoid_invariant_notnull(idxp);
|
||||
uint32_t min = this->d.a.start_idx;
|
||||
uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
|
||||
uint32_t best = subtree::NODE_NULL;
|
||||
|
@ -1020,7 +1020,7 @@ template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
|||
template<typename omtcmp_t,
|
||||
int (*h)(const omtdata_t &, const omtcmp_t &)>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus(const subtree &subtree, const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const {
|
||||
invariant_notnull(idxp);
|
||||
paranoid_invariant_notnull(idxp);
|
||||
if (subtree.is_null()) {
|
||||
return DB_NOTFOUND;
|
||||
}
|
||||
|
@ -1049,7 +1049,7 @@ template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
|||
template<typename omtcmp_t,
|
||||
int (*h)(const omtdata_t &, const omtcmp_t &)>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus_array(const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const {
|
||||
invariant_notnull(idxp);
|
||||
paranoid_invariant_notnull(idxp);
|
||||
uint32_t min = this->d.a.start_idx;
|
||||
uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
|
||||
uint32_t best = subtree::NODE_NULL;
|
||||
|
@ -1076,7 +1076,7 @@ template<typename omtdata_t, typename omtdataout_t, bool supports_marks>
|
|||
template<typename omtcmp_t,
|
||||
int (*h)(const omtdata_t &, const omtcmp_t &)>
|
||||
int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus(const subtree &subtree, const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const {
|
||||
invariant_notnull(idxp);
|
||||
paranoid_invariant_notnull(idxp);
|
||||
if (subtree.is_null()) {
|
||||
return DB_NOTFOUND;
|
||||
}
|
||||
|
|
|
@ -93,7 +93,7 @@ public:
|
|||
}
|
||||
|
||||
inline void set_index(uint32_t index) {
|
||||
invariant(index != NODE_NULL);
|
||||
paranoid_invariant(index != NODE_NULL);
|
||||
m_index = index;
|
||||
}
|
||||
} __attribute__((__packed__,aligned(4)));
|
||||
|
@ -126,7 +126,7 @@ public:
|
|||
}
|
||||
|
||||
inline void set_index(uint32_t index) {
|
||||
invariant(index < NODE_NULL);
|
||||
paranoid_invariant(index < NODE_NULL);
|
||||
this->set_index_internal(index);
|
||||
}
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "partitioned_counter.h"
|
||||
#include "doubly_linked_list.h"
|
||||
#include "growable_array.h"
|
||||
#include <portability/toku_atomic.h>
|
||||
|
||||
#ifdef __APPLE__
|
||||
// TODO(leif): The __thread declspec is broken in ways I don't understand
|
||||
|
@ -36,7 +37,7 @@ void destroy_partitioned_counter(PARTITIONED_COUNTER counter) {
|
|||
}
|
||||
|
||||
void increment_partitioned_counter(PARTITIONED_COUNTER counter, uint64_t delta) {
|
||||
(void) __sync_fetch_and_add(&counter->v, delta);
|
||||
(void) toku_sync_fetch_and_add(&counter->v, delta);
|
||||
}
|
||||
|
||||
uint64_t read_partitioned_counter(PARTITIONED_COUNTER counter) {
|
||||
|
|
|
@ -150,8 +150,10 @@ rwlock_init(RWLOCK rwlock) {
|
|||
static __attribute__((__unused__))
|
||||
void
|
||||
rwlock_destroy(RWLOCK rwlock) {
|
||||
assert(rwlock->reader == 0 && rwlock->want_read == 0);
|
||||
assert(rwlock->writer == 0 && rwlock->want_write == 0);
|
||||
paranoid_invariant(rwlock->reader == 0);
|
||||
paranoid_invariant(rwlock->want_read == 0);
|
||||
paranoid_invariant(rwlock->writer == 0);
|
||||
paranoid_invariant(rwlock->want_write == 0);
|
||||
toku_cond_destroy(&rwlock->wait_read);
|
||||
toku_cond_destroy(&rwlock->wait_write);
|
||||
}
|
||||
|
@ -160,7 +162,7 @@ rwlock_destroy(RWLOCK rwlock) {
|
|||
// expects: mutex is locked
|
||||
|
||||
static inline void rwlock_read_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
|
||||
assert(!rwlock->wait_users_go_to_zero);
|
||||
paranoid_invariant(!rwlock->wait_users_go_to_zero);
|
||||
if (rwlock->writer || rwlock->want_write) {
|
||||
rwlock->want_read++;
|
||||
while (rwlock->writer || rwlock->want_write) {
|
||||
|
@ -175,8 +177,8 @@ static inline void rwlock_read_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
|
|||
// expects: mutex is locked
|
||||
|
||||
static inline void rwlock_read_unlock(RWLOCK rwlock) {
|
||||
assert(rwlock->reader > 0);
|
||||
assert(rwlock->writer == 0);
|
||||
paranoid_invariant(rwlock->reader > 0);
|
||||
paranoid_invariant(rwlock->writer == 0);
|
||||
rwlock->reader--;
|
||||
if (rwlock->reader == 0 && rwlock->want_write) {
|
||||
toku_cond_signal(&rwlock->wait_write);
|
||||
|
@ -190,7 +192,7 @@ static inline void rwlock_read_unlock(RWLOCK rwlock) {
|
|||
// expects: mutex is locked
|
||||
|
||||
static inline void rwlock_write_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
|
||||
assert(!rwlock->wait_users_go_to_zero);
|
||||
paranoid_invariant(!rwlock->wait_users_go_to_zero);
|
||||
if (rwlock->reader || rwlock->writer) {
|
||||
rwlock->want_write++;
|
||||
while (rwlock->reader || rwlock->writer) {
|
||||
|
@ -205,8 +207,8 @@ static inline void rwlock_write_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
|
|||
// expects: mutex is locked
|
||||
|
||||
static inline void rwlock_write_unlock(RWLOCK rwlock) {
|
||||
assert(rwlock->reader == 0);
|
||||
assert(rwlock->writer == 1);
|
||||
paranoid_invariant(rwlock->reader == 0);
|
||||
paranoid_invariant(rwlock->writer == 1);
|
||||
rwlock->writer--;
|
||||
if (rwlock->want_write) {
|
||||
toku_cond_signal(&rwlock->wait_write);
|
||||
|
@ -255,7 +257,7 @@ static inline void rwlock_wait_for_users(
|
|||
toku_mutex_t *mutex
|
||||
)
|
||||
{
|
||||
assert(!rwlock->wait_users_go_to_zero);
|
||||
paranoid_invariant(!rwlock->wait_users_go_to_zero);
|
||||
toku_cond_t cond;
|
||||
toku_cond_init(&cond, NULL);
|
||||
while (rwlock_users(rwlock) > 0) {
|
||||
|
|
Some files were not shown because too many files have changed in this diff.