From 7942bcf20945aafa86b9c8a5feac914f807747c6 Mon Sep 17 00:00:00 2001 From: Zardosht Kasheff Date: Wed, 17 Apr 2013 00:00:13 -0400 Subject: [PATCH] [t:4028], merge to main git-svn-id: file:///svn/toku/tokudb@41142 c7de825b-a66e-492c-adef-691d508d4ae1 --- newbrt/brt-cachetable-wrappers.c | 6 + newbrt/brt-cachetable-wrappers.h | 3 + newbrt/brt-flusher.c | 12 +- newbrt/brt-hot-flusher.c | 1 + newbrt/brt-internal.h | 52 +- newbrt/brt-serialize.c | 172 ++- newbrt/brt-test-helpers.c | 4 + newbrt/brt-verify.c | 1 + newbrt/brt.c | 336 ++++-- newbrt/brtdump.c | 12 +- newbrt/brtloader.c | 11 +- newbrt/brttypes.h | 1 + newbrt/cachetable.c | 980 +++++++++++------- newbrt/cachetable.h | 24 +- newbrt/checkpoint.c | 6 +- newbrt/fifo.c | 16 + newbrt/fifo.h | 2 + newbrt/mempool.c | 8 + newbrt/mempool.h | 2 + newbrt/rollback.c | 11 +- newbrt/tests/brt-bfe-query.c | 52 +- newbrt/tests/brt-clock-test.c | 33 +- newbrt/tests/brt-serialize-benchmark.c | 16 +- newbrt/tests/brt-serialize-test.c | 237 +++-- newbrt/tests/cachetable-3969.c | 9 +- newbrt/tests/cachetable-4357.c | 2 + newbrt/tests/cachetable-4365.c | 2 + newbrt/tests/cachetable-4545.c | 10 +- newbrt/tests/cachetable-all-write.c | 10 +- newbrt/tests/cachetable-checkpoint-pending.c | 9 +- .../cachetable-checkpoint-pinned-nodes.c | 9 +- .../cachetable-checkpoint-prefetched-nodes.c | 5 +- newbrt/tests/cachetable-checkpoint-test.c | 16 +- newbrt/tests/cachetable-cleaner-checkpoint.c | 6 +- newbrt/tests/cachetable-cleaner-checkpoint2.c | 6 +- newbrt/tests/cachetable-cleaner-dev-null.c | 6 +- ...chetable-cleaner-thread-attrs-accumulate.c | 6 +- ...hetable-cleaner-thread-everything-pinned.c | 1 + ...le-cleaner-thread-nothing-needs-flushing.c | 1 + .../tests/cachetable-cleaner-thread-simple.c | 2 + newbrt/tests/cachetable-clock-eviction.c | 13 +- newbrt/tests/cachetable-clock-eviction2.c | 17 +- newbrt/tests/cachetable-clock-eviction3.c | 18 +- newbrt/tests/cachetable-clock-eviction4.c | 16 +- newbrt/tests/cachetable-clone-checkpoint.c | 109 ++ ...chetable-clone-partial-fetch-pinned-node.c | 113 ++ newbrt/tests/cachetable-clone-partial-fetch.c | 113 ++ .../tests/cachetable-clone-pin-nonblocking.c | 96 ++ newbrt/tests/cachetable-clone-unpin-remove.c | 102 ++ newbrt/tests/cachetable-eviction-close-test.c | 10 +- .../tests/cachetable-eviction-close-test2.c | 10 +- .../cachetable-eviction-getandpin-test.c | 10 +- .../cachetable-eviction-getandpin-test2.c | 5 + .../tests/cachetable-flush-during-cleaner.c | 2 +- newbrt/tests/cachetable-getandpin-test.c | 7 +- .../cachetable-kibbutz_and_flush_cachefile.c | 4 +- newbrt/tests/cachetable-partial-fetch.c | 18 +- newbrt/tests/cachetable-pin-checkpoint.c | 27 +- ...hetable-pin-nonblocking-checkpoint-clean.c | 7 +- .../cachetable-prefetch-checkpoint-test.c | 5 +- .../cachetable-prefetch-close-leak-test.c | 5 +- newbrt/tests/cachetable-prefetch-close-test.c | 6 +- .../cachetable-prefetch-flowcontrol-test.c | 5 +- .../cachetable-prefetch-getandpin-test.c | 12 +- .../cachetable-prefetch-maybegetandpin-test.c | 1 + newbrt/tests/cachetable-prefetch2-test.c | 1 + newbrt/tests/cachetable-put-checkpoint.c | 32 +- newbrt/tests/cachetable-rename-test.c | 8 +- newbrt/tests/cachetable-scan.c | 8 +- newbrt/tests/cachetable-simple-clone.c | 153 +++ newbrt/tests/cachetable-simple-clone2.c | 103 ++ .../tests/cachetable-simple-maybe-get-pin.c | 2 +- .../tests/cachetable-simple-pin-dep-nodes.c | 10 +- .../tests/cachetable-simple-pin-nonblocking.c | 22 +- newbrt/tests/cachetable-simple-pin.c | 12 +- .../tests/cachetable-simple-put-dep-nodes.c 
| 9 +- ...achetable-simple-unpin-remove-checkpoint.c | 4 +- newbrt/tests/cachetable-simple-verify.c | 2 +- newbrt/tests/cachetable-test.c | 63 +- newbrt/tests/cachetable-test2.c | 10 +- .../tests/cachetable-unpin-and-remove-test.c | 3 +- .../cachetable-unpin-remove-and-checkpoint.c | 4 +- newbrt/tests/cachetable-writer-thread-limit.c | 4 +- newbrt/tests/test-checkpoint-during-flush.c | 4 + newbrt/tests/test-checkpoint-during-merge.c | 6 + .../tests/test-checkpoint-during-rebalance.c | 5 + newbrt/tests/test-checkpoint-during-split.c | 6 + newbrt/tests/test-dirty-flushes-on-cleaner.c | 4 + newbrt/tests/test-flushes-on-cleaner.c | 4 + newbrt/tests/test-merges-on-cleaner.c | 2 + newbrt/tests/test.h | 11 +- newbrt/tests/test4244.c | 1 + newbrt/tests/test4302.c | 7 +- newbrt/workqueue.c | 6 +- newbrt/workqueue.h | 4 +- src/tests/perf_checkpoint_var.c | 87 ++ 96 files changed, 2598 insertions(+), 828 deletions(-) create mode 100644 newbrt/tests/cachetable-clone-checkpoint.c create mode 100644 newbrt/tests/cachetable-clone-partial-fetch-pinned-node.c create mode 100644 newbrt/tests/cachetable-clone-partial-fetch.c create mode 100644 newbrt/tests/cachetable-clone-pin-nonblocking.c create mode 100644 newbrt/tests/cachetable-clone-unpin-remove.c create mode 100644 newbrt/tests/cachetable-simple-clone.c create mode 100644 newbrt/tests/cachetable-simple-clone2.c diff --git a/newbrt/brt-cachetable-wrappers.c b/newbrt/brt-cachetable-wrappers.c index f75f11f2020..cd76071f557 100644 --- a/newbrt/brt-cachetable-wrappers.c +++ b/newbrt/brt-cachetable-wrappers.c @@ -127,6 +127,7 @@ toku_pin_brtnode( ANCESTORS ancestors, const PIVOT_BOUNDS bounds, BRTNODE_FETCH_EXTRA bfe, + BOOL may_modify_node, BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this BRTNODE *node_p, BOOL* msgs_applied) @@ -143,6 +144,7 @@ toku_pin_brtnode( toku_brtnode_fetch_callback, toku_brtnode_pf_req_callback, toku_brtnode_pf_callback, + may_modify_node, bfe, //read_extraargs unlockers); if (r==0) { @@ -168,6 +170,7 @@ toku_pin_brtnode_holding_lock( const PIVOT_BOUNDS bounds, BRTNODE_FETCH_EXTRA bfe, BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this + BOOL may_modify_node, BRTNODE *node_p) { void *node_v; @@ -181,6 +184,7 @@ toku_pin_brtnode_holding_lock( toku_brtnode_fetch_callback, toku_brtnode_pf_req_callback, toku_brtnode_pf_callback, + may_modify_node, bfe ); assert(r==0); @@ -196,6 +200,7 @@ toku_pin_brtnode_off_client_thread( BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE_FETCH_EXTRA bfe, + BOOL may_modify_node, u_int32_t num_dependent_nodes, BRTNODE* dependent_nodes, BRTNODE *node_p) @@ -222,6 +227,7 @@ toku_pin_brtnode_off_client_thread( toku_brtnode_fetch_callback, toku_brtnode_pf_req_callback, toku_brtnode_pf_callback, + may_modify_node, bfe, num_dependent_nodes, dependent_cf, diff --git a/newbrt/brt-cachetable-wrappers.h b/newbrt/brt-cachetable-wrappers.h index 8f96c093d7f..506edf60ead 100644 --- a/newbrt/brt-cachetable-wrappers.h +++ b/newbrt/brt-cachetable-wrappers.h @@ -71,6 +71,7 @@ toku_pin_brtnode( ANCESTORS ancestors, const PIVOT_BOUNDS pbounds, BRTNODE_FETCH_EXTRA bfe, + BOOL may_modify_node, BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this BRTNODE *node_p, BOOL* msgs_applied @@ -88,6 +89,7 @@ toku_pin_brtnode_holding_lock( const PIVOT_BOUNDS pbounds, 
BRTNODE_FETCH_EXTRA bfe, BOOL apply_ancestor_messages, + BOOL may_modify_node, BRTNODE *node_p ); @@ -104,6 +106,7 @@ toku_pin_brtnode_off_client_thread( BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE_FETCH_EXTRA bfe, + BOOL may_modify_node, u_int32_t num_dependent_nodes, BRTNODE* dependent_nodes, BRTNODE *node_p diff --git a/newbrt/brt-flusher.c b/newbrt/brt-flusher.c index 6b8df0e528f..95a8a1cdd57 100644 --- a/newbrt/brt-flusher.c +++ b/newbrt/brt-flusher.c @@ -400,7 +400,7 @@ ct_maybe_merge_child(struct flusher_advice *fa, CACHEKEY *rootp = toku_calculate_root_offset_pointer(h, &fullhash); struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, h); - toku_pin_brtnode_off_client_thread(h, *rootp, fullhash, &bfe, 0,NULL, &root_node); + toku_pin_brtnode_off_client_thread(h, *rootp, fullhash, &bfe, TRUE, 0, NULL, &root_node); toku_assert_entire_node_in_memory(root_node); toku_brtheader_release_treelock(h); @@ -512,8 +512,6 @@ handle_split_of_child( BP_BLOCKNUM(node, childnum+1) = childb->thisnodename; BP_WORKDONE(node, childnum+1) = 0; BP_STATE(node,childnum+1) = PT_AVAIL; - BP_START(node,childnum+1) = 0; - BP_SIZE(node,childnum+1) = 0; set_BNC(node, childnum+1, toku_create_empty_nl()); @@ -824,8 +822,6 @@ brtleaf_split( for (int i = 0; i < num_children_in_b; i++) { BP_BLOCKNUM(B,i).b = 0; BP_STATE(B,i) = PT_AVAIL; - BP_START(B,i) = 0; - BP_SIZE(B,i) = 0; BP_WORKDONE(B,i) = 0; set_BLB(B, i, toku_create_empty_bn()); } @@ -1361,7 +1357,7 @@ brt_merge_child( u_int32_t childfullhash = compute_child_fullhash(h->cf, node, childnuma); struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, h); - toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnuma), childfullhash, &bfe, 1, &node, &childa); + toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnuma), childfullhash, &bfe, TRUE, 1, &node, &childa); } // for test call_flusher_thread_callback(ft_flush_before_pin_second_node_for_merge); @@ -1372,7 +1368,7 @@ brt_merge_child( u_int32_t childfullhash = compute_child_fullhash(h->cf, node, childnumb); struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, h); - toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnumb), childfullhash, &bfe, 2, dep_nodes, &childb); + toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnumb), childfullhash, &bfe, TRUE, 2, dep_nodes, &childb); } if (toku_bnc_n_entries(BNC(node,childnuma))>0) { @@ -1498,7 +1494,7 @@ flush_some_child( // Note that we don't read the entire node into memory yet. 
// The idea is let's try to do the minimum work before releasing the parent lock fill_bfe_for_min_read(&bfe, h); - toku_pin_brtnode_off_client_thread(h, targetchild, childfullhash, &bfe, 1, &parent, &child); + toku_pin_brtnode_off_client_thread(h, targetchild, childfullhash, &bfe, TRUE, 1, &parent, &child); // for test call_flusher_thread_callback(ft_flush_after_child_pin); diff --git a/newbrt/brt-hot-flusher.c b/newbrt/brt-hot-flusher.c index 0cecb3b17cb..ec4f19cbd1d 100644 --- a/newbrt/brt-hot-flusher.c +++ b/newbrt/brt-hot-flusher.c @@ -280,6 +280,7 @@ toku_brt_hot_optimize(BRT brt, (BLOCKNUM) *rootp, fullhash, &bfe, + TRUE, 0, NULL, &root); diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h index 42efdd7155d..5aa2608e512 100644 --- a/newbrt/brt-internal.h +++ b/newbrt/brt-internal.h @@ -188,6 +188,22 @@ typedef struct __attribute__((__packed__)) brtnode_child_pointer { } u; } BRTNODE_CHILD_POINTER; + +struct brtnode_disk_data { + // + // stores the offset to the beginning of the partition on disk from the brtnode, and the length, needed to read a partition off of disk + // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless + // The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition + // The SIZE is the size of the compressed partition. + // Rationale: We cannot store the size from the beginning of the node since we don't know how big the header will be. + // However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align. + u_int32_t start; + u_int32_t size; +}; +#define BP_START(node_dd,i) ((node_dd)[i].start) +#define BP_SIZE(node_dd,i) ((node_dd)[i].size) + + // a brtnode partition, associated with a child of a node struct __attribute__((__packed__)) brtnode_partition { // the following three variables are used for nonleaf nodes @@ -203,14 +219,6 @@ struct __attribute__((__packed__)) brtnode_partition { // enum pt_state state; // make this an enum to make debugging easier. // - // stores the offset to the beginning of the partition on disk from the brtnode, and the length, needed to read a partition off of disk - // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless - // The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition - // The SIZE is the size of the compressed partition. - // Rationale: We cannot store the size from the beginning of the node since we don't know how big the header will be. - // However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align. - u_int32_t start,size; - // // pointer to the partition. 
Depending on the state, they may be different things // if state == PT_INVALID, then the node was just initialized and ptr == NULL // if state == PT_ON_DISK, then ptr == NULL @@ -258,11 +266,7 @@ struct brtnode { // brtnode partition macros // BP stands for brtnode_partition #define BP_BLOCKNUM(node,i) ((node)->bp[i].blocknum) -#define BP_HAVE_FULLHASH(node,i) ((node)->bp[i].have_fullhash) -#define BP_FULLHASH(node,i) ((node)->bp[i].fullhash) #define BP_STATE(node,i) ((node)->bp[i].state) -#define BP_START(node,i) ((node)->bp[i].start) -#define BP_SIZE(node,i) ((node)->bp[i].size) #define BP_WORKDONE(node, i)((node)->bp[i].workdone) // @@ -448,18 +452,21 @@ toku_create_compressed_partition_from_available( int childnum, SUB_BLOCK sb ); +void rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize); int toku_serialize_brtnode_to_memory (BRTNODE node, + BRTNODE_DISK_DATA* ndd, unsigned int basementnodesize, + BOOL do_rebalancing, /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write); -int toku_serialize_brtnode_to(int fd, BLOCKNUM, BRTNODE node, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint); +int toku_serialize_brtnode_to(int fd, BLOCKNUM, BRTNODE node, BRTNODE_DISK_DATA* ndd, BOOL do_rebalancing, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint); int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint); int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *logp, struct brt_header *h); -void toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode_fetch_extra* bfe); +void toku_deserialize_bp_from_disk(BRTNODE node, BRTNODE_DISK_DATA ndd, int childnum, int fd, struct brtnode_fetch_extra* bfe); void toku_deserialize_bp_from_compressed(BRTNODE node, int childnum, DESCRIPTOR desc, brt_compare_func cmp); -int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, struct brtnode_fetch_extra* bfe); +int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, BRTNODE_DISK_DATA* ndd, struct brtnode_fetch_extra* bfe); unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */ int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); @@ -477,6 +484,8 @@ int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISK void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc); BASEMENTNODE toku_create_empty_bn(void); BASEMENTNODE toku_create_empty_bn_no_buffer(void); // create a basement node with a null buffer. 
+NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo); +BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn); NONLEAF_CHILDINFO toku_create_empty_nl(void); // FIXME needs toku prefix void destroy_basement_node (BASEMENTNODE bn); @@ -529,12 +538,13 @@ struct brtenv { }; void toku_brt_status_update_pivot_fetch_reason(struct brtnode_fetch_extra *bfe); -extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint); -extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, PAIR_ATTR *sizep, int*dirty, void*extraargs); -extern void toku_brtnode_pe_est_callback(void* brtnode_pv, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs); +extern void toku_brtnode_clone_callback(void* value_data, void** cloned_value_data, PAIR_ATTR* new_attr, BOOL for_checkpoint, void* write_extraargs); +extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void** UU(disk_data), void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint, BOOL is_clone); +extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, void** UU(disk_data), PAIR_ATTR *sizep, int*dirty, void*extraargs); +extern void toku_brtnode_pe_est_callback(void* brtnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs); extern int toku_brtnode_pe_callback (void *brtnode_pv, PAIR_ATTR old_attr, PAIR_ATTR* new_attr, void *extraargs); extern BOOL toku_brtnode_pf_req_callback(void* brtnode_pv, void* read_extraargs); -int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, PAIR_ATTR* sizep); +int toku_brtnode_pf_callback(void* brtnode_pv, void* UU(disk_data), void* read_extraargs, int fd, PAIR_ATTR* sizep); extern int toku_brtnode_cleaner_callback( void *brtnode_pv, BLOCKNUM blocknum, u_int32_t fullhash, void *extraargs); extern int toku_brt_alloc_init_header(BRT t, TOKUTXN txn); extern int toku_read_brt_header_and_store_in_cachefile (BRT brt, CACHEFILE cf, LSN max_acceptable_lsn, struct brt_header **header, BOOL* was_open); @@ -546,6 +556,7 @@ static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(struct brt_ wc.pe_est_callback = toku_brtnode_pe_est_callback; wc.pe_callback = toku_brtnode_pe_callback; wc.cleaner_callback = toku_brtnode_cleaner_callback; + wc.clone_callback = toku_brtnode_clone_callback; wc.write_extraargs = h; return wc; } @@ -900,6 +911,9 @@ typedef enum { BRT_STATUS_NUM_ROWS } brt_status_entry; +void brt_begin_checkpoint(void); +void brt_end_checkpoint(void); + typedef struct { bool initialized; TOKU_ENGINE_STATUS_ROW_S status[BRT_STATUS_NUM_ROWS]; diff --git a/newbrt/brt-serialize.c b/newbrt/brt-serialize.c index 933b857dc80..98d8dbb6159 100644 --- a/newbrt/brt-serialize.c +++ b/newbrt/brt-serialize.c @@ -237,7 +237,7 @@ serialize_node_header_size(BRTNODE node) { } static void -serialize_node_header(BRTNODE node, struct wbuf *wbuf) { +serialize_node_header(BRTNODE node, BRTNODE_DISK_DATA ndd, struct wbuf *wbuf) { if (node->height == 0) wbuf_nocrc_literal_bytes(wbuf, "tokuleaf", 8); else @@ -248,9 +248,9 @@ serialize_node_header(BRTNODE node, struct wbuf *wbuf) { wbuf_nocrc_uint(wbuf, BUILD_ID); wbuf_nocrc_int (wbuf, node->n_children); 
for (int i=0; i<node->n_children; i++) { - assert(BP_SIZE(node,i)>0); - wbuf_nocrc_int(wbuf, BP_START(node, i)); // save the beginning of the partition - wbuf_nocrc_int(wbuf, BP_SIZE (node, i)); // and the size + assert(BP_SIZE(ndd,i)>0); + wbuf_nocrc_int(wbuf, BP_START(ndd, i)); // save the beginning of the partition + wbuf_nocrc_int(wbuf, BP_SIZE (ndd, i)); // and the size } // checksum the header u_int32_t end_to_end_checksum = x1764_memory(wbuf->buf, wbuf_get_woffset(wbuf)); @@ -500,7 +500,7 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) { // Because all messages above have been applied, setting msn of all new basements // to max msn of existing basements is correct. (There cannot be any messages in // buffers above that still need to be applied.) -static void +void rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize) { assert(node->height == 0); @@ -687,9 +687,6 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize) toku_free(num_les_this_bn); } // end of rebalance_brtnode_leaf() -static void -serialize_and_compress(BRTNODE node, int npartitions, struct sub_block sb[]); - static void serialize_and_compress_partition(BRTNODE node, int childnum, SUB_BLOCK sb) { @@ -729,85 +726,29 @@ toku_create_compressed_partition_from_available( } -// tests are showing that serial insertions are slightly faster -// using the pthreads than using CILK. Disabling CILK until we have -// some evidence that it is faster -#ifdef HAVE_CILK - static void serialize_and_compress(BRTNODE node, int npartitions, struct sub_block sb[]) { -#pragma cilk grainsize = 2 - cilk_for (int i = 0; i < npartitions; i++) { + for (int i = 0; i < npartitions; i++) { serialize_and_compress_partition(node, i, &sb[i]); } } -#else - -struct serialize_compress_work { - struct work base; - BRTNODE node; - int i; - struct sub_block *sb; -}; - -static void * -serialize_and_compress_worker(void *arg) { - struct workset *ws = (struct workset *) arg; - while (1) { - struct serialize_compress_work *w = (struct serialize_compress_work *) workset_get(ws); - if (w == NULL) - break; - int i = w->i; - serialize_and_compress_partition(w->node, i, &w->sb[i]); - } - workset_release_ref(ws); - return arg; -} - -static void -serialize_and_compress(BRTNODE node, int npartitions, struct sub_block sb[]) { - if (npartitions == 1) { - serialize_and_compress_partition(node, 0, &sb[0]); - } else { - int T = num_cores; - if (T > npartitions) - T = npartitions; - if (T > 0) - T = T - 1; - struct workset ws; - workset_init(&ws); - struct serialize_compress_work work[npartitions]; - workset_lock(&ws); - for (int i = 0; i < npartitions; i++) { - work[i] = (struct serialize_compress_work) { .node = node, .i = i, .sb = sb }; - workset_put_locked(&ws, &work[i].base); - } - workset_unlock(&ws); - toku_thread_pool_run(brt_pool, 0, &T, serialize_and_compress_worker, &ws); - workset_add_ref(&ws, T); - serialize_and_compress_worker(&ws); - workset_join(&ws); - workset_destroy(&ws); - } -} - -#endif - // Writes out each child to a separate malloc'd buffer, then compresses // all of them, and writes the uncompressed header, to bytes_to_write, // which is malloc'd.
// int toku_serialize_brtnode_to_memory (BRTNODE node, + BRTNODE_DISK_DATA* ndd, unsigned int basementnodesize, + BOOL do_rebalancing, /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write) { toku_assert_entire_node_in_memory(node); - if (node->height == 0) { - rebalance_brtnode_leaf(node, basementnodesize); + if (do_rebalancing && node->height == 0) { + rebalance_brtnode_leaf(node, basementnodesize); } const int npartitions = node->n_children; @@ -815,6 +756,7 @@ toku_serialize_brtnode_to_memory (BRTNODE node, // For internal nodes, a sub block is a message buffer // For leaf nodes, a sub block is a basement node struct sub_block *XMALLOC_N(npartitions, sb); + *ndd = toku_xrealloc(*ndd, npartitions*sizeof(**ndd)); struct sub_block sb_node_info; for (int i = 0; i < npartitions; i++) { sub_block_init(&sb[i]);; @@ -845,8 +787,8 @@ toku_serialize_brtnode_to_memory (BRTNODE node, // store the BP_SIZESs for (int i = 0; i < node->n_children; i++) { u_int32_t len = sb[i].compressed_size + 4; // data and checksum - BP_SIZE (node,i) = len; - BP_START(node,i) = total_node_size; + BP_SIZE (*ndd,i) = len; + BP_START(*ndd,i) = total_node_size; total_node_size += sb[i].compressed_size + 4; } @@ -857,7 +799,7 @@ toku_serialize_brtnode_to_memory (BRTNODE node, // write the header struct wbuf wb; wbuf_init(&wb, curr_ptr, serialize_node_header_size(node)); - serialize_node_header(node, &wb); + serialize_node_header(node, *ndd, &wb); assert(wb.ndone == wb.size); curr_ptr += serialize_node_header_size(node); @@ -895,12 +837,12 @@ toku_serialize_brtnode_to_memory (BRTNODE node, } int -toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_header *h, int UU(n_workitems), int UU(n_threads), BOOL for_checkpoint) { +toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, BRTNODE_DISK_DATA* ndd, BOOL do_rebalancing, struct brt_header *h, int UU(n_workitems), int UU(n_threads), BOOL for_checkpoint) { size_t n_to_write; char *compressed_buf = NULL; { - int r = toku_serialize_brtnode_to_memory(node, h->basementnodesize, + int r = toku_serialize_brtnode_to_memory(node, ndd, h->basementnodesize, do_rebalancing, &n_to_write, &compressed_buf); if (r!=0) return r; } @@ -1046,6 +988,41 @@ BASEMENTNODE toku_create_empty_bn(void) { return bn; } +struct mp_pair { + void* orig_base; + void* new_base; + OMT omt; +}; + +static int fix_mp_offset(OMTVALUE v, u_int32_t i, void* extra) { + struct mp_pair* p = extra; + char* old_value = v; + char *new_value = old_value - (char *)p->orig_base + (char *)p->new_base; + toku_omt_set_at(p->omt, (OMTVALUE) new_value, i); + return 0; +} + +BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn) { + BASEMENTNODE bn = toku_create_empty_bn_no_buffer(); + bn->max_msn_applied = orig_bn->max_msn_applied; + bn->n_bytes_in_buffer = orig_bn->n_bytes_in_buffer; + bn->seqinsert = orig_bn->seqinsert; + bn->stale_ancestor_messages_applied = orig_bn->stale_ancestor_messages_applied; + bn->stat64_delta = orig_bn->stat64_delta; + toku_mempool_clone(&orig_bn->buffer_mempool, &bn->buffer_mempool); + toku_omt_clone_noptr(&bn->buffer, orig_bn->buffer); + struct mp_pair p; + p.orig_base = toku_mempool_get_base(&orig_bn->buffer_mempool); + p.new_base = toku_mempool_get_base(&bn->buffer_mempool); + p.omt = bn->buffer; + toku_omt_iterate( + bn->buffer, + fix_mp_offset, + &p + ); + return bn; +} + BASEMENTNODE toku_create_empty_bn_no_buffer(void) { BASEMENTNODE XMALLOC(bn); bn->max_msn_applied.msn = 0; @@ -1068,6 +1045,17 @@ NONLEAF_CHILDINFO toku_create_empty_nl(void) { return 
cn; } +// does NOT create OMTs, just the FIFO +NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo) { + NONLEAF_CHILDINFO XMALLOC(cn); + cn->n_bytes_in_buffer = orig_childinfo->n_bytes_in_buffer; + cn->fresh_message_tree = NULL; + cn->stale_message_tree = NULL; + cn->broadcast_list = NULL; + toku_fifo_clone(orig_childinfo->buffer, &cn->buffer); + return cn; +} + void destroy_basement_node (BASEMENTNODE bn) { // The buffer may have been freed already, in some cases. @@ -1080,9 +1068,9 @@ void destroy_basement_node (BASEMENTNODE bn) void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl) { toku_fifo_free(&nl->buffer); - toku_omt_destroy(&nl->fresh_message_tree); - toku_omt_destroy(&nl->stale_message_tree); - toku_omt_destroy(&nl->broadcast_list); + if (nl->fresh_message_tree) toku_omt_destroy(&nl->fresh_message_tree); + if (nl->stale_message_tree) toku_omt_destroy(&nl->stale_message_tree); + if (nl->broadcast_list) toku_omt_destroy(&nl->broadcast_list); toku_free(nl); } @@ -1402,6 +1390,7 @@ check_and_copy_compressed_sub_block_worker(struct rbuf curr_rbuf, struct sub_blo } static int deserialize_brtnode_header_from_rbuf_if_small_enough (BRTNODE *brtnode, + BRTNODE_DISK_DATA* ndd, BLOCKNUM blocknum, u_int32_t fullhash, struct brtnode_fetch_extra *bfe, @@ -1455,10 +1444,11 @@ static int deserialize_brtnode_header_from_rbuf_if_small_enough (BRTNODE *brtnod } XMALLOC_N(node->n_children, node->bp); + *ndd = toku_xmalloc(node->n_children*sizeof(**ndd)); // read the partition locations for (int i=0; i<node->n_children; i++) { - BP_START(node,i) = rbuf_int(rb); - BP_SIZE (node,i) = rbuf_int(rb); + BP_START(*ndd,i) = rbuf_int(rb); + BP_SIZE (*ndd,i) = rbuf_int(rb); } u_int32_t checksum = x1764_memory(rb->buf, rb->ndone); @@ -1517,7 +1507,7 @@ static int deserialize_brtnode_header_from_rbuf_if_small_enough (BRTNODE *brtnod if (bfe->type != brtnode_fetch_none) { PAIR_ATTR attr; - toku_brtnode_pf_callback(node, bfe, fd, &attr); + toku_brtnode_pf_callback(node, *ndd, bfe, fd, &attr); } // handle clock for (int i = 0; i < node->n_children; i++) { @@ -1532,6 +1522,7 @@ static int deserialize_brtnode_header_from_rbuf_if_small_enough (BRTNODE *brtnod cleanup: if (r!=0) { if (node) { + toku_free(*ndd); toku_free(node->bp); toku_free(node); } @@ -1542,6 +1533,7 @@ static int deserialize_brtnode_header_from_rbuf_if_small_enough (BRTNODE *brtnod static int deserialize_brtnode_from_rbuf( BRTNODE *brtnode, + BRTNODE_DISK_DATA* ndd, BLOCKNUM blocknum, u_int32_t fullhash, struct brtnode_fetch_extra* bfe, @@ -1577,10 +1569,11 @@ deserialize_brtnode_from_rbuf( node->build_id = rbuf_int(rb); node->n_children = rbuf_int(rb); XMALLOC_N(node->n_children, node->bp); + *ndd = toku_xmalloc(node->n_children*sizeof(**ndd)); // read the partition locations for (int i=0; i<node->n_children; i++) { - BP_START(node,i) = rbuf_int(rb); - BP_SIZE (node,i) = rbuf_int(rb); + BP_START(*ndd,i) = rbuf_int(rb); + BP_SIZE (*ndd,i) = rbuf_int(rb); } // verify checksum of header stored u_int32_t checksum = x1764_memory(rb->buf, rb->ndone); @@ -1609,8 +1602,8 @@ deserialize_brtnode_from_rbuf( // Previously, this code was a for loop with spawns inside and a sync at the end. // But now the loop is parallelizeable since we don't have a dependency on the work done so far.
cilk_for (int i = 0; i < node->n_children; i++) { - u_int32_t curr_offset = BP_START(node,i); - u_int32_t curr_size = BP_SIZE(node,i); + u_int32_t curr_offset = BP_START(*ndd,i); + u_int32_t curr_size = BP_SIZE(*ndd,i); // the compressed, serialized partitions start at where rb is currently pointing, // which would be rb->buf + rb->ndone // we need to intialize curr_rbuf to point to this place @@ -1665,7 +1658,7 @@ cleanup: } void -toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode_fetch_extra* bfe) { +toku_deserialize_bp_from_disk(BRTNODE node, BRTNODE_DISK_DATA ndd, int childnum, int fd, struct brtnode_fetch_extra* bfe) { assert(BP_STATE(node,childnum) == PT_ON_DISK); assert(node->bp[childnum].ptr.tag == BCT_NULL); @@ -1687,8 +1680,8 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode &total_node_disk_size ); - u_int32_t curr_offset = BP_START(node, childnum); - u_int32_t curr_size = BP_SIZE (node, childnum); + u_int32_t curr_offset = BP_START(ndd, childnum); + u_int32_t curr_size = BP_SIZE (ndd, childnum); struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; u_int8_t *XMALLOC_N(curr_size, raw_block); @@ -1738,6 +1731,7 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, + BRTNODE_DISK_DATA* ndd, struct brtnode_fetch_extra* bfe) // Effect: Read a node in. If possible, read just the header. { @@ -1746,7 +1740,7 @@ int toku_deserialize_brtnode_from (int fd, struct rbuf rb = RBUF_INITIALIZER; read_brtnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->h, &rb); - int r = deserialize_brtnode_header_from_rbuf_if_small_enough(brtnode, blocknum, fullhash, bfe, &rb, fd); + int r = deserialize_brtnode_header_from_rbuf_if_small_enough(brtnode, ndd, blocknum, fullhash, bfe, &rb, fd); if (r != 0) { toku_free(rb.buf); rb = RBUF_INITIALIZER; @@ -1756,7 +1750,7 @@ int toku_deserialize_brtnode_from (int fd, r = read_block_from_fd_into_rbuf(fd, blocknum, bfe->h, &rb); if (r != 0) { goto cleanup; } // if we were successful, then we are done. 
- r = deserialize_brtnode_from_rbuf(brtnode, blocknum, fullhash, bfe, &rb); + r = deserialize_brtnode_from_rbuf(brtnode, ndd, blocknum, fullhash, bfe, &rb); if (r!=0) { dump_bad_block(rb.buf,rb.size); } diff --git a/newbrt/brt-test-helpers.c b/newbrt/brt-test-helpers.c index c5ebd1be5fa..5f2485ee0dc 100644 --- a/newbrt/brt-test-helpers.c +++ b/newbrt/brt-test-helpers.c @@ -98,6 +98,7 @@ int toku_testsetup_get_sersize(BRT brt, BLOCKNUM diskoff) // Return the size on toku_brtnode_fetch_callback, toku_brtnode_pf_req_callback, toku_brtnode_pf_callback, + TRUE, &bfe ); assert(r==0); @@ -124,6 +125,7 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke toku_brtnode_fetch_callback, toku_brtnode_pf_req_callback, toku_brtnode_pf_callback, + TRUE, &bfe ); if (r!=0) return r; @@ -172,6 +174,7 @@ toku_pin_node_with_min_bfe(BRTNODE* node, BLOCKNUM b, BRT t) b, toku_cachetable_hash(t->h->cf, b), &bfe, + TRUE, 0, NULL, node @@ -196,6 +199,7 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t toku_brtnode_fetch_callback, toku_brtnode_pf_req_callback, toku_brtnode_pf_callback, + TRUE, &bfe ); if (r!=0) return r; diff --git a/newbrt/brt-verify.c b/newbrt/brt-verify.c index f3617f45109..19a514fe9ae 100644 --- a/newbrt/brt-verify.c +++ b/newbrt/brt-verify.c @@ -215,6 +215,7 @@ toku_get_node_for_verify( blocknum, fullhash, &bfe, + TRUE, // may_modify_node, safe to set to TRUE 0, NULL, nodep diff --git a/newbrt/brt.c b/newbrt/brt.c index e0bdb763035..fc1c328d1a0 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -589,11 +589,11 @@ toku_get_and_clear_basement_stats(BRTNODE leafnode) { invariant(leafnode->height == 0); STAT64INFO_S deltas = ZEROSTATS; for (int i = 0; i < leafnode->n_children; i++) { - BASEMENTNODE bn = BLB(leafnode, i); - invariant(BP_STATE(leafnode,i) == PT_AVAIL); - deltas.numrows += bn->stat64_delta.numrows; - deltas.numbytes += bn->stat64_delta.numbytes; - bn->stat64_delta = ZEROSTATS; + BASEMENTNODE bn = BLB(leafnode, i); + invariant(BP_STATE(leafnode,i) == PT_AVAIL); + deltas.numrows += bn->stat64_delta.numrows; + deltas.numbytes += bn->stat64_delta.numbytes; + bn->stat64_delta = ZEROSTATS; } return deltas; } @@ -624,59 +624,162 @@ toku_mark_node_dirty(BRTNODE node) { node->dirty = 1; } +static void brt_status_update_flush_reason(BRTNODE node, BOOL for_checkpoint) { + if (node->height == 0) { + if (for_checkpoint) { + __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_LEAF_FOR_CHECKPOINT), 1); + } + else { + __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_LEAF), 1); + } + } + else { + if (for_checkpoint) { + __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_NONLEAF_FOR_CHECKPOINT), 1); + } + else { + __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_NONLEAF), 1); + } + } +} + +static void brtnode_update_disk_stats( + BRTNODE brtnode, + struct brt_header* h, + BOOL for_checkpoint + ) +{ + STAT64INFO_S deltas = ZEROSTATS; + // capture deltas before rebalancing basements for serialization + deltas = toku_get_and_clear_basement_stats(brtnode); + update_header_stats(&(h->on_disk_stats), &deltas); + if (for_checkpoint) { + update_header_stats(&(h->checkpoint_staging_stats), &deltas); + } +} + +static void brtnode_clone_partitions(BRTNODE node, BRTNODE cloned_node) { + for (int i = 0; i < node->n_children; i++) { + BP_BLOCKNUM(cloned_node,i) = BP_BLOCKNUM(node,i); + assert(BP_STATE(node,i) == PT_AVAIL); + BP_STATE(cloned_node,i) = PT_AVAIL; + BP_WORKDONE(cloned_node, i) = BP_WORKDONE(node, i); + if (node->height == 0) { + 
set_BLB(cloned_node, i,toku_clone_bn(BLB(node,i))); + } + else { + set_BNC(cloned_node, i, toku_clone_nl(BNC(node,i))); + } + } +} + +void toku_brtnode_clone_callback( + void* value_data, + void** cloned_value_data, + PAIR_ATTR* new_attr, + BOOL for_checkpoint, + void* write_extraargs + ) +{ + BRTNODE node = value_data; + toku_assert_entire_node_in_memory(node); + struct brt_header *h = write_extraargs; + BRTNODE XMALLOC(cloned_node); + //BRTNODE cloned_node = (BRTNODE)toku_xmalloc(sizeof(*BRTNODE)); + memset(cloned_node, 0, sizeof(*cloned_node)); + if (node->height == 0) { + rebalance_brtnode_leaf(node, h->basementnodesize); + } + + cloned_node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_on_disk; + cloned_node->h = node->h; + cloned_node->nodesize = node->nodesize; + cloned_node->flags = node->flags; + cloned_node->thisnodename = node->thisnodename; + cloned_node->layout_version = node->layout_version; + cloned_node->layout_version_original = node->layout_version_original; + cloned_node->layout_version_read_from_disk = node->layout_version_read_from_disk; + cloned_node->build_id = node->build_id; + cloned_node->height = node->height; + cloned_node->dirty = node->dirty; + cloned_node->fullhash = node->fullhash; + cloned_node->optimized_for_upgrade = node->optimized_for_upgrade; + cloned_node->n_children = node->n_children; + cloned_node->totalchildkeylens = node->totalchildkeylens; + + XMALLOC_N(node->n_children-1, cloned_node->childkeys); + XMALLOC_N(node->n_children, cloned_node->bp); + // clone pivots + for (int i = 0; i < node->n_children-1; i++) { + cloned_node->childkeys[i] = kv_pair_malloc( + kv_pair_key(node->childkeys[i]), + kv_pair_keylen(node->childkeys[i]), + 0, + 0 + ); + } + // clone partition + brtnode_clone_partitions(node, cloned_node); + + // set header stats + if (node->height == 0) { + brtnode_update_disk_stats(node, h, for_checkpoint); + } + // clear dirty bit + node->dirty = 0; + cloned_node->dirty = 0; + // set new pair attr if necessary + if (node->height == 0) { + *new_attr = make_brtnode_pair_attr(node); + } + else { + new_attr->is_valid = FALSE; + } + *cloned_value_data = cloned_node; +} + + //fd is protected (must be holding fdlock) -void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void *extraargs, PAIR_ATTR size __attribute__((unused)), PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint) { +void toku_brtnode_flush_callback ( + CACHEFILE cachefile, + int fd, + BLOCKNUM nodename, + void *brtnode_v, + void** disk_data, + void *extraargs, + PAIR_ATTR size __attribute__((unused)), + PAIR_ATTR* new_size, + BOOL write_me, + BOOL keep_me, + BOOL for_checkpoint, + BOOL is_clone + ) +{ struct brt_header *h = extraargs; BRTNODE brtnode = brtnode_v; + BRTNODE_DISK_DATA* ndd = (BRTNODE_DISK_DATA*)disk_data; assert(brtnode->thisnodename.b==nodename.b); int height = brtnode->height; - STAT64INFO_S deltas = ZEROSTATS; - //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, brtnode, brtnode->mdicts[0]); if (write_me) { - if (height == 0) - // capture deltas before rebalancing basements for serialization - deltas = toku_get_and_clear_basement_stats(brtnode); + if (height == 0 && !is_clone) { + brtnode_update_disk_stats(brtnode, h, for_checkpoint); + } if (!h->panic) { // if the brt panicked, stop writing, otherwise try to write it. 
toku_assert_entire_node_in_memory(brtnode); int n_workitems, n_threads; toku_cachefile_get_workqueue_load(cachefile, &n_workitems, &n_threads); - int r = toku_serialize_brtnode_to(fd, brtnode->thisnodename, brtnode, h, n_workitems, n_threads, for_checkpoint); - if (r) { - if (h->panic==0) { - char *e = strerror(r); - int l = 200 + strlen(e); - char s[l]; - h->panic=r; - snprintf(s, l-1, "While writing data to disk, error %d (%s)", r, e); - h->panic_string = toku_strdup(s); - } - } - } - if (height == 0) { - struct brt_header * header_in_node = brtnode->h; - invariant(header_in_node == h); - update_header_stats(&(h->on_disk_stats), &deltas); - if (for_checkpoint) { - update_header_stats(&(h->checkpoint_staging_stats), &deltas); - } - if (for_checkpoint) - __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_LEAF_FOR_CHECKPOINT), 1); - else - __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_LEAF), 1); - } - else { - if (for_checkpoint) - __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_NONLEAF_FOR_CHECKPOINT), 1); - else - __sync_fetch_and_add(&STATUS_VALUE(BRT_DISK_FLUSH_NONLEAF), 1); + int r = toku_serialize_brtnode_to(fd, brtnode->thisnodename, brtnode, ndd, !is_clone, h, n_workitems, n_threads, for_checkpoint); + assert_zero(r); } + brt_status_update_flush_reason(brtnode, for_checkpoint); } - //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, brtnode, brtnode->mdicts[0]); - *new_size = make_brtnode_pair_attr(brtnode); if (!keep_me) { + if (!is_clone) toku_free(*disk_data); toku_brtnode_free(&brtnode); } - //printf("%s:%d n_items_malloced=%lld\n", __FILE__, __LINE__, n_items_malloced); + else { + *new_size = make_brtnode_pair_attr(brtnode); + } } void @@ -693,15 +796,16 @@ toku_brt_status_update_pivot_fetch_reason(struct brtnode_fetch_extra *bfe) //fd is protected (must be holding fdlock) int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash, - void **brtnode_pv, PAIR_ATTR *sizep, int *dirtyp, void *extraargs) { + void **brtnode_pv, void** disk_data, PAIR_ATTR *sizep, int *dirtyp, void *extraargs) { assert(extraargs); assert(*brtnode_pv == NULL); + BRTNODE_DISK_DATA* ndd = (BRTNODE_DISK_DATA*)disk_data; struct brtnode_fetch_extra *bfe = (struct brtnode_fetch_extra *)extraargs; BRTNODE *node=(BRTNODE*)brtnode_pv; // deserialize the node, must pass the bfe in because we cannot // evaluate what piece of the the node is necessary until we get it at // least partially into memory - int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, node, bfe); + int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, node, ndd, bfe); if (r == 0) { (*node)->h = bfe->h; // copy reference to header from bfe *sizep = make_brtnode_pair_attr(*node); @@ -712,6 +816,7 @@ int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM noden void toku_brtnode_pe_est_callback( void* brtnode_pv, + void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* UU(write_extraargs) @@ -742,7 +847,8 @@ void toku_brtnode_pe_est_callback( // first get an estimate for how much space will be taken // after compression, it is simply the size of compressed // data on disk plus the size of the struct that holds it - u_int32_t compressed_data_size = BP_SIZE(node, i); + BRTNODE_DISK_DATA ndd = disk_data; + u_int32_t compressed_data_size = BP_SIZE(ndd, i); compressed_data_size += sizeof(struct sub_block); // now get the space taken now @@ -942,6 +1048,81 @@ BOOL toku_brtnode_pf_req_callback(void* brtnode_pv, void* read_extraargs) { 
return retval; } +u_int64_t num_basements_decompressed; +u_int64_t num_buffers_decompressed; +u_int64_t num_basements_fetched; +u_int64_t num_buffers_fetched; +u_int64_t num_pivots_fetched; + +void brt_begin_checkpoint(void) { + /* + u_int64_t old_num_basements_decompressed = num_basements_decompressed; + u_int64_t old_num_buffers_decompressed = num_buffers_decompressed; + u_int64_t old_num_basements_fetched = num_basements_fetched; + u_int64_t old_num_buffers_fetched = num_buffers_fetched; + u_int64_t old_num_pivots_fetched = num_pivots_fetched; + */ + num_basements_decompressed = + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_NORMAL) + + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH) + + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_WRITE); + + num_buffers_decompressed = + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_NORMAL) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE); + + num_basements_fetched = + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_NORMAL) + + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_PREFETCH) + + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_WRITE); + + num_buffers_fetched = + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_NORMAL) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_PREFETCH) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_WRITE); + + num_pivots_fetched = + STATUS_VALUE(BRT_NUM_PIVOTS_FETCHED_QUERY) + + STATUS_VALUE(BRT_NUM_PIVOTS_FETCHED_PREFETCH) + + STATUS_VALUE(BRT_NUM_PIVOTS_FETCHED_WRITE); +} + +void brt_end_checkpoint(void) { + num_basements_decompressed = + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_NORMAL) + + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH) + + STATUS_VALUE(BRT_NUM_BASEMENTS_DECOMPRESSED_WRITE); + + num_buffers_decompressed = + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_NORMAL) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE); + + num_basements_fetched = + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_NORMAL) + + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_PREFETCH) + + STATUS_VALUE(BRT_NUM_BASEMENTS_FETCHED_WRITE); + + num_buffers_fetched = + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_NORMAL) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_PREFETCH) + + STATUS_VALUE(BRT_NUM_MSG_BUFFER_FETCHED_WRITE); + + num_pivots_fetched = + STATUS_VALUE(BRT_NUM_PIVOTS_FETCHED_QUERY) + + STATUS_VALUE(BRT_NUM_PIVOTS_FETCHED_PREFETCH) + + STATUS_VALUE(BRT_NUM_PIVOTS_FETCHED_WRITE); +} + static void brt_status_update_partial_fetch_reason( struct brtnode_fetch_extra* UU(bfe), @@ -950,7 +1131,6 @@ brt_status_update_partial_fetch_reason( BOOL UU(is_leaf) ) { -#if 0 invariant(state == PT_COMPRESSED || state == PT_ON_DISK); if (is_leaf) { if (bfe->type == brtnode_fetch_prefetch) { @@ -1006,13 +1186,13 @@ brt_status_update_partial_fetch_reason( } } } -#endif } // callback for partially reading a node // could have just used toku_brtnode_fetch_callback, but wanted to separate the two cases to separate functions -int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, PAIR_ATTR* sizep) { 
+int toku_brtnode_pf_callback(void* brtnode_pv, void* disk_data, void* read_extraargs, int fd, PAIR_ATTR* sizep) { BRTNODE node = brtnode_pv; + BRTNODE_DISK_DATA ndd = disk_data; struct brtnode_fetch_extra *bfe = read_extraargs; // there must be a reason this is being called. If we get a garbage type or the type is brtnode_fetch_none, // then something went wrong @@ -1041,7 +1221,7 @@ int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, PAI cilk_spawn toku_deserialize_bp_from_compressed(node, i, &bfe->h->descriptor, bfe->h->compare_fun); } else if (BP_STATE(node,i) == PT_ON_DISK) { - cilk_spawn toku_deserialize_bp_from_disk(node, i, fd, bfe); + cilk_spawn toku_deserialize_bp_from_disk(node, ndd, i, fd, bfe); } else { assert(FALSE); @@ -1271,8 +1451,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num for (int i = 0; i < num_children; i++) { BP_BLOCKNUM(n,i).b=0; BP_STATE(n,i) = PT_INVALID; - BP_START(n,i) = 0; - BP_SIZE (n,i) = 0; BP_WORKDONE(n,i) = 0; BP_INIT_TOUCHED_CLOCK(n, i); set_BNULL(n,i); @@ -1329,8 +1507,6 @@ static void init_childinfo(BRTNODE node, int childnum, BRTNODE child) { BP_BLOCKNUM(node,childnum) = child->thisnodename; BP_STATE(node,childnum) = PT_AVAIL; - BP_START(node,childnum) = 0; - BP_SIZE (node,childnum) = 0; BP_WORKDONE(node, childnum) = 0; set_BNC(node, childnum, toku_create_empty_nl()); } @@ -2303,11 +2479,15 @@ void bring_node_fully_into_memory(BRTNODE node, struct brt_header* h) { if (!is_entire_node_in_memory(node)) { struct brtnode_fetch_extra bfe; - PAIR_ATTR attr; - int fd = toku_cachefile_get_and_pin_fd(h->cf); fill_bfe_for_full_read(&bfe, h); - toku_brtnode_pf_callback(node, &bfe, fd, &attr); - toku_cachefile_unpin_fd(h->cf); + toku_cachetable_pf_pinned_pair( + node, + toku_brtnode_pf_callback, + &bfe, + h->cf, + node->thisnodename, + toku_cachetable_hash(h->cf, node->thisnodename) + ); } } @@ -2542,7 +2722,17 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd) // get the root node struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, brt->h); - toku_pin_brtnode_holding_lock(brt, *rootp, fullhash,(ANCESTORS)NULL, &infinite_bounds, &bfe, TRUE, &node); + toku_pin_brtnode_holding_lock( + brt, + *rootp, + fullhash, + (ANCESTORS)NULL, + &infinite_bounds, + &bfe, + TRUE, + TRUE, // may_modify_node + &node + ); toku_assert_entire_node_in_memory(node); cmd->msn.msn = node->max_msn_applied_to_node_on_disk.msn + 1; @@ -5136,18 +5326,18 @@ brt_search_node ( #if TOKU_DO_PREFETCH static int -brtnode_fetch_callback_and_free_bfe(CACHEFILE cf, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, PAIR_ATTR *sizep, int *dirtyp, void *extraargs) +brtnode_fetch_callback_and_free_bfe(CACHEFILE cf, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, void** UU(disk_data), PAIR_ATTR *sizep, int *dirtyp, void *extraargs) { - int r = toku_brtnode_fetch_callback(cf, fd, nodename, fullhash, brtnode_pv, sizep, dirtyp, extraargs); + int r = toku_brtnode_fetch_callback(cf, fd, nodename, fullhash, brtnode_pv, disk_data, sizep, dirtyp, extraargs); destroy_bfe_for_prefetch(extraargs); toku_free(extraargs); return r; } static int -brtnode_pf_callback_and_free_bfe(void *brtnode_pv, void *read_extraargs, int fd, PAIR_ATTR *sizep) +brtnode_pf_callback_and_free_bfe(void *brtnode_pv, void* disk_data, void *read_extraargs, int fd, PAIR_ATTR *sizep) { - int r = toku_brtnode_pf_callback(brtnode_pv, read_extraargs, fd, sizep); + int r = toku_brtnode_pf_callback(brtnode_pv, disk_data, read_extraargs, fd, 
sizep); destroy_bfe_for_prefetch(read_extraargs); toku_free(read_extraargs); return r; @@ -5239,6 +5429,7 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ unlockers, &next_ancestors, bounds, &bfe, + (node->height == 1), // may_modify_node TRUE iff child is leaf TRUE, &childnode, &msgs_applied); @@ -5569,6 +5760,7 @@ try_again: *rootp, fullhash, &bfe, + FALSE, // may_modify_node set to FALSE, because root cannot change during search 0, NULL, &node @@ -6084,7 +6276,19 @@ toku_brt_keyrange_internal (BRT brt, BRTNODE node, u_int32_t fullhash = compute_child_fullhash(brt->cf, node, child_number); BRTNODE childnode; BOOL msgs_applied = FALSE; - r = toku_pin_brtnode(brt, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, FALSE, &childnode, &msgs_applied); + r = toku_pin_brtnode( + brt, + childblocknum, + fullhash, + unlockers, + &next_ancestors, + bounds, + bfe, + FALSE, // may_modify_node is FALSE, because node guaranteed to not change + FALSE, + &childnode, + &msgs_applied + ); assert(!msgs_applied); if (r != TOKUDB_TRY_AGAIN) { assert(r == 0); @@ -6136,6 +6340,7 @@ toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less_p, u_int64_t *equal_p, u_i *rootp, fullhash, &bfe, + FALSE, // may_modify_node, cannot change root during keyrange 0, NULL, &node @@ -6221,6 +6426,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_ toku_brtnode_fetch_callback, toku_brtnode_pf_req_callback, toku_brtnode_pf_callback, + TRUE, // may_modify_value, just safe to set to TRUE, I think it could theoretically be FALSE &bfe ); assert_zero(r); @@ -6533,6 +6739,7 @@ static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) { childblocknum, fullhash, &bfe, + FALSE, // may_modify_node set to FALSE, as nodes not modified 0, NULL, &childnode @@ -6572,6 +6779,7 @@ BOOL toku_brt_is_empty_fast (BRT brt) *rootp, fullhash, &bfe, + FALSE, // may_modify_node set to FALSE, node does not change 0, NULL, &node diff --git a/newbrt/brtdump.c b/newbrt/brtdump.c index c363fdb5844..5a5e525afb4 100644 --- a/newbrt/brtdump.c +++ b/newbrt/brtdump.c @@ -123,8 +123,9 @@ static void dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { BRTNODE n; struct brtnode_fetch_extra bfe; + BRTNODE_DISK_DATA ndd = NULL; fill_bfe_for_full_read(&bfe, h); - int r = toku_deserialize_brtnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe); + int r = toku_deserialize_brtnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); assert(r==0); assert(n!=0); printf("brtnode\n"); @@ -207,6 +208,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { } } toku_brtnode_free(&n); + toku_free(ndd); } static void @@ -226,9 +228,10 @@ static int fragmentation_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) { frag_help_extra *info = extra; BRTNODE n; + BRTNODE_DISK_DATA ndd = NULL; struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, info->h); - int r = toku_deserialize_brtnode_from(info->f, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &bfe); + int r = toku_deserialize_brtnode_from(info->f, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); if (r==0) { info->blocksizes += size; if (n->height == 0) { @@ -236,6 +239,7 @@ fragmentation_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) info->leafblocks++; } toku_brtnode_free(&n); + toku_free(ndd); } return 0; } @@ -282,9 +286,10 @@ static int garbage_helper(BLOCKNUM b, int64_t UU(size), int64_t UU(address), void *extra) 
{ garbage_help_extra *info = extra; BRTNODE n; + BRTNODE_DISK_DATA ndd = NULL; struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, info->h); - int r = toku_deserialize_brtnode_from(info->f, b, 0, &n, &bfe); + int r = toku_deserialize_brtnode_from(info->f, b, 0, &n, &ndd, &bfe); if (r != 0) { goto no_node; } @@ -300,6 +305,7 @@ garbage_helper(BLOCKNUM b, int64_t UU(size), int64_t UU(address), void *extra) { } exit: toku_brtnode_free(&n); + toku_free(ndd); no_node: return r; } diff --git a/newbrt/brtloader.c b/newbrt/brtloader.c index 0081708e72b..18ed861f523 100644 --- a/newbrt/brtloader.c +++ b/newbrt/brtloader.c @@ -2806,7 +2806,8 @@ static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progr // serialize leaf to buffer size_t serialized_leaf_size = 0; char *serialized_leaf = NULL; - result = toku_serialize_brtnode_to_memory(lbuf->node, target_basementnodesize, &serialized_leaf_size, &serialized_leaf); + BRTNODE_DISK_DATA ndd = NULL; + result = toku_serialize_brtnode_to_memory(lbuf->node, &ndd, target_basementnodesize, TRUE, &serialized_leaf_size, &serialized_leaf); // write it out if (result == 0) { @@ -2822,8 +2823,10 @@ static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progr } // free the node - if (serialized_leaf) + if (serialized_leaf) { + toku_free(ndd); toku_free(serialized_leaf); + } toku_brtnode_free(&lbuf->node); xids_destroy(&lbuf->xids); toku_free(lbuf); @@ -3015,11 +3018,12 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu BP_STATE(node,i) = PT_AVAIL; } + BRTNODE_DISK_DATA ndd = NULL; if (result == 0) { size_t n_bytes; char *bytes; int r; - r = toku_serialize_brtnode_to_memory(node, target_basementnodesize, &n_bytes, &bytes); + r = toku_serialize_brtnode_to_memory(node, &ndd, target_basementnodesize, TRUE, &n_bytes, &bytes); if (r) { result = r; } else { @@ -3049,6 +3053,7 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu toku_free(node->bp); toku_free(node->childkeys); toku_free(node); + toku_free(ndd); toku_free(subtree_info); blocknum_of_new_node = blocknum_of_new_node; diff --git a/newbrt/brttypes.h b/newbrt/brttypes.h index a35da0f6ea6..5d1abbb7af6 100644 --- a/newbrt/brttypes.h +++ b/newbrt/brttypes.h @@ -31,6 +31,7 @@ typedef bool BOOL; typedef struct brt *BRT; typedef struct brtnode *BRTNODE; +typedef struct brtnode_disk_data *BRTNODE_DISK_DATA; typedef struct brtnode_leaf_basement_node *BASEMENTNODE; typedef struct brtnode_nonleaf_childinfo *NONLEAF_CHILDINFO; typedef struct sub_block *SUB_BLOCK; diff --git a/newbrt/cachetable.c b/newbrt/cachetable.c index be3eb76d53f..c976496bfe6 100644 --- a/newbrt/cachetable.c +++ b/newbrt/cachetable.c @@ -25,11 +25,6 @@ #include "brt-internal.h" - -static void cachetable_writer(WORKITEM); -static void cachetable_reader(WORKITEM); -static void cachetable_partial_reader(WORKITEM); - #define TRACE_CACHETABLE 0 #if TRACE_CACHETABLE #define WHEN_TRACE_CT(x) x @@ -124,7 +119,10 @@ typedef struct ctpair *PAIR; struct ctpair { CACHEFILE cachefile; CACHEKEY key; - void *value; + void* value_data; + void* cloned_value_data; + long cloned_value_size; + void* disk_data; PAIR_ATTR attr; // @@ -146,6 +144,7 @@ struct ctpair { CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK pe_est_callback; CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback; CACHETABLE_CLEANER_CALLBACK cleaner_callback; + CACHETABLE_CLONE_CALLBACK clone_callback; long size_evicting_estimate; void *write_extraargs; @@ -158,9 +157,11 @@ struct ctpair { PAIR 
pending_next; PAIR pending_prev; - struct nb_mutex nb_mutex; // single writer + struct nb_mutex value_nb_mutex; // single writer + struct nb_mutex disk_nb_mutex; // single writer struct workqueue *cq; // writers sometimes return ctpair's using this queue struct workitem asyncwork; // work item for the worker threads + struct workitem checkpoint_asyncwork; // work item for the worker threads u_int32_t refs; // References that prevent destruction int already_removed; // If a pair is removed from the cachetable, but cannot be freed because refs>0, this is set. struct toku_list next_for_cachefile; // link in the cachefile list @@ -188,7 +189,8 @@ static inline void ctpair_destroy(PAIR p) { assert(p->refs>0); p->refs--; if (p->refs==0) { - nb_mutex_destroy(&p->nb_mutex); + nb_mutex_destroy(&p->value_nb_mutex); + nb_mutex_destroy(&p->disk_nb_mutex); toku_free(p); } } @@ -216,6 +218,8 @@ struct cachetable { toku_pthread_mutex_t cachefiles_mutex; // lock that protects the cachefiles list struct workqueue wq; // async work queue THREADPOOL threadpool; // pool of worker threads + struct workqueue checkpoint_wq; + THREADPOOL checkpoint_threadpool; KIBBUTZ kibbutz; // another pool of worker threads and jobs to do asynchronously. @@ -250,6 +254,18 @@ struct cachetable { int64_t size_leaf; int64_t size_rollback; int64_t size_cachepressure; + + // variables used by the checkpoint thread to know + // when all work induced by cloning on client threads is done + // when a client thread clones a PAIR and places it on + // a background thread to be written out, n_checkpoint_clones_running + // is incremented. On the background thread, when the checkpointing + // is completed, n_checkpoint_clones_running is decremented. + // When the checkpoint thread uses clones_background_wait for + // n_checkpoint_clones_running to go to zero, it knows that + // the checkpoint is complete + u_int32_t n_checkpoint_clones_running; + toku_pthread_cond_t clones_background_wait; }; @@ -512,7 +528,8 @@ int toku_create_cachetable(CACHETABLE *result, long size_limit, LSN UU(initial_l ct->size_limit = size_limit; ct->size_reserved = unreservable_memory(size_limit); ct->logger = logger; - toku_init_workers(&ct->wq, &ct->threadpool); + toku_init_workers(&ct->wq, &ct->threadpool, 1); + toku_init_workers(&ct->checkpoint_wq, &ct->checkpoint_threadpool, 8); ct->mutex = workqueue_lock_ref(&ct->wq); int r = toku_pthread_mutex_init(&ct->openfd_mutex, NULL); resource_assert_zero(r); r = toku_pthread_mutex_init(&ct->cachefiles_mutex, 0); resource_assert_zero(r); @@ -524,6 +541,7 @@ int toku_create_cachetable(CACHETABLE *result, long size_limit, LSN UU(initial_l ct->cleaner_iterations = 1; // default is one iteration r = toku_omt_create(&ct->reserved_filenums); assert(r==0); ct->env_dir = toku_xstrdup("."); + r = toku_pthread_cond_init(&ct->clones_background_wait, NULL); resource_assert_zero(r); *result = ct; return 0; } @@ -1309,7 +1327,6 @@ cachetable_change_pair_attr(CACHETABLE ct, PAIR_ATTR old_attr, PAIR_ATTR new_att // Effects: the pair is removed from the LRU list and from the cachetable's hash table. // The size of the objects in the cachetable is adjusted by the size of the pair being // removed. - static void cachetable_remove_pair (CACHETABLE ct, PAIR p) { pair_remove(ct, p); pending_pairs_remove(ct, p); @@ -1319,20 +1336,20 @@ static void cachetable_remove_pair (CACHETABLE ct, PAIR p) { ct->n_in_table--; // Remove it from the hash chain. 
{
-	unsigned int h = p->fullhash&(ct->table_size-1);
-	ct->table[h] = remove_from_hash_chain (p, ct->table[h]);
+        unsigned int h = p->fullhash&(ct->table_size-1);
+        ct->table[h] = remove_from_hash_chain (p, ct->table[h]);
    }
    cachetable_remove_pair_attr(ct, p->attr);
    p->already_removed = TRUE;
}
-
static void cachetable_free_pair(CACHETABLE ct, PAIR p) {
    // helgrind
    CACHETABLE_FLUSH_CALLBACK flush_callback = p->flush_callback;
    CACHEFILE cachefile = p->cachefile;
    CACHEKEY key = p->key;
-   void *value = p->value;
+   void *value = p->value_data;
+   void* disk_data = p->disk_data;
    void *write_extraargs = p->write_extraargs;
    PAIR_ATTR old_attr = p->attr;
@@ -1342,7 +1359,7 @@ static void cachetable_free_pair(CACHETABLE ct, PAIR p) {
    PAIR_ATTR new_attr = p->attr;
    // Note that flush_callback is called with write_me FALSE, so the only purpose of this
    // call is to tell the brt layer to evict the node (keep_me is FALSE).
-   flush_callback(cachefile, cachefile->fd, key, value, write_extraargs, old_attr, &new_attr, FALSE, FALSE, TRUE);
+   flush_callback(cachefile, cachefile->fd, key, value, &disk_data, write_extraargs, old_attr, &new_attr, FALSE, FALSE, TRUE, FALSE);

    cachetable_lock(ct);
    rwlock_read_unlock(&cachefile->fdlock);
@@ -1358,76 +1375,71 @@ static void cachetable_free_pair(CACHETABLE ct, PAIR p) {
// anything except destroy the node.
static void cachetable_maybe_remove_and_free_pair (CACHETABLE ct, PAIR p, BOOL* destroyed) {
    *destroyed = FALSE;
-   if (nb_mutex_users(&p->nb_mutex) == 0) {
+   if (nb_mutex_users(&p->value_nb_mutex) == 0) {
+       // The assumption is that if we are about to remove the pair,
+       // no one has grabbed the disk_nb_mutex and there is no
+       // cloned_value_data, because no one is writing a cloned value out.
+       assert(nb_mutex_users(&p->disk_nb_mutex) == 0);
+       assert(p->cloned_value_data == NULL);
        cachetable_remove_pair(ct, p);
        cachetable_free_pair(ct, p);
        *destroyed = TRUE;
    }
}

-// Read a pair from a cachefile into memory using the pair's fetch callback
-static void cachetable_fetch_pair(
+// Assumes value_nb_mutex and disk_nb_mutex are held on entry.
+// The responsibility of this function is only to write a locked PAIR to disk
+// and NOTHING else. We do not manipulate the state of the PAIR or of the
+// cachetable here (with the exception of ct->size_current for clones).
+static void cachetable_only_write_locked_data(
    CACHETABLE ct,
-   CACHEFILE cf,
    PAIR p,
-   CACHETABLE_FETCH_CALLBACK fetch_callback,
-   void* read_extraargs,
-   BOOL keep_pair_locked
+   BOOL for_checkpoint,
+   PAIR_ATTR* new_attr,
+   BOOL is_clone
    )
-{
-   // helgrind
+{
+   CACHETABLE_FLUSH_CALLBACK flush_callback = p->flush_callback;
+   CACHEFILE cachefile = p->cachefile;
    CACHEKEY key = p->key;
-   u_int32_t fullhash = p->fullhash;
-
-   void *toku_value = 0;
-   PAIR_ATTR attr;
+   void *value = is_clone ? p->cloned_value_data : p->value_data;
+   void *disk_data = p->disk_data;
+   void *write_extraargs = p->write_extraargs;
+   PAIR_ATTR old_attr = p->attr;
+   BOOL dowrite = TRUE;

-   // FIXME this should be enum cachetable_dirty, right?
- int dirty = 0; - - WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key)); - - rwlock_prefer_read_lock(&cf->fdlock, ct->mutex); + rwlock_prefer_read_lock(&cachefile->fdlock, ct->mutex); cachetable_unlock(ct); - - int r; - assert(!toku_cachefile_is_dev_null_unlocked(cf)); - r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &attr, &dirty, read_extraargs); - if (dirty) - p->dirty = CACHETABLE_DIRTY; - + + // write callback + if (toku_cachefile_is_dev_null_unlocked(cachefile)) { + dowrite = FALSE; + } + flush_callback( + cachefile, + cachefile->fd, + key, + value, + &disk_data, + write_extraargs, + old_attr, + new_attr, + dowrite, + is_clone ? FALSE : TRUE, // keep_me (only keep if this is not cloned pointer) + for_checkpoint, + is_clone //is_clone + ); + p->disk_data = disk_data; cachetable_lock(ct); - rwlock_read_unlock(&cf->fdlock); - // brt.c asserts that get_and_pin succeeds, - // so we might as well just assert it here as opposed - // to trying to support an INVALID state - assert(r == 0); - - p->value = toku_value; - p->attr = attr; - cachetable_add_pair_attr(ct, attr); - p->state = CTPAIR_IDLE; - if (keep_pair_locked) { - // if the caller wants the pair to remain locked - // that means the caller requests continued - // ownership of the PAIR, so there better not - // be a cq asking to transfer ownership - assert(!p->cq); - } - else { - if (p->cq) { - workitem_init(&p->asyncwork, NULL, p); - workqueue_enq(p->cq, &p->asyncwork, 1); - } - else { - nb_mutex_write_unlock(&p->nb_mutex); - } - } - if (0) printf("%s:%d %"PRId64" complete\n", __FUNCTION__, __LINE__, key.b); + if (is_clone) { + p->cloned_value_data = NULL; + ct->size_current -= p->cloned_value_size; + p->cloned_value_size = 0; + } + rwlock_read_unlock(&cachefile->fdlock); } -static void cachetable_complete_write_pair (CACHETABLE ct, PAIR p, BOOL do_remove, BOOL* destroyed); - // // This function writes a PAIR's value out to disk. Currently, it is called @@ -1436,38 +1448,32 @@ static void cachetable_complete_write_pair (CACHETABLE ct, PAIR p, BOOL do_remov // that needs to write out a dirty node for checkpoint. 
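/*
 * A minimal skeleton of a CACHETABLE_FLUSH_CALLBACK under the widened
 * signature, matching how cachetable_only_write_locked_data above invokes
 * it (keep_me is passed as !is_clone, so a cloned value is always freed
 * once it is written). The body is an assumed illustration, not the brt
 * layer's actual callback:
 *
 *   static void example_flush_cb(CACHEFILE cf, int fd, CACHEKEY key,
 *                                void *value, void **disk_data,
 *                                void *write_extraargs, PAIR_ATTR size,
 *                                PAIR_ATTR *new_size, BOOL write_me,
 *                                BOOL keep_me, BOOL for_checkpoint,
 *                                BOOL is_clone) {
 *       if (write_me) {
 *           // serialize value to fd and record the node's new on-disk
 *           // layout in *disk_data so later partial fetches can find it
 *       }
 *       if (!keep_me) {
 *           // free value; when is_clone is TRUE this frees the clone
 *           // while the original value_data stays cached
 *       }
 *   }
 */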
//
static void cachetable_write_locked_pair(CACHETABLE ct, PAIR p) {
-   // see comments in toku_cachetable_begin_checkpoint to understand
-   // purpose of the pending_lock
-   rwlock_read_lock(&ct->pending_lock, ct->mutex);
-
-   // helgrind
-   CACHETABLE_FLUSH_CALLBACK flush_callback = p->flush_callback;
-   CACHEFILE cachefile = p->cachefile;
-   CACHEKEY key = p->key;
-   void *value = p->value;
-   void *write_extraargs = p->write_extraargs;
    PAIR_ATTR old_attr = p->attr;
    PAIR_ATTR new_attr = p->attr;
-   BOOL dowrite = (BOOL)(p->dirty);
+   rwlock_read_lock(&ct->pending_lock, ct->mutex);
    BOOL for_checkpoint = p->checkpoint_pending;
-
-   //Must set to FALSE before releasing cachetable lock
    p->checkpoint_pending = FALSE;
-   rwlock_prefer_read_lock(&cachefile->fdlock, ct->mutex);
-   cachetable_unlock(ct);
-
-   // write callback
-   if (toku_cachefile_is_dev_null_unlocked(cachefile)) dowrite = FALSE;
-   flush_callback(cachefile, cachefile->fd, key, value, write_extraargs, old_attr, &new_attr, dowrite, TRUE, for_checkpoint);
-
-   cachetable_lock(ct);
-   rwlock_read_unlock(&cachefile->fdlock);
-   //
-   // now let's update variables
-   //
-   p->attr = new_attr;
-   cachetable_change_pair_attr(ct, old_attr, new_attr);
-
+   // Grabbing the disk_nb_mutex here ensures that after this point,
+   // no one is writing out a cloned value. If we grabbed the
+   // disk_nb_mutex inside the if clause instead, we might try to evict
+   // a PAIR whose clone is still in the process of being written out.
+   nb_mutex_write_lock(&p->disk_nb_mutex, ct->mutex);
+   // Make sure the assumption about cloned_value_data is true:
+   // if we have grabbed the disk_nb_mutex, there should be no
+   // cloned value data.
+   assert(p->cloned_value_data == NULL);
+   if (p->dirty) {
+       cachetable_only_write_locked_data(ct, p, for_checkpoint, &new_attr, FALSE);
+       //
+       // now let's update variables
+       //
+       if (new_attr.is_valid) {
+           p->attr = new_attr;
+           cachetable_change_pair_attr(ct, old_attr, new_attr);
+       }
+   }
+   nb_mutex_write_unlock(&p->disk_nb_mutex);
    // the pair is no longer dirty once written
    p->dirty = CACHETABLE_CLEAN;
@@ -1475,6 +1481,18 @@ static void cachetable_write_locked_pair(CACHETABLE ct, PAIR p) {
    rwlock_read_unlock(&ct->pending_lock);
}

+// complete the write of a pair by resetting the writing flag, and
+// maybe removing the pair from the cachetable if there are no
+// references to it
+
+static void cachetable_complete_write_pair (CACHETABLE ct, PAIR p, BOOL do_remove, BOOL* destroyed) {
+   p->cq = 0;
+   nb_mutex_write_unlock(&p->value_nb_mutex);
+   if (do_remove) {
+       cachetable_maybe_remove_and_free_pair(ct, p, destroyed);
+   }
+}
+
// Write a pair to storage
// Effects: an exclusive lock on the pair is obtained, the write callback is called,
// the pair dirty state is adjusted, and the write is completed.
The keep_me @@ -1507,16 +1525,15 @@ static void cachetable_write_pair(CACHETABLE ct, PAIR p, BOOL remove_me) { } } -// complete the write of a pair by reseting the writing flag, and -// maybe removing the pair from the cachetable if there are no -// references to it - -static void cachetable_complete_write_pair (CACHETABLE ct, PAIR p, BOOL do_remove, BOOL* destroyed) { - p->cq = 0; - nb_mutex_write_unlock(&p->nb_mutex); - if (do_remove) { - cachetable_maybe_remove_and_free_pair(ct, p, destroyed); - } +// Worker thread function to write a pair from memory to its cachefile +// As of now, the writer thread NEVER evicts, hence passing FALSE +// for the third parameter to cachetable_write_pair +static void cachetable_writer(WORKITEM wi) { + PAIR p = workitem_arg(wi); + CACHETABLE ct = p->cachefile->cachetable; + cachetable_lock(ct); + cachetable_write_pair(ct, p, p->remove_me); + cachetable_unlock(ct); } static void try_evict_pair(CACHETABLE ct, PAIR p) { @@ -1525,15 +1542,20 @@ static void try_evict_pair(CACHETABLE ct, PAIR p) { // must check for before we grab the write lock because we may // be trying to evict something this thread is trying to read - if (!nb_mutex_users(&p->nb_mutex)) { - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); + if (!nb_mutex_users(&p->value_nb_mutex)) { + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); p->state = CTPAIR_WRITING; assert(ct->size_evicting >= 0); ct->size_evicting += p->attr.size; assert(ct->size_evicting >= 0); - - if (!p->dirty) { + + // if the PAIR is dirty, the running eviction requires writing the + // PAIR out. if the disk_nb_mutex is grabbed, then running + // eviction requires waiting for the disk_nb_mutex to become available, + // which may be expensive. Hence, if either is true, we + // do the eviction on a writer thread + if (!p->dirty && (nb_mutex_writers(&p->disk_nb_mutex) == 0)) { cachetable_write_pair(ct, p, TRUE); } else { @@ -1545,12 +1567,10 @@ static void try_evict_pair(CACHETABLE ct, PAIR p) { } } -// flush and remove a pair from the cachetable. the callbacks are run by a thread in -// a thread pool. // flush and remove a pair from the cachetable. the callbacks are run by a thread in // a thread pool. 
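/*
 * The worker hand-off idiom this path relies on, sketched with the names
 * used in this file (assumed: the enqueue target is ct->wq for eviction
 * writes, since checkpoint writes use ct->checkpoint_wq):
 *
 *   WORKITEM wi = &p->asyncwork;
 *   workitem_init(wi, cachetable_writer, p);   // bind function + argument
 *   workqueue_enq(&ct->wq, wi, 1);             // wake a worker thread
 *
 *   // worker side, in cachetable_writer above:
 *   //   PAIR p = workitem_arg(wi);
 *   //   cachetable_write_pair(ct, p, p->remove_me);
 */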
static void flush_and_maybe_remove (CACHETABLE ct, PAIR p) { - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); p->state = CTPAIR_WRITING; // this needs to be done here regardless of whether the eviction occurs on the main thread or on // a writer thread, because there may be a completion queue that needs access to this information @@ -1565,7 +1585,7 @@ static void flush_and_maybe_remove (CACHETABLE ct, PAIR p) { workitem_init(wi, cachetable_writer, p); // evictions without a write or unpinned pair's that are clean // can be run in the current thread - if (!nb_mutex_writers(&p->nb_mutex) && !p->dirty) { + if (!nb_mutex_writers(&p->value_nb_mutex) && !p->dirty) { assert(ct->size_evicting >= 0); ct->size_evicting += p->attr.size; assert(ct->size_evicting >= 0); @@ -1585,7 +1605,7 @@ static void do_partial_eviction(CACHETABLE ct, PAIR p) { PAIR_ATTR old_attr = p->attr; cachetable_unlock(ct); - p->pe_callback(p->value, old_attr, &new_attr, p->write_extraargs); + p->pe_callback(p->value_data, old_attr, &new_attr, p->write_extraargs); cachetable_lock(ct); cachetable_change_pair_attr(ct, old_attr, new_attr); @@ -1603,7 +1623,7 @@ static void do_partial_eviction(CACHETABLE ct, PAIR p) { workqueue_enq(p->cq, &p->asyncwork, 1); } else { - nb_mutex_write_unlock(&p->nb_mutex); + nb_mutex_write_unlock(&p->value_nb_mutex); } } @@ -1632,7 +1652,7 @@ static void maybe_flush_some (CACHETABLE ct, long size) { while ((ct->clock_head) && (size + ct->size_current > ct->size_limit + ct->size_evicting)) { PAIR curr_in_clock = ct->clock_head; - if (nb_mutex_users(&curr_in_clock->nb_mutex)) { + if (nb_mutex_users(&curr_in_clock->value_nb_mutex) || nb_mutex_users(&curr_in_clock->disk_nb_mutex)) { if (set_val && curr_in_clock->key.b == curr_cachekey.b && curr_in_clock->cachefile->filenum.fileid == curr_filenum.fileid) @@ -1655,14 +1675,16 @@ static void maybe_flush_some (CACHETABLE ct, long size) { if (curr_in_clock->count > 0) { curr_in_clock->count--; // call the partial eviction callback - nb_mutex_write_lock(&curr_in_clock->nb_mutex, ct->mutex); + nb_mutex_write_lock(&curr_in_clock->value_nb_mutex, ct->mutex); - void *value = curr_in_clock->value; + void *value = curr_in_clock->value_data; + void* disk_data = curr_in_clock->disk_data; void *write_extraargs = curr_in_clock->write_extraargs; enum partial_eviction_cost cost; long bytes_freed_estimate = 0; curr_in_clock->pe_est_callback( value, + disk_data, &bytes_freed_estimate, &cost, write_extraargs @@ -1700,7 +1722,7 @@ static void maybe_flush_some (CACHETABLE ct, long size) { // set up a completion queue. 
// So, a completion queue cannot exist assert(!curr_in_clock->cq); - nb_mutex_write_unlock(&curr_in_clock->nb_mutex); + nb_mutex_write_unlock(&curr_in_clock->value_nb_mutex); } } else { @@ -1750,7 +1772,10 @@ static PAIR cachetable_insert_at(CACHETABLE ct, ctpair_add_ref(p); p->cachefile = cachefile; p->key = key; - p->value = value; + p->value_data = value; + p->cloned_value_data = NULL; + p->cloned_value_size = 0; + p->disk_data = NULL; p->fullhash = fullhash; p->dirty = dirty; p->attr = attr; @@ -1759,11 +1784,13 @@ static PAIR cachetable_insert_at(CACHETABLE ct, p->pe_callback = write_callback.pe_callback; p->pe_est_callback = write_callback.pe_est_callback; p->cleaner_callback = write_callback.cleaner_callback; + p->clone_callback = write_callback.clone_callback; p->write_extraargs = write_callback.write_extraargs; p->fullhash = fullhash; p->clock_next = p->clock_prev = 0; p->remove_me = FALSE; - nb_mutex_init(&p->nb_mutex); + nb_mutex_init(&p->value_nb_mutex); + nb_mutex_init(&p->disk_nb_mutex); p->cq = 0; pair_add_to_clock(ct, p); toku_list_push(&cachefile->pairs_for_cachefile, &p->next_for_cachefile); @@ -1845,7 +1872,7 @@ static int cachetable_put_internal( CACHETABLE_DIRTY ); assert(p); - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); //note_hash_count(count); return 0; } @@ -1869,20 +1896,105 @@ static int cachetable_get_pair (CACHEFILE cachefile, CACHEKEY key, u_int32_t ful return r; } +// ct locked on entry +static void +clone_pair(CACHETABLE ct, PAIR p) { + PAIR_ATTR old_attr = p->attr; + PAIR_ATTR new_attr; + + // act of cloning should be fast, + // not sure if we have to release + // and regrab the cachetable lock, + // but doing it for now + cachetable_unlock(ct); + p->clone_callback( + p->value_data, + &p->cloned_value_data, + &new_attr, + TRUE, + p->write_extraargs + ); + cachetable_lock(ct); + + // now we need to do the same actions we would do + // if the PAIR had been written to disk + // + // because we hold the value_nb_mutex, + // it doesn't matter whether we clear + // the pending bit before the clone + // or after the clone + p->checkpoint_pending = FALSE; + p->dirty = CACHETABLE_CLEAN; + if (new_attr.is_valid) { + p->attr = new_attr; + cachetable_change_pair_attr(ct, old_attr, new_attr); + } + p->cloned_value_size = p->attr.size; + ct->size_current += p->cloned_value_size; +} + +static void checkpoint_cloned_pair(WORKITEM wi) { + PAIR p = workitem_arg(wi); + CACHETABLE ct = p->cachefile->cachetable; + cachetable_lock(ct); + PAIR_ATTR new_attr; + // note that pending lock is not needed here because + // we KNOW we are in the middle of a checkpoint + // and that a begin_checkpoint cannot happen + cachetable_only_write_locked_data( + ct, + p, + TRUE, //for_checkpoint + &new_attr, + TRUE //is_clone + ); + nb_mutex_write_unlock(&p->disk_nb_mutex); + ct->n_checkpoint_clones_running--; + if (ct->n_checkpoint_clones_running == 0) { + int r = toku_pthread_cond_broadcast(&ct->clones_background_wait); + assert(r==0); + } + cachetable_unlock(ct); +} + +static void +checkpoint_cloned_pair_on_writer_thread(CACHETABLE ct, PAIR p) { + WORKITEM wi = &p->checkpoint_asyncwork; + workitem_init(wi, checkpoint_cloned_pair, p); + workqueue_enq(&ct->checkpoint_wq, wi, 1); +} + + static void write_locked_pair_for_checkpoint(CACHETABLE ct, PAIR p) { - // - // this function is called by toku_cachetable_get_and_pin to write locked nodes - // out for checkpoint. 
get_and_pin assumes that there is no
-   // completion queue, so we assert it here.
-   //
-   assert(!p->cq);
    if (p->dirty && p->checkpoint_pending) {
-       // this is essentially a flush_and_maybe_remove except that
-       // we already have p->nb_mutex and we just do the write in our own thread.
-       p->state = CTPAIR_WRITING;
-       cachetable_write_locked_pair(ct, p); // keeps the PAIR's write lock
+       if (p->clone_callback) {
+           // I think it is safe to grab the disk_nb_mutex after
+           // cloning the pair, but doing it before just to be safe,
+           // even though the act of cloning does not touch disk_data
+           nb_mutex_write_lock(&p->disk_nb_mutex, ct->mutex);
+           assert(!p->cloned_value_data);
+           clone_pair(ct, p);
+           assert(p->cloned_value_data);
+           // place it on the background thread and continue; it is the
+           // responsibility of the writer thread to release disk_nb_mutex
+           ct->n_checkpoint_clones_running++;
+           checkpoint_cloned_pair_on_writer_thread(ct, p);
+           // possibly run eviction, because the act of cloning adds to
+           // ct->size_current. We don't do this in
+           // write_pair_for_checkpoint_thread, because that clones at most
+           // one node at a time, whereas this may be called from many
+           // threads simultaneously
+           maybe_flush_some(ct, 0);
+       }
+       else {
+           // The pair is not cloneable, just write the pair to disk
+
+           // we already have p->value_nb_mutex and we just do the write in our own thread.
+           p->state = CTPAIR_WRITING;
+           cachetable_write_locked_pair(ct, p); // keeps the PAIR's write lock
+       }
    }
    else {
        //
@@ -1896,6 +2008,74 @@ write_locked_pair_for_checkpoint(CACHETABLE ct, PAIR p)
    }
}

+// On entry: hold the ct lock
+// On exit: the node is written out
+// Method: take the write lock,
+//         maybe write out the node,
+//         if p->cq, put the PAIR on the completion queue; else release the write lock
+static void
+write_pair_for_checkpoint_thread (CACHETABLE ct, PAIR p)
+{
+    nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); // grab an exclusive lock on the pair
+    if (p->dirty && p->checkpoint_pending) {
+        if (p->clone_callback) {
+            // I think it is safe to grab the disk_nb_mutex after
+            // cloning the pair, but doing it before just to be safe,
+            // even though the act of cloning does not touch disk_data
+            nb_mutex_write_lock(&p->disk_nb_mutex, ct->mutex);
+            assert(!p->cloned_value_data);
+            clone_pair(ct, p);
+            assert(p->cloned_value_data);
+        }
+        else {
+            // The pair is not cloneable, just write the pair to disk
+            // we already have p->value_nb_mutex and we just do the write in our own thread.
+            // this will grab and release disk_nb_mutex
+            p->state = CTPAIR_WRITING;
+            cachetable_write_locked_pair(ct, p); // keeps the PAIR's write lock
+        }
+        // if we are checkpointing a PAIR, a cq should not exist:
+        // close cannot be running, and unpin_and_remove would have
+        // already set the PAIR to clean
+        assert(!p->cq);
+
+        // now release value_nb_mutex, before we write the PAIR out,
+        // so that the PAIR is available to client threads
+        nb_mutex_write_unlock(&p->value_nb_mutex); // didn't call cachetable_write_pair so we have to unlock it ourselves.
+        if (p->clone_callback) {
+            // note that the pending lock is not needed here because
+            // we KNOW we are in the middle of a checkpoint
+            // and that a begin_checkpoint cannot happen
+            PAIR_ATTR attr;
+            cachetable_only_write_locked_data(
+                ct,
+                p,
+                TRUE, //for_checkpoint
+                &attr,
+                TRUE //is_clone
+                );
+            nb_mutex_write_unlock(&p->disk_nb_mutex);
+        }
+    }
+    else {
+        //
+        // we may clear the pending bit here because we have
+        // both the cachetable lock and the PAIR lock.
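/*
 * A minimal sketch of a CACHETABLE_CLONE_CALLBACK as clone_pair invokes it:
 * produce a copy the checkpoint can write out while client threads keep
 * modifying the original. deep_copy_for_checkpoint is a hypothetical helper
 * (the brt layer presumably builds on the toku_mempool_clone and
 * toku_fifo_clone primitives added later in this patch):
 *
 *   static void example_clone_cb(void *value_data, void **cloned_value_data,
 *                                PAIR_ATTR *new_attr, BOOL for_checkpoint,
 *                                void *write_extraargs) {
 *       *cloned_value_data = deep_copy_for_checkpoint(value_data);
 *       // either report a valid new attr, or mark it invalid so the
 *       // cachetable keeps the old one (clone_pair checks is_valid)
 *       new_attr->is_valid = FALSE;
 *   }
 */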
+ // The rule, as mentioned in toku_cachetable_begin_checkpoint, + // is that to clear the bit, we must have both the PAIR lock + // and the pending lock + // + p->checkpoint_pending = FALSE; + if (p->cq) { + workitem_init(&p->asyncwork, NULL, p); + workqueue_enq(p->cq, &p->asyncwork, 1); + } + else { + nb_mutex_write_unlock(&p->value_nb_mutex); + } + } +} + // // For each PAIR associated with these CACHEFILEs and CACHEKEYs // if the checkpoint_pending bit is set and the PAIR is dirty, write the PAIR @@ -1926,7 +2106,7 @@ static void checkpoint_dependent_pairs( assert(curr_dep_pair != NULL); // pair had better be locked, as we are assuming // to own the write lock - assert(nb_mutex_writers(&curr_dep_pair->nb_mutex)); + assert(nb_mutex_writers(&curr_dep_pair->value_nb_mutex)); // we need to update the dirtyness of the dependent pair, // because the client may have dirtied it while holding its lock, // and if the pair is pending a checkpoint, it needs to be written out @@ -2044,47 +2224,6 @@ static uint64_t get_tnow(void) { return tv.tv_sec * 1000000ULL + tv.tv_usec; } -// for debug -static PAIR write_for_checkpoint_pair = NULL; - - -// On entry: hold the ct lock -// On exit: the node is written out -// Method: take write lock -// maybe write out the node -// if p->cq, put on completion queue. Else release write lock -static void -write_pair_for_checkpoint (CACHETABLE ct, PAIR p) -{ - write_for_checkpoint_pair = p; - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); // grab an exclusive lock on the pair - if (p->dirty && p->checkpoint_pending) { - // this is essentially a flush_and_maybe_remove except that - // we already have p->nb_mutex and we just do the write in our own thread. - p->state = CTPAIR_WRITING; - workitem_init(&p->asyncwork, NULL, p); - cachetable_write_pair(ct, p, FALSE); // releases the write lock on the pair - } - else { - // - // we may clear the pending bit here because we have - // both the cachetable lock and the PAIR lock. - // The rule, as mentioned in toku_cachetable_begin_checkpoint, - // is that to clear the bit, we must have both the PAIR lock - // and the pending lock - // - p->checkpoint_pending = FALSE; - if (p->cq) { - workitem_init(&p->asyncwork, NULL, p); - workqueue_enq(p->cq, &p->asyncwork, 1); - } - else { - nb_mutex_write_unlock(&p->nb_mutex); // didn't call cachetable_write_pair so we have to unlock it ourselves. 
- } - } - write_for_checkpoint_pair = NULL; -} - // // cachetable lock and PAIR lock are held on entry // On exit, cachetable lock is still held, but PAIR lock @@ -2109,14 +2248,16 @@ do_partial_fetch( p->state = CTPAIR_READING; rwlock_prefer_read_lock(&cachefile->fdlock, ct->mutex); + nb_mutex_write_lock(&p->disk_nb_mutex, ct->mutex); cachetable_unlock(ct); - int r = pf_callback(p->value, read_extraargs, cachefile->fd, &new_attr); + int r = pf_callback(p->value_data, p->disk_data, read_extraargs, cachefile->fd, &new_attr); lazy_assert_zero(r); cachetable_lock(ct); rwlock_read_unlock(&cachefile->fdlock); p->attr = new_attr; cachetable_change_pair_attr(ct, old_attr, new_attr); p->state = CTPAIR_IDLE; + nb_mutex_write_unlock(&p->disk_nb_mutex); if (keep_pair_locked) { // if the caller wants the pair to remain locked // that means the caller requests continued @@ -2130,11 +2271,38 @@ do_partial_fetch( workqueue_enq(p->cq, &p->asyncwork, 1); } else { - nb_mutex_write_unlock(&p->nb_mutex); + nb_mutex_write_unlock(&p->value_nb_mutex); } } } +void toku_cachetable_pf_pinned_pair( + void* value, + CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, + void* read_extraargs, + CACHEFILE cf, + CACHEKEY key, + u_int32_t fullhash + ) +{ + PAIR_ATTR attr; + PAIR p = NULL; + cachetable_lock(cf->cachetable); + int r = cachetable_get_pair(cf, key, fullhash, &p); + assert_zero(r); + assert(p->value_data == value); + assert(nb_mutex_writers(&p->value_nb_mutex)); + nb_mutex_write_lock(&p->disk_nb_mutex, cf->cachetable->mutex); + rwlock_prefer_read_lock(&cf->fdlock, cf->cachetable->mutex); + int fd = cf->fd; + cachetable_unlock(cf->cachetable); + pf_callback(value, p->disk_data, read_extraargs, fd, &attr); + cachetable_lock(cf->cachetable); + nb_mutex_write_unlock(&p->disk_nb_mutex); + rwlock_read_unlock(&cf->fdlock); + cachetable_unlock(cf->cachetable); +} + int toku_cachetable_get_and_pin ( CACHEFILE cachefile, @@ -2146,6 +2314,7 @@ int toku_cachetable_get_and_pin ( CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback, CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, + BOOL may_modify_value, void* read_extraargs // parameter for fetch_callback, pf_req_callback, and pf_callback ) { @@ -2165,6 +2334,7 @@ int toku_cachetable_get_and_pin ( fetch_callback, pf_req_callback, pf_callback, + may_modify_value, read_extraargs, 0, // number of dependent pairs that we may need to checkpoint NULL, // array of cachefiles of dependent pairs @@ -2174,8 +2344,73 @@ int toku_cachetable_get_and_pin ( ); } +// Read a pair from a cachefile into memory using the pair's fetch callback +static void cachetable_fetch_pair( + CACHETABLE ct, + CACHEFILE cf, + PAIR p, + CACHETABLE_FETCH_CALLBACK fetch_callback, + void* read_extraargs, + BOOL keep_pair_locked + ) +{ + // helgrind + CACHEKEY key = p->key; + u_int32_t fullhash = p->fullhash; + + void *toku_value = NULL; + void *disk_data = NULL; + PAIR_ATTR attr; + + // FIXME this should be enum cachetable_dirty, right? 
+ int dirty = 0; + + WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key)); + + rwlock_prefer_read_lock(&cf->fdlock, ct->mutex); + nb_mutex_write_lock(&p->disk_nb_mutex, ct->mutex); + cachetable_unlock(ct); + + int r; + assert(!toku_cachefile_is_dev_null_unlocked(cf)); + r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &disk_data, &attr, &dirty, read_extraargs); + if (dirty) + p->dirty = CACHETABLE_DIRTY; + + cachetable_lock(ct); + rwlock_read_unlock(&cf->fdlock); + // brt.c asserts that get_and_pin succeeds, + // so we might as well just assert it here as opposed + // to trying to support an INVALID state + assert(r == 0); + + p->value_data = toku_value; + p->disk_data = disk_data; + p->attr = attr; + cachetable_add_pair_attr(ct, attr); + p->state = CTPAIR_IDLE; + nb_mutex_write_unlock(&p->disk_nb_mutex); + if (keep_pair_locked) { + // if the caller wants the pair to remain locked + // that means the caller requests continued + // ownership of the PAIR, so there better not + // be a cq asking to transfer ownership + assert(!p->cq); + } + else { + if (p->cq) { + workitem_init(&p->asyncwork, NULL, p); + workqueue_enq(p->cq, &p->asyncwork, 1); + } + else { + nb_mutex_write_unlock(&p->value_nb_mutex); + } + } + if (0) printf("%s:%d %"PRId64" complete\n", __FUNCTION__, __LINE__, key.b); +} + static BOOL resolve_checkpointing_fast(PAIR p) { - return !(p->checkpoint_pending && (p->dirty == CACHETABLE_DIRTY)); + return !(p->checkpoint_pending && (p->dirty == CACHETABLE_DIRTY) && !p->clone_callback); } static void checkpoint_pair_and_dependent_pairs( CACHETABLE ct, @@ -2238,6 +2473,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs ( CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback, CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, + BOOL may_modify_value, void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback u_int32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs @@ -2270,24 +2506,26 @@ int toku_cachetable_get_and_pin_with_dep_pairs ( else if (p->state == CTPAIR_WRITING) { cachetable_wait_writing++; } - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); pair_touch(p); - checkpoint_pair_and_dependent_pairs( - ct, - p, - num_dependent_pairs, - dependent_cfs, - dependent_keys, - dependent_fullhash, - dependent_dirty - ); + if (may_modify_value) { + checkpoint_pair_and_dependent_pairs( + ct, + p, + num_dependent_pairs, + dependent_cfs, + dependent_keys, + dependent_fullhash, + dependent_dirty + ); + } cachetable_unlock(ct); - BOOL partial_fetch_required = pf_req_callback(p->value,read_extraargs); + BOOL partial_fetch_required = pf_req_callback(p->value_data,read_extraargs); // shortcutting a path to getting the user the data // helps scalability for in-memory workloads if (!partial_fetch_required) { - *value = p->value; + *value = p->value_data; if (sizep) *sizep = p->attr.size; return 0; } @@ -2327,16 +2565,18 @@ int toku_cachetable_get_and_pin_with_dep_pairs ( CACHETABLE_CLEAN ); assert(p); - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); - checkpoint_pair_and_dependent_pairs( - ct, - p, - num_dependent_pairs, - dependent_cfs, - dependent_keys, - dependent_fullhash, - dependent_dirty - ); + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); + if (may_modify_value) { + checkpoint_pair_and_dependent_pairs( + ct, + p, + 
num_dependent_pairs, + dependent_cfs, + dependent_keys, + dependent_fullhash, + dependent_dirty + ); + } uint64_t t0 = get_tnow(); // Retrieve the value of the PAIR from disk. @@ -2348,7 +2588,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs ( goto got_value; } got_value: - *value = p->value; + *value = p->value_data; if (sizep) *sizep = p->attr.size; maybe_flush_some(ct, 0); cachetable_unlock(ct); @@ -2378,12 +2618,12 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int3 if (p->key.b==key.b && p->cachefile==cachefile) { if (!p->checkpoint_pending && //If checkpoint pending, we would need to first write it, which would make it clean p->dirty && - nb_mutex_users(&p->nb_mutex) == 0 + nb_mutex_users(&p->value_nb_mutex) == 0 ) { cachetable_maybe_get_and_pin_hits++; // because nb_mutex_users is 0, this is fast - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); - *value = p->value; + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); + *value = p->value_data; pair_touch(p); r = 0; //printf("%s:%d cachetable_maybe_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value); @@ -2410,12 +2650,12 @@ int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key, count++; if (p->key.b==key.b && p->cachefile==cachefile) { if (!p->checkpoint_pending && //If checkpoint pending, we would need to first write it, which would make it clean (if the pin would be used for writes. If would be used for read-only we could return it, but that would increase complexity) - nb_mutex_users(&p->nb_mutex) == 0 + nb_mutex_users(&p->value_nb_mutex) == 0 ) { cachetable_maybe_get_and_pin_hits++; // because nb_mutex_users is 0, this is fast - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); - *value = p->value; + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); + *value = p->value_data; r = 0; //printf("%s:%d cachetable_maybe_get_and_pin_clean(%lld)--> %p\n", __FILE__, __LINE__, key, *value); } @@ -2441,7 +2681,7 @@ cachetable_unpin_internal(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, for (PAIR p=ct->table[fullhash&(ct->table_size-1)]; p; p=p->hash_chain) { count++; if (p->key.b==key.b && p->cachefile==cachefile) { - assert(nb_mutex_writers(&p->nb_mutex)>0); + assert(nb_mutex_writers(&p->value_nb_mutex)>0); // this is a client thread that is unlocking the PAIR // That is, a cleaner, flusher, or get_and_pin thread // So, there must not be a completion queue lying around @@ -2451,7 +2691,7 @@ cachetable_unpin_internal(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, // So, we should assert that a completion queue does not // exist assert(!p->cq); - nb_mutex_write_unlock(&p->nb_mutex); + nb_mutex_write_unlock(&p->value_nb_mutex); if (dirty) p->dirty = CACHETABLE_DIRTY; if (attr.is_valid) { PAIR_ATTR old_attr = p->attr; @@ -2503,6 +2743,7 @@ int toku_cachetable_get_and_pin_nonblocking ( CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback, CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, + BOOL may_modify_value, void *read_extraargs, UNLOCKERS unlockers ) @@ -2539,10 +2780,12 @@ int toku_cachetable_get_and_pin_nonblocking ( // Otherwise, if there is no write lock grabbed, we know there will // be no stall, so we grab the lock and return to the user // - if (!nb_mutex_writers(&p->nb_mutex) && resolve_checkpointing_fast(p)) { + if (!nb_mutex_writers(&p->value_nb_mutex) && + (!may_modify_value || resolve_checkpointing_fast(p))) + { //cachetable_hit++; - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); - if 
(p->checkpoint_pending) { + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); + if (may_modify_value && p->checkpoint_pending) { write_locked_pair_for_checkpoint(ct, p); } pair_touch(p); @@ -2551,7 +2794,7 @@ int toku_cachetable_get_and_pin_nonblocking ( // when calling pf_req_callback, and if possible, returns the PAIR to the user without // reacquiring the cachetable lock cachetable_unlock(ct); - BOOL partial_fetch_required = pf_req_callback(p->value,read_extraargs); + BOOL partial_fetch_required = pf_req_callback(p->value_data,read_extraargs); // // Just because the PAIR exists does necessarily mean the all the data the caller requires // is in memory. A partial fetch may be required, which is evaluated above @@ -2568,7 +2811,7 @@ int toku_cachetable_get_and_pin_nonblocking ( return TOKUDB_TRY_AGAIN; } else { - *value = p->value; + *value = p->value_data; return 0; } } @@ -2576,44 +2819,36 @@ int toku_cachetable_get_and_pin_nonblocking ( run_unlockers(unlockers); // The contract says the unlockers are run with the ct lock being held. // Now wait for the I/O to occur. // We need to obtain the lock (waiting for the write to finish), but then we only waited so we could wake up again - if (p->checkpoint_pending) { - // an optimization we can later do is if - // we can grab the write lock on the pair and - // it is clean, then dont run the unlockers, simply - // clear the pending bit and return the PAIR to the user - // but this is simpler. - //cachetable_wait_checkpoint++; - write_pair_for_checkpoint(ct, p); + if (p->state == CTPAIR_READING) { + cachetable_wait_reading++; + } + else if (p->state == CTPAIR_WRITING) { + cachetable_wait_writing++; + } + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); + if (may_modify_value && p->checkpoint_pending) { + write_locked_pair_for_checkpoint(ct, p); + } + // deadlock discovered in #4357 shows we need + // to do this. After running unlockers and waiting + // on the PAIR lock, a flusher thread may come + // along and try to unpin_and_remove this PAIR. + // In that case, the thread running unpin_and_remove + // sets up a completion queue and we must transfer ownership + // of this PAIR lock to that thread via the completion + // queue + if (p->cq) { + // while we wait on the PAIR lock, a thread may come in and + // call toku_cachetable_unpin_and_remove on this PAIR. + // In that case, we must do NOTHING with the PAIR, as + // it has been removed from the cachetable's data structures. + // So, we should just pass the PAIR over to the completion + // queue. + workitem_init(&p->asyncwork, NULL, p); + workqueue_enq(p->cq, &p->asyncwork, 1); } else { - if (p->state == CTPAIR_READING) { - cachetable_wait_reading++; - } - else if (p->state == CTPAIR_WRITING) { - cachetable_wait_writing++; - } - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); - // deadlock discovered in #4357 shows we need - // to do this. After running unlockers and waiting - // on the PAIR lock, a flusher thread may come - // along and try to unpin_and_remove this PAIR. - // In that case, the thread running unpin_and_remove - // sets up a completion queue and we must transfer ownership - // of this PAIR lock to that thread via the completion - // queue - if (p->cq) { - // while we wait on the PAIR lock, a thread may come in and - // call toku_cachetable_unpin_and_remove on this PAIR. - // In that case, we must do NOTHING with the PAIR, as - // it has been removed from the cachetable's data structures. - // So, we should just pass the PAIR over to the completion - // queue. 
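/*
 * Caller-side sketch of the nonblocking contract (assumed caller names;
 * the argument list is abbreviated to the parameters this patch touches):
 *
 *   struct unlockers unlockers;
 *   unlockers.locked = TRUE;
 *   unlockers.f = release_my_locks;   // hypothetical callback
 *   // ... remaining unlockers fields per cachetable.h ...
 *   while (1) {
 *       int r = toku_cachetable_get_and_pin_nonblocking(
 *           cf, key, fullhash, &value, NULL,
 *           wc, fetch_cb, pf_req_cb, pf_cb,
 *           TRUE,              // may_modify_value: resolve pending writes
 *           read_extraargs, &unlockers);
 *       if (r == 0) break;               // pinned without blocking
 *       assert(r == TOKUDB_TRY_AGAIN);   // our locks were released while
 *       reacquire_my_locks();            // we slept; retake them and retry
 *   }                                    // (hypothetical helper)
 */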
- workitem_init(&p->asyncwork, NULL, p); - workqueue_enq(p->cq, &p->asyncwork, 1); - } - else { - nb_mutex_write_unlock(&p->nb_mutex); - } + nb_mutex_write_unlock(&p->value_nb_mutex); } cachetable_unlock(ct); return TOKUDB_TRY_AGAIN; @@ -2635,7 +2870,7 @@ int toku_cachetable_get_and_pin_nonblocking ( CACHETABLE_CLEAN ); assert(p); - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); run_unlockers(unlockers); // we hold the ct mutex. u_int64_t t0 = get_tnow(); cachetable_fetch_pair(ct, cf, p, fetch_callback, read_extraargs, FALSE); @@ -2657,6 +2892,35 @@ struct cachefile_partial_prefetch_args { void *read_extraargs; }; +// Worker thread function to read a pair from a cachefile to memory +static void cachetable_reader(WORKITEM wi) { + struct cachefile_prefetch_args* cpargs = workitem_arg(wi); + CACHETABLE ct = cpargs->p->cachefile->cachetable; + cachetable_lock(ct); + // TODO: find a way to properly pass some information for read_extraargs + // This is only called in toku_cachefile_prefetch, by putting it on a workqueue + // The problem is described in comments in toku_cachefile_prefetch + cachetable_fetch_pair( + ct, + cpargs->p->cachefile, + cpargs->p, + cpargs->fetch_callback, + cpargs->read_extraargs, + FALSE + ); + cachetable_unlock(ct); + toku_free(cpargs); +} + +static void cachetable_partial_reader(WORKITEM wi) { + struct cachefile_partial_prefetch_args *cpargs = workitem_arg(wi); + CACHETABLE ct = cpargs->p->cachefile->cachetable; + cachetable_lock(ct); + do_partial_fetch(ct, cpargs->p->cachefile, cpargs->p, cpargs->pf_callback, cpargs->read_extraargs, FALSE); + cachetable_unlock(ct); + toku_free(cpargs); +} + int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, CACHETABLE_WRITE_CALLBACK write_callback, CACHETABLE_FETCH_CALLBACK fetch_callback, @@ -2685,7 +2949,6 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, PAIR p; for (p = ct->table[fullhash&(ct->table_size-1)]; p; p = p->hash_chain) { if (p->key.b==key.b && p->cachefile==cf) { - //Maybe check for pending and do write_pair_for_checkpoint()? 
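/*
 * Caller-side sketch for this prefetch path (callback arguments as in
 * get_and_pin, order abbreviated; doing_prefetch reports whether an
 * asynchronous read was actually scheduled):
 *
 *   BOOL doing_prefetch = FALSE;
 *   int r = toku_cachefile_prefetch(cf, key, fullhash, wc,
 *                                   fetch_cb, pf_req_cb, pf_cb,
 *                                   read_extraargs, &doing_prefetch);
 *   assert(r == 0);
 *   if (!doing_prefetch) {
 *       // the PAIR was already cached or pinned; nothing was enqueued
 *   }
 */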
pair_touch(p); break; } @@ -2706,7 +2969,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, CACHETABLE_CLEAN ); assert(p); - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); struct cachefile_prefetch_args *MALLOC(cpargs); cpargs->p = p; cpargs->fetch_callback = fetch_callback; @@ -2717,15 +2980,15 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, *doing_prefetch = TRUE; } } - else if (nb_mutex_users(&p->nb_mutex)==0) { + else if (nb_mutex_users(&p->value_nb_mutex)==0) { // client should not be trying to prefetch a node that is either // belongs to a cachefile being flushed or to a PAIR being // unpinned and removed assert(!p->cq); // nobody else is using the node, so we should go ahead and prefetch - nb_mutex_write_lock(&p->nb_mutex, ct->mutex); - BOOL partial_fetch_required = pf_req_callback(p->value, read_extraargs); + nb_mutex_write_lock(&p->value_nb_mutex, ct->mutex); + BOOL partial_fetch_required = pf_req_callback(p->value_data, read_extraargs); if (partial_fetch_required) { p->state = CTPAIR_READING; @@ -2743,7 +3006,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, // sanity check, we already have an assert // before locking the PAIR assert(!p->cq); - nb_mutex_write_unlock(&p->nb_mutex); + nb_mutex_write_unlock(&p->value_nb_mutex); } } cachetable_unlock(ct); @@ -2794,7 +3057,7 @@ int64_t UU() toku_cachetable_size_slowslow (CACHETABLE ct) { int64_t ret = 0; for (p=ct->clock_head; ct->clock_head!=NULL && (p!=ct->clock_head || is_first); p=p->clock_next) { is_first=FALSE; - ret += brtnode_memory_size((BRTNODE) p->value); + ret += brtnode_memory_size((BRTNODE) p->value_data); } return ret; } @@ -2808,7 +3071,7 @@ int64_t UU() toku_cachetable_size_discrepancy (CACHETABLE ct) { int64_t ret = 0; for (p=ct->clock_head; ct->clock_head!=NULL && (p!=ct->clock_head || is_first); p=p->clock_next) { is_first=FALSE; - ret += brtnode_memory_size((BRTNODE) p->value) - p->attr.size; + ret += brtnode_memory_size((BRTNODE) p->value_data) - p->attr.size; } return ret; } @@ -2822,8 +3085,8 @@ int64_t UU() toku_cachetable_size_discrepancy_pinned (CACHETABLE ct) { int64_t ret = 0; for (p=ct->clock_head; ct->clock_head!=NULL && (p!=ct->clock_head || is_first); p=p->clock_next) { is_first=FALSE; - if (nb_mutex_writers(&p->nb_mutex)) { - ret += brtnode_memory_size((BRTNODE) p->value) - p->attr.size; + if (nb_mutex_writers(&p->value_nb_mutex)) { + ret += brtnode_memory_size((BRTNODE) p->value_data) - p->attr.size; } } return ret; @@ -3011,7 +3274,7 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) { // Once again, the assumption is that any PAIR // is either unlocked or on a writer thread work queue // - if (!nb_mutex_writers(&p->nb_mutex)) { + if (!nb_mutex_writers(&p->value_nb_mutex)) { flush_and_maybe_remove(ct, p); } } @@ -3046,7 +3309,7 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) { PAIR p = workitem_arg(wi); p->cq = 0; //Some other thread owned the lock, but transferred ownership to the thread executing this function - nb_mutex_write_unlock(&p->nb_mutex); //Release the lock, no one has a pin, per our assumptions above. + nb_mutex_write_unlock(&p->value_nb_mutex); //Release the lock, no one has a pin, per our assumptions above. 
BOOL destroyed; cachetable_maybe_remove_and_free_pair(ct, p, &destroyed); } @@ -3097,9 +3360,11 @@ toku_cachetable_close (CACHETABLE *ctp) { int r = toku_pthread_mutex_destroy(&ct->openfd_mutex); resource_assert_zero(r); cachetable_unlock(ct); toku_destroy_workers(&ct->wq, &ct->threadpool); + toku_destroy_workers(&ct->checkpoint_wq, &ct->checkpoint_threadpool); toku_kibbutz_destroy(ct->kibbutz); toku_omt_destroy(&ct->reserved_filenums); r = toku_pthread_mutex_destroy(&ct->cachefiles_mutex); resource_assert_zero(r); + r = toku_pthread_cond_destroy(&ct->clones_background_wait); resource_assert_zero(r); toku_free(ct->table); toku_free(ct->env_dir); toku_free(ct); @@ -3125,8 +3390,12 @@ int toku_cachetable_unpin_and_remove ( count++; if (p->key.b==key.b && p->cachefile==cachefile) { p->dirty = CACHETABLE_CLEAN; // clear the dirty bit. We're just supposed to remove it. - assert(nb_mutex_writers(&p->nb_mutex)); - + assert(nb_mutex_writers(&p->value_nb_mutex)); + // grab disk_nb_mutex to ensure any background thread writing + // out a cloned value completes + nb_mutex_write_lock(&p->disk_nb_mutex, ct->mutex); + assert(p->cloned_value_data == NULL); + // // take care of key removal // @@ -3168,7 +3437,8 @@ int toku_cachetable_unpin_and_remove ( // we must not have a completion queue // lying around, as we may create one now assert(!p->cq); - nb_mutex_write_unlock(&p->nb_mutex); + nb_mutex_write_unlock(&p->value_nb_mutex); + nb_mutex_write_unlock(&p->disk_nb_mutex); // // As of Dr. Noga, only these threads may be // blocked waiting to lock this PAIR: @@ -3213,10 +3483,10 @@ int toku_cachetable_unpin_and_remove ( // nothing, and looking at those functions, it is clear they do nothing. // cachetable_remove_pair(ct, p); - if (nb_mutex_blocked_writers(&p->nb_mutex)>0) { + if (nb_mutex_blocked_writers(&p->value_nb_mutex)>0) { struct workqueue cq; workqueue_init(&cq); - while (nb_mutex_blocked_writers(&p->nb_mutex)>0) { + while (nb_mutex_blocked_writers(&p->value_nb_mutex)>0) { //Someone (one or more checkpoint threads) is waiting for a write lock //on this pair. //They are still blocked because we have not released the @@ -3239,7 +3509,7 @@ int toku_cachetable_unpin_and_remove ( //We are holding the write lock on the pair cachetable_lock(ct); - assert(nb_mutex_writers(&p->nb_mutex) == 1); + assert(nb_mutex_writers(&p->value_nb_mutex) == 1); // let's also assert that this PAIR was not somehow marked // as pending a checkpoint. Above, when calling // remove_key(), we cleared the dirty bit so that @@ -3247,7 +3517,7 @@ int toku_cachetable_unpin_and_remove ( // make sure that our assumption is valid. assert(!p->checkpoint_pending); assert(p->attr.cache_pressure_size == 0); - nb_mutex_write_unlock(&p->nb_mutex); + nb_mutex_write_unlock(&p->value_nb_mutex); // Because we assume it is just the checkpoint thread // that may have been blocked (as argued above), // it is safe to simply remove the PAIR from the @@ -3256,6 +3526,9 @@ int toku_cachetable_unpin_and_remove ( p->cq = NULL; workqueue_destroy(&cq); } + // just a sanity check + assert(nb_mutex_users(&p->disk_nb_mutex) == 0); + assert(p->cloned_value_data == NULL); //Remove pair. cachetable_free_pair(ct, p); r = 0; @@ -3328,6 +3601,7 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) { // written to disk before it can be modified.) 
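/*
 * The pending-bit protocol toku_cachetable_begin_checkpoint establishes,
 * restated as a sketch:
 *
 *   // begin_checkpoint, holding ct->pending_lock for WRITE:
 *   //   every PAIR that is dirty or value-locked gets
 *   //   p->checkpoint_pending = TRUE and is linked into the pending list.
 *   //
 *   // clearing the bit later requires the PAIR's value_nb_mutex plus a
 *   // READ lock on ct->pending_lock (or knowing a checkpoint is already
 *   // in progress), so a set and a clear can never race.
 */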
{ + brt_begin_checkpoint(); unsigned i; if (logger) { // Unpin all 'inprogress rollback log nodes' pinned by transactions int r = toku_omt_iterate(logger->live_txns, @@ -3452,6 +3726,13 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) { rwlock_write_lock(&ct->pending_lock, ct->mutex); ct->checkpoint_is_beginning = TRUE; // detect threadsafety bugs, must set checkpoint_is_beginning ... invariant(ct->checkpoint_prohibited == 0); // ... before testing checkpoint_prohibited + invariant(ct->n_checkpoint_clones_running == 0); + u_int64_t leaf_sum = 0; + u_int64_t nonleaf_sum = 0; + u_int64_t rollback_sum = 0; + u_int64_t maybe_leaf_sum = 0; + u_int64_t maybe_nonleaf_sum = 0; + u_int64_t maybe_rollback_sum = 0; for (i=0; i < ct->table_size; i++) { PAIR p; for (p = ct->table[i]; p; p=p->hash_chain) { @@ -3469,7 +3750,17 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) { // BOTH the cachetable lock and the PAIR lock. Otherwise, // we may end up clearing the pending bit before the // current lock is ever released. - if (p->dirty || nb_mutex_writers(&p->nb_mutex)) { + if (p->dirty || nb_mutex_writers(&p->value_nb_mutex)) { + if (p->dirty) { + leaf_sum += p->attr.leaf_size; + nonleaf_sum += p->attr.nonleaf_size; + rollback_sum += p->attr.rollback_size; + } + else { + maybe_leaf_sum += p->attr.leaf_size; + maybe_nonleaf_sum += p->attr.nonleaf_size; + maybe_rollback_sum += p->attr.rollback_size; + } p->checkpoint_pending = TRUE; if (ct->pending_head) { ct->pending_head->pending_prev = p; @@ -3481,6 +3772,15 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) { } } } + if (0) { + printf("leaf_sum: %"PRIu64"\n", leaf_sum); + printf("nonleaf_sum: %"PRIu64"\n", nonleaf_sum); + printf("rollback_sum: %"PRIu64"\n", rollback_sum); + printf("maybe_leaf_sum: %"PRIu64"\n", maybe_leaf_sum); + printf("maybe_nonleaf: %"PRIu64"\n", maybe_nonleaf_sum); + printf("maybe_rollback: %"PRIu64"\n", maybe_rollback_sum); + printf("*****************************\n"); + } rwlock_write_unlock(&ct->pending_lock); if (0 && (npending > 0 || ct->checkpoint_num_files > 0 || ct->checkpoint_num_txns > 0)) { fprintf(stderr, "%s:%d pending=%u %u files=%u txns=%u\n", __FUNCTION__, __LINE__, npending, ct->n_in_table, ct->checkpoint_num_files, ct->checkpoint_num_txns); @@ -3533,30 +3833,35 @@ toku_cachetable_end_checkpoint(CACHETABLE ct, TOKULOGGER logger, // // #TODO: #1424 Long-lived get and pin (held by cursor) will cause a deadlock here. // Need some solution (possibly modify requirement for write lock or something else). - PAIR p; - while ((p = ct->pending_head)!=0) { + PAIR p; + while ((p = ct->pending_head)!=0) { ct->pending_head = ct->pending_head->pending_next; pending_pairs_remove(ct, p); - write_pair_for_checkpoint(ct, p); // if still pending, clear the pending bit and write out the node - // Don't need to unlock and lock cachetable, because the cachetable was unlocked and locked while the flush callback ran. - } + write_pair_for_checkpoint_thread(ct, p); // if still pending, clear the pending bit and write out the node + // Don't need to unlock and lock cachetable, because the cachetable was unlocked and locked while the flush callback ran. 
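/*
 * The clone accounting that makes the wait just below correct, assembled
 * from the pieces this patch adds:
 *
 *   // client thread, in write_locked_pair_for_checkpoint, under the ct
 *   // lock, after clone_pair():
 *   ct->n_checkpoint_clones_running++;
 *   checkpoint_cloned_pair_on_writer_thread(ct, p);  // enqueue the write
 *
 *   // checkpoint worker thread, in checkpoint_cloned_pair():
 *   cachetable_only_write_locked_data(ct, p, TRUE, &attr, TRUE);
 *   ct->n_checkpoint_clones_running--;
 *   if (ct->n_checkpoint_clones_running == 0)
 *       toku_pthread_cond_broadcast(&ct->clones_background_wait);
 *
 *   // end_checkpoint, below: sleep until every clone write has finished
 *   while (ct->n_checkpoint_clones_running > 0)
 *       toku_pthread_cond_wait(&ct->clones_background_wait, ct->mutex);
 */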
+ } } - assert(!ct->pending_head); + assert(!ct->pending_head); + while (ct->n_checkpoint_clones_running > 0) { + int r = toku_pthread_cond_wait(&ct->clones_background_wait, ct->mutex); + assert(r==0); + } + assert(ct->n_checkpoint_clones_running == 0); { // have just written data blocks, so next write the translation and header for each open dictionary - CACHEFILE cf; + CACHEFILE cf; //cachefiles_in_checkpoint is protected by the checkpoint_safe_lock - for (cf = ct->cachefiles_in_checkpoint; cf; cf=cf->next_in_checkpoint) { - if (cf->checkpoint_userdata) { + for (cf = ct->cachefiles_in_checkpoint; cf; cf=cf->next_in_checkpoint) { + if (cf->checkpoint_userdata) { rwlock_prefer_read_lock(&cf->fdlock, ct->mutex); if (!logger || ct->lsn_of_checkpoint_in_progress.lsn != cf->most_recent_global_checkpoint_that_finished_early.lsn) { assert(ct->lsn_of_checkpoint_in_progress.lsn >= cf->most_recent_global_checkpoint_that_finished_early.lsn); cachetable_unlock(ct); assert(cf->checkpoint_state == CS_CALLED_BEGIN_CHECKPOINT); - toku_cachetable_set_checkpointing_user_data_status(1); + toku_cachetable_set_checkpointing_user_data_status(1); int r = cf->checkpoint_userdata(cf, cf->fd, cf->userdata); - toku_cachetable_set_checkpointing_user_data_status(0); + toku_cachetable_set_checkpointing_user_data_status(0); assert(r==0); cf->checkpoint_state = CS_CALLED_CHECKPOINT; cachetable_lock(ct); @@ -3565,17 +3870,17 @@ toku_cachetable_end_checkpoint(CACHETABLE ct, TOKULOGGER logger, assert(cf->checkpoint_state == CS_NOT_IN_PROGRESS); } rwlock_read_unlock(&cf->fdlock); - } - } + } + } } { // everything has been written to file (or at least OS internal buffer)... - // ... so fsync and call checkpoint-end function in block translator - // to free obsolete blocks on disk used by previous checkpoint - CACHEFILE cf; + // ... so fsync and call checkpoint-end function in block translator + // to free obsolete blocks on disk used by previous checkpoint + CACHEFILE cf; //cachefiles_in_checkpoint is protected by the checkpoint_safe_lock - for (cf = ct->cachefiles_in_checkpoint; cf; cf=cf->next_in_checkpoint) { - if (cf->end_checkpoint_userdata) { + for (cf = ct->cachefiles_in_checkpoint; cf; cf=cf->next_in_checkpoint) { + if (cf->end_checkpoint_userdata) { rwlock_prefer_read_lock(&cf->fdlock, ct->mutex); if (!logger || ct->lsn_of_checkpoint_in_progress.lsn != cf->most_recent_global_checkpoint_that_finished_early.lsn) { assert(ct->lsn_of_checkpoint_in_progress.lsn >= cf->most_recent_global_checkpoint_that_finished_early.lsn); @@ -3589,8 +3894,8 @@ toku_cachetable_end_checkpoint(CACHETABLE ct, TOKULOGGER logger, } assert(cf->checkpoint_state == CS_NOT_IN_PROGRESS); rwlock_read_unlock(&cf->fdlock); - } - } + } + } } cachetable_unlock(ct); @@ -3598,7 +3903,7 @@ toku_cachetable_end_checkpoint(CACHETABLE ct, TOKULOGGER logger, //Delete list of cachefiles in the checkpoint, //remove reference //clear bit saying they're in checkpoint - CACHEFILE cf; + CACHEFILE cf; //cachefiles_in_checkpoint is protected by the checkpoint_safe_lock while ((cf = ct->cachefiles_in_checkpoint)) { ct->cachefiles_in_checkpoint = cf->next_in_checkpoint; @@ -3620,19 +3925,20 @@ toku_cachetable_end_checkpoint(CACHETABLE ct, TOKULOGGER logger, // For testing purposes only. Dictionary has been fsync-ed to disk but log has not yet been written. 
if (testcallback_f) - testcallback_f(testextra); + testcallback_f(testextra); if (logger) { - int r = toku_log_end_checkpoint(logger, NULL, - 1, // want the end_checkpoint to be fsync'd - ct->lsn_of_checkpoint_in_progress.lsn, - 0, - ct->checkpoint_num_files, - ct->checkpoint_num_txns); - assert(r==0); - toku_logger_note_checkpoint(logger, ct->lsn_of_checkpoint_in_progress); + int r = toku_log_end_checkpoint(logger, NULL, + 1, // want the end_checkpoint to be fsync'd + ct->lsn_of_checkpoint_in_progress.lsn, + 0, + ct->checkpoint_num_files, + ct->checkpoint_num_txns); + assert(r==0); + toku_logger_note_checkpoint(logger, ct->lsn_of_checkpoint_in_progress); } + brt_end_checkpoint(); panic: return retval; } @@ -3645,48 +3951,6 @@ FILENUM toku_cachefile_filenum (CACHEFILE cf) { return cf->filenum; } - -// Worker thread function to write a pair from memory to its cachefile -// As of now, the writer thread NEVER evicts, hence passing FALSE -// for the third parameter to cachetable_write_pair -static void cachetable_writer(WORKITEM wi) { - PAIR p = workitem_arg(wi); - CACHETABLE ct = p->cachefile->cachetable; - cachetable_lock(ct); - cachetable_write_pair(ct, p, p->remove_me); - cachetable_unlock(ct); -} - -// Worker thread function to read a pair from a cachefile to memory -static void cachetable_reader(WORKITEM wi) { - struct cachefile_prefetch_args* cpargs = workitem_arg(wi); - CACHETABLE ct = cpargs->p->cachefile->cachetable; - cachetable_lock(ct); - // TODO: find a way to properly pass some information for read_extraargs - // This is only called in toku_cachefile_prefetch, by putting it on a workqueue - // The problem is described in comments in toku_cachefile_prefetch - cachetable_fetch_pair( - ct, - cpargs->p->cachefile, - cpargs->p, - cpargs->fetch_callback, - cpargs->read_extraargs, - FALSE - ); - cachetable_unlock(ct); - toku_free(cpargs); -} - -static void cachetable_partial_reader(WORKITEM wi) { - struct cachefile_partial_prefetch_args *cpargs = workitem_arg(wi); - CACHETABLE ct = cpargs->p->cachefile->cachetable; - cachetable_lock(ct); - do_partial_fetch(ct, cpargs->p->cachefile, cpargs->p, cpargs->pf_callback, cpargs->read_extraargs, FALSE); - cachetable_unlock(ct); - toku_free(cpargs); -} - - // debug functions int toku_cachetable_assert_all_unpinned (CACHETABLE ct) { @@ -3696,9 +3960,9 @@ int toku_cachetable_assert_all_unpinned (CACHETABLE ct) { for (i=0; itable_size; i++) { PAIR p; for (p=ct->table[i]; p; p=p->hash_chain) { - assert(nb_mutex_writers(&p->nb_mutex)>=0); - if (nb_mutex_writers(&p->nb_mutex)) { - //printf("%s:%d pinned: %"PRId64" (%p)\n", __FILE__, __LINE__, p->key.b, p->value); + assert(nb_mutex_writers(&p->value_nb_mutex)>=0); + if (nb_mutex_writers(&p->value_nb_mutex)) { + //printf("%s:%d pinned: %"PRId64" (%p)\n", __FILE__, __LINE__, p->key.b, p->value_data); some_pinned=1; } } @@ -3714,9 +3978,9 @@ int toku_cachefile_count_pinned (CACHEFILE cf, int print_them) { cachetable_lock(ct); for (struct toku_list *next_pair = cf->pairs_for_cachefile.next; next_pair != &cf->pairs_for_cachefile; next_pair = next_pair->next) { PAIR p = toku_list_struct(next_pair, struct ctpair, next_for_cachefile); - assert(nb_mutex_writers(&p->nb_mutex) >= 0); - if (nb_mutex_writers(&p->nb_mutex)) { - if (print_them) printf("%s:%d pinned: %"PRId64" (%p)\n", __FILE__, __LINE__, p->key.b, p->value); + assert(nb_mutex_writers(&p->value_nb_mutex) >= 0); + if (nb_mutex_writers(&p->value_nb_mutex)) { + if (print_them) printf("%s:%d pinned: %"PRId64" (%p)\n", __FILE__, __LINE__, p->key.b, 
p->value_data); n_pinned++; } } @@ -3732,7 +3996,7 @@ void toku_cachetable_print_state (CACHETABLE ct) { if (p != 0) { printf("t[%u]=", i); for (p=ct->table[i]; p; p=p->hash_chain) { - printf(" {%"PRId64", %p, dirty=%d, pin=%d, size=%ld}", p->key.b, p->cachefile, (int) p->dirty, nb_mutex_writers(&p->nb_mutex), p->attr.size); + printf(" {%"PRId64", %p, dirty=%d, pin=%d, size=%ld}", p->key.b, p->cachefile, (int) p->dirty, nb_mutex_writers(&p->value_nb_mutex), p->attr.size); } printf("\n"); } @@ -3767,11 +4031,11 @@ int toku_cachetable_get_key_state (CACHETABLE ct, CACHEKEY key, CACHEFILE cf, vo if (p->key.b == key.b && p->cachefile == cf) { //note_hash_count(count); if (value_ptr) - *value_ptr = p->value; + *value_ptr = p->value_data; if (dirty_ptr) *dirty_ptr = p->dirty; if (pin_ptr) - *pin_ptr = nb_mutex_writers(&p->nb_mutex); + *pin_ptr = nb_mutex_writers(&p->value_nb_mutex); if (size_ptr) *size_ptr = p->attr.size; r = 0; @@ -3951,7 +4215,7 @@ toku_cleaner_thread (void *cachetable_v) // - this is how a thread that is calling unpin_and_remove will prevent // the cleaner thread from picking its PAIR (see comments in that function) do { - if (nb_mutex_users(&ct->cleaner_head->nb_mutex) > 0 || ct->cleaner_head->cachefile->is_flushing) { + if (nb_mutex_users(&ct->cleaner_head->value_nb_mutex) > 0 || ct->cleaner_head->cachefile->is_flushing) { goto next_pair; } n_seen++; @@ -3968,7 +4232,7 @@ toku_cleaner_thread (void *cachetable_v) // that is, best_pair != NULL, we do the clean // if (best_pair) { - nb_mutex_write_lock(&best_pair->nb_mutex, ct->mutex); + nb_mutex_write_lock(&best_pair->value_nb_mutex, ct->mutex); // verify a key assumption. assert(cleaner_thread_rate_pair(best_pair) > 0); // the order of operations for these two pieces is important @@ -4006,7 +4270,7 @@ toku_cleaner_thread (void *cachetable_v) cleaner_thread_rate_pair(best_pair) > 0) { cachetable_unlock(ct); - int r = best_pair->cleaner_callback(best_pair->value, + int r = best_pair->cleaner_callback(best_pair->value_data, best_pair->key, best_pair->fullhash, best_pair->write_extraargs); @@ -4019,7 +4283,7 @@ toku_cleaner_thread (void *cachetable_v) // don't need to unlock it if the cleaner callback is called. if (!cleaner_callback_called) { assert(!best_pair->cq); - nb_mutex_write_unlock(&best_pair->nb_mutex); + nb_mutex_write_unlock(&best_pair->value_nb_mutex); } rwlock_read_unlock(&cf->fdlock); // We need to make sure the cachefile sticks around so a close diff --git a/newbrt/cachetable.h b/newbrt/cachetable.h index 3b339f4d6e2..c61208f3081 100644 --- a/newbrt/cachetable.h +++ b/newbrt/cachetable.h @@ -130,14 +130,14 @@ enum cachetable_dirty { // When for_checkpoint is true, this was a 'pending' write // Returns: 0 if success, otherwise an error number. // Can access fd (fd is protected by a readlock during call) -typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void *value, void *write_extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint); +typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void *value, void **disk_data, void *write_extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint, BOOL is_clone); // The fetch callback is called when a thread is attempting to get and pin a memory // object and it is not in the cachetable. // Returns: 0 if success, otherwise an error number. The address and size of the object // associated with the key are returned. 
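/*
 * A minimal skeleton of a fetch callback under the widened signature;
 * deserialize_node and the other helpers are hypothetical stand-ins for
 * the brt layer:
 *
 *   static int example_fetch_cb(CACHEFILE cf, int fd, CACHEKEY key,
 *                               u_int32_t fullhash, void **value_data,
 *                               void **disk_data, PAIR_ATTR *sizep,
 *                               int *dirtyp, void *read_extraargs) {
 *       *value_data = deserialize_node(fd, key);   // in-memory object
 *       *disk_data  = disk_layout_of(*value_data); // e.g. BRTNODE_DISK_DATA,
 *                                                  // kept for partial fetches
 *       *sizep  = attr_of(*value_data);            // memory accounting
 *       *dirtyp = 0;
 *       return 0;
 *   }
 */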
 // Can access fd (fd is protected by a readlock during call)
-typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, PAIR_ATTR *sizep, int *dirtyp, void *read_extraargs);
+typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value_data, void **disk_data, PAIR_ATTR *sizep, int *dirtyp, void *read_extraargs);
 
 // The cachetable calls the partial eviction estimate callback to determine if
 // partial eviction is a cheap operation that may be called on the client thread
@@ -147,7 +147,7 @@ typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int3
 // to return an estimate of the number of bytes it will free
 // so that the cachetable can estimate how much data is being evicted on background threads.
 // If cost is PE_CHEAP, then the callback does not set bytes_freed_estimate.
-typedef void (*CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK)(void *brtnode_pv, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void *write_extraargs);
+typedef void (*CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK)(void *brtnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void *write_extraargs);
 
 // The cachetable calls the partial eviction callback to try to partially evict pieces
 // of the PAIR. The callback determines the strategy for what to evict. The callback may choose to free
@@ -173,16 +173,19 @@ typedef BOOL (*CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK)(void *brtnode_pv, voi
 // The new PAIR_ATTR of the PAIR is returned in sizep
 // Can access fd (fd is protected by a readlock during call)
 // Returns: 0 if success, otherwise an error number.
-typedef int (*CACHETABLE_PARTIAL_FETCH_CALLBACK)(void *brtnode_pv, void *read_extraargs, int fd, PAIR_ATTR *sizep);
+typedef int (*CACHETABLE_PARTIAL_FETCH_CALLBACK)(void *value_data, void* disk_data, void *read_extraargs, int fd, PAIR_ATTR *sizep);
 
 // TODO(leif) XXX TODO XXX
 typedef int (*CACHETABLE_CLEANER_CALLBACK)(void *brtnode_pv, BLOCKNUM blocknum, u_int32_t fullhash, void *write_extraargs);
 
+typedef void (*CACHETABLE_CLONE_CALLBACK)(void* value_data, void** cloned_value_data, PAIR_ATTR* new_attr, BOOL for_checkpoint, void* write_extraargs);
+
 typedef struct {
     CACHETABLE_FLUSH_CALLBACK flush_callback;
     CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK pe_est_callback;
     CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback;
     CACHETABLE_CLEANER_CALLBACK cleaner_callback;
+    CACHETABLE_CLONE_CALLBACK clone_callback;
     void* write_extraargs; // parameter for flush_callback, pe_est_callback, pe_callback, and cleaner_callback
 } CACHETABLE_WRITE_CALLBACK;
@@ -262,6 +265,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
     CACHETABLE_FETCH_CALLBACK fetch_callback,
     CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
     CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
+    BOOL may_modify_value,
     void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
     u_int32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
     CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs
@@ -286,9 +290,20 @@ int toku_cachetable_get_and_pin (
     CACHETABLE_FETCH_CALLBACK fetch_callback,
     CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
     CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
+    BOOL may_modify_value,
     void* read_extraargs // parameter for fetch_callback, pf_req_callback, and pf_callback
 );
 
+// does partial fetch on a pinned pair
+void
toku_cachetable_pf_pinned_pair( + void* value, + CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, + void* read_extraargs, + CACHEFILE cf, + CACHEKEY key, + u_int32_t fullhash + ); + struct unlockers { BOOL locked; void (*f)(void*extra); @@ -309,6 +324,7 @@ int toku_cachetable_get_and_pin_nonblocking ( CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback __attribute__((unused)), CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback __attribute__((unused)), + BOOL may_modify_value, void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback UNLOCKERS unlockers ); diff --git a/newbrt/checkpoint.c b/newbrt/checkpoint.c index ecd425c7784..d992baeec06 100644 --- a/newbrt/checkpoint.c +++ b/newbrt/checkpoint.c @@ -310,9 +310,9 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger, SET_CHECKPOINT_FOOTPRINT(40); if (r==0) { - if (callback_f) - callback_f(extra); // callback is called with checkpoint_safe_lock still held - r = toku_cachetable_end_checkpoint(ct, logger, ydb_lock, ydb_unlock, callback2_f, extra2); + if (callback_f) + callback_f(extra); // callback is called with checkpoint_safe_lock still held + r = toku_cachetable_end_checkpoint(ct, logger, ydb_lock, ydb_unlock, callback2_f, extra2); } SET_CHECKPOINT_FOOTPRINT(50); if (r==0 && logger) { diff --git a/newbrt/fifo.c b/newbrt/fifo.c index 3750f4e3d30..add248c3a2a 100644 --- a/newbrt/fifo.c +++ b/newbrt/fifo.c @@ -226,3 +226,19 @@ DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry) { const struct fifo_entry *toku_fifo_get_entry(FIFO fifo, long off) { return toku_fifo_iterate_internal_get_entry(fifo, off); } + +void toku_fifo_clone(FIFO orig_fifo, FIFO* cloned_fifo) { + struct fifo *XMALLOC(new_fifo); + assert(new_fifo); + new_fifo->n_items_in_fifo = orig_fifo->n_items_in_fifo; + new_fifo->memory_start = 0; + new_fifo->memory_used = orig_fifo->memory_used - orig_fifo->memory_start; + new_fifo->memory_size = new_fifo->memory_used; + new_fifo->memory = toku_xmalloc(new_fifo->memory_size); + memcpy( + new_fifo->memory, + orig_fifo->memory + orig_fifo->memory_start, + new_fifo->memory_size + ); + *cloned_fifo = new_fifo; +} diff --git a/newbrt/fifo.h b/newbrt/fifo.h index c51b9fbc7f1..39d4af4423e 100644 --- a/newbrt/fifo.h +++ b/newbrt/fifo.h @@ -110,6 +110,8 @@ struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off); DBT *fill_dbt_for_fifo_entry(DBT *dbt, const struct fifo_entry *entry); const struct fifo_entry *toku_fifo_get_entry(FIFO fifo, long off); +void toku_fifo_clone(FIFO orig_fifo, FIFO* cloned_fifo); + #if defined(__cplusplus) || defined(__cilkplusplus) }; #endif diff --git a/newbrt/mempool.c b/newbrt/mempool.c index 5326c9d9af4..ce0d82b013c 100644 --- a/newbrt/mempool.c +++ b/newbrt/mempool.c @@ -137,3 +137,11 @@ size_t toku_mempool_footprint(struct mempool *mp) { size_t rval = toku_memory_footprint(base, touched); return rval; } + +void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp) { + new_mp->frag_size = orig_mp->frag_size; + new_mp->free_offset = orig_mp->free_offset; + new_mp->size = orig_mp->free_offset; // only make the cloned mempool store what is needed + new_mp->base = toku_xmalloc(new_mp->size); + memcpy(new_mp->base, orig_mp->base, new_mp->size); +} diff --git a/newbrt/mempool.h b/newbrt/mempool.h index 06497f8a06e..fbce5b1c56b 100644 --- a/newbrt/mempool.h +++ b/newbrt/mempool.h @@ -83,6 +83,8 @@ static inline int toku_mempool_inrange(struct mempool *mp, void *vp, size_t size /* get memory 
footprint */ size_t toku_mempool_footprint(struct mempool *mp); +void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp); + #if defined(__cplusplus) || defined(__cilkplusplus) }; #endif diff --git a/newbrt/rollback.c b/newbrt/rollback.c index 5bdb6eb1e51..778a2c56640 100644 --- a/newbrt/rollback.c +++ b/newbrt/rollback.c @@ -492,8 +492,8 @@ toku_rollback_log_free(ROLLBACK_LOG_NODE *log_p) { } static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, - void *rollback_v, void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, - BOOL write_me, BOOL keep_me, BOOL for_checkpoint) { + void *rollback_v, void** UU(disk_data), void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, + BOOL write_me, BOOL keep_me, BOOL for_checkpoint, BOOL UU(is_clone)) { int r; ROLLBACK_LOG_NODE log = rollback_v; struct brt_header *h = extraargs; @@ -524,7 +524,7 @@ static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM } static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, u_int32_t fullhash, - void **rollback_pv, PAIR_ATTR *sizep, int * UU(dirtyp), void *extraargs) { + void **rollback_pv, void** UU(disk_data), PAIR_ATTR *sizep, int * UU(dirtyp), void *extraargs) { int r; struct brt_header *h = extraargs; assert(h->cf == cachefile); @@ -539,6 +539,7 @@ static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM l static void toku_rollback_pe_est_callback( void* rollback_v, + void* UU(disk_data), long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* UU(write_extraargs) @@ -565,7 +566,7 @@ static BOOL toku_rollback_pf_req_callback(void* UU(brtnode_pv), void* UU(read_ex return FALSE; } -static int toku_rollback_pf_callback(void* UU(brtnode_pv), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { +static int toku_rollback_pf_callback(void* UU(brtnode_pv), void* UU(disk_data), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { // should never be called, given that toku_rollback_pf_req_callback always returns false assert(FALSE); return 0; @@ -588,6 +589,7 @@ static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_rollback_log(str wc.pe_est_callback = toku_rollback_pe_est_callback; wc.pe_callback = toku_rollback_pe_callback; wc.cleaner_callback = toku_rollback_cleaner_callback; + wc.clone_callback = NULL; wc.write_extraargs = h; return wc; } @@ -873,6 +875,7 @@ int toku_get_and_pin_rollback_log(TOKUTXN txn, TXNID xid, uint64_t sequence, BLO toku_rollback_fetch_callback, toku_rollback_pf_req_callback, toku_rollback_pf_callback, + TRUE, // may_modify_value h ); assert(r==0); diff --git a/newbrt/tests/brt-bfe-query.c b/newbrt/tests/brt-bfe-query.c index aa593079d77..8e5c92b4c84 100644 --- a/newbrt/tests/brt-bfe-query.c +++ b/newbrt/tests/brt-bfe-query.c @@ -36,24 +36,26 @@ test_prefetch_read(int fd, BRT UU(brt), struct brt_header *brt_h) { // disable_prefetching to TRUE cursor->disable_prefetching = TRUE; fill_bfe_for_prefetch(&bfe, brt_h, cursor); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + BRTNODE_DISK_DATA ndd = NULL; + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_ON_DISK); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == 
PT_ON_DISK); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_ON_DISK); destroy_bfe_for_prefetch(&bfe); toku_brtnode_free(&dn); + toku_free(ndd); // now enable prefetching again cursor->disable_prefetching = FALSE; fill_bfe_for_prefetch(&bfe, brt_h, cursor); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_AVAIL); @@ -63,18 +65,19 @@ test_prefetch_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_COMPRESSED); assert(BP_STATE(dn,1) == PT_COMPRESSED); assert(BP_STATE(dn,2) == PT_COMPRESSED); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_AVAIL); assert(BP_STATE(dn,1) == PT_AVAIL); assert(BP_STATE(dn,2) == PT_AVAIL); destroy_bfe_for_prefetch(&bfe); toku_brtnode_free(&dn); + toku_free(ndd); u_int64_t left_key = 150; toku_fill_dbt(&cursor->range_lock_left_key, &left_key, sizeof(u_int64_t)); cursor->left_is_neg_infty = FALSE; fill_bfe_for_prefetch(&bfe, brt_h, cursor); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_ON_DISK); @@ -84,18 +87,19 @@ test_prefetch_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_COMPRESSED); assert(BP_STATE(dn,2) == PT_COMPRESSED); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_AVAIL); assert(BP_STATE(dn,2) == PT_AVAIL); destroy_bfe_for_prefetch(&bfe); toku_brtnode_free(&dn); + toku_free(ndd); u_int64_t right_key = 151; toku_fill_dbt(&cursor->range_lock_right_key, &right_key, sizeof(u_int64_t)); cursor->right_is_pos_infty = FALSE; fill_bfe_for_prefetch(&bfe, brt_h, cursor); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_ON_DISK); @@ -105,17 +109,18 @@ test_prefetch_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_COMPRESSED); assert(BP_STATE(dn,2) == PT_ON_DISK); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_AVAIL); assert(BP_STATE(dn,2) == PT_ON_DISK); destroy_bfe_for_prefetch(&bfe); toku_brtnode_free(&dn); + toku_free(ndd); left_key = 100000; right_key = 100000; fill_bfe_for_prefetch(&bfe, brt_h, cursor); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_ON_DISK); @@ -125,17 +130,18 @@ test_prefetch_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_COMPRESSED); 
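// ---------------------------------------------------------------------------
// [Editorial note -- not part of the patch.] Each deserialize/partial-fetch
// round trip in these tests now manages the new BRTNODE_DISK_DATA out-param:
// toku_deserialize_brtnode_from creates it, toku_brtnode_pf_callback consults
// it to locate partitions on disk, and the caller frees it separately from the
// node itself. The distilled pattern (fd, bfe, and attr as in the test above):
BRTNODE dn = NULL;
BRTNODE_DISK_DATA ndd = NULL;
int r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0 /*hash*/, &dn, &ndd, &bfe);
assert(r == 0);
r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); // partial fetch reads via ndd
toku_brtnode_free(&dn); // frees the node...
toku_free(ndd);         // ...but ndd must be freed by the caller afterwards
// ---------------------------------------------------------------------------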
- r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_AVAIL); destroy_bfe_for_prefetch(&bfe); + toku_free(ndd); toku_brtnode_free(&dn); left_key = 100; right_key = 100; fill_bfe_for_prefetch(&bfe, brt_h, cursor); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_AVAIL); @@ -145,12 +151,13 @@ test_prefetch_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_COMPRESSED); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_ON_DISK); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_AVAIL); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_ON_DISK); destroy_bfe_for_prefetch(&bfe); toku_brtnode_free(&dn); + toku_free(ndd); toku_free(cursor); } @@ -161,6 +168,7 @@ test_subset_read(int fd, BRT UU(brt), struct brt_header *brt_h) { brt_h->compare_fun = int64_key_cmp; BRT_CURSOR cursor = toku_malloc(sizeof *cursor); BRTNODE dn = NULL; + BRTNODE_DISK_DATA ndd = NULL; PAIR_ATTR attr; // first test that prefetching everything should work @@ -191,7 +199,7 @@ test_subset_read(int fd, BRT UU(brt), struct brt_header *brt_h) { // set disable_prefetching ON bfe.child_to_read = 2; bfe.disable_prefetching = TRUE; - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_ON_DISK); @@ -206,16 +214,17 @@ test_subset_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_COMPRESSED); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_ON_DISK); assert(BP_STATE(dn,2) == PT_AVAIL); toku_brtnode_free(&dn); + toku_free(ndd); // fake the childnum to read bfe.child_to_read = 2; bfe.disable_prefetching = FALSE; - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_ON_DISK); @@ -230,15 +239,16 @@ test_subset_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_COMPRESSED); assert(BP_STATE(dn,2) == PT_COMPRESSED); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_ON_DISK); assert(BP_STATE(dn,1) == PT_AVAIL); assert(BP_STATE(dn,2) == PT_AVAIL); toku_brtnode_free(&dn); + toku_free(ndd); // fake the childnum to read bfe.child_to_read = 0; - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); assert(r==0); assert(dn->n_children == 3); assert(BP_STATE(dn,0) == PT_AVAIL); @@ 
-253,11 +263,12 @@ test_subset_read(int fd, BRT UU(brt), struct brt_header *brt_h) { assert(BP_STATE(dn,0) == PT_COMPRESSED); assert(BP_STATE(dn,1) == PT_COMPRESSED); assert(BP_STATE(dn,2) == PT_ON_DISK); - r = toku_brtnode_pf_callback(dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(dn, ndd, &bfe, fd, &attr); assert(BP_STATE(dn,0) == PT_AVAIL); assert(BP_STATE(dn,1) == PT_AVAIL); assert(BP_STATE(dn,2) == PT_ON_DISK); toku_brtnode_free(&dn); + toku_free(ndd); toku_free(cursor); } @@ -345,8 +356,8 @@ test_prefetching(void) { assert(offset == BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); assert(size == 100); } - - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); + BRTNODE_DISK_DATA ndd = NULL; + r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, &ndd, TRUE, brt->h, 1, 1, FALSE); assert(r==0); test_prefetch_read(fd, brt, brt_h); @@ -365,6 +376,7 @@ test_prefetching(void) { toku_blocktable_destroy(&brt_h->blocktable); toku_free(brt_h); toku_free(brt); + toku_free(ndd); r = close(fd); assert(r != -1); } diff --git a/newbrt/tests/brt-clock-test.c b/newbrt/tests/brt-clock-test.c index 2961d32a59a..b0f279948e2 100644 --- a/newbrt/tests/brt-clock-test.c +++ b/newbrt/tests/brt-clock-test.c @@ -67,7 +67,8 @@ test1(int fd, struct brt_header *brt_h, BRTNODE *dn) { struct brtnode_fetch_extra bfe_all; brt_h->compare_fun = string_key_cmp; fill_bfe_for_full_read(&bfe_all, brt_h); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe_all); + BRTNODE_DISK_DATA ndd = NULL; + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_all); BOOL is_leaf = ((*dn)->height == 0); assert(r==0); for (int i = 0; i < (*dn)->n_children; i++) { @@ -93,7 +94,7 @@ test1(int fd, struct brt_header *brt_h, BRTNODE *dn) { PAIR_ATTR size; BOOL req = toku_brtnode_pf_req_callback(*dn, &bfe_all); assert(req); - toku_brtnode_pf_callback(*dn, &bfe_all, fd, &size); + toku_brtnode_pf_callback(*dn, ndd, &bfe_all, fd, &size); toku_brtnode_pe_callback(*dn, attr, &attr, NULL); for (int i = 0; i < (*dn)->n_children; i++) { assert(BP_STATE(*dn,i) == PT_AVAIL); @@ -111,7 +112,7 @@ test1(int fd, struct brt_header *brt_h, BRTNODE *dn) { req = toku_brtnode_pf_req_callback(*dn, &bfe_all); assert(req); - toku_brtnode_pf_callback(*dn, &bfe_all, fd, &size); + toku_brtnode_pf_callback(*dn, ndd, &bfe_all, fd, &size); toku_brtnode_pe_callback(*dn, attr, &attr, NULL); for (int i = 0; i < (*dn)->n_children; i++) { assert(BP_STATE(*dn,i) == PT_AVAIL); @@ -124,7 +125,7 @@ test1(int fd, struct brt_header *brt_h, BRTNODE *dn) { for (int i = 0; i < (*dn)->n_children; i++) { assert(BP_STATE(*dn,i) == PT_AVAIL); } - + toku_free(ndd); toku_brtnode_free(dn); } @@ -160,8 +161,8 @@ test2(int fd, struct brt_header *brt_h, BRTNODE *dn) { TRUE, FALSE ); - - int r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe_subset); + BRTNODE_DISK_DATA ndd = NULL; + int r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_subset); assert(r==0); BOOL is_leaf = ((*dn)->height == 0); // at this point, although both partitions are available, only the @@ -182,13 +183,13 @@ test2(int fd, struct brt_header *brt_h, BRTNODE *dn) { BOOL req = toku_brtnode_pf_req_callback(*dn, &bfe_subset); assert(req); - toku_brtnode_pf_callback(*dn, &bfe_subset, fd, &attr); + toku_brtnode_pf_callback(*dn, ndd, &bfe_subset, fd, &attr); assert(BP_STATE(*dn, 0) == PT_AVAIL); assert(BP_STATE(*dn, 1) == PT_AVAIL); 
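// ---------------------------------------------------------------------------
// [Editorial note -- not part of the patch.] In the cachetable tests further
// below, every toku_cachetable_get_and_pin* call takes a new BOOL
// may_modify_value argument just before read_extraargs; these tests all pass
// TRUE. The patch does not document FALSE here, but the name suggests a
// read-only pin. Distilled call shape, using the tests' own helpers:
void *v = NULL;
long s = 0;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
int r = toku_cachetable_get_and_pin(
    f1, make_blocknum(1), 1, &v, &s, wc,
    def_fetch, def_pf_req_callback, def_pf_callback,
    TRUE, // may_modify_value -- new in this patch
    NULL  // read_extraargs
);
assert(r == 0);
r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
// ---------------------------------------------------------------------------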
assert(BP_SHOULD_EVICT(*dn, 0)); assert(!BP_SHOULD_EVICT(*dn, 1)); - + toku_free(ndd); toku_brtnode_free(dn); } @@ -206,8 +207,8 @@ test3_leaf(int fd, struct brt_header *brt_h, BRTNODE *dn) { &bfe_min, brt_h ); - - int r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe_min); + BRTNODE_DISK_DATA ndd = NULL; + int r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_min); assert(r==0); // // make sure we have a leaf @@ -217,6 +218,7 @@ test3_leaf(int fd, struct brt_header *brt_h, BRTNODE *dn) { assert(BP_STATE(*dn, i) == PT_ON_DISK); } toku_brtnode_free(dn); + toku_free(ndd); } static void @@ -296,8 +298,8 @@ test_serialize_nonleaf(void) { assert(offset == BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); assert(size == 100); } - - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); + BRTNODE_DISK_DATA ndd = NULL; + r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, &ndd, TRUE, brt->h, 1, 1, FALSE); assert(r==0); test1(fd, brt_h, &dn); @@ -309,6 +311,7 @@ test_serialize_nonleaf(void) { destroy_nonleaf_childinfo(BNC(&sn, 1)); toku_free(sn.bp); toku_free(sn.childkeys); + toku_free(ndd); toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); toku_brtheader_destroy_treelock(brt_h); @@ -382,8 +385,8 @@ test_serialize_leaf(void) { assert(offset == BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); assert(size == 100); } - - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); + BRTNODE_DISK_DATA ndd = NULL; + r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, &ndd, TRUE, brt->h, 1, 1, FALSE); assert(r==0); test1(fd, brt_h, &dn); @@ -408,7 +411,7 @@ test_serialize_leaf(void) { toku_blocktable_destroy(&brt_h->blocktable); toku_free(brt_h); toku_free(brt); - + toku_free(ndd); r = close(fd); assert(r != -1); } diff --git a/newbrt/tests/brt-serialize-benchmark.c b/newbrt/tests/brt-serialize-benchmark.c index 81108c71037..8c368ce1a4d 100644 --- a/newbrt/tests/brt-serialize-benchmark.c +++ b/newbrt/tests/brt-serialize-benchmark.c @@ -130,7 +130,8 @@ test_serialize_leaf(int valsize, int nelts, double entropy) { struct timeval t[2]; gettimeofday(&t[0], NULL); - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); + BRTNODE_DISK_DATA ndd = NULL; + r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, &ndd, TRUE, brt->h, 1, 1, FALSE); assert(r==0); gettimeofday(&t[1], NULL); double dt; @@ -140,7 +141,8 @@ test_serialize_leaf(int valsize, int nelts, double entropy) { struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, brt_h); gettimeofday(&t[0], NULL); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + BRTNODE_DISK_DATA ndd2 = NULL; + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe); assert(r==0); gettimeofday(&t[1], NULL); dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC); @@ -165,6 +167,8 @@ test_serialize_leaf(int valsize, int nelts, double entropy) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(ndd); + toku_free(ndd2); r = close(fd); assert(r != -1); } @@ -259,7 +263,8 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) { struct timeval t[2]; gettimeofday(&t[0], NULL); - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); + BRTNODE_DISK_DATA ndd = NULL; + r = toku_serialize_brtnode_to(fd, make_blocknum(20), 
&sn, &ndd, TRUE, brt->h, 1, 1, FALSE); assert(r==0); gettimeofday(&t[1], NULL); double dt; @@ -269,7 +274,8 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) { struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, brt_h); gettimeofday(&t[0], NULL); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &bfe); + BRTNODE_DISK_DATA ndd2 = NULL; + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe); assert(r==0); gettimeofday(&t[1], NULL); dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC); @@ -291,6 +297,8 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(ndd); + toku_free(ndd2); r = close(fd); assert(r != -1); } diff --git a/newbrt/tests/brt-serialize-test.c b/newbrt/tests/brt-serialize-test.c index cc631699020..524ea9e1775 100644 --- a/newbrt/tests/brt-serialize-test.c +++ b/newbrt/tests/brt-serialize-test.c @@ -102,19 +102,19 @@ string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) } static void -setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE *dn) { +setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE *dn, BRTNODE_DISK_DATA* ndd) { int r; brt_h->compare_fun = string_key_cmp; if (bft == read_all) { struct brtnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, brt_h); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, ndd, &bfe); assert(r==0); } else if (bft == read_compressed || bft == read_none) { struct brtnode_fetch_extra bfe; fill_bfe_for_min_read(&bfe, brt_h); - r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &bfe); + r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, ndd, &bfe); assert(r==0); // assert all bp's are compressed or on disk. 
for (int i = 0; i < (*dn)->n_children; i++) { @@ -143,7 +143,7 @@ setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE PAIR_ATTR attr; fill_bfe_for_full_read(&bfe, brt_h); assert(toku_brtnode_pf_req_callback(*dn, &bfe)); - r = toku_brtnode_pf_callback(*dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(*dn, *ndd, &bfe, fd, &attr); assert(r==0); // assert all bp's are available for (int i = 0; i < (*dn)->n_children; i++) { @@ -166,7 +166,7 @@ setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE fill_bfe_for_full_read(&bfe, brt_h); assert(toku_brtnode_pf_req_callback(*dn, &bfe)); PAIR_ATTR attr; - r = toku_brtnode_pf_callback(*dn, &bfe, fd, &attr); + r = toku_brtnode_pf_callback(*dn, *ndd, &bfe, fd, &attr); assert(r==0); // assert all bp's are available for (int i = 0; i < (*dn)->n_children; i++) { @@ -180,8 +180,25 @@ setup_dn(enum brtnode_verify_type bft, int fd, struct brt_header *brt_h, BRTNODE } } +static void write_sn_to_disk(int fd, BRT brt, BRTNODE sn, BRTNODE_DISK_DATA* src_ndd, BOOL do_clone) { + int r; + if (do_clone) { + void* cloned_node_v = NULL; + PAIR_ATTR attr; + toku_brtnode_clone_callback(sn, &cloned_node_v, &attr, FALSE, brt->h); + BRTNODE cloned_node = cloned_node_v; + r = toku_serialize_brtnode_to(fd, make_blocknum(20), cloned_node, src_ndd, FALSE, brt->h, 1, 1, FALSE); + assert(r==0); + toku_brtnode_free(&cloned_node); + } + else { + r = toku_serialize_brtnode_to(fd, make_blocknum(20), sn, src_ndd, TRUE, brt->h, 1, 1, FALSE); + assert(r==0); + } +} + static void -test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { +test_serialize_leaf_check_msn(enum brtnode_verify_type bft, BOOL do_clone) { // struct brt source_brt; const int nodesize = 1024; struct brtnode sn, *dn; @@ -256,11 +273,12 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { assert(offset == BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); assert(size == 100); } + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - setup_dn(bft, fd, brt_h, &dn); + setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -285,10 +303,10 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { u_int32_t last_i = 0; for (u_int32_t i = 0; i < npartitions; ++i) { assert(BLB_MAX_MSN_APPLIED(dn, i).msn == POSTSERIALIZE_MSN_ON_DISK.msn); - assert(dn->bp[i].start > 0); - assert(dn->bp[i].size > 0); + assert(dest_ndd[i].start > 0); + assert(dest_ndd[i].size > 0); if (i > 0) { - assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size); + assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); } toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); u_int32_t keylen; @@ -308,9 +326,9 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { kv_pair_free(sn.childkeys[i]); } for (int i = 0; i < sn.n_children; i++) { - BASEMENTNODE bn = BLB(&sn, i); - struct mempool * mp = &bn->buffer_mempool; - toku_mempool_destroy(mp); + BASEMENTNODE bn = BLB(&sn, i); + struct mempool * mp = &bn->buffer_mempool; + toku_mempool_destroy(mp); destroy_basement_node(BLB(&sn, i)); } toku_free(sn.bp); @@ -321,12 +339,14 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } static void 
-test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { +test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft, BOOL do_clone) { int r; struct brtnode sn, *dn; const int keylens = 256*1024, vallens = 0, nrows = 8; @@ -396,11 +416,12 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { assert(offset == BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); assert(size == 100); } + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - setup_dn(bft, fd, brt_h, &dn); + setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -428,10 +449,10 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_cmp }; u_int32_t last_i = 0; for (u_int32_t i = 0; i < npartitions; ++i) { - assert(dn->bp[i].start > 0); - assert(dn->bp[i].size > 0); + assert(dest_ndd[i].start > 0); + assert(dest_ndd[i].size > 0); if (i > 0) { - assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size); + assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); } assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); @@ -461,12 +482,14 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } static void -test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { +test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft, BOOL do_clone) { int r; struct brtnode sn, *dn; const int keylens = sizeof(int), vallens = sizeof(int), nrows = 196*1024; @@ -533,10 +556,11 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { assert(size == 100); } - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - setup_dn(bft, fd, brt_h, &dn); + setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -561,10 +585,10 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_int_cmp }; u_int32_t last_i = 0; for (u_int32_t i = 0; i < npartitions; ++i) { - assert(dn->bp[i].start > 0); - assert(dn->bp[i].size > 0); + assert(dest_ndd[i].start > 0); + assert(dest_ndd[i].size > 0); if (i > 0) { - assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size); + assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); } assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); @@ -595,13 +619,15 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } static void -test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { +test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft, BOOL do_clone) { int r; struct brtnode sn, *dn; const uint32_t nrows = 7; @@ -674,10 +700,11 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) 
{ assert(size == 100); } - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - setup_dn(bft, fd, brt_h, &dn); + setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -708,10 +735,10 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_cmp }; u_int32_t last_i = 0; for (u_int32_t i = 0; i < npartitions; ++i) { - assert(dn->bp[i].start > 0); - assert(dn->bp[i].size > 0); + assert(dest_ndd[i].start > 0); + assert(dest_ndd[i].size > 0); if (i > 0) { - assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size); + assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); } assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); @@ -741,13 +768,15 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } static void -test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { +test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft, BOOL do_clone) { const int nodesize = 1024; struct brtnode sn, *dn; @@ -830,11 +859,11 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { assert(offset == BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); assert(size == 100); } + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); - - setup_dn(bft, fd, brt_h, &dn); + setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -857,10 +886,10 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp }; u_int32_t last_i = 0; for (u_int32_t i = 0; i < npartitions; ++i) { - assert(dn->bp[i].start > 0); - assert(dn->bp[i].size > 0); + assert(dest_ndd[i].start > 0); + assert(dest_ndd[i].size > 0); if (i > 0) { - assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size); + assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); } assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); @@ -890,12 +919,14 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } static void -test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type bft) { +test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type bft, BOOL do_clone) { const int nodesize = 1024; struct brtnode sn, *dn; @@ -954,10 +985,11 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type assert(size == 100); } - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - setup_dn(bft, fd, brt_h, &dn); + 
setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -973,10 +1005,10 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type struct check_leafentries_struct extra = { .nelts = 0, .elts = NULL, .i = 0, .cmp = omt_cmp }; u_int32_t last_i = 0; for (u_int32_t i = 0; i < npartitions; ++i) { - assert(dn->bp[i].start > 0); - assert(dn->bp[i].size > 0); + assert(dest_ndd[i].start > 0); + assert(dest_ndd[i].size > 0); if (i > 0) { - assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size); + assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); } assert(toku_omt_size(BLB_BUFFER(dn, i)) == 0); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); @@ -1002,13 +1034,15 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } static void -test_serialize_leaf(enum brtnode_verify_type bft) { +test_serialize_leaf(enum brtnode_verify_type bft, BOOL do_clone) { // struct brt source_brt; const int nodesize = 1024; struct brtnode sn, *dn; @@ -1016,6 +1050,8 @@ test_serialize_leaf(enum brtnode_verify_type bft) { int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); int r; + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; sn.max_msn_applied_to_node_on_disk.msn = 0; sn.nodesize = nodesize; @@ -1079,10 +1115,9 @@ test_serialize_leaf(enum brtnode_verify_type bft) { assert(size == 100); } - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - setup_dn(bft, fd, brt_h, &dn); + setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -1105,10 +1140,10 @@ test_serialize_leaf(enum brtnode_verify_type bft) { struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp }; u_int32_t last_i = 0; for (u_int32_t i = 0; i < npartitions; ++i) { - assert(dn->bp[i].start > 0); - assert(dn->bp[i].size > 0); + assert(dest_ndd[i].start > 0); + assert(dest_ndd[i].size > 0); if (i > 0) { - assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size); + assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); } toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); u_int32_t keylen; @@ -1141,12 +1176,14 @@ test_serialize_leaf(enum brtnode_verify_type bft) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } static void -test_serialize_nonleaf(enum brtnode_verify_type bft) { +test_serialize_nonleaf(enum brtnode_verify_type bft, BOOL do_clone) { // struct brt source_brt; const int nodesize = 1024; struct brtnode sn, *dn; @@ -1222,11 +1259,11 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { assert(offset == BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); assert(size == 100); } + BRTNODE_DISK_DATA src_ndd = NULL; + BRTNODE_DISK_DATA dest_ndd = NULL; + write_sn_to_disk(fd, brt, &sn, &src_ndd, do_clone); - r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); - assert(r==0); - - setup_dn(bft, fd, brt_h, &dn); + setup_dn(bft, fd, brt_h, &dn, &dest_ndd); assert(dn->thisnodename.b==20); @@ -1339,43 +1376,69 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { toku_brtheader_destroy_treelock(brt_h); toku_free(brt_h); 
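// ---------------------------------------------------------------------------
// [Editorial note -- not part of the patch.] test_main below now runs every
// serialization test twice, with do_clone FALSE and TRUE, so the direct write
// path and the new checkpoint-clone path are both checked against the same
// expected on-disk state. The clone path, distilled from write_sn_to_disk
// above (the BOOL after the BRTNODE_DISK_DATA out-param is TRUE for the live
// node and FALSE for a clone; its exact name is not shown in these hunks):
void *clone_v = NULL;
PAIR_ATTR clone_attr;
toku_brtnode_clone_callback(&sn, &clone_v, &clone_attr, FALSE /*for_checkpoint*/, brt->h);
BRTNODE clone = clone_v;
int rr = toku_serialize_brtnode_to(fd, make_blocknum(20), clone, &src_ndd,
                                   FALSE, brt->h, 1, 1, FALSE);
assert(rr == 0);
toku_brtnode_free(&clone); // the clone is throwaway once serialized
// ---------------------------------------------------------------------------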
toku_free(brt); + toku_free(src_ndd); + toku_free(dest_ndd); r = close(fd); assert(r != -1); } int test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { - test_serialize_leaf(read_none); - test_serialize_leaf(read_all); - test_serialize_leaf(read_compressed); + test_serialize_leaf(read_none, FALSE); + test_serialize_leaf(read_all, FALSE); + test_serialize_leaf(read_compressed, FALSE); + test_serialize_leaf(read_none, TRUE); + test_serialize_leaf(read_all, TRUE); + test_serialize_leaf(read_compressed, TRUE); - test_serialize_leaf_with_empty_basement_nodes(read_none); - test_serialize_leaf_with_empty_basement_nodes(read_all); - test_serialize_leaf_with_empty_basement_nodes(read_compressed); + test_serialize_leaf_with_empty_basement_nodes(read_none, FALSE); + test_serialize_leaf_with_empty_basement_nodes(read_all, FALSE); + test_serialize_leaf_with_empty_basement_nodes(read_compressed, FALSE); + test_serialize_leaf_with_empty_basement_nodes(read_none, TRUE); + test_serialize_leaf_with_empty_basement_nodes(read_all, TRUE); + test_serialize_leaf_with_empty_basement_nodes(read_compressed, TRUE); - test_serialize_leaf_with_multiple_empty_basement_nodes(read_none); - test_serialize_leaf_with_multiple_empty_basement_nodes(read_all); - test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_none, FALSE); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_all, FALSE); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, FALSE); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_none, TRUE); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_all, TRUE); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, TRUE); - test_serialize_leaf_with_large_rows(read_none); - test_serialize_leaf_with_large_rows(read_all); - test_serialize_leaf_with_large_rows(read_compressed); + test_serialize_leaf_with_large_rows(read_none, FALSE); + test_serialize_leaf_with_large_rows(read_all, FALSE); + test_serialize_leaf_with_large_rows(read_compressed, FALSE); + test_serialize_leaf_with_large_rows(read_none, TRUE); + test_serialize_leaf_with_large_rows(read_all, TRUE); + test_serialize_leaf_with_large_rows(read_compressed, TRUE); - test_serialize_leaf_with_many_rows(read_none); - test_serialize_leaf_with_many_rows(read_all); - test_serialize_leaf_with_many_rows(read_compressed); + test_serialize_leaf_with_many_rows(read_none, FALSE); + test_serialize_leaf_with_many_rows(read_all, FALSE); + test_serialize_leaf_with_many_rows(read_compressed, FALSE); + test_serialize_leaf_with_many_rows(read_none, TRUE); + test_serialize_leaf_with_many_rows(read_all, TRUE); + test_serialize_leaf_with_many_rows(read_compressed, TRUE); - test_serialize_leaf_with_large_pivots(read_none); - test_serialize_leaf_with_large_pivots(read_all); - test_serialize_leaf_with_large_pivots(read_compressed); + test_serialize_leaf_with_large_pivots(read_none, FALSE); + test_serialize_leaf_with_large_pivots(read_all, FALSE); + test_serialize_leaf_with_large_pivots(read_compressed, FALSE); + test_serialize_leaf_with_large_pivots(read_none, TRUE); + test_serialize_leaf_with_large_pivots(read_all, TRUE); + test_serialize_leaf_with_large_pivots(read_compressed, TRUE); - test_serialize_leaf_check_msn(read_none); - test_serialize_leaf_check_msn(read_all); - test_serialize_leaf_check_msn(read_compressed); + test_serialize_leaf_check_msn(read_none, FALSE); + 
test_serialize_leaf_check_msn(read_all, FALSE); + test_serialize_leaf_check_msn(read_compressed, FALSE); + test_serialize_leaf_check_msn(read_none, TRUE); + test_serialize_leaf_check_msn(read_all, TRUE); + test_serialize_leaf_check_msn(read_compressed, TRUE); - test_serialize_nonleaf(read_none); - test_serialize_nonleaf(read_all); - test_serialize_nonleaf(read_compressed); + test_serialize_nonleaf(read_none, FALSE); + test_serialize_nonleaf(read_all, FALSE); + test_serialize_nonleaf(read_compressed, FALSE); + test_serialize_nonleaf(read_none, TRUE); + test_serialize_nonleaf(read_all, TRUE); + test_serialize_nonleaf(read_compressed, TRUE); return 0; } diff --git a/newbrt/tests/cachetable-3969.c b/newbrt/tests/cachetable-3969.c index cde1e417ddc..d6da2a1941c 100644 --- a/newbrt/tests/cachetable-3969.c +++ b/newbrt/tests/cachetable-3969.c @@ -31,11 +31,11 @@ run_test (void) { long s1; long s2; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); for (int i = 0; i < 20; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(2), 2, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); } @@ -47,12 +47,12 @@ run_test (void) { // pin 1 and 2 - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_begin_checkpoint(ct, NULL); // mark nodes as pending a checkpoint, so that get_and_pin_nonblocking on block 1 will return TOKUDB_TRY_AGAIN r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); assert(r==0); - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); // now we try to pin 1, and it should get evicted out from under us struct unlockers foo; foo.extra = NULL; @@ -69,6 +69,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, &foo ); diff --git a/newbrt/tests/cachetable-4357.c b/newbrt/tests/cachetable-4357.c index 5dfd9ab98cc..16cc3b4ddf5 100644 --- a/newbrt/tests/cachetable-4357.c +++ b/newbrt/tests/cachetable-4357.c @@ -15,6 +15,7 @@ static void *pin_nonblocking(void *arg) { &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, NULL ); @@ -42,6 +43,7 @@ cachetable_test (void) { &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL ); toku_pthread_t pin_nonblocking_tid; diff --git a/newbrt/tests/cachetable-4365.c b/newbrt/tests/cachetable-4365.c index afa2796cc0c..d08a3be176c 
100644 --- a/newbrt/tests/cachetable-4365.c +++ b/newbrt/tests/cachetable-4365.c @@ -15,6 +15,7 @@ static void *pin_nonblocking(void *arg) { &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, NULL ); @@ -63,6 +64,7 @@ cachetable_test (void) { &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL ); toku_pthread_t pin_nonblocking_tid; diff --git a/newbrt/tests/cachetable-4545.c b/newbrt/tests/cachetable-4545.c index 57dc227b831..4dbb006a607 100644 --- a/newbrt/tests/cachetable-4545.c +++ b/newbrt/tests/cachetable-4545.c @@ -12,12 +12,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { flush_called = TRUE; *new_size = make_pair_attr(8); @@ -29,7 +31,7 @@ static BOOL pf_req_callback(void* UU(brtnode_pv), void* UU(read_extraargs)) { return TRUE; } -static int pf_callback(void* UU(brtnode_pv), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) { +static int pf_callback(void* UU(brtnode_pv), void* UU(disk_data), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) { assert(pf_req_called); assert(flush_called); pf_called = TRUE; @@ -52,7 +54,7 @@ cachetable_test (void) { long s1; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, pf_req_callback, pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, pf_req_callback, pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); flush_called = FALSE; @@ -60,7 +62,7 @@ cachetable_test (void) { pf_called = FALSE; r = toku_cachetable_begin_checkpoint(ct, NULL); assert_zero(r); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, pf_req_callback, pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, pf_req_callback, pf_callback, TRUE, NULL); assert_zero(r); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); assert_zero(r); diff --git a/newbrt/tests/cachetable-all-write.c b/newbrt/tests/cachetable-all-write.c index 0c533a3752c..ac4cf7617e0 100644 --- a/newbrt/tests/cachetable-all-write.c +++ b/newbrt/tests/cachetable-all-write.c @@ -8,12 +8,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d write_me %d\n", (int)k.b, w); } @@ -39,11 +41,9 @@ cachetable_test (void) { long s1, s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, 
def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); - // usleep (2*1024*1024); - //r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, flush, def_fetch, def_pe_est_callback, pe_callback, pf_req_callback, pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(2), 2, CACHETABLE_CLEAN, make_pair_attr(8)); diff --git a/newbrt/tests/cachetable-checkpoint-pending.c b/newbrt/tests/cachetable-checkpoint-pending.c index 68633d70aad..d1e894033b2 100644 --- a/newbrt/tests/cachetable-checkpoint-pending.c +++ b/newbrt/tests/cachetable-checkpoint-pending.c @@ -36,12 +36,14 @@ flush ( int UU(fd), CACHEKEY UU(key), void *value, + void** UU(dd), void *UU(extraargs), PAIR_ATTR size, PAIR_ATTR* UU(new_size), BOOL write_me, BOOL keep_me, - BOOL UU(for_checkpoint) + BOOL UU(for_checkpoint), + BOOL UU(is_clone) ) { // printf("f"); @@ -61,7 +63,8 @@ fetch ( int UU(fd), CACHEKEY UU(key), u_int32_t UU(fullhash), - void **UU(value), + void **UU(value), + void **UU(dd), PAIR_ATTR *UU(sizep), int *UU(dirtyp), void *UU(extraargs) @@ -84,7 +87,7 @@ do_update (void *UU(ignore)) long size; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; - int r = toku_cachetable_get_and_pin(cf, key, hi, &vv, &size, wc, fetch, def_pf_req_callback, def_pf_callback, 0); + int r = toku_cachetable_get_and_pin(cf, key, hi, &vv, &size, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); //printf("g"); assert(r==0); assert(size==sizeof(int)); diff --git a/newbrt/tests/cachetable-checkpoint-pinned-nodes.c b/newbrt/tests/cachetable-checkpoint-pinned-nodes.c index e90edc6e91a..7f33e9f599e 100644 --- a/newbrt/tests/cachetable-checkpoint-pinned-nodes.c +++ b/newbrt/tests/cachetable-checkpoint-pinned-nodes.c @@ -14,12 +14,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -41,6 +43,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -73,9 +76,9 @@ cachetable_test (void) { long s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(&dirty_val); wc.flush_callback = flush; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, &dirty_val); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, &dirty_val); wc.write_extraargs = NULL; - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 
2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); // // Here is the test, we have two pairs, v1 is dirty, v2 is clean, but both are currently pinned diff --git a/newbrt/tests/cachetable-checkpoint-prefetched-nodes.c b/newbrt/tests/cachetable-checkpoint-prefetched-nodes.c index 579fd062f3c..5a2e1cc95c0 100644 --- a/newbrt/tests/cachetable-checkpoint-prefetched-nodes.c +++ b/newbrt/tests/cachetable-checkpoint-prefetched-nodes.c @@ -14,12 +14,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -41,6 +43,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) diff --git a/newbrt/tests/cachetable-checkpoint-test.c b/newbrt/tests/cachetable-checkpoint-test.c index 9088382cde4..b86d275f4ff 100644 --- a/newbrt/tests/cachetable-checkpoint-test.c +++ b/newbrt/tests/cachetable-checkpoint-test.c @@ -12,7 +12,21 @@ static const int item_size = 1; static int n_flush, n_write_me, n_keep_me, n_fetch; -static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *extraargs, PAIR_ATTR size, PAIR_ATTR* UU(new_size), BOOL write_me, BOOL keep_me, BOOL UU(for_checkpoint)) { +static void flush( + CACHEFILE cf, + int UU(fd), + CACHEKEY key, + void *value, + void** UU(dd), + void *extraargs, + PAIR_ATTR size, + PAIR_ATTR* UU(new_size), + BOOL write_me, + BOOL keep_me, + BOOL UU(for_checkpoint), + BOOL UU(is_clone) + ) +{ cf = cf; key = key; value = value; extraargs = extraargs; // assert(key == make_blocknum((long)value)); assert(size.size == item_size); diff --git a/newbrt/tests/cachetable-cleaner-checkpoint.c b/newbrt/tests/cachetable-cleaner-checkpoint.c index 0577eb4ed16..3283d2f18e7 100644 --- a/newbrt/tests/cachetable-cleaner-checkpoint.c +++ b/newbrt/tests/cachetable-cleaner-checkpoint.c @@ -12,12 +12,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -70,7 +72,7 @@ cachetable_test (void) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; wc.cleaner_callback = cleaner_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, 
TRUE, NULL); PAIR_ATTR attr = make_pair_attr(8); attr.cache_pressure_size = 8; r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, attr); diff --git a/newbrt/tests/cachetable-cleaner-checkpoint2.c b/newbrt/tests/cachetable-cleaner-checkpoint2.c index 302fc15eb65..df935c15938 100644 --- a/newbrt/tests/cachetable-cleaner-checkpoint2.c +++ b/newbrt/tests/cachetable-cleaner-checkpoint2.c @@ -12,12 +12,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -70,7 +72,7 @@ cachetable_test (void) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; wc.cleaner_callback = cleaner_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); PAIR_ATTR attr = make_pair_attr(8); attr.cache_pressure_size = 8; r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, attr); diff --git a/newbrt/tests/cachetable-cleaner-dev-null.c b/newbrt/tests/cachetable-cleaner-dev-null.c index 7fd4316fce0..345ce6a98a8 100644 --- a/newbrt/tests/cachetable-cleaner-dev-null.c +++ b/newbrt/tests/cachetable-cleaner-dev-null.c @@ -11,12 +11,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -59,7 +61,7 @@ cachetable_test (void) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; wc.cleaner_callback = cleaner_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); PAIR_ATTR attr = make_pair_attr(8); attr.cache_pressure_size = 8; r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, attr); diff --git a/newbrt/tests/cachetable-cleaner-thread-attrs-accumulate.c b/newbrt/tests/cachetable-cleaner-thread-attrs-accumulate.c index cd831215d17..c9b54a03157 100644 --- a/newbrt/tests/cachetable-cleaner-thread-attrs-accumulate.c +++ b/newbrt/tests/cachetable-cleaner-thread-attrs-accumulate.c @@ -30,12 +30,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c 
__attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { PAIR_ATTR *expect = e; if (!keep) { @@ -85,6 +87,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, &expect); assert_zero(r); r = toku_cachetable_unpin(f1, make_blocknum(i+1), i+1, CACHETABLE_DIRTY, attrs[i]); @@ -109,6 +112,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, &expect); toku_cachetable_unpin(f1, make_blocknum(n_pairs + 1), n_pairs + 1, CACHETABLE_CLEAN, make_pair_attr(test_limit - expect.size + 20)); diff --git a/newbrt/tests/cachetable-cleaner-thread-everything-pinned.c b/newbrt/tests/cachetable-cleaner-thread-everything-pinned.c index fa63df98be4..8ced54093ee 100644 --- a/newbrt/tests/cachetable-cleaner-thread-everything-pinned.c +++ b/newbrt/tests/cachetable-cleaner-thread-everything-pinned.c @@ -47,6 +47,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL); assert_zero(r); } diff --git a/newbrt/tests/cachetable-cleaner-thread-nothing-needs-flushing.c b/newbrt/tests/cachetable-cleaner-thread-nothing-needs-flushing.c index 6e86dadfc87..185f184a6a7 100644 --- a/newbrt/tests/cachetable-cleaner-thread-nothing-needs-flushing.c +++ b/newbrt/tests/cachetable-cleaner-thread-nothing-needs-flushing.c @@ -45,6 +45,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL); assert_zero(r); // set cachepressure_size to 0 diff --git a/newbrt/tests/cachetable-cleaner-thread-simple.c b/newbrt/tests/cachetable-cleaner-thread-simple.c index 6463e08b978..504cc78a06d 100644 --- a/newbrt/tests/cachetable-cleaner-thread-simple.c +++ b/newbrt/tests/cachetable-cleaner-thread-simple.c @@ -52,6 +52,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL); PAIR_ATTR attr = make_pair_attr(8); attr.cache_pressure_size = 100; @@ -63,6 +64,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL); assert_zero(r); // set cachepressure_size to 0 diff --git a/newbrt/tests/cachetable-clock-eviction.c b/newbrt/tests/cachetable-clock-eviction.c index 3b0494462a3..37dea40882e 100644 --- a/newbrt/tests/cachetable-clock-eviction.c +++ b/newbrt/tests/cachetable-clock-eviction.c @@ -13,12 +13,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (check_flush && !keep) { @@ -36,6 +38,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -66,19 +69,19 @@ cachetable_test (void) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; for (int i = 0; i < 100000; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, 
make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(1)); } for (int i = 0; i < 8; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(2), 2, CACHETABLE_CLEAN, make_pair_attr(1)); } for (int i = 0; i < 4; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(3), 3, CACHETABLE_CLEAN, make_pair_attr(1)); } for (int i = 0; i < 2; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(4), 4, CACHETABLE_CLEAN, make_pair_attr(1)); } flush_may_occur = TRUE; diff --git a/newbrt/tests/cachetable-clock-eviction2.c b/newbrt/tests/cachetable-clock-eviction2.c index 712855ab826..40d26d1214d 100755 --- a/newbrt/tests/cachetable-clock-eviction2.c +++ b/newbrt/tests/cachetable-clock-eviction2.c @@ -10,12 +10,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v, + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep, - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(flush_may_occur); if (!keep) { @@ -31,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -48,12 +51,14 @@ other_flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { } @@ -103,28 +108,28 @@ cachetable_test (void) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(4)); } for (int i = 0; i < 8; i++) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, 
fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(2), 2, CACHETABLE_CLEAN, make_pair_attr(4)); } for (int i = 0; i < 4; i++) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(3), 3, CACHETABLE_CLEAN, make_pair_attr(4)); } for (int i = 0; i < 2; i++) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(4), 4, CACHETABLE_CLEAN, make_pair_attr(4)); } flush_may_occur = FALSE; diff --git a/newbrt/tests/cachetable-clock-eviction3.c b/newbrt/tests/cachetable-clock-eviction3.c index 2d0081a1ab7..21c6237b6ef 100755 --- a/newbrt/tests/cachetable-clock-eviction3.c +++ b/newbrt/tests/cachetable-clock-eviction3.c @@ -10,12 +10,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void* UU(v), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep, - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(flush_may_occur); if (!keep) { @@ -31,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -48,18 +51,21 @@ other_flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { } static void pe_est_callback( void* UU(brtnode_pv), + void* UU(dd), long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* UU(write_extraargs) @@ -118,7 +124,7 @@ cachetable_test (void) { wc.flush_callback = flush; wc.pe_est_callback = pe_est_callback; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(4)); } for (int i = 0; i < 8; i++) { @@ -126,7 +132,7 @@ cachetable_test (void) { wc.flush_callback = flush; wc.pe_est_callback = pe_est_callback; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, 
def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(2), 2, CACHETABLE_CLEAN, make_pair_attr(4)); } for (int i = 0; i < 4; i++) { @@ -134,7 +140,7 @@ cachetable_test (void) { wc.flush_callback = flush; wc.pe_est_callback = pe_est_callback; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(3), 3, CACHETABLE_CLEAN, make_pair_attr(4)); } for (int i = 0; i < 2; i++) { @@ -142,7 +148,7 @@ cachetable_test (void) { wc.flush_callback = flush; wc.pe_est_callback = pe_est_callback; wc.pe_callback = pe_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(4), 4, CACHETABLE_CLEAN, make_pair_attr(4)); } flush_may_occur = FALSE; diff --git a/newbrt/tests/cachetable-clock-eviction4.c b/newbrt/tests/cachetable-clock-eviction4.c index b0ecea2220b..923cf40308c 100644 --- a/newbrt/tests/cachetable-clock-eviction4.c +++ b/newbrt/tests/cachetable-clock-eviction4.c @@ -23,12 +23,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (check_flush && !keep) { @@ -46,6 +48,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -58,7 +61,8 @@ fetch (CACHEFILE f __attribute__((__unused__)), static void pe_est_callback( - void* UU(brtnode_pv), + void* UU(brtnode_pv), + void* UU(dd), long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* UU(write_extraargs) @@ -104,19 +108,19 @@ cachetable_test (void) { wc.pe_est_callback = pe_est_callback; wc.pe_callback = pe_callback; for (int i = 0; i < 100000; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(1)); } for (int i = 0; i < 8; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(2), 2, CACHETABLE_CLEAN, make_pair_attr(1)); } for (int i = 0; i < 4; i++) { - r = 
toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(3), 3, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(3), 3, CACHETABLE_CLEAN, make_pair_attr(1)); } for (int i = 0; i < 2; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(4), 4, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(4), 4, CACHETABLE_CLEAN, make_pair_attr(1)); } flush_may_occur = TRUE; diff --git a/newbrt/tests/cachetable-clone-checkpoint.c b/newbrt/tests/cachetable-clone-checkpoint.c new file mode 100644 index 00000000000..368969550e4 --- /dev/null +++ b/newbrt/tests/cachetable-clone-checkpoint.c @@ -0,0 +1,109 @@ +#ident "$Id: cachetable-simple-verify.c 39504 2012-02-03 16:19:33Z zardosht $" +#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved." +#include "includes.h" +#include "test.h" + + +static void +clone_callback(void* UU(value_data), void** cloned_value_data, PAIR_ATTR* new_attr, BOOL UU(for_checkpoint), void* UU(write_extraargs)) +{ + *cloned_value_data = (void *)1; + new_attr->is_valid = FALSE; +} + +BOOL clone_flush_started; +BOOL clone_flush_completed; +CACHETABLE ct; + +static void +flush ( + CACHEFILE f __attribute__((__unused__)), + int UU(fd), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + void** UU(dd), + void *e __attribute__((__unused__)), + PAIR_ATTR s __attribute__((__unused__)), + PAIR_ATTR* new_size __attribute__((__unused__)), + BOOL w __attribute__((__unused__)), + BOOL keep __attribute__((__unused__)), + BOOL c __attribute__((__unused__)), + BOOL is_clone + ) +{ + if (is_clone) { + clone_flush_started = TRUE; + usleep(4*1024*1024); + clone_flush_completed = TRUE; + } +} + +static void *run_end_checkpoint(void *arg) { + int r = toku_cachetable_end_checkpoint( + ct, + NULL, + fake_ydb_lock, + fake_ydb_unlock, + NULL, + NULL + ); + assert_zero(r); + return arg; +} + +// +// this test verifies that a PAIR that undergoes a checkpoint on the checkpoint thread is still pinnable while being written out +// +static void +cachetable_test (void) { + const int test_limit = 200; + int r; + ct = NULL; + r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0); + char fname1[] = __FILE__ "test1.dat"; + unlink(fname1); + CACHEFILE f1; + r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); + + void* v1; + long s1; + CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); + wc.flush_callback = flush; + wc.clone_callback = clone_callback; + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); + assert_zero(r); + r = toku_cachetable_begin_checkpoint(ct, NULL); + + + clone_flush_started = FALSE; + clone_flush_completed = FALSE; + toku_pthread_t checkpoint_tid; + r = toku_pthread_create(&checkpoint_tid, NULL, run_end_checkpoint, NULL); + assert_zero(r); + + usleep(1*1024*1024); + + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + 
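// the pin above returned while the checkpoint thread is still inside the clone's flush + // (clone_flush_started is set but clone_flush_completed is not), showing that the original + // PAIR stays pinnable while its clone is being written out +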
assert(clone_flush_started && !clone_flush_completed); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + assert_zero(r); + + void *ret; + r = toku_pthread_join(checkpoint_tid, &ret); + assert_zero(r); + assert(clone_flush_started && clone_flush_completed); + + toku_cachetable_verify(ct); + r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); + r = toku_cachetable_close(&ct); lazy_assert_zero(r); +} + +int +test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + cachetable_test(); + return 0; +} diff --git a/newbrt/tests/cachetable-clone-partial-fetch-pinned-node.c b/newbrt/tests/cachetable-clone-partial-fetch-pinned-node.c new file mode 100644 index 00000000000..5f859894bc1 --- /dev/null +++ b/newbrt/tests/cachetable-clone-partial-fetch-pinned-node.c @@ -0,0 +1,113 @@ +#ident "$Id: cachetable-simple-verify.c 39504 2012-02-03 16:19:33Z zardosht $" +#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved." +#include "includes.h" +#include "test.h" + + +BOOL flush_completed; +BOOL pf_called; + +static void +clone_callback(void* UU(value_data), void** cloned_value_data, PAIR_ATTR* new_attr, BOOL UU(for_checkpoint), void* UU(write_extraargs)) +{ + *cloned_value_data = (void *)1; + new_attr->is_valid = FALSE; +} + +static void +flush ( + CACHEFILE f __attribute__((__unused__)), + int UU(fd), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + void** UU(dd), + void *e __attribute__((__unused__)), + PAIR_ATTR s __attribute__((__unused__)), + PAIR_ATTR* new_size __attribute__((__unused__)), + BOOL w __attribute__((__unused__)), + BOOL keep __attribute__((__unused__)), + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) + ) +{ + if (is_clone) { + usleep(2*1024*1024); + flush_completed = TRUE; + } +} + +static int true_pf_callback(void* UU(brtnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) { + assert(flush_completed); + pf_called = TRUE; + *sizep = make_pair_attr(9); + return 0; +} + + +// this test verifies that a partial fetch will wait for a cloned pair to complete +// writing to disk +static void +cachetable_test (void) { + const int test_limit = 12; + int r; + CACHETABLE ct; + r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0); + char fname1[] = __FILE__ "test1.dat"; + unlink(fname1); + CACHEFILE f1; + r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); + + void* v1; + long s1; + CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); + wc.clone_callback = clone_callback; + wc.flush_callback = flush; + + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); + assert_zero(r); + + flush_completed = FALSE; + r = toku_cachetable_begin_checkpoint(ct, NULL); assert_zero(r); + assert_zero(r); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + assert_zero(r); + + pf_called = FALSE; + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + assert(!pf_called); + 
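// run the partial fetch directly on the still-pinned pair; true_pf_callback asserts + // flush_completed, i.e. the partial fetch had to wait for the cloned pair to finish + // writing to disk +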
toku_cachetable_pf_pinned_pair(v1, true_pf_callback, NULL, f1, make_blocknum(1), 1); + assert(pf_called); + + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + assert_zero(r); + assert(pf_called); + + r = toku_cachetable_end_checkpoint( + ct, + NULL, + fake_ydb_lock, + fake_ydb_unlock, + NULL, + NULL + ); + assert_zero(r); + + + toku_cachetable_verify(ct); + r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); + r = toku_cachetable_close(&ct); lazy_assert_zero(r); + + +} + +int +test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + cachetable_test(); + return 0; +} diff --git a/newbrt/tests/cachetable-clone-partial-fetch.c b/newbrt/tests/cachetable-clone-partial-fetch.c new file mode 100644 index 00000000000..9877ddf9f35 --- /dev/null +++ b/newbrt/tests/cachetable-clone-partial-fetch.c @@ -0,0 +1,113 @@ +#ident "$Id: cachetable-simple-verify.c 39504 2012-02-03 16:19:33Z zardosht $" +#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved." +#include "includes.h" +#include "test.h" + + +BOOL flush_completed; +BOOL pf_called; + +static void +clone_callback(void* UU(value_data), void** cloned_value_data, PAIR_ATTR* new_attr, BOOL UU(for_checkpoint), void* UU(write_extraargs)) +{ + *cloned_value_data = (void *)1; + new_attr->is_valid = FALSE; +} + +static void +flush ( + CACHEFILE f __attribute__((__unused__)), + int UU(fd), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + void** UU(dd), + void *e __attribute__((__unused__)), + PAIR_ATTR s __attribute__((__unused__)), + PAIR_ATTR* new_size __attribute__((__unused__)), + BOOL w __attribute__((__unused__)), + BOOL keep __attribute__((__unused__)), + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) + ) +{ + if (is_clone) { + usleep(2*1024*1024); + flush_completed = TRUE; + } +} + +static BOOL true_pf_req_callback(void* UU(brtnode_pv), void* UU(read_extraargs)) { + return TRUE; +} + +static int true_pf_callback(void* UU(brtnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) { + assert(flush_completed); + pf_called = TRUE; + *sizep = make_pair_attr(9); + return 0; +} + + +// this test verifies that a partial fetch will wait for a cloned pair to complete +// writing to disk +static void +cachetable_test (void) { + const int test_limit = 12; + int r; + CACHETABLE ct; + r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0); + char fname1[] = __FILE__ "test1.dat"; + unlink(fname1); + CACHEFILE f1; + r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); + + void* v1; + long s1; + CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); + wc.clone_callback = clone_callback; + wc.flush_callback = flush; + + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); + assert_zero(r); + + flush_completed = FALSE; + r = toku_cachetable_begin_checkpoint(ct, NULL); assert_zero(r); + assert_zero(r); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + assert_zero(r); + + pf_called = FALSE; + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, 
&s1, wc, def_fetch, true_pf_req_callback, true_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + assert_zero(r); + assert(pf_called); + + r = toku_cachetable_end_checkpoint( + ct, + NULL, + fake_ydb_lock, + fake_ydb_unlock, + NULL, + NULL + ); + assert_zero(r); + + + toku_cachetable_verify(ct); + r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); + r = toku_cachetable_close(&ct); lazy_assert_zero(r); + + +} + +int +test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + cachetable_test(); + return 0; +} diff --git a/newbrt/tests/cachetable-clone-pin-nonblocking.c b/newbrt/tests/cachetable-clone-pin-nonblocking.c new file mode 100644 index 00000000000..d1f46b13546 --- /dev/null +++ b/newbrt/tests/cachetable-clone-pin-nonblocking.c @@ -0,0 +1,96 @@ +#ident "$Id: cachetable-simple-verify.c 39504 2012-02-03 16:19:33Z zardosht $" +#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved." +#include "includes.h" +#include "test.h" + +static void +clone_callback(void* UU(value_data), void** cloned_value_data, PAIR_ATTR* new_attr, BOOL UU(for_checkpoint), void* UU(write_extraargs)) +{ + *cloned_value_data = (void *)1; + new_attr->is_valid = FALSE; +} + +static void +flush ( + CACHEFILE f __attribute__((__unused__)), + int UU(fd), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + void** UU(dd), + void *e __attribute__((__unused__)), + PAIR_ATTR s __attribute__((__unused__)), + PAIR_ATTR* new_size __attribute__((__unused__)), + BOOL w __attribute__((__unused__)), + BOOL keep __attribute__((__unused__)), + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) + ) +{ +} + + +// this test verifies the behavior of toku_cachetable_get_and_pin_nonblocking on a pair +// with a pending checkpoint, with and without a clone_callback, for clean and dirty pairs +static void +cachetable_test (enum cachetable_dirty dirty, BOOL cloneable) { + const int test_limit = 12; + int r; + CACHETABLE ct; + r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0); + char fname1[] = __FILE__ "test1.dat"; + unlink(fname1); + CACHEFILE f1; + r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); + + void* v1; + long s1; + CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); + wc.clone_callback = cloneable ?
clone_callback : NULL; + wc.flush_callback = flush; + + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, dirty, make_pair_attr(8)); + + // test that having a pin that passes FALSE for may_modify_value does not stall behind checkpoint + r = toku_cachetable_begin_checkpoint(ct, NULL); assert_zero(r); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, FALSE, NULL, NULL); + assert(r == 0); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + assert(r == 0); + + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); + if (dirty == CACHETABLE_DIRTY && !cloneable) { + assert(r == TOKUDB_TRY_AGAIN); + } + else { + assert(r == 0); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + } + + r = toku_cachetable_end_checkpoint( + ct, + NULL, + fake_ydb_lock, + fake_ydb_unlock, + NULL, + NULL + ); + assert_zero(r); + + + toku_cachetable_verify(ct); + r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); + r = toku_cachetable_close(&ct); lazy_assert_zero(r); + + +} + +int +test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + cachetable_test(CACHETABLE_DIRTY, TRUE); + cachetable_test(CACHETABLE_DIRTY, FALSE); + cachetable_test(CACHETABLE_CLEAN, TRUE); + cachetable_test(CACHETABLE_CLEAN, FALSE); + return 0; +} diff --git a/newbrt/tests/cachetable-clone-unpin-remove.c b/newbrt/tests/cachetable-clone-unpin-remove.c new file mode 100644 index 00000000000..781489a9d36 --- /dev/null +++ b/newbrt/tests/cachetable-clone-unpin-remove.c @@ -0,0 +1,102 @@ +#ident "$Id: cachetable-simple-verify.c 39504 2012-02-03 16:19:33Z zardosht $" +#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved." 
+#include "includes.h" +#include "test.h" + + +BOOL flush_completed; +BOOL evict_called; + +static void +clone_callback(void* UU(value_data), void** cloned_value_data, PAIR_ATTR* new_attr, BOOL UU(for_checkpoint), void* UU(write_extraargs)) +{ + *cloned_value_data = (void *)1; + new_attr->is_valid = FALSE; +} + +static void +flush ( + CACHEFILE f __attribute__((__unused__)), + int UU(fd), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + void** UU(dd), + void *e __attribute__((__unused__)), + PAIR_ATTR s __attribute__((__unused__)), + PAIR_ATTR* new_size __attribute__((__unused__)), + BOOL w __attribute__((__unused__)), + BOOL keep __attribute__((__unused__)), + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) + ) +{ + if (is_clone) { + usleep(2*1024*1024); + flush_completed = TRUE; + } + else if (!keep && !is_clone) { + assert(flush_completed); + evict_called = TRUE; + } +} + + + +// this test verifies that unpin_and_remove on a pair whose clone is still being written +// out works correctly: the final flush (keep_me FALSE) asserts the clone's flush completed first +static void +cachetable_test (void) { + const int test_limit = 12; + int r; + CACHETABLE ct; + r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0); + char fname1[] = __FILE__ "test1.dat"; + unlink(fname1); + CACHEFILE f1; + r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); + + void* v1; + long s1; + CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); + wc.clone_callback = clone_callback; + wc.flush_callback = flush; + + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), CACHETABLE_DIRTY, make_pair_attr(8)); + assert_zero(r); + + flush_completed = FALSE; + evict_called = FALSE; + r = toku_cachetable_begin_checkpoint(ct, NULL); assert_zero(r); + assert_zero(r); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + assert_zero(r); + r = toku_cachetable_unpin_and_remove(f1, make_blocknum(1), NULL, NULL); + assert_zero(r); + + + r = toku_cachetable_end_checkpoint( + ct, + NULL, + fake_ydb_lock, + fake_ydb_unlock, + NULL, + NULL + ); + assert_zero(r); + + + toku_cachetable_verify(ct); + r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); + r = toku_cachetable_close(&ct); lazy_assert_zero(r); + + +} + +int +test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + cachetable_test(); + return 0; +} diff --git a/newbrt/tests/cachetable-eviction-close-test.c b/newbrt/tests/cachetable-eviction-close-test.c index a59c2c967b5..057d5db2488 100644 --- a/newbrt/tests/cachetable-eviction-close-test.c +++ b/newbrt/tests/cachetable-eviction-close-test.c @@ -15,12 +15,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(expect_full_flush); sleep(2); @@ -34,6 +36,7 @@ fetch (CACHEFILE f
__attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) @@ -50,7 +53,8 @@ fetch (CACHEFILE f __attribute__((__unused__)), static void pe_est_callback( - void* UU(brtnode_pv), + void* UU(brtnode_pv), + void* UU(dd), long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* UU(write_extraargs) @@ -95,6 +99,7 @@ static void cachetable_eviction_full_test (void) { fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); @@ -115,6 +120,7 @@ static void cachetable_eviction_full_test (void) { fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); diff --git a/newbrt/tests/cachetable-eviction-close-test2.c b/newbrt/tests/cachetable-eviction-close-test2.c index 4861dc5dd25..d69835a514c 100644 --- a/newbrt/tests/cachetable-eviction-close-test2.c +++ b/newbrt/tests/cachetable-eviction-close-test2.c @@ -15,12 +15,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(expect_full_flush); } @@ -33,6 +35,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) @@ -49,7 +52,8 @@ fetch (CACHEFILE f __attribute__((__unused__)), static void pe_est_callback( - void* UU(brtnode_pv), + void* UU(brtnode_pv), + void* UU(dd), long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* UU(write_extraargs) @@ -108,6 +112,7 @@ static void cachetable_eviction_full_test (void) { fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); @@ -129,6 +134,7 @@ static void cachetable_eviction_full_test (void) { fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); diff --git a/newbrt/tests/cachetable-eviction-getandpin-test.c b/newbrt/tests/cachetable-eviction-getandpin-test.c index d7108b54e02..443b1fa97a4 100644 --- a/newbrt/tests/cachetable-eviction-getandpin-test.c +++ b/newbrt/tests/cachetable-eviction-getandpin-test.c @@ -13,12 +13,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { if (do_sleep) { sleep(2); @@ -59,6 +61,7 @@ static void cachetable_predef_fetch_maybegetandpin_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); @@ -83,6 +86,7 @@ static void cachetable_predef_fetch_maybegetandpin_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); 
assert(r==0); @@ -95,9 +99,9 @@ static void cachetable_predef_fetch_maybegetandpin_test (void) { // now verify that the block we are trying to evict may be pinned wc = def_write_callback(NULL); wc.flush_callback = flush; - r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); assert(r == TOKUDB_TRY_AGAIN); - r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); assert(r == 0 && v == 0 && size == 8); do_sleep = FALSE; diff --git a/newbrt/tests/cachetable-eviction-getandpin-test2.c b/newbrt/tests/cachetable-eviction-getandpin-test2.c index eba1b77a862..7b92e11c724 100644 --- a/newbrt/tests/cachetable-eviction-getandpin-test2.c +++ b/newbrt/tests/cachetable-eviction-getandpin-test2.c @@ -10,6 +10,7 @@ static void pe_est_callback( void* UU(brtnode_pv), + void* UU(dd), long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* UU(write_extraargs) @@ -67,6 +68,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); @@ -89,6 +91,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); @@ -109,6 +112,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, NULL ); @@ -123,6 +127,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL ); assert(r == 0 && v == 0 && size == 1); diff --git a/newbrt/tests/cachetable-flush-during-cleaner.c b/newbrt/tests/cachetable-flush-during-cleaner.c index c533783d1f6..dc0cba3fe40 100644 --- a/newbrt/tests/cachetable-flush-during-cleaner.c +++ b/newbrt/tests/cachetable-flush-during-cleaner.c @@ -39,7 +39,7 @@ cachetable_test (void) { for (int i = 0; i < 10; i++) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.cleaner_callback = cleaner_callback; - r = toku_cachetable_get_and_pin(f1, make_blocknum(i), i, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(i), i, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(i), i, CACHETABLE_DIRTY, make_pair_attr(8)); } r = toku_cachefile_flush(f1); diff --git a/newbrt/tests/cachetable-getandpin-test.c b/newbrt/tests/cachetable-getandpin-test.c index a4730e30dad..c874b921b45 100644 --- a/newbrt/tests/cachetable-getandpin-test.c +++ b/newbrt/tests/cachetable-getandpin-test.c @@ -7,12 +7,14 @@ flush (CACHEFILE cf __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *extraargs __attribute__((__unused__)), PAIR_ATTR size __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)), - BOOL for_checkpoint __attribute__((__unused__)) + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert((long) key.b == size.size); if (!keep_me) 
toku_free(v); @@ -25,6 +27,7 @@ fetch ( CACHEKEY key, u_int32_t hash, void **vptr, + void** UU(dd), PAIR_ATTR *sizep, int *dirtyp, void *extra @@ -57,7 +60,7 @@ cachetable_getandpin_test (int n) { void *v; long size; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; - r = toku_cachetable_get_and_pin(f1, make_blocknum(i), hi, &v, &size, wc, fetch, def_pf_req_callback, def_pf_callback, 0); + r = toku_cachetable_get_and_pin(f1, make_blocknum(i), hi, &v, &size, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); assert(r == 0); assert(size == i); diff --git a/newbrt/tests/cachetable-kibbutz_and_flush_cachefile.c b/newbrt/tests/cachetable-kibbutz_and_flush_cachefile.c index 7c89792e6ca..0206bf472ea 100644 --- a/newbrt/tests/cachetable-kibbutz_and_flush_cachefile.c +++ b/newbrt/tests/cachetable-kibbutz_and_flush_cachefile.c @@ -35,14 +35,14 @@ run_test (void) { long s1; //long s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); foo = FALSE; cachefile_kibbutz_enq(f1, kibbutz_work, f1); r = toku_cachefile_flush(f1); assert(r == 0); assert(foo); assert(f1); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); foo = FALSE; cachefile_kibbutz_enq(f1, kibbutz_work, f1); r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); diff --git a/newbrt/tests/cachetable-partial-fetch.c b/newbrt/tests/cachetable-partial-fetch.c index 5390e52dd92..615adc39d89 100644 --- a/newbrt/tests/cachetable-partial-fetch.c +++ b/newbrt/tests/cachetable-partial-fetch.c @@ -17,6 +17,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -33,6 +34,7 @@ err_fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -50,17 +52,17 @@ static BOOL true_pf_req_callback(void* UU(brtnode_pv), void* UU(read_extraargs)) return TRUE; } -static int err_pf_callback(void* UU(brtnode_pv), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { +static int err_pf_callback(void* UU(brtnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { assert(FALSE); return 0; // gcov } -static int pf_callback(void* UU(brtnode_pv), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { +static int pf_callback(void* UU(brtnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { assert(FALSE); return 0; // gcov } -static int true_pf_callback(void* UU(brtnode_pv), void* read_extraargs, int UU(fd), PAIR_ATTR* sizep) { +static int true_pf_callback(void* UU(brtnode_pv), void* UU(dd), void* read_extraargs, int UU(fd), PAIR_ATTR* sizep) { pf_req_called = TRUE; *sizep = 
make_pair_attr(sizeof(fetch_val)+1); assert(read_extraargs == &fetch_val); @@ -85,7 +87,7 @@ cachetable_test (void) { long s1; //long s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, pf_req_callback, pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, pf_req_callback, pf_callback, TRUE, NULL); assert(&fetch_val == v1); // // verify that a prefetch of this node will fail @@ -108,14 +110,14 @@ cachetable_test (void) { // // now get and pin node again, and make sure that partial fetch and fetch are not called // - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, pf_req_callback, err_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, pf_req_callback, err_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); // // now make sure that if we say a partial fetch is required, that we get a partial fetch // and that read_extraargs properly passed down // pf_req_called = FALSE; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, true_pf_req_callback, true_pf_callback, &fetch_val); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, true_pf_req_callback, true_pf_callback, TRUE, &fetch_val); assert(pf_req_called); assert(s1 == sizeof(fetch_val)+1); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); @@ -143,7 +145,7 @@ cachetable_test (void) { // // now verify we can pin it, and NO fetch callback should get called // - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, pf_req_callback, err_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, pf_req_callback, err_pf_callback, TRUE, NULL); assert(&fetch_val == v1); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); @@ -162,7 +164,7 @@ cachetable_test (void) { &doing_prefetch ); assert(doing_prefetch); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, pf_req_callback, err_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, err_fetch, pf_req_callback, err_pf_callback, TRUE, NULL); assert(&fetch_val == v1); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); diff --git a/newbrt/tests/cachetable-pin-checkpoint.c b/newbrt/tests/cachetable-pin-checkpoint.c index 586a9966060..02224452dfa 100644 --- a/newbrt/tests/cachetable-pin-checkpoint.c +++ b/newbrt/tests/cachetable-pin-checkpoint.c @@ -22,18 +22,35 @@ int64_t checkpointed_data[NUM_ELEMENTS]; u_int32_t time_of_test; BOOL run_test; +static void +clone_callback( + void* value_data, + void** cloned_value_data, + PAIR_ATTR* new_attr, + BOOL UU(for_checkpoint), + void* UU(write_extraargs) + ) +{ + new_attr->is_valid = FALSE; + int64_t* data_val = toku_xmalloc(sizeof(int64_t)); + *data_val = *(int64_t *)value_data; + *cloned_value_data = data_val; +} + static void flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me, BOOL keep_me, - BOOL checkpoint_me + BOOL 
checkpoint_me, + BOOL UU(is_clone) ) { /* Do nothing */ int64_t val_to_write = *(int64_t *)v; @@ -55,6 +72,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k, u_int32_t fullhash __attribute__((__unused__)), void **value, + void** UU(dd), PAIR_ATTR *sizep, int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -120,6 +138,7 @@ static void *move_numbers(void *arg) { enum cachetable_dirty less_dirty = CACHETABLE_DIRTY; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; + wc.clone_callback = clone_callback; r = toku_cachetable_get_and_pin_with_dep_pairs( f1, less_key, @@ -127,6 +146,7 @@ static void *move_numbers(void *arg) { &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 0, //num_dependent_pairs NULL, @@ -148,6 +168,7 @@ static void *move_numbers(void *arg) { &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 1, //num_dependent_pairs &f1, @@ -181,6 +202,7 @@ static void *move_numbers(void *arg) { &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 1, //num_dependent_pairs &f1, @@ -210,6 +232,7 @@ static void *read_random_numbers(void *arg) { int r1; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; + wc.clone_callback = clone_callback; r1 = toku_cachetable_get_and_pin_nonblocking( f1, make_blocknum(rand_key1), @@ -217,6 +240,7 @@ static void *read_random_numbers(void *arg) { &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + FALSE, NULL, NULL ); @@ -259,6 +283,7 @@ static void *checkpoints(void *arg) { sum += checkpointed_data[i]; } assert (sum==0); + usleep(10*1024); num_checkpoints++; } return arg; diff --git a/newbrt/tests/cachetable-pin-nonblocking-checkpoint-clean.c b/newbrt/tests/cachetable-pin-nonblocking-checkpoint-clean.c index c288775575c..dd071abe8da 100644 --- a/newbrt/tests/cachetable-pin-nonblocking-checkpoint-clean.c +++ b/newbrt/tests/cachetable-pin-nonblocking-checkpoint-clean.c @@ -24,15 +24,15 @@ run_test (void) { long s1; long s2; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); for (int i = 0; i < 20; i++) { - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(2), 2, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); } - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v2, &s2, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_begin_checkpoint(ct, NULL); // mark nodes as pending a checkpoint, so that get_and_pin_nonblocking on block 1 will return TOKUDB_TRY_AGAIN r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); @@ -47,6 +47,7 @@ run_test (void) { def_fetch, def_pf_req_callback, def_pf_callback, 
+ TRUE, NULL, NULL ); diff --git a/newbrt/tests/cachetable-prefetch-checkpoint-test.c b/newbrt/tests/cachetable-prefetch-checkpoint-test.c index f721e0a6f5f..e4f9084cfe4 100644 --- a/newbrt/tests/cachetable-prefetch-checkpoint-test.c +++ b/newbrt/tests/cachetable-prefetch-checkpoint-test.c @@ -20,12 +20,14 @@ static void flush( int UU(fd), CACHEKEY key, void *value, + void** UU(dd), void *extraargs, PAIR_ATTR size, PAIR_ATTR* UU(new_size), BOOL write_me, BOOL keep_me, - BOOL UU(for_checkpoint) + BOOL UU(for_checkpoint), + BOOL UU(is_clone) ) { cf = cf; key = key; value = value; extraargs = extraargs; @@ -42,6 +44,7 @@ static int fetch( CACHEKEY key, u_int32_t fullhash, void **value, + void** UU(dd), PAIR_ATTR *sizep, int *dirtyp, void *extraargs diff --git a/newbrt/tests/cachetable-prefetch-close-leak-test.c b/newbrt/tests/cachetable-prefetch-close-leak-test.c index 6384b5d487c..488f9f91dda 100644 --- a/newbrt/tests/cachetable-prefetch-close-leak-test.c +++ b/newbrt/tests/cachetable-prefetch-close-leak-test.c @@ -11,12 +11,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(w == FALSE && v != NULL); toku_free(v); @@ -30,6 +32,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) diff --git a/newbrt/tests/cachetable-prefetch-close-test.c b/newbrt/tests/cachetable-prefetch-close-test.c index 353a376d6c8..f3b1b5f6e2c 100644 --- a/newbrt/tests/cachetable-prefetch-close-test.c +++ b/newbrt/tests/cachetable-prefetch-close-test.c @@ -13,12 +13,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(w == FALSE); } @@ -31,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) @@ -80,6 +83,7 @@ static void cachetable_prefetch_full_test (BOOL partial_fetch) { fetch, def_pf_req_callback, def_pf_callback, + TRUE, 0 ); assert(r==0); diff --git a/newbrt/tests/cachetable-prefetch-flowcontrol-test.c b/newbrt/tests/cachetable-prefetch-flowcontrol-test.c index 78c9a956c1c..c95318b0fa7 100644 --- a/newbrt/tests/cachetable-prefetch-flowcontrol-test.c +++ b/newbrt/tests/cachetable-prefetch-flowcontrol-test.c @@ -17,12 +17,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k, void *v __attribute__((__unused__)), + void** UU(dd), 
void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w, BOOL keep, - BOOL f_ckpt __attribute__((__unused__)) + BOOL f_ckpt __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(w == FALSE); flush_calls++; @@ -41,6 +43,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k, u_int32_t fullhash __attribute__((__unused__)), void **value, + void** UU(dd), PAIR_ATTR *sizep, int *dirtyp, void *extraargs __attribute__((__unused__)) diff --git a/newbrt/tests/cachetable-prefetch-getandpin-test.c b/newbrt/tests/cachetable-prefetch-getandpin-test.c index f7cef9c7781..0f6fee988ac 100644 --- a/newbrt/tests/cachetable-prefetch-getandpin-test.c +++ b/newbrt/tests/cachetable-prefetch-getandpin-test.c @@ -14,12 +14,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { assert(w == FALSE); } @@ -30,6 +32,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) @@ -55,7 +58,7 @@ static BOOL pf_req_callback(void* UU(brtnode_pv), void* UU(read_extraargs)) { } } -static int pf_callback(void* UU(brtnode_pv), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { +static int pf_callback(void* UU(brtnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) { assert(expect_pf); sleep(2); *sizep = make_pair_attr(2); @@ -97,6 +100,7 @@ static void cachetable_prefetch_maybegetandpin_test (BOOL do_partial_fetch) { fetch, pf_req_callback, pf_callback, + TRUE, 0 ); assert(r==0); @@ -115,9 +119,9 @@ static void cachetable_prefetch_maybegetandpin_test (BOOL do_partial_fetch) { void *v = 0; long size = 0; do_pf = FALSE; - r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, TRUE, NULL, NULL); assert(r==TOKUDB_TRY_AGAIN); - r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, TRUE, NULL); assert(r == 0 && v == 0 && size == 2); struct timeval tend; diff --git a/newbrt/tests/cachetable-prefetch-maybegetandpin-test.c b/newbrt/tests/cachetable-prefetch-maybegetandpin-test.c index f838880382d..14c256ece64 100644 --- a/newbrt/tests/cachetable-prefetch-maybegetandpin-test.c +++ b/newbrt/tests/cachetable-prefetch-maybegetandpin-test.c @@ -12,6 +12,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) diff --git 
a/newbrt/tests/cachetable-prefetch2-test.c b/newbrt/tests/cachetable-prefetch2-test.c index 636e327a6cc..f46539f73dd 100644 --- a/newbrt/tests/cachetable-prefetch2-test.c +++ b/newbrt/tests/cachetable-prefetch2-test.c @@ -15,6 +15,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) diff --git a/newbrt/tests/cachetable-put-checkpoint.c b/newbrt/tests/cachetable-put-checkpoint.c index 223bbe42a4b..9430f24fa3e 100644 --- a/newbrt/tests/cachetable-put-checkpoint.c +++ b/newbrt/tests/cachetable-put-checkpoint.c @@ -25,23 +25,41 @@ int64_t checkpointed_data[NUM_ELEMENTS]; u_int32_t time_of_test; BOOL run_test; +static void +clone_callback( + void* value_data, + void** cloned_value_data, + PAIR_ATTR* new_attr, + BOOL UU(for_checkpoint), + void* UU(write_extraargs) + ) +{ + new_attr->is_valid = FALSE; + int64_t* data_val = toku_xmalloc(sizeof(int64_t)); + *data_val = *(int64_t *)value_data; + *cloned_value_data = data_val; + *new_attr = make_pair_attr(8); +} static void flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), - PAIR_ATTR* new_size __attribute__((__unused__)), + PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, - BOOL checkpoint_me + BOOL checkpoint_me, + BOOL UU(is_clone) ) { int64_t val_to_write = *(int64_t *)v; size_t data_index = (size_t)k.b; if (write_me) { usleep(10); + *new_size = make_pair_attr(8); data[data_index] = val_to_write; if (checkpoint_me) checkpointed_data[data_index] = val_to_write; } @@ -56,6 +74,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k, u_int32_t fullhash __attribute__((__unused__)), void **value, + void** UU(dd), PAIR_ATTR *sizep, int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -113,6 +132,7 @@ static void move_number_to_child( u_int32_t child_fullhash = toku_cachetable_hash(f1, child_key); CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; + wc.clone_callback = clone_callback; r = toku_cachetable_get_and_pin_with_dep_pairs( f1, child_key, @@ -120,6 +140,7 @@ static void move_number_to_child( &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 1, //num_dependent_pairs &f1, @@ -158,6 +179,7 @@ static void *move_numbers(void *arg) { u_int32_t parent_fullhash = toku_cachetable_hash(f1, parent_key); CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; + wc.clone_callback = clone_callback; r = toku_cachetable_get_and_pin_with_dep_pairs( f1, parent_key, @@ -165,6 +187,7 @@ static void *move_numbers(void *arg) { &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 0, //num_dependent_pairs NULL, @@ -222,6 +245,7 @@ static void merge_and_split_child( enum cachetable_dirty child_dirty = CACHETABLE_CLEAN; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; + wc.clone_callback = clone_callback; r = toku_cachetable_get_and_pin_with_dep_pairs( f1, child_key, @@ -229,6 +253,7 @@ static void merge_and_split_child( &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 1, //num_dependent_pairs &f1, @@ -262,6 +287,7 @@ static void 
merge_and_split_child( &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 2, //num_dependent_pairs cfs, @@ -330,6 +356,7 @@ static void *merge_and_split(void *arg) { u_int32_t parent_fullhash = toku_cachetable_hash(f1, parent_key); CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; + wc.clone_callback = clone_callback; r = toku_cachetable_get_and_pin_with_dep_pairs( f1, parent_key, @@ -337,6 +364,7 @@ static void *merge_and_split(void *arg) { &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, NULL, 0, //num_dependent_pairs NULL, diff --git a/newbrt/tests/cachetable-rename-test.c b/newbrt/tests/cachetable-rename-test.c index e33cc7352e7..c71f7093c28 100644 --- a/newbrt/tests/cachetable-rename-test.c +++ b/newbrt/tests/cachetable-rename-test.c @@ -40,12 +40,15 @@ static void r_flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k, void *value, + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me, - BOOL for_checkpoint __attribute__((__unused__))) { + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) + ) { int i; //printf("Flush\n"); if (keep_me) return; @@ -74,6 +77,7 @@ static int r_fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void**value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void*extraargs __attribute__((__unused__))) { @@ -131,7 +135,7 @@ static void test_rename (void) { if (verbose) printf("Rename %" PRIx64 " to %" PRIx64 "\n", okey.b, nkey.b); CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = r_flush; - r = toku_cachetable_get_and_pin(f, okey, toku_cachetable_hash(f, okey), &current_value, &current_size, wc, r_fetch, def_pf_req_callback, def_pf_callback, 0); + r = toku_cachetable_get_and_pin(f, okey, toku_cachetable_hash(f, okey), &current_value, &current_size, wc, r_fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); if (r == -42) continue; assert(r==0); r = toku_cachetable_rename(f, okey, nkey); diff --git a/newbrt/tests/cachetable-scan.c b/newbrt/tests/cachetable-scan.c index 45aa653bb68..801f74299f2 100644 --- a/newbrt/tests/cachetable-scan.c +++ b/newbrt/tests/cachetable-scan.c @@ -13,12 +13,15 @@ static void f_flush (CACHEFILE f, int UU(fd), CACHEKEY key, void *value, + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size, PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me, BOOL keep_me, - BOOL for_checkpoint __attribute__((__unused__))) { + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) + ) { assert(size.size==BLOCKSIZE); if (write_me) { toku_os_full_pwrite(toku_cachefile_get_and_pin_fd(f), value, BLOCKSIZE, key.b); @@ -34,6 +37,7 @@ static int f_fetch (CACHEFILE f, CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, + void** UU(dd), PAIR_ATTR *sizep, int *dirtyp, void*extraargs __attribute__((__unused__))) { @@ -93,7 +97,7 @@ static void readit (void) { u_int32_t fullhash = toku_cachetable_hash(f, key); CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = f_flush; - r=toku_cachetable_get_and_pin(f, key, fullhash, &block, &current_size, wc, f_fetch, def_pf_req_callback, def_pf_callback, 0); assert(r==0); + r=toku_cachetable_get_and_pin(f,
key, fullhash, &block, &current_size, wc, f_fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); assert(r==0); r=toku_cachetable_unpin(f, key, fullhash, CACHETABLE_CLEAN, make_pair_attr(BLOCKSIZE)); assert(r==0); } r = toku_cachefile_close(&f, 0, FALSE, ZERO_LSN); assert(r == 0); diff --git a/newbrt/tests/cachetable-simple-clone.c b/newbrt/tests/cachetable-simple-clone.c new file mode 100644 index 00000000000..ded7f11d6a8 --- /dev/null +++ b/newbrt/tests/cachetable-simple-clone.c @@ -0,0 +1,153 @@ +#ident "$Id: cachetable-simple-verify.c 39504 2012-02-03 16:19:33Z zardosht $" +#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved." +#include "includes.h" +#include "test.h" + +BOOL clone_called; +BOOL check_flush; +BOOL flush_expected; +BOOL flush_called; + +static void +clone_callback(void* UU(value_data), void** cloned_value_data, PAIR_ATTR* new_attr, BOOL UU(for_checkpoint), void* UU(write_extraargs)) +{ + *cloned_value_data = (void *)1; + new_attr->is_valid = FALSE; + clone_called = TRUE; +} + +static void +flush ( + CACHEFILE f __attribute__((__unused__)), + int UU(fd), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + void** UU(dd), + void *e __attribute__((__unused__)), + PAIR_ATTR s __attribute__((__unused__)), + PAIR_ATTR* new_size __attribute__((__unused__)), + BOOL w __attribute__((__unused__)), + BOOL keep __attribute__((__unused__)), + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) + ) +{ + if (w) usleep(5*1024*1024); + if (w && check_flush) { + assert(flush_expected); + if (clone_called) assert(is_clone); + } + flush_called = TRUE; + if (is_clone) assert(!keep); +} + +static uint64_t tdelta_usec(struct timeval *tend, struct timeval *tstart) { + uint64_t t = tend->tv_sec * 1000000 + tend->tv_usec; + t -= tstart->tv_sec * 1000000 + tstart->tv_usec; + return t; +} + + +// +// test the following things for simple cloning: +// - if the pending pair is clean, nothing gets written +// - if the pending pair is dirty and cloneable, then the pair is written +// in the background and get_and_pin returns immediately +// - if the pending pair is dirty and not cloneable, then get_and_pin +// blocks until the pair is written out +// +static void +test_clean (enum cachetable_dirty dirty, BOOL cloneable) { + const int test_limit = 12; + int r; + CACHETABLE ct; + r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0); + char fname1[] = __FILE__ "test1.dat"; + unlink(fname1); + CACHEFILE f1; + r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); + + void* v1; + long s1; + CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); + wc.clone_callback = cloneable ? clone_callback : NULL; + wc.flush_callback = flush; + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, dirty, make_pair_attr(8)); + + check_flush = TRUE; + clone_called = FALSE; + flush_expected = (dirty == CACHETABLE_DIRTY) ?
TRUE : FALSE; + flush_called = FALSE; + // begin checkpoint; if the pair is clean, the clone should not + // be called + r = toku_cachetable_begin_checkpoint(ct, NULL); + assert_zero(r); + struct timeval tstart; + struct timeval tend; + gettimeofday(&tstart, NULL); + + // test that having a pin that passes FALSE for may_modify_value does not stall behind checkpoint + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, FALSE, NULL); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + gettimeofday(&tend, NULL); + assert(tdelta_usec(&tend, &tstart) <= 2000000); + assert(!clone_called); + + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + gettimeofday(&tend, NULL); + + // a write takes 5 seconds + // we check that the time to pin is less than 2 seconds; if it is, + // then we know the act of cloning worked properly + if (cloneable || !dirty ) { + assert(tdelta_usec(&tend, &tstart) <= 2000000); + } + else { + assert(tdelta_usec(&tend, &tstart) >= 2000000); + } + + + if (dirty == CACHETABLE_DIRTY && cloneable) { + assert(clone_called); + } + else { + assert(!clone_called); + } + + // at this point, there should be no more dirty writes + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); + gettimeofday(&tend, NULL); + if (cloneable || !dirty ) { + assert(tdelta_usec(&tend, &tstart) <= 2000000); + } + else { + assert(tdelta_usec(&tend, &tstart) >= 2000000); + } + + r = toku_cachetable_end_checkpoint( + ct, + NULL, + fake_ydb_lock, + fake_ydb_unlock, + NULL, + NULL + ); + assert_zero(r); + + check_flush = FALSE; + + toku_cachetable_verify(ct); + r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); + r = toku_cachetable_close(&ct); lazy_assert_zero(r); +} + +int +test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + test_clean(CACHETABLE_CLEAN, TRUE); + test_clean(CACHETABLE_DIRTY, TRUE); + test_clean(CACHETABLE_CLEAN, FALSE); + test_clean(CACHETABLE_DIRTY, FALSE); + return 0; +} diff --git a/newbrt/tests/cachetable-simple-clone2.c b/newbrt/tests/cachetable-simple-clone2.c new file mode 100644 index 00000000000..7fd42429f6c --- /dev/null +++ b/newbrt/tests/cachetable-simple-clone2.c @@ -0,0 +1,103 @@ +#ident "$Id: cachetable-simple-verify.c 39504 2012-02-03 16:19:33Z zardosht $" +#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
+#include "includes.h" +#include "test.h" + +BOOL clone_called; +BOOL check_flush; +BOOL flush_expected; +BOOL flush_called; + +static void +clone_callback(void* UU(value_data), void** cloned_value_data, PAIR_ATTR* new_attr, BOOL UU(for_checkpoint), void* UU(write_extraargs)) +{ + *cloned_value_data = (void *)1; + new_attr->is_valid = FALSE; + clone_called = TRUE; +} + +static void +flush ( + CACHEFILE f __attribute__((__unused__)), + int UU(fd), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + void** UU(dd), + void *e __attribute__((__unused__)), + PAIR_ATTR s __attribute__((__unused__)), + PAIR_ATTR* new_size __attribute__((__unused__)), + BOOL w __attribute__((__unused__)), + BOOL keep __attribute__((__unused__)), + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) + ) +{ + if (w && check_flush) { + assert(flush_expected); + flush_called = TRUE; + } +} + +// +// test the following things for simple cloning: +// - verifies that after the checkpoint ends, the PAIR is properly +// dirty or clean based on the second unpin +// +static void +test_clean (enum cachetable_dirty dirty, BOOL cloneable) { + const int test_limit = 200; + int r; + CACHETABLE ct; + r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0); + char fname1[] = __FILE__ "test1.dat"; + unlink(fname1); + CACHEFILE f1; + r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); + check_flush = FALSE; + + void* v1; + long s1; + CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); + wc.clone_callback = cloneable ? clone_callback : NULL; + wc.flush_callback = flush; + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); + + // begin checkpoint; the pair was unpinned dirty above, so it is + // marked pending for the checkpoint + r = toku_cachetable_begin_checkpoint(ct, NULL); + assert_zero(r); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); + + // at this point, there should be no more dirty writes + r = toku_cachetable_unpin(f1, make_blocknum(1), 1, dirty, make_pair_attr(8)); + usleep(2*1024*1024); + r = toku_cachetable_end_checkpoint( + ct, + NULL, + fake_ydb_lock, + fake_ydb_unlock, + NULL, + NULL + ); + assert_zero(r); + + check_flush = TRUE; + flush_expected = (dirty == CACHETABLE_DIRTY) ?
TRUE : FALSE; + flush_called = FALSE; + + toku_cachetable_verify(ct); + r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); + r = toku_cachetable_close(&ct); lazy_assert_zero(r); + if (flush_expected) assert(flush_called); +} + +int +test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + test_clean(CACHETABLE_CLEAN, TRUE); + test_clean(CACHETABLE_DIRTY, TRUE); + test_clean(CACHETABLE_CLEAN, FALSE); + test_clean(CACHETABLE_DIRTY, FALSE); + return 0; +} diff --git a/newbrt/tests/cachetable-simple-maybe-get-pin.c b/newbrt/tests/cachetable-simple-maybe-get-pin.c index e4c80b0d83d..66807bb5b8d 100644 --- a/newbrt/tests/cachetable-simple-maybe-get-pin.c +++ b/newbrt/tests/cachetable-simple-maybe-get-pin.c @@ -24,7 +24,7 @@ cachetable_test (void) { // nothing in cachetable, so this should fail r = toku_cachetable_maybe_get_and_pin(f1, make_blocknum(1), 1, &v1); assert(r==-1); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); // maybe_get_and_pin_clean should succeed, maybe_get_and_pin should fail diff --git a/newbrt/tests/cachetable-simple-pin-dep-nodes.c b/newbrt/tests/cachetable-simple-pin-dep-nodes.c index 1f8043cd720..f877bf82276 100644 --- a/newbrt/tests/cachetable-simple-pin-dep-nodes.c +++ b/newbrt/tests/cachetable-simple-pin-dep-nodes.c @@ -18,12 +18,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -50,6 +52,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -80,9 +83,9 @@ cachetable_test (BOOL write_first, BOOL write_second, BOOL start_checkpoint) { CACHETABLE_WRITE_CALLBACK wc = def_write_callback(&val1); wc.flush_callback = flush; wc.write_extraargs = &val1; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, &val1); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, &val1); wc.write_extraargs = &val2; - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, &val2); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, &val2); CACHEFILE dependent_cfs[2]; dependent_cfs[0] = f1; @@ -117,6 +120,7 @@ cachetable_test (BOOL write_first, BOOL write_second, BOOL start_checkpoint) { &v3, &s3, wc, fetch, def_pf_req_callback, def_pf_callback, + TRUE, &val3, 2, //num_dependent_pairs dependent_cfs, diff --git a/newbrt/tests/cachetable-simple-pin-nonblocking.c 
b/newbrt/tests/cachetable-simple-pin-nonblocking.c index 272ab9b77fb..3d71a1f50b9 100644 --- a/newbrt/tests/cachetable-simple-pin-nonblocking.c +++ b/newbrt/tests/cachetable-simple-pin-nonblocking.c @@ -15,12 +15,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -36,7 +38,7 @@ flush (CACHEFILE f __attribute__((__unused__)), static BOOL true_def_pf_req_callback(void* UU(brtnode_pv), void* UU(read_extraargs)) { return TRUE; } -static int true_def_pf_callback(void* UU(brtnode_pv), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) { +static int true_def_pf_callback(void* UU(brtnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) { *sizep = make_pair_attr(8); return 0; } @@ -85,33 +87,33 @@ run_test (void) { // because the PAIR was not in the cachetable. // is_fake_locked = TRUE; - r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); assert(r==TOKUDB_TRY_AGAIN); assert(is_fake_locked); // now it should succeed - r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); assert(r==0); assert(is_fake_locked); foo = FALSE; cachefile_kibbutz_enq(f1, kibbutz_work, f1); // because node is in use, should return TOKUDB_TRY_AGAIN assert(is_fake_locked); - r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); assert(is_fake_locked); assert(r==TOKUDB_TRY_AGAIN); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); assert(foo); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); // now make sure we get TOKUDB_TRY_AGAIN when a partial fetch is involved assert(is_fake_locked); // first make sure value is there - r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); assert(is_fake_locked); assert(r==0); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); // now make sure that we get TOKUDB_TRY_AGAIN for the partial fetch - r = 
toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, true_def_pf_req_callback, true_def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, true_def_pf_req_callback, true_def_pf_callback, TRUE, NULL, NULL); assert(is_fake_locked); assert(r==TOKUDB_TRY_AGAIN); @@ -119,13 +121,13 @@ run_test (void) { // now test that if there is a checkpoint pending, // first pin and unpin with dirty // - r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); assert(is_fake_locked); assert(r==0); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); assert(r==0); // this should mark the PAIR as pending r = toku_cachetable_begin_checkpoint(ct, NULL); assert(r == 0); - r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL, NULL); + r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL, NULL); assert(is_fake_locked); assert(r==TOKUDB_TRY_AGAIN); my_ydb_unlock(); diff --git a/newbrt/tests/cachetable-simple-pin.c b/newbrt/tests/cachetable-simple-pin.c index ce810030675..301038f59d0 100644 --- a/newbrt/tests/cachetable-simple-pin.c +++ b/newbrt/tests/cachetable-simple-pin.c @@ -16,12 +16,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -61,16 +63,16 @@ run_test (void) { //long s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); foo = FALSE; cachefile_kibbutz_enq(f1, kibbutz_work, f1); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); assert(foo); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); //now let's do a simple checkpoint test // first dirty the PAIR - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); // now this should mark the pair for checkpoint @@ -81,7 +83,7 @@ run_test (void) { // check_me = TRUE; flush_called = FALSE; - r = 
toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); assert(flush_called); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); diff --git a/newbrt/tests/cachetable-simple-put-dep-nodes.c b/newbrt/tests/cachetable-simple-put-dep-nodes.c index 7a24d1bf851..ec06094cdef 100644 --- a/newbrt/tests/cachetable-simple-put-dep-nodes.c +++ b/newbrt/tests/cachetable-simple-put-dep-nodes.c @@ -18,12 +18,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { /* Do nothing */ if (verbose) { printf("FLUSH: %d\n", (int)k.b); } @@ -50,6 +52,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void *extraargs __attribute__((__unused__)) @@ -84,9 +87,9 @@ cachetable_test (BOOL write_first, BOOL write_second, BOOL start_checkpoint) { long s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); wc.flush_callback = flush; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, &val1); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, &val1); assert(r==0); - r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, &val2); + r = toku_cachetable_get_and_pin(f1, make_blocknum(2), 2, &v2, &s2, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, &val2); assert(r==0); CACHEFILE dependent_cfs[2]; diff --git a/newbrt/tests/cachetable-simple-unpin-remove-checkpoint.c b/newbrt/tests/cachetable-simple-unpin-remove-checkpoint.c index dd64d80b5a4..c0c12ecdec7 100644 --- a/newbrt/tests/cachetable-simple-unpin-remove-checkpoint.c +++ b/newbrt/tests/cachetable-simple-unpin-remove-checkpoint.c @@ -37,7 +37,7 @@ cachetable_test (void) { long s1; //long s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_begin_checkpoint(ct, NULL); assert(r == 0); r = toku_cachetable_unpin_and_remove(f1, make_blocknum(1), remove_key_expect_checkpoint, NULL); r = toku_cachetable_end_checkpoint( @@ -50,7 +50,7 @@ cachetable_test (void) { ); assert(r==0); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin_and_remove(f1, make_blocknum(1), remove_key_expect_no_checkpoint, NULL); diff --git 
a/newbrt/tests/cachetable-simple-verify.c b/newbrt/tests/cachetable-simple-verify.c index d0d806b57e9..a731743f2aa 100644 --- a/newbrt/tests/cachetable-simple-verify.c +++ b/newbrt/tests/cachetable-simple-verify.c @@ -19,7 +19,7 @@ cachetable_test (void) { long s1; //long s2; CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); toku_cachetable_verify(ct); r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); assert(r == 0 && f1 == 0); diff --git a/newbrt/tests/cachetable-test.c b/newbrt/tests/cachetable-test.c index 95bc5db7734..2d4a62f7a5b 100644 --- a/newbrt/tests/cachetable-test.c +++ b/newbrt/tests/cachetable-test.c @@ -91,12 +91,15 @@ static void flush (CACHEFILE f, int UU(fd), CACHEKEY key, void*value, + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)), - BOOL for_checkpoint __attribute__((__unused__))) { + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) + ) { struct item *it = value; int i; @@ -132,7 +135,7 @@ static struct item *make_item (u_int64_t key) { } static CACHEKEY did_fetch={-1}; -static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void*extraargs) { +static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void*extraargs) { if (verbose) printf("Fetch %" PRId64 "\n", key.b); assert (expect_f==f); assert((long)extraargs==23); @@ -232,7 +235,7 @@ static void test0 (void) { { void *item_v=0; expect_init(); - r=toku_cachetable_get_and_pin(f, make_blocknum(5), toku_cachetable_hash(f, make_blocknum(5)), &item_v, NULL, wc, fetch, def_pf_req_callback, def_pf_callback, t3); /* 5P 7U 6P 4P 1P */ + r=toku_cachetable_get_and_pin(f, make_blocknum(5), toku_cachetable_hash(f, make_blocknum(5)), &item_v, NULL, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, t3); /* 5P 7U 6P 4P 1P */ assert(r==0); assert(((struct item *)item_v)->key.b==5); assert(strcmp(((struct item *)item_v)->something,"something")==0); @@ -249,7 +252,7 @@ static void test0 (void) { did_fetch=make_blocknum(-1); CACHETABLE_WRITE_CALLBACK wc2 = def_write_callback(t3); wc2.flush_callback = flush; - r=toku_cachetable_get_and_pin(f, make_blocknum(2), toku_cachetable_hash(f, make_blocknum(2)), &item_v, NULL, wc2, fetch, def_pf_req_callback, def_pf_callback, t3); /* 2p 5P 7U 6P 1P */ + r=toku_cachetable_get_and_pin(f, make_blocknum(2), toku_cachetable_hash(f, make_blocknum(2)), &item_v, NULL, wc2, fetch, def_pf_req_callback, def_pf_callback, TRUE, t3); /* 2p 5P 7U 6P 1P */ assert(r==0); assert(did_fetch.b==2); /* Expect that 2 is fetched in. 
*/ assert(((struct item *)item_v)->key.b==2); @@ -290,17 +293,22 @@ static void test0 (void) { static void flush_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)), void *value, + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)), - BOOL for_checkpoint __attribute__ ((__unused__))) { + BOOL for_checkpoint __attribute__ ((__unused__)), + BOOL UU(is_clone) + ) { int *v = value; assert(*v==0); } static int fetch_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), - void**value, PAIR_ATTR *sizep __attribute__((__unused__)), + void**value, + void** UU(dd), +PAIR_ATTR *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) { assert((long)extraargs==42); *value=0; @@ -333,7 +341,7 @@ static void test_nested_pin (void) { r = toku_cachetable_put(f, make_blocknum(1), f1hash, &i0, make_pair_attr(1), wc); assert(r==0); r = toku_cachetable_unpin(f, make_blocknum(1), f1hash, CACHETABLE_CLEAN, make_pair_attr(test_object_size)); - r = toku_cachetable_get_and_pin(f, make_blocknum(1), f1hash, &vv, NULL, wc, fetch_n, def_pf_req_callback, def_pf_callback, f2); + r = toku_cachetable_get_and_pin(f, make_blocknum(1), f1hash, &vv, NULL, wc, fetch_n, def_pf_req_callback, def_pf_callback, TRUE, f2); assert(r==0); assert(vv==&i0); assert(i0==0); @@ -359,15 +367,20 @@ static void null_flush (CACHEFILE cf __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)), - BOOL for_checkpoint __attribute__((__unused__))) { + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) + ) { } -static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, PAIR_ATTR *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) { +static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, + void** UU(dd), +PAIR_ATTR *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) { assert(fullhash==toku_cachetable_hash(cf,key)); assert((long)extraargs==123); *value = (void*)((unsigned long)key.b+123L); @@ -376,7 +389,9 @@ static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullh return 0; } -static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, PAIR_ATTR *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) { +static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, + void** UU(dd), +PAIR_ATTR *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) { assert(fullhash==toku_cachetable_hash(cf,key)); assert((long)extraargs==222); *value = (void*)((unsigned long)key.b+222L); @@ -411,12 +426,12 @@ static void test_multi_filehandles (void) { wc.flush_callback = null_flush; r = toku_cachetable_put(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), (void*)124, make_pair_attr(test_object_size), wc); assert(r==0); r = toku_cachetable_unpin(f1, make_blocknum(1), toku_cachetable_hash(f1, 
make_blocknum(1)), CACHETABLE_DIRTY, make_pair_attr(0)); assert(r==0); - r = toku_cachetable_get_and_pin(f2, make_blocknum(1), toku_cachetable_hash(f2, make_blocknum(1)), &v, NULL, wc, add123_fetch, def_pf_req_callback, def_pf_callback, (void*)123); assert(r==0); + r = toku_cachetable_get_and_pin(f2, make_blocknum(1), toku_cachetable_hash(f2, make_blocknum(1)), &v, NULL, wc, add123_fetch, def_pf_req_callback, def_pf_callback, TRUE, (void*)123); assert(r==0); assert((unsigned long)v==124); - r = toku_cachetable_get_and_pin(f2, make_blocknum(2), toku_cachetable_hash(f2, make_blocknum(2)), &v, NULL, wc, add123_fetch, def_pf_req_callback, def_pf_callback, (void*)123); assert(r==0); + r = toku_cachetable_get_and_pin(f2, make_blocknum(2), toku_cachetable_hash(f2, make_blocknum(2)), &v, NULL, wc, add123_fetch, def_pf_req_callback, def_pf_callback, TRUE, (void*)123); assert(r==0); assert((unsigned long)v==125); wc.write_extraargs = (void*)222; - r = toku_cachetable_get_and_pin(f3, make_blocknum(2), toku_cachetable_hash(f3, make_blocknum(2)), &v, NULL, wc, add222_fetch, def_pf_req_callback, def_pf_callback, (void*)222); assert(r==0); + r = toku_cachetable_get_and_pin(f3, make_blocknum(2), toku_cachetable_hash(f3, make_blocknum(2)), &v, NULL, wc, add222_fetch, def_pf_req_callback, def_pf_callback, TRUE, (void*)222); assert(r==0); assert((unsigned long)v==224); r = toku_cachetable_unpin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), CACHETABLE_CLEAN, make_pair_attr(0)); assert(r==0); @@ -439,16 +454,21 @@ static void test_dirty_flush(CACHEFILE f, int UU(fd), CACHEKEY key, void *value, + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size, PAIR_ATTR* new_size __attribute__((__unused__)), BOOL do_write, BOOL keep, - BOOL for_checkpoint __attribute__((__unused__))) { + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) + ) { if (verbose) printf("test_dirty_flush %p %" PRId64 " %p %ld %u %u\n", f, key.b, value, size.size, (unsigned)do_write, (unsigned)keep); } -static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, PAIR_ATTR *size_ptr, int * dirtyp, void *arg) { +static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, + void** UU(dd), +PAIR_ATTR *size_ptr, int * dirtyp, void *arg) { *value_ptr = arg; *dirtyp = 0; *size_ptr = make_pair_attr(0); @@ -495,7 +515,7 @@ static void test_dirty(void) { assert(pinned == 0); r = toku_cachetable_get_and_pin(f, key, hkey, &value, NULL, wc, - test_dirty_fetch, def_pf_req_callback, def_pf_callback, 0); + test_dirty_fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); assert(r == 0); // cachetable_print_state(t); @@ -517,7 +537,7 @@ static void test_dirty(void) { hkey = toku_cachetable_hash(f, key); r = toku_cachetable_get_and_pin(f, key, hkey, &value, NULL, wc, - test_dirty_fetch, def_pf_req_callback, def_pf_callback, 0); + test_dirty_fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); assert(r == 0); // cachetable_print_state(t); @@ -537,7 +557,7 @@ static void test_dirty(void) { r = toku_cachetable_get_and_pin(f, key, hkey, &value, NULL, wc, - test_dirty_fetch, def_pf_req_callback, def_pf_callback, 0); + test_dirty_fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); assert(r == 0); // cachetable_print_state(t); @@ -568,12 +588,15 @@ static void test_size_flush_callback(CACHEFILE f, int UU(fd), CACHEKEY key, void *value, + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size, PAIR_ATTR* 
new_size __attribute__((__unused__)), BOOL do_write, BOOL keep, - BOOL for_checkpoint __attribute__((__unused__))) { + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) + ) { if (test_size_debug && verbose) printf("test_size_flush %p %" PRId64 " %p %ld %u %u\n", f, key.b, value, size.size, (unsigned)do_write, (unsigned)keep); if (keep) { if (do_write) { @@ -628,7 +651,7 @@ static void test_size_resize(void) { void *current_value; long current_size; - r = toku_cachetable_get_and_pin(f, key, hkey, &current_value, &current_size, wc, 0, def_pf_req_callback, def_pf_callback, 0); + r = toku_cachetable_get_and_pin(f, key, hkey, &current_value, &current_size, wc, 0, def_pf_req_callback, def_pf_callback, TRUE, 0); assert(r == 0); assert(current_value == value); assert(current_size == new_size); diff --git a/newbrt/tests/cachetable-test2.c b/newbrt/tests/cachetable-test2.c index bc32b06af98..eb970f3779c 100644 --- a/newbrt/tests/cachetable-test2.c +++ b/newbrt/tests/cachetable-test2.c @@ -97,12 +97,15 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key, void *value, + void** UU(dd), void *extra __attribute__((__unused__)), PAIR_ATTR size __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)), - BOOL for_checkpoint __attribute__((__unused__))) { + BOOL for_checkpoint __attribute__((__unused__)), + BOOL UU(is_clone) + ) { if (keep_me) return; int *v = value; //toku_cachetable_print_state(ct); @@ -112,7 +115,9 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)), //print_ints(); } -static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, PAIR_ATTR *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) { +static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, + void** UU(dd), +PAIR_ATTR *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) { assert(toku_cachetable_hash(f, key)==fullhash); assert((long)extraargs==(long)key.b); *value = (void*)(long)key.b; @@ -197,6 +202,7 @@ static void test_chaining (void) { fetch_forchain, def_pf_req_callback, def_pf_callback, + TRUE, (void*)(long)whichkey.b ); assert(r==0); diff --git a/newbrt/tests/cachetable-unpin-and-remove-test.c b/newbrt/tests/cachetable-unpin-and-remove-test.c index be80672462a..fde20d2bb68 100644 --- a/newbrt/tests/cachetable-unpin-and-remove-test.c +++ b/newbrt/tests/cachetable-unpin-and-remove-test.c @@ -9,6 +9,7 @@ fetch (CACHEFILE f __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)), u_int32_t fullhash __attribute__((__unused__)), void **value __attribute__((__unused__)), + void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp __attribute__((__unused__)), void *extraargs __attribute__((__unused__)) @@ -113,7 +114,7 @@ cachetable_put_evict_remove_test (int n) { // get 0 void *v; long s; - r = toku_cachetable_get_and_pin(f1, make_blocknum(0), hi[0], &v, &s, wc, fetch, def_pf_req_callback, def_pf_callback, 0); + r = toku_cachetable_get_and_pin(f1, make_blocknum(0), hi[0], &v, &s, wc, fetch, def_pf_req_callback, def_pf_callback, TRUE, 0); assert(r == 0); // remove 0 diff --git a/newbrt/tests/cachetable-unpin-remove-and-checkpoint.c b/newbrt/tests/cachetable-unpin-remove-and-checkpoint.c index 7780deed7ec..3a2d3721722 100644 --- a/newbrt/tests/cachetable-unpin-remove-and-checkpoint.c +++
b/newbrt/tests/cachetable-unpin-remove-and-checkpoint.c @@ -39,7 +39,7 @@ run_test (void) { //void* v2; long s1; //long s2; - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); toku_cachetable_unpin( f1, make_blocknum(1), @@ -50,7 +50,7 @@ run_test (void) { // now this should mark the pair for checkpoint r = toku_cachetable_begin_checkpoint(ct, NULL); - r = toku_cachetable_get_and_pin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, NULL); + r = toku_cachetable_get_and_pin(f1, make_blocknum(1), toku_cachetable_hash(f1, make_blocknum(1)), &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL); toku_pthread_t mytid; r = toku_pthread_create(&mytid, NULL, run_end_chkpt, NULL); diff --git a/newbrt/tests/cachetable-writer-thread-limit.c b/newbrt/tests/cachetable-writer-thread-limit.c index f38770eb1ae..103c3d99410 100644 --- a/newbrt/tests/cachetable-writer-thread-limit.c +++ b/newbrt/tests/cachetable-writer-thread-limit.c @@ -13,12 +13,14 @@ flush (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY k __attribute__((__unused__)), void *v __attribute__((__unused__)), + void** UU(dd), void *e __attribute__((__unused__)), PAIR_ATTR s __attribute__((__unused__)), PAIR_ATTR* new_size __attribute__((__unused__)), BOOL w __attribute__((__unused__)), BOOL keep __attribute__((__unused__)), - BOOL c __attribute__((__unused__)) + BOOL c __attribute__((__unused__)), + BOOL UU(is_clone) ) { if (w) { int curr_size = __sync_fetch_and_add(&total_size, -1); diff --git a/newbrt/tests/test-checkpoint-during-flush.c b/newbrt/tests/test-checkpoint-during-flush.c index cae9fe3e919..9d26953e033 100644 --- a/newbrt/tests/test-checkpoint-during-flush.c +++ b/newbrt/tests/test-checkpoint-during-flush.c @@ -145,6 +145,7 @@ doit (BOOL after_child_pin) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -163,6 +164,7 @@ doit (BOOL after_child_pin) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -199,6 +201,7 @@ doit (BOOL after_child_pin) { node_root, toku_cachetable_hash(c_brt->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -219,6 +222,7 @@ doit (BOOL after_child_pin) { node_leaf, toku_cachetable_hash(c_brt->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node diff --git a/newbrt/tests/test-checkpoint-during-merge.c b/newbrt/tests/test-checkpoint-during-merge.c index 0d453044ef7..4fdee6446d2 100644 --- a/newbrt/tests/test-checkpoint-during-merge.c +++ b/newbrt/tests/test-checkpoint-during-merge.c @@ -163,6 +163,7 @@ doit (int state) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -180,6 +181,7 @@ doit (int state) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -218,6 +220,7 @@ doit (int state) { node_root, toku_cachetable_hash(c_brt->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -247,6 +250,7 @@ doit (int state) { left_child, toku_cachetable_hash(c_brt->h->cf, left_child), &bfe, + TRUE, 0, NULL, &node @@ -262,6 +266,7 @@ doit (int state) { right_child, toku_cachetable_hash(c_brt->h->cf, right_child), &bfe, + TRUE, 0, NULL, &node @@ -278,6 +283,7 @@ doit (int state) { 
left_child, toku_cachetable_hash(c_brt->h->cf, left_child), &bfe, + TRUE, 0, NULL, &node diff --git a/newbrt/tests/test-checkpoint-during-rebalance.c b/newbrt/tests/test-checkpoint-during-rebalance.c index 7443f7cd1b9..3eda53b9e43 100644 --- a/newbrt/tests/test-checkpoint-during-rebalance.c +++ b/newbrt/tests/test-checkpoint-during-rebalance.c @@ -183,6 +183,7 @@ doit (int state) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -200,6 +201,7 @@ doit (int state) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -238,6 +240,7 @@ doit (int state) { node_root, toku_cachetable_hash(c_brt->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -258,6 +261,7 @@ doit (int state) { left_child, toku_cachetable_hash(c_brt->h->cf, left_child), &bfe, + TRUE, 0, NULL, &node @@ -273,6 +277,7 @@ doit (int state) { right_child, toku_cachetable_hash(c_brt->h->cf, right_child), &bfe, + TRUE, 0, NULL, &node diff --git a/newbrt/tests/test-checkpoint-during-split.c b/newbrt/tests/test-checkpoint-during-split.c index 647b071c446..5fe299a703e 100644 --- a/newbrt/tests/test-checkpoint-during-split.c +++ b/newbrt/tests/test-checkpoint-during-split.c @@ -159,6 +159,7 @@ doit (BOOL after_split) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -176,6 +177,7 @@ doit (BOOL after_split) { node_root, toku_cachetable_hash(t->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -214,6 +216,7 @@ doit (BOOL after_split) { node_root, toku_cachetable_hash(c_brt->h->cf, node_root), &bfe, + TRUE, 0, NULL, &node @@ -241,6 +244,7 @@ doit (BOOL after_split) { left_child, toku_cachetable_hash(c_brt->h->cf, left_child), &bfe, + TRUE, 0, NULL, &node @@ -256,6 +260,7 @@ doit (BOOL after_split) { right_child, toku_cachetable_hash(c_brt->h->cf, right_child), &bfe, + TRUE, 0, NULL, &node @@ -272,6 +277,7 @@ doit (BOOL after_split) { left_child, toku_cachetable_hash(c_brt->h->cf, left_child), &bfe, + TRUE, 0, NULL, &node diff --git a/newbrt/tests/test-dirty-flushes-on-cleaner.c b/newbrt/tests/test-dirty-flushes-on-cleaner.c index f108044b573..a4f6106b9cf 100644 --- a/newbrt/tests/test-dirty-flushes-on-cleaner.c +++ b/newbrt/tests/test-dirty-flushes-on-cleaner.c @@ -166,6 +166,7 @@ doit (void) { node_leaf, toku_cachetable_hash(brt->h->cf, node_leaf), &bfe, + TRUE, 0, NULL, &node @@ -194,6 +195,7 @@ doit (void) { node_leaf, toku_cachetable_hash(brt->h->cf, node_leaf), &bfe, + TRUE, 0, NULL, &node @@ -213,6 +215,7 @@ doit (void) { node_internal, toku_cachetable_hash(brt->h->cf, node_internal), &bfe, + TRUE, 0, NULL, &node @@ -236,6 +239,7 @@ doit (void) { node_internal, toku_cachetable_hash(brt->h->cf, node_internal), &bfe, + TRUE, 0, NULL, &node diff --git a/newbrt/tests/test-flushes-on-cleaner.c b/newbrt/tests/test-flushes-on-cleaner.c index 8946b75a7ba..4cabc901e7a 100644 --- a/newbrt/tests/test-flushes-on-cleaner.c +++ b/newbrt/tests/test-flushes-on-cleaner.c @@ -171,6 +171,7 @@ doit (void) { node_leaf, toku_cachetable_hash(brt->h->cf, node_leaf), &bfe, + TRUE, 0, NULL, &node @@ -206,6 +207,7 @@ doit (void) { node_leaf, toku_cachetable_hash(brt->h->cf, node_leaf), &bfe, + TRUE, 0, NULL, &node @@ -225,6 +227,7 @@ doit (void) { node_internal, toku_cachetable_hash(brt->h->cf, node_internal), &bfe, + TRUE, 0, NULL, &node @@ -248,6 +251,7 @@ doit (void) { node_internal, toku_cachetable_hash(brt->h->cf, node_internal), &bfe, + TRUE, 0, NULL, &node diff --git a/newbrt/tests/test-merges-on-cleaner.c b/newbrt/tests/test-merges-on-cleaner.c 
index a4057785c4b..49c1a99fc47 100644
--- a/newbrt/tests/test-merges-on-cleaner.c
+++ b/newbrt/tests/test-merges-on-cleaner.c
@@ -158,6 +158,7 @@ doit (void) {
         node_internal,
         toku_cachetable_hash(brt->h->cf, node_internal),
         &bfe,
+        TRUE,
         0,
         NULL,
         &node
@@ -180,6 +181,7 @@ doit (void) {
         node_internal,
         toku_cachetable_hash(brt->h->cf, node_internal),
         &bfe,
+        TRUE,
         0,
         NULL,
         &node
diff --git a/newbrt/tests/test.h b/newbrt/tests/test.h
index 33d98e441ec..3097177fad8 100644
--- a/newbrt/tests/test.h
+++ b/newbrt/tests/test.h
@@ -125,18 +125,21 @@ def_flush (CACHEFILE f __attribute__((__unused__)),
        int UU(fd),
        CACHEKEY k __attribute__((__unused__)),
        void *v __attribute__((__unused__)),
+       void **dd __attribute__((__unused__)),
        void *e __attribute__((__unused__)),
        PAIR_ATTR s __attribute__((__unused__)),
        PAIR_ATTR* new_size __attribute__((__unused__)),
        BOOL w __attribute__((__unused__)),
        BOOL keep __attribute__((__unused__)),
-       BOOL c __attribute__((__unused__))
+       BOOL c __attribute__((__unused__)),
+       BOOL UU(is_clone)
        ) {
 }
 
 static UU() void
 def_pe_est_callback(
-    void* UU(brtnode_pv),
+    void* UU(brtnode_pv),
+    void* UU(dd),
     long* bytes_freed_estimate,
     enum partial_eviction_cost *cost,
     void* UU(write_extraargs)
@@ -162,7 +165,7 @@ static UU() BOOL def_pf_req_callback(void* UU(brtnode_pv), void* UU(read_extraar
     return FALSE;
 }
 
-static UU() int def_pf_callback(void* UU(brtnode_pv), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) {
+ static UU() int def_pf_callback(void* UU(brtnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* UU(sizep)) {
     assert(FALSE);
     return 0;
 }
@@ -173,6 +176,7 @@ def_fetch (CACHEFILE f __attribute__((__unused__)),
        CACHEKEY k __attribute__((__unused__)),
        u_int32_t fullhash __attribute__((__unused__)),
        void **value __attribute__((__unused__)),
+       void **dd __attribute__((__unused__)),
        PAIR_ATTR *sizep __attribute__((__unused__)),
        int *dirtyp,
        void *extraargs __attribute__((__unused__))
@@ -203,6 +207,7 @@ static UU() CACHETABLE_WRITE_CALLBACK def_write_callback(void* write_extraargs)
     wc.pe_callback = def_pe_callback;
     wc.cleaner_callback = def_cleaner_callback;
     wc.write_extraargs = write_extraargs;
+    wc.clone_callback = NULL;
     return wc;
 }
diff --git a/newbrt/tests/test4244.c b/newbrt/tests/test4244.c
index e53b0ba725f..bcb6c86757b 100644
--- a/newbrt/tests/test4244.c
+++ b/newbrt/tests/test4244.c
@@ -75,6 +75,7 @@ doit (void) {
         node_internal,
         toku_cachetable_hash(t->h->cf, node_internal),
         &bfe,
+        TRUE,
         0,
         NULL,
         &node
diff --git a/newbrt/tests/test4302.c b/newbrt/tests/test4302.c
index b6c83d84017..04b0c6ff19d 100644
--- a/newbrt/tests/test4302.c
+++ b/newbrt/tests/test4302.c
@@ -11,12 +11,14 @@ flush (CACHEFILE f __attribute__((__unused__)),
        int UU(fd),
        CACHEKEY k __attribute__((__unused__)),
        void *v __attribute__((__unused__)),
+       void** UU(dd),
        void *e __attribute__((__unused__)),
        PAIR_ATTR s __attribute__((__unused__)),
        PAIR_ATTR* new_size __attribute__((__unused__)),
        BOOL w __attribute__((__unused__)),
        BOOL keep __attribute__((__unused__)),
-       BOOL c __attribute__((__unused__))
+       BOOL c __attribute__((__unused__)),
+       BOOL UU(is_clone)
        ) {
     /* Do nothing */
     if (verbose) { printf("FLUSH: %d\n", (int)k.b); }
@@ -28,6 +30,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
        CACHEKEY k __attribute__((__unused__)),
        u_int32_t fullhash __attribute__((__unused__)),
        void **value,
+       void** UU(dd),
        PAIR_ATTR *sizep,
        int *dirtyp,
        void *extraargs
@@ -41,6 +44,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
 static void
 pe_est_callback(
     void* UU(brtnode_pv),
+    void* UU(dd),
     long* bytes_freed_estimate,
     enum partial_eviction_cost *cost,
     void* UU(write_extraargs)
@@ -120,6 +124,7 @@ cachetable_test (void) {
         wc,
         fetch,
         def_pf_req_callback,
         def_pf_callback,
+        TRUE,
         &val1
         );
     r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
diff --git a/newbrt/workqueue.c b/newbrt/workqueue.c
index 2489c8921b3..c9da2dac42a 100644
--- a/newbrt/workqueue.c
+++ b/newbrt/workqueue.c
@@ -16,10 +16,12 @@
 // Create fixed number of worker threads, all waiting on a single queue
 // of work items (WORKQUEUE).
 
-void toku_init_workers(WORKQUEUE wq, THREADPOOL *tpptr) {
+void toku_init_workers(WORKQUEUE wq, THREADPOOL *tpptr, int fraction) {
     workqueue_init(wq);
+    assert(fraction > 0);
     int nprocs = toku_os_get_number_active_processors();
-    int nthreads = nprocs*2;
+    int nthreads = (nprocs*2)/fraction;
+    if (nthreads == 0) nthreads = 1;
     toku_thread_pool_create(tpptr, nthreads);
     toku_thread_pool_run(*tpptr, 0, &nthreads, toku_worker, wq);
 }
diff --git a/newbrt/workqueue.h b/newbrt/workqueue.h
index 59e68d3331f..4d6597204cb 100644
--- a/newbrt/workqueue.h
+++ b/newbrt/workqueue.h
@@ -205,7 +205,9 @@ static int workqueue_n_in_queue (WORKQUEUE wq, int dolock) {
 #include "threadpool.h"
 
 // initialize the work queue and worker
-void toku_init_workers(WORKQUEUE wq, THREADPOOL *tpptr);
+void toku_init_workers(WORKQUEUE wq, THREADPOOL *tpptr, int fraction);
+
+void toku_init_workers_with_num_threads(WORKQUEUE wq, THREADPOOL *tpptr, int num_threads);
 
 // destroy the work queue and worker
 void toku_destroy_workers(WORKQUEUE wq, THREADPOOL *tpptr);
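The workqueue change above makes the default pool size tunable: toku_init_workers still starts from two worker threads per processor, but now divides that count by the caller-supplied fraction and clamps the result to at least one thread. A minimal standalone sketch of just that sizing rule; nprocs stands in for toku_os_get_number_active_processors(), and the compute_nthreads helper and main() harness are illustrative, not part of the patch:

    #include <assert.h>
    #include <stdio.h>

    /* Sizing rule from toku_init_workers above: (nprocs*2)/fraction, minimum 1. */
    static int compute_nthreads(int nprocs, int fraction) {
        assert(fraction > 0);             /* mirrors the assert added in workqueue.c */
        int nthreads = (nprocs * 2) / fraction;
        if (nthreads == 0) nthreads = 1;  /* integer division can round down to zero */
        return nthreads;
    }

    int main(void) {
        printf("%d\n", compute_nthreads(8, 1));  /* 16: fraction==1 keeps the old behavior */
        printf("%d\n", compute_nthreads(8, 4));  /* 4: a quarter-sized pool */
        printf("%d\n", compute_nthreads(1, 4));  /* 1: clamped, never zero */
        return 0;
    }

Callers that need an exact count rather than a fraction of the default can use the toku_init_workers_with_num_threads entry point declared in workqueue.h above; its definition is not shown in this hunk.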
get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_FETCHED_PREFETCH") + + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_FETCHED_WRITE"); + + num_pivots_fetched = + get_engine_status_val(env, "BRT_NUM_PIVOTS_FETCHED_QUERY") + + get_engine_status_val(env, "BRT_NUM_PIVOTS_FETCHED_PREFETCH") + + get_engine_status_val(env, "BRT_NUM_PIVOTS_FETCHED_WRITE"); + + printf("basements decompressed %"PRIu64" \n", num_basements_decompressed - old_num_basements_decompressed); + printf("buffers decompressed %"PRIu64" \n", num_buffers_decompressed- old_num_buffers_decompressed); + printf("basements fetched %"PRIu64" \n", num_basements_fetched - old_num_basements_fetched); + printf("buffers fetched %"PRIu64" \n", num_buffers_fetched - old_num_buffers_fetched); + printf("pivots fetched %"PRIu64" \n", num_pivots_fetched - old_num_pivots_fetched); + printf("************************************************************\n"); +} + +static void checkpoint_callback_2(void * extra) { + DB_ENV* env = extra; + num_basements_decompressed = + get_engine_status_val(env, "BRT_NUM_BASEMENTS_DECOMPRESSED_NORMAL") + + get_engine_status_val(env, "BRT_NUM_BASEMENTS_DECOMPRESSED_AGGRESSIVE") + + get_engine_status_val(env, "BRT_NUM_BASEMENTS_DECOMPRESSED_PREFETCH") + + get_engine_status_val(env, "BRT_NUM_BASEMENTS_DECOMPRESSED_WRITE"); + + num_buffers_decompressed = + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_DECOMPRESSED_NORMAL") + + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_DECOMPRESSED_AGGRESSIVE") + + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_DECOMPRESSED_PREFETCH") + + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_DECOMPRESSED_WRITE"); + + num_basements_fetched = + get_engine_status_val(env, "BRT_NUM_BASEMENTS_FETCHED_NORMAL") + + get_engine_status_val(env, "BRT_NUM_BASEMENTS_FETCHED_AGGRESSIVE") + + get_engine_status_val(env, "BRT_NUM_BASEMENTS_FETCHED_PREFETCH") + + get_engine_status_val(env, "BRT_NUM_BASEMENTS_FETCHED_WRITE"); + + num_buffers_fetched = + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_FETCHED_NORMAL") + + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_FETCHED_AGGRESSIVE") + + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_FETCHED_PREFETCH") + + get_engine_status_val(env, "BRT_NUM_MSG_BUFFER_FETCHED_WRITE"); + + num_pivots_fetched = + get_engine_status_val(env, "BRT_NUM_PIVOTS_FETCHED_QUERY") + + get_engine_status_val(env, "BRT_NUM_PIVOTS_FETCHED_PREFETCH") + + get_engine_status_val(env, "BRT_NUM_PIVOTS_FETCHED_WRITE"); +} + + + // // This test is a form of stress that does operations on a single dictionary: // We create a dictionary bigger than the cachetable (around 4x greater). @@ -69,6 +154,8 @@ cleanup: static void stress_table(DB_ENV* env, DB** dbp, struct cli_args *cli_args) { + db_env_set_checkpoint_callback(checkpoint_callback_1, env); + db_env_set_checkpoint_callback2(checkpoint_callback_2, env); int n = cli_args->num_elements; // // the threads that we want: