Merge branch '10.0' of github.com:MariaDB/server into 10.0

Author: Sergei Petrunia
Date:   2016-09-28 16:19:58 +03:00
Commit: 23af6f5942

162 changed files with 4967 additions and 4289 deletions

View file

@ -10,6 +10,7 @@ Visma http://visma.com (2015 - 2016)
Acronis http://acronis.com (2016)
Nexedi https://www.nexedi.com (2016)
Automattic https://automattic.com (2014 - 2016)
Tencent Game DBA http://tencentdba.com/about (2016)
Verkkokauppa.com https://www.verkkokauppa.com (2015 - 2016)
Virtuozzo https://virtuozzo.com (2016)

View file

@ -1,3 +1,3 @@
MYSQL_VERSION_MAJOR=10
MYSQL_VERSION_MINOR=0
MYSQL_VERSION_PATCH=27
MYSQL_VERSION_PATCH=28

View file

@ -220,6 +220,9 @@ SETA(CPACK_RPM_test_PACKAGE_PROVIDES
"perl(mtr_io.pl)"
"perl(mtr_match)"
"perl(mtr_misc.pl)"
"perl(mtr_gcov.pl)"
"perl(mtr_gprof.pl)"
"perl(mtr_process.pl)"
"perl(mtr_report)"
"perl(mtr_results)"
"perl(mtr_unique)")

View file

@ -882,8 +882,7 @@ typedef long long my_ptrdiff_t;
and related routines are refactored.
*/
#define my_offsetof(TYPE, MEMBER) \
((size_t)((char *)&(((TYPE *)0x10)->MEMBER) - (char*)0x10))
#define my_offsetof(TYPE, MEMBER) PTR_BYTE_DIFF(&((TYPE *)0x10)->MEMBER, 0x10)
#define NullS (char *) 0
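
A minimal stand-alone sketch of the pointer trick both the old and the new
macro rely on (demo names, not part of the patch): pretend an object sits at
a fake non-null address, take the member's address, and subtract the base.
Using 0x10 rather than 0 avoids undefined null-pointer arithmetic, and the
PTR_BYTE_DIFF form computes the same byte difference.

#include <stddef.h>

/* Hypothetical demo macro mirroring my_offsetof() */
#define demo_offsetof(TYPE, MEMBER) \
  ((size_t)((char *)&(((TYPE *)0x10)->MEMBER) - (char *)0x10))

struct example { int a; double b; };
/* demo_offsetof(struct example, b) == offsetof(struct example, b) */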

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
Copyright (c) 2010, 2013, Monty Program Ab.
Copyright (c) 2010, 2016, Monty Program Ab.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -271,7 +271,7 @@ extern my_bool my_use_symdir;
extern ulong my_default_record_cache_size;
extern my_bool my_disable_locking, my_disable_async_io,
my_disable_flush_key_blocks, my_disable_symlinks;
extern my_bool my_disable_sync;
extern my_bool my_disable_sync, my_disable_copystat_in_redel;
extern char wild_many,wild_one,wild_prefix;
extern const char *charsets_dir;
extern my_bool timed_mutexes;

View file

@ -52,7 +52,7 @@ eval SELECT 'hello' INTO OUTFILE 'fake_file.$prefix';
# Use '/' instead of '\' in the error message. On windows platform, dir is
# formed with '\'.
--replace_regex /\\testing_1\\*/\/testing_1\// /66/39/ /17/39/ /File exists/Directory not empty/
--replace_regex /\\testing_1\\*/\/testing_1\// /66/39/ /17/39/ /247/39/ /File exists/Directory not empty/
--error 1010
DROP DATABASE testing_1;
let $wait_binlog_event= DROP TABLE IF EXIST;

View file

@ -341,6 +341,7 @@ while ($1)
alter table t1 add index i2(key2);
alter table t1 add index i3(key3);
update t1 set key2=key1,key3=key1;
analyze table t1;
# to test the bug, the following must use "sort_union":
--replace_column 9 REF

View file

@ -261,11 +261,7 @@ sub show {
# On Windows, rely on cdb to be there...
if (IS_WINDOWS)
{
# Starting cdb is unsafe when used with --parallel > 1 option
if ( $parallel < 2 )
{
_cdb($core_name);
}
_cdb($core_name);
return;
}

View file

@ -60,8 +60,6 @@ use My::Test;
use My::Find;
use My::Suite;
require "mtr_misc.pl";
# locate plugin suites, depending on whether it's a build tree or installed
my @plugin_suitedirs;
my $plugin_suitedir_regex;
@ -1122,7 +1120,7 @@ sub get_tags_from_file($$) {
$file_to_tags{$file}= $tags;
$file_to_master_opts{$file}= $master_opts;
$file_to_slave_opts{$file}= $slave_opts;
$file_combinations{$file}= [ uniq(@combinations) ];
$file_combinations{$file}= [ ::uniq(@combinations) ];
$file_in_overlay{$file} = 1 if $in_overlay;
return @{$tags};
}

View file

@ -34,7 +34,6 @@ use mtr_match;
use My::Platform;
use POSIX qw[ _exit ];
use IO::Handle qw[ flush ];
require "mtr_io.pl";
use mtr_results;
my $tot_real_time= 0;
@ -92,7 +91,7 @@ sub mtr_report_test_passed ($) {
my $timer_str= "";
if ( $timer and -f "$::opt_vardir/log/timer" )
{
$timer_str= mtr_fromfile("$::opt_vardir/log/timer");
$timer_str= ::mtr_fromfile("$::opt_vardir/log/timer");
$tinfo->{timer}= $timer_str;
resfile_test_info('duration', $timer_str) if $::opt_resfile;
}

View file

@ -102,11 +102,11 @@ use mtr_results;
use IO::Socket::INET;
use IO::Select;
require "lib/mtr_process.pl";
require "lib/mtr_io.pl";
require "lib/mtr_gcov.pl";
require "lib/mtr_gprof.pl";
require "lib/mtr_misc.pl";
require "mtr_process.pl";
require "mtr_io.pl";
require "mtr_gcov.pl";
require "mtr_gprof.pl";
require "mtr_misc.pl";
$SIG{INT}= sub { mtr_error("Got ^C signal"); };
$SIG{HUP}= sub { mtr_error("Hangup detected on controlling terminal"); };

View file

@ -9,6 +9,7 @@ Acronis http://www.acronis.com Silver Sponsor of the MariaDB Foundation
Auttomattic https://automattic.com Bronze Sponsor of the MariaDB Foundation
Verkkokauppa.com https://virtuozzo.com Bronze Sponsor of the MariaDB Foundation
Virtuozzo https://virtuozzo.com/ Bronze Sponsor of the MariaDB Foundation
Tencent Game DBA http://tencentdba.com/about/ Bronze Sponsor of the MariaDB Foundation
Google USA Sponsoring encryption, parallel replication and GTID
Facebook USA Sponsoring non-blocking API, LIMIT ROWS EXAMINED etc
Ronald Bradford Brisbane, Australia EFF contribution for UC2006 Auction

View file

@ -1658,6 +1658,9 @@ CHAR_LENGTH(TRIM(BOTH 0x61 FROM _utf32 0x00000061))
SELECT CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061));
CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061))
1
select hex(lower(cast(0xffff0000 as char character set utf32))) as c;
c
FFFF0000
#
# End of 5.5 tests
#

View file

@ -286,3 +286,19 @@ F 28 28
F 29 29
F 30 30
DROP TABLE t0,t1,t2;
#
# MDEV-MariaDB daemon leaks memory with specific query
#
CREATE TABLE t1 (`voter_id` int(11) unsigned NOT NULL,
`language_id` int(11) unsigned NOT NULL DEFAULT '1'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE t2 (`voter_id` int(10) unsigned NOT NULL DEFAULT '0',
`serialized_c` mediumblob) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into t2 values (1,repeat("a",1000)),(2,repeat("a",1000)),(3,repeat("b",1000)),(4,repeat("c",1000)),(4,repeat("b",1000));
SELECT GROUP_CONCAT(t1.language_id SEPARATOR ',') AS `translation_resources`, `d`.`serialized_c` FROM t2 AS `d` LEFT JOIN t1 ON `d`.`voter_id` = t1.`voter_id` GROUP BY `d`.`voter_id` ORDER BY 10-d.voter_id+RAND()*0;
translation_resources serialized_c
NULL cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
NULL bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
NULL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
NULL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
drop table t1,t2;

View file

@ -311,6 +311,9 @@ set @d=@d*2;
alter table t1 add index i2(key2);
alter table t1 add index i3(key3);
update t1 set key2=key1,key3=key1;
analyze table t1;
Table Op Msg_type Msg_text
test.t1 analyze status OK
explain select * from t1 where (key3 > 30 and key3<35) or (key2 >32 and key2 < 40);
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 index_merge i2,i3 i3,i2 4,4 NULL REF Using sort_union(i3,i2); Using where

View file

@ -1146,6 +1146,9 @@ set @d=@d*2;
alter table t1 add index i2(key2);
alter table t1 add index i3(key3);
update t1 set key2=key1,key3=key1;
analyze table t1;
Table Op Msg_type Msg_text
test.t1 analyze status OK
explain select * from t1 where (key3 > 30 and key3<35) or (key2 >32 and key2 < 40);
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 index_merge i2,i3 i3,i2 4,4 NULL REF Using sort_union(i3,i2); Using where

View file

@ -3832,6 +3832,23 @@ test.m1 repair error Corrupt
# Clean-up.
drop tables m1, t1, t4;
drop view t3;
#
# MDEV-10424 - Assertion `ticket == __null' failed in
# MDL_request::set_type
#
CREATE TABLE t1 (f1 INT) ENGINE=MyISAM;
CREATE TABLE tmerge (f1 INT) ENGINE=MERGE UNION=(t1);
PREPARE stmt FROM "ANALYZE TABLE tmerge, t1";
EXECUTE stmt;
Table Op Msg_type Msg_text
test.tmerge analyze note The storage engine for the table doesn't support analyze
test.t1 analyze status Table is already up to date
EXECUTE stmt;
Table Op Msg_type Msg_text
test.tmerge analyze note The storage engine for the table doesn't support analyze
test.t1 analyze status Table is already up to date
DEALLOCATE PREPARE stmt;
DROP TABLE t1, tmerge;
End of 5.5 tests
#
# Additional coverage for refactoring which is made as part

View file

@ -4076,4 +4076,35 @@ id value
deallocate prepare stmt;
SET SESSION sql_mode = @save_sql_mode;
DROP TABLE t1,t2;
# End of 10.0 tests
#
# MDEV-8833: Crash of server on prepared statement with
# conversion to semi-join
#
CREATE TABLE t1 (column1 INT);
INSERT INTO t1 VALUES (3),(9);
CREATE TABLE t2 (column2 INT);
INSERT INTO t2 VALUES (1),(4);
CREATE TABLE t3 (column3 INT);
INSERT INTO t3 VALUES (6),(8);
CREATE TABLE t4 (column4 INT);
INSERT INTO t4 VALUES (2),(5);
PREPARE stmt FROM "SELECT ( SELECT MAX( table1.column1 ) AS field1
FROM t1 AS table1
WHERE table3.column3 IN ( SELECT table2.column2 AS field2 FROM t2 AS table2 )
) AS sq
FROM t3 AS table3, t4 AS table4";
EXECUTE stmt;
sq
NULL
NULL
NULL
NULL
EXECUTE stmt;
sq
NULL
NULL
NULL
NULL
deallocate prepare stmt;
drop table t1,t2,t3,t4;
# End of 5.5 tests

View file

@ -14,6 +14,25 @@ this
0
4294967295
drop table t1;
create table t1 (a bigint unsigned, b mediumint unsigned);
insert t1 values (1,2),(0xffffffffffffffff,0xffffff);
select coalesce(a,b), coalesce(b,a) from t1;
coalesce(a,b) coalesce(b,a)
1 2
18446744073709551615 16777215
create table t2 as select a from t1 union select b from t1;
show create table t2;
Table Create Table
t2 CREATE TABLE `t2` (
`a` bigint(20) unsigned DEFAULT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
select * from t2;
a
1
18446744073709551615
2
16777215
drop table t1, t2;
#
# Start of 10.0 tests
#

View file

@ -6,7 +6,8 @@ table_54044 CREATE TEMPORARY TABLE `table_54044` (
`IF(NULL IS NOT NULL, NULL, NULL)` binary(0) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1
DROP TABLE table_54044;
CREATE TABLE tmp ENGINE = INNODB AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
CREATE TABLE tmp ENGINE = INNODB
AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
SHOW CREATE TABLE tmp;
Table Create Table
tmp CREATE TABLE `tmp` (

View file

@ -0,0 +1,8 @@
alter table mysql.time_zone_name engine=InnoDB;
create table envois3 (starttime datetime) engine=InnoDB;
insert envois3 values ('2008-08-11 22:43:00');
select convert_tz(starttime,'UTC','Europe/Moscow') starttime from envois3;
starttime
2008-08-12 02:43:00
drop table envois3;
alter table mysql.time_zone_name engine=MyISAM;

View file

@ -10,7 +10,10 @@ CREATE TEMPORARY TABLE table_54044 ENGINE = INNODB
SHOW CREATE TABLE table_54044;
DROP TABLE table_54044;
CREATE TABLE tmp ENGINE = INNODB AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
# This 'create table' should pass since it uses a Field_string of size 0.
CREATE TABLE tmp ENGINE = INNODB
AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
SHOW CREATE TABLE tmp;
DROP TABLE tmp;
@ -23,4 +26,3 @@ FLUSH TABLES;
--error 1005
CREATE TEMPORARY TABLE tmp ENGINE=InnoDB AS SELECT VALUES(a) FROM t1;
DROP TABLE t1;

View file

@ -0,0 +1,12 @@
--source include/have_innodb.inc
#
# MDEV-10775 System table in InnoDB format allowed in MariaDB could lead to crash
#
alter table mysql.time_zone_name engine=InnoDB;
create table envois3 (starttime datetime) engine=InnoDB;
insert envois3 values ('2008-08-11 22:43:00');
--source include/restart_mysqld.inc
select convert_tz(starttime,'UTC','Europe/Moscow') starttime from envois3;
drop table envois3;
alter table mysql.time_zone_name engine=MyISAM;

View file

@ -1,121 +0,0 @@
"General cleanup"
set @aria_checkpoint_interval_save= @@global.aria_checkpoint_interval;
set @@global.aria_checkpoint_interval= 0;
drop table if exists t1;
update performance_schema.setup_instruments set enabled = 'NO';
update performance_schema.setup_consumers set enabled = 'NO';
truncate table performance_schema.file_summary_by_event_name;
truncate table performance_schema.file_summary_by_instance;
truncate table performance_schema.socket_summary_by_event_name;
truncate table performance_schema.socket_summary_by_instance;
truncate table performance_schema.events_waits_summary_global_by_event_name;
truncate table performance_schema.events_waits_summary_by_instance;
truncate table performance_schema.events_waits_summary_by_thread_by_event_name;
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
create table t1 (
id INT PRIMARY KEY,
b CHAR(100) DEFAULT 'initial value')
ENGINE=MyISAM;
insert into t1 (id) values (1), (2), (3), (4), (5), (6), (7), (8);
update performance_schema.setup_instruments SET enabled = 'NO';
update performance_schema.setup_consumers set enabled = 'NO';
set @dump_all=FALSE;
"Verifying file aggregate consistency"
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
EVENT_NAME COUNT_READ SUM(i.COUNT_READ)
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
EVENT_NAME COUNT_WRITE SUM(i.COUNT_WRITE)
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
EVENT_NAME COUNT_READ SUM(i.COUNT_READ)
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
EVENT_NAME COUNT_WRITE SUM(i.COUNT_WRITE)
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_READ, SUM(i.SUM_NUMBER_OF_BYTES_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_READ <> SUM(i.SUM_NUMBER_OF_BYTES_READ))
OR @dump_all;
EVENT_NAME SUM_NUMBER_OF_BYTES_READ SUM(i.SUM_NUMBER_OF_BYTES_READ)
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_WRITE, SUM(i.SUM_NUMBER_OF_BYTES_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_WRITE <> SUM(i.SUM_NUMBER_OF_BYTES_WRITE))
OR @dump_all;
EVENT_NAME SUM_NUMBER_OF_BYTES_WRITE SUM(i.SUM_NUMBER_OF_BYTES_WRITE)
"Verifying waits aggregate consistency (instance)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(i.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(i.SUM_TIMER_WAIT))
OR @dump_all;
EVENT_NAME SUM_TIMER_WAIT SUM(i.SUM_TIMER_WAIT)
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(i.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(i.MIN_TIMER_WAIT))
AND (MIN(i.MIN_TIMER_WAIT) != 0)
OR @dump_all;
EVENT_NAME MIN_TIMER_WAIT MIN(i.MIN_TIMER_WAIT)
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(i.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(i.MAX_TIMER_WAIT))
OR @dump_all;
EVENT_NAME MAX_TIMER_WAIT MAX(i.MAX_TIMER_WAIT)
"Verifying waits aggregate consistency (thread)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(t.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(t.SUM_TIMER_WAIT))
OR @dump_all;
EVENT_NAME SUM_TIMER_WAIT SUM(t.SUM_TIMER_WAIT)
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(t.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(t.MIN_TIMER_WAIT))
AND (MIN(t.MIN_TIMER_WAIT) != 0)
OR @dump_all;
EVENT_NAME MIN_TIMER_WAIT MIN(t.MIN_TIMER_WAIT)
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(t.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(t.MAX_TIMER_WAIT))
OR @dump_all;
EVENT_NAME MAX_TIMER_WAIT MAX(t.MAX_TIMER_WAIT)
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
drop table test.t1;
set @@global.aria_checkpoint_interval= @aria_checkpoint_interval_save;

View file

@ -1,197 +0,0 @@
# Tests for PERFORMANCE_SCHEMA
# Verify that statistics aggregated by different criteria are consistent.
--source include/not_embedded.inc
--source include/have_perfschema.inc
--echo "General cleanup"
# MDEV-7187 - test fails sporadically in buildbot
set @aria_checkpoint_interval_save= @@global.aria_checkpoint_interval;
set @@global.aria_checkpoint_interval= 0;
--disable_warnings
drop table if exists t1;
--enable_warnings
update performance_schema.setup_instruments set enabled = 'NO';
update performance_schema.setup_consumers set enabled = 'NO';
# Cleanup statistics
truncate table performance_schema.file_summary_by_event_name;
truncate table performance_schema.file_summary_by_instance;
truncate table performance_schema.socket_summary_by_event_name;
truncate table performance_schema.socket_summary_by_instance;
truncate table performance_schema.events_waits_summary_global_by_event_name;
truncate table performance_schema.events_waits_summary_by_instance;
truncate table performance_schema.events_waits_summary_by_thread_by_event_name;
# Start recording data
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
create table t1 (
id INT PRIMARY KEY,
b CHAR(100) DEFAULT 'initial value')
ENGINE=MyISAM;
insert into t1 (id) values (1), (2), (3), (4), (5), (6), (7), (8);
# Stop recording data, so the select below don't add noise.
update performance_schema.setup_instruments SET enabled = 'NO';
# Disable all consumers, for long standing waits
update performance_schema.setup_consumers set enabled = 'NO';
# Helper to debug
set @dump_all=FALSE;
# Note that in general:
# - COUNT/SUM/MAX(file_summary_by_event_name) >=
# COUNT/SUM/MAX(file_summary_by_instance).
# - MIN(file_summary_by_event_name) <=
# MIN(file_summary_by_instance).
# There will be equality only when file instances are not removed,
# aka when a file is not deleted from the file system,
# because doing so removes a row in file_summary_by_instance.
# Likewise:
# - COUNT/SUM/MAX(events_waits_summary_global_by_event_name) >=
# COUNT/SUM/MAX(events_waits_summary_by_instance)
# - MIN(events_waits_summary_global_by_event_name) <=
# MIN(events_waits_summary_by_instance)
# There will be equality only when an instrument instance
# is not removed, which is next to impossible to predictably guarantee
# in the server.
# For example, a MyISAM table removed from the table cache
# will cause a mysql_mutex_destroy on myisam/MYISAM_SHARE::intern_lock.
# Another example, a thread terminating will cause a mysql_mutex_destroy
# on sql/LOCK_delete
# Both cause a row to be deleted from events_waits_summary_by_instance.
# Likewise:
# - COUNT/SUM/MAX(events_waits_summary_global_by_event_name) >=
# COUNT/SUM/MAX(events_waits_summary_by_thread_by_event_name)
# - MIN(events_waits_summary_global_by_event_name) <=
# MIN(events_waits_summary_by_thread_by_event_name)
# There will be equality only when no thread is removed,
# that is if no thread disconnects, or no sub thread (for example insert
# delayed) ever completes.
# A thread completing will cause rows in
# events_waits_summary_by_thread_by_event_name to be removed.
--echo "Verifying file aggregate consistency"
# Since the code generating the load in this test does:
# - create table
# - insert
# - does not cause temporary tables to be used
# we can test for equality here for file aggregates.
# If any of these queries returns data, the test failed.
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_READ, SUM(i.SUM_NUMBER_OF_BYTES_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_READ <> SUM(i.SUM_NUMBER_OF_BYTES_READ))
OR @dump_all;
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_WRITE, SUM(i.SUM_NUMBER_OF_BYTES_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_WRITE <> SUM(i.SUM_NUMBER_OF_BYTES_WRITE))
OR @dump_all;
--echo "Verifying waits aggregate consistency (instance)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(i.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(i.SUM_TIMER_WAIT))
OR @dump_all;
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(i.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(i.MIN_TIMER_WAIT))
AND (MIN(i.MIN_TIMER_WAIT) != 0)
OR @dump_all;
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(i.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(i.MAX_TIMER_WAIT))
OR @dump_all;
--echo "Verifying waits aggregate consistency (thread)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(t.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(t.SUM_TIMER_WAIT))
OR @dump_all;
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(t.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(t.MIN_TIMER_WAIT))
AND (MIN(t.MIN_TIMER_WAIT) != 0)
OR @dump_all;
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(t.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(t.MAX_TIMER_WAIT))
OR @dump_all;
# Cleanup
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
drop table test.t1;
set @@global.aria_checkpoint_interval= @aria_checkpoint_interval_save;

View file

@ -8,7 +8,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users
server_audit_loc_info
server_audit_logging OFF
server_audit_mode 0
server_audit_output_type file
@ -72,7 +71,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 0
server_audit_output_type file
@ -218,7 +216,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 1
server_audit_output_type file

View file

@ -8,7 +8,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users
server_audit_loc_info
server_audit_logging OFF
server_audit_mode 0
server_audit_output_type file
@ -72,7 +71,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 0
server_audit_output_type file
@ -218,7 +216,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 1
server_audit_output_type file

View file

@ -13,7 +13,7 @@ insert into mysqltest1.t1 values (1);
select * from mysqltest1.t1 into outfile 'mysqltest1/f1.txt';
create table mysqltest1.t2 (n int);
create table mysqltest1.t3 (n int);
--replace_result \\ / 66 39 17 39 "File exists" "Directory not empty"
--replace_result \\ / 66 39 17 39 247 39 "File exists" "Directory not empty"
--error 1010
drop database mysqltest1;
use mysqltest1;
@ -30,7 +30,7 @@ while ($1)
}
--enable_query_log
--replace_result \\ / 66 39 17 39 "File exists" "Directory not empty"
--replace_result \\ / 66 39 17 39 247 39 "File exists" "Directory not empty"
--error 1010
drop database mysqltest1;
use mysqltest1;

View file

@ -889,6 +889,11 @@ SELECT CHAR_LENGTH(TRIM(BOTH 0x0001 FROM _utf32 0x00000061));
SELECT CHAR_LENGTH(TRIM(BOTH 0x61 FROM _utf32 0x00000061));
SELECT CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061));
#
# potential signedness issue
#
select hex(lower(cast(0xffff0000 as char character set utf32))) as c;
--echo #
--echo # End of 5.5 tests
--echo #

View file

@ -230,3 +230,16 @@ eval EXPLAIN $query;
eval $query;
DROP TABLE t0,t1,t2;
--echo #
--echo # MDEV-MariaDB daemon leaks memory with specific query
--echo #
CREATE TABLE t1 (`voter_id` int(11) unsigned NOT NULL,
`language_id` int(11) unsigned NOT NULL DEFAULT '1'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE t2 (`voter_id` int(10) unsigned NOT NULL DEFAULT '0',
`serialized_c` mediumblob) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into t2 values (1,repeat("a",1000)),(2,repeat("a",1000)),(3,repeat("b",1000)),(4,repeat("c",1000)),(4,repeat("b",1000));
SELECT GROUP_CONCAT(t1.language_id SEPARATOR ',') AS `translation_resources`, `d`.`serialized_c` FROM t2 AS `d` LEFT JOIN t1 ON `d`.`voter_id` = t1.`voter_id` GROUP BY `d`.`voter_id` ORDER BY 10-d.voter_id+RAND()*0;
drop table t1,t2;

View file

@ -2880,6 +2880,19 @@ drop tables m1, t1, t4;
drop view t3;
--echo #
--echo # MDEV-10424 - Assertion `ticket == __null' failed in
--echo # MDL_request::set_type
--echo #
CREATE TABLE t1 (f1 INT) ENGINE=MyISAM;
CREATE TABLE tmerge (f1 INT) ENGINE=MERGE UNION=(t1);
PREPARE stmt FROM "ANALYZE TABLE tmerge, t1";
EXECUTE stmt;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
DROP TABLE t1, tmerge;
--echo End of 5.5 tests

View file

@ -3653,5 +3653,32 @@ deallocate prepare stmt;
SET SESSION sql_mode = @save_sql_mode;
DROP TABLE t1,t2;
--echo #
--echo # MDEV-8833: Crash of server on prepared statement with
--echo # conversion to semi-join
--echo #
--echo # End of 10.0 tests
CREATE TABLE t1 (column1 INT);
INSERT INTO t1 VALUES (3),(9);
CREATE TABLE t2 (column2 INT);
INSERT INTO t2 VALUES (1),(4);
CREATE TABLE t3 (column3 INT);
INSERT INTO t3 VALUES (6),(8);
CREATE TABLE t4 (column4 INT);
INSERT INTO t4 VALUES (2),(5);
PREPARE stmt FROM "SELECT ( SELECT MAX( table1.column1 ) AS field1
FROM t1 AS table1
WHERE table3.column3 IN ( SELECT table2.column2 AS field2 FROM t2 AS table2 )
) AS sq
FROM t3 AS table3, t4 AS table4";
EXECUTE stmt;
EXECUTE stmt;
deallocate prepare stmt;
drop table t1,t2,t3,t4;
--echo # End of 5.5 tests

View file

@ -16,6 +16,13 @@ drop table t1;
# End of 4.1 tests
create table t1 (a bigint unsigned, b mediumint unsigned);
insert t1 values (1,2),(0xffffffffffffffff,0xffffff);
select coalesce(a,b), coalesce(b,a) from t1;
create table t2 as select a from t1 union select b from t1;
show create table t2;
select * from t2;
drop table t1, t2;
--echo #
--echo # Start of 10.0 tests

View file

@ -1,5 +1,5 @@
/*
Copyright (c) 2000, 2010, Oracle and/or its affiliates
/* Copyright (c) 2000, 2010, Oracle and/or its affiliates
Copyright (c) 2009, 2016, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -49,7 +49,8 @@ int my_redel(const char *org_name, const char *tmp_name,
DBUG_PRINT("my",("org_name: '%s' tmp_name: '%s' MyFlags: %lu",
org_name,tmp_name,MyFlags));
if (my_copystat(org_name,tmp_name,MyFlags) < 0)
if (!my_disable_copystat_in_redel &&
my_copystat(org_name,tmp_name,MyFlags) < 0)
goto end;
if (MyFlags & MY_REDEL_MAKE_BACKUP)
{
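
A self-contained sketch of the guard introduced here, with demo_ stand-ins
for the real mysys calls: a process-wide flag lets a caller skip copying file
stats when re-creating a file (a later hunk shows init_common_variables()
setting my_disable_copystat_in_redel= 1 at server startup).

#include <stdio.h>

static int demo_disable_copystat= 0;  /* mirrors my_disable_copystat_in_redel */

static int demo_copystat(const char *org, const char *tmp)
{
  (void) org; (void) tmp;             /* would copy mode/owner/times here */
  return 0;
}

static int demo_redel(const char *org, const char *tmp)
{
  if (!demo_disable_copystat && demo_copystat(org, tmp) < 0)
    return -1;                        /* propagate stat-copy failure */
  return rename(tmp, org);            /* replace org with tmp */
}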

View file

@ -98,3 +98,4 @@ my_bool my_disable_sync=0;
my_bool my_disable_async_io=0;
my_bool my_disable_flush_key_blocks=0;
my_bool my_disable_symlinks=0;
my_bool my_disable_copystat_in_redel=0;

View file

@ -427,9 +427,8 @@ static MYSQL_SYSVAR_UINT(query_log_limit, query_log_limit,
char locinfo_ini_value[sizeof(struct connection_info)+4];
static MYSQL_THDVAR_STR(loc_info,
PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC,
"Auxiliary info.", NULL, NULL,
locinfo_ini_value);
PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_MEMALLOC,
"Internal info", NULL, NULL, locinfo_ini_value);
static const char *syslog_facility_names[]=
{

View file

@ -46,6 +46,7 @@ struct show_table_contributors_st show_table_contributors[]= {
{"Auttomattic", "https://automattic.com", "Bronze Sponsor of the MariaDB Foundation"},
{"Verkkokauppa.com", "https://virtuozzo.com", "Bronze Sponsor of the MariaDB Foundation"},
{"Virtuozzo", "https://virtuozzo.com/", "Bronze Sponsor of the MariaDB Foundation"},
{"Tencent Game DBA", "http://tencentdba.com/about/", "Bronze Sponsor of the MariaDB Foundation"},
/* Sponsors of important features */
{"Google", "USA", "Sponsoring encryption, parallel replication and GTID"},

View file

@ -355,7 +355,7 @@ static enum_field_types field_types_merge_rules [FIELDTYPE_NUM][FIELDTYPE_NUM]=
//MYSQL_TYPE_NULL MYSQL_TYPE_TIMESTAMP
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_VARCHAR,
//MYSQL_TYPE_LONGLONG MYSQL_TYPE_INT24
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_LONG,
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_LONGLONG,
//MYSQL_TYPE_DATE MYSQL_TYPE_TIME
MYSQL_TYPE_VARCHAR, MYSQL_TYPE_VARCHAR,
//MYSQL_TYPE_DATETIME MYSQL_TYPE_YEAR
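
The changed cell widens the result of merging MYSQL_TYPE_LONGLONG with
MYSQL_TYPE_INT24 from LONG to LONGLONG, which the BIGINT UNSIGNED/MEDIUMINT
coalesce()/UNION test earlier in this commit exercises. A minimal sketch of
the table-driven aggregation (demo types, not the real enum):

enum demo_type { T_LONG, T_LONGLONG, T_NUM };

/* result type when combining two column types, indexed by both inputs */
static const demo_type merge_rules[T_NUM][T_NUM]= {
  /* LONG     */ { T_LONG,     T_LONGLONG },
  /* LONGLONG */ { T_LONGLONG, T_LONGLONG },
};

demo_type aggregate_type(demo_type a, demo_type b) { return merge_rules[a][b]; }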

View file

@ -2743,9 +2743,28 @@ void Item_field::fix_after_pullout(st_select_lex *new_parent, Item **ref)
if (context)
{
Name_resolution_context *ctx= new Name_resolution_context();
ctx->outer_context= NULL; // We don't build a complete name resolver
ctx->table_list= NULL; // We rely on first_name_resolution_table instead
if (context->select_lex == new_parent)
{
/*
This field was pushed in then pulled out
(for example left part of IN)
*/
ctx->outer_context= context->outer_context;
}
else if (context->outer_context)
{
/* just pull to the upper context */
ctx->outer_context= context->outer_context->outer_context;
}
else
{
/* No upper context (merging Derived/VIEW where context chain ends) */
ctx->outer_context= NULL;
}
ctx->table_list= context->first_name_resolution_table;
ctx->select_lex= new_parent;
if (context->select_lex == NULL)
ctx->select_lex= NULL;
ctx->first_name_resolution_table= context->first_name_resolution_table;
ctx->last_name_resolution_table= context->last_name_resolution_table;
ctx->error_processor= context->error_processor;
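
A compressed sketch of the three cases handled above (hypothetical types,
not the server's Name_resolution_context):

struct Ctx { Ctx *outer; const void *select; };

/* which outer context a pulled-out field should resolve against */
Ctx *pulled_out_outer(Ctx *cur, const void *new_parent)
{
  if (cur->select == new_parent)
    return cur->outer;            /* pushed in then pulled out (IN lhs) */
  if (cur->outer)
    return cur->outer->outer;     /* hop one level up the chain */
  return nullptr;                 /* chain ends at a merged view/derived */
}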

View file

@ -3011,7 +3011,7 @@ bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time,
if (! write_error)
{
write_error= 1;
sql_print_error(ER(ER_ERROR_ON_WRITE), name, error);
sql_print_error(ER(ER_ERROR_ON_WRITE), name, tmp_errno);
}
}
}

View file

@ -3916,6 +3916,7 @@ static int init_common_variables()
max_system_variables.pseudo_thread_id= (ulong)~0;
server_start_time= flush_status_time= my_time(0);
my_disable_copystat_in_redel= 1;
global_rpl_filter= new Rpl_filter;
binlog_filter= new Rpl_filter;

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
Copyright (c) 2010, 2014, SkySQL Ab.
/* Copyright (c) 2000, 2016, Oracle and/or its affiliates.
Copyright (c) 2012, 2016, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -42,9 +42,9 @@ enum file_opt_type {
struct File_option
{
LEX_STRING name; /**< Name of the option */
int offset; /**< offset to base address of value */
file_opt_type type; /**< Option type */
LEX_STRING name; /**< Name of the option */
my_ptrdiff_t offset; /**< offset to base address of value */
file_opt_type type; /**< Option type */
};
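
Why the type change matters, in a hedged sketch (standard types standing in
for my_ptrdiff_t): offsets produced by pointer subtraction are
ptrdiff_t-sized, and storing them in an int narrows silently on LP64
platforms.

#include <stddef.h>

struct Demo { char pad[8]; long long value; };

/* ptrdiff_t is pointer-sized by construction; int may not be */
const ptrdiff_t value_offset= offsetof(struct Demo, value);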

View file

@ -64,13 +64,13 @@ extern "C" sig_handler handle_fatal_signal(int sig)
struct tm tm;
#ifdef HAVE_STACKTRACE
THD *thd;
#endif
/*
This flag remembers if the query pointer was found invalid.
We will try and print the query at the end of the signal handler, in case
we're wrong.
*/
bool print_invalid_query_pointer= false;
#endif
if (segfaulted)
{
@ -265,6 +265,7 @@ extern "C" sig_handler handle_fatal_signal(int sig)
"\"mlockall\" bugs.\n");
}
#ifdef HAVE_STACKTRACE
if (print_invalid_query_pointer)
{
my_safe_printf_stderr(
@ -274,6 +275,7 @@ extern "C" sig_handler handle_fatal_signal(int sig)
my_write_stderr(thd->query(), MY_MIN(65536U, thd->query_length()));
my_safe_printf_stderr("\n\n");
}
#endif
#ifdef HAVE_WRITE_CORE
if (test_flags & TEST_CORE_ON_SIGNAL)

View file

@ -455,7 +455,19 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
}
thd->prepare_derived_at_open= FALSE;
table->next_global= save_next_global;
/*
MERGE engine may adjust table->next_global chain, thus we have to
append save_next_global after merge children.
*/
if (save_next_global)
{
TABLE_LIST *table_list_iterator= table;
while (table_list_iterator->next_global)
table_list_iterator= table_list_iterator->next_global;
table_list_iterator->next_global= save_next_global;
save_next_global->prev_global= &table_list_iterator->next_global;
}
table->next_local= save_next_local;
thd->open_options&= ~extra_open_options;
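
A self-contained sketch of the splice (hypothetical node type): the MERGE
engine may have extended the chain behind the table, so the saved suffix is
re-attached at the actual tail and its back-pointer fixed up.

struct node { node *next_global; node **prev_global; };

void append_suffix(node *head, node *suffix)
{
  if (!suffix)
    return;
  node *tail= head;
  while (tail->next_global)
    tail= tail->next_global;        /* find tail after merge children */
  tail->next_global= suffix;
  suffix->prev_global= &tail->next_global;
}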

View file

@ -9223,6 +9223,7 @@ open_system_tables_for_read(THD *thd, TABLE_LIST *table_list,
*/
lex->reset_n_backup_query_tables_list(&query_tables_list_backup);
thd->reset_n_backup_open_tables_state(backup);
thd->lex->sql_command= SQLCOM_SELECT;
if (open_and_lock_tables(thd, table_list, FALSE,
MYSQL_OPEN_IGNORE_FLUSH |

View file

@ -5371,9 +5371,11 @@ int THD::decide_logging_format(TABLE_LIST *tables)
{
static const char *prelocked_mode_name[] = {
"NON_PRELOCKED",
"LOCK_TABLES",
"PRELOCKED",
"PRELOCKED_UNDER_LOCK_TABLES",
};
compile_time_assert(array_elements(prelocked_mode_name) == LTM_always_last);
DBUG_PRINT("debug", ("prelocked_mode: %s",
prelocked_mode_name[locked_tables_mode]));
}
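
A later hunk adds an LTM_always_last sentinel to enum_locked_tables_mode so
this compile_time_assert can hold. A minimal sketch of the pattern (demo
names):

enum demo_mode { M_NONE= 0, M_LOCK_TABLES, M_PRELOCKED,
                 M_PRELOCKED_UNDER_LOCK_TABLES, M_always_last };

static const char *demo_mode_name[]= {
  "NON_PRELOCKED", "LOCK_TABLES", "PRELOCKED", "PRELOCKED_UNDER_LOCK_TABLES",
};

/* fails to compile if a mode is added without a matching name */
static_assert(sizeof(demo_mode_name) / sizeof(demo_mode_name[0]) == M_always_last,
              "demo_mode_name[] out of sync with enum demo_mode");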

View file

@ -1182,7 +1182,8 @@ enum enum_locked_tables_mode
LTM_NONE= 0,
LTM_LOCK_TABLES,
LTM_PRELOCKED,
LTM_PRELOCKED_UNDER_LOCK_TABLES
LTM_PRELOCKED_UNDER_LOCK_TABLES,
LTM_always_last
};
@ -4302,6 +4303,11 @@ public:
save_copy_field_end= copy_field_end= NULL;
}
}
void free_copy_field_data()
{
for (Copy_field *ptr= copy_field ; ptr != copy_field_end ; ptr++)
ptr->tmp.free();
}
};
class select_union :public select_result_interceptor

View file

@ -2810,6 +2810,22 @@ static st_bookmark *find_bookmark(const char *plugin, const char *name,
}
static size_t var_storage_size(int flags)
{
switch (flags & PLUGIN_VAR_TYPEMASK) {
case PLUGIN_VAR_BOOL: return sizeof(my_bool);
case PLUGIN_VAR_INT: return sizeof(int);
case PLUGIN_VAR_LONG: return sizeof(long);
case PLUGIN_VAR_ENUM: return sizeof(long);
case PLUGIN_VAR_LONGLONG: return sizeof(ulonglong);
case PLUGIN_VAR_SET: return sizeof(ulonglong);
case PLUGIN_VAR_STR: return sizeof(char*);
case PLUGIN_VAR_DOUBLE: return sizeof(double);
default: DBUG_ASSERT(0); return 0;
}
}
/*
returns a bookmark for thd-local variables, creating if necessary.
returns null for non thd-local variables.
@ -2818,39 +2834,13 @@ static st_bookmark *find_bookmark(const char *plugin, const char *name,
static st_bookmark *register_var(const char *plugin, const char *name,
int flags)
{
uint length= strlen(plugin) + strlen(name) + 3, size= 0, offset, new_size;
uint length= strlen(plugin) + strlen(name) + 3, size, offset, new_size;
st_bookmark *result;
char *varname, *p;
if (!(flags & PLUGIN_VAR_THDLOCAL))
return NULL;
switch (flags & PLUGIN_VAR_TYPEMASK) {
case PLUGIN_VAR_BOOL:
size= sizeof(my_bool);
break;
case PLUGIN_VAR_INT:
size= sizeof(int);
break;
case PLUGIN_VAR_LONG:
case PLUGIN_VAR_ENUM:
size= sizeof(long);
break;
case PLUGIN_VAR_LONGLONG:
case PLUGIN_VAR_SET:
size= sizeof(ulonglong);
break;
case PLUGIN_VAR_STR:
size= sizeof(char*);
break;
case PLUGIN_VAR_DOUBLE:
size= sizeof(double);
break;
default:
DBUG_ASSERT(0);
return NULL;
};
DBUG_ASSERT(flags & PLUGIN_VAR_THDLOCAL);
size= var_storage_size(flags);
varname= ((char*) my_alloca(length));
strxmov(varname + 1, plugin, "_", name, NullS);
for (p= varname + 1; *p; p++)
@ -3046,25 +3036,17 @@ void sync_dynamic_session_variables(THD* thd, bool global_lock)
*/
for (idx= 0; idx < bookmark_hash.records; idx++)
{
sys_var_pluginvar *pi;
sys_var *var;
st_bookmark *v= (st_bookmark*) my_hash_element(&bookmark_hash,idx);
if (v->version <= thd->variables.dynamic_variables_version)
continue; /* already in thd->variables */
if (!(var= intern_find_sys_var(v->key + 1, v->name_len)) ||
!(pi= var->cast_pluginvar()) ||
v->key[0] != plugin_var_bookmark_key(pi->plugin_var->flags))
continue;
/* Here we do anything special that may be required of the data types */
if ((pi->plugin_var->flags & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR &&
pi->plugin_var->flags & PLUGIN_VAR_MEMALLOC)
if ((v->key[0] & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR &&
v->key[0] & BOOKMARK_MEMALLOC)
{
int offset= ((thdvar_str_t *)(pi->plugin_var))->offset;
char **pp= (char**) (thd->variables.dynamic_variables_ptr + offset);
char **pp= (char**) (thd->variables.dynamic_variables_ptr + v->offset);
if (*pp)
*pp= my_strdup(*pp, MYF(MY_WME|MY_FAE));
}
@ -3325,6 +3307,48 @@ bool sys_var_pluginvar::session_update(THD *thd, set_var *var)
return false;
}
static const void *var_def_ptr(st_mysql_sys_var *pv)
{
switch (pv->flags & (PLUGIN_VAR_TYPEMASK | PLUGIN_VAR_THDLOCAL)) {
case PLUGIN_VAR_INT:
return &((sysvar_uint_t*) pv)->def_val;
case PLUGIN_VAR_LONG:
return &((sysvar_ulong_t*) pv)->def_val;
case PLUGIN_VAR_LONGLONG:
return &((sysvar_ulonglong_t*) pv)->def_val;
case PLUGIN_VAR_ENUM:
return &((sysvar_enum_t*) pv)->def_val;
case PLUGIN_VAR_SET:
return &((sysvar_set_t*) pv)->def_val;
case PLUGIN_VAR_BOOL:
return &((sysvar_bool_t*) pv)->def_val;
case PLUGIN_VAR_STR:
return &((sysvar_str_t*) pv)->def_val;
case PLUGIN_VAR_DOUBLE:
return &((sysvar_double_t*) pv)->def_val;
case PLUGIN_VAR_INT | PLUGIN_VAR_THDLOCAL:
return &((thdvar_uint_t*) pv)->def_val;
case PLUGIN_VAR_LONG | PLUGIN_VAR_THDLOCAL:
return &((thdvar_ulong_t*) pv)->def_val;
case PLUGIN_VAR_LONGLONG | PLUGIN_VAR_THDLOCAL:
return &((thdvar_ulonglong_t*) pv)->def_val;
case PLUGIN_VAR_ENUM | PLUGIN_VAR_THDLOCAL:
return &((thdvar_enum_t*) pv)->def_val;
case PLUGIN_VAR_SET | PLUGIN_VAR_THDLOCAL:
return &((thdvar_set_t*) pv)->def_val;
case PLUGIN_VAR_BOOL | PLUGIN_VAR_THDLOCAL:
return &((thdvar_bool_t*) pv)->def_val;
case PLUGIN_VAR_STR | PLUGIN_VAR_THDLOCAL:
return &((thdvar_str_t*) pv)->def_val;
case PLUGIN_VAR_DOUBLE | PLUGIN_VAR_THDLOCAL:
return &((thdvar_double_t*) pv)->def_val;
default:
DBUG_ASSERT(0);
return NULL;
}
}
bool sys_var_pluginvar::global_update(THD *thd, set_var *var)
{
DBUG_ASSERT(!is_readonly());
@ -3334,60 +3358,7 @@ bool sys_var_pluginvar::global_update(THD *thd, set_var *var)
const void *src= &var->save_result;
if (!var->value)
{
switch (plugin_var->flags & (PLUGIN_VAR_TYPEMASK | PLUGIN_VAR_THDLOCAL)) {
case PLUGIN_VAR_INT:
src= &((sysvar_uint_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONG:
src= &((sysvar_ulong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONGLONG:
src= &((sysvar_ulonglong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_ENUM:
src= &((sysvar_enum_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_SET:
src= &((sysvar_set_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_BOOL:
src= &((sysvar_bool_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_STR:
src= &((sysvar_str_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_DOUBLE:
src= &((sysvar_double_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_INT | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_uint_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONG | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_ulong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONGLONG | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_ulonglong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_ENUM | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_enum_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_SET | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_set_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_BOOL | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_bool_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_STR | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_str_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_DOUBLE | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_double_t*) plugin_var)->def_val;
break;
default:
DBUG_ASSERT(0);
}
}
src= var_def_ptr(plugin_var);
plugin_var->update(thd, plugin_var, tgt, src);
return false;
@ -3743,7 +3714,18 @@ static int construct_options(MEM_ROOT *mem_root, struct st_plugin_int *tmp,
*(int*)(opt + 1)= offset= v->offset;
if (opt->flags & PLUGIN_VAR_NOCMDOPT)
{
char *val= global_system_variables.dynamic_variables_ptr + offset;
if (((opt->flags & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR) &&
(opt->flags & PLUGIN_VAR_MEMALLOC))
{
char *def_val= *(char**)var_def_ptr(opt);
*(char**)val= def_val ? my_strdup(def_val, MYF(0)) : NULL;
}
else
memcpy(val, var_def_ptr(opt), var_storage_size(opt->flags));
continue;
}
optname= (char*) memdup_root(mem_root, v->key + 1,
(optnamelen= v->name_len) + 1);
@ -3951,10 +3933,11 @@ static int test_plugin_options(MEM_ROOT *tmp_root, struct st_plugin_int *tmp,
*str->value= strdup_root(mem_root, *str->value);
}
var= find_bookmark(plugin_name.str, o->name, o->flags);
if (o->flags & PLUGIN_VAR_NOSYSVAR)
continue;
tmp_backup[tmp->nbackups++].save(&o->name);
if ((var= find_bookmark(plugin_name.str, o->name, o->flags)))
if (var)
v= new (mem_root) sys_var_pluginvar(&chain, var->key + 1, o, tmp);
else
{

View file

@ -9004,9 +9004,26 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
We need to destruct the copy_field (allocated in create_tmp_table())
before setting it to 0 if the join is not "reusable".
*/
if (!tmp_join || tmp_join != this)
tmp_table_param.cleanup();
tmp_table_param.copy_field= tmp_table_param.copy_field_end=0;
if (!tmp_join || tmp_join != this)
tmp_table_param.cleanup();
else
{
/*
Free data buffered in copy_fields, but keep data pointed by copy_field
around for next iteration (possibly stored in save_copy_fields).
It would be logically simpler to not clear copy_field
below, but as we have loops that runs over copy_field to
copy_field_end that should not be done anymore, it's simpler to
just clear the pointers.
Another option would be to just clear copy_field_end and not run
the loops if this is not set or to have tmp_table_param.cleanup()
to run cleanup on save_copy_field if copy_field is not set.
*/
tmp_table_param.free_copy_field_data();
tmp_table_param.copy_field= tmp_table_param.copy_field_end=0;
}
first_record= sort_and_group=0;
send_records= (ha_rows) 0;
@ -11687,7 +11704,7 @@ void JOIN::join_free()
/**
Free resources of given join.
@param fill true if we should free all resources, call with full==1
@param full true if we should free all resources, call with full==1
should be last, before it this function can be called with
full==0
@ -11806,7 +11823,7 @@ void JOIN::cleanup(bool full)
/*
If we have tmp_join and 'this' JOIN is not tmp_join and
tmp_table_param.copy_field's of them are equal then we have to remove
pointer to tmp_table_param.copy_field from tmp_join, because it qill
pointer to tmp_table_param.copy_field from tmp_join, because it will
be removed in tmp_table_param.cleanup().
*/
if (tmp_join &&
@ -15710,6 +15727,7 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
case Item::VARBIN_ITEM:
case Item::CACHE_ITEM:
case Item::EXPR_CACHE_ITEM:
case Item::PARAM_ITEM:
if (make_copy_field)
{
DBUG_ASSERT(((Item_result_field*)item)->result_field);
@ -22240,7 +22258,7 @@ setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param,
err:
if (copy)
delete [] param->copy_field; // This is never 0
param->copy_field=0;
param->copy_field= 0;
err2:
DBUG_RETURN(TRUE);
}

View file

@ -876,6 +876,8 @@ void tdc_release_share(TABLE_SHARE *share)
}
if (--share->tdc.ref_count)
{
if (!share->is_view)
mysql_cond_broadcast(&share->tdc.COND_release);
mysql_mutex_unlock(&share->tdc.LOCK_table_share);
mysql_mutex_unlock(&LOCK_unused_shares);
DBUG_VOID_RETURN;
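
A pthread-flavoured sketch of the wakeup being added (demo types; the real
code broadcasts tdc.COND_release under tdc.LOCK_table_share): without the
broadcast, a thread waiting for the reference count to drop can sleep
forever.

#include <pthread.h>

struct demo_share {
  unsigned ref_count;
  pthread_mutex_t lock;
  pthread_cond_t cond_release;
};

void demo_release(struct demo_share *s)
{
  pthread_mutex_lock(&s->lock);
  if (--s->ref_count)
    pthread_cond_broadcast(&s->cond_release);  /* wake ref-count waiters */
  pthread_mutex_unlock(&s->lock);
}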

View file

@ -108,6 +108,7 @@ UNIV_INTERN mysql_pfs_key_t fts_pll_tokenize_mutex_key;
/** variable to record innodb_fts_internal_tbl_name for information
schema table INNODB_FTS_INSERTED etc. */
UNIV_INTERN char* fts_internal_tbl_name = NULL;
UNIV_INTERN char* fts_internal_tbl_name2 = NULL;
/** InnoDB default stopword list:
There are different versions of stopwords, the stop words listed
@ -6570,6 +6571,36 @@ fts_check_corrupt_index(
return(0);
}
/* Get parent table name if it's a fts aux table
@param[in] aux_table_name aux table name
@param[in] aux_table_len aux table length
@return parent table name, or NULL */
char*
fts_get_parent_table_name(
const char* aux_table_name,
ulint aux_table_len)
{
fts_aux_table_t aux_table;
char* parent_table_name = NULL;
if (fts_is_aux_table_name(&aux_table, aux_table_name, aux_table_len)) {
dict_table_t* parent_table;
parent_table = dict_table_open_on_id(
aux_table.parent_id, TRUE, DICT_TABLE_OP_NORMAL);
if (parent_table != NULL) {
parent_table_name = mem_strdupl(
parent_table->name,
strlen(parent_table->name));
dict_table_close(parent_table, TRUE, FALSE);
}
}
return(parent_table_name);
}
/** Check the validity of the parent table.
@param[in] aux_table auxiliary table
@return true if it is a valid table or false if it is not */

View file

@ -15010,7 +15010,12 @@ innodb_internal_table_update(
my_free(old);
}
fts_internal_tbl_name = *(char**) var_ptr;
fts_internal_tbl_name2 = *(char**) var_ptr;
if (fts_internal_tbl_name2 == NULL) {
fts_internal_tbl_name = const_cast<char*>("default");
} else {
fts_internal_tbl_name = fts_internal_tbl_name2;
}
}
/****************************************************************//**
@ -16793,7 +16798,7 @@ static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache,
"Whether to disable OS system file cache for sort I/O",
NULL, NULL, FALSE);
static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name,
static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name2,
PLUGIN_VAR_NOCMDARG,
"FTS internal auxiliary table to be checked",
innodb_internal_table_validate,

View file

@ -209,7 +209,10 @@ innobase_need_rebuild(
const Alter_inplace_info* ha_alter_info,
const TABLE* altered_table)
{
if (ha_alter_info->handler_flags
Alter_inplace_info::HA_ALTER_FLAGS alter_inplace_flags =
ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE);
if (alter_inplace_flags
== Alter_inplace_info::CHANGE_CREATE_OPTION
&& !(ha_alter_info->create_info->used_fields
& (HA_CREATE_USED_ROW_FORMAT
@ -3933,7 +3936,7 @@ err_exit:
}
if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
|| (ha_alter_info->handler_flags
|| ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)
== Alter_inplace_info::CHANGE_CREATE_OPTION
&& !innobase_need_rebuild(ha_alter_info, table))) {
@ -4107,7 +4110,7 @@ ok_exit:
DBUG_RETURN(false);
}
if (ha_alter_info->handler_flags
if ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)
== Alter_inplace_info::CHANGE_CREATE_OPTION
&& !innobase_need_rebuild(ha_alter_info, table)) {
goto ok_exit;
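
All three call sites change the same way; a sketch of the predicate (names
assumed from the hunk):

typedef unsigned long long flags_t;

/* true when, after masking out flags InnoDB may ignore in-place, the only
   requested change is CHANGE_CREATE_OPTION, i.e. no rebuild is needed */
bool only_create_option(flags_t handler_flags, flags_t ignore_mask,
                        flags_t change_create_option)
{
  return (handler_flags & ~ignore_mask) == change_create_option;
}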

View file

@ -3981,6 +3981,8 @@ i_s_fts_config_fill(
DBUG_RETURN(0);
}
DEBUG_SYNC_C("i_s_fts_config_fille_check");
fields = table->field;
/* Prevent DDL to drop fts aux tables. */

View file

@ -375,6 +375,7 @@ extern bool fts_need_sync;
/** Variable specifying the table that has Fulltext index to display its
content through information schema table */
extern char* fts_internal_tbl_name;
extern char* fts_internal_tbl_name2;
#define fts_que_graph_free(graph) \
do { \
@ -823,6 +824,15 @@ void
fts_drop_orphaned_tables(void);
/*==========================*/
/* Get parent table name if it's a fts aux table
@param[in] aux_table_name aux table name
@param[in] aux_table_len aux table length
@return parent table name, or NULL */
char*
fts_get_parent_table_name(
const char* aux_table_name,
ulint aux_table_len);
/******************************************************************//**
Since we do a horizontal split on the index table, we need to drop
all the split tables.

View file

@ -44,7 +44,7 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_MAJOR 5
#define INNODB_VERSION_MINOR 6
#define INNODB_VERSION_BUGFIX 32
#define INNODB_VERSION_BUGFIX 33
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;

View file

@ -613,7 +613,7 @@ row_log_table_delete(
&old_pk_extra_size);
ut_ad(old_pk_extra_size < 0x100);
mrec_size = 4 + old_pk_size;
mrec_size = 6 + old_pk_size;
/* Log enough prefix of the BLOB unless both the
old and new table are in COMPACT or REDUNDANT format,
@ -643,8 +643,8 @@ row_log_table_delete(
*b++ = static_cast<byte>(old_pk_extra_size);
/* Log the size of external prefix we saved */
mach_write_to_2(b, ext_size);
b += 2;
mach_write_to_4(b, ext_size);
b += 4;
rec_convert_dtuple_to_temp(
b + old_pk_extra_size, new_index,
@ -2268,14 +2268,14 @@ row_log_table_apply_op(
break;
case ROW_T_DELETE:
/* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */
if (mrec + 4 >= mrec_end) {
/* 1 (extra_size) + 4 (ext_size) + at least 1 (payload) */
if (mrec + 6 >= mrec_end) {
return(NULL);
}
extra_size = *mrec++;
ext_size = mach_read_from_2(mrec);
mrec += 2;
ext_size = mach_read_from_4(mrec);
mrec += 4;
ut_ad(mrec < mrec_end);
/* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
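
A sketch of the widened field (InnoDB's mach_* helpers store big-endian):
the external-prefix length now occupies 4 bytes instead of 2, so the header
budget grows from 1 + 2 + 1 to 1 + 4 + 1 bytes and the reader advances its
cursor to match. Demo helpers, not the real mach_write_to_4/mach_read_from_4:

typedef unsigned char byte_t;

static inline void demo_write_4(byte_t *b, unsigned long v)
{
  b[0]= (byte_t)(v >> 24); b[1]= (byte_t)(v >> 16);
  b[2]= (byte_t)(v >> 8);  b[3]= (byte_t) v;
}

static inline unsigned long demo_read_4(const byte_t *b)
{
  return ((unsigned long) b[0] << 24) | ((unsigned long) b[1] << 16)
       | ((unsigned long) b[2] << 8)  |  (unsigned long) b[3];
}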

View file

@ -2715,6 +2715,10 @@ loop:
return(n_tables + n_tables_dropped);
}
DBUG_EXECUTE_IF("row_drop_tables_in_background_sleep",
os_thread_sleep(5000000);
);
table = dict_table_open_on_name(drop->table_name, FALSE, FALSE,
DICT_ERR_IGNORE_NONE);
@ -2725,6 +2729,16 @@ loop:
goto already_dropped;
}
if (!table->to_be_dropped) {
/* There is a scenario: the old table is dropped
just after it's added into drop list, and new
table with the same name is created, then we try
to drop the new table in background. */
dict_table_close(table, FALSE, FALSE);
goto already_dropped;
}
ut_a(!table->can_be_evicted);
dict_table_close(table, FALSE, FALSE);
@ -3992,6 +4006,13 @@ row_drop_table_for_mysql(
}
}
DBUG_EXECUTE_IF("row_drop_table_add_to_background",
row_add_table_to_background_drop_list(table->name);
err = DB_SUCCESS;
goto funct_exit;
);
/* TODO: could we replace the counter n_foreign_key_checks_running
with lock checks on the table? Acquire here an exclusive lock on the
table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
@ -4608,6 +4629,19 @@ loop:
row_mysql_lock_data_dictionary(trx);
while ((table_name = dict_get_first_table_name_in_db(name))) {
/* Drop parent table if it is a fts aux table, to
avoid accessing dropped fts aux tables in information
schema when parent table still exists.
Note: Drop parent table will drop fts aux tables. */
char* parent_table_name;
parent_table_name = fts_get_parent_table_name(
table_name, strlen(table_name));
if (parent_table_name != NULL) {
mem_free(table_name);
table_name = parent_table_name;
}
ut_a(memcmp(table_name, name, namelen) == 0);
table = dict_table_open_on_name(

View file

@ -205,7 +205,7 @@ maria_declare_plugin(perfschema)
0x0001,
pfs_status_vars,
NULL,
"5.6.32",
"5.6.33",
MariaDB_PLUGIN_MATURITY_STABLE
}
maria_declare_plugin_end;

View file

@ -1,4 +1,4 @@
SET(TOKUDB_VERSION 5.6.31-77.0)
SET(TOKUDB_VERSION 5.6.32-78.1)
# PerconaFT only supports x86-64 and cmake-2.8.9+
IF(CMAKE_VERSION VERSION_LESS "2.8.9")
MESSAGE(STATUS "CMake 2.8.9 or higher is required by TokuDB")

View file

@ -367,8 +367,8 @@ static void print_db_env_struct (void) {
"int (*checkpointing_get_period) (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic checkpoints. 0 means disabled. */",
"int (*cleaner_set_period) (DB_ENV*, uint32_t) /* Change the delay between automatic cleaner attempts. 0 means disabled. */",
"int (*cleaner_get_period) (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic cleaner attempts. 0 means disabled. */",
"int (*cleaner_set_iterations) (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invokation. 0 means disabled. */",
"int (*cleaner_get_iterations) (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invokation. 0 means disabled. */",
"int (*cleaner_set_iterations) (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invocation. 0 means disabled. */",
"int (*cleaner_get_iterations) (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invocation. 0 means disabled. */",
"int (*evictor_set_enable_partial_eviction) (DB_ENV*, bool) /* Enables or disabled partial eviction of nodes from cachetable. */",
"int (*evictor_get_enable_partial_eviction) (DB_ENV*, bool*) /* Retrieve the status of partial eviction of nodes from cachetable. */",
"int (*checkpointing_postpone) (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */",

View file

@ -103,6 +103,7 @@ set_cflags_if_supported(
-Wno-pointer-bool-conversion
-fno-rtti
-fno-exceptions
-Wno-error=nonnull-compare
)
## set_cflags_if_supported_named("-Weffc++" -Weffcpp)

View file

@ -55,8 +55,8 @@ set(FT_SOURCES
msg_buffer
node
pivotkeys
serialize/rbtree_mhs
serialize/block_allocator
serialize/block_allocator_strategy
serialize/block_table
serialize/compress
serialize/ft_node-serialize

View file

@ -496,7 +496,7 @@ handle_split_of_child(
// We never set the rightmost blocknum to be the root.
// Instead, we wait for the root to split and let promotion initialize the rightmost
// blocknum to be the first non-root leaf node on the right extreme to recieve an insert.
// blocknum to be the first non-root leaf node on the right extreme to receive an insert.
BLOCKNUM rightmost_blocknum = toku_unsafe_fetch(&ft->rightmost_blocknum);
invariant(ft->h->root_blocknum.b != rightmost_blocknum.b);
if (childa->blocknum.b == rightmost_blocknum.b) {
@ -1470,7 +1470,7 @@ void toku_ft_flush_some_child(FT ft, FTNODE parent, struct flusher_advice *fa)
// It is possible after reading in the entire child,
// that we now know that the child is not reactive
// if so, we can unpin parent right now
// we wont be splitting/merging child
// we won't be splitting/merging child
// and we have already replaced the bnc
// for the root with a fresh one
enum reactivity child_re = toku_ftnode_get_reactivity(ft, child);

View file

@ -598,15 +598,12 @@ void toku_ftnode_checkpoint_complete_callback(void *value_data) {
}
}
void toku_ftnode_clone_callback(
void* value_data,
void** cloned_value_data,
long* clone_size,
PAIR_ATTR* new_attr,
bool for_checkpoint,
void* write_extraargs
)
{
void toku_ftnode_clone_callback(void *value_data,
void **cloned_value_data,
long *clone_size,
PAIR_ATTR *new_attr,
bool for_checkpoint,
void *write_extraargs) {
FTNODE node = static_cast<FTNODE>(value_data);
toku_ftnode_assert_fully_in_memory(node);
FT ft = static_cast<FT>(write_extraargs);
@ -618,13 +615,16 @@ void toku_ftnode_clone_callback(
toku_ftnode_leaf_rebalance(node, ft->h->basementnodesize);
}
cloned_node->oldest_referenced_xid_known = node->oldest_referenced_xid_known;
cloned_node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_on_disk;
cloned_node->oldest_referenced_xid_known =
node->oldest_referenced_xid_known;
cloned_node->max_msn_applied_to_node_on_disk =
node->max_msn_applied_to_node_on_disk;
cloned_node->flags = node->flags;
cloned_node->blocknum = node->blocknum;
cloned_node->layout_version = node->layout_version;
cloned_node->layout_version_original = node->layout_version_original;
cloned_node->layout_version_read_from_disk = node->layout_version_read_from_disk;
cloned_node->layout_version_read_from_disk =
node->layout_version_read_from_disk;
cloned_node->build_id = node->build_id;
cloned_node->height = node->height;
cloned_node->dirty = node->dirty;
@ -649,38 +649,39 @@ void toku_ftnode_clone_callback(
// set new pair attr if necessary
if (node->height == 0) {
*new_attr = make_ftnode_pair_attr(node);
}
else {
for (int i = 0; i < node->n_children; i++) {
BLB(node, i)->logical_rows_delta = 0;
BLB(cloned_node, i)->logical_rows_delta = 0;
}
} else {
new_attr->is_valid = false;
}
*clone_size = ftnode_memory_size(cloned_node);
*cloned_value_data = cloned_node;
}
void toku_ftnode_flush_callback(
CACHEFILE UU(cachefile),
int fd,
BLOCKNUM blocknum,
void *ftnode_v,
void** disk_data,
void *extraargs,
PAIR_ATTR size __attribute__((unused)),
PAIR_ATTR* new_size,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone
)
{
FT ft = (FT) extraargs;
FTNODE ftnode = (FTNODE) ftnode_v;
FTNODE_DISK_DATA* ndd = (FTNODE_DISK_DATA*)disk_data;
void toku_ftnode_flush_callback(CACHEFILE UU(cachefile),
int fd,
BLOCKNUM blocknum,
void *ftnode_v,
void **disk_data,
void *extraargs,
PAIR_ATTR size __attribute__((unused)),
PAIR_ATTR *new_size,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone) {
FT ft = (FT)extraargs;
FTNODE ftnode = (FTNODE)ftnode_v;
FTNODE_DISK_DATA *ndd = (FTNODE_DISK_DATA *)disk_data;
assert(ftnode->blocknum.b == blocknum.b);
int height = ftnode->height;
if (write_me) {
toku_ftnode_assert_fully_in_memory(ftnode);
if (height > 0 && !is_clone) {
// cloned nodes already had their stale messages moved, see toku_ftnode_clone_callback()
// cloned nodes already had their stale messages moved, see
// toku_ftnode_clone_callback()
toku_move_ftnode_messages_to_stale(ft, ftnode);
} else if (height == 0) {
toku_ftnode_leaf_run_gc(ft, ftnode);
@ -688,7 +689,8 @@ void toku_ftnode_flush_callback(
toku_ftnode_update_disk_stats(ftnode, ft, for_checkpoint);
}
}
int r = toku_serialize_ftnode_to(fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint);
int r = toku_serialize_ftnode_to(
fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint);
assert_zero(r);
ftnode->layout_version_read_from_disk = FT_LAYOUT_VERSION;
}
@ -703,20 +705,22 @@ void toku_ftnode_flush_callback(
FT_STATUS_INC(FT_FULL_EVICTIONS_NONLEAF_BYTES, node_size);
}
toku_free(*disk_data);
}
else {
} else {
if (ftnode->height == 0) {
for (int i = 0; i < ftnode->n_children; i++) {
if (BP_STATE(ftnode,i) == PT_AVAIL) {
if (BP_STATE(ftnode, i) == PT_AVAIL) {
BASEMENTNODE bn = BLB(ftnode, i);
toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta);
toku_ft_decrease_stats(&ft->in_memory_stats,
bn->stat64_delta);
if (!ftnode->dirty)
toku_ft_adjust_logical_row_count(
ft, -bn->logical_rows_delta);
}
}
}
}
toku_ftnode_free(&ftnode);
}
else {
} else {
*new_size = make_ftnode_pair_attr(ftnode);
}
}
@ -845,10 +849,13 @@ static void compress_internal_node_partition(FTNODE node, int i, enum toku_compr
}
// callback for partially evicting a node
int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_extraargs,
void (*finalize)(PAIR_ATTR new_attr, void *extra), void *finalize_extra) {
FTNODE node = (FTNODE) ftnode_pv;
FT ft = (FT) write_extraargs;
int toku_ftnode_pe_callback(void *ftnode_pv,
PAIR_ATTR old_attr,
void *write_extraargs,
void (*finalize)(PAIR_ATTR new_attr, void *extra),
void *finalize_extra) {
FTNODE node = (FTNODE)ftnode_pv;
FT ft = (FT)write_extraargs;
int num_partial_evictions = 0;
// Hold things we intend to destroy here.
@ -866,7 +873,8 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext
}
// Don't partially evict nodes whose partitions can't be read back
// from disk individually
if (node->layout_version_read_from_disk < FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
if (node->layout_version_read_from_disk <
FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
goto exit;
}
//
@ -874,77 +882,77 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext
//
if (node->height > 0) {
for (int i = 0; i < node->n_children; i++) {
if (BP_STATE(node,i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node,i)) {
if (BP_STATE(node, i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node, i)) {
NONLEAF_CHILDINFO bnc = BNC(node, i);
if (ft_compress_buffers_before_eviction &&
// We may not serialize and compress a partition in memory if its
// in memory layout version is different than what's on disk (and
// therefore requires upgrade).
// We may not serialize and compress a partition in
// memory if its in memory layout version is different
// than what's on disk (and therefore requires upgrade).
//
// Auto-upgrade code assumes that if a node's layout version read
// from disk is not current, it MUST require upgrade. Breaking
// this rule would cause upgrade code to upgrade this partition
// again after we serialize it as the current version, which is bad.
node->layout_version == node->layout_version_read_from_disk) {
// Auto-upgrade code assumes that if a node's layout
// version read from disk is not current, it MUST
// require upgrade.
// Breaking this rule would cause upgrade code to
// upgrade this partition again after we serialize it as
// the current version, which is bad.
node->layout_version ==
node->layout_version_read_from_disk) {
toku_ft_bnc_move_messages_to_stale(ft, bnc);
compress_internal_node_partition(
node,
i,
// Always compress with quicklz
TOKU_QUICKLZ_METHOD
);
TOKU_QUICKLZ_METHOD);
} else {
// We're not compressing buffers before eviction. Simply
// detach the buffer and set the child's state to on-disk.
// detach the buffer and set the child's state to
// on-disk.
set_BNULL(node, i);
BP_STATE(node, i) = PT_ON_DISK;
}
buffers_to_destroy[num_buffers_to_destroy++] = bnc;
num_partial_evictions++;
} else {
BP_SWEEP_CLOCK(node, i);
}
else {
BP_SWEEP_CLOCK(node,i);
}
}
else {
} else {
continue;
}
}
}
//
// partial eviction strategy for basement nodes:
// if the bn is compressed, evict it
// else: check if it requires eviction, if it does, evict it, if not, sweep the clock count
//
else {
} else {
//
// partial eviction strategy for basement nodes:
// if the bn is compressed, evict it
// else: check if it requires eviction, if it does, evict it, if not,
// sweep the clock count
//
for (int i = 0; i < node->n_children; i++) {
// Get rid of compressed stuff no matter what.
if (BP_STATE(node,i) == PT_COMPRESSED) {
if (BP_STATE(node, i) == PT_COMPRESSED) {
SUB_BLOCK sb = BSB(node, i);
pointers_to_free[num_pointers_to_free++] = sb->compressed_ptr;
pointers_to_free[num_pointers_to_free++] = sb;
set_BNULL(node, i);
BP_STATE(node,i) = PT_ON_DISK;
BP_STATE(node, i) = PT_ON_DISK;
num_partial_evictions++;
}
else if (BP_STATE(node,i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node,i)) {
} else if (BP_STATE(node, i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node, i)) {
BASEMENTNODE bn = BLB(node, i);
basements_to_destroy[num_basements_to_destroy++] = bn;
toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta);
toku_ft_decrease_stats(&ft->in_memory_stats,
bn->stat64_delta);
toku_ft_adjust_logical_row_count(ft,
-bn->logical_rows_delta);
set_BNULL(node, i);
BP_STATE(node, i) = PT_ON_DISK;
num_partial_evictions++;
} else {
BP_SWEEP_CLOCK(node, i);
}
else {
BP_SWEEP_CLOCK(node,i);
}
}
else if (BP_STATE(node,i) == PT_ON_DISK) {
} else if (BP_STATE(node, i) == PT_ON_DISK) {
continue;
}
else {
} else {
abort();
}
}
@ -2378,12 +2386,16 @@ ft_send_update_msg(FT_HANDLE ft_h, const ft_msg &msg, TOKUTXN txn) {
toku_ft_root_put_msg(ft_h->ft, msg, &gc_info);
}
void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_function_extra,
TOKUTXN txn, bool oplsn_valid, LSN oplsn,
bool do_logging) {
void toku_ft_maybe_update(FT_HANDLE ft_h,
const DBT *key,
const DBT *update_function_extra,
TOKUTXN txn,
bool oplsn_valid,
LSN oplsn,
bool do_logging) {
TXNID_PAIR xid = toku_txn_get_txnid(txn);
if (txn) {
BYTESTRING keybs = { key->size, (char *) key->data };
BYTESTRING keybs = {key->size, (char *)key->data};
toku_logger_save_rollback_cmdupdate(
txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs);
toku_txn_maybe_note_ft(txn, ft_h->ft);
@ -2392,22 +2404,33 @@ void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_func
TOKULOGGER logger;
logger = toku_txn_logger(txn);
if (do_logging && logger) {
BYTESTRING keybs = {.len=key->size, .data=(char *) key->data};
BYTESTRING extrabs = {.len=update_function_extra->size,
.data = (char *) update_function_extra->data};
toku_log_enq_update(logger, NULL, 0, txn,
toku_cachefile_filenum(ft_h->ft->cf),
xid, keybs, extrabs);
BYTESTRING keybs = {.len = key->size, .data = (char *)key->data};
BYTESTRING extrabs = {.len = update_function_extra->size,
.data = (char *)update_function_extra->data};
toku_log_enq_update(logger,
NULL,
0,
txn,
toku_cachefile_filenum(ft_h->ft->cf),
xid,
keybs,
extrabs);
}
LSN treelsn;
if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
if (oplsn_valid &&
oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
// do nothing
} else {
XIDS message_xids = txn ? toku_txn_get_xids(txn) : toku_xids_get_root_xids();
ft_msg msg(key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids);
XIDS message_xids =
txn ? toku_txn_get_xids(txn) : toku_xids_get_root_xids();
ft_msg msg(
key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids);
ft_send_update_msg(ft_h, msg, txn);
}
// updates get converted to insert messages, which should do a -1 on the
// logical row count when the messages are permanently applied
toku_ft_adjust_logical_row_count(ft_h->ft, 1);
}
void toku_ft_maybe_update_broadcast(FT_HANDLE ft_h, const DBT *update_function_extra,

View file

@ -73,30 +73,20 @@ static bool recount_rows_interrupt(void* extra, uint64_t deleted_rows) {
return rre->_cancelled =
rre->_progress_callback(rre->_keys, deleted_rows, rre->_progress_extra);
}
int toku_ft_recount_rows(
FT_HANDLE ft,
int (*progress_callback)(
uint64_t count,
uint64_t deleted,
void* progress_extra),
void* progress_extra) {
int toku_ft_recount_rows(FT_HANDLE ft,
int (*progress_callback)(uint64_t count,
uint64_t deleted,
void* progress_extra),
void* progress_extra) {
int ret = 0;
recount_rows_extra_t rre = {
progress_callback,
progress_extra,
0,
false
};
recount_rows_extra_t rre = {progress_callback, progress_extra, 0, false};
ft_cursor c;
ret = toku_ft_cursor_create(ft, &c, nullptr, C_READ_ANY, false, false);
if (ret) return ret;
if (ret)
return ret;
toku_ft_cursor_set_check_interrupt_cb(
&c,
recount_rows_interrupt,
&rre);
toku_ft_cursor_set_check_interrupt_cb(&c, recount_rows_interrupt, &rre);
ret = toku_ft_cursor_first(&c, recount_rows_found, &rre);
while (FT_LIKELY(ret == 0)) {
@ -108,6 +98,7 @@ int toku_ft_recount_rows(
if (rre._cancelled == false) {
// update ft count
toku_unsafe_set(&ft->ft->in_memory_logical_rows, rre._keys);
ft->ft->h->dirty = 1;
ret = 0;
}
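// Editor's note (an inference, not stated in the commit): marking the
// header dirty here should ensure the freshly recounted logical row count
// is written back with the header at the next checkpoint instead of being
// lost when the dictionary is closed without further changes.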

View file

@ -903,6 +903,9 @@ void toku_ft_adjust_logical_row_count(FT ft, int64_t delta) {
// must be returned in toku_ft_stat64.
if (delta != 0 && ft->in_memory_logical_rows != (uint64_t)-1) {
toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), delta);
if (ft->in_memory_logical_rows == (uint64_t)-1) {
toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), 1);
}
}
}
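// Editor's note (a reading of the guard above, an assumption): (uint64_t)-1
// appears to act as the "row count unknown" sentinel, so if the adjusted
// counter lands exactly on it, the extra increment nudges it off the
// sentinel value.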

View file

@ -301,7 +301,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error);
// For test purposes only. (In production, the rowset size is determined by negotation with the cachetable for some memory. See #2613.)
// For test purposes only. (In production, the rowset size is determined by negotiation with the cachetable for some memory. See #2613.)
uint64_t toku_ft_loader_get_rowset_budget_for_testing (void);
int toku_ft_loader_finish_extractor(FTLOADER bl);

View file

@ -91,7 +91,7 @@ toku_ft_loader_set_size_factor(uint32_t factor) {
uint64_t
toku_ft_loader_get_rowset_budget_for_testing (void)
// For test purposes only. In production, the rowset size is determined by negotation with the cachetable for some memory. (See #2613).
// For test purposes only. In production, the rowset size is determined by negotiation with the cachetable for some memory. (See #2613).
{
return 16ULL*size_factor*1024ULL;
}

View file

@ -373,52 +373,48 @@ find_bounds_within_message_tree(
}
}
/**
* For each message in the ancestor's buffer (determined by childnum) that
* is key-wise between lower_bound_exclusive and upper_bound_inclusive,
* apply the message to the basement node. We treat the bounds as minus
* or plus infinity respectively if they are NULL. Do not mark the node
* as dirty (preserve previous state of 'dirty' bit).
*/
// For each message in the ancestor's buffer (determined by childnum) that
// is key-wise between lower_bound_exclusive and upper_bound_inclusive,
// apply the message to the basement node. We treat the bounds as minus
// or plus infinity respectively if they are NULL. Do not mark the node
// as dirty (preserve previous state of 'dirty' bit).
static void bnc_apply_messages_to_basement_node(
FT_HANDLE t, // used for comparison function
BASEMENTNODE bn, // where to apply messages
FT_HANDLE t, // used for comparison function
BASEMENTNODE bn, // where to apply messages
FTNODE ancestor, // the ancestor node where we can find messages to apply
int childnum, // which child buffer of ancestor contains messages we want
const pivot_bounds &bounds, // contains pivot key bounds of this basement node
txn_gc_info* gc_info,
bool* msgs_applied) {
int childnum, // which child buffer of ancestor contains messages we want
const pivot_bounds &
bounds, // contains pivot key bounds of this basement node
txn_gc_info *gc_info,
bool *msgs_applied) {
int r;
NONLEAF_CHILDINFO bnc = BNC(ancestor, childnum);
// Determine the offsets in the message trees between which we need to
// apply messages from this buffer
STAT64INFO_S stats_delta = {0,0};
STAT64INFO_S stats_delta = {0, 0};
uint64_t workdone_this_ancestor = 0;
int64_t logical_rows_delta = 0;
uint32_t stale_lbi, stale_ube;
if (!bn->stale_ancestor_messages_applied) {
find_bounds_within_message_tree(
t->ft->cmp,
bnc->stale_message_tree,
&bnc->msg_buffer,
bounds,
&stale_lbi,
&stale_ube);
find_bounds_within_message_tree(t->ft->cmp,
bnc->stale_message_tree,
&bnc->msg_buffer,
bounds,
&stale_lbi,
&stale_ube);
} else {
stale_lbi = 0;
stale_ube = 0;
}
uint32_t fresh_lbi, fresh_ube;
find_bounds_within_message_tree(
t->ft->cmp,
bnc->fresh_message_tree,
&bnc->msg_buffer,
bounds,
&fresh_lbi,
&fresh_ube);
find_bounds_within_message_tree(t->ft->cmp,
bnc->fresh_message_tree,
&bnc->msg_buffer,
bounds,
&fresh_lbi,
&fresh_ube);
// We now know where all the messages we must apply are, so one of the
// following 4 cases will do the application, depending on which of
@ -432,44 +428,53 @@ static void bnc_apply_messages_to_basement_node(
// We have messages in multiple trees, so we grab all
// the relevant messages' offsets and sort them by MSN, then apply
// them in MSN order.
const int buffer_size = ((stale_ube - stale_lbi) +
(fresh_ube - fresh_lbi) +
bnc->broadcast_list.size());
const int buffer_size =
((stale_ube - stale_lbi) + (fresh_ube - fresh_lbi) +
bnc->broadcast_list.size());
toku::scoped_malloc offsets_buf(buffer_size * sizeof(int32_t));
int32_t *offsets = reinterpret_cast<int32_t *>(offsets_buf.get());
struct store_msg_buffer_offset_extra sfo_extra = { .offsets = offsets, .i = 0 };
struct store_msg_buffer_offset_extra sfo_extra = {.offsets = offsets,
.i = 0};
// Populate offsets array with offsets to stale messages
r = bnc->stale_message_tree.iterate_on_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(stale_lbi, stale_ube, &sfo_extra);
r = bnc->stale_message_tree
.iterate_on_range<struct store_msg_buffer_offset_extra,
store_msg_buffer_offset>(
stale_lbi, stale_ube, &sfo_extra);
assert_zero(r);
// Then store fresh offsets, and mark them to be moved to stale later.
r = bnc->fresh_message_tree.iterate_and_mark_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(fresh_lbi, fresh_ube, &sfo_extra);
r = bnc->fresh_message_tree
.iterate_and_mark_range<struct store_msg_buffer_offset_extra,
store_msg_buffer_offset>(
fresh_lbi, fresh_ube, &sfo_extra);
assert_zero(r);
// Store offsets of all broadcast messages.
r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(&sfo_extra);
r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra,
store_msg_buffer_offset>(&sfo_extra);
assert_zero(r);
invariant(sfo_extra.i == buffer_size);
// Sort by MSN.
toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>::mergesort_r(offsets, buffer_size, bnc->msg_buffer);
toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>::
mergesort_r(offsets, buffer_size, bnc->msg_buffer);
// Apply the messages in MSN order.
for (int i = 0; i < buffer_size; ++i) {
*msgs_applied = true;
do_bn_apply_msg(
t,
bn,
&bnc->msg_buffer,
offsets[i],
gc_info,
&workdone_this_ancestor,
&stats_delta,
&logical_rows_delta);
do_bn_apply_msg(t,
bn,
&bnc->msg_buffer,
offsets[i],
gc_info,
&workdone_this_ancestor,
&stats_delta,
&logical_rows_delta);
}
} else if (stale_lbi == stale_ube) {
// No stale messages to apply, we just apply fresh messages, and mark them to be moved to stale later.
// No stale messages to apply, we just apply fresh messages, and mark
// them to be moved to stale later.
struct iterate_do_bn_apply_msg_extra iter_extra = {
.t = t,
.bn = bn,
@ -477,16 +482,20 @@ static void bnc_apply_messages_to_basement_node(
.gc_info = gc_info,
.workdone = &workdone_this_ancestor,
.stats_to_update = &stats_delta,
.logical_rows_delta = &logical_rows_delta
};
if (fresh_ube - fresh_lbi > 0) *msgs_applied = true;
r = bnc->fresh_message_tree.iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(fresh_lbi, fresh_ube, &iter_extra);
.logical_rows_delta = &logical_rows_delta};
if (fresh_ube - fresh_lbi > 0)
*msgs_applied = true;
r = bnc->fresh_message_tree
.iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra,
iterate_do_bn_apply_msg>(
fresh_lbi, fresh_ube, &iter_extra);
assert_zero(r);
} else {
invariant(fresh_lbi == fresh_ube);
// No fresh messages to apply, we just apply stale messages.
if (stale_ube - stale_lbi > 0) *msgs_applied = true;
if (stale_ube - stale_lbi > 0)
*msgs_applied = true;
struct iterate_do_bn_apply_msg_extra iter_extra = {
.t = t,
.bn = bn,
@ -494,22 +503,26 @@ static void bnc_apply_messages_to_basement_node(
.gc_info = gc_info,
.workdone = &workdone_this_ancestor,
.stats_to_update = &stats_delta,
.logical_rows_delta = &logical_rows_delta
};
.logical_rows_delta = &logical_rows_delta};
r = bnc->stale_message_tree.iterate_on_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(stale_lbi, stale_ube, &iter_extra);
r = bnc->stale_message_tree
.iterate_on_range<struct iterate_do_bn_apply_msg_extra,
iterate_do_bn_apply_msg>(
stale_lbi, stale_ube, &iter_extra);
assert_zero(r);
}
//
// update stats
//
if (workdone_this_ancestor > 0) {
(void) toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum), workdone_this_ancestor);
(void)toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum),
workdone_this_ancestor);
}
if (stats_delta.numbytes || stats_delta.numrows) {
toku_ft_update_stats(&t->ft->in_memory_stats, stats_delta);
}
toku_ft_adjust_logical_row_count(t->ft, logical_rows_delta);
bn->logical_rows_delta += logical_rows_delta;
}
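// Editor's summary (not part of the commit): when both the stale and the
// fresh message trees contain relevant messages, the code above collects
// their buffer offsets plus all broadcast messages, sorts the combined set
// by MSN, and applies them in that order, so a basement node always sees
// messages in sequence-number order.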
static void

View file

@ -199,6 +199,7 @@ struct ftnode_leaf_basement_node {
MSN max_msn_applied; // max message sequence number applied
bool stale_ancestor_messages_applied;
STAT64INFO_S stat64_delta; // change in stat64 counters since basement was last written to disk
int64_t logical_rows_delta;
};
typedef struct ftnode_leaf_basement_node *BASEMENTNODE;

View file

@ -46,415 +46,214 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "portability/toku_stdlib.h"
#include "ft/serialize/block_allocator.h"
#include "ft/serialize/block_allocator_strategy.h"
#include "ft/serialize/rbtree_mhs.h"
#if TOKU_DEBUG_PARANOID
#define VALIDATE() validate()
#define VALIDATE() Validate()
#else
#define VALIDATE()
#endif
static FILE *ba_trace_file = nullptr;
void block_allocator::maybe_initialize_trace(void) {
const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH");
if (ba_trace_path != nullptr) {
ba_trace_file = toku_os_fopen(ba_trace_path, "w");
if (ba_trace_file == nullptr) {
fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), "
"but it could not be opened for writing (errno %d)\n",
ba_trace_path, get_maybe_error_errno());
} else {
fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path);
}
}
}
void block_allocator::maybe_close_trace() {
if (ba_trace_file != nullptr) {
int r = toku_os_fclose(ba_trace_file);
if (r != 0) {
fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n",
r, get_maybe_error_errno());
} else {
fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n");
}
}
}
void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) {
// the alignment must be at least 512 and aligned with 512 to work with direct I/O
assert(alignment >= 512 && (alignment % 512) == 0);
void BlockAllocator::CreateInternal(uint64_t reserve_at_beginning,
uint64_t alignment) {
// the alignment must be at least 512 and aligned with 512 to work with
// direct I/O
invariant(alignment >= 512 && (alignment % 512) == 0);
_reserve_at_beginning = reserve_at_beginning;
_alignment = alignment;
_n_blocks = 0;
_blocks_array_size = 1;
XMALLOC_N(_blocks_array_size, _blocks_array);
_n_bytes_in_use = reserve_at_beginning;
_strategy = BA_STRATEGY_FIRST_FIT;
memset(&_trace_lock, 0, sizeof(toku_mutex_t));
toku_mutex_init(&_trace_lock, nullptr);
_tree = new MhsRbTree::Tree(alignment);
}
void BlockAllocator::Create(uint64_t reserve_at_beginning, uint64_t alignment) {
CreateInternal(reserve_at_beginning, alignment);
_tree->Insert({reserve_at_beginning, MAX_BYTE});
VALIDATE();
}
void block_allocator::create(uint64_t reserve_at_beginning, uint64_t alignment) {
_create_internal(reserve_at_beginning, alignment);
_trace_create();
void BlockAllocator::Destroy() {
delete _tree;
}
void block_allocator::destroy() {
toku_free(_blocks_array);
_trace_destroy();
toku_mutex_destroy(&_trace_lock);
}
void block_allocator::set_strategy(enum allocation_strategy strategy) {
_strategy = strategy;
}
void block_allocator::grow_blocks_array_by(uint64_t n_to_add) {
if (_n_blocks + n_to_add > _blocks_array_size) {
uint64_t new_size = _n_blocks + n_to_add;
uint64_t at_least = _blocks_array_size * 2;
if (at_least > new_size) {
new_size = at_least;
}
_blocks_array_size = new_size;
XREALLOC_N(_blocks_array_size, _blocks_array);
}
}
void block_allocator::grow_blocks_array() {
grow_blocks_array_by(1);
}
void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
struct blockpair *pairs, uint64_t n_blocks) {
_create_internal(reserve_at_beginning, alignment);
void BlockAllocator::CreateFromBlockPairs(uint64_t reserve_at_beginning,
uint64_t alignment,
struct BlockPair *translation_pairs,
uint64_t n_blocks) {
CreateInternal(reserve_at_beginning, alignment);
_n_blocks = n_blocks;
grow_blocks_array_by(_n_blocks);
memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair));
std::sort(_blocks_array, _blocks_array + _n_blocks);
for (uint64_t i = 0; i < _n_blocks; i++) {
// Allocator does not support size 0 blocks. See block_allocator_free_block.
invariant(_blocks_array[i].size > 0);
invariant(_blocks_array[i].offset >= _reserve_at_beginning);
invariant(_blocks_array[i].offset % _alignment == 0);
_n_bytes_in_use += _blocks_array[i].size;
struct BlockPair *XMALLOC_N(n_blocks, pairs);
memcpy(pairs, translation_pairs, n_blocks * sizeof(struct BlockPair));
std::sort(pairs, pairs + n_blocks);
if (pairs[0]._offset > reserve_at_beginning) {
_tree->Insert(
{reserve_at_beginning, pairs[0]._offset - reserve_at_beginning});
}
for (uint64_t i = 0; i < _n_blocks; i++) {
// Allocator does not support size 0 blocks. See
// block_allocator_free_block.
invariant(pairs[i]._size > 0);
invariant(pairs[i]._offset >= _reserve_at_beginning);
invariant(pairs[i]._offset % _alignment == 0);
_n_bytes_in_use += pairs[i]._size;
MhsRbTree::OUUInt64 free_size(MAX_BYTE);
MhsRbTree::OUUInt64 free_offset(pairs[i]._offset + pairs[i]._size);
if (i < n_blocks - 1) {
MhsRbTree::OUUInt64 next_offset(pairs[i + 1]._offset);
invariant(next_offset >= free_offset);
free_size = next_offset - free_offset;
if (free_size == 0)
continue;
}
_tree->Insert({free_offset, free_size});
}
toku_free(pairs);
VALIDATE();
_trace_create_from_blockpairs();
}
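// Worked example (editor's sketch with hypothetical offsets): given
// reserve_at_beginning = 4096 and used pairs {4096, 4096} and
// {12288, 4096}, the loop inserts the hole {8192, 4096} between the two
// blocks and the unbounded tail {16384, MAX_BYTE} after the last one.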
// Effect: align a value by rounding up.
static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
static inline uint64_t Align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
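// Worked example (editor's note): Align(5000, 4096)
// = ((5000 + 4095) / 4096) * 4096 = (9095 / 4096) * 4096 = 2 * 4096 = 8192.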
struct block_allocator::blockpair *
block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) {
switch (_strategy) {
case BA_STRATEGY_FIRST_FIT:
return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment);
case BA_STRATEGY_BEST_FIT:
return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment);
case BA_STRATEGY_HEAT_ZONE:
return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat);
case BA_STRATEGY_PADDED_FIT:
return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment);
default:
abort();
}
}
// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) {
struct blockpair *bp;
// Effect: Allocate a block. The resulting block must be aligned on the
// ba->alignment (which to make direct_io happy must be a positive multiple of
// 512).
void BlockAllocator::AllocBlock(uint64_t size,
uint64_t *offset) {
// Allocator does not support size 0 blocks. See block_allocator_free_block.
invariant(size > 0);
grow_blocks_array();
_n_bytes_in_use += size;
*offset = _tree->Remove(size);
uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment);
if (_n_blocks == 0) {
// First and only block
assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use
_blocks_array[0].offset = align(_reserve_at_beginning, _alignment);
_blocks_array[0].size = size;
*offset = _blocks_array[0].offset;
goto done;
} else if (end_of_reserve + size <= _blocks_array[0].offset ) {
// Check to see if the space immediately after the reserve is big enough to hold the new block.
bp = &_blocks_array[0];
memmove(bp + 1, bp, _n_blocks * sizeof(*bp));
bp[0].offset = end_of_reserve;
bp[0].size = size;
*offset = end_of_reserve;
goto done;
}
bp = choose_block_to_alloc_after(size, heat);
if (bp != nullptr) {
// our allocation strategy chose the space after `bp' to fit the new block
uint64_t answer_offset = align(bp->offset + bp->size, _alignment);
uint64_t blocknum = bp - _blocks_array;
invariant(&_blocks_array[blocknum] == bp);
invariant(blocknum < _n_blocks);
memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp));
bp[1].offset = answer_offset;
bp[1].size = size;
*offset = answer_offset;
} else {
// It didn't fit anywhere, so fit it on the end.
assert(_n_blocks < _blocks_array_size);
bp = &_blocks_array[_n_blocks];
uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment);
bp->offset = answer_offset;
bp->size = size;
*offset = answer_offset;
}
done:
_n_blocks++;
VALIDATE();
_trace_alloc(size, heat, *offset);
}
// Find the index in the blocks array that has a particular offset. Requires that the block exist.
// Use binary search so it runs fast.
int64_t block_allocator::find_block(uint64_t offset) {
VALIDATE();
if (_n_blocks == 1) {
assert(_blocks_array[0].offset == offset);
return 0;
}
uint64_t lo = 0;
uint64_t hi = _n_blocks;
while (1) {
assert(lo < hi); // otherwise no such block exists.
uint64_t mid = (lo + hi) / 2;
uint64_t thisoff = _blocks_array[mid].offset;
if (thisoff < offset) {
lo = mid + 1;
} else if (thisoff > offset) {
hi = mid;
} else {
return mid;
}
}
}
// To support 0-sized blocks, we need to include size as an input to this function.
// To support 0-sized blocks, we need to include size as an input to this
// function.
// All 0-sized blocks at the same offset can be considered identical, but
// a 0-sized block can share offset with a non-zero sized block.
// The non-zero sized block is not exchangable with a zero sized block (or vice versa),
// so inserting 0-sized blocks can cause corruption here.
void block_allocator::free_block(uint64_t offset) {
// The non-zero sized block is not exchangeable with a zero sized block (or vice
// versa), so inserting 0-sized blocks can cause corruption here.
void BlockAllocator::FreeBlock(uint64_t offset, uint64_t size) {
VALIDATE();
int64_t bn = find_block(offset);
assert(bn >= 0); // we require that there is a block with that offset.
_n_bytes_in_use -= _blocks_array[bn].size;
memmove(&_blocks_array[bn], &_blocks_array[bn + 1],
(_n_blocks - bn - 1) * sizeof(struct blockpair));
_n_bytes_in_use -= size;
_tree->Insert({offset, size});
_n_blocks--;
VALIDATE();
_trace_free(offset);
}
uint64_t block_allocator::block_size(uint64_t offset) {
int64_t bn = find_block(offset);
assert(bn >=0); // we require that there is a block with that offset.
return _blocks_array[bn].size;
uint64_t BlockAllocator::AllocatedLimit() const {
MhsRbTree::Node *max_node = _tree->MaxNode();
return rbn_offset(max_node).ToInt();
}
uint64_t block_allocator::allocated_limit() const {
if (_n_blocks == 0) {
return _reserve_at_beginning;
} else {
struct blockpair *last = &_blocks_array[_n_blocks - 1];
return last->offset + last->size;
}
}
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Effect: Consider the blocks in sorted order. The reserved block at the
// beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number, return nonzero if b is too big.
int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) {
if (b ==0 ) {
int BlockAllocator::NthBlockInLayoutOrder(uint64_t b,
uint64_t *offset,
uint64_t *size) {
MhsRbTree::Node *x, *y;
if (b == 0) {
*offset = 0;
*size = _reserve_at_beginning;
return 0;
return 0;
} else if (b > _n_blocks) {
return -1;
} else {
*offset =_blocks_array[b - 1].offset;
*size =_blocks_array[b - 1].size;
x = _tree->MinNode();
for (uint64_t i = 1; i <= b; i++) {
y = x;
x = _tree->Successor(x);
}
*size = (rbn_offset(x) - (rbn_offset(y) + rbn_size(y))).ToInt();
*offset = (rbn_offset(y) + rbn_size(y)).ToInt();
return 0;
}
}
struct VisUnusedExtra {
TOKU_DB_FRAGMENTATION _report;
uint64_t _align;
};
static void VisUnusedCollector(void *extra,
MhsRbTree::Node *node,
uint64_t UU(depth)) {
struct VisUnusedExtra *v_e = (struct VisUnusedExtra *)extra;
TOKU_DB_FRAGMENTATION report = v_e->_report;
uint64_t alignm = v_e->_align;
MhsRbTree::OUUInt64 offset = rbn_offset(node);
MhsRbTree::OUUInt64 size = rbn_size(node);
MhsRbTree::OUUInt64 answer_offset(Align(offset.ToInt(), alignm));
uint64_t free_space = (offset + size - answer_offset).ToInt();
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
// Requires: report->file_size_bytes is filled in
// Requires: report->data_bytes is filled in
// Requires: report->checkpoint_bytes_additional is filled in
void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) {
assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional);
void BlockAllocator::UnusedStatistics(TOKU_DB_FRAGMENTATION report) {
invariant(_n_bytes_in_use ==
report->data_bytes + report->checkpoint_bytes_additional);
report->unused_bytes = 0;
report->unused_blocks = 0;
report->largest_unused_block = 0;
if (_n_blocks > 0) {
//Deal with space before block 0 and after reserve:
{
struct blockpair *bp = &_blocks_array[0];
assert(bp->offset >= align(_reserve_at_beginning, _alignment));
uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment);
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space between blocks:
for (uint64_t blocknum = 0; blocknum +1 < _n_blocks; blocknum ++) {
// Consider the space after blocknum
struct blockpair *bp = &_blocks_array[blocknum];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
uint64_t next_offset = bp[1].offset;
uint64_t free_space = next_offset - end_of_this_block;
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space after last block
{
struct blockpair *bp = &_blocks_array[_n_blocks-1];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
} else {
// No blocks. Just the reserve.
uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
struct VisUnusedExtra extra = {report, _alignment};
_tree->InOrderVisitor(VisUnusedCollector, &extra);
}
void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) {
report->data_bytes = _n_bytes_in_use;
report->data_blocks = _n_blocks;
void BlockAllocator::Statistics(TOKU_DB_FRAGMENTATION report) {
report->data_bytes = _n_bytes_in_use;
report->data_blocks = _n_blocks;
report->file_size_bytes = 0;
report->checkpoint_bytes_additional = 0;
get_unused_statistics(report);
UnusedStatistics(report);
}
void block_allocator::validate() const {
uint64_t n_bytes_in_use = _reserve_at_beginning;
for (uint64_t i = 0; i < _n_blocks; i++) {
n_bytes_in_use += _blocks_array[i].size;
if (i > 0) {
assert(_blocks_array[i].offset > _blocks_array[i - 1].offset);
assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size );
}
struct ValidateExtra {
uint64_t _bytes;
MhsRbTree::Node *_pre_node;
};
static void VisUsedBlocksInOrder(void *extra,
MhsRbTree::Node *cur_node,
uint64_t UU(depth)) {
struct ValidateExtra *v_e = (struct ValidateExtra *)extra;
MhsRbTree::Node *pre_node = v_e->_pre_node;
// verify no overlaps
if (pre_node) {
invariant(rbn_size(pre_node) > 0);
invariant(rbn_offset(cur_node) >
rbn_offset(pre_node) + rbn_size(pre_node));
MhsRbTree::OUUInt64 used_space =
rbn_offset(cur_node) - (rbn_offset(pre_node) + rbn_size(pre_node));
v_e->_bytes += used_space.ToInt();
} else {
v_e->_bytes += rbn_offset(cur_node).ToInt();
}
assert(n_bytes_in_use == _n_bytes_in_use);
v_e->_pre_node = cur_node;
}
// Tracing
void block_allocator::_trace_create(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n",
this, _reserve_at_beginning, _alignment);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_create_from_blockpairs(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ",
this, _reserve_at_beginning, _alignment);
for (uint64_t i = 0; i < _n_blocks; i++) {
fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ",
_blocks_array[i].offset, _blocks_array[i].size);
}
fprintf(ba_trace_file, "\n");
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_destroy(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_destroy %p\n", this);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
this, size, heat, offset);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_free(uint64_t offset) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
void BlockAllocator::Validate() const {
_tree->ValidateBalance();
_tree->ValidateMhs();
struct ValidateExtra extra = {0, nullptr};
_tree->InOrderVisitor(VisUsedBlocksInOrder, &extra);
invariant(extra._bytes == _n_bytes_in_use);
}
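// Editor's note (a reading of the visitor above): the tree stores free
// holes, so the bytes in use are reconstructed as the gaps between
// consecutive holes, plus everything before the first hole, and the total
// is checked against _n_bytes_in_use.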

View file

@ -43,6 +43,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "portability/toku_pthread.h"
#include "portability/toku_stdint.h"
#include "portability/toku_stdlib.h"
#include "ft/serialize/rbtree_mhs.h"
// Block allocator.
//
@ -51,151 +52,128 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
// The allocation of block numbers is handled elsewhere.
//
// When creating a block allocator we also specify a certain-sized
// block at the beginning that is preallocated (and cannot be allocated or freed)
// block at the beginning that is preallocated (and cannot be allocated or
// freed)
//
// We can allocate blocks of a particular size at a particular location.
// We can allocate blocks of a particular size at a location chosen by the allocator.
// We can free blocks.
// We can determine the size of a block.
class block_allocator {
public:
#define MAX_BYTE 0xffffffffffffffff
class BlockAllocator {
public:
static const size_t BLOCK_ALLOCATOR_ALIGNMENT = 4096;
// How much must be reserved at the beginning for the block?
// The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root.
// The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1
// pointer for each root.
// So 4096 should be enough.
static const size_t BLOCK_ALLOCATOR_HEADER_RESERVE = 4096;
static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == 0,
static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT ==
0,
"block allocator header must have proper alignment");
static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE =
BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
enum allocation_strategy {
BA_STRATEGY_FIRST_FIT = 1,
BA_STRATEGY_BEST_FIT,
BA_STRATEGY_PADDED_FIT,
BA_STRATEGY_HEAT_ZONE
struct BlockPair {
uint64_t _offset;
uint64_t _size;
BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {}
int operator<(const struct BlockPair &rhs) const {
return _offset < rhs._offset;
}
int operator<(const uint64_t &o) const { return _offset < o; }
};
struct blockpair {
uint64_t offset;
uint64_t size;
blockpair(uint64_t o, uint64_t s) :
offset(o), size(s) {
}
int operator<(const struct blockpair &rhs) const {
return offset < rhs.offset;
}
int operator<(const uint64_t &o) const {
return offset < o;
}
};
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
// The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING
// bytes are not put into a block.
// The default allocation strategy is first fit
// (BA_STRATEGY_FIRST_FIT)
//  All blocks start on a multiple of ALIGNMENT.
// Aborts if we run out of memory.
// Parameters
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// reserve_at_beginning (IN) Size of reserved block at beginning.
// This size does not have to be aligned.
// alignment (IN) Block alignment.
void create(uint64_t reserve_at_beginning, uint64_t alignment);
void Create(uint64_t reserve_at_beginning, uint64_t alignment);
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
// The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
// The allocator is initialized to contain `n_blocks' of blockpairs, taken from `pairs'
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING
// bytes are not put into a block.
// The allocator is initialized to contain `n_blocks' of BlockPairs,
// taken from `pairs'
//  All blocks start on a multiple of ALIGNMENT.
// Aborts if we run out of memory.
// Parameters
// pairs, unowned array of pairs to copy
// n_blocks, Size of pairs array
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// reserve_at_beginning (IN) Size of reserved block at beginning.
// This size does not have to be aligned.
// alignment (IN) Block alignment.
void create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
struct blockpair *pairs, uint64_t n_blocks);
void CreateFromBlockPairs(uint64_t reserve_at_beginning,
uint64_t alignment,
struct BlockPair *pairs,
uint64_t n_blocks);
// Effect: Destroy this block allocator
void destroy();
void Destroy();
// Effect: Set the allocation strategy that the allocator should use
// Requires: No other threads are operating on this block allocator
void set_strategy(enum allocation_strategy strategy);
// Effect: Allocate a block of the specified size at an address chosen by the allocator.
// Effect: Allocate a block of the specified size at an address chosen by
// the allocator.
// Aborts if anything goes wrong.
// The block address will be a multiple of the alignment.
// Parameters:
// size (IN): The size of the block. (The size does not have to be aligned.)
// size (IN): The size of the block. (The size does not have to be
// aligned.)
// offset (OUT): The location of the block.
// heat (IN): A higher heat means we should be prepared to free this block soon (perhaps in the next checkpoint)
// Heat values are lexiographically ordered (like integers), but their specific values are arbitrary
void alloc_block(uint64_t size, uint64_t heat, uint64_t *offset);
// block soon (perhaps in the next checkpoint)
//              Heat values are lexicographically ordered (like integers),
// but their specific values are arbitrary
void AllocBlock(uint64_t size, uint64_t *offset);
// Effect: Free the block at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// offset (IN): The offset of the block.
void free_block(uint64_t offset);
void FreeBlock(uint64_t offset, uint64_t size);
// Effect: Return the size of the block that starts at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// offset (IN): The offset of the block.
uint64_t block_size(uint64_t offset);
// Effect: Check to see if the block allocator is OK. This may take a long time.
// Effect: Check to see if the block allocator is OK. This may take a long
// time.
// Usage Hints: Probably only use this for unit tests.
// TODO: Private?
void validate() const;
void Validate() const;
// Effect: Return the unallocated block address of "infinite" size.
// That is, return the smallest address that is above all the allocated blocks.
uint64_t allocated_limit() const;
// That is, return the smallest address that is above all the allocated
// blocks.
uint64_t AllocatedLimit() const;
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Effect: Consider the blocks in sorted order. The reserved block at the
// beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number, return nonzero if b is too big.
// Rationale: This is probably useful only for tests.
int get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size);
int NthBlockInLayoutOrder(uint64_t b, uint64_t *offset, uint64_t *size);
// Effect: Fill in report to indicate how the file is used.
// Requires:
// Requires:
// report->file_size_bytes is filled in
// report->data_bytes is filled in
// report->checkpoint_bytes_additional is filled in
void get_unused_statistics(TOKU_DB_FRAGMENTATION report);
void UnusedStatistics(TOKU_DB_FRAGMENTATION report);
// Effect: Fill in report->data_bytes with the number of bytes in use
// Fill in report->data_blocks with the number of blockpairs in use
// Fill in report->data_blocks with the number of BlockPairs in use
// Fill in unused statistics using this->get_unused_statistics()
// Requires:
// report->file_size is ignored on return
// report->checkpoint_bytes_additional is ignored on return
void get_statistics(TOKU_DB_FRAGMENTATION report);
void Statistics(TOKU_DB_FRAGMENTATION report);
// Block allocator tracing.
// - Enabled by setting TOKU_BA_TRACE_PATH to the file that the trace file
// should be written to.
// - Trace may be replayed by ba_trace_replay tool in tools/ directory
// eg: "cat mytracefile | ba_trace_replay"
static void maybe_initialize_trace();
static void maybe_close_trace();
virtual ~BlockAllocator(){};
private:
void _create_internal(uint64_t reserve_at_beginning, uint64_t alignment);
void grow_blocks_array_by(uint64_t n_to_add);
void grow_blocks_array();
int64_t find_block(uint64_t offset);
struct blockpair *choose_block_to_alloc_after(size_t size, uint64_t heat);
// Tracing
toku_mutex_t _trace_lock;
void _trace_create(void);
void _trace_create_from_blockpairs(void);
void _trace_destroy(void);
void _trace_alloc(uint64_t size, uint64_t heat, uint64_t offset);
void _trace_free(uint64_t offset);
private:
void CreateInternal(uint64_t reserve_at_beginning, uint64_t alignment);
// How much to reserve at the beginning
uint64_t _reserve_at_beginning;
@ -203,12 +181,8 @@ private:
uint64_t _alignment;
// How many blocks
uint64_t _n_blocks;
// How big is the blocks_array. Must be >= n_blocks.
uint64_t _blocks_array_size;
// These blocks are sorted by address.
struct blockpair *_blocks_array;
// Including the reserve_at_beginning
uint64_t _n_bytes_in_use;
// The allocation strategy are we using
enum allocation_strategy _strategy;
// These blocks are sorted by address.
MhsRbTree::Tree *_tree;
};
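// Editor's sketch (not part of the diff): a minimal use of the interface
// above, assuming only the declarations shown here; sizes are illustrative.
static inline void block_allocator_usage_sketch() {
    BlockAllocator ba;
    ba.Create(BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
              BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT);
    uint64_t offset;
    ba.AllocBlock(4096, &offset);  // offset comes back alignment-aligned
    ba.FreeBlock(offset, 4096);    // callers now pass the size explicitly
    ba.Destroy();
}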

View file

@ -1,224 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include <algorithm>
#include <string.h>
#include "portability/toku_assert.h"
#include "ft/serialize/block_allocator_strategy.h"
static uint64_t _align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
static uint64_t _roundup_to_power_of_two(uint64_t value) {
uint64_t r = 4096;
while (r < value) {
r *= 2;
invariant(r > 0);
}
return r;
}
// First fit block allocation
static struct block_allocator::blockpair *
_first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t max_padding) {
if (n_blocks == 1) {
// won't enter loop, can't underflow the direction < 0 case
return nullptr;
}
struct block_allocator::blockpair *bp = &blocks_array[0];
for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
n_spaces_to_check--, bp++) {
// Consider the space after bp
uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
if (possible_offset + size <= bp[1].offset) { // bp[1] is always valid since bp < &blocks_array[n_blocks-1]
invariant(bp - blocks_array < (int64_t) n_blocks);
return bp;
}
}
return nullptr;
}
static struct block_allocator::blockpair *
_first_fit_bw(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t max_padding, struct block_allocator::blockpair *blocks_array_limit) {
if (n_blocks == 1) {
// won't enter loop, can't underflow the direction < 0 case
return nullptr;
}
struct block_allocator::blockpair *bp = &blocks_array[-1];
for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
n_spaces_to_check--, bp--) {
// Consider the space after bp
uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
if (&bp[1] < blocks_array_limit && possible_offset + size <= bp[1].offset) {
invariant(blocks_array - bp < (int64_t) n_blocks);
return bp;
}
}
return nullptr;
}
struct block_allocator::blockpair *
block_allocator_strategy::first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
return _first_fit(blocks_array, n_blocks, size, alignment, 0);
}
// Best fit block allocation
struct block_allocator::blockpair *
block_allocator_strategy::best_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
struct block_allocator::blockpair *best_bp = nullptr;
uint64_t best_hole_size = 0;
for (uint64_t blocknum = 0; blocknum + 1 < n_blocks; blocknum++) {
// Consider the space after blocknum
struct block_allocator::blockpair *bp = &blocks_array[blocknum];
uint64_t possible_offset = _align(bp->offset + bp->size, alignment);
uint64_t possible_end_offset = possible_offset + size;
if (possible_end_offset <= bp[1].offset) {
// It fits here. Is it the best fit?
uint64_t hole_size = bp[1].offset - possible_end_offset;
if (best_bp == nullptr || hole_size < best_hole_size) {
best_hole_size = hole_size;
best_bp = bp;
}
}
}
return best_bp;
}
static uint64_t padded_fit_alignment = 4096;
// TODO: These compiler specific directives should be abstracted in a portability header
// portability/toku_compiler.h?
__attribute__((__constructor__))
static void determine_padded_fit_alignment_from_env(void) {
// TODO: Should be in portability as 'toku_os_getenv()?'
const char *s = getenv("TOKU_BA_PADDED_FIT_ALIGNMENT");
if (s != nullptr && strlen(s) > 0) {
const int64_t alignment = strtoll(s, nullptr, 10);
if (alignment <= 0) {
fprintf(stderr, "tokuft: error: block allocator padded fit alignment found in environment (%s), "
"but it's out of range (should be an integer > 0). defaulting to %" PRIu64 "\n",
s, padded_fit_alignment);
} else {
padded_fit_alignment = _roundup_to_power_of_two(alignment);
fprintf(stderr, "tokuft: setting block allocator padded fit alignment to %" PRIu64 "\n",
padded_fit_alignment);
}
}
}
// First fit into a block that is oversized by up to max_padding.
// The hope is that if we purposefully waste a bit of space at allocation
// time we'll be more likely to reuse this block later.
struct block_allocator::blockpair *
block_allocator_strategy::padded_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
return _first_fit(blocks_array, n_blocks, size, alignment, padded_fit_alignment);
}
static double hot_zone_threshold = 0.85;
// TODO: These compiler specific directives should be abstracted in a portability header
// portability/toku_compiler.h?
__attribute__((__constructor__))
static void determine_hot_zone_threshold_from_env(void) {
// TODO: Should be in portability as 'toku_os_getenv()?'
const char *s = getenv("TOKU_BA_HOT_ZONE_THRESHOLD");
if (s != nullptr && strlen(s) > 0) {
const double hot_zone = strtod(s, nullptr);
if (hot_zone < 1 || hot_zone > 99) {
fprintf(stderr, "tokuft: error: block allocator hot zone threshold found in environment (%s), "
"but it's out of range (should be an integer 1 through 99). defaulting to 85\n", s);
hot_zone_threshold = 85 / 100;
} else {
fprintf(stderr, "tokuft: setting block allocator hot zone threshold to %s\n", s);
hot_zone_threshold = hot_zone / 100;
}
}
}
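Both knobs are parsed once at library load by the constructors above; a hedged illustration with hypothetical values:
// TOKU_BA_PADDED_FIT_ALIGNMENT=1000 -> rounded up to a power of two: 4096
// TOKU_BA_PADDED_FIT_ALIGNMENT=5000 -> 8192
// TOKU_BA_HOT_ZONE_THRESHOLD=70     -> hot_zone_threshold = 0.70
// TOKU_BA_HOT_ZONE_THRESHOLD=150    -> rejected; the 0.85 default is kept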
struct block_allocator::blockpair *
block_allocator_strategy::heat_zone(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t heat) {
if (heat > 0) {
struct block_allocator::blockpair *bp, *boundary_bp;
// Hot allocation. Find the beginning of the hot zone.
boundary_bp = &blocks_array[n_blocks - 1];
uint64_t highest_offset = _align(boundary_bp->offset + boundary_bp->size, alignment);
uint64_t hot_zone_offset = static_cast<uint64_t>(hot_zone_threshold * highest_offset);
boundary_bp = std::lower_bound(blocks_array, blocks_array + n_blocks, hot_zone_offset);
uint64_t blocks_in_zone = (blocks_array + n_blocks) - boundary_bp;
uint64_t blocks_outside_zone = boundary_bp - blocks_array;
invariant(blocks_in_zone + blocks_outside_zone == n_blocks);
if (blocks_in_zone > 0) {
// Find the first fit in the hot zone, going forward.
bp = _first_fit(boundary_bp, blocks_in_zone, size, alignment, 0);
if (bp != nullptr) {
return bp;
}
}
if (blocks_outside_zone > 0) {
// Find the first fit in the cold zone, going backwards.
bp = _first_fit_bw(boundary_bp, blocks_outside_zone, size, alignment, 0, &blocks_array[n_blocks]);
if (bp != nullptr) {
return bp;
}
}
} else {
// Cold allocations are simply first-fit from the beginning.
return _first_fit(blocks_array, n_blocks, size, alignment, 0);
}
return nullptr;
}
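To make the split concrete (values illustrative): if the last block ends at aligned offset 1000 and hot_zone_threshold is 0.85, then hot_zone_offset is 850 and std::lower_bound splits the sorted array at the first block at or past that offset. A hot allocation (heat > 0) first scans forward through that tail with _first_fit, and only if no hole fits does _first_fit_bw walk backward through the colder blocks below 850. A cold allocation ignores the split and simply first-fits from offset 0.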

File diff suppressed because it is too large

View file

@ -62,13 +62,16 @@ enum {
RESERVED_BLOCKNUMS
};
typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b, int64_t size, int64_t address, void *extra);
typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b,
int64_t size,
int64_t address,
void *extra);
static inline BLOCKNUM make_blocknum(int64_t b) {
BLOCKNUM result = { .b = b };
BLOCKNUM result = {.b = b};
return result;
}
static const BLOCKNUM ROLLBACK_NONE = { .b = 0 };
static const BLOCKNUM ROLLBACK_NONE = {.b = 0};
/**
* There are three copies of the translation table (btt) in the block table:
@ -80,18 +83,20 @@ static const BLOCKNUM ROLLBACK_NONE = { .b = 0 };
*
* inprogress Is only filled by copying from current,
* and is the only version ever serialized to disk.
* (It is serialized to disk on checkpoint and clean shutdown.)
* (It is serialized to disk on checkpoint and clean
*shutdown.)
* At end of checkpoint it replaces 'checkpointed'.
* During a checkpoint, any 'pending' dirty writes will update
* inprogress.
*
* current Is initialized by copying from checkpointed,
* is the only version ever modified while the database is in use,
* is the only version ever modified while the database is in
*use,
* and is the only version ever copied to inprogress.
* It is never stored on disk.
*/
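Read concretely, the lifecycle above is: ordinary client writes dirty only current; at checkpoint begin, current is copied to inprogress, the only copy ever serialized; pending dirty writes during the checkpoint update inprogress as well; and at checkpoint end, inprogress replaces checkpointed, whose blocks stay inviolate on disk until then. (This restates the comment, not new behavior introduced by the commit.)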
class block_table {
public:
public:
enum translation_type {
TRANSLATION_NONE = 0,
TRANSLATION_CURRENT,
@ -102,7 +107,10 @@ public:
void create();
int create_from_buffer(int fd, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
int create_from_buffer(int fd,
DISKOFF location_on_disk,
DISKOFF size_on_disk,
unsigned char *translation_buffer);
void destroy();
@ -114,11 +122,21 @@ public:
// Blocknums
void allocate_blocknum(BLOCKNUM *res, struct ft *ft);
void realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, int fd, bool for_checkpoint, uint64_t heat);
void realloc_on_disk(BLOCKNUM b,
DISKOFF size,
DISKOFF *offset,
struct ft *ft,
int fd,
bool for_checkpoint);
void free_blocknum(BLOCKNUM *b, struct ft *ft, bool for_checkpoint);
void translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
void translate_blocknum_to_offset_size(BLOCKNUM b,
DISKOFF *offset,
DISKOFF *size);
void free_unused_blocknums(BLOCKNUM root);
void realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, struct ft *ft, int fd);
void realloc_descriptor_on_disk(DISKOFF size,
DISKOFF *offset,
struct ft *ft,
int fd);
void get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size);
// External verification
@ -127,15 +145,22 @@ public:
void verify_no_free_blocknums();
// Serialization
void serialize_translation_to_wbuf(int fd, struct wbuf *w, int64_t *address, int64_t *size);
void serialize_translation_to_wbuf(int fd,
struct wbuf *w,
int64_t *address,
int64_t *size);
// DEBUG ONLY (ftdump included), tests included
void blocknum_dump_translation(BLOCKNUM b);
void dump_translation_table_pretty(FILE *f);
void dump_translation_table(FILE *f);
void block_free(uint64_t offset);
void block_free(uint64_t offset, uint64_t size);
int iterate(enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only);
int iterate(enum translation_type type,
BLOCKTABLE_CALLBACK f,
void *extra,
bool data_only,
bool used_only);
void internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep);
// Requires: blocktable lock is held.
@ -146,13 +171,16 @@ public:
void get_info64(struct ftinfo64 *);
int iterate_translation_tables(uint64_t, int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), void *);
int iterate_translation_tables(
uint64_t,
int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *),
void *);
private:
private:
struct block_translation_pair {
// If in the freelist, use next_free_blocknum, otherwise diskoff.
union {
DISKOFF diskoff;
DISKOFF diskoff;
BLOCKNUM next_free_blocknum;
} u;
@ -173,7 +201,8 @@ private:
struct translation {
enum translation_type type;
// Number of elements in array (block_translation). always >= smallest_never_used_blocknum
// Number of elements in array (block_translation). always >=
// smallest_never_used_blocknum
int64_t length_of_array;
BLOCKNUM smallest_never_used_blocknum;
@ -181,20 +210,28 @@ private:
BLOCKNUM blocknum_freelist_head;
struct block_translation_pair *block_translation;
// size_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
// location_on is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
// size_on_disk is stored in
// block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
// location_on is stored in
// block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
};
void _create_internal();
int _translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize
DISKOFF location_on_disk, // location of translation_buffer
uint64_t size_on_disk,
unsigned char * translation_buffer); // buffer with serialized translation
int _translation_deserialize_from_buffer(
struct translation *t, // destination into which to deserialize
DISKOFF location_on_disk, // location of translation_buffer
uint64_t size_on_disk,
unsigned char *
translation_buffer); // buffer with serialized translation
void _copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype);
void _copy_translation(struct translation *dst,
struct translation *src,
enum translation_type newtype);
void _maybe_optimize_translation(struct translation *t);
void _maybe_expand_translation(struct translation *t);
bool _translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair);
bool _translation_prevents_freeing(struct translation *t,
BLOCKNUM b,
struct block_translation_pair *old_pair);
void _free_blocknum_in_translation(struct translation *t, BLOCKNUM b);
int64_t _calculate_size_on_disk(struct translation *t);
bool _pair_is_unallocated(struct block_translation_pair *pair);
@ -203,14 +240,26 @@ private:
// Blocknum management
void _allocate_blocknum_unlocked(BLOCKNUM *res, struct ft *ft);
void _free_blocknum_unlocked(BLOCKNUM *bp, struct ft *ft, bool for_checkpoint);
void _realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, struct ft *ft);
void _realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, bool for_checkpoint, uint64_t heat);
void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
void _free_blocknum_unlocked(BLOCKNUM *bp,
struct ft *ft,
bool for_checkpoint);
void _realloc_descriptor_on_disk_unlocked(DISKOFF size,
DISKOFF *offset,
struct ft *ft);
void _realloc_on_disk_internal(BLOCKNUM b,
DISKOFF size,
DISKOFF *offset,
struct ft *ft,
bool for_checkpoint);
void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b,
DISKOFF *offset,
DISKOFF *size);
// File management
void _maybe_truncate_file(int fd, uint64_t size_needed_before);
void _ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset);
void _ensure_safe_write_unlocked(int fd,
DISKOFF block_size,
DISKOFF block_offset);
// Verification
bool _is_valid_blocknum(struct translation *t, BLOCKNUM b);
@ -220,29 +269,33 @@ private:
bool _no_data_blocks_except_root(BLOCKNUM root);
bool _blocknum_allocated(BLOCKNUM b);
// Locking
// Locking
//
// TODO: Move the lock to the FT
void _mutex_lock();
void _mutex_unlock();
// The current translation is the one used by client threads.
// The current translation is the one used by client threads.
// It is not represented on disk.
struct translation _current;
// The translation used by the checkpoint currently in progress.
// If the checkpoint thread allocates a block, it must also update the current translation.
// The translation used by the checkpoint currently in progress.
// If the checkpoint thread allocates a block, it must also update the
// current translation.
struct translation _inprogress;
// The translation for the data that shall remain inviolate on disk until the next checkpoint finishes,
// The translation for the data that shall remain inviolate on disk until
// the next checkpoint finishes,
// after which any blocks used only in this translation can be freed.
struct translation _checkpointed;
// The in-memory data structure for block allocation.
// The in-memory data structure for block allocation.
// There is no on-disk data structure for block allocation.
// Note: This is *allocation* not *translation* - the block allocator is unaware of which
// blocks are used for which translation, but simply allocates and deallocates blocks.
block_allocator _bt_block_allocator;
// Note: This is *allocation* not *translation* - the block allocator is
// unaware of which
// blocks are used for which translation, but simply allocates and
// deallocates blocks.
BlockAllocator *_bt_block_allocator;
toku_mutex_t _mutex;
struct nb_mutex _safe_file_size_lock;
bool _checkpoint_skipped;
@ -257,16 +310,16 @@ private:
#include "ft/serialize/wbuf.h"
static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
static inline void wbuf_BLOCKNUM(struct wbuf *w, BLOCKNUM b) {
wbuf_ulonglong(w, b.b);
}
static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
static inline void wbuf_nocrc_BLOCKNUM(struct wbuf *w, BLOCKNUM b) {
wbuf_nocrc_ulonglong(w, b.b);
}
static inline void wbuf_DISKOFF(struct wbuf *wb, DISKOFF off) {
wbuf_ulonglong(wb, (uint64_t) off);
wbuf_ulonglong(wb, (uint64_t)off);
}
#include "ft/serialize/rbuf.h"
@ -280,6 +333,8 @@ static inline BLOCKNUM rbuf_blocknum(struct rbuf *rb) {
return result;
}
static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb, memarena *UU(ma), BLOCKNUM *blocknum) {
static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb,
memarena *UU(ma),
BLOCKNUM *blocknum) {
*blocknum = rbuf_blocknum(rb);
}
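To make the iterator contract above concrete, here is a hedged sketch; the callback body, its size > 0 test for used blocks, and the stop-on-nonzero return convention are assumptions for illustration, not taken from the commit:
static int count_used(BLOCKNUM b, int64_t size, int64_t address, void *extra) {
    (void)b;
    (void)address;
    if (size > 0)  // assumption: used blocks report a positive size
        ++*static_cast<int64_t *>(extra);
    return 0;  // assumed convention: a nonzero return stops the iteration
}
// ...
int64_t n_used = 0;
bt->iterate(block_table::TRANSLATION_CURRENT, count_used, &n_used, true, true);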

View file

@ -235,7 +235,7 @@ void toku_decompress (Bytef *dest, uLongf destLen,
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
char windowBits = source[1];
int8_t windowBits = source[1];
int r = inflateInit2(&strm, windowBits);
lazy_assert(r == Z_OK);
strm.next_out = dest;

View file

@ -217,8 +217,8 @@ int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
// translation table itself won't fit in main memory.
ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
translation_address_on_disk);
assert(readsz >= translation_size_on_disk);
assert(readsz <= (ssize_t)size_to_read);
invariant(readsz >= translation_size_on_disk);
invariant(readsz <= (ssize_t)size_to_read);
}
// Create table and read in data.
r = ft->blocktable.create_from_buffer(fd,
@ -411,73 +411,90 @@ exit:
return r;
}
static size_t
serialize_ft_min_size (uint32_t version) {
static size_t serialize_ft_min_size(uint32_t version) {
size_t size = 0;
switch(version) {
case FT_LAYOUT_VERSION_29:
size += sizeof(uint64_t); // logrows in ft
case FT_LAYOUT_VERSION_28:
size += sizeof(uint32_t); // fanout in ft
case FT_LAYOUT_VERSION_27:
case FT_LAYOUT_VERSION_26:
case FT_LAYOUT_VERSION_25:
case FT_LAYOUT_VERSION_24:
case FT_LAYOUT_VERSION_23:
case FT_LAYOUT_VERSION_22:
case FT_LAYOUT_VERSION_21:
size += sizeof(MSN); // max_msn_in_ft
case FT_LAYOUT_VERSION_20:
case FT_LAYOUT_VERSION_19:
size += 1; // compression method
size += sizeof(MSN); // highest_unused_msn_for_upgrade
case FT_LAYOUT_VERSION_18:
size += sizeof(uint64_t); // time_of_last_optimize_begin
size += sizeof(uint64_t); // time_of_last_optimize_end
size += sizeof(uint32_t); // count_of_optimize_in_progress
size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
size -= 8; // removed num_blocks_to_upgrade_14
size -= 8; // removed num_blocks_to_upgrade_13
case FT_LAYOUT_VERSION_17:
size += 16;
invariant(sizeof(STAT64INFO_S) == 16);
case FT_LAYOUT_VERSION_16:
case FT_LAYOUT_VERSION_15:
size += 4; // basement node size
size += 8; // num_blocks_to_upgrade_14 (previously num_blocks_to_upgrade, now one int each for upgrade from 13, 14
size += 8; // time of last verification
case FT_LAYOUT_VERSION_14:
size += 8; //TXNID that created
case FT_LAYOUT_VERSION_13:
size += ( 4 // build_id
+4 // build_id_original
+8 // time_of_creation
+8 // time_of_last_modification
);
switch (version) {
case FT_LAYOUT_VERSION_29:
size += sizeof(uint64_t); // logrows in ft
case FT_LAYOUT_VERSION_28:
size += sizeof(uint32_t); // fanout in ft
case FT_LAYOUT_VERSION_27:
case FT_LAYOUT_VERSION_26:
case FT_LAYOUT_VERSION_25:
case FT_LAYOUT_VERSION_24:
case FT_LAYOUT_VERSION_23:
case FT_LAYOUT_VERSION_22:
case FT_LAYOUT_VERSION_21:
size += sizeof(MSN); // max_msn_in_ft
case FT_LAYOUT_VERSION_20:
case FT_LAYOUT_VERSION_19:
size += 1; // compression method
size += sizeof(MSN); // highest_unused_msn_for_upgrade
case FT_LAYOUT_VERSION_18:
size += sizeof(uint64_t); // time_of_last_optimize_begin
size += sizeof(uint64_t); // time_of_last_optimize_end
size += sizeof(uint32_t); // count_of_optimize_in_progress
size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
size -= 8; // removed num_blocks_to_upgrade_14
size -= 8; // removed num_blocks_to_upgrade_13
case FT_LAYOUT_VERSION_17:
size += 16;
invariant(sizeof(STAT64INFO_S) == 16);
case FT_LAYOUT_VERSION_16:
case FT_LAYOUT_VERSION_15:
size += 4; // basement node size
size += 8; // num_blocks_to_upgrade_14 (previously
// num_blocks_to_upgrade, now one int each for upgrade
// from 13, 14
size += 8; // time of last verification
case FT_LAYOUT_VERSION_14:
size += 8; // TXNID that created
case FT_LAYOUT_VERSION_13:
size += (4 // build_id
+
4 // build_id_original
+
8 // time_of_creation
+
8 // time_of_last_modification
);
// fall through
case FT_LAYOUT_VERSION_12:
size += (+8 // "tokudata"
+4 // version
+4 // original_version
+4 // size
+8 // byte order verification
+8 // checkpoint_count
+8 // checkpoint_lsn
+4 // tree's nodesize
+8 // translation_size_on_disk
+8 // translation_address_on_disk
+4 // checksum
+8 // Number of blocks in old version.
+8 // diskoff
+4 // flags
);
break;
default:
abort();
case FT_LAYOUT_VERSION_12:
size += (+8 // "tokudata"
+
4 // version
+
4 // original_version
+
4 // size
+
8 // byte order verification
+
8 // checkpoint_count
+
8 // checkpoint_lsn
+
4 // tree's nodesize
+
8 // translation_size_on_disk
+
8 // translation_address_on_disk
+
4 // checksum
+
8 // Number of blocks in old version.
+
8 // diskoff
+
4 // flags
);
break;
default:
abort();
}
lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
return size;
}
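Because the switch falls through, each version's minimum size is the sum of every increment at its case and below; two spot checks read directly off the cases above:
// serialize_ft_min_size(FT_LAYOUT_VERSION_13)
//     == serialize_ft_min_size(FT_LAYOUT_VERSION_12) + 4 + 4 + 8 + 8
// serialize_ft_min_size(FT_LAYOUT_VERSION_14)
//     == serialize_ft_min_size(FT_LAYOUT_VERSION_13) + 8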
@ -486,7 +503,7 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
struct rbuf *rb,
uint64_t *checkpoint_count,
LSN *checkpoint_lsn,
uint32_t * version_p)
uint32_t *version_p)
// Effect: Read and parse the header of a fractal tree
//
// Simply reading the raw bytes of the header into an rbuf is insensitive
@ -496,18 +513,18 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
// file AND the header is useless
{
int r = 0;
const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version
4 + // build_id
4; // size
const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version
4 + // build_id
4; // size
const int64_t read_size = roundup_to_multiple(512, prefix_size);
unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
rb->buf = NULL;
int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
if (n != read_size) {
if (n==0) {
if (n == 0) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else if (n<0) {
} else if (n < 0) {
r = get_error_errno();
} else {
r = EINVAL;
@ -518,95 +535,102 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
rbuf_init(rb, prefix, prefix_size);
//Check magic number
// Check magic number
const void *magic;
rbuf_literal_bytes(rb, &magic, 8);
if (memcmp(magic,"tokudata",8)!=0) {
if ((*(uint64_t*)magic) == 0) {
if (memcmp(magic, "tokudata", 8) != 0) {
if ((*(uint64_t *)magic) == 0) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else {
r = EINVAL; //Not a tokudb file! Do not use.
r = EINVAL; // Not a tokudb file! Do not use.
}
goto exit;
}
//Version MUST be in network order regardless of disk order.
// Version MUST be in network order regardless of disk order.
uint32_t version;
version = rbuf_network_int(rb);
*version_p = version;
if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) {
r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
r = TOKUDB_DICTIONARY_TOO_OLD; // Cannot use
goto exit;
} else if (version > FT_LAYOUT_VERSION) {
r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
r = TOKUDB_DICTIONARY_TOO_NEW; // Cannot use
goto exit;
}
//build_id MUST be in network order regardless of disk order.
// build_id MUST be in network order regardless of disk order.
uint32_t build_id __attribute__((__unused__));
build_id = rbuf_network_int(rb);
int64_t min_header_size;
min_header_size = serialize_ft_min_size(version);
//Size MUST be in network order regardless of disk order.
// Size MUST be in network order regardless of disk order.
uint32_t size;
size = rbuf_network_int(rb);
//If too big, it is corrupt. We would probably notice during checksum
//but may have to do a multi-gigabyte malloc+read to find out.
//If its too small reading rbuf would crash, so verify.
if (size > block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE || size < min_header_size) {
// If too big, it is corrupt. We would probably notice during checksum
// but may have to do a multi-gigabyte malloc+read to find out.
// If it's too small, reading the rbuf would crash, so verify.
if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE ||
size < min_header_size) {
r = TOKUDB_DICTIONARY_NO_HEADER;
goto exit;
}
lazy_assert(rb->ndone==prefix_size);
lazy_assert(rb->ndone == prefix_size);
rb->size = size;
{
toku_free(rb->buf);
uint32_t size_to_read = roundup_to_multiple(512, size);
XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
assert(offset_of_header%512==0);
invariant(offset_of_header % 512 == 0);
n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
if (n != size_to_read) {
if (n < 0) {
r = get_error_errno();
} else {
r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
r = EINVAL; // Header might be useless (wrong size) or could be
// a disk read error.
}
goto exit;
}
}
//It's version 14 or later. Magic looks OK.
//We have an rbuf that represents the header.
//Size is within acceptable bounds.
// It's version 14 or later. Magic looks OK.
// We have an rbuf that represents the header.
// Size is within acceptable bounds.
//Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function changed)
// Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function
// changed)
uint32_t calculated_x1764;
calculated_x1764 = toku_x1764_memory(rb->buf, rb->size-4);
calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4);
uint32_t stored_x1764;
stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4));
if (calculated_x1764 != stored_x1764) {
r = TOKUDB_BAD_CHECKSUM; //Header useless
fprintf(stderr, "Header checksum failure: calc=0x%08x read=0x%08x\n", calculated_x1764, stored_x1764);
r = TOKUDB_BAD_CHECKSUM; // Header useless
fprintf(stderr,
"Header checksum failure: calc=0x%08x read=0x%08x\n",
calculated_x1764,
stored_x1764);
goto exit;
}
//Verify byte order
// Verify byte order
const void *tmp_byte_order_check;
lazy_assert((sizeof toku_byte_order_host) == 8);
rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
rbuf_literal_bytes(
rb, &tmp_byte_order_check, 8); // Must not translate byte order
int64_t byte_order_stored;
byte_order_stored = *(int64_t*)tmp_byte_order_check;
byte_order_stored = *(int64_t *)tmp_byte_order_check;
if (byte_order_stored != toku_byte_order_host) {
r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary
r = TOKUDB_DICTIONARY_NO_HEADER; // Cannot use dictionary
goto exit;
}
//Load checkpoint count
// Load checkpoint count
*checkpoint_count = rbuf_ulonglong(rb);
*checkpoint_lsn = rbuf_LSN(rb);
//Restart at beginning during regular deserialization
// Restart at beginning during regular deserialization
rb->ndone = 0;
exit:
@ -620,11 +644,7 @@ exit:
// Read ft from file into struct. Read both headers and use one.
// We want the latest acceptable header whose checkpoint_lsn is no later
// than max_acceptable_lsn.
int
toku_deserialize_ft_from(int fd,
LSN max_acceptable_lsn,
FT *ft)
{
int toku_deserialize_ft_from(int fd, LSN max_acceptable_lsn, FT *ft) {
struct rbuf rb_0;
struct rbuf rb_1;
uint64_t checkpoint_count_0 = 0;
@ -638,13 +658,23 @@ toku_deserialize_ft_from(int fd,
int r0, r1, r;
toku_off_t header_0_off = 0;
r0 = deserialize_ft_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0);
r0 = deserialize_ft_from_fd_into_rbuf(fd,
header_0_off,
&rb_0,
&checkpoint_count_0,
&checkpoint_lsn_0,
&version_0);
if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) {
h0_acceptable = true;
}
toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
r1 = deserialize_ft_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1);
toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
r1 = deserialize_ft_from_fd_into_rbuf(fd,
header_1_off,
&rb_1,
&checkpoint_count_1,
&checkpoint_lsn_1,
&version_1);
if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) {
h1_acceptable = true;
}
@ -655,24 +685,29 @@ toku_deserialize_ft_from(int fd,
// We were unable to read either header or at least one is too
// new. Certain errors are higher priority than others. Order of
// these if/else if is important.
if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW) {
if (r0 == TOKUDB_DICTIONARY_TOO_NEW ||
r1 == TOKUDB_DICTIONARY_TOO_NEW) {
r = TOKUDB_DICTIONARY_TOO_NEW;
} else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || r1 == TOKUDB_DICTIONARY_TOO_OLD) {
} else if (r0 == TOKUDB_DICTIONARY_TOO_OLD ||
r1 == TOKUDB_DICTIONARY_TOO_OLD) {
r = TOKUDB_DICTIONARY_TOO_OLD;
} else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) {
fprintf(stderr, "Both header checksums failed.\n");
r = TOKUDB_BAD_CHECKSUM;
} else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || r1 == TOKUDB_DICTIONARY_NO_HEADER) {
} else if (r0 == TOKUDB_DICTIONARY_NO_HEADER ||
r1 == TOKUDB_DICTIONARY_NO_HEADER) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else {
r = r0 ? r0 : r1; //Arbitrarily report the error from the
//first header, unless it's readable
r = r0 ? r0 : r1; // Arbitrarily report the error from the
// first header, unless it's readable
}
// it should not be possible for both headers to be later than the max_acceptable_lsn
invariant(!((r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
(r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
invariant(r!=0);
// it should not be possible for both headers to be later than the
// max_acceptable_lsn
invariant(
!((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
(r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
invariant(r != 0);
goto exit;
}
@ -682,8 +717,7 @@ toku_deserialize_ft_from(int fd,
invariant(version_0 >= version_1);
rb = &rb_0;
version = version_0;
}
else {
} else {
invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
invariant(version_1 >= version_0);
rb = &rb_1;
@ -692,14 +726,18 @@ toku_deserialize_ft_from(int fd,
} else if (h0_acceptable) {
if (r1 == TOKUDB_BAD_CHECKSUM) {
// print something reassuring
fprintf(stderr, "Header 2 checksum failed, but header 1 ok. Proceeding.\n");
fprintf(
stderr,
"Header 2 checksum failed, but header 1 ok. Proceeding.\n");
}
rb = &rb_0;
version = version_0;
} else if (h1_acceptable) {
if (r0 == TOKUDB_BAD_CHECKSUM) {
// print something reassuring
fprintf(stderr, "Header 1 checksum failed, but header 2 ok. Proceeding.\n");
fprintf(
stderr,
"Header 1 checksum failed, but header 2 ok. Proceeding.\n");
}
rb = &rb_1;
version = version_1;
@ -718,15 +756,13 @@ exit:
return r;
}
size_t toku_serialize_ft_size (FT_HEADER h) {
size_t toku_serialize_ft_size(FT_HEADER h) {
size_t size = serialize_ft_min_size(h->layout_version);
//There is no dynamic data.
lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
// There is no dynamic data.
lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
return size;
}
void toku_serialize_ft_to_wbuf (
struct wbuf *wbuf,
FT_HEADER h,
@ -771,52 +807,60 @@ void toku_serialize_ft_to_wbuf (
}
void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) {
lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS);
lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS);
struct wbuf w_translation;
int64_t size_translation;
int64_t address_translation;
// Must serialize translation first, to get address,size for header.
bt->serialize_translation_to_wbuf(fd, &w_translation,
&address_translation,
&size_translation);
assert(size_translation == w_translation.ndone);
bt->serialize_translation_to_wbuf(
fd, &w_translation, &address_translation, &size_translation);
invariant(size_translation == w_translation.ndone);
// the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized.
assert(w_translation.size % 512 == 0);
// the number of bytes available in the buffer is 0 mod 512, and those last
// bytes are all initialized.
invariant(w_translation.size % 512 == 0);
struct wbuf w_main;
size_t size_main = toku_serialize_ft_size(h);
size_t size_main = toku_serialize_ft_size(h);
size_t size_main_aligned = roundup_to_multiple(512, size_main);
assert(size_main_aligned<block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
invariant(size_main_aligned <
BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros
for (size_t i = size_main; i < size_main_aligned; i++)
mainbuf[i] = 0; // initialize the end of the buffer with zeros
wbuf_init(&w_main, mainbuf, size_main);
toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation);
toku_serialize_ft_to_wbuf(
&w_main, h, address_translation, size_translation);
lazy_assert(w_main.ndone == size_main);
// Actually write translation table
// This write is guaranteed to read good data at the end of the buffer, since the
// This write is guaranteed to read good data at the end of the buffer,
// since the
// w_translation.buf is padded with zeros to a 512-byte boundary.
toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation);
toku_os_full_pwrite(fd,
w_translation.buf,
roundup_to_multiple(512, size_translation),
address_translation);
//Everything but the header MUST be on disk before header starts.
//Otherwise we will think the header is good and some blocks might not
//yet be on disk.
//If the header has a cachefile we need to do cachefile fsync (to
//prevent crash if we redirected to dev null)
//If there is no cachefile we still need to do an fsync.
// Everything but the header MUST be on disk before header starts.
// Otherwise we will think the header is good and some blocks might not
// yet be on disk.
// If the header has a cachefile we need to do cachefile fsync (to
// prevent crash if we redirected to dev null)
// If there is no cachefile we still need to do an fsync.
if (cf) {
toku_cachefile_fsync(cf);
}
else {
} else {
toku_file_fsync(fd);
}
//Alternate writing header to two locations:
// Alternate writing header to two locations:
// Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
toku_off_t main_offset;
main_offset = (h->checkpoint_count & 0x1) ? 0 : block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
main_offset = (h->checkpoint_count & 0x1)
? 0
: BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
toku_free(w_main.buf);
toku_free(w_translation.buf);
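The parity logic above ping-pongs the header between two fixed slots; summarizing what the code does:
// checkpoint_count odd  -> header written at offset 0
// checkpoint_count even -> header written at BLOCK_ALLOCATOR_HEADER_RESERVE
// A torn header write can thus damage only the slot holding the older
// checkpoint; deserialization then picks the newest acceptable header.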

View file

@ -99,13 +99,11 @@ void toku_ft_serialize_layer_init(void) {
num_cores = toku_os_get_number_active_processors();
int r = toku_thread_pool_create(&ft_pool, num_cores);
lazy_assert_zero(r);
block_allocator::maybe_initialize_trace();
toku_serialize_in_parallel = false;
}
void toku_ft_serialize_layer_destroy(void) {
toku_thread_pool_destroy(&ft_pool);
block_allocator::maybe_close_trace();
}
enum { FILE_CHANGE_INCREMENT = (16 << 20) };
@ -773,19 +771,23 @@ int toku_serialize_ftnode_to_memory(FTNODE node,
return 0;
}
int
toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT ft, bool for_checkpoint) {
int toku_serialize_ftnode_to(int fd,
BLOCKNUM blocknum,
FTNODE node,
FTNODE_DISK_DATA *ndd,
bool do_rebalancing,
FT ft,
bool for_checkpoint) {
size_t n_to_write;
size_t n_uncompressed_bytes;
char *compressed_buf = nullptr;
// because toku_serialize_ftnode_to is only called for
// because toku_serialize_ftnode_to is only called for
// in toku_ftnode_flush_callback, we pass false
// for in_parallel. The reasoning is that when we write
// nodes to disk via toku_ftnode_flush_callback, we
// nodes to disk via toku_ftnode_flush_callback, we
// assume that it is being done on a non-critical
// background thread (probably for checkpointing), and therefore
// background thread (probably for checkpointing), and therefore
// should not hog CPU,
//
// Should the above facts change, we may want to revisit
@ -802,32 +804,32 @@ toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DA
toku_unsafe_fetch(&toku_serialize_in_parallel),
&n_to_write,
&n_uncompressed_bytes,
&compressed_buf
);
&compressed_buf);
if (r != 0) {
return r;
}
// If the node has never been written, then write the whole buffer, including the zeros
invariant(blocknum.b>=0);
// If the node has never been written, then write the whole buffer,
// including the zeros
invariant(blocknum.b >= 0);
DISKOFF offset;
// Dirties the ft
ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
ft, fd, for_checkpoint,
// Allocations for nodes high in the tree are considered 'hot',
// as they are likely to move again in the next checkpoint.
node->height);
ft->blocktable.realloc_on_disk(
blocknum, n_to_write, &offset, ft, fd, for_checkpoint);
tokutime_t t0 = toku_time_now();
toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
tokutime_t t1 = toku_time_now();
tokutime_t io_time = t1 - t0;
toku_ft_status_update_flush_reason(node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint);
toku_ft_status_update_flush_reason(
node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint);
toku_free(compressed_buf);
node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
node->dirty = 0; // See #1957. Must set the node to be clean after
// serializing it so that it doesn't get written again on
// the next checkpoint or eviction.
return 0;
}
@ -994,6 +996,7 @@ BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn) {
bn->seqinsert = orig_bn->seqinsert;
bn->stale_ancestor_messages_applied = orig_bn->stale_ancestor_messages_applied;
bn->stat64_delta = orig_bn->stat64_delta;
bn->logical_rows_delta = orig_bn->logical_rows_delta;
bn->data_buffer.clone(&orig_bn->data_buffer);
return bn;
}
@ -1004,6 +1007,7 @@ BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
bn->seqinsert = 0;
bn->stale_ancestor_messages_applied = false;
bn->stat64_delta = ZEROSTATS;
bn->logical_rows_delta = 0;
bn->data_buffer.init_zero();
return bn;
}
@ -1897,7 +1901,7 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
/* out */ int *layout_version_p);
// This function upgrades a version 14 or 13 ftnode to the current
// verison. NOTE: This code assumes the first field of the rbuf has
// version. NOTE: This code assumes the first field of the rbuf has
// already been read from the buffer (namely the layout_version of the
// ftnode.)
static int
@ -2488,9 +2492,12 @@ toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIAL
serialized->blocknum = log->blocknum;
}
int
toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized,
FT ft, bool for_checkpoint) {
int toku_serialize_rollback_log_to(int fd,
ROLLBACK_LOG_NODE log,
SERIALIZED_ROLLBACK_LOG_NODE serialized_log,
bool is_serialized,
FT ft,
bool for_checkpoint) {
size_t n_to_write;
char *compressed_buf;
struct serialized_rollback_log_node serialized_local;
@ -2511,21 +2518,21 @@ toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBA
serialized_log->n_sub_blocks,
serialized_log->sub_block,
ft->h->compression_method,
&n_to_write, &compressed_buf);
&n_to_write,
&compressed_buf);
// Dirties the ft
DISKOFF offset;
ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
ft, fd, for_checkpoint,
// We consider rollback log flushing the hottest possible allocation,
// since rollback logs are short-lived compared to FT nodes.
INT_MAX);
ft->blocktable.realloc_on_disk(
blocknum, n_to_write, &offset, ft, fd, for_checkpoint);
toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
toku_free(compressed_buf);
if (!is_serialized) {
toku_static_serialized_rollback_log_destroy(&serialized_local);
log->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
log->dirty = 0; // See #1957. Must set the node to be clean after
// serializing it so that it doesn't get written again
// on the next checkpoint or eviction.
}
return 0;
}
@ -2704,7 +2711,7 @@ exit:
}
static int decompress_from_raw_block_into_rbuf_versioned(uint32_t version, uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) {
// This function exists solely to accomodate future changes in compression.
// This function exists solely to accommodate future changes in compression.
int r = 0;
if ((version == FT_LAYOUT_VERSION_13 || version == FT_LAYOUT_VERSION_14) ||
(FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) ||

View file

@ -0,0 +1,833 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "ft/serialize/rbtree_mhs.h"
#include "portability/toku_assert.h"
#include "portability/toku_portability.h"
#include <algorithm>
namespace MhsRbTree {
Tree::Tree() : _root(NULL), _align(1) {}
Tree::Tree(uint64_t align) : _root(NULL), _align(align) {}
Tree::~Tree() { Destroy(); }
void Tree::PreOrder(Node *tree) const {
if (tree != NULL) {
fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
PreOrder(tree->_left);
PreOrder(tree->_right);
}
}
void Tree::PreOrder() { PreOrder(_root); }
void Tree::InOrder(Node *tree) const {
if (tree != NULL) {
InOrder(tree->_left);
fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
InOrder(tree->_right);
}
}
// yeah, i only care about in order visitor. -Jun
void Tree::InOrderVisitor(Node *tree,
void (*f)(void *, Node *, uint64_t),
void *extra,
uint64_t depth) {
if (tree != NULL) {
InOrderVisitor(tree->_left, f, extra, depth + 1);
f(extra, tree, depth);
InOrderVisitor(tree->_right, f, extra, depth + 1);
}
}
void Tree::InOrderVisitor(void (*f)(void *, Node *, uint64_t),
void *extra) {
InOrderVisitor(_root, f, extra, 0);
}
void Tree::InOrder() { InOrder(_root); }
void Tree::PostOrder(Node *tree) const {
if (tree != NULL) {
PostOrder(tree->_left);
PostOrder(tree->_right);
fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
}
}
void Tree::PostOrder() { PostOrder(_root); }
Node *Tree::SearchByOffset(uint64_t offset) {
Node *x = _root;
while ((x != NULL) && (rbn_offset(x).ToInt() != offset)) {
if (offset < rbn_offset(x).ToInt())
x = x->_left;
else
x = x->_right;
}
return x;
}
// mostly for testing
Node *Tree::SearchFirstFitBySize(uint64_t size) {
if (EffectiveSize(_root) < size && rbn_left_mhs(_root) < size &&
rbn_right_mhs(_root) < size) {
return nullptr;
} else {
return SearchFirstFitBySizeHelper(_root, size);
}
}
Node *Tree::SearchFirstFitBySizeHelper(Node *x, uint64_t size) {
if (EffectiveSize(x) >= size) {
// only possible to go left
if (rbn_left_mhs(x) >= size)
return SearchFirstFitBySizeHelper(x->_left, size);
else
return x;
}
if (rbn_left_mhs(x) >= size)
return SearchFirstFitBySizeHelper(x->_left, size);
if (rbn_right_mhs(x) >= size)
return SearchFirstFitBySizeHelper(x->_right, size);
// this is an invalid state
Dump();
ValidateBalance();
ValidateMhs();
invariant(0);
return NULL;
}
Node *Tree::MinNode(Node *tree) {
if (tree == NULL)
return NULL;
while (tree->_left != NULL)
tree = tree->_left;
return tree;
}
Node *Tree::MinNode() { return MinNode(_root); }
Node *Tree::MaxNode(Node *tree) {
if (tree == NULL)
return NULL;
while (tree->_right != NULL)
tree = tree->_right;
return tree;
}
Node *Tree::MaxNode() { return MaxNode(_root); }
Node *Tree::SuccessorHelper(Node *y, Node *x) {
while ((y != NULL) && (x == y->_right)) {
x = y;
y = y->_parent;
}
return y;
}
Node *Tree::Successor(Node *x) {
if (x->_right != NULL)
return MinNode(x->_right);
Node *y = x->_parent;
return SuccessorHelper(y, x);
}
Node *Tree::PredecessorHelper(Node *y, Node *x) {
while ((y != NULL) && (x == y->_left)) {
x = y;
y = y->_parent;
}
return y;
}
Node *Tree::Predecessor(Node *x) {
if (x->_left != NULL)
return MaxNode(x->_left);
Node *y = x->_parent;
return PredecessorHelper(y, x);
}
/*
* px px
* / /
* x y
* / \ --(left rotation)--> / \ #
* lx y x ry
* / \ / \
* ly ry lx ly
* max_hole_size updates are pretty local
*/
void Tree::LeftRotate(Node *&root, Node *x) {
Node *y = x->_right;
x->_right = y->_left;
rbn_right_mhs(x) = rbn_left_mhs(y);
if (y->_left != NULL)
y->_left->_parent = x;
y->_parent = x->_parent;
if (x->_parent == NULL) {
root = y;
} else {
if (x->_parent->_left == x) {
x->_parent->_left = y;
} else {
x->_parent->_right = y;
}
}
y->_left = x;
rbn_left_mhs(y) = mhs_of_subtree(x);
x->_parent = y;
}
/* py py
* / /
* y x
* / \ --(right rotate)--> / \ #
* x ry lx y
* / \ / \ #
* lx rx rx ry
*
*/
void Tree::RightRotate(Node *&root, Node *y) {
Node *x = y->_left;
y->_left = x->_right;
rbn_left_mhs(y) = rbn_right_mhs(x);
if (x->_right != NULL)
x->_right->_parent = y;
x->_parent = y->_parent;
if (y->_parent == NULL) {
root = x;
} else {
if (y == y->_parent->_right)
y->_parent->_right = x;
else
y->_parent->_left = x;
}
x->_right = y;
rbn_right_mhs(x) = mhs_of_subtree(y);
y->_parent = x;
}
// walk from this node up to update the mhs info;
// whenever the left/right mhs or size changes we should recalculate.
// prerequisite: the node's children have up-to-date mhs values.
void Tree::RecalculateMhs(Node *node) {
uint64_t *p_node_mhs = 0;
Node *parent = node->_parent;
if (!parent)
return;
uint64_t max_mhs = mhs_of_subtree(node);
if (node == parent->_left) {
p_node_mhs = &rbn_left_mhs(parent);
} else if (node == parent->_right) {
p_node_mhs = &rbn_right_mhs(parent);
} else {
return;
}
if (*p_node_mhs != max_mhs) {
*p_node_mhs = max_mhs;
RecalculateMhs(parent);
}
}
void Tree::IsNewNodeMergable(Node *pred,
Node *succ,
Node::BlockPair pair,
bool *left_merge,
bool *right_merge) {
if (pred) {
OUUInt64 end_of_pred = rbn_size(pred) + rbn_offset(pred);
if (end_of_pred < pair._offset)
*left_merge = false;
else {
invariant(end_of_pred == pair._offset);
*left_merge = true;
}
}
if (succ) {
OUUInt64 begin_of_succ = rbn_offset(succ);
OUUInt64 end_of_node = pair._offset + pair._size;
if (end_of_node < begin_of_succ) {
*right_merge = false;
} else {
invariant(end_of_node == begin_of_succ);
*right_merge = true;
}
}
}
void Tree::AbsorbNewNode(Node *pred,
Node *succ,
Node::BlockPair pair,
bool left_merge,
bool right_merge,
bool is_right_child) {
invariant(left_merge || right_merge);
if (left_merge && right_merge) {
// merge to the succ
if (!is_right_child) {
rbn_size(succ) += pair._size;
rbn_offset(succ) = pair._offset;
// merge to the pred
rbn_size(pred) += rbn_size(succ);
// to keep the invariant of the tree -no overlapping holes
rbn_offset(succ) += rbn_size(succ);
rbn_size(succ) = 0;
RecalculateMhs(succ);
RecalculateMhs(pred);
// pred dominates succ. this is going to
// update the pred labels separately.
// remove succ
RawRemove(_root, succ);
} else {
rbn_size(pred) += pair._size;
rbn_offset(succ) = rbn_offset(pred);
rbn_size(succ) += rbn_size(pred);
rbn_offset(pred) += rbn_size(pred);
rbn_size(pred) = 0;
RecalculateMhs(pred);
RecalculateMhs(succ);
// now remove pred
RawRemove(_root, pred);
}
} else if (left_merge) {
rbn_size(pred) += pair._size;
RecalculateMhs(pred);
} else if (right_merge) {
rbn_offset(succ) -= pair._size;
rbn_size(succ) += pair._size;
RecalculateMhs(succ);
}
}
// this is the most tedious part, but not complicated:
// 1. find where to insert the pair
// 2. if both pred and succ can merge with the pair, merge with them;
//    either pred or succ is then removed
// 3. if only left-mergeable or right-mergeable, just merge
// 4. non-mergeable case: insert the node and run the fixup
int Tree::Insert(Node *&root, Node::BlockPair pair) {
Node *x = _root;
Node *y = NULL;
bool left_merge = false;
bool right_merge = false;
Node *node = NULL;
while (x != NULL) {
y = x;
if (pair._offset < rbn_key(x))
x = x->_left;
else
x = x->_right;
}
// we found where to insert; let's find out the pred and succ for
// possible merges.
// node->parent = y;
Node *pred, *succ;
if (y != NULL) {
if (pair._offset < rbn_key(y)) {
// as the left child
pred = PredecessorHelper(y->_parent, y);
succ = y;
IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge);
if (left_merge || right_merge) {
AbsorbNewNode(
pred, succ, pair, left_merge, right_merge, false);
} else {
// construct the node
Node::Pair mhsp {0, 0};
node =
new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
if (!node)
return -1;
y->_left = node;
node->_parent = y;
RecalculateMhs(node);
}
} else {
// as the right child
pred = y;
succ = SuccessorHelper(y->_parent, y);
IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge);
if (left_merge || right_merge) {
AbsorbNewNode(
pred, succ, pair, left_merge, right_merge, true);
} else {
// construct the node
Node::Pair mhsp {0, 0};
node =
new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
if (!node)
return -1;
y->_right = node;
node->_parent = y;
RecalculateMhs(node);
}
}
} else {
Node::Pair mhsp {0, 0};
node = new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
if (!node)
return -1;
root = node;
}
if (!left_merge && !right_merge) {
invariant_notnull(node);
node->_color = EColor::RED;
return InsertFixup(root, node);
}
return 0;
}
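A hedged worked example of the merge cases handled above (offsets illustrative):
// existing holes: (0,10) and (20,5); now insert the freed range (10,10):
//   pred (0,10): end_of_pred == 10 == new offset  -> left-mergeable
//   succ (20,5): new end == 20 == succ's offset   -> right-mergeable
// both merge, so the three ranges collapse into the single hole (0,25)
// and the now-redundant neighbor node is removed via RawRemove.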
int Tree::InsertFixup(Node *&root, Node *node) {
Node *parent, *gparent;
while ((parent = rbn_parent(node)) && rbn_is_red(parent)) {
gparent = rbn_parent(parent);
if (parent == gparent->_left) {
{
Node *uncle = gparent->_right;
if (uncle && rbn_is_red(uncle)) {
rbn_set_black(uncle);
rbn_set_black(parent);
rbn_set_red(gparent);
node = gparent;
continue;
}
}
if (parent->_right == node) {
Node *tmp;
LeftRotate(root, parent);
tmp = parent;
parent = node;
node = tmp;
}
rbn_set_black(parent);
rbn_set_red(gparent);
RightRotate(root, gparent);
} else {
{
Node *uncle = gparent->_left;
if (uncle && rbn_is_red(uncle)) {
rbn_set_black(uncle);
rbn_set_black(parent);
rbn_set_red(gparent);
node = gparent;
continue;
}
}
if (parent->_left == node) {
Node *tmp;
RightRotate(root, parent);
tmp = parent;
parent = node;
node = tmp;
}
rbn_set_black(parent);
rbn_set_red(gparent);
LeftRotate(root, gparent);
}
}
rbn_set_black(root);
return 0;
}
int Tree::Insert(Node::BlockPair pair) { return Insert(_root, pair); }
uint64_t Tree::Remove(size_t size) {
Node *node = SearchFirstFitBySize(size);
return Remove(_root, node, size);
}
void Tree::RawRemove(Node *&root, Node *node) {
Node *child, *parent;
EColor color;
if ((node->_left != NULL) && (node->_right != NULL)) {
Node *replace = node;
replace = replace->_right;
while (replace->_left != NULL)
replace = replace->_left;
if (rbn_parent(node)) {
if (rbn_parent(node)->_left == node)
rbn_parent(node)->_left = replace;
else
rbn_parent(node)->_right = replace;
} else {
root = replace;
}
child = replace->_right;
parent = rbn_parent(replace);
color = rbn_color(replace);
if (parent == node) {
parent = replace;
} else {
if (child)
rbn_parent(child) = parent;
parent->_left = child;
rbn_left_mhs(parent) = rbn_right_mhs(replace);
RecalculateMhs(parent);
replace->_right = node->_right;
rbn_set_parent(node->_right, replace);
rbn_right_mhs(replace) = rbn_right_mhs(node);
}
replace->_parent = node->_parent;
replace->_color = node->_color;
replace->_left = node->_left;
rbn_left_mhs(replace) = rbn_left_mhs(node);
node->_left->_parent = replace;
RecalculateMhs(replace);
if (color == EColor::BLACK)
RawRemoveFixup(root, child, parent);
delete node;
return;
}
if (node->_left != NULL)
child = node->_left;
else
child = node->_right;
parent = node->_parent;
color = node->_color;
if (child)
child->_parent = parent;
if (parent) {
if (parent->_left == node) {
parent->_left = child;
rbn_left_mhs(parent) = child ? mhs_of_subtree(child) : 0;
} else {
parent->_right = child;
rbn_right_mhs(parent) = child ? mhs_of_subtree(child) : 0;
}
RecalculateMhs(parent);
} else
root = child;
if (color == EColor::BLACK)
RawRemoveFixup(root, child, parent);
delete node;
}
void Tree::RawRemove(uint64_t offset) {
Node *node = SearchByOffset(offset);
RawRemove(_root, node);
}
static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
uint64_t Tree::Remove(Node *&root, Node *node, size_t size) {
OUUInt64 n_offset = rbn_offset(node);
OUUInt64 n_size = rbn_size(node);
OUUInt64 answer_offset(align(rbn_offset(node).ToInt(), _align));
invariant((answer_offset + size) <= (n_offset + n_size));
if (answer_offset == n_offset) {
rbn_offset(node) += size;
rbn_size(node) -= size;
RecalculateMhs(node);
if (rbn_size(node) == 0) {
RawRemove(root, node);
}
} else {
if (answer_offset + size == n_offset + n_size) {
rbn_size(node) -= size;
RecalculateMhs(node);
} else {
// well, cut in the middle...
rbn_size(node) = answer_offset - n_offset;
RecalculateMhs(node);
Insert(_root,
{(answer_offset + size),
(n_offset + n_size) - (answer_offset + size)});
}
}
return answer_offset.ToInt();
}
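Two hedged traces of the shapes an allocation can take in Remove (values illustrative):
// hole (100,50), _align == 1, request 20:
//   aligned offset 100 == hole start -> allocate from the front;
//   the hole shrinks in place to (120,30) and 100 is returned
// hole (1000,10000), _align == 4096, request 2000:
//   aligned offset 4096 -> cut in the middle; (1000,3096) stays,
//   a new hole (6096,4904) is inserted, and 4096 is returned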
void Tree::RawRemoveFixup(Node *&root, Node *node, Node *parent) {
Node *other;
while ((!node || rbn_is_black(node)) && node != root) {
if (parent->_left == node) {
other = parent->_right;
if (rbn_is_red(other)) {
// Case 1: the brother of x, w, is red
rbn_set_black(other);
rbn_set_red(parent);
LeftRotate(root, parent);
other = parent->_right;
}
if ((!other->_left || rbn_is_black(other->_left)) &&
(!other->_right || rbn_is_black(other->_right))) {
// Case 2: w is black and both of w's children are black
rbn_set_red(other);
node = parent;
parent = rbn_parent(node);
} else {
if (!other->_right || rbn_is_black(other->_right)) {
// Case 3: w is black, the left child of w is red,
// and the right child of w is black
rbn_set_black(other->_left);
rbn_set_red(other);
RightRotate(root, other);
other = parent->_right;
}
// Case 4: w is black and right child of w is red,
// regardless of left child's color
rbn_set_color(other, rbn_color(parent));
rbn_set_black(parent);
rbn_set_black(other->_right);
LeftRotate(root, parent);
node = root;
break;
}
} else {
other = parent->_left;
if (rbn_is_red(other)) {
// Case 1: w is red
rbn_set_black(other);
rbn_set_red(parent);
RightRotate(root, parent);
other = parent->_left;
}
if ((!other->_left || rbn_is_black(other->_left)) &&
(!other->_right || rbn_is_black(other->_right))) {
// Case 2: w is black and both children are black
rbn_set_red(other);
node = parent;
parent = rbn_parent(node);
} else {
if (!other->_left || rbn_is_black(other->_left)) {
// Case 3: w is black and left child of w is red whereas
// right child is black
rbn_set_black(other->_right);
rbn_set_red(other);
LeftRotate(root, other);
other = parent->_left;
}
// Case 4: w is black and right child of w is red,
// regardless of the left child's color
rbn_set_color(other, rbn_color(parent));
rbn_set_black(parent);
rbn_set_black(other->_left);
RightRotate(root, parent);
node = root;
break;
}
}
}
if (node)
rbn_set_black(node);
}
void Tree::Destroy(Node *&tree) {
if (tree == NULL)
return;
if (tree->_left != NULL)
Destroy(tree->_left);
if (tree->_right != NULL)
Destroy(tree->_right);
delete tree;
tree = NULL;
}
void Tree::Destroy() { Destroy(_root); }
void Tree::Dump(Node *tree, Node::BlockPair pair, EDirection dir) {
if (tree != NULL) {
if (dir == EDirection::NONE)
fprintf(stderr,
"(%" PRIu64 ",%" PRIu64 ", mhs:(%" PRIu64 ",%" PRIu64
"))(B) is root\n",
rbn_offset(tree).ToInt(),
rbn_size(tree).ToInt(),
rbn_left_mhs(tree),
rbn_right_mhs(tree));
else
fprintf(stderr,
"(%" PRIu64 ",%" PRIu64 ",mhs:(%" PRIu64 ",%" PRIu64
"))(%c) is %" PRIu64 "'s %s\n",
rbn_offset(tree).ToInt(),
rbn_size(tree).ToInt(),
rbn_left_mhs(tree),
rbn_right_mhs(tree),
rbn_is_red(tree) ? 'R' : 'B',
pair._offset.ToInt(),
dir == EDirection::RIGHT ? "right child" : "left child");
Dump(tree->_left, tree->_hole, EDirection::LEFT);
Dump(tree->_right, tree->_hole, EDirection::RIGHT);
}
}
uint64_t Tree::EffectiveSize(Node *node) {
OUUInt64 offset = rbn_offset(node);
OUUInt64 size = rbn_size(node);
OUUInt64 end = offset + size;
OUUInt64 aligned_offset(align(offset.ToInt(), _align));
if (aligned_offset > end) {
return 0;
}
return (end - aligned_offset).ToInt();
}
void Tree::Dump() {
if (_root != NULL)
Dump(_root, _root->_hole, (EDirection)0);
}
static void vis_bal_f(void *extra, Node *node, uint64_t depth) {
uint64_t **p = (uint64_t **)extra;
uint64_t min = *p[0];
uint64_t max = *p[1];
if (node->_left) {
Node *left = node->_left;
invariant(node == left->_parent);
}
if (node->_right) {
Node *right = node->_right;
invariant(node == right->_parent);
}
if (!node->_left || !node->_right) {
if (min > depth) {
*p[0] = depth;
} else if (max < depth) {
*p[1] = depth;
}
}
}
void Tree::ValidateBalance() {
uint64_t min_depth = 0xffffffffffffffff;
uint64_t max_depth = 0;
if (!_root) {
return;
}
uint64_t *p[2] = {&min_depth, &max_depth};
InOrderVisitor(vis_bal_f, (void *)p);
invariant((min_depth + 1) * 2 >= max_depth + 1);
}
static void vis_cmp_f(void *extra, Node *node, uint64_t UU(depth)) {
Node::BlockPair **p = (Node::BlockPair **)extra;
invariant_notnull(*p);
invariant((*p)->_offset == node->_hole._offset);
*p = *p + 1;
}
// validate the input pairs matches with sorted pairs
void Tree::ValidateInOrder(Node::BlockPair *pairs) {
InOrderVisitor(vis_cmp_f, &pairs);
}
uint64_t Tree::ValidateMhs(Node *node) {
if (!node)
return 0;
else {
uint64_t mhs_left = ValidateMhs(node->_left);
uint64_t mhs_right = ValidateMhs(node->_right);
if (mhs_left != rbn_left_mhs(node)) {
printf("assert failure: mhs_left = %" PRIu64 "\n", mhs_left);
Dump(node, node->_hole, (EDirection)0);
}
invariant(mhs_left == rbn_left_mhs(node));
if (mhs_right != rbn_right_mhs(node)) {
printf("assert failure: mhs_right = %" PRIu64 "\n", mhs_right);
Dump(node, node->_hole, (EDirection)0);
}
invariant(mhs_right == rbn_right_mhs(node));
return std::max(EffectiveSize(node), std::max(mhs_left, mhs_right));
}
}
void Tree::ValidateMhs() {
if (!_root)
return;
uint64_t mhs_left = ValidateMhs(_root->_left);
uint64_t mhs_right = ValidateMhs(_root->_right);
invariant(mhs_left == rbn_left_mhs(_root));
invariant(mhs_right == rbn_right_mhs(_root));
}
} // namespace MhsRbTree
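A hedged end-to-end sketch of the public Tree API above; values are illustrative, and the BlockPair brace-initialization mirrors the Insert call inside Remove:
MhsRbTree::Tree tree(512);         // holes are handed out 512-byte aligned
tree.Insert({0, 4096});            // one free hole covering [0, 4096)
uint64_t off = tree.Remove(1024);  // first fit: off == 0, hole shrinks to (1024, 3072)
tree.Insert({off, 1024});          // freeing right-merges back into a single (0, 4096) hole
// ~Tree() runs Destroy() and frees every node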

View file

@ -0,0 +1,351 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <db.h>
#include "portability/toku_pthread.h"
#include "portability/toku_stdint.h"
#include "portability/toku_stdlib.h"
// RBTree (red-black tree) with max hole sizes for subtrees.
// This is a tentative data structure to improve the block allocation time
// complexity from linear to logarithmic. Note that this DS only supports
// first-fit for now; best-fit would actually be easier to implement (just
// sort by size).
// An RBTree is a classic data structure with O(log(n)) insertion, deletion
// and search, and its efficiency is well proven.
// A *hole* represents an available BlockPair for allocation, written as
// (start_address, size) or (offset, size) interchangeably.
// Each node carries a *label*: the pair of max hole sizes for its left and
// right subtrees.
// We are implementing an RBTree with max hole sizes for subtrees: a red-black
// tree sorted by start_address and additionally labeled with the max hole
// sizes of the subtrees.
// [(6,3)] -> [(offset, size)], the hole
// [{2,5}] -> [{mhs_of_left, mhs_of_right}], the label
/* / \ */
// [(0, 1)] [(10, 5)]
// [{0, 2}] [{0, 0}]
/* \ */
// [(3, 2)]
// [{0, 0}]
// An allocation request of size=2 descends from the root to [(3,2)].
// The example above shows a simplified RBTree_max_holes.
// It is easy to see that the search time is O(log(n)): we make one decision
// on each descent until we reach the target.
// The only question is whether we can keep the maintenance cost low -- and
// that is not a problem, because an insertion/deletion only updates the
// max_hole_sizes of the nodes along the path from the root to the node being
// deleted/inserted. That path can be cached, and the search is O(log(n))
// anyway.
// Unlike a typical rbtree, this Tree has to handle inserts and deletes with
// more care: an allocation that triggers a delete may leave some unused
// space, in which case we can simply update the start_addr and size without
// worrying about overlap. A free may mean not only an insertion but also
// *merging* with the adjacent holes.
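// Illustrative sketch (editorial, not part of this file): how a first-fit
// lookup can descend using the labels. MiniNodeSketch and FirstFitSketch are
// hypothetical stand-ins for the Node/Tree declarations below; the real
// search lives in Tree::SearchFirstFitBySizeHelper().
struct MiniNodeSketch {
    uint64_t _size;                    // this node's own hole size
    uint64_t _left_mhs, _right_mhs;    // max hole size of each subtree
    MiniNodeSketch *_left, *_right;    // children, sorted by offset
};
static MiniNodeSketch *FirstFitSketch(MiniNodeSketch *n, uint64_t want) {
    if (n == NULL)
        return NULL;
    if (n->_left_mhs >= want)          // a fit exists at a lower offset
        return FirstFitSketch(n->_left, want);
    if (n->_size >= want)              // this node's own hole fits
        return n;
    if (n->_right_mhs >= want)         // a fit exists at a higher offset
        return FirstFitSketch(n->_right, want);
    return NULL;                       // no hole can satisfy the request
}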
namespace MhsRbTree {
typedef uint64_t offset_t;
enum class EColor { RED, BLACK };
enum class EDirection { NONE = 0, LEFT, RIGHT };
// Tired of repeatedly fixing overflow/underflow bugs, we quickly craft an
// int class that has an infinity-like max value and prevents overflow and
// underflow. If you have a file offset larger than MHS_MAX_VAL, that is not
// a problem here. :-/ - JYM
class OUUInt64 {
public:
static const uint64_t MHS_MAX_VAL = 0xffffffffffffffff;
OUUInt64() : _value(0) {}
OUUInt64(uint64_t s) : _value(s) {}
bool operator<(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value < r.ToInt();
}
bool operator>(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value > r.ToInt();
}
bool operator<=(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value <= r.ToInt();
}
bool operator>=(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value >= r.ToInt();
}
OUUInt64 operator+(const OUUInt64 &r) const {
if (_value == MHS_MAX_VAL || r.ToInt() == MHS_MAX_VAL) {
OUUInt64 tmp(MHS_MAX_VAL);
return tmp;
} else {
// detecting overflow
invariant((MHS_MAX_VAL - _value) >= r.ToInt());
uint64_t plus = _value + r.ToInt();
OUUInt64 tmp(plus);
return tmp;
}
}
OUUInt64 operator-(const OUUInt64 &r) const {
invariant(r.ToInt() != MHS_MAX_VAL);
if (_value == MHS_MAX_VAL) {
return *this;
} else {
invariant(_value >= r.ToInt());
uint64_t minus = _value - r.ToInt();
OUUInt64 tmp(minus);
return tmp;
}
}
OUUInt64 operator-=(const OUUInt64 &r) {
if (_value != MHS_MAX_VAL) {
invariant(r.ToInt() != MHS_MAX_VAL);
invariant(_value >= r.ToInt());
_value -= r.ToInt();
}
return *this;
}
OUUInt64 operator+=(const OUUInt64 &r) {
if (_value != MHS_MAX_VAL) {
if (r.ToInt() == MHS_MAX_VAL) {
_value = MHS_MAX_VAL;
} else {
invariant((MHS_MAX_VAL - _value) >= r.ToInt());
this->_value += r.ToInt();
}
}
return *this;
}
bool operator==(const OUUInt64 &r) const {
return _value == r.ToInt();
}
bool operator!=(const OUUInt64 &r) const {
return _value != r.ToInt();
}
OUUInt64 operator=(const OUUInt64 &r) {
_value = r.ToInt();
return *this;
}
uint64_t ToInt() const { return _value; }
private:
uint64_t _value;
};
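// Usage sketch (illustrative): MHS_MAX_VAL acts like infinity --
//     OUUInt64 inf(OUUInt64::MHS_MAX_VAL), x(40);
//     invariant((inf + x) == inf);       // addition saturates at the max value
//     invariant((inf - x) == inf);       // "infinity" minus a finite value stays infinite
//     invariant((x + x).ToInt() == 80);  // finite values behave normally
// Comparing two "infinite" values, or overflowing a finite sum, trips an
// invariant instead of silently wrapping.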
class Node {
public:
struct BlockPair {
OUUInt64 _offset;
OUUInt64 _size;
BlockPair() : _offset(0), _size(0) {}
BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {}
BlockPair(OUUInt64 o, OUUInt64 s) : _offset(o), _size(s) {}
int operator<(const struct BlockPair &rhs) const {
return _offset < rhs._offset;
}
int operator<(const uint64_t &o) const { return _offset < o; }
};
struct Pair {
uint64_t _left;
uint64_t _right;
Pair(uint64_t l, uint64_t r) : _left(l), _right(r) {}
};
EColor _color;
struct BlockPair _hole;
struct Pair _label;
Node *_left;
Node *_right;
Node *_parent;
Node(EColor c,
Node::BlockPair h,
struct Pair lb,
Node *l,
Node *r,
Node *p)
: _color(c),
_hole(h),
_label(lb),
_left(l),
_right(r),
_parent(p) {}
};
class Tree {
private:
Node *_root;
uint64_t _align;
public:
Tree();
Tree(uint64_t);
~Tree();
void PreOrder();
void InOrder();
void PostOrder();
// immutable operations
Node *SearchByOffset(uint64_t addr);
Node *SearchFirstFitBySize(uint64_t size);
Node *MinNode();
Node *MaxNode();
Node *Successor(Node *);
Node *Predecessor(Node *);
// mapped from tree_allocator::free_block
int Insert(Node::BlockPair pair);
// mapped from tree_allocator::alloc_block
uint64_t Remove(size_t size);
// mapped from tree_allocator::alloc_block_after
void RawRemove(uint64_t offset);
void Destroy();
// print the tree
void Dump();
// validation
// balance
void ValidateBalance();
void ValidateInOrder(Node::BlockPair *);
void InOrderVisitor(void (*f)(void *, Node *, uint64_t), void *);
void ValidateMhs();
private:
void PreOrder(Node *node) const;
void InOrder(Node *node) const;
void PostOrder(Node *node) const;
Node *SearchByOffset(Node *node, offset_t addr) const;
Node *SearchFirstFitBySize(Node *node, size_t size) const;
Node *MinNode(Node *node);
Node *MaxNode(Node *node);
// rotations for fix-up; we will have to update the labels too.
void LeftRotate(Node *&root, Node *x);
void RightRotate(Node *&root, Node *y);
int Insert(Node *&root, Node::BlockPair pair);
int InsertFixup(Node *&root, Node *node);
void RawRemove(Node *&root, Node *node);
uint64_t Remove(Node *&root, Node *node, size_t size);
void RawRemoveFixup(Node *&root, Node *node, Node *parent);
void Destroy(Node *&tree);
void Dump(Node *tree, Node::BlockPair pair, EDirection dir);
void RecalculateMhs(Node *node);
void IsNewNodeMergable(Node *, Node *, Node::BlockPair, bool *, bool *);
void AbsorbNewNode(Node *, Node *, Node::BlockPair, bool, bool, bool);
Node *SearchFirstFitBySizeHelper(Node *x, uint64_t size);
Node *SuccessorHelper(Node *y, Node *x);
Node *PredecessorHelper(Node *y, Node *x);
void InOrderVisitor(Node *,
void (*f)(void *, Node *, uint64_t),
void *,
uint64_t);
uint64_t ValidateMhs(Node *);
uint64_t EffectiveSize(Node *);
// helper macros for accessing and updating node fields
#define rbn_parent(r) ((r)->_parent)
#define rbn_color(r) ((r)->_color)
#define rbn_is_red(r) ((r)->_color == EColor::RED)
#define rbn_is_black(r) ((r)->_color == EColor::BLACK)
#define rbn_set_black(r) \
do { \
(r)->_color = EColor::BLACK; \
} while (0)
#define rbn_set_red(r) \
do { \
(r)->_color = EColor::RED; \
} while (0)
#define rbn_set_parent(r, p) \
do { \
(r)->_parent = (p); \
} while (0)
#define rbn_set_color(r, c) \
do { \
(r)->_color = (c); \
} while (0)
#define rbn_set_offset(r, c) \
do { \
(r)->_hole._offset = (c); \
} while (0)
#define rbn_set_size(r, c) \
do { \
(r)->_hole._size = (c); \
} while (0)
#define rbn_set_left_mhs(r, c) \
do { \
(r)->_label._left = (c); \
} while (0)
#define rbn_set_right_mhs(r, c) \
do { \
(r)->_label._right = (c); \
} while (0)
#define rbn_size(r) ((r)->_hole._size)
#define rbn_offset(r) ((r)->_hole._offset)
#define rbn_key(r) ((r)->_hole._offset)
#define rbn_left_mhs(r) ((r)->_label._left)
#define rbn_right_mhs(r) ((r)->_label._right)
#define mhs_of_subtree(y) \
(std::max(std::max(rbn_left_mhs(y), rbn_right_mhs(y)), EffectiveSize(y)))
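// That is, a subtree's max hole size is the maximum of the node's own aligned
// effective size and the labels of its two children; RecalculateMhs() is
// expected to restore this recurrence along the root-to-node path after every
// insert, delete and rotation.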
};
} // namespace MhsRbTree

View file

@ -1,126 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "ft/tests/test.h"
#include "ft/serialize/block_allocator_strategy.h"
static const uint64_t alignment = 4096;
static void test_first_vs_best_fit(void) {
struct block_allocator::blockpair pairs[] = {
block_allocator::blockpair(1 * alignment, 6 * alignment),
// hole between 7x align -> 8x align
block_allocator::blockpair(8 * alignment, 4 * alignment),
// hole between 12x align -> 16x align
block_allocator::blockpair(16 * alignment, 1 * alignment),
block_allocator::blockpair(17 * alignment, 2 * alignment),
// hole between 19 align -> 21x align
block_allocator::blockpair(21 * alignment, 2 * alignment),
};
const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]);
block_allocator::blockpair *bp;
// first fit
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 100, alignment);
assert(bp == &pairs[0]);
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 4096, alignment);
assert(bp == &pairs[0]);
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 3 * 4096, alignment);
assert(bp == &pairs[1]);
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 5 * 4096, alignment);
assert(bp == nullptr);
// best fit
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 100, alignment);
assert(bp == &pairs[0]);
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 4100, alignment);
assert(bp == &pairs[3]);
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 3 * 4096, alignment);
assert(bp == &pairs[1]);
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 5 * 4096, alignment);
assert(bp == nullptr);
}
static void test_padded_fit(void) {
struct block_allocator::blockpair pairs[] = {
block_allocator::blockpair(1 * alignment, 1 * alignment),
// 4096 byte hole after bp[0]
block_allocator::blockpair(3 * alignment, 1 * alignment),
// 8192 byte hole after bp[1]
block_allocator::blockpair(6 * alignment, 1 * alignment),
// 16384 byte hole after bp[2]
block_allocator::blockpair(11 * alignment, 1 * alignment),
// 32768 byte hole after bp[3]
block_allocator::blockpair(17 * alignment, 1 * alignment),
// 116kb hole after bp[4]
block_allocator::blockpair(113 * alignment, 1 * alignment),
// 256kb hole after bp[5]
block_allocator::blockpair(371 * alignment, 1 * alignment),
};
const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]);
block_allocator::blockpair *bp;
// padding for a 100 byte allocation will be < than standard alignment,
// so it should fit in the first 4096 byte hole.
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 4000, alignment);
assert(bp == &pairs[0]);
// Even padded, a 12kb alloc will fit in a 16kb hole
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 3 * alignment, alignment);
assert(bp == &pairs[2]);
// would normally fit in the 116kb hole but the padding will bring it over
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 116 * alignment, alignment);
assert(bp == &pairs[5]);
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 127 * alignment, alignment);
assert(bp == &pairs[5]);
}
int test_main(int argc, const char *argv[]) {
(void) argc;
(void) argv;
test_first_vs_best_fit();
test_padded_fit();
return 0;
}

View file

@ -38,253 +38,243 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "test.h"
static void ba_alloc(block_allocator *ba, uint64_t size, uint64_t *answer) {
ba->validate();
static void ba_alloc(BlockAllocator *ba, uint64_t size, uint64_t *answer) {
ba->Validate();
uint64_t actual_answer;
const uint64_t heat = random() % 2;
ba->alloc_block(512 * size, heat, &actual_answer);
ba->validate();
ba->AllocBlock(512 * size, &actual_answer);
ba->Validate();
assert(actual_answer%512==0);
*answer = actual_answer/512;
invariant(actual_answer % 512 == 0);
*answer = actual_answer / 512;
}
static void ba_free(block_allocator *ba, uint64_t offset) {
ba->validate();
ba->free_block(offset * 512);
ba->validate();
static void ba_free(BlockAllocator *ba, uint64_t offset, uint64_t size) {
ba->Validate();
ba->FreeBlock(offset * 512, 512 * size);
ba->Validate();
}
static void ba_check_l(block_allocator *ba, uint64_t blocknum_in_layout_order,
uint64_t expected_offset, uint64_t expected_size) {
static void ba_check_l(BlockAllocator *ba,
uint64_t blocknum_in_layout_order,
uint64_t expected_offset,
uint64_t expected_size) {
uint64_t actual_offset, actual_size;
int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==0);
assert(expected_offset*512 == actual_offset);
assert(expected_size *512 == actual_size);
int r = ba->NthBlockInLayoutOrder(
blocknum_in_layout_order, &actual_offset, &actual_size);
invariant(r == 0);
invariant(expected_offset * 512 == actual_offset);
invariant(expected_size * 512 == actual_size);
}
static void ba_check_none(block_allocator *ba, uint64_t blocknum_in_layout_order) {
static void ba_check_none(BlockAllocator *ba,
uint64_t blocknum_in_layout_order) {
uint64_t actual_offset, actual_size;
int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==-1);
int r = ba->NthBlockInLayoutOrder(
blocknum_in_layout_order, &actual_offset, &actual_size);
invariant(r == -1);
}
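// Note that the helpers above work in 512-byte units: offsets and sizes are
// scaled by 512 on the way into the BlockAllocator and scaled back down in
// the answers the tests compare against.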
// Simple block allocator test
static void test_ba0(block_allocator::allocation_strategy strategy) {
block_allocator allocator;
block_allocator *ba = &allocator;
ba->create(100*512, 1*512);
ba->set_strategy(strategy);
assert(ba->allocated_limit()==100*512);
static void test_ba0() {
BlockAllocator allocator;
BlockAllocator *ba = &allocator;
ba->Create(100 * 512, 1 * 512);
invariant(ba->AllocatedLimit() == 100 * 512);
uint64_t b2, b3, b4, b5, b6, b7;
ba_alloc(ba, 100, &b2);
ba_alloc(ba, 100, &b3);
ba_alloc(ba, 100, &b4);
ba_alloc(ba, 100, &b5);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b2);
ba_alloc(ba, 100, &b2);
ba_free(ba, b4);
ba_free(ba, b6);
ba_alloc(ba, 100, &b2);
ba_alloc(ba, 100, &b3);
ba_alloc(ba, 100, &b4);
ba_alloc(ba, 100, &b5);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b2, 100);
ba_alloc(ba, 100, &b2);
ba_free(ba, b4, 100);
ba_free(ba, b6, 100);
uint64_t b8, b9;
ba_alloc(ba, 100, &b4);
ba_free(ba, b2);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b8);
ba_alloc(ba, 100, &b9);
ba_free(ba, b6);
ba_free(ba, b7);
ba_free(ba, b8);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b4);
ba_alloc(ba, 100, &b4);
ba_alloc(ba, 100, &b4);
ba_free(ba, b2, 100);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b8);
ba_alloc(ba, 100, &b9);
ba_free(ba, b6, 100);
ba_free(ba, b7, 100);
ba_free(ba, b8, 100);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b4, 100);
ba_alloc(ba, 100, &b4);
ba->destroy();
ba->Destroy();
}
// Manually written to get coverage of all the code in the block allocator.
static void
test_ba1(block_allocator::allocation_strategy strategy, int n_initial) {
block_allocator allocator;
block_allocator *ba = &allocator;
ba->create(0*512, 1*512);
ba->set_strategy(strategy);
static void test_ba1(int n_initial) {
BlockAllocator allocator;
BlockAllocator *ba = &allocator;
ba->Create(0 * 512, 1 * 512);
int n_blocks=0;
int n_blocks = 0;
uint64_t blocks[1000];
for (int i = 0; i < 1000; i++) {
if (i < n_initial || random() % 2 == 0) {
if (n_blocks < 1000) {
ba_alloc(ba, 1, &blocks[n_blocks]);
//printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]);
n_blocks++;
}
} else {
if (n_blocks > 0) {
int blocknum = random()%n_blocks;
//printf("F[%d]%ld\n", blocknum, blocks[blocknum]);
ba_free(ba, blocks[blocknum]);
blocks[blocknum]=blocks[n_blocks-1];
n_blocks--;
}
}
if (i < n_initial || random() % 2 == 0) {
if (n_blocks < 1000) {
ba_alloc(ba, 1, &blocks[n_blocks]);
// printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]);
n_blocks++;
}
} else {
if (n_blocks > 0) {
int blocknum = random() % n_blocks;
// printf("F[%d]=%ld\n", blocknum, blocks[blocknum]);
ba_free(ba, blocks[blocknum], 1);
blocks[blocknum] = blocks[n_blocks - 1];
n_blocks--;
}
}
}
ba->destroy();
ba->Destroy();
}
// Check to see if it is first fit or best fit.
static void
test_ba2 (void)
{
block_allocator allocator;
block_allocator *ba = &allocator;
static void test_ba2(void) {
BlockAllocator allocator;
BlockAllocator *ba = &allocator;
uint64_t b[6];
enum { BSIZE = 1024 };
ba->create(100*512, BSIZE*512);
ba->set_strategy(block_allocator::BA_STRATEGY_FIRST_FIT);
assert(ba->allocated_limit()==100*512);
ba->Create(100 * 512, BSIZE * 512);
invariant(ba->AllocatedLimit() == 100 * 512);
ba_check_l (ba, 0, 0, 100);
ba_check_none (ba, 1);
ba_check_l(ba, 0, 0, 100);
ba_check_none(ba, 1);
ba_alloc (ba, 100, &b[0]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_none (ba, 2);
ba_alloc(ba, 100, &b[0]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_none(ba, 2);
ba_alloc (ba, BSIZE + 100, &b[1]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_none (ba, 3);
ba_alloc(ba, BSIZE + 100, &b[1]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_none(ba, 3);
ba_alloc (ba, 100, &b[2]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_none (ba, 4);
ba_alloc(ba, 100, &b[2]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_none(ba, 4);
ba_alloc (ba, 100, &b[3]);
ba_alloc (ba, 100, &b[4]);
ba_alloc (ba, 100, &b[5]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
ba_free (ba, 4*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 5*BSIZE, 100);
ba_check_l (ba, 4, 6*BSIZE, 100);
ba_check_l (ba, 5, 7*BSIZE, 100);
ba_check_none (ba, 6);
ba_alloc(ba, 100, &b[3]);
ba_alloc(ba, 100, &b[4]);
ba_alloc(ba, 100, &b[5]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_none(ba, 7);
ba_free(ba, 4 * BSIZE, 100);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 5 * BSIZE, 100);
ba_check_l(ba, 4, 6 * BSIZE, 100);
ba_check_l(ba, 5, 7 * BSIZE, 100);
ba_check_none(ba, 6);
uint64_t b2;
ba_alloc(ba, 100, &b2);
assert(b2==4*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
invariant(b2 == 4 * BSIZE);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_none(ba, 7);
ba_free (ba, BSIZE);
ba_free (ba, 5*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 2, 4*BSIZE, 100);
ba_check_l (ba, 3, 6*BSIZE, 100);
ba_check_l (ba, 4, 7*BSIZE, 100);
ba_check_none (ba, 5);
ba_free(ba, BSIZE, 100);
ba_free(ba, 5 * BSIZE, 100);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 2, 4 * BSIZE, 100);
ba_check_l(ba, 3, 6 * BSIZE, 100);
ba_check_l(ba, 4, 7 * BSIZE, 100);
ba_check_none(ba, 5);
// This alloc will allocate the first block after the reserve space in the case of first fit.
// This alloc will allocate the first block after the reserve space in the
// case of first fit.
uint64_t b3;
ba_alloc(ba, 100, &b3);
assert(b3== BSIZE); // First fit.
invariant(b3 == BSIZE); // First fit.
// if (b3==5*BSIZE) then it is next fit.
// Now 5*BSIZE is free
uint64_t b5;
ba_alloc(ba, 100, &b5);
assert(b5==5*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
invariant(b5 == 5 * BSIZE);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_none(ba, 7);
// Now all blocks are busy
uint64_t b6, b7, b8;
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_alloc(ba, 100, &b8);
assert(b6==8*BSIZE);
assert(b7==9*BSIZE);
assert(b8==10*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_l (ba, 7, 8*BSIZE, 100);
ba_check_l (ba, 8, 9*BSIZE, 100);
ba_check_l (ba, 9, 10*BSIZE, 100);
ba_check_none (ba, 10);
ba_free(ba, 9*BSIZE);
ba_free(ba, 7*BSIZE);
invariant(b6 == 8 * BSIZE);
invariant(b7 == 9 * BSIZE);
invariant(b8 == 10 * BSIZE);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_l(ba, 7, 8 * BSIZE, 100);
ba_check_l(ba, 8, 9 * BSIZE, 100);
ba_check_l(ba, 9, 10 * BSIZE, 100);
ba_check_none(ba, 10);
ba_free(ba, 9 * BSIZE, 100);
ba_free(ba, 7 * BSIZE, 100);
uint64_t b9;
ba_alloc(ba, 100, &b9);
assert(b9==7*BSIZE);
invariant(b9 == 7 * BSIZE);
ba_free(ba, 5*BSIZE);
ba_free(ba, 2*BSIZE);
ba_free(ba, 5 * BSIZE, 100);
ba_free(ba, 2 * BSIZE, BSIZE + 100);
uint64_t b10, b11;
ba_alloc(ba, 100, &b10);
assert(b10==2*BSIZE);
invariant(b10 == 2 * BSIZE);
ba_alloc(ba, 100, &b11);
assert(b11==3*BSIZE);
invariant(b11 == 3 * BSIZE);
ba_alloc(ba, 100, &b11);
assert(b11==5*BSIZE);
invariant(b11 == 5 * BSIZE);
ba->destroy();
ba->Destroy();
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
enum block_allocator::allocation_strategy strategies[] = {
block_allocator::BA_STRATEGY_FIRST_FIT,
block_allocator::BA_STRATEGY_BEST_FIT,
block_allocator::BA_STRATEGY_PADDED_FIT,
block_allocator::BA_STRATEGY_HEAT_ZONE,
};
for (size_t i = 0; i < sizeof(strategies) / sizeof(strategies[0]); i++) {
test_ba0(strategies[i]);
test_ba1(strategies[i], 0);
test_ba1(strategies[i], 10);
test_ba1(strategies[i], 20);
}
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
test_ba0();
test_ba1(0);
test_ba1(10);
test_ba1(20);
test_ba2();
return 0;
}

View file

@ -45,7 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
// #5978 is fixed. Here is what we do. We have four pairs with
// blocknums and fullhashes of 1,2,3,4. The cachetable has only
// two bucket mutexes, so 1 and 3 share a pair mutex, as do 2 and 4.
// We pin all four with expensive write locks. Then, on backgroud threads,
// We pin all four with expensive write locks. Then, on background threads,
// we call get_and_pin_nonblocking on 3, where the unlocker unpins 2, and
// we call get_and_pin_nonblocking on 4, where the unlocker unpins 1. Run this
// enough times, and we should see a deadlock before the fix, and no deadlock

View file

@ -77,7 +77,7 @@ flush (
//
// test the following things for simple cloning:
// - verifies that after teh checkpoint ends, the PAIR is properly
// - verifies that after the checkpoint ends, the PAIR is properly
// dirty or clean based on the second unpin
//
static void

View file

@ -38,69 +38,72 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "test.h"
static int
int64_key_cmp (DB *db UU(), const DBT *a, const DBT *b) {
int64_t x = *(int64_t *) a->data;
int64_t y = *(int64_t *) b->data;
static int int64_key_cmp(DB *db UU(), const DBT *a, const DBT *b) {
int64_t x = *(int64_t *)a->data;
int64_t y = *(int64_t *)b->data;
if (x<y) return -1;
if (x>y) return 1;
if (x < y)
return -1;
if (x > y)
return 1;
return 0;
}
static void
test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
static void test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
int r;
FT_CURSOR XMALLOC(cursor);
FTNODE dn = NULL;
PAIR_ATTR attr;
// first test that prefetching everything should work
memset(&cursor->range_lock_left_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_right_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_left_key, 0, sizeof(DBT));
memset(&cursor->range_lock_right_key, 0, sizeof(DBT));
cursor->left_is_neg_infty = true;
cursor->right_is_pos_infty = true;
cursor->disable_prefetching = false;
ftnode_fetch_extra bfe;
// quick test to see that we have the right behavior when we set
// disable_prefetching to true
cursor->disable_prefetching = true;
bfe.create_for_prefetch( ft_h, cursor);
bfe.create_for_prefetch(ft_h, cursor);
FTNODE_DISK_DATA ndd = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
// now enable prefetching again
cursor->disable_prefetching = false;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_COMPRESSED);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
@ -108,21 +111,23 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
uint64_t left_key = 150;
toku_fill_dbt(&cursor->range_lock_left_key, &left_key, sizeof(uint64_t));
cursor->left_is_neg_infty = false;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
@ -130,63 +135,69 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
uint64_t right_key = 151;
toku_fill_dbt(&cursor->range_lock_right_key, &right_key, sizeof(uint64_t));
cursor->right_is_pos_infty = false;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_ON_DISK);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
left_key = 100000;
right_key = 100000;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
bfe.destroy();
toku_free(ndd);
toku_ftnode_free(&dn);
left_key = 100;
right_key = 100;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_COMPRESSED);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
@ -194,20 +205,19 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
toku_free(cursor);
}
static void
test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
static void test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
int r;
FT_CURSOR XMALLOC(cursor);
FTNODE dn = NULL;
FTNODE_DISK_DATA ndd = NULL;
PAIR_ATTR attr;
// first test that prefetching everything should work
memset(&cursor->range_lock_left_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_right_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_left_key, 0, sizeof(DBT));
memset(&cursor->range_lock_right_key, 0, sizeof(DBT));
cursor->left_is_neg_infty = true;
cursor->right_is_pos_infty = true;
uint64_t left_key = 150;
uint64_t right_key = 151;
DBT left, right;
@ -216,101 +226,106 @@ test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
ftnode_fetch_extra bfe;
bfe.create_for_subset_read(
ft_h,
NULL,
&left,
&right,
false,
false,
false,
false
);
ft_h, NULL, &left, &right, false, false, false, false);
// fake the childnum to read
// set disable_prefetching ON
bfe.child_to_read = 2;
bfe.disable_prefetching = true;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
// need to call this twice because we had a subset read before, that touched the clock
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
// need to call this twice because we had a subset read before that touched
// the clock
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_free(&dn);
toku_free(ndd);
// fake the childnum to read
bfe.child_to_read = 2;
bfe.disable_prefetching = false;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
// need to call this twice because we had a subset read before, that touched the clock
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
// need to call this twice because we had a subset read before that touched
// the clock
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_free(&dn);
toku_free(ndd);
// fake the childnum to read
bfe.child_to_read = 0;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
// need to call this twice because we had a subset read before, that touched the clock
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_ON_DISK);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_COMPRESSED);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_ON_DISK);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
// need to call this twice because we had a subset read before that touched
// the clock
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_free(&dn);
toku_free(ndd);
toku_free(cursor);
}
static void
test_prefetching(void) {
static void test_prefetching(void) {
// struct ft_handle source_ft;
struct ftnode sn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -327,7 +342,7 @@ test_prefetching(void) {
uint64_t key1 = 100;
uint64_t key2 = 200;
MALLOC_N(sn.n_children, sn.bp);
DBT pivotkeys[2];
toku_fill_dbt(&pivotkeys[0], &key1, sizeof(key1));
@ -336,13 +351,13 @@ test_prefetching(void) {
BP_BLOCKNUM(&sn, 0).b = 30;
BP_BLOCKNUM(&sn, 1).b = 35;
BP_BLOCKNUM(&sn, 2).b = 40;
BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL;
BP_STATE(&sn,2) = PT_AVAIL;
BP_STATE(&sn, 0) = PT_AVAIL;
BP_STATE(&sn, 1) = PT_AVAIL;
BP_STATE(&sn, 2) = PT_AVAIL;
set_BNC(&sn, 0, toku_create_empty_nl());
set_BNC(&sn, 1, toku_create_empty_nl());
set_BNC(&sn, 2, toku_create_empty_nl());
//Create XIDS
// Create XIDS
XIDS xids_0 = toku_xids_get_root_xids();
XIDS xids_123;
XIDS xids_234;
@ -352,7 +367,7 @@ test_prefetching(void) {
CKERR(r);
// data in the buffers does not matter in this test
//Cleanup:
// Cleanup:
toku_xids_destroy(&xids_0);
toku_xids_destroy(&xids_123);
toku_xids_destroy(&xids_234);
@ -363,41 +378,48 @@ test_prefetching(void) {
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(int64_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
test_prefetch_read(fd, ft, ft_h);
test_prefetch_read(fd, ft, ft_h);
test_subset_read(fd, ft, ft_h);
toku_destroy_ftnode_internals(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
ft_h->cmp.destroy();
toku_free(ft_h->h);
@ -405,11 +427,12 @@ test_prefetching(void) {
toku_free(ft);
toku_free(ndd);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
test_prefetching();
return 0;

View file

@ -40,38 +40,28 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "ft/cursor.h"
enum ftnode_verify_type {
read_all=1,
read_compressed,
read_none
};
enum ftnode_verify_type { read_all = 1, read_compressed, read_none };
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
static int
string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
{
static int string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) {
char *CAST_FROM_VOIDP(s, a->data);
char *CAST_FROM_VOIDP(t, b->data);
return strcmp(s, t);
}
static void
le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keylen, const char *val, int vallen)
{
static void le_add_to_bn(bn_data *bn,
uint32_t idx,
const char *key,
int keylen,
const char *val,
int vallen) {
LEAFENTRY r = NULL;
uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
void *maybe_free = nullptr;
bn->get_space_for_insert(
idx,
key,
keylen,
size_needed,
&r,
&maybe_free
);
bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free);
if (maybe_free) {
toku_free(maybe_free);
}
@ -81,70 +71,67 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keylen, const char
memcpy(r->u.clean.val, val, vallen);
}
static void
le_malloc(bn_data* bn, uint32_t idx, const char *key, const char *val)
{
static void le_malloc(bn_data *bn,
uint32_t idx,
const char *key,
const char *val) {
int keylen = strlen(key) + 1;
int vallen = strlen(val) + 1;
le_add_to_bn(bn, idx, key, keylen, val, vallen);
}
static void
test1(int fd, FT ft_h, FTNODE *dn) {
static void test1(int fd, FT ft_h, FTNODE *dn) {
int r;
ftnode_fetch_extra bfe_all;
bfe_all.create_for_full_read(ft_h);
FTNODE_DISK_DATA ndd = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_all);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_all);
bool is_leaf = ((*dn)->height == 0);
assert(r==0);
invariant(r == 0);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
// should sweep and NOT get rid of anything
PAIR_ATTR attr;
memset(&attr,0,sizeof(attr));
memset(&attr, 0, sizeof(attr));
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
// should sweep and get compress all
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
if (!is_leaf) {
assert(BP_STATE(*dn,i) == PT_COMPRESSED);
}
else {
assert(BP_STATE(*dn,i) == PT_ON_DISK);
invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
} else {
invariant(BP_STATE(*dn, i) == PT_ON_DISK);
}
}
PAIR_ATTR size;
bool req = toku_ftnode_pf_req_callback(*dn, &bfe_all);
assert(req);
invariant(req);
toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size);
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
// should sweep and get compress all
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
if (!is_leaf) {
assert(BP_STATE(*dn,i) == PT_COMPRESSED);
invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
} else {
invariant(BP_STATE(*dn, i) == PT_ON_DISK);
}
else {
assert(BP_STATE(*dn,i) == PT_ON_DISK);
}
}
}
req = toku_ftnode_pf_req_callback(*dn, &bfe_all);
assert(req);
invariant(req);
toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size);
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
(*dn)->dirty = 1;
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
@ -152,101 +139,102 @@ test1(int fd, FT ft_h, FTNODE *dn) {
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
toku_free(ndd);
toku_ftnode_free(dn);
}
static int search_cmp(const struct ft_search& UU(so), const DBT* UU(key)) {
static int search_cmp(const struct ft_search &UU(so), const DBT *UU(key)) {
return 0;
}
static void
test2(int fd, FT ft_h, FTNODE *dn) {
static void test2(int fd, FT ft_h, FTNODE *dn) {
DBT left, right;
DB dummy_db;
memset(&dummy_db, 0, sizeof(dummy_db));
memset(&left, 0, sizeof(left));
memset(&right, 0, sizeof(right));
ft_search search;
ftnode_fetch_extra bfe_subset;
bfe_subset.create_for_subset_read(
ft_h,
ft_search_init(&search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr),
ft_search_init(
&search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr),
&left,
&right,
true,
true,
false,
false
);
false);
FTNODE_DISK_DATA ndd = NULL;
int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_subset);
assert(r==0);
int r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_subset);
invariant(r == 0);
bool is_leaf = ((*dn)->height == 0);
// at this point, although both partitions are available, only the
// at this point, although both partitions are available, only the
// second basement node should have had its clock
// touched
assert(BP_STATE(*dn, 0) == PT_AVAIL);
assert(BP_STATE(*dn, 1) == PT_AVAIL);
assert(BP_SHOULD_EVICT(*dn, 0));
assert(!BP_SHOULD_EVICT(*dn, 1));
invariant(BP_STATE(*dn, 0) == PT_AVAIL);
invariant(BP_STATE(*dn, 1) == PT_AVAIL);
invariant(BP_SHOULD_EVICT(*dn, 0));
invariant(!BP_SHOULD_EVICT(*dn, 1));
PAIR_ATTR attr;
memset(&attr,0,sizeof(attr));
memset(&attr, 0, sizeof(attr));
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(*dn, 0) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
assert(BP_STATE(*dn, 1) == PT_AVAIL);
assert(BP_SHOULD_EVICT(*dn, 1));
invariant(BP_STATE(*dn, 0) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
invariant(BP_STATE(*dn, 1) == PT_AVAIL);
invariant(BP_SHOULD_EVICT(*dn, 1));
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(*dn, 1) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
invariant(BP_STATE(*dn, 1) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
bool req = toku_ftnode_pf_req_callback(*dn, &bfe_subset);
assert(req);
invariant(req);
toku_ftnode_pf_callback(*dn, ndd, &bfe_subset, fd, &attr);
assert(BP_STATE(*dn, 0) == PT_AVAIL);
assert(BP_STATE(*dn, 1) == PT_AVAIL);
assert(BP_SHOULD_EVICT(*dn, 0));
assert(!BP_SHOULD_EVICT(*dn, 1));
invariant(BP_STATE(*dn, 0) == PT_AVAIL);
invariant(BP_STATE(*dn, 1) == PT_AVAIL);
invariant(BP_SHOULD_EVICT(*dn, 0));
invariant(!BP_SHOULD_EVICT(*dn, 1));
toku_free(ndd);
toku_ftnode_free(dn);
}
static void
test3_leaf(int fd, FT ft_h, FTNODE *dn) {
static void test3_leaf(int fd, FT ft_h, FTNODE *dn) {
DBT left, right;
DB dummy_db;
memset(&dummy_db, 0, sizeof(dummy_db));
memset(&left, 0, sizeof(left));
memset(&right, 0, sizeof(right));
ftnode_fetch_extra bfe_min;
bfe_min.create_for_min_read(ft_h);
FTNODE_DISK_DATA ndd = NULL;
int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_min);
assert(r==0);
int r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_min);
invariant(r == 0);
//
// make sure we have a leaf
//
assert((*dn)->height == 0);
invariant((*dn)->height == 0);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn, i) == PT_ON_DISK);
invariant(BP_STATE(*dn, i) == PT_ON_DISK);
}
toku_ftnode_free(dn);
toku_free(ndd);
}
static void
test_serialize_nonleaf(void) {
static void test_serialize_nonleaf(void) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -265,11 +253,11 @@ test_serialize_nonleaf(void) {
sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "hello", 6), 1);
BP_BLOCKNUM(&sn, 0).b = 30;
BP_BLOCKNUM(&sn, 1).b = 35;
BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL;
BP_STATE(&sn, 0) = PT_AVAIL;
BP_STATE(&sn, 1) = PT_AVAIL;
set_BNC(&sn, 0, toku_create_empty_nl());
set_BNC(&sn, 1, toku_create_empty_nl());
//Create XIDS
// Create XIDS
XIDS xids_0 = toku_xids_get_root_xids();
XIDS xids_123;
XIDS xids_234;
@ -281,11 +269,38 @@ test_serialize_nonleaf(void) {
toku::comparator cmp;
cmp.create(string_key_cmp, nullptr);
toku_bnc_insert_msg(BNC(&sn, 0), "a", 2, "aval", 5, FT_NONE, next_dummymsn(), xids_0, true, cmp);
toku_bnc_insert_msg(BNC(&sn, 0), "b", 2, "bval", 5, FT_NONE, next_dummymsn(), xids_123, false, cmp);
toku_bnc_insert_msg(BNC(&sn, 1), "x", 2, "xval", 5, FT_NONE, next_dummymsn(), xids_234, true, cmp);
toku_bnc_insert_msg(BNC(&sn, 0),
"a",
2,
"aval",
5,
FT_NONE,
next_dummymsn(),
xids_0,
true,
cmp);
toku_bnc_insert_msg(BNC(&sn, 0),
"b",
2,
"bval",
5,
FT_NONE,
next_dummymsn(),
xids_123,
false,
cmp);
toku_bnc_insert_msg(BNC(&sn, 1),
"x",
2,
"xval",
5,
FT_NONE,
next_dummymsn(),
xids_234,
true,
cmp);
//Cleanup:
// Cleanup:
toku_xids_destroy(&xids_0);
toku_xids_destroy(&xids_123);
toku_xids_destroy(&xids_234);
@ -297,35 +312,41 @@ test_serialize_nonleaf(void) {
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(string_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
test1(fd, ft_h, &dn);
test2(fd, ft_h, &dn);
@ -333,22 +354,26 @@ test_serialize_nonleaf(void) {
toku_destroy_ftnode_internals(&sn);
toku_free(ndd);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
toku_free(ft_h->h);
ft_h->cmp.destroy();
toku_free(ft_h);
toku_free(ft);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
static void
test_serialize_leaf(void) {
static void test_serialize_leaf(void) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -364,8 +389,8 @@ test_serialize_leaf(void) {
MALLOC_N(sn.n_children, sn.bp);
DBT pivotkey;
sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "b", 2), 1);
BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL;
BP_STATE(&sn, 0) = PT_AVAIL;
BP_STATE(&sn, 1) = PT_AVAIL;
set_BLB(&sn, 0, toku_create_empty_bn());
set_BLB(&sn, 1, toku_create_empty_bn());
le_malloc(BLB_DATA(&sn, 0), 0, "a", "aval");
@ -378,51 +403,59 @@ test_serialize_leaf(void) {
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
test1(fd, ft_h, &dn);
test3_leaf(fd, ft_h,&dn);
test3_leaf(fd, ft_h, &dn);
toku_destroy_ftnode_internals(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
toku_free(ft_h->h);
toku_free(ft_h);
toku_free(ft);
toku_free(ndd);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
initialize_dummymsn();
test_serialize_nonleaf();
test_serialize_leaf();

View file

@ -41,27 +41,21 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include <sys/time.h>
#include "test.h"
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
const double USECS_PER_SEC = 1000000.0;
static void
le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int vallen)
{
static void le_add_to_bn(bn_data *bn,
uint32_t idx,
char *key,
int keylen,
char *val,
int vallen) {
LEAFENTRY r = NULL;
uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
void *maybe_free = nullptr;
bn->get_space_for_insert(
idx,
key,
keylen,
size_needed,
&r,
&maybe_free
);
bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free);
if (maybe_free) {
toku_free(maybe_free);
}
@ -71,20 +65,24 @@ le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int va
memcpy(r->u.clean.val, val, vallen);
}
static int
long_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
{
static int long_key_cmp(DB *UU(e), const DBT *a, const DBT *b) {
const long *CAST_FROM_VOIDP(x, a->data);
const long *CAST_FROM_VOIDP(y, b->data);
return (*x > *y) - (*x < *y);
}
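The comparator above uses the (x > y) - (x < y) idiom: unlike returning x - y, it always yields exactly -1, 0, or +1 and cannot overflow for extreme long values. A minimal standalone illustration (three_way is our name for the sketch, not part of the tree):

    #include <cassert>

    // Branchless three-way compare: (x > y) - (x < y) is -1, 0, or +1.
    static int three_way(long x, long y) {
        return (x > y) - (x < y);
    }

    int main(void) {
        assert(three_way(1L, 2L) == -1);
        assert(three_way(2L, 2L) == 0);
        assert(three_way(3L, 2L) == 1);
        return 0;
    }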
static void
test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
static void test_serialize_leaf(int valsize,
int nelts,
double entropy,
int ser_runs,
int deser_runs) {
// struct ft_handle source_ft;
struct ftnode *sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -102,7 +100,7 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
MALLOC_N(sn->n_children, sn->bp);
sn->pivotkeys.create_empty();
for (int i = 0; i < sn->n_children; ++i) {
BP_STATE(sn,i) = PT_AVAIL;
BP_STATE(sn, i) = PT_AVAIL;
set_BLB(sn, i, toku_create_empty_bn());
}
int nperbn = nelts / sn->n_children;
@ -112,24 +110,19 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
k = ck * nperbn + i;
char buf[valsize];
int c;
for (c = 0; c < valsize * entropy; ) {
int *p = (int *) &buf[c];
for (c = 0; c < valsize * entropy;) {
int *p = (int *)&buf[c];
*p = rand();
c += sizeof(*p);
}
memset(&buf[c], 0, valsize - c);
le_add_to_bn(
BLB_DATA(sn,ck),
i,
(char *)&k,
sizeof k,
buf,
sizeof buf
);
BLB_DATA(sn, ck), i, (char *)&k, sizeof k, buf, sizeof buf);
}
if (ck < 7) {
DBT pivotkey;
sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)), ck);
sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)),
ck);
}
}
@ -139,31 +132,36 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(long_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
struct timeval total_start;
@ -176,8 +174,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
gettimeofday(&t[0], NULL);
ndd = NULL;
sn->dirty = 1;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), sn, &ndd, true, ft->ft, false);
invariant(r == 0);
gettimeofday(&t[1], NULL);
total_start.tv_sec += t[0].tv_sec;
total_start.tv_usec += t[0].tv_usec;
@ -186,12 +185,14 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
toku_free(ndd);
}
double dt;
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt = (total_end.tv_sec - total_start.tv_sec) +
((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= ser_runs;
printf("serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs);
printf(
"serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs);
//reset
// reset
total_start.tv_sec = total_start.tv_usec = 0;
total_end.tv_sec = total_end.tv_usec = 0;
@ -200,8 +201,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
bfe.create_for_full_read(ft_h);
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd2 = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
assert(r==0);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe);
invariant(r == 0);
gettimeofday(&t[1], NULL);
total_start.tv_sec += t[0].tv_sec;
@ -212,35 +214,46 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
toku_ftnode_free(&dn);
toku_free(ndd2);
}
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt = (total_end.tv_sec - total_start.tv_sec) +
((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= deser_runs;
printf("deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (average of %d runs)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
printf(
"deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
printf(
"io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf "
"(average of %d runs)\n",
tokutime_to_seconds(bfe.io_time) * 1000,
tokutime_to_seconds(bfe.decompress_time) * 1000,
tokutime_to_seconds(bfe.deserialize_time) * 1000,
deser_runs);
toku_ftnode_free(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
ft_h->cmp.destroy();
toku_free(ft_h->h);
toku_free(ft_h);
toku_free(ft);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
static void
test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
static void test_serialize_nonleaf(int valsize,
int nelts,
double entropy,
int ser_runs,
int deser_runs) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -257,11 +270,11 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
MALLOC_N(sn.n_children, sn.bp);
sn.pivotkeys.create_empty();
for (int i = 0; i < sn.n_children; ++i) {
BP_BLOCKNUM(&sn, i).b = 30 + (i*5);
BP_STATE(&sn,i) = PT_AVAIL;
BP_BLOCKNUM(&sn, i).b = 30 + (i * 5);
BP_STATE(&sn, i) = PT_AVAIL;
set_BNC(&sn, i, toku_create_empty_nl());
}
//Create XIDS
// Create XIDS
XIDS xids_0 = toku_xids_get_root_xids();
XIDS xids_123;
r = toku_xids_create_child(xids_0, &xids_123, (TXNID)123);
@ -276,14 +289,23 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
k = ck * nperchild + i;
char buf[valsize];
int c;
for (c = 0; c < valsize * entropy; ) {
int *p = (int *) &buf[c];
for (c = 0; c < valsize * entropy;) {
int *p = (int *)&buf[c];
*p = rand();
c += sizeof(*p);
}
memset(&buf[c], 0, valsize - c);
toku_bnc_insert_msg(bnc, &k, sizeof k, buf, valsize, FT_NONE, next_dummymsn(), xids_123, true, cmp);
toku_bnc_insert_msg(bnc,
&k,
sizeof k,
buf,
valsize,
FT_NONE,
next_dummymsn(),
xids_123,
true,
cmp);
}
if (ck < 7) {
DBT pivotkey;
@ -291,7 +313,7 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
}
}
//Cleanup:
// Cleanup:
toku_xids_destroy(&xids_0);
toku_xids_destroy(&xids_123);
cmp.destroy();
@ -302,65 +324,78 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(long_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
struct timeval t[2];
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
gettimeofday(&t[1], NULL);
double dt;
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt = (t[1].tv_sec - t[0].tv_sec) +
((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt *= 1000;
printf("serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
printf(
"serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(ft_h);
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd2 = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
assert(r==0);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe);
invariant(r == 0);
gettimeofday(&t[1], NULL);
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt = (t[1].tv_sec - t[0].tv_sec) +
((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt *= 1000;
printf("deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (IGNORED RUNS=%d)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
printf(
"deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
printf(
"io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf "
"(IGNORED RUNS=%d)\n",
tokutime_to_seconds(bfe.io_time) * 1000,
tokutime_to_seconds(bfe.decompress_time) * 1000,
tokutime_to_seconds(bfe.deserialize_time) * 1000,
deser_runs);
toku_ftnode_free(&dn);
toku_destroy_ftnode_internals(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
toku_free(ft_h->h);
ft_h->cmp.destroy();
@ -369,17 +404,21 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
toku_free(ndd);
toku_free(ndd2);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
const int DEFAULT_RUNS = 5;
long valsize, nelts, ser_runs = DEFAULT_RUNS, deser_runs = DEFAULT_RUNS;
double entropy = 0.3;
if (argc != 3 && argc != 5) {
fprintf(stderr, "Usage: %s <valsize> <nelts> [<serialize_runs> <deserialize_runs>]\n", argv[0]);
fprintf(stderr,
"Usage: %s <valsize> <nelts> [<serialize_runs> "
"<deserialize_runs>]\n",
argv[0]);
fprintf(stderr, "Default (and min) runs is %d\n", DEFAULT_RUNS);
return 2;
}
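The elapsed-time arithmetic used throughout this benchmark reduces to a timeval delta converted to milliseconds and averaged over the run count. A self-contained sketch of just that computation (elapsed_ms_per_run is an illustrative helper, not part of the tree):

    #include <cstdio>
    #include <sys/time.h>

    // Convert a start/end timeval pair into average milliseconds per run,
    // mirroring the dt computations in the benchmark above.
    static double elapsed_ms_per_run(struct timeval start, struct timeval end,
                                     int runs) {
        double dt = (end.tv_sec - start.tv_sec) +
                    ((end.tv_usec - start.tv_usec) / 1000000.0);
        return dt * 1000.0 / runs;   // seconds -> ms, averaged over runs
    }

    int main(void) {
        struct timeval t0, t1;
        gettimeofday(&t0, NULL);
        // ... the work being timed would run here ...
        gettimeofday(&t1, NULL);
        printf("%0.05lf ms (1 run)\n", elapsed_ms_per_run(t0, t1, 1));
        return 0;
    }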

File diff suppressed because it is too large

View file

@ -164,17 +164,16 @@ static void test_read_what_was_written (void) {
int r;
const int NVALS=10000;
if (verbose) printf("test_read_what_was_written(): "); fflush(stdout);
if (verbose) {
printf("test_read_what_was_written(): "); fflush(stdout);
}
unlink(fname);
toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
r = toku_open_ft_handle(fname, 1, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); assert(r==0);
r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0);
toku_cachetable_close(&ct);
toku_cachetable_close(&ct);
/* Now see if we can read an empty tree in. */
toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
@ -189,8 +188,6 @@ static void test_read_what_was_written (void) {
r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0);
toku_cachetable_close(&ct);
/* Now see if we can read it in and get the value. */
toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
r = toku_open_ft_handle(fname, 0, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); assert(r==0);

View file

@ -109,7 +109,9 @@ static int run_test(void)
r = pqueue_pop(pq, &node); assert(r==0);
if (verbose) printf("%d : %d\n", i, *(int*)(node->key->data));
if ( *(int*)(node->key->data) != i ) {
if (verbose) printf("FAIL\n"); return -1;
if (verbose)
printf("FAIL\n");
return -1;
}
}
pqueue_free(pq);
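The loop above asserts that popping the pqueue yields keys in ascending order, i.e. min-heap behavior. The same shape of check written against the standard library, purely as an analogy and not the ft pqueue API itself:

    #include <cassert>
    #include <functional>
    #include <queue>
    #include <vector>

    int main(void) {
        // A min-heap must pop keys in ascending order -- the same
        // property the pqueue test checks.
        std::priority_queue<int, std::vector<int>, std::greater<int> > pq;
        int vals[] = {5, 1, 4, 2, 3, 0};
        for (int v : vals)
            pq.push(v);
        for (int i = 0; i < 6; i++) {
            assert(pq.top() == i);
            pq.pop();
        }
        return 0;
    }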

View file

@ -793,7 +793,7 @@ static void test_le_garbage_collection_birdie(void) {
do_garbage_collect = ule_worth_running_garbage_collection(&ule, 200);
invariant(do_garbage_collect);
// It is definately worth doing when the above case is true
// It is definitely worth doing when the above case is true
// and there is more than one provisional entry.
ule.num_cuxrs = 1;
ule.num_puxrs = 2;

View file

@ -72,7 +72,7 @@ static void dummy_update_status(FTNODE UU(child), int UU(dirtied), void* UU(extr
enum { NODESIZE = 1024, KSIZE=NODESIZE-100, TOKU_PSIZE=20 };
static void test_oldest_referenced_xid_gets_propogated(void) {
static void test_oldest_referenced_xid_gets_propagated(void) {
int r;
CACHETABLE ct;
FT_HANDLE t;
@ -166,7 +166,7 @@ static void test_oldest_referenced_xid_gets_propogated(void) {
toku_ft_flush_some_child(t->ft, node, &fa);
// pin the child, verify that oldest referenced xid was
// propogated from parent to child during the flush
// propagated from parent to child during the flush
toku_pin_ftnode(
t->ft,
child_nonleaf_blocknum,
@ -185,6 +185,6 @@ static void test_oldest_referenced_xid_gets_propogated(void) {
int test_main(int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
default_parse_args(argc, argv);
test_oldest_referenced_xid_gets_propogated();
test_oldest_referenced_xid_gets_propagated();
return 0;
}
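To make the invariant under test concrete, here is a deliberately simplified sketch; node_stub and flush_some_child_stub are invented names, not PerconaFT structures. The point is only that a flush must carry the parent's oldest referenced xid down to the child, so the child never garbage-collects entries still visible to live readers:

    #include <algorithm>
    #include <cstdint>

    // Invented stand-in: a node tracks the oldest xid for which it may
    // still need to keep entries.
    struct node_stub {
        uint64_t oldest_referenced_xid_known;
    };

    // A flush refreshes the child's value from the parent (xids only grow
    // over time, so "newer" means a larger lower bound for GC).
    static void flush_some_child_stub(const node_stub &parent,
                                      node_stub &child) {
        child.oldest_referenced_xid_known =
            std::max(child.oldest_referenced_xid_known,
                     parent.oldest_referenced_xid_known);
    }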

View file

@ -36,30 +36,62 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "ft/serialize/rbtree_mhs.h"
#include "test.h"
#include <algorithm>
#include <vector>
#include <ctime>
#include <cstdlib>
#include <db.h>
static void test_insert_remove(void) {
uint64_t i;
MhsRbTree::Tree *tree = new MhsRbTree::Tree();
verbose = 0;
#include "ft/serialize/block_allocator.h"
tree->Insert({0, 100});
// Block allocation strategy implementations
for (i = 0; i < 10; i++) {
tree->Remove(3);
tree->Remove(2);
}
tree->ValidateBalance();
tree->ValidateMhs();
class block_allocator_strategy {
public:
static struct block_allocator::blockpair *
first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
for (i = 0; i < 10; i++) {
tree->Insert({5 * i, 3});
}
tree->ValidateBalance();
tree->ValidateMhs();
static struct block_allocator::blockpair *
best_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
uint64_t offset = tree->Remove(2);
invariant(offset == 0);
offset = tree->Remove(10);
invariant(offset == 50);
offset = tree->Remove(3);
invariant(offset == 5);
tree->ValidateBalance();
tree->ValidateMhs();
static struct block_allocator::blockpair *
padded_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
tree->Insert({48, 2});
tree->Insert({50, 10});
static struct block_allocator::blockpair *
heat_zone(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t heat);
};
tree->ValidateBalance();
tree->ValidateMhs();
tree->Insert({3, 7});
offset = tree->Remove(10);
invariant(offset == 2);
tree->ValidateBalance();
tree->ValidateMhs();
tree->Dump();
delete tree;
}
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_insert_remove();
if (verbose)
printf("test ok\n");
return 0;
}
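The removed block_allocator_strategy class declared first_fit, best_fit, padded_fit and heat_zone selectors over a sorted blockpair array. As a rough, self-contained illustration of what a first-fit selector does (the blockpair layout and semantics here are assumptions made for the sketch, not PerconaFT's actual types):

    #include <cstddef>
    #include <cstdint>

    // Assumed layout for the sketch: one allocated extent on disk.
    struct blockpair {
        uint64_t offset;
        uint64_t size;
    };

    static uint64_t align_up(uint64_t v, uint64_t alignment) {
        return ((v + alignment - 1) / alignment) * alignment;
    }

    // First fit: walk extents sorted by offset and return the first aligned
    // hole large enough for `size`, or an aligned offset past the last extent.
    static uint64_t first_fit_offset(const blockpair *blocks, size_t n_blocks,
                                     uint64_t size, uint64_t alignment) {
        uint64_t candidate = 0;
        for (size_t i = 0; i < n_blocks; i++) {
            uint64_t aligned = align_up(candidate, alignment);
            if (aligned + size <= blocks[i].offset)
                return aligned;                // fits in the hole before block i
            candidate = blocks[i].offset + blocks[i].size;
        }
        return align_up(candidate, alignment); // append after the last block
    }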

View file

@ -0,0 +1,102 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "ft/serialize/rbtree_mhs.h"
#include "test.h"
#include <algorithm>
#include <vector>
#include <ctime>
#include <cstdlib>
#define N 1000000
std::vector<MhsRbTree::Node::BlockPair> input_vector;
MhsRbTree::Node::BlockPair old_vector[N];
static int myrandom(int i) { return std::rand() % i; }
static void generate_random_input() {
std::srand(unsigned(std::time(0)));
// set some values:
for (uint64_t i = 1; i < N; ++i) {
input_vector.push_back({i, 0});
old_vector[i] = {i, 0};
}
// using built-in random generator:
std::random_shuffle(input_vector.begin(), input_vector.end(), myrandom);
}
static void test_insert_remove(void) {
int i;
MhsRbTree::Tree *tree = new MhsRbTree::Tree();
verbose = 0;
generate_random_input();
if (verbose) {
printf("\n we are going to insert the following block offsets\n");
for (i = 0; i < N; i++)
printf("%" PRIu64 "\t", input_vector[i]._offset.ToInt());
}
for (i = 0; i < N; i++) {
tree->Insert(input_vector[i]);
// tree->ValidateBalance();
}
tree->ValidateBalance();
MhsRbTree::Node::BlockPair *p_bps = &old_vector[0];
tree->ValidateInOrder(p_bps);
printf("min node of the tree:%" PRIu64 "\n",
rbn_offset(tree->MinNode()).ToInt());
printf("max node of the tree:%" PRIu64 "\n",
rbn_offset(tree->MaxNode()).ToInt());
for (i = 0; i < N; i++) {
// tree->ValidateBalance();
tree->RawRemove(input_vector[i]._offset.ToInt());
}
tree->Destroy();
delete tree;
}
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_insert_remove();
if (verbose)
printf("test ok\n");
return 0;
}
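One portability note on the test above: std::random_shuffle with a custom generator argument is deprecated in C++14 and removed in C++17. If this test is ever built under a newer standard, std::shuffle with an explicit engine is the usual replacement; a minimal sketch:

    #include <algorithm>
    #include <random>
    #include <vector>

    int main(void) {
        std::vector<int> v;
        for (int i = 1; i <= 5; i++)
            v.push_back(i);
        // Replacement for std::random_shuffle(v.begin(), v.end(), myrandom):
        std::mt19937 rng(std::random_device{}());
        std::shuffle(v.begin(), v.end(), rng);
        return 0;
    }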

View file

@ -49,7 +49,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
// functionality provided by roll.c is exposed by an autogenerated
// header file, logheader.h
//
// this (poorly) explains the absense of "roll.h"
// this (poorly) explains the absence of "roll.h"
// these flags control whether or not we send commit messages for
// various operations

View file

@ -169,7 +169,7 @@ int toku_rollback_commit(TOKUTXN txn, LSN lsn) {
txn->roll_info.spilled_rollback_head = ROLLBACK_NONE;
txn->roll_info.spilled_rollback_tail = ROLLBACK_NONE;
}
// if we're commiting a child rollback, put its entries into the parent
// if we're committing a child rollback, put its entries into the parent
// by pinning both child and parent and then linking the child log entry
// list to the end of the parent log entry list.
if (txn_has_current_rollback_log(txn)) {

View file

@ -59,21 +59,18 @@ rollback_log_destroy(ROLLBACK_LOG_NODE log) {
// flush an unused log to disk, by allocating a size 0 blocknum in
// the blocktable
static void
toku_rollback_flush_unused_log(
ROLLBACK_LOG_NODE log,
BLOCKNUM logname,
int fd,
FT ft,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone
)
{
static void toku_rollback_flush_unused_log(ROLLBACK_LOG_NODE log,
BLOCKNUM logname,
int fd,
FT ft,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone) {
if (write_me) {
DISKOFF offset;
ft->blocktable.realloc_on_disk(logname, 0, &offset, ft, fd, for_checkpoint, INT_MAX);
ft->blocktable.realloc_on_disk(
logname, 0, &offset, ft, fd, for_checkpoint);
}
if (!keep_me && !is_clone) {
toku_free(log);

View file

@ -587,8 +587,8 @@ bool toku_le_worth_running_garbage_collection(
// by new txns.
// 2.) There is only one committed entry, but the outermost
// provisional entry is older than the oldest known referenced
// xid, so it must have commited. Therefor we can promote it to
// committed and get rid of the old commited entry.
// xid, so it must have committed. Therefore we can promote it to
// committed and get rid of the old committed entry.
if (le->type != LE_MVCC) {
return false;
}
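Condition 2 is the one quoted in full above; as a purely illustrative paraphrase (the function and parameter names are invented, not PerconaFT's):

    #include <cstdint>

    // Illustrative only: if the outermost provisional xid is older than the
    // oldest xid any live reader can still reference, that provisional entry
    // must in fact have committed, so it can be promoted and the stale
    // committed entry discarded.
    static bool provisional_entry_must_have_committed(
        uint64_t outermost_provisional_xid, uint64_t oldest_referenced_xid) {
        return outermost_provisional_xid < oldest_referenced_xid;
    }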

Some files were not shown because too many files have changed in this diff