Merge branch '10.0' of github.com:MariaDB/server into 10.0

Author: Sergei Petrunia
Date:   2016-09-28 16:19:58 +03:00
Commit: 23af6f5942

162 changed files with 4967 additions and 4289 deletions

View file

@ -10,6 +10,7 @@ Visma http://visma.com (2015 - 2016)
Acronis http://acronis.com (2016)
Nexedi https://www.nexedi.com (2016)
Automattic https://automattic.com (2014 - 2016)
Tencent Game DBA http://tencentdba.com/about (2016)
Verkkokauppa.com https://www.verkkokauppa.com (2015 - 2016)
Virtuozzo https://virtuozzo.com (2016)

View file

@ -1,3 +1,3 @@
MYSQL_VERSION_MAJOR=10
MYSQL_VERSION_MINOR=0
MYSQL_VERSION_PATCH=27
MYSQL_VERSION_PATCH=28

View file

@ -220,6 +220,9 @@ SETA(CPACK_RPM_test_PACKAGE_PROVIDES
"perl(mtr_io.pl)"
"perl(mtr_match)"
"perl(mtr_misc.pl)"
"perl(mtr_gcov.pl)"
"perl(mtr_gprof.pl)"
"perl(mtr_process.pl)"
"perl(mtr_report)"
"perl(mtr_results)"
"perl(mtr_unique)")

View file

@ -882,8 +882,7 @@ typedef long long my_ptrdiff_t;
and related routines are refactored.
*/
#define my_offsetof(TYPE, MEMBER) \
((size_t)((char *)&(((TYPE *)0x10)->MEMBER) - (char*)0x10))
#define my_offsetof(TYPE, MEMBER) PTR_BYTE_DIFF(&((TYPE *)0x10)->MEMBER, 0x10)
#define NullS (char *) 0
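
A minimal stand-alone sketch of the pointer trick both the old and the new
macro rely on (demo names, not part of the patch): pretend an object sits at
a fake non-null address, take the member's address, and subtract the base.
Using 0x10 rather than 0 avoids undefined null-pointer arithmetic, and the
PTR_BYTE_DIFF form computes the same byte difference.

#include <stddef.h>

/* Hypothetical demo macro mirroring my_offsetof() */
#define demo_offsetof(TYPE, MEMBER) \
  ((size_t)((char *)&(((TYPE *)0x10)->MEMBER) - (char *)0x10))

struct example { int a; double b; };
/* demo_offsetof(struct example, b) == offsetof(struct example, b) */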

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
Copyright (c) 2010, 2013, Monty Program Ab.
Copyright (c) 2010, 2016, Monty Program Ab.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -271,7 +271,7 @@ extern my_bool my_use_symdir;
extern ulong my_default_record_cache_size;
extern my_bool my_disable_locking, my_disable_async_io,
my_disable_flush_key_blocks, my_disable_symlinks;
extern my_bool my_disable_sync;
extern my_bool my_disable_sync, my_disable_copystat_in_redel;
extern char wild_many,wild_one,wild_prefix;
extern const char *charsets_dir;
extern my_bool timed_mutexes;

View file

@ -52,7 +52,7 @@ eval SELECT 'hello' INTO OUTFILE 'fake_file.$prefix';
# Use '/' instead of '\' in the error message. On windows platform, dir is
# formed with '\'.
--replace_regex /\\testing_1\\*/\/testing_1\// /66/39/ /17/39/ /File exists/Directory not empty/
--replace_regex /\\testing_1\\*/\/testing_1\// /66/39/ /17/39/ /247/39/ /File exists/Directory not empty/
--error 1010
DROP DATABASE testing_1;
let $wait_binlog_event= DROP TABLE IF EXIST;

View file

@ -341,6 +341,7 @@ while ($1)
alter table t1 add index i2(key2);
alter table t1 add index i3(key3);
update t1 set key2=key1,key3=key1;
analyze table t1;
# to test the bug, the following must use "sort_union":
--replace_column 9 REF

View file

@ -261,11 +261,7 @@ sub show {
# On Windows, rely on cdb to be there...
if (IS_WINDOWS)
{
# Starting cdb is unsafe when used with --parallel > 1 option
if ( $parallel < 2 )
{
_cdb($core_name);
}
_cdb($core_name);
return;
}

View file

@ -60,8 +60,6 @@ use My::Test;
use My::Find;
use My::Suite;
require "mtr_misc.pl";
# locate plugin suites, depending on whether it's a build tree or installed
my @plugin_suitedirs;
my $plugin_suitedir_regex;
@ -1122,7 +1120,7 @@ sub get_tags_from_file($$) {
$file_to_tags{$file}= $tags;
$file_to_master_opts{$file}= $master_opts;
$file_to_slave_opts{$file}= $slave_opts;
$file_combinations{$file}= [ uniq(@combinations) ];
$file_combinations{$file}= [ ::uniq(@combinations) ];
$file_in_overlay{$file} = 1 if $in_overlay;
return @{$tags};
}

View file

@ -34,7 +34,6 @@ use mtr_match;
use My::Platform;
use POSIX qw[ _exit ];
use IO::Handle qw[ flush ];
require "mtr_io.pl";
use mtr_results;
my $tot_real_time= 0;
@ -92,7 +91,7 @@ sub mtr_report_test_passed ($) {
my $timer_str= "";
if ( $timer and -f "$::opt_vardir/log/timer" )
{
$timer_str= mtr_fromfile("$::opt_vardir/log/timer");
$timer_str= ::mtr_fromfile("$::opt_vardir/log/timer");
$tinfo->{timer}= $timer_str;
resfile_test_info('duration', $timer_str) if $::opt_resfile;
}

View file

@ -102,11 +102,11 @@ use mtr_results;
use IO::Socket::INET;
use IO::Select;
require "lib/mtr_process.pl";
require "lib/mtr_io.pl";
require "lib/mtr_gcov.pl";
require "lib/mtr_gprof.pl";
require "lib/mtr_misc.pl";
require "mtr_process.pl";
require "mtr_io.pl";
require "mtr_gcov.pl";
require "mtr_gprof.pl";
require "mtr_misc.pl";
$SIG{INT}= sub { mtr_error("Got ^C signal"); };
$SIG{HUP}= sub { mtr_error("Hangup detected on controlling terminal"); };

View file

@ -9,6 +9,7 @@ Acronis http://www.acronis.com Silver Sponsor of the MariaDB Foundation
Auttomattic https://automattic.com Bronze Sponsor of the MariaDB Foundation
Verkkokauppa.com https://virtuozzo.com Bronze Sponsor of the MariaDB Foundation
Virtuozzo https://virtuozzo.com/ Bronze Sponsor of the MariaDB Foundation
Tencent Game DBA http://tencentdba.com/about/ Bronze Sponsor of the MariaDB Foundation
Google USA Sponsoring encryption, parallel replication and GTID
Facebook USA Sponsoring non-blocking API, LIMIT ROWS EXAMINED etc
Ronald Bradford Brisbane, Australia EFF contribution for UC2006 Auction

View file

@ -1658,6 +1658,9 @@ CHAR_LENGTH(TRIM(BOTH 0x61 FROM _utf32 0x00000061))
SELECT CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061));
CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061))
1
select hex(lower(cast(0xffff0000 as char character set utf32))) as c;
c
FFFF0000
#
# End of 5.5 tests
#

View file

@ -286,3 +286,19 @@ F 28 28
F 29 29
F 30 30
DROP TABLE t0,t1,t2;
#
# MDEV-MariaDB daemon leaks memory with specific query
#
CREATE TABLE t1 (`voter_id` int(11) unsigned NOT NULL,
`language_id` int(11) unsigned NOT NULL DEFAULT '1'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE t2 (`voter_id` int(10) unsigned NOT NULL DEFAULT '0',
`serialized_c` mediumblob) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into t2 values (1,repeat("a",1000)),(2,repeat("a",1000)),(3,repeat("b",1000)),(4,repeat("c",1000)),(4,repeat("b",1000));
SELECT GROUP_CONCAT(t1.language_id SEPARATOR ',') AS `translation_resources`, `d`.`serialized_c` FROM t2 AS `d` LEFT JOIN t1 ON `d`.`voter_id` = t1.`voter_id` GROUP BY `d`.`voter_id` ORDER BY 10-d.voter_id+RAND()*0;
translation_resources serialized_c
NULL cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
NULL bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
NULL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
NULL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
drop table t1,t2;

View file

@ -311,6 +311,9 @@ set @d=@d*2;
alter table t1 add index i2(key2);
alter table t1 add index i3(key3);
update t1 set key2=key1,key3=key1;
analyze table t1;
Table Op Msg_type Msg_text
test.t1 analyze status OK
explain select * from t1 where (key3 > 30 and key3<35) or (key2 >32 and key2 < 40);
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 index_merge i2,i3 i3,i2 4,4 NULL REF Using sort_union(i3,i2); Using where

View file

@ -1146,6 +1146,9 @@ set @d=@d*2;
alter table t1 add index i2(key2);
alter table t1 add index i3(key3);
update t1 set key2=key1,key3=key1;
analyze table t1;
Table Op Msg_type Msg_text
test.t1 analyze status OK
explain select * from t1 where (key3 > 30 and key3<35) or (key2 >32 and key2 < 40);
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 index_merge i2,i3 i3,i2 4,4 NULL REF Using sort_union(i3,i2); Using where

View file

@ -3832,6 +3832,23 @@ test.m1 repair error Corrupt
# Clean-up.
drop tables m1, t1, t4;
drop view t3;
#
# MDEV-10424 - Assertion `ticket == __null' failed in
# MDL_request::set_type
#
CREATE TABLE t1 (f1 INT) ENGINE=MyISAM;
CREATE TABLE tmerge (f1 INT) ENGINE=MERGE UNION=(t1);
PREPARE stmt FROM "ANALYZE TABLE tmerge, t1";
EXECUTE stmt;
Table Op Msg_type Msg_text
test.tmerge analyze note The storage engine for the table doesn't support analyze
test.t1 analyze status Table is already up to date
EXECUTE stmt;
Table Op Msg_type Msg_text
test.tmerge analyze note The storage engine for the table doesn't support analyze
test.t1 analyze status Table is already up to date
DEALLOCATE PREPARE stmt;
DROP TABLE t1, tmerge;
End of 5.5 tests
#
# Additional coverage for refactoring which is made as part

View file

@ -4076,4 +4076,35 @@ id value
deallocate prepare stmt;
SET SESSION sql_mode = @save_sql_mode;
DROP TABLE t1,t2;
# End of 10.0 tests
#
# MDEV-8833: Crash of server on prepared statement with
# conversion to semi-join
#
CREATE TABLE t1 (column1 INT);
INSERT INTO t1 VALUES (3),(9);
CREATE TABLE t2 (column2 INT);
INSERT INTO t2 VALUES (1),(4);
CREATE TABLE t3 (column3 INT);
INSERT INTO t3 VALUES (6),(8);
CREATE TABLE t4 (column4 INT);
INSERT INTO t4 VALUES (2),(5);
PREPARE stmt FROM "SELECT ( SELECT MAX( table1.column1 ) AS field1
FROM t1 AS table1
WHERE table3.column3 IN ( SELECT table2.column2 AS field2 FROM t2 AS table2 )
) AS sq
FROM t3 AS table3, t4 AS table4";
EXECUTE stmt;
sq
NULL
NULL
NULL
NULL
EXECUTE stmt;
sq
NULL
NULL
NULL
NULL
deallocate prepare stmt;
drop table t1,t2,t3,t4;
# End of 5.5 tests

View file

@ -14,6 +14,25 @@ this
0
4294967295
drop table t1;
create table t1 (a bigint unsigned, b mediumint unsigned);
insert t1 values (1,2),(0xffffffffffffffff,0xffffff);
select coalesce(a,b), coalesce(b,a) from t1;
coalesce(a,b) coalesce(b,a)
1 2
18446744073709551615 16777215
create table t2 as select a from t1 union select b from t1;
show create table t2;
Table Create Table
t2 CREATE TABLE `t2` (
`a` bigint(20) unsigned DEFAULT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
select * from t2;
a
1
18446744073709551615
2
16777215
drop table t1, t2;
#
# Start of 10.0 tests
#

View file

@ -6,7 +6,8 @@ table_54044 CREATE TEMPORARY TABLE `table_54044` (
`IF(NULL IS NOT NULL, NULL, NULL)` binary(0) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1
DROP TABLE table_54044;
CREATE TABLE tmp ENGINE = INNODB AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
CREATE TABLE tmp ENGINE = INNODB
AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
SHOW CREATE TABLE tmp;
Table Create Table
tmp CREATE TABLE `tmp` (

View file

@ -0,0 +1,8 @@
alter table mysql.time_zone_name engine=InnoDB;
create table envois3 (starttime datetime) engine=InnoDB;
insert envois3 values ('2008-08-11 22:43:00');
select convert_tz(starttime,'UTC','Europe/Moscow') starttime from envois3;
starttime
2008-08-12 02:43:00
drop table envois3;
alter table mysql.time_zone_name engine=MyISAM;

View file

@ -10,7 +10,10 @@ CREATE TEMPORARY TABLE table_54044 ENGINE = INNODB
SHOW CREATE TABLE table_54044;
DROP TABLE table_54044;
CREATE TABLE tmp ENGINE = INNODB AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
# This 'create table' should pass since it uses a Field_string of size 0.
CREATE TABLE tmp ENGINE = INNODB
AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL;
SHOW CREATE TABLE tmp;
DROP TABLE tmp;
@ -23,4 +26,3 @@ FLUSH TABLES;
--error 1005
CREATE TEMPORARY TABLE tmp ENGINE=InnoDB AS SELECT VALUES(a) FROM t1;
DROP TABLE t1;

View file

@ -0,0 +1,12 @@
--source include/have_innodb.inc
#
# MDEV-10775 System table in InnoDB format allowed in MariaDB could lead to crash
#
alter table mysql.time_zone_name engine=InnoDB;
create table envois3 (starttime datetime) engine=InnoDB;
insert envois3 values ('2008-08-11 22:43:00');
--source include/restart_mysqld.inc
select convert_tz(starttime,'UTC','Europe/Moscow') starttime from envois3;
drop table envois3;
alter table mysql.time_zone_name engine=MyISAM;

View file

@ -1,121 +0,0 @@
"General cleanup"
set @aria_checkpoint_interval_save= @@global.aria_checkpoint_interval;
set @@global.aria_checkpoint_interval= 0;
drop table if exists t1;
update performance_schema.setup_instruments set enabled = 'NO';
update performance_schema.setup_consumers set enabled = 'NO';
truncate table performance_schema.file_summary_by_event_name;
truncate table performance_schema.file_summary_by_instance;
truncate table performance_schema.socket_summary_by_event_name;
truncate table performance_schema.socket_summary_by_instance;
truncate table performance_schema.events_waits_summary_global_by_event_name;
truncate table performance_schema.events_waits_summary_by_instance;
truncate table performance_schema.events_waits_summary_by_thread_by_event_name;
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
create table t1 (
id INT PRIMARY KEY,
b CHAR(100) DEFAULT 'initial value')
ENGINE=MyISAM;
insert into t1 (id) values (1), (2), (3), (4), (5), (6), (7), (8);
update performance_schema.setup_instruments SET enabled = 'NO';
update performance_schema.setup_consumers set enabled = 'NO';
set @dump_all=FALSE;
"Verifying file aggregate consistency"
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
EVENT_NAME COUNT_READ SUM(i.COUNT_READ)
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
EVENT_NAME COUNT_WRITE SUM(i.COUNT_WRITE)
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
EVENT_NAME COUNT_READ SUM(i.COUNT_READ)
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
EVENT_NAME COUNT_WRITE SUM(i.COUNT_WRITE)
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_READ, SUM(i.SUM_NUMBER_OF_BYTES_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_READ <> SUM(i.SUM_NUMBER_OF_BYTES_READ))
OR @dump_all;
EVENT_NAME SUM_NUMBER_OF_BYTES_READ SUM(i.SUM_NUMBER_OF_BYTES_READ)
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_WRITE, SUM(i.SUM_NUMBER_OF_BYTES_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_WRITE <> SUM(i.SUM_NUMBER_OF_BYTES_WRITE))
OR @dump_all;
EVENT_NAME SUM_NUMBER_OF_BYTES_WRITE SUM(i.SUM_NUMBER_OF_BYTES_WRITE)
"Verifying waits aggregate consistency (instance)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(i.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(i.SUM_TIMER_WAIT))
OR @dump_all;
EVENT_NAME SUM_TIMER_WAIT SUM(i.SUM_TIMER_WAIT)
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(i.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(i.MIN_TIMER_WAIT))
AND (MIN(i.MIN_TIMER_WAIT) != 0)
OR @dump_all;
EVENT_NAME MIN_TIMER_WAIT MIN(i.MIN_TIMER_WAIT)
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(i.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(i.MAX_TIMER_WAIT))
OR @dump_all;
EVENT_NAME MAX_TIMER_WAIT MAX(i.MAX_TIMER_WAIT)
"Verifying waits aggregate consistency (thread)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(t.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(t.SUM_TIMER_WAIT))
OR @dump_all;
EVENT_NAME SUM_TIMER_WAIT SUM(t.SUM_TIMER_WAIT)
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(t.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(t.MIN_TIMER_WAIT))
AND (MIN(t.MIN_TIMER_WAIT) != 0)
OR @dump_all;
EVENT_NAME MIN_TIMER_WAIT MIN(t.MIN_TIMER_WAIT)
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(t.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(t.MAX_TIMER_WAIT))
OR @dump_all;
EVENT_NAME MAX_TIMER_WAIT MAX(t.MAX_TIMER_WAIT)
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
drop table test.t1;
set @@global.aria_checkpoint_interval= @aria_checkpoint_interval_save;

View file

@ -1,197 +0,0 @@
# Tests for PERFORMANCE_SCHEMA
# Verify that statistics aggregated by different criteria are consistent.
--source include/not_embedded.inc
--source include/have_perfschema.inc
--echo "General cleanup"
# MDEV-7187 - test fails sporadically in buildbot
set @aria_checkpoint_interval_save= @@global.aria_checkpoint_interval;
set @@global.aria_checkpoint_interval= 0;
--disable_warnings
drop table if exists t1;
--enable_warnings
update performance_schema.setup_instruments set enabled = 'NO';
update performance_schema.setup_consumers set enabled = 'NO';
# Cleanup statistics
truncate table performance_schema.file_summary_by_event_name;
truncate table performance_schema.file_summary_by_instance;
truncate table performance_schema.socket_summary_by_event_name;
truncate table performance_schema.socket_summary_by_instance;
truncate table performance_schema.events_waits_summary_global_by_event_name;
truncate table performance_schema.events_waits_summary_by_instance;
truncate table performance_schema.events_waits_summary_by_thread_by_event_name;
# Start recording data
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
create table t1 (
id INT PRIMARY KEY,
b CHAR(100) DEFAULT 'initial value')
ENGINE=MyISAM;
insert into t1 (id) values (1), (2), (3), (4), (5), (6), (7), (8);
# Stop recording data, so the select below don't add noise.
update performance_schema.setup_instruments SET enabled = 'NO';
# Disable all consumers, for long standing waits
update performance_schema.setup_consumers set enabled = 'NO';
# Helper to debug
set @dump_all=FALSE;
# Note that in general:
# - COUNT/SUM/MAX(file_summary_by_event_name) >=
# COUNT/SUM/MAX(file_summary_by_instance).
# - MIN(file_summary_by_event_name) <=
# MIN(file_summary_by_instance).
# There will be equality only when file instances are not removed,
# aka when a file is not deleted from the file system,
# because doing so removes a row in file_summary_by_instance.
# Likewise:
# - COUNT/SUM/MAX(events_waits_summary_global_by_event_name) >=
# COUNT/SUM/MAX(events_waits_summary_by_instance)
# - MIN(events_waits_summary_global_by_event_name) <=
# MIN(events_waits_summary_by_instance)
# There will be equality only when an instrument instance
# is not removed, which is next to impossible to predictably guarantee
# in the server.
# For example, a MyISAM table removed from the table cache
# will cause a mysql_mutex_destroy on myisam/MYISAM_SHARE::intern_lock.
# Another example, a thread terminating will cause a mysql_mutex_destroy
# on sql/LOCK_delete
# Both cause a row to be deleted from events_waits_summary_by_instance.
# Likewise:
# - COUNT/SUM/MAX(events_waits_summary_global_by_event_name) >=
# COUNT/SUM/MAX(events_waits_summary_by_thread_by_event_name)
# - MIN(events_waits_summary_global_by_event_name) <=
# MIN(events_waits_summary_by_thread_by_event_name)
# There will be equality only when no thread is removed,
# that is if no thread disconnects, or no sub thread (for example insert
# delayed) ever completes.
# A thread completing will cause rows in
# events_waits_summary_by_thread_by_event_name to be removed.
--echo "Verifying file aggregate consistency"
# Since the code generating the load in this test does:
# - create table
# - insert
# - does not cause temporary tables to be used
# we can test for equality here for file aggregates.
# If any of these queries returns data, the test failed.
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_READ <> SUM(i.COUNT_READ))
OR @dump_all;
SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE)
FROM performance_schema.socket_summary_by_event_name AS e
JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE))
OR @dump_all;
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_READ, SUM(i.SUM_NUMBER_OF_BYTES_READ)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_READ <> SUM(i.SUM_NUMBER_OF_BYTES_READ))
OR @dump_all;
SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_WRITE, SUM(i.SUM_NUMBER_OF_BYTES_WRITE)
FROM performance_schema.file_summary_by_event_name AS e
JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_NUMBER_OF_BYTES_WRITE <> SUM(i.SUM_NUMBER_OF_BYTES_WRITE))
OR @dump_all;
--echo "Verifying waits aggregate consistency (instance)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(i.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(i.SUM_TIMER_WAIT))
OR @dump_all;
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(i.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(i.MIN_TIMER_WAIT))
AND (MIN(i.MIN_TIMER_WAIT) != 0)
OR @dump_all;
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(i.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(i.MAX_TIMER_WAIT))
OR @dump_all;
--echo "Verifying waits aggregate consistency (thread)"
SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(t.SUM_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.SUM_TIMER_WAIT < SUM(t.SUM_TIMER_WAIT))
OR @dump_all;
SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(t.MIN_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MIN_TIMER_WAIT > MIN(t.MIN_TIMER_WAIT))
AND (MIN(t.MIN_TIMER_WAIT) != 0)
OR @dump_all;
SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(t.MAX_TIMER_WAIT)
FROM performance_schema.events_waits_summary_global_by_event_name AS e
JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t
USING (EVENT_NAME)
GROUP BY EVENT_NAME
HAVING (e.MAX_TIMER_WAIT < MAX(t.MAX_TIMER_WAIT))
OR @dump_all;
# Cleanup
update performance_schema.setup_consumers set enabled = 'YES';
update performance_schema.setup_instruments
set enabled = 'YES', timed = 'YES';
drop table test.t1;
set @@global.aria_checkpoint_interval= @aria_checkpoint_interval_save;

View file

@ -8,7 +8,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users
server_audit_loc_info
server_audit_logging OFF
server_audit_mode 0
server_audit_output_type file
@ -72,7 +71,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 0
server_audit_output_type file
@ -218,7 +216,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 1
server_audit_output_type file

View file

@ -8,7 +8,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users
server_audit_loc_info
server_audit_logging OFF
server_audit_mode 0
server_audit_output_type file
@ -72,7 +71,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 0
server_audit_output_type file
@ -218,7 +216,6 @@ server_audit_file_rotate_now OFF
server_audit_file_rotate_size 1000000
server_audit_file_rotations 9
server_audit_incl_users odin, root, dva, tri
server_audit_loc_info
server_audit_logging ON
server_audit_mode 1
server_audit_output_type file

View file

@ -13,7 +13,7 @@ insert into mysqltest1.t1 values (1);
select * from mysqltest1.t1 into outfile 'mysqltest1/f1.txt';
create table mysqltest1.t2 (n int);
create table mysqltest1.t3 (n int);
--replace_result \\ / 66 39 17 39 "File exists" "Directory not empty"
--replace_result \\ / 66 39 17 39 247 39 "File exists" "Directory not empty"
--error 1010
drop database mysqltest1;
use mysqltest1;
@ -30,7 +30,7 @@ while ($1)
}
--enable_query_log
--replace_result \\ / 66 39 17 39 "File exists" "Directory not empty"
--replace_result \\ / 66 39 17 39 247 39 "File exists" "Directory not empty"
--error 1010
drop database mysqltest1;
use mysqltest1;

View file

@ -889,6 +889,11 @@ SELECT CHAR_LENGTH(TRIM(BOTH 0x0001 FROM _utf32 0x00000061));
SELECT CHAR_LENGTH(TRIM(BOTH 0x61 FROM _utf32 0x00000061));
SELECT CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061));
#
# potential signedness issue
#
select hex(lower(cast(0xffff0000 as char character set utf32))) as c;
--echo #
--echo # End of 5.5 tests
--echo #

View file

@ -230,3 +230,16 @@ eval EXPLAIN $query;
eval $query;
DROP TABLE t0,t1,t2;
--echo #
--echo # MDEV-MariaDB daemon leaks memory with specific query
--echo #
CREATE TABLE t1 (`voter_id` int(11) unsigned NOT NULL,
`language_id` int(11) unsigned NOT NULL DEFAULT '1'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE t2 (`voter_id` int(10) unsigned NOT NULL DEFAULT '0',
`serialized_c` mediumblob) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into t2 values (1,repeat("a",1000)),(2,repeat("a",1000)),(3,repeat("b",1000)),(4,repeat("c",1000)),(4,repeat("b",1000));
SELECT GROUP_CONCAT(t1.language_id SEPARATOR ',') AS `translation_resources`, `d`.`serialized_c` FROM t2 AS `d` LEFT JOIN t1 ON `d`.`voter_id` = t1.`voter_id` GROUP BY `d`.`voter_id` ORDER BY 10-d.voter_id+RAND()*0;
drop table t1,t2;

View file

@ -2880,6 +2880,19 @@ drop tables m1, t1, t4;
drop view t3;
--echo #
--echo # MDEV-10424 - Assertion `ticket == __null' failed in
--echo # MDL_request::set_type
--echo #
CREATE TABLE t1 (f1 INT) ENGINE=MyISAM;
CREATE TABLE tmerge (f1 INT) ENGINE=MERGE UNION=(t1);
PREPARE stmt FROM "ANALYZE TABLE tmerge, t1";
EXECUTE stmt;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
DROP TABLE t1, tmerge;
--echo End of 5.5 tests

View file

@ -3653,5 +3653,32 @@ deallocate prepare stmt;
SET SESSION sql_mode = @save_sql_mode;
DROP TABLE t1,t2;
--echo #
--echo # MDEV-8833: Crash of server on prepared statement with
--echo # conversion to semi-join
--echo #
--echo # End of 10.0 tests
CREATE TABLE t1 (column1 INT);
INSERT INTO t1 VALUES (3),(9);
CREATE TABLE t2 (column2 INT);
INSERT INTO t2 VALUES (1),(4);
CREATE TABLE t3 (column3 INT);
INSERT INTO t3 VALUES (6),(8);
CREATE TABLE t4 (column4 INT);
INSERT INTO t4 VALUES (2),(5);
PREPARE stmt FROM "SELECT ( SELECT MAX( table1.column1 ) AS field1
FROM t1 AS table1
WHERE table3.column3 IN ( SELECT table2.column2 AS field2 FROM t2 AS table2 )
) AS sq
FROM t3 AS table3, t4 AS table4";
EXECUTE stmt;
EXECUTE stmt;
deallocate prepare stmt;
drop table t1,t2,t3,t4;
--echo # End of 5.5 tests

View file

@ -16,6 +16,13 @@ drop table t1;
# End of 4.1 tests
create table t1 (a bigint unsigned, b mediumint unsigned);
insert t1 values (1,2),(0xffffffffffffffff,0xffffff);
select coalesce(a,b), coalesce(b,a) from t1;
create table t2 as select a from t1 union select b from t1;
show create table t2;
select * from t2;
drop table t1, t2;
--echo #
--echo # Start of 10.0 tests

View file

@ -1,5 +1,5 @@
/*
Copyright (c) 2000, 2010, Oracle and/or its affiliates
/* Copyright (c) 2000, 2010, Oracle and/or its affiliates
Copyright (c) 2009, 2016, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -49,7 +49,8 @@ int my_redel(const char *org_name, const char *tmp_name,
DBUG_PRINT("my",("org_name: '%s' tmp_name: '%s' MyFlags: %lu",
org_name,tmp_name,MyFlags));
if (my_copystat(org_name,tmp_name,MyFlags) < 0)
if (!my_disable_copystat_in_redel &&
my_copystat(org_name,tmp_name,MyFlags) < 0)
goto end;
if (MyFlags & MY_REDEL_MAKE_BACKUP)
{
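
A self-contained sketch of the guard introduced here, with demo_ stand-ins
for the real mysys calls: a process-wide flag lets a caller skip copying file
stats when re-creating a file (a later hunk shows init_common_variables()
setting my_disable_copystat_in_redel= 1 at server startup).

#include <stdio.h>

static int demo_disable_copystat= 0;  /* mirrors my_disable_copystat_in_redel */

static int demo_copystat(const char *org, const char *tmp)
{
  (void) org; (void) tmp;             /* would copy mode/owner/times here */
  return 0;
}

static int demo_redel(const char *org, const char *tmp)
{
  if (!demo_disable_copystat && demo_copystat(org, tmp) < 0)
    return -1;                        /* propagate stat-copy failure */
  return rename(tmp, org);            /* replace org with tmp */
}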

View file

@ -98,3 +98,4 @@ my_bool my_disable_sync=0;
my_bool my_disable_async_io=0;
my_bool my_disable_flush_key_blocks=0;
my_bool my_disable_symlinks=0;
my_bool my_disable_copystat_in_redel=0;

View file

@ -427,9 +427,8 @@ static MYSQL_SYSVAR_UINT(query_log_limit, query_log_limit,
char locinfo_ini_value[sizeof(struct connection_info)+4];
static MYSQL_THDVAR_STR(loc_info,
PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC,
"Auxiliary info.", NULL, NULL,
locinfo_ini_value);
PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_MEMALLOC,
"Internal info", NULL, NULL, locinfo_ini_value);
static const char *syslog_facility_names[]=
{

View file

@ -46,6 +46,7 @@ struct show_table_contributors_st show_table_contributors[]= {
{"Auttomattic", "https://automattic.com", "Bronze Sponsor of the MariaDB Foundation"},
{"Verkkokauppa.com", "https://virtuozzo.com", "Bronze Sponsor of the MariaDB Foundation"},
{"Virtuozzo", "https://virtuozzo.com/", "Bronze Sponsor of the MariaDB Foundation"},
{"Tencent Game DBA", "http://tencentdba.com/about/", "Bronze Sponsor of the MariaDB Foundation"},
/* Sponsors of important features */
{"Google", "USA", "Sponsoring encryption, parallel replication and GTID"},

View file

@ -355,7 +355,7 @@ static enum_field_types field_types_merge_rules [FIELDTYPE_NUM][FIELDTYPE_NUM]=
//MYSQL_TYPE_NULL MYSQL_TYPE_TIMESTAMP
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_VARCHAR,
//MYSQL_TYPE_LONGLONG MYSQL_TYPE_INT24
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_LONG,
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_LONGLONG,
//MYSQL_TYPE_DATE MYSQL_TYPE_TIME
MYSQL_TYPE_VARCHAR, MYSQL_TYPE_VARCHAR,
//MYSQL_TYPE_DATETIME MYSQL_TYPE_YEAR
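
The changed cell widens the result of merging MYSQL_TYPE_LONGLONG with
MYSQL_TYPE_INT24 from LONG to LONGLONG, which the BIGINT UNSIGNED/MEDIUMINT
coalesce()/UNION test earlier in this commit exercises. A minimal sketch of
the table-driven aggregation (demo types, not the real enum):

enum demo_type { T_LONG, T_LONGLONG, T_NUM };

/* result type when combining two column types, indexed by both inputs */
static const demo_type merge_rules[T_NUM][T_NUM]= {
  /* LONG     */ { T_LONG,     T_LONGLONG },
  /* LONGLONG */ { T_LONGLONG, T_LONGLONG },
};

demo_type aggregate_type(demo_type a, demo_type b) { return merge_rules[a][b]; }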

View file

@ -2743,9 +2743,28 @@ void Item_field::fix_after_pullout(st_select_lex *new_parent, Item **ref)
if (context)
{
Name_resolution_context *ctx= new Name_resolution_context();
ctx->outer_context= NULL; // We don't build a complete name resolver
ctx->table_list= NULL; // We rely on first_name_resolution_table instead
if (context->select_lex == new_parent)
{
/*
This field was pushed in then pulled out
(for example left part of IN)
*/
ctx->outer_context= context->outer_context;
}
else if (context->outer_context)
{
/* just pull to the upper context */
ctx->outer_context= context->outer_context->outer_context;
}
else
{
/* No upper context (merging Derived/VIEW where context chain ends) */
ctx->outer_context= NULL;
}
ctx->table_list= context->first_name_resolution_table;
ctx->select_lex= new_parent;
if (context->select_lex == NULL)
ctx->select_lex= NULL;
ctx->first_name_resolution_table= context->first_name_resolution_table;
ctx->last_name_resolution_table= context->last_name_resolution_table;
ctx->error_processor= context->error_processor;
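
A compressed sketch of the three cases handled above (hypothetical types,
not the server's Name_resolution_context):

struct Ctx { Ctx *outer; const void *select; };

/* which outer context a pulled-out field should resolve against */
Ctx *pulled_out_outer(Ctx *cur, const void *new_parent)
{
  if (cur->select == new_parent)
    return cur->outer;            /* pushed in then pulled out (IN lhs) */
  if (cur->outer)
    return cur->outer->outer;     /* hop one level up the chain */
  return nullptr;                 /* chain ends at a merged view/derived */
}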

View file

@ -3011,7 +3011,7 @@ bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time,
if (! write_error)
{
write_error= 1;
sql_print_error(ER(ER_ERROR_ON_WRITE), name, error);
sql_print_error(ER(ER_ERROR_ON_WRITE), name, tmp_errno);
}
}
}

View file

@ -3916,6 +3916,7 @@ static int init_common_variables()
max_system_variables.pseudo_thread_id= (ulong)~0;
server_start_time= flush_status_time= my_time(0);
my_disable_copystat_in_redel= 1;
global_rpl_filter= new Rpl_filter;
binlog_filter= new Rpl_filter;

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
Copyright (c) 2010, 2014, SkySQL Ab.
/* Copyright (c) 2000, 2016, Oracle and/or its affiliates.
Copyright (c) 2012, 2016, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -42,9 +42,9 @@ enum file_opt_type {
struct File_option
{
LEX_STRING name; /**< Name of the option */
int offset; /**< offset to base address of value */
file_opt_type type; /**< Option type */
LEX_STRING name; /**< Name of the option */
my_ptrdiff_t offset; /**< offset to base address of value */
file_opt_type type; /**< Option type */
};
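
Why the type change matters, in a hedged sketch (standard types standing in
for my_ptrdiff_t): offsets produced by pointer subtraction are
ptrdiff_t-sized, and storing them in an int narrows silently on LP64
platforms.

#include <stddef.h>

struct Demo { char pad[8]; long long value; };

/* ptrdiff_t is pointer-sized by construction; int may not be */
const ptrdiff_t value_offset= offsetof(struct Demo, value);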

View file

@ -64,13 +64,13 @@ extern "C" sig_handler handle_fatal_signal(int sig)
struct tm tm;
#ifdef HAVE_STACKTRACE
THD *thd;
#endif
/*
This flag remembers if the query pointer was found invalid.
We will try and print the query at the end of the signal handler, in case
we're wrong.
*/
bool print_invalid_query_pointer= false;
#endif
if (segfaulted)
{
@ -265,6 +265,7 @@ extern "C" sig_handler handle_fatal_signal(int sig)
"\"mlockall\" bugs.\n");
}
#ifdef HAVE_STACKTRACE
if (print_invalid_query_pointer)
{
my_safe_printf_stderr(
@ -274,6 +275,7 @@ extern "C" sig_handler handle_fatal_signal(int sig)
my_write_stderr(thd->query(), MY_MIN(65536U, thd->query_length()));
my_safe_printf_stderr("\n\n");
}
#endif
#ifdef HAVE_WRITE_CORE
if (test_flags & TEST_CORE_ON_SIGNAL)

View file

@ -455,7 +455,19 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
}
thd->prepare_derived_at_open= FALSE;
table->next_global= save_next_global;
/*
MERGE engine may adjust table->next_global chain, thus we have to
append save_next_global after merge children.
*/
if (save_next_global)
{
TABLE_LIST *table_list_iterator= table;
while (table_list_iterator->next_global)
table_list_iterator= table_list_iterator->next_global;
table_list_iterator->next_global= save_next_global;
save_next_global->prev_global= &table_list_iterator->next_global;
}
table->next_local= save_next_local;
thd->open_options&= ~extra_open_options;
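
A self-contained sketch of the splice (hypothetical node type): the MERGE
engine may have extended the chain behind the table, so the saved suffix is
re-attached at the actual tail and its back-pointer fixed up.

struct node { node *next_global; node **prev_global; };

void append_suffix(node *head, node *suffix)
{
  if (!suffix)
    return;
  node *tail= head;
  while (tail->next_global)
    tail= tail->next_global;        /* find tail after merge children */
  tail->next_global= suffix;
  suffix->prev_global= &tail->next_global;
}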

View file

@ -9223,6 +9223,7 @@ open_system_tables_for_read(THD *thd, TABLE_LIST *table_list,
*/
lex->reset_n_backup_query_tables_list(&query_tables_list_backup);
thd->reset_n_backup_open_tables_state(backup);
thd->lex->sql_command= SQLCOM_SELECT;
if (open_and_lock_tables(thd, table_list, FALSE,
MYSQL_OPEN_IGNORE_FLUSH |

View file

@ -5371,9 +5371,11 @@ int THD::decide_logging_format(TABLE_LIST *tables)
{
static const char *prelocked_mode_name[] = {
"NON_PRELOCKED",
"LOCK_TABLES",
"PRELOCKED",
"PRELOCKED_UNDER_LOCK_TABLES",
};
compile_time_assert(array_elements(prelocked_mode_name) == LTM_always_last);
DBUG_PRINT("debug", ("prelocked_mode: %s",
prelocked_mode_name[locked_tables_mode]));
}
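
A later hunk adds an LTM_always_last sentinel to enum_locked_tables_mode so
this compile_time_assert can hold. A minimal sketch of the pattern (demo
names):

enum demo_mode { M_NONE= 0, M_LOCK_TABLES, M_PRELOCKED,
                 M_PRELOCKED_UNDER_LOCK_TABLES, M_always_last };

static const char *demo_mode_name[]= {
  "NON_PRELOCKED", "LOCK_TABLES", "PRELOCKED", "PRELOCKED_UNDER_LOCK_TABLES",
};

/* fails to compile if a mode is added without a matching name */
static_assert(sizeof(demo_mode_name) / sizeof(demo_mode_name[0]) == M_always_last,
              "demo_mode_name[] out of sync with enum demo_mode");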

View file

@ -1182,7 +1182,8 @@ enum enum_locked_tables_mode
LTM_NONE= 0,
LTM_LOCK_TABLES,
LTM_PRELOCKED,
LTM_PRELOCKED_UNDER_LOCK_TABLES
LTM_PRELOCKED_UNDER_LOCK_TABLES,
LTM_always_last
};
@ -4302,6 +4303,11 @@ public:
save_copy_field_end= copy_field_end= NULL;
}
}
void free_copy_field_data()
{
for (Copy_field *ptr= copy_field ; ptr != copy_field_end ; ptr++)
ptr->tmp.free();
}
};
class select_union :public select_result_interceptor

View file

@ -2810,6 +2810,22 @@ static st_bookmark *find_bookmark(const char *plugin, const char *name,
}
static size_t var_storage_size(int flags)
{
switch (flags & PLUGIN_VAR_TYPEMASK) {
case PLUGIN_VAR_BOOL: return sizeof(my_bool);
case PLUGIN_VAR_INT: return sizeof(int);
case PLUGIN_VAR_LONG: return sizeof(long);
case PLUGIN_VAR_ENUM: return sizeof(long);
case PLUGIN_VAR_LONGLONG: return sizeof(ulonglong);
case PLUGIN_VAR_SET: return sizeof(ulonglong);
case PLUGIN_VAR_STR: return sizeof(char*);
case PLUGIN_VAR_DOUBLE: return sizeof(double);
default: DBUG_ASSERT(0); return 0;
}
}
/*
returns a bookmark for thd-local variables, creating if necessary.
returns null for non thd-local variables.
@ -2818,39 +2834,13 @@ static st_bookmark *find_bookmark(const char *plugin, const char *name,
static st_bookmark *register_var(const char *plugin, const char *name,
int flags)
{
uint length= strlen(plugin) + strlen(name) + 3, size= 0, offset, new_size;
uint length= strlen(plugin) + strlen(name) + 3, size, offset, new_size;
st_bookmark *result;
char *varname, *p;
if (!(flags & PLUGIN_VAR_THDLOCAL))
return NULL;
switch (flags & PLUGIN_VAR_TYPEMASK) {
case PLUGIN_VAR_BOOL:
size= sizeof(my_bool);
break;
case PLUGIN_VAR_INT:
size= sizeof(int);
break;
case PLUGIN_VAR_LONG:
case PLUGIN_VAR_ENUM:
size= sizeof(long);
break;
case PLUGIN_VAR_LONGLONG:
case PLUGIN_VAR_SET:
size= sizeof(ulonglong);
break;
case PLUGIN_VAR_STR:
size= sizeof(char*);
break;
case PLUGIN_VAR_DOUBLE:
size= sizeof(double);
break;
default:
DBUG_ASSERT(0);
return NULL;
};
DBUG_ASSERT(flags & PLUGIN_VAR_THDLOCAL);
size= var_storage_size(flags);
varname= ((char*) my_alloca(length));
strxmov(varname + 1, plugin, "_", name, NullS);
for (p= varname + 1; *p; p++)
@ -3046,25 +3036,17 @@ void sync_dynamic_session_variables(THD* thd, bool global_lock)
*/
for (idx= 0; idx < bookmark_hash.records; idx++)
{
sys_var_pluginvar *pi;
sys_var *var;
st_bookmark *v= (st_bookmark*) my_hash_element(&bookmark_hash,idx);
if (v->version <= thd->variables.dynamic_variables_version)
continue; /* already in thd->variables */
if (!(var= intern_find_sys_var(v->key + 1, v->name_len)) ||
!(pi= var->cast_pluginvar()) ||
v->key[0] != plugin_var_bookmark_key(pi->plugin_var->flags))
continue;
/* Here we do anything special that may be required of the data types */
if ((pi->plugin_var->flags & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR &&
pi->plugin_var->flags & PLUGIN_VAR_MEMALLOC)
if ((v->key[0] & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR &&
v->key[0] & BOOKMARK_MEMALLOC)
{
int offset= ((thdvar_str_t *)(pi->plugin_var))->offset;
char **pp= (char**) (thd->variables.dynamic_variables_ptr + offset);
char **pp= (char**) (thd->variables.dynamic_variables_ptr + v->offset);
if (*pp)
*pp= my_strdup(*pp, MYF(MY_WME|MY_FAE));
}
@ -3325,6 +3307,48 @@ bool sys_var_pluginvar::session_update(THD *thd, set_var *var)
return false;
}
static const void *var_def_ptr(st_mysql_sys_var *pv)
{
switch (pv->flags & (PLUGIN_VAR_TYPEMASK | PLUGIN_VAR_THDLOCAL)) {
case PLUGIN_VAR_INT:
return &((sysvar_uint_t*) pv)->def_val;
case PLUGIN_VAR_LONG:
return &((sysvar_ulong_t*) pv)->def_val;
case PLUGIN_VAR_LONGLONG:
return &((sysvar_ulonglong_t*) pv)->def_val;
case PLUGIN_VAR_ENUM:
return &((sysvar_enum_t*) pv)->def_val;
case PLUGIN_VAR_SET:
return &((sysvar_set_t*) pv)->def_val;
case PLUGIN_VAR_BOOL:
return &((sysvar_bool_t*) pv)->def_val;
case PLUGIN_VAR_STR:
return &((sysvar_str_t*) pv)->def_val;
case PLUGIN_VAR_DOUBLE:
return &((sysvar_double_t*) pv)->def_val;
case PLUGIN_VAR_INT | PLUGIN_VAR_THDLOCAL:
return &((thdvar_uint_t*) pv)->def_val;
case PLUGIN_VAR_LONG | PLUGIN_VAR_THDLOCAL:
return &((thdvar_ulong_t*) pv)->def_val;
case PLUGIN_VAR_LONGLONG | PLUGIN_VAR_THDLOCAL:
return &((thdvar_ulonglong_t*) pv)->def_val;
case PLUGIN_VAR_ENUM | PLUGIN_VAR_THDLOCAL:
return &((thdvar_enum_t*) pv)->def_val;
case PLUGIN_VAR_SET | PLUGIN_VAR_THDLOCAL:
return &((thdvar_set_t*) pv)->def_val;
case PLUGIN_VAR_BOOL | PLUGIN_VAR_THDLOCAL:
return &((thdvar_bool_t*) pv)->def_val;
case PLUGIN_VAR_STR | PLUGIN_VAR_THDLOCAL:
return &((thdvar_str_t*) pv)->def_val;
case PLUGIN_VAR_DOUBLE | PLUGIN_VAR_THDLOCAL:
return &((thdvar_double_t*) pv)->def_val;
default:
DBUG_ASSERT(0);
return NULL;
}
}
bool sys_var_pluginvar::global_update(THD *thd, set_var *var)
{
DBUG_ASSERT(!is_readonly());
@ -3334,60 +3358,7 @@ bool sys_var_pluginvar::global_update(THD *thd, set_var *var)
const void *src= &var->save_result;
if (!var->value)
{
switch (plugin_var->flags & (PLUGIN_VAR_TYPEMASK | PLUGIN_VAR_THDLOCAL)) {
case PLUGIN_VAR_INT:
src= &((sysvar_uint_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONG:
src= &((sysvar_ulong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONGLONG:
src= &((sysvar_ulonglong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_ENUM:
src= &((sysvar_enum_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_SET:
src= &((sysvar_set_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_BOOL:
src= &((sysvar_bool_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_STR:
src= &((sysvar_str_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_DOUBLE:
src= &((sysvar_double_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_INT | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_uint_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONG | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_ulong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_LONGLONG | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_ulonglong_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_ENUM | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_enum_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_SET | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_set_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_BOOL | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_bool_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_STR | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_str_t*) plugin_var)->def_val;
break;
case PLUGIN_VAR_DOUBLE | PLUGIN_VAR_THDLOCAL:
src= &((thdvar_double_t*) plugin_var)->def_val;
break;
default:
DBUG_ASSERT(0);
}
}
src= var_def_ptr(plugin_var);
plugin_var->update(thd, plugin_var, tgt, src);
return false;
@ -3743,7 +3714,18 @@ static int construct_options(MEM_ROOT *mem_root, struct st_plugin_int *tmp,
*(int*)(opt + 1)= offset= v->offset;
if (opt->flags & PLUGIN_VAR_NOCMDOPT)
{
char *val= global_system_variables.dynamic_variables_ptr + offset;
if (((opt->flags & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR) &&
(opt->flags & PLUGIN_VAR_MEMALLOC))
{
char *def_val= *(char**)var_def_ptr(opt);
*(char**)val= def_val ? my_strdup(def_val, MYF(0)) : NULL;
}
else
memcpy(val, var_def_ptr(opt), var_storage_size(opt->flags));
continue;
}
optname= (char*) memdup_root(mem_root, v->key + 1,
(optnamelen= v->name_len) + 1);
@ -3951,10 +3933,11 @@ static int test_plugin_options(MEM_ROOT *tmp_root, struct st_plugin_int *tmp,
*str->value= strdup_root(mem_root, *str->value);
}
var= find_bookmark(plugin_name.str, o->name, o->flags);
if (o->flags & PLUGIN_VAR_NOSYSVAR)
continue;
tmp_backup[tmp->nbackups++].save(&o->name);
if ((var= find_bookmark(plugin_name.str, o->name, o->flags)))
if (var)
v= new (mem_root) sys_var_pluginvar(&chain, var->key + 1, o, tmp);
else
{

View file

@ -9004,9 +9004,26 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
We need to destruct the copy_field (allocated in create_tmp_table())
before setting it to 0 if the join is not "reusable".
*/
if (!tmp_join || tmp_join != this)
tmp_table_param.cleanup();
tmp_table_param.copy_field= tmp_table_param.copy_field_end=0;
if (!tmp_join || tmp_join != this)
tmp_table_param.cleanup();
else
{
/*
Free data buffered in copy_fields, but keep data pointed by copy_field
around for next iteration (possibly stored in save_copy_fields).
It would be logically simpler to not clear copy_field
below, but as we have loops that runs over copy_field to
copy_field_end that should not be done anymore, it's simpler to
just clear the pointers.
Another option would be to just clear copy_field_end and not run
the loops if this is not set or to have tmp_table_param.cleanup()
to run cleanup on save_copy_field if copy_field is not set.
*/
tmp_table_param.free_copy_field_data();
tmp_table_param.copy_field= tmp_table_param.copy_field_end=0;
}
first_record= sort_and_group=0;
send_records= (ha_rows) 0;
@ -11687,7 +11704,7 @@ void JOIN::join_free()
/**
Free resources of given join.
@param fill true if we should free all resources, call with full==1
@param full true if we should free all resources, call with full==1
should be last, before it this function can be called with
full==0
@ -11806,7 +11823,7 @@ void JOIN::cleanup(bool full)
/*
If we have tmp_join and 'this' JOIN is not tmp_join and
tmp_table_param.copy_field's of them are equal then we have to remove
pointer to tmp_table_param.copy_field from tmp_join, because it qill
pointer to tmp_table_param.copy_field from tmp_join, because it will
be removed in tmp_table_param.cleanup().
*/
if (tmp_join &&
@ -15710,6 +15727,7 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
case Item::VARBIN_ITEM:
case Item::CACHE_ITEM:
case Item::EXPR_CACHE_ITEM:
case Item::PARAM_ITEM:
if (make_copy_field)
{
DBUG_ASSERT(((Item_result_field*)item)->result_field);
@ -22240,7 +22258,7 @@ setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param,
err:
if (copy)
delete [] param->copy_field; // This is never 0
param->copy_field=0;
param->copy_field= 0;
err2:
DBUG_RETURN(TRUE);
}

View file

@ -876,6 +876,8 @@ void tdc_release_share(TABLE_SHARE *share)
}
if (--share->tdc.ref_count)
{
if (!share->is_view)
mysql_cond_broadcast(&share->tdc.COND_release);
mysql_mutex_unlock(&share->tdc.LOCK_table_share);
mysql_mutex_unlock(&LOCK_unused_shares);
DBUG_VOID_RETURN;
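
A pthread-flavoured sketch of the wakeup being added (demo types; the real
code broadcasts tdc.COND_release under tdc.LOCK_table_share): without the
broadcast, a thread waiting for the reference count to drop can sleep
forever.

#include <pthread.h>

struct demo_share {
  unsigned ref_count;
  pthread_mutex_t lock;
  pthread_cond_t cond_release;
};

void demo_release(struct demo_share *s)
{
  pthread_mutex_lock(&s->lock);
  if (--s->ref_count)
    pthread_cond_broadcast(&s->cond_release);  /* wake ref-count waiters */
  pthread_mutex_unlock(&s->lock);
}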

View file

@ -108,6 +108,7 @@ UNIV_INTERN mysql_pfs_key_t fts_pll_tokenize_mutex_key;
/** variable to record innodb_fts_internal_tbl_name for information
schema table INNODB_FTS_INSERTED etc. */
UNIV_INTERN char* fts_internal_tbl_name = NULL;
UNIV_INTERN char* fts_internal_tbl_name2 = NULL;
/** InnoDB default stopword list:
There are different versions of stopwords, the stop words listed
@ -6570,6 +6571,36 @@ fts_check_corrupt_index(
return(0);
}
/* Get parent table name if it's a fts aux table
@param[in] aux_table_name aux table name
@param[in] aux_table_len aux table length
@return parent table name, or NULL */
char*
fts_get_parent_table_name(
const char* aux_table_name,
ulint aux_table_len)
{
fts_aux_table_t aux_table;
char* parent_table_name = NULL;
if (fts_is_aux_table_name(&aux_table, aux_table_name, aux_table_len)) {
dict_table_t* parent_table;
parent_table = dict_table_open_on_id(
aux_table.parent_id, TRUE, DICT_TABLE_OP_NORMAL);
if (parent_table != NULL) {
parent_table_name = mem_strdupl(
parent_table->name,
strlen(parent_table->name));
dict_table_close(parent_table, TRUE, FALSE);
}
}
return(parent_table_name);
}
/** Check the validity of the parent table.
@param[in] aux_table auxiliary table
@return true if it is a valid table or false if it is not */

View file

@ -15010,7 +15010,12 @@ innodb_internal_table_update(
my_free(old);
}
fts_internal_tbl_name = *(char**) var_ptr;
fts_internal_tbl_name2 = *(char**) var_ptr;
if (fts_internal_tbl_name2 == NULL) {
fts_internal_tbl_name = const_cast<char*>("default");
} else {
fts_internal_tbl_name = fts_internal_tbl_name2;
}
}
/****************************************************************//**
@ -16793,7 +16798,7 @@ static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache,
"Whether to disable OS system file cache for sort I/O",
NULL, NULL, FALSE);
static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name,
static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name2,
PLUGIN_VAR_NOCMDARG,
"FTS internal auxiliary table to be checked",
innodb_internal_table_validate,

View file

@ -209,7 +209,10 @@ innobase_need_rebuild(
const Alter_inplace_info* ha_alter_info,
const TABLE* altered_table)
{
if (ha_alter_info->handler_flags
Alter_inplace_info::HA_ALTER_FLAGS alter_inplace_flags =
ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE);
if (alter_inplace_flags
== Alter_inplace_info::CHANGE_CREATE_OPTION
&& !(ha_alter_info->create_info->used_fields
& (HA_CREATE_USED_ROW_FORMAT
@ -3933,7 +3936,7 @@ err_exit:
}
if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
|| (ha_alter_info->handler_flags
|| ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)
== Alter_inplace_info::CHANGE_CREATE_OPTION
&& !innobase_need_rebuild(ha_alter_info, table))) {
@ -4107,7 +4110,7 @@ ok_exit:
DBUG_RETURN(false);
}
if (ha_alter_info->handler_flags
if ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)
== Alter_inplace_info::CHANGE_CREATE_OPTION
&& !innobase_need_rebuild(ha_alter_info, table)) {
goto ok_exit;
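
All three call sites change the same way; a sketch of the predicate (names
assumed from the hunk):

typedef unsigned long long flags_t;

/* true when, after masking out flags InnoDB may ignore in-place, the only
   requested change is CHANGE_CREATE_OPTION, i.e. no rebuild is needed */
bool only_create_option(flags_t handler_flags, flags_t ignore_mask,
                        flags_t change_create_option)
{
  return (handler_flags & ~ignore_mask) == change_create_option;
}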

View file

@ -3981,6 +3981,8 @@ i_s_fts_config_fill(
DBUG_RETURN(0);
}
DEBUG_SYNC_C("i_s_fts_config_fille_check");
fields = table->field;
/* Prevent DDL to drop fts aux tables. */

View file

@ -375,6 +375,7 @@ extern bool fts_need_sync;
/** Variable specifying the table that has Fulltext index to display its
content through information schema table */
extern char* fts_internal_tbl_name;
extern char* fts_internal_tbl_name2;
#define fts_que_graph_free(graph) \
do { \
@ -823,6 +824,15 @@ void
fts_drop_orphaned_tables(void);
/*==========================*/
/* Get parent table name if it's a fts aux table
@param[in] aux_table_name aux table name
@param[in] aux_table_len aux table length
@return parent table name, or NULL */
char*
fts_get_parent_table_name(
const char* aux_table_name,
ulint aux_table_len);
/******************************************************************//**
Since we do a horizontal split on the index table, we need to drop
all the split tables.

View file

@ -44,7 +44,7 @@ Created 1/20/1994 Heikki Tuuri
#define INNODB_VERSION_MAJOR 5
#define INNODB_VERSION_MINOR 6
#define INNODB_VERSION_BUGFIX 32
#define INNODB_VERSION_BUGFIX 33
/* The following is the InnoDB version as shown in
SELECT plugin_version FROM information_schema.plugins;

View file

@ -613,7 +613,7 @@ row_log_table_delete(
&old_pk_extra_size);
ut_ad(old_pk_extra_size < 0x100);
mrec_size = 4 + old_pk_size;
mrec_size = 6 + old_pk_size;
/* Log enough prefix of the BLOB unless both the
old and new table are in COMPACT or REDUNDANT format,
@ -643,8 +643,8 @@ row_log_table_delete(
*b++ = static_cast<byte>(old_pk_extra_size);
/* Log the size of external prefix we saved */
mach_write_to_2(b, ext_size);
b += 2;
mach_write_to_4(b, ext_size);
b += 4;
rec_convert_dtuple_to_temp(
b + old_pk_extra_size, new_index,
@ -2268,14 +2268,14 @@ row_log_table_apply_op(
break;
case ROW_T_DELETE:
/* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */
if (mrec + 4 >= mrec_end) {
/* 1 (extra_size) + 4 (ext_size) + at least 1 (payload) */
if (mrec + 6 >= mrec_end) {
return(NULL);
}
extra_size = *mrec++;
ext_size = mach_read_from_2(mrec);
mrec += 2;
ext_size = mach_read_from_4(mrec);
mrec += 4;
ut_ad(mrec < mrec_end);
/* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
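
A sketch of the widened field (InnoDB's mach_* helpers store big-endian):
the external-prefix length now occupies 4 bytes instead of 2, so the header
budget grows from 1 + 2 + 1 to 1 + 4 + 1 bytes and the reader advances its
cursor to match. Demo helpers, not the real mach_write_to_4/mach_read_from_4:

typedef unsigned char byte_t;

static inline void demo_write_4(byte_t *b, unsigned long v)
{
  b[0]= (byte_t)(v >> 24); b[1]= (byte_t)(v >> 16);
  b[2]= (byte_t)(v >> 8);  b[3]= (byte_t) v;
}

static inline unsigned long demo_read_4(const byte_t *b)
{
  return ((unsigned long) b[0] << 24) | ((unsigned long) b[1] << 16)
       | ((unsigned long) b[2] << 8)  |  (unsigned long) b[3];
}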

View file

@ -2715,6 +2715,10 @@ loop:
return(n_tables + n_tables_dropped);
}
DBUG_EXECUTE_IF("row_drop_tables_in_background_sleep",
os_thread_sleep(5000000);
);
table = dict_table_open_on_name(drop->table_name, FALSE, FALSE,
DICT_ERR_IGNORE_NONE);
@ -2725,6 +2729,16 @@ loop:
goto already_dropped;
}
if (!table->to_be_dropped) {
/* There is a scenario: the old table is dropped
just after it's added into drop list, and new
table with the same name is created, then we try
to drop the new table in background. */
dict_table_close(table, FALSE, FALSE);
goto already_dropped;
}
ut_a(!table->can_be_evicted);
dict_table_close(table, FALSE, FALSE);
@ -3992,6 +4006,13 @@ row_drop_table_for_mysql(
}
}
DBUG_EXECUTE_IF("row_drop_table_add_to_background",
row_add_table_to_background_drop_list(table->name);
err = DB_SUCCESS;
goto funct_exit;
);
/* TODO: could we replace the counter n_foreign_key_checks_running
with lock checks on the table? Acquire here an exclusive lock on the
table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
@ -4608,6 +4629,19 @@ loop:
row_mysql_lock_data_dictionary(trx);
while ((table_name = dict_get_first_table_name_in_db(name))) {
/* Drop parent table if it is a fts aux table, to
avoid accessing dropped fts aux tables in information
schema when parent table still exists.
Note: Drop parent table will drop fts aux tables. */
char* parent_table_name;
parent_table_name = fts_get_parent_table_name(
table_name, strlen(table_name));
if (parent_table_name != NULL) {
mem_free(table_name);
table_name = parent_table_name;
}
ut_a(memcmp(table_name, name, namelen) == 0);
table = dict_table_open_on_name(

View file

@ -205,7 +205,7 @@ maria_declare_plugin(perfschema)
0x0001,
pfs_status_vars,
NULL,
"5.6.32",
"5.6.33",
MariaDB_PLUGIN_MATURITY_STABLE
}
maria_declare_plugin_end;

View file

@ -1,4 +1,4 @@
SET(TOKUDB_VERSION 5.6.31-77.0)
SET(TOKUDB_VERSION 5.6.32-78.1)
# PerconaFT only supports x86-64 and cmake-2.8.9+
IF(CMAKE_VERSION VERSION_LESS "2.8.9")
MESSAGE(STATUS "CMake 2.8.9 or higher is required by TokuDB")

View file

@ -367,8 +367,8 @@ static void print_db_env_struct (void) {
"int (*checkpointing_get_period) (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic checkpoints. 0 means disabled. */",
"int (*cleaner_set_period) (DB_ENV*, uint32_t) /* Change the delay between automatic cleaner attempts. 0 means disabled. */",
"int (*cleaner_get_period) (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic cleaner attempts. 0 means disabled. */",
"int (*cleaner_set_iterations) (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invokation. 0 means disabled. */",
"int (*cleaner_get_iterations) (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invokation. 0 means disabled. */",
"int (*cleaner_set_iterations) (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invocation. 0 means disabled. */",
"int (*cleaner_get_iterations) (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invocation. 0 means disabled. */",
"int (*evictor_set_enable_partial_eviction) (DB_ENV*, bool) /* Enables or disabled partial eviction of nodes from cachetable. */",
"int (*evictor_get_enable_partial_eviction) (DB_ENV*, bool*) /* Retrieve the status of partial eviction of nodes from cachetable. */",
"int (*checkpointing_postpone) (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */",

View file

@ -103,6 +103,7 @@ set_cflags_if_supported(
-Wno-pointer-bool-conversion
-fno-rtti
-fno-exceptions
-Wno-error=nonnull-compare
)
## set_cflags_if_supported_named("-Weffc++" -Weffcpp)

View file

@ -55,8 +55,8 @@ set(FT_SOURCES
msg_buffer
node
pivotkeys
serialize/rbtree_mhs
serialize/block_allocator
serialize/block_allocator_strategy
serialize/block_table
serialize/compress
serialize/ft_node-serialize

View file

@ -496,7 +496,7 @@ handle_split_of_child(
// We never set the rightmost blocknum to be the root.
// Instead, we wait for the root to split and let promotion initialize the rightmost
// blocknum to be the first non-root leaf node on the right extreme to recieve an insert.
// blocknum to be the first non-root leaf node on the right extreme to receive an insert.
BLOCKNUM rightmost_blocknum = toku_unsafe_fetch(&ft->rightmost_blocknum);
invariant(ft->h->root_blocknum.b != rightmost_blocknum.b);
if (childa->blocknum.b == rightmost_blocknum.b) {
@ -1470,7 +1470,7 @@ void toku_ft_flush_some_child(FT ft, FTNODE parent, struct flusher_advice *fa)
// It is possible after reading in the entire child,
// that we now know that the child is not reactive
// if so, we can unpin parent right now
// we wont be splitting/merging child
// we won't be splitting/merging child
// and we have already replaced the bnc
// for the root with a fresh one
enum reactivity child_re = toku_ftnode_get_reactivity(ft, child);

View file

@ -598,15 +598,12 @@ void toku_ftnode_checkpoint_complete_callback(void *value_data) {
}
}
void toku_ftnode_clone_callback(
void* value_data,
void** cloned_value_data,
long* clone_size,
PAIR_ATTR* new_attr,
bool for_checkpoint,
void* write_extraargs
)
{
void toku_ftnode_clone_callback(void *value_data,
void **cloned_value_data,
long *clone_size,
PAIR_ATTR *new_attr,
bool for_checkpoint,
void *write_extraargs) {
FTNODE node = static_cast<FTNODE>(value_data);
toku_ftnode_assert_fully_in_memory(node);
FT ft = static_cast<FT>(write_extraargs);
@ -618,13 +615,16 @@ void toku_ftnode_clone_callback(
toku_ftnode_leaf_rebalance(node, ft->h->basementnodesize);
}
cloned_node->oldest_referenced_xid_known = node->oldest_referenced_xid_known;
cloned_node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_on_disk;
cloned_node->oldest_referenced_xid_known =
node->oldest_referenced_xid_known;
cloned_node->max_msn_applied_to_node_on_disk =
node->max_msn_applied_to_node_on_disk;
cloned_node->flags = node->flags;
cloned_node->blocknum = node->blocknum;
cloned_node->layout_version = node->layout_version;
cloned_node->layout_version_original = node->layout_version_original;
cloned_node->layout_version_read_from_disk = node->layout_version_read_from_disk;
cloned_node->layout_version_read_from_disk =
node->layout_version_read_from_disk;
cloned_node->build_id = node->build_id;
cloned_node->height = node->height;
cloned_node->dirty = node->dirty;
@ -649,38 +649,39 @@ void toku_ftnode_clone_callback(
// set new pair attr if necessary
if (node->height == 0) {
*new_attr = make_ftnode_pair_attr(node);
}
else {
for (int i = 0; i < node->n_children; i++) {
BLB(node, i)->logical_rows_delta = 0;
BLB(cloned_node, i)->logical_rows_delta = 0;
}
} else {
new_attr->is_valid = false;
}
*clone_size = ftnode_memory_size(cloned_node);
*cloned_value_data = cloned_node;
}
void toku_ftnode_flush_callback(
CACHEFILE UU(cachefile),
int fd,
BLOCKNUM blocknum,
void *ftnode_v,
void** disk_data,
void *extraargs,
PAIR_ATTR size __attribute__((unused)),
PAIR_ATTR* new_size,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone
)
{
FT ft = (FT) extraargs;
FTNODE ftnode = (FTNODE) ftnode_v;
FTNODE_DISK_DATA* ndd = (FTNODE_DISK_DATA*)disk_data;
void toku_ftnode_flush_callback(CACHEFILE UU(cachefile),
int fd,
BLOCKNUM blocknum,
void *ftnode_v,
void **disk_data,
void *extraargs,
PAIR_ATTR size __attribute__((unused)),
PAIR_ATTR *new_size,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone) {
FT ft = (FT)extraargs;
FTNODE ftnode = (FTNODE)ftnode_v;
FTNODE_DISK_DATA *ndd = (FTNODE_DISK_DATA *)disk_data;
assert(ftnode->blocknum.b == blocknum.b);
int height = ftnode->height;
if (write_me) {
toku_ftnode_assert_fully_in_memory(ftnode);
if (height > 0 && !is_clone) {
// cloned nodes already had their stale messages moved, see toku_ftnode_clone_callback()
// cloned nodes already had their stale messages moved, see
// toku_ftnode_clone_callback()
toku_move_ftnode_messages_to_stale(ft, ftnode);
} else if (height == 0) {
toku_ftnode_leaf_run_gc(ft, ftnode);
@ -688,7 +689,8 @@ void toku_ftnode_flush_callback(
toku_ftnode_update_disk_stats(ftnode, ft, for_checkpoint);
}
}
int r = toku_serialize_ftnode_to(fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint);
int r = toku_serialize_ftnode_to(
fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint);
assert_zero(r);
ftnode->layout_version_read_from_disk = FT_LAYOUT_VERSION;
}
@ -703,20 +705,22 @@ void toku_ftnode_flush_callback(
FT_STATUS_INC(FT_FULL_EVICTIONS_NONLEAF_BYTES, node_size);
}
toku_free(*disk_data);
}
else {
} else {
if (ftnode->height == 0) {
for (int i = 0; i < ftnode->n_children; i++) {
if (BP_STATE(ftnode,i) == PT_AVAIL) {
if (BP_STATE(ftnode, i) == PT_AVAIL) {
BASEMENTNODE bn = BLB(ftnode, i);
toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta);
toku_ft_decrease_stats(&ft->in_memory_stats,
bn->stat64_delta);
if (!ftnode->dirty)
toku_ft_adjust_logical_row_count(
ft, -bn->logical_rows_delta);
}
}
}
}
toku_ftnode_free(&ftnode);
}
else {
} else {
*new_size = make_ftnode_pair_attr(ftnode);
}
}
@ -845,10 +849,13 @@ static void compress_internal_node_partition(FTNODE node, int i, enum toku_compr
}
// callback for partially evicting a node
int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_extraargs,
void (*finalize)(PAIR_ATTR new_attr, void *extra), void *finalize_extra) {
FTNODE node = (FTNODE) ftnode_pv;
FT ft = (FT) write_extraargs;
int toku_ftnode_pe_callback(void *ftnode_pv,
PAIR_ATTR old_attr,
void *write_extraargs,
void (*finalize)(PAIR_ATTR new_attr, void *extra),
void *finalize_extra) {
FTNODE node = (FTNODE)ftnode_pv;
FT ft = (FT)write_extraargs;
int num_partial_evictions = 0;
// Hold things we intend to destroy here.
@ -866,7 +873,8 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext
}
// Don't partially evict nodes whose partitions can't be read back
// from disk individually
if (node->layout_version_read_from_disk < FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
if (node->layout_version_read_from_disk <
FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
goto exit;
}
//
@ -874,77 +882,77 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext
//
if (node->height > 0) {
for (int i = 0; i < node->n_children; i++) {
if (BP_STATE(node,i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node,i)) {
if (BP_STATE(node, i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node, i)) {
NONLEAF_CHILDINFO bnc = BNC(node, i);
if (ft_compress_buffers_before_eviction &&
// We may not serialize and compress a partition in memory if its
// in memory layout version is different than what's on disk (and
// therefore requires upgrade).
// We may not serialize and compress a partition in
// memory if its in memory layout version is different
// than what's on disk (and therefore requires upgrade).
//
// Auto-upgrade code assumes that if a node's layout version read
// from disk is not current, it MUST require upgrade. Breaking
// this rule would cause upgrade code to upgrade this partition
// again after we serialize it as the current version, which is bad.
node->layout_version == node->layout_version_read_from_disk) {
// Auto-upgrade code assumes that if a node's layout
// version read from disk is not current, it MUST
// require upgrade.
// Breaking this rule would cause upgrade code to
// upgrade this partition again after we serialize it as
// the current version, which is bad.
node->layout_version ==
node->layout_version_read_from_disk) {
toku_ft_bnc_move_messages_to_stale(ft, bnc);
compress_internal_node_partition(
node,
i,
// Always compress with quicklz
TOKU_QUICKLZ_METHOD
);
TOKU_QUICKLZ_METHOD);
} else {
// We're not compressing buffers before eviction. Simply
// detach the buffer and set the child's state to on-disk.
// detach the buffer and set the child's state to
// on-disk.
set_BNULL(node, i);
BP_STATE(node, i) = PT_ON_DISK;
}
buffers_to_destroy[num_buffers_to_destroy++] = bnc;
num_partial_evictions++;
} else {
BP_SWEEP_CLOCK(node, i);
}
else {
BP_SWEEP_CLOCK(node,i);
}
}
else {
} else {
continue;
}
}
}
//
// partial eviction strategy for basement nodes:
// if the bn is compressed, evict it
// else: check if it requires eviction, if it does, evict it, if not, sweep the clock count
//
else {
} else {
//
// partial eviction strategy for basement nodes:
// if the bn is compressed, evict it
// else: check if it requires eviction, if it does, evict it, if not,
// sweep the clock count
//
for (int i = 0; i < node->n_children; i++) {
// Get rid of compressed stuff no matter what.
if (BP_STATE(node,i) == PT_COMPRESSED) {
if (BP_STATE(node, i) == PT_COMPRESSED) {
SUB_BLOCK sb = BSB(node, i);
pointers_to_free[num_pointers_to_free++] = sb->compressed_ptr;
pointers_to_free[num_pointers_to_free++] = sb;
set_BNULL(node, i);
BP_STATE(node,i) = PT_ON_DISK;
BP_STATE(node, i) = PT_ON_DISK;
num_partial_evictions++;
}
else if (BP_STATE(node,i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node,i)) {
} else if (BP_STATE(node, i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node, i)) {
BASEMENTNODE bn = BLB(node, i);
basements_to_destroy[num_basements_to_destroy++] = bn;
toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta);
toku_ft_decrease_stats(&ft->in_memory_stats,
bn->stat64_delta);
toku_ft_adjust_logical_row_count(ft,
-bn->logical_rows_delta);
set_BNULL(node, i);
BP_STATE(node, i) = PT_ON_DISK;
num_partial_evictions++;
} else {
BP_SWEEP_CLOCK(node, i);
}
else {
BP_SWEEP_CLOCK(node,i);
}
}
else if (BP_STATE(node,i) == PT_ON_DISK) {
} else if (BP_STATE(node, i) == PT_ON_DISK) {
continue;
}
else {
} else {
abort();
}
}
@ -2378,12 +2386,16 @@ ft_send_update_msg(FT_HANDLE ft_h, const ft_msg &msg, TOKUTXN txn) {
toku_ft_root_put_msg(ft_h->ft, msg, &gc_info);
}
void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_function_extra,
TOKUTXN txn, bool oplsn_valid, LSN oplsn,
bool do_logging) {
void toku_ft_maybe_update(FT_HANDLE ft_h,
const DBT *key,
const DBT *update_function_extra,
TOKUTXN txn,
bool oplsn_valid,
LSN oplsn,
bool do_logging) {
TXNID_PAIR xid = toku_txn_get_txnid(txn);
if (txn) {
BYTESTRING keybs = { key->size, (char *) key->data };
BYTESTRING keybs = {key->size, (char *)key->data};
toku_logger_save_rollback_cmdupdate(
txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs);
toku_txn_maybe_note_ft(txn, ft_h->ft);
@ -2392,22 +2404,33 @@ void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_func
TOKULOGGER logger;
logger = toku_txn_logger(txn);
if (do_logging && logger) {
BYTESTRING keybs = {.len=key->size, .data=(char *) key->data};
BYTESTRING extrabs = {.len=update_function_extra->size,
.data = (char *) update_function_extra->data};
toku_log_enq_update(logger, NULL, 0, txn,
toku_cachefile_filenum(ft_h->ft->cf),
xid, keybs, extrabs);
BYTESTRING keybs = {.len = key->size, .data = (char *)key->data};
BYTESTRING extrabs = {.len = update_function_extra->size,
.data = (char *)update_function_extra->data};
toku_log_enq_update(logger,
NULL,
0,
txn,
toku_cachefile_filenum(ft_h->ft->cf),
xid,
keybs,
extrabs);
}
LSN treelsn;
if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
if (oplsn_valid &&
oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
// do nothing
} else {
XIDS message_xids = txn ? toku_txn_get_xids(txn) : toku_xids_get_root_xids();
ft_msg msg(key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids);
XIDS message_xids =
txn ? toku_txn_get_xids(txn) : toku_xids_get_root_xids();
ft_msg msg(
key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids);
ft_send_update_msg(ft_h, msg, txn);
}
// updates get converted to insert messages, which should do a -1 on the
// logical row count when the messages are permanently applied
toku_ft_adjust_logical_row_count(ft_h->ft, 1);
}
void toku_ft_maybe_update_broadcast(FT_HANDLE ft_h, const DBT *update_function_extra,

View file

@ -73,30 +73,20 @@ static bool recount_rows_interrupt(void* extra, uint64_t deleted_rows) {
return rre->_cancelled =
rre->_progress_callback(rre->_keys, deleted_rows, rre->_progress_extra);
}
int toku_ft_recount_rows(
FT_HANDLE ft,
int (*progress_callback)(
uint64_t count,
uint64_t deleted,
void* progress_extra),
void* progress_extra) {
int toku_ft_recount_rows(FT_HANDLE ft,
int (*progress_callback)(uint64_t count,
uint64_t deleted,
void* progress_extra),
void* progress_extra) {
int ret = 0;
recount_rows_extra_t rre = {
progress_callback,
progress_extra,
0,
false
};
recount_rows_extra_t rre = {progress_callback, progress_extra, 0, false};
ft_cursor c;
ret = toku_ft_cursor_create(ft, &c, nullptr, C_READ_ANY, false, false);
if (ret) return ret;
if (ret)
return ret;
toku_ft_cursor_set_check_interrupt_cb(
&c,
recount_rows_interrupt,
&rre);
toku_ft_cursor_set_check_interrupt_cb(&c, recount_rows_interrupt, &rre);
ret = toku_ft_cursor_first(&c, recount_rows_found, &rre);
while (FT_LIKELY(ret == 0)) {
@ -108,6 +98,7 @@ int toku_ft_recount_rows(
if (rre._cancelled == false) {
// update ft count
toku_unsafe_set(&ft->ft->in_memory_logical_rows, rre._keys);
ft->ft->h->dirty = 1;
ret = 0;
}
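// Editor's note (an inference, not stated in the commit): marking the
// header dirty here should ensure the freshly recounted logical row count
// is written back with the header at the next checkpoint instead of being
// lost when the dictionary is closed without further changes.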

View file

@ -903,6 +903,9 @@ void toku_ft_adjust_logical_row_count(FT ft, int64_t delta) {
// must be returned in toku_ft_stat64.
if (delta != 0 && ft->in_memory_logical_rows != (uint64_t)-1) {
toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), delta);
if (ft->in_memory_logical_rows == (uint64_t)-1) {
toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), 1);
}
}
}
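// Editor's note (a reading of the guard above, an assumption): (uint64_t)-1
// appears to act as the "row count unknown" sentinel, so if the adjusted
// counter lands exactly on it, the extra increment nudges it off the
// sentinel value.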

View file

@ -301,7 +301,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error);
// For test purposes only. (In production, the rowset size is determined by negotation with the cachetable for some memory. See #2613.)
// For test purposes only. (In production, the rowset size is determined by negotiation with the cachetable for some memory. See #2613.)
uint64_t toku_ft_loader_get_rowset_budget_for_testing (void);
int toku_ft_loader_finish_extractor(FTLOADER bl);

View file

@ -91,7 +91,7 @@ toku_ft_loader_set_size_factor(uint32_t factor) {
uint64_t
toku_ft_loader_get_rowset_budget_for_testing (void)
// For test purposes only. In production, the rowset size is determined by negotation with the cachetable for some memory. (See #2613).
// For test purposes only. In production, the rowset size is determined by negotiation with the cachetable for some memory. (See #2613).
{
return 16ULL*size_factor*1024ULL;
}

View file

@ -373,52 +373,48 @@ find_bounds_within_message_tree(
}
}
/**
* For each message in the ancestor's buffer (determined by childnum) that
* is key-wise between lower_bound_exclusive and upper_bound_inclusive,
* apply the message to the basement node. We treat the bounds as minus
* or plus infinity respectively if they are NULL. Do not mark the node
* as dirty (preserve previous state of 'dirty' bit).
*/
// For each message in the ancestor's buffer (determined by childnum) that
// is key-wise between lower_bound_exclusive and upper_bound_inclusive,
// apply the message to the basement node. We treat the bounds as minus
// or plus infinity respectively if they are NULL. Do not mark the node
// as dirty (preserve previous state of 'dirty' bit).
static void bnc_apply_messages_to_basement_node(
FT_HANDLE t, // used for comparison function
BASEMENTNODE bn, // where to apply messages
FT_HANDLE t, // used for comparison function
BASEMENTNODE bn, // where to apply messages
FTNODE ancestor, // the ancestor node where we can find messages to apply
int childnum, // which child buffer of ancestor contains messages we want
const pivot_bounds &bounds, // contains pivot key bounds of this basement node
txn_gc_info* gc_info,
bool* msgs_applied) {
int childnum, // which child buffer of ancestor contains messages we want
const pivot_bounds &
bounds, // contains pivot key bounds of this basement node
txn_gc_info *gc_info,
bool *msgs_applied) {
int r;
NONLEAF_CHILDINFO bnc = BNC(ancestor, childnum);
// Determine the offsets in the message trees between which we need to
// apply messages from this buffer
STAT64INFO_S stats_delta = {0,0};
STAT64INFO_S stats_delta = {0, 0};
uint64_t workdone_this_ancestor = 0;
int64_t logical_rows_delta = 0;
uint32_t stale_lbi, stale_ube;
if (!bn->stale_ancestor_messages_applied) {
find_bounds_within_message_tree(
t->ft->cmp,
bnc->stale_message_tree,
&bnc->msg_buffer,
bounds,
&stale_lbi,
&stale_ube);
find_bounds_within_message_tree(t->ft->cmp,
bnc->stale_message_tree,
&bnc->msg_buffer,
bounds,
&stale_lbi,
&stale_ube);
} else {
stale_lbi = 0;
stale_ube = 0;
}
uint32_t fresh_lbi, fresh_ube;
find_bounds_within_message_tree(
t->ft->cmp,
bnc->fresh_message_tree,
&bnc->msg_buffer,
bounds,
&fresh_lbi,
&fresh_ube);
find_bounds_within_message_tree(t->ft->cmp,
bnc->fresh_message_tree,
&bnc->msg_buffer,
bounds,
&fresh_lbi,
&fresh_ube);
// We now know where all the messages we must apply are, so one of the
// following 4 cases will do the application, depending on which of
@ -432,44 +428,53 @@ static void bnc_apply_messages_to_basement_node(
// We have messages in multiple trees, so we grab all
// the relevant messages' offsets and sort them by MSN, then apply
// them in MSN order.
const int buffer_size = ((stale_ube - stale_lbi) +
(fresh_ube - fresh_lbi) +
bnc->broadcast_list.size());
const int buffer_size =
((stale_ube - stale_lbi) + (fresh_ube - fresh_lbi) +
bnc->broadcast_list.size());
toku::scoped_malloc offsets_buf(buffer_size * sizeof(int32_t));
int32_t *offsets = reinterpret_cast<int32_t *>(offsets_buf.get());
struct store_msg_buffer_offset_extra sfo_extra = { .offsets = offsets, .i = 0 };
struct store_msg_buffer_offset_extra sfo_extra = {.offsets = offsets,
.i = 0};
// Populate offsets array with offsets to stale messages
r = bnc->stale_message_tree.iterate_on_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(stale_lbi, stale_ube, &sfo_extra);
r = bnc->stale_message_tree
.iterate_on_range<struct store_msg_buffer_offset_extra,
store_msg_buffer_offset>(
stale_lbi, stale_ube, &sfo_extra);
assert_zero(r);
// Then store fresh offsets, and mark them to be moved to stale later.
r = bnc->fresh_message_tree.iterate_and_mark_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(fresh_lbi, fresh_ube, &sfo_extra);
r = bnc->fresh_message_tree
.iterate_and_mark_range<struct store_msg_buffer_offset_extra,
store_msg_buffer_offset>(
fresh_lbi, fresh_ube, &sfo_extra);
assert_zero(r);
// Store offsets of all broadcast messages.
r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(&sfo_extra);
r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra,
store_msg_buffer_offset>(&sfo_extra);
assert_zero(r);
invariant(sfo_extra.i == buffer_size);
// Sort by MSN.
toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>::mergesort_r(offsets, buffer_size, bnc->msg_buffer);
toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>::
mergesort_r(offsets, buffer_size, bnc->msg_buffer);
// Apply the messages in MSN order.
for (int i = 0; i < buffer_size; ++i) {
*msgs_applied = true;
do_bn_apply_msg(
t,
bn,
&bnc->msg_buffer,
offsets[i],
gc_info,
&workdone_this_ancestor,
&stats_delta,
&logical_rows_delta);
do_bn_apply_msg(t,
bn,
&bnc->msg_buffer,
offsets[i],
gc_info,
&workdone_this_ancestor,
&stats_delta,
&logical_rows_delta);
}
} else if (stale_lbi == stale_ube) {
// No stale messages to apply, we just apply fresh messages, and mark them to be moved to stale later.
// No stale messages to apply, we just apply fresh messages, and mark
// them to be moved to stale later.
struct iterate_do_bn_apply_msg_extra iter_extra = {
.t = t,
.bn = bn,
@ -477,16 +482,20 @@ static void bnc_apply_messages_to_basement_node(
.gc_info = gc_info,
.workdone = &workdone_this_ancestor,
.stats_to_update = &stats_delta,
.logical_rows_delta = &logical_rows_delta
};
if (fresh_ube - fresh_lbi > 0) *msgs_applied = true;
r = bnc->fresh_message_tree.iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(fresh_lbi, fresh_ube, &iter_extra);
.logical_rows_delta = &logical_rows_delta};
if (fresh_ube - fresh_lbi > 0)
*msgs_applied = true;
r = bnc->fresh_message_tree
.iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra,
iterate_do_bn_apply_msg>(
fresh_lbi, fresh_ube, &iter_extra);
assert_zero(r);
} else {
invariant(fresh_lbi == fresh_ube);
// No fresh messages to apply, we just apply stale messages.
if (stale_ube - stale_lbi > 0) *msgs_applied = true;
if (stale_ube - stale_lbi > 0)
*msgs_applied = true;
struct iterate_do_bn_apply_msg_extra iter_extra = {
.t = t,
.bn = bn,
@ -494,22 +503,26 @@ static void bnc_apply_messages_to_basement_node(
.gc_info = gc_info,
.workdone = &workdone_this_ancestor,
.stats_to_update = &stats_delta,
.logical_rows_delta = &logical_rows_delta
};
.logical_rows_delta = &logical_rows_delta};
r = bnc->stale_message_tree.iterate_on_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(stale_lbi, stale_ube, &iter_extra);
r = bnc->stale_message_tree
.iterate_on_range<struct iterate_do_bn_apply_msg_extra,
iterate_do_bn_apply_msg>(
stale_lbi, stale_ube, &iter_extra);
assert_zero(r);
}
//
// update stats
//
if (workdone_this_ancestor > 0) {
(void) toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum), workdone_this_ancestor);
(void)toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum),
workdone_this_ancestor);
}
if (stats_delta.numbytes || stats_delta.numrows) {
toku_ft_update_stats(&t->ft->in_memory_stats, stats_delta);
}
toku_ft_adjust_logical_row_count(t->ft, logical_rows_delta);
bn->logical_rows_delta += logical_rows_delta;
}
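// Editor's summary (not part of the commit): when both the stale and the
// fresh message trees contain relevant messages, the code above collects
// their buffer offsets plus all broadcast messages, sorts the combined set
// by MSN, and applies them in that order, so a basement node always sees
// messages in sequence-number order.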
static void

View file

@ -199,6 +199,7 @@ struct ftnode_leaf_basement_node {
MSN max_msn_applied; // max message sequence number applied
bool stale_ancestor_messages_applied;
STAT64INFO_S stat64_delta; // change in stat64 counters since basement was last written to disk
int64_t logical_rows_delta;
};
typedef struct ftnode_leaf_basement_node *BASEMENTNODE;

View file

@ -46,415 +46,214 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "portability/toku_stdlib.h"
#include "ft/serialize/block_allocator.h"
#include "ft/serialize/block_allocator_strategy.h"
#include "ft/serialize/rbtree_mhs.h"
#if TOKU_DEBUG_PARANOID
#define VALIDATE() validate()
#define VALIDATE() Validate()
#else
#define VALIDATE()
#endif
static FILE *ba_trace_file = nullptr;
void block_allocator::maybe_initialize_trace(void) {
const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH");
if (ba_trace_path != nullptr) {
ba_trace_file = toku_os_fopen(ba_trace_path, "w");
if (ba_trace_file == nullptr) {
fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), "
"but it could not be opened for writing (errno %d)\n",
ba_trace_path, get_maybe_error_errno());
} else {
fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path);
}
}
}
void block_allocator::maybe_close_trace() {
if (ba_trace_file != nullptr) {
int r = toku_os_fclose(ba_trace_file);
if (r != 0) {
fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n",
r, get_maybe_error_errno());
} else {
fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n");
}
}
}
void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) {
// the alignment must be at least 512 and aligned with 512 to work with direct I/O
assert(alignment >= 512 && (alignment % 512) == 0);
void BlockAllocator::CreateInternal(uint64_t reserve_at_beginning,
uint64_t alignment) {
// the alignment must be at least 512 and aligned with 512 to work with
// direct I/O
invariant(alignment >= 512 && (alignment % 512) == 0);
_reserve_at_beginning = reserve_at_beginning;
_alignment = alignment;
_n_blocks = 0;
_blocks_array_size = 1;
XMALLOC_N(_blocks_array_size, _blocks_array);
_n_bytes_in_use = reserve_at_beginning;
_strategy = BA_STRATEGY_FIRST_FIT;
memset(&_trace_lock, 0, sizeof(toku_mutex_t));
toku_mutex_init(&_trace_lock, nullptr);
_tree = new MhsRbTree::Tree(alignment);
}
void BlockAllocator::Create(uint64_t reserve_at_beginning, uint64_t alignment) {
CreateInternal(reserve_at_beginning, alignment);
_tree->Insert({reserve_at_beginning, MAX_BYTE});
VALIDATE();
}
void block_allocator::create(uint64_t reserve_at_beginning, uint64_t alignment) {
_create_internal(reserve_at_beginning, alignment);
_trace_create();
void BlockAllocator::Destroy() {
delete _tree;
}
void block_allocator::destroy() {
toku_free(_blocks_array);
_trace_destroy();
toku_mutex_destroy(&_trace_lock);
}
void block_allocator::set_strategy(enum allocation_strategy strategy) {
_strategy = strategy;
}
void block_allocator::grow_blocks_array_by(uint64_t n_to_add) {
if (_n_blocks + n_to_add > _blocks_array_size) {
uint64_t new_size = _n_blocks + n_to_add;
uint64_t at_least = _blocks_array_size * 2;
if (at_least > new_size) {
new_size = at_least;
}
_blocks_array_size = new_size;
XREALLOC_N(_blocks_array_size, _blocks_array);
}
}
void block_allocator::grow_blocks_array() {
grow_blocks_array_by(1);
}
void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
struct blockpair *pairs, uint64_t n_blocks) {
_create_internal(reserve_at_beginning, alignment);
void BlockAllocator::CreateFromBlockPairs(uint64_t reserve_at_beginning,
uint64_t alignment,
struct BlockPair *translation_pairs,
uint64_t n_blocks) {
CreateInternal(reserve_at_beginning, alignment);
_n_blocks = n_blocks;
grow_blocks_array_by(_n_blocks);
memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair));
std::sort(_blocks_array, _blocks_array + _n_blocks);
for (uint64_t i = 0; i < _n_blocks; i++) {
// Allocator does not support size 0 blocks. See block_allocator_free_block.
invariant(_blocks_array[i].size > 0);
invariant(_blocks_array[i].offset >= _reserve_at_beginning);
invariant(_blocks_array[i].offset % _alignment == 0);
_n_bytes_in_use += _blocks_array[i].size;
struct BlockPair *XMALLOC_N(n_blocks, pairs);
memcpy(pairs, translation_pairs, n_blocks * sizeof(struct BlockPair));
std::sort(pairs, pairs + n_blocks);
if (pairs[0]._offset > reserve_at_beginning) {
_tree->Insert(
{reserve_at_beginning, pairs[0]._offset - reserve_at_beginning});
}
for (uint64_t i = 0; i < _n_blocks; i++) {
// Allocator does not support size 0 blocks. See
// block_allocator_free_block.
invariant(pairs[i]._size > 0);
invariant(pairs[i]._offset >= _reserve_at_beginning);
invariant(pairs[i]._offset % _alignment == 0);
_n_bytes_in_use += pairs[i]._size;
MhsRbTree::OUUInt64 free_size(MAX_BYTE);
MhsRbTree::OUUInt64 free_offset(pairs[i]._offset + pairs[i]._size);
if (i < n_blocks - 1) {
MhsRbTree::OUUInt64 next_offset(pairs[i + 1]._offset);
invariant(next_offset >= free_offset);
free_size = next_offset - free_offset;
if (free_size == 0)
continue;
}
_tree->Insert({free_offset, free_size});
}
toku_free(pairs);
VALIDATE();
_trace_create_from_blockpairs();
}
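// Worked example (editor's sketch with hypothetical offsets): given
// reserve_at_beginning = 4096 and used pairs {4096, 4096} and
// {12288, 4096}, the loop inserts the hole {8192, 4096} between the two
// blocks and the unbounded tail {16384, MAX_BYTE} after the last one.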
// Effect: align a value by rounding up.
static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
static inline uint64_t Align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
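// Worked example (editor's note): Align(5000, 4096)
// = ((5000 + 4095) / 4096) * 4096 = (9095 / 4096) * 4096 = 2 * 4096 = 8192.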
struct block_allocator::blockpair *
block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) {
switch (_strategy) {
case BA_STRATEGY_FIRST_FIT:
return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment);
case BA_STRATEGY_BEST_FIT:
return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment);
case BA_STRATEGY_HEAT_ZONE:
return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat);
case BA_STRATEGY_PADDED_FIT:
return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment);
default:
abort();
}
}
// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) {
struct blockpair *bp;
// Effect: Allocate a block. The resulting block must be aligned on the
// ba->alignment (which to make direct_io happy must be a positive multiple of
// 512).
void BlockAllocator::AllocBlock(uint64_t size,
uint64_t *offset) {
// Allocator does not support size 0 blocks. See block_allocator_free_block.
invariant(size > 0);
grow_blocks_array();
_n_bytes_in_use += size;
*offset = _tree->Remove(size);
uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment);
if (_n_blocks == 0) {
// First and only block
assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use
_blocks_array[0].offset = align(_reserve_at_beginning, _alignment);
_blocks_array[0].size = size;
*offset = _blocks_array[0].offset;
goto done;
} else if (end_of_reserve + size <= _blocks_array[0].offset ) {
// Check to see if the space immediately after the reserve is big enough to hold the new block.
bp = &_blocks_array[0];
memmove(bp + 1, bp, _n_blocks * sizeof(*bp));
bp[0].offset = end_of_reserve;
bp[0].size = size;
*offset = end_of_reserve;
goto done;
}
bp = choose_block_to_alloc_after(size, heat);
if (bp != nullptr) {
// our allocation strategy chose the space after `bp' to fit the new block
uint64_t answer_offset = align(bp->offset + bp->size, _alignment);
uint64_t blocknum = bp - _blocks_array;
invariant(&_blocks_array[blocknum] == bp);
invariant(blocknum < _n_blocks);
memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp));
bp[1].offset = answer_offset;
bp[1].size = size;
*offset = answer_offset;
} else {
// It didn't fit anywhere, so fit it on the end.
assert(_n_blocks < _blocks_array_size);
bp = &_blocks_array[_n_blocks];
uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment);
bp->offset = answer_offset;
bp->size = size;
*offset = answer_offset;
}
done:
_n_blocks++;
VALIDATE();
_trace_alloc(size, heat, *offset);
}
// Find the index in the blocks array that has a particular offset. Requires that the block exist.
// Use binary search so it runs fast.
int64_t block_allocator::find_block(uint64_t offset) {
VALIDATE();
if (_n_blocks == 1) {
assert(_blocks_array[0].offset == offset);
return 0;
}
uint64_t lo = 0;
uint64_t hi = _n_blocks;
while (1) {
assert(lo < hi); // otherwise no such block exists.
uint64_t mid = (lo + hi) / 2;
uint64_t thisoff = _blocks_array[mid].offset;
if (thisoff < offset) {
lo = mid + 1;
} else if (thisoff > offset) {
hi = mid;
} else {
return mid;
}
}
}
// To support 0-sized blocks, we need to include size as an input to this function.
// To support 0-sized blocks, we need to include size as an input to this
// function.
// All 0-sized blocks at the same offset can be considered identical, but
// a 0-sized block can share offset with a non-zero sized block.
// The non-zero sized block is not exchangable with a zero sized block (or vice versa),
// so inserting 0-sized blocks can cause corruption here.
void block_allocator::free_block(uint64_t offset) {
// The non-zero sized block is not exchangeable with a zero sized block (or vice
// versa), so inserting 0-sized blocks can cause corruption here.
void BlockAllocator::FreeBlock(uint64_t offset, uint64_t size) {
VALIDATE();
int64_t bn = find_block(offset);
assert(bn >= 0); // we require that there is a block with that offset.
_n_bytes_in_use -= _blocks_array[bn].size;
memmove(&_blocks_array[bn], &_blocks_array[bn + 1],
(_n_blocks - bn - 1) * sizeof(struct blockpair));
_n_bytes_in_use -= size;
_tree->Insert({offset, size});
_n_blocks--;
VALIDATE();
_trace_free(offset);
}
uint64_t block_allocator::block_size(uint64_t offset) {
int64_t bn = find_block(offset);
assert(bn >=0); // we require that there is a block with that offset.
return _blocks_array[bn].size;
uint64_t BlockAllocator::AllocatedLimit() const {
MhsRbTree::Node *max_node = _tree->MaxNode();
return rbn_offset(max_node).ToInt();
}
uint64_t block_allocator::allocated_limit() const {
if (_n_blocks == 0) {
return _reserve_at_beginning;
} else {
struct blockpair *last = &_blocks_array[_n_blocks - 1];
return last->offset + last->size;
}
}
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Effect: Consider the blocks in sorted order. The reserved block at the
// beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number, return nonzero if b is too big.
int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) {
if (b ==0 ) {
int BlockAllocator::NthBlockInLayoutOrder(uint64_t b,
uint64_t *offset,
uint64_t *size) {
MhsRbTree::Node *x, *y;
if (b == 0) {
*offset = 0;
*size = _reserve_at_beginning;
return 0;
return 0;
} else if (b > _n_blocks) {
return -1;
} else {
*offset =_blocks_array[b - 1].offset;
*size =_blocks_array[b - 1].size;
x = _tree->MinNode();
for (uint64_t i = 1; i <= b; i++) {
y = x;
x = _tree->Successor(x);
}
*size = (rbn_offset(x) - (rbn_offset(y) + rbn_size(y))).ToInt();
*offset = (rbn_offset(y) + rbn_size(y)).ToInt();
return 0;
}
}
struct VisUnusedExtra {
TOKU_DB_FRAGMENTATION _report;
uint64_t _align;
};
static void VisUnusedCollector(void *extra,
MhsRbTree::Node *node,
uint64_t UU(depth)) {
struct VisUnusedExtra *v_e = (struct VisUnusedExtra *)extra;
TOKU_DB_FRAGMENTATION report = v_e->_report;
uint64_t alignm = v_e->_align;
MhsRbTree::OUUInt64 offset = rbn_offset(node);
MhsRbTree::OUUInt64 size = rbn_size(node);
MhsRbTree::OUUInt64 answer_offset(Align(offset.ToInt(), alignm));
uint64_t free_space = (offset + size - answer_offset).ToInt();
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
// Requires: report->file_size_bytes is filled in
// Requires: report->data_bytes is filled in
// Requires: report->checkpoint_bytes_additional is filled in
void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) {
assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional);
void BlockAllocator::UnusedStatistics(TOKU_DB_FRAGMENTATION report) {
invariant(_n_bytes_in_use ==
report->data_bytes + report->checkpoint_bytes_additional);
report->unused_bytes = 0;
report->unused_blocks = 0;
report->largest_unused_block = 0;
if (_n_blocks > 0) {
//Deal with space before block 0 and after reserve:
{
struct blockpair *bp = &_blocks_array[0];
assert(bp->offset >= align(_reserve_at_beginning, _alignment));
uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment);
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space between blocks:
for (uint64_t blocknum = 0; blocknum +1 < _n_blocks; blocknum ++) {
// Consider the space after blocknum
struct blockpair *bp = &_blocks_array[blocknum];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
uint64_t next_offset = bp[1].offset;
uint64_t free_space = next_offset - end_of_this_block;
if (free_space > 0) {
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
//Deal with space after last block
{
struct blockpair *bp = &_blocks_array[_n_blocks-1];
uint64_t this_offset = bp[0].offset;
uint64_t this_size = bp[0].size;
uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
} else {
// No blocks. Just the reserve.
uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment);
if (end_of_this_block < report->file_size_bytes) {
uint64_t free_space = report->file_size_bytes - end_of_this_block;
assert(free_space > 0);
report->unused_bytes += free_space;
report->unused_blocks++;
if (free_space > report->largest_unused_block) {
report->largest_unused_block = free_space;
}
}
}
struct VisUnusedExtra extra = {report, _alignment};
_tree->InOrderVisitor(VisUnusedCollector, &extra);
}
void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) {
report->data_bytes = _n_bytes_in_use;
report->data_blocks = _n_blocks;
void BlockAllocator::Statistics(TOKU_DB_FRAGMENTATION report) {
report->data_bytes = _n_bytes_in_use;
report->data_blocks = _n_blocks;
report->file_size_bytes = 0;
report->checkpoint_bytes_additional = 0;
get_unused_statistics(report);
UnusedStatistics(report);
}
void block_allocator::validate() const {
uint64_t n_bytes_in_use = _reserve_at_beginning;
for (uint64_t i = 0; i < _n_blocks; i++) {
n_bytes_in_use += _blocks_array[i].size;
if (i > 0) {
assert(_blocks_array[i].offset > _blocks_array[i - 1].offset);
assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size );
}
struct ValidateExtra {
uint64_t _bytes;
MhsRbTree::Node *_pre_node;
};
static void VisUsedBlocksInOrder(void *extra,
MhsRbTree::Node *cur_node,
uint64_t UU(depth)) {
struct ValidateExtra *v_e = (struct ValidateExtra *)extra;
MhsRbTree::Node *pre_node = v_e->_pre_node;
// verify no overlaps
if (pre_node) {
invariant(rbn_size(pre_node) > 0);
invariant(rbn_offset(cur_node) >
rbn_offset(pre_node) + rbn_size(pre_node));
MhsRbTree::OUUInt64 used_space =
rbn_offset(cur_node) - (rbn_offset(pre_node) + rbn_size(pre_node));
v_e->_bytes += used_space.ToInt();
} else {
v_e->_bytes += rbn_offset(cur_node).ToInt();
}
assert(n_bytes_in_use == _n_bytes_in_use);
v_e->_pre_node = cur_node;
}
// Tracing
void block_allocator::_trace_create(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n",
this, _reserve_at_beginning, _alignment);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_create_from_blockpairs(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ",
this, _reserve_at_beginning, _alignment);
for (uint64_t i = 0; i < _n_blocks; i++) {
fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ",
_blocks_array[i].offset, _blocks_array[i].size);
}
fprintf(ba_trace_file, "\n");
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_destroy(void) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_destroy %p\n", this);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
this, size, heat, offset);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
}
void block_allocator::_trace_free(uint64_t offset) {
if (ba_trace_file != nullptr) {
toku_mutex_lock(&_trace_lock);
fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset);
toku_mutex_unlock(&_trace_lock);
fflush(ba_trace_file);
}
void BlockAllocator::Validate() const {
_tree->ValidateBalance();
_tree->ValidateMhs();
struct ValidateExtra extra = {0, nullptr};
_tree->InOrderVisitor(VisUsedBlocksInOrder, &extra);
invariant(extra._bytes == _n_bytes_in_use);
}
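// Editor's note (a reading of the visitor above): the tree stores free
// holes, so the bytes in use are reconstructed as the gaps between
// consecutive holes, plus everything before the first hole, and the total
// is checked against _n_bytes_in_use.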

View file

@ -43,6 +43,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "portability/toku_pthread.h"
#include "portability/toku_stdint.h"
#include "portability/toku_stdlib.h"
#include "ft/serialize/rbtree_mhs.h"
// Block allocator.
//
@ -51,151 +52,128 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
// The allocation of block numbers is handled elsewhere.
//
// When creating a block allocator we also specify a certain-sized
// block at the beginning that is preallocated (and cannot be allocated or freed)
// block at the beginning that is preallocated (and cannot be allocated or
// freed)
//
// We can allocate blocks of a particular size at a particular location.
// We can allocate blocks of a particular size at a location chosen by the allocator.
// We can free blocks.
// We can determine the size of a block.
class block_allocator {
public:
#define MAX_BYTE 0xffffffffffffffff
class BlockAllocator {
public:
static const size_t BLOCK_ALLOCATOR_ALIGNMENT = 4096;
// How much must be reserved at the beginning for the block?
// The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root.
// The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1
// pointer for each root.
// So 4096 should be enough.
static const size_t BLOCK_ALLOCATOR_HEADER_RESERVE = 4096;
static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == 0,
static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT ==
0,
"block allocator header must have proper alignment");
static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE =
BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
enum allocation_strategy {
BA_STRATEGY_FIRST_FIT = 1,
BA_STRATEGY_BEST_FIT,
BA_STRATEGY_PADDED_FIT,
BA_STRATEGY_HEAT_ZONE
struct BlockPair {
uint64_t _offset;
uint64_t _size;
BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {}
int operator<(const struct BlockPair &rhs) const {
return _offset < rhs._offset;
}
int operator<(const uint64_t &o) const { return _offset < o; }
};
struct blockpair {
uint64_t offset;
uint64_t size;
blockpair(uint64_t o, uint64_t s) :
offset(o), size(s) {
}
int operator<(const struct blockpair &rhs) const {
return offset < rhs.offset;
}
int operator<(const uint64_t &o) const {
return offset < o;
}
};
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
// The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING
// bytes are not put into a block.
// The default allocation strategy is first fit
// (BA_STRATEGY_FIRST_FIT)
//  All blocks start on a multiple of ALIGNMENT.
// Aborts if we run out of memory.
// Parameters
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// reserve_at_beginning (IN) Size of reserved block at beginning.
// This size does not have to be aligned.
// alignment (IN) Block alignment.
void create(uint64_t reserve_at_beginning, uint64_t alignment);
void Create(uint64_t reserve_at_beginning, uint64_t alignment);
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
// The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
// The allocator is initialized to contain `n_blocks' of blockpairs, taken from `pairs'
// Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING
// bytes are not put into a block.
// The allocator is initialized to contain `n_blocks' of BlockPairs,
// taken from `pairs'
//  All blocks start on a multiple of ALIGNMENT.
// Aborts if we run out of memory.
// Parameters
// pairs, unowned array of pairs to copy
// n_blocks, Size of pairs array
// reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned.
// reserve_at_beginning (IN) Size of reserved block at beginning.
// This size does not have to be aligned.
// alignment (IN) Block alignment.
void create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
struct blockpair *pairs, uint64_t n_blocks);
void CreateFromBlockPairs(uint64_t reserve_at_beginning,
uint64_t alignment,
struct BlockPair *pairs,
uint64_t n_blocks);
// Effect: Destroy this block allocator
void destroy();
void Destroy();
// Effect: Set the allocation strategy that the allocator should use
// Requires: No other threads are operating on this block allocator
void set_strategy(enum allocation_strategy strategy);
// Effect: Allocate a block of the specified size at an address chosen by the allocator.
// Effect: Allocate a block of the specified size at an address chosen by
// the allocator.
// Aborts if anything goes wrong.
// The block address will be a multiple of the alignment.
// Parameters:
// size (IN): The size of the block. (The size does not have to be aligned.)
// size (IN): The size of the block. (The size does not have to be
// aligned.)
// offset (OUT): The location of the block.
// heat (IN): A higher heat means we should be prepared to free this block soon (perhaps in the next checkpoint)
// Heat values are lexiographically ordered (like integers), but their specific values are arbitrary
void alloc_block(uint64_t size, uint64_t heat, uint64_t *offset);
// block soon (perhaps in the next checkpoint)
//              Heat values are lexicographically ordered (like integers),
// but their specific values are arbitrary
void AllocBlock(uint64_t size, uint64_t *offset);
// Effect: Free the block at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// offset (IN): The offset of the block.
void free_block(uint64_t offset);
void FreeBlock(uint64_t offset, uint64_t size);
// Effect: Return the size of the block that starts at offset.
// Requires: There must be a block currently allocated at that offset.
// Parameters:
// offset (IN): The offset of the block.
uint64_t block_size(uint64_t offset);
// Effect: Check to see if the block allocator is OK. This may take a long time.
// Effect: Check to see if the block allocator is OK. This may take a long
// time.
// Usage Hints: Probably only use this for unit tests.
// TODO: Private?
void validate() const;
void Validate() const;
// Effect: Return the unallocated block address of "infinite" size.
// That is, return the smallest address that is above all the allocated blocks.
uint64_t allocated_limit() const;
// That is, return the smallest address that is above all the allocated
// blocks.
uint64_t AllocatedLimit() const;
// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth.
// Effect: Consider the blocks in sorted order. The reserved block at the
// beginning is number 0. The next one is number 1 and so forth.
// Return the offset and size of the block with that number.
// Return 0 if there is a block with that number, return nonzero if b is too big.
// Rationale: This is probably useful only for tests.
int get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size);
int NthBlockInLayoutOrder(uint64_t b, uint64_t *offset, uint64_t *size);
// Effect: Fill in report to indicate how the file is used.
// Requires:
// Requires:
// report->file_size_bytes is filled in
// report->data_bytes is filled in
// report->checkpoint_bytes_additional is filled in
void get_unused_statistics(TOKU_DB_FRAGMENTATION report);
void UnusedStatistics(TOKU_DB_FRAGMENTATION report);
// Effect: Fill in report->data_bytes with the number of bytes in use
// Fill in report->data_blocks with the number of blockpairs in use
// Fill in report->data_blocks with the number of BlockPairs in use
// Fill in unused statistics using this->get_unused_statistics()
// Requires:
// report->file_size is ignored on return
// report->checkpoint_bytes_additional is ignored on return
void get_statistics(TOKU_DB_FRAGMENTATION report);
void Statistics(TOKU_DB_FRAGMENTATION report);
// Block allocator tracing.
// - Enabled by setting TOKU_BA_TRACE_PATH to the file that the trace file
// should be written to.
// - Trace may be replayed by ba_trace_replay tool in tools/ directory
// eg: "cat mytracefile | ba_trace_replay"
static void maybe_initialize_trace();
static void maybe_close_trace();
virtual ~BlockAllocator(){};
private:
void _create_internal(uint64_t reserve_at_beginning, uint64_t alignment);
void grow_blocks_array_by(uint64_t n_to_add);
void grow_blocks_array();
int64_t find_block(uint64_t offset);
struct blockpair *choose_block_to_alloc_after(size_t size, uint64_t heat);
// Tracing
toku_mutex_t _trace_lock;
void _trace_create(void);
void _trace_create_from_blockpairs(void);
void _trace_destroy(void);
void _trace_alloc(uint64_t size, uint64_t heat, uint64_t offset);
void _trace_free(uint64_t offset);
private:
void CreateInternal(uint64_t reserve_at_beginning, uint64_t alignment);
// How much to reserve at the beginning
uint64_t _reserve_at_beginning;
@ -203,12 +181,8 @@ private:
uint64_t _alignment;
// How many blocks
uint64_t _n_blocks;
// How big is the blocks_array. Must be >= n_blocks.
uint64_t _blocks_array_size;
// These blocks are sorted by address.
struct blockpair *_blocks_array;
// Including the reserve_at_beginning
uint64_t _n_bytes_in_use;
// The allocation strategy are we using
enum allocation_strategy _strategy;
// These blocks are sorted by address.
MhsRbTree::Tree *_tree;
};
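// Editor's sketch (not part of the diff): a minimal use of the interface
// above, assuming only the declarations shown here; sizes are illustrative.
static inline void block_allocator_usage_sketch() {
    BlockAllocator ba;
    ba.Create(BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
              BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT);
    uint64_t offset;
    ba.AllocBlock(4096, &offset);  // offset comes back alignment-aligned
    ba.FreeBlock(offset, 4096);    // callers now pass the size explicitly
    ba.Destroy();
}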

View file

@ -1,224 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include <algorithm>
#include <string.h>
#include "portability/toku_assert.h"
#include "ft/serialize/block_allocator_strategy.h"
static uint64_t _align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
static uint64_t _roundup_to_power_of_two(uint64_t value) {
uint64_t r = 4096;
while (r < value) {
r *= 2;
invariant(r > 0);
}
return r;
}
// First fit block allocation
static struct block_allocator::blockpair *
_first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t max_padding) {
if (n_blocks == 1) {
// won't enter loop, can't underflow the direction < 0 case
return nullptr;
}
struct block_allocator::blockpair *bp = &blocks_array[0];
for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
n_spaces_to_check--, bp++) {
// Consider the space after bp
uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
if (possible_offset + size <= bp[1].offset) { // bp[1] is always valid since bp < &blocks_array[n_blocks-1]
invariant(bp - blocks_array < (int64_t) n_blocks);
return bp;
}
}
return nullptr;
}
static struct block_allocator::blockpair *
_first_fit_bw(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t max_padding, struct block_allocator::blockpair *blocks_array_limit) {
if (n_blocks == 1) {
// won't enter loop, can't underflow the direction < 0 case
return nullptr;
}
struct block_allocator::blockpair *bp = &blocks_array[-1];
for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
n_spaces_to_check--, bp--) {
// Consider the space after bp
uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
if (&bp[1] < blocks_array_limit && possible_offset + size <= bp[1].offset) {
invariant(blocks_array - bp < (int64_t) n_blocks);
return bp;
}
}
return nullptr;
}
struct block_allocator::blockpair *
block_allocator_strategy::first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
return _first_fit(blocks_array, n_blocks, size, alignment, 0);
}
// Best fit block allocation
struct block_allocator::blockpair *
block_allocator_strategy::best_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
struct block_allocator::blockpair *best_bp = nullptr;
uint64_t best_hole_size = 0;
for (uint64_t blocknum = 0; blocknum + 1 < n_blocks; blocknum++) {
// Consider the space after blocknum
struct block_allocator::blockpair *bp = &blocks_array[blocknum];
uint64_t possible_offset = _align(bp->offset + bp->size, alignment);
uint64_t possible_end_offset = possible_offset + size;
if (possible_end_offset <= bp[1].offset) {
// It fits here. Is it the best fit?
uint64_t hole_size = bp[1].offset - possible_end_offset;
if (best_bp == nullptr || hole_size < best_hole_size) {
best_hole_size = hole_size;
best_bp = bp;
}
}
}
return best_bp;
}
static uint64_t padded_fit_alignment = 4096;
// TODO: These compiler specific directives should be abstracted in a portability header
// portability/toku_compiler.h?
__attribute__((__constructor__))
static void determine_padded_fit_alignment_from_env(void) {
// TODO: Should be in portability as 'toku_os_getenv()?'
const char *s = getenv("TOKU_BA_PADDED_FIT_ALIGNMENT");
if (s != nullptr && strlen(s) > 0) {
const int64_t alignment = strtoll(s, nullptr, 10);
if (alignment <= 0) {
fprintf(stderr, "tokuft: error: block allocator padded fit alignment found in environment (%s), "
"but it's out of range (should be an integer > 0). defaulting to %" PRIu64 "\n",
s, padded_fit_alignment);
} else {
padded_fit_alignment = _roundup_to_power_of_two(alignment);
fprintf(stderr, "tokuft: setting block allocator padded fit alignment to %" PRIu64 "\n",
padded_fit_alignment);
}
}
}
// First fit into a block that is oversized by up to max_padding.
// The hope is that if we purposefully waste a bit of space at allocation
// time we'll be more likely to reuse this block later.
struct block_allocator::blockpair *
block_allocator_strategy::padded_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment) {
return _first_fit(blocks_array, n_blocks, size, alignment, padded_fit_alignment);
}
static double hot_zone_threshold = 0.85;
// TODO: These compiler specific directives should be abstracted in a portability header
// portability/toku_compiler.h?
__attribute__((__constructor__))
static void determine_hot_zone_threshold_from_env(void) {
// TODO: Should be in portability as 'toku_os_getenv()?'
const char *s = getenv("TOKU_BA_HOT_ZONE_THRESHOLD");
if (s != nullptr && strlen(s) > 0) {
const double hot_zone = strtod(s, nullptr);
if (hot_zone < 1 || hot_zone > 99) {
fprintf(stderr, "tokuft: error: block allocator hot zone threshold found in environment (%s), "
"but it's out of range (should be an integer 1 through 99). defaulting to 85\n", s);
hot_zone_threshold = 85 / 100;
} else {
fprintf(stderr, "tokuft: setting block allocator hot zone threshold to %s\n", s);
hot_zone_threshold = hot_zone / 100;
}
}
}
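Both knobs are parsed once at library load by the constructors above; a hedged illustration with hypothetical values:
// TOKU_BA_PADDED_FIT_ALIGNMENT=1000 -> rounded up to a power of two: 4096
// TOKU_BA_PADDED_FIT_ALIGNMENT=5000 -> 8192
// TOKU_BA_HOT_ZONE_THRESHOLD=70     -> hot_zone_threshold = 0.70
// TOKU_BA_HOT_ZONE_THRESHOLD=150    -> rejected; the 0.85 default is kept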
struct block_allocator::blockpair *
block_allocator_strategy::heat_zone(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t heat) {
if (heat > 0) {
struct block_allocator::blockpair *bp, *boundary_bp;
// Hot allocation. Find the beginning of the hot zone.
boundary_bp = &blocks_array[n_blocks - 1];
uint64_t highest_offset = _align(boundary_bp->offset + boundary_bp->size, alignment);
uint64_t hot_zone_offset = static_cast<uint64_t>(hot_zone_threshold * highest_offset);
boundary_bp = std::lower_bound(blocks_array, blocks_array + n_blocks, hot_zone_offset);
uint64_t blocks_in_zone = (blocks_array + n_blocks) - boundary_bp;
uint64_t blocks_outside_zone = boundary_bp - blocks_array;
invariant(blocks_in_zone + blocks_outside_zone == n_blocks);
if (blocks_in_zone > 0) {
// Find the first fit in the hot zone, going forward.
bp = _first_fit(boundary_bp, blocks_in_zone, size, alignment, 0);
if (bp != nullptr) {
return bp;
}
}
if (blocks_outside_zone > 0) {
// Find the first fit in the cold zone, going backwards.
bp = _first_fit_bw(boundary_bp, blocks_outside_zone, size, alignment, 0, &blocks_array[n_blocks]);
if (bp != nullptr) {
return bp;
}
}
} else {
// Cold allocations are simply first-fit from the beginning.
return _first_fit(blocks_array, n_blocks, size, alignment, 0);
}
return nullptr;
}
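To make the split concrete (values illustrative): if the last block ends at aligned offset 1000 and hot_zone_threshold is 0.85, then hot_zone_offset is 850 and std::lower_bound splits the sorted array at the first block at or past that offset. A hot allocation (heat > 0) first scans forward through that tail with _first_fit, and only if no hole fits does _first_fit_bw walk backward through the colder blocks below 850. A cold allocation ignores the split and simply first-fits from offset 0.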

File diff suppressed because it is too large

View file

@ -62,13 +62,16 @@ enum {
RESERVED_BLOCKNUMS
};
typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b, int64_t size, int64_t address, void *extra);
typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b,
int64_t size,
int64_t address,
void *extra);
static inline BLOCKNUM make_blocknum(int64_t b) {
BLOCKNUM result = { .b = b };
BLOCKNUM result = {.b = b};
return result;
}
static const BLOCKNUM ROLLBACK_NONE = { .b = 0 };
static const BLOCKNUM ROLLBACK_NONE = {.b = 0};
/**
* There are three copies of the translation table (btt) in the block table:
@ -80,18 +83,20 @@ static const BLOCKNUM ROLLBACK_NONE = { .b = 0 };
*
* inprogress Is only filled by copying from current,
* and is the only version ever serialized to disk.
* (It is serialized to disk on checkpoint and clean shutdown.)
* (It is serialized to disk on checkpoint and clean
*shutdown.)
* At end of checkpoint it replaces 'checkpointed'.
* During a checkpoint, any 'pending' dirty writes will update
* inprogress.
*
* current Is initialized by copying from checkpointed,
* is the only version ever modified while the database is in use,
* is the only version ever modified while the database is in
*use,
* and is the only version ever copied to inprogress.
* It is never stored on disk.
*/
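Read concretely, the lifecycle above is: ordinary client writes dirty only current; at checkpoint begin, current is copied to inprogress, the only copy ever serialized; pending dirty writes during the checkpoint update inprogress as well; and at checkpoint end, inprogress replaces checkpointed, whose blocks stay inviolate on disk until then. (This restates the comment, not new behavior introduced by the commit.)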
class block_table {
public:
public:
enum translation_type {
TRANSLATION_NONE = 0,
TRANSLATION_CURRENT,
@ -102,7 +107,10 @@ public:
void create();
int create_from_buffer(int fd, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
int create_from_buffer(int fd,
DISKOFF location_on_disk,
DISKOFF size_on_disk,
unsigned char *translation_buffer);
void destroy();
@ -114,11 +122,21 @@ public:
// Blocknums
void allocate_blocknum(BLOCKNUM *res, struct ft *ft);
void realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, int fd, bool for_checkpoint, uint64_t heat);
void realloc_on_disk(BLOCKNUM b,
DISKOFF size,
DISKOFF *offset,
struct ft *ft,
int fd,
bool for_checkpoint);
void free_blocknum(BLOCKNUM *b, struct ft *ft, bool for_checkpoint);
void translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
void translate_blocknum_to_offset_size(BLOCKNUM b,
DISKOFF *offset,
DISKOFF *size);
void free_unused_blocknums(BLOCKNUM root);
void realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, struct ft *ft, int fd);
void realloc_descriptor_on_disk(DISKOFF size,
DISKOFF *offset,
struct ft *ft,
int fd);
void get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size);
// External verification
@ -127,15 +145,22 @@ public:
void verify_no_free_blocknums();
// Serialization
void serialize_translation_to_wbuf(int fd, struct wbuf *w, int64_t *address, int64_t *size);
void serialize_translation_to_wbuf(int fd,
struct wbuf *w,
int64_t *address,
int64_t *size);
// DEBUG ONLY (ftdump included), tests included
void blocknum_dump_translation(BLOCKNUM b);
void dump_translation_table_pretty(FILE *f);
void dump_translation_table(FILE *f);
void block_free(uint64_t offset);
void block_free(uint64_t offset, uint64_t size);
int iterate(enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only);
int iterate(enum translation_type type,
BLOCKTABLE_CALLBACK f,
void *extra,
bool data_only,
bool used_only);
void internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep);
// Requires: blocktable lock is held.
@ -146,13 +171,16 @@ public:
void get_info64(struct ftinfo64 *);
int iterate_translation_tables(uint64_t, int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), void *);
int iterate_translation_tables(
uint64_t,
int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *),
void *);
private:
private:
struct block_translation_pair {
// If in the freelist, use next_free_blocknum, otherwise diskoff.
union {
DISKOFF diskoff;
DISKOFF diskoff;
BLOCKNUM next_free_blocknum;
} u;
@ -173,7 +201,8 @@ private:
struct translation {
enum translation_type type;
// Number of elements in array (block_translation). always >= smallest_never_used_blocknum
// Number of elements in array (block_translation). always >=
// smallest_never_used_blocknum
int64_t length_of_array;
BLOCKNUM smallest_never_used_blocknum;
@ -181,20 +210,28 @@ private:
BLOCKNUM blocknum_freelist_head;
struct block_translation_pair *block_translation;
// size_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
// location_on is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
// size_on_disk is stored in
// block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
// location_on is stored in
// block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
};
void _create_internal();
int _translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize
DISKOFF location_on_disk, // location of translation_buffer
uint64_t size_on_disk,
unsigned char * translation_buffer); // buffer with serialized translation
int _translation_deserialize_from_buffer(
struct translation *t, // destination into which to deserialize
DISKOFF location_on_disk, // location of translation_buffer
uint64_t size_on_disk,
unsigned char *
translation_buffer); // buffer with serialized translation
void _copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype);
void _copy_translation(struct translation *dst,
struct translation *src,
enum translation_type newtype);
void _maybe_optimize_translation(struct translation *t);
void _maybe_expand_translation(struct translation *t);
bool _translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair);
bool _translation_prevents_freeing(struct translation *t,
BLOCKNUM b,
struct block_translation_pair *old_pair);
void _free_blocknum_in_translation(struct translation *t, BLOCKNUM b);
int64_t _calculate_size_on_disk(struct translation *t);
bool _pair_is_unallocated(struct block_translation_pair *pair);
@ -203,14 +240,26 @@ private:
// Blocknum management
void _allocate_blocknum_unlocked(BLOCKNUM *res, struct ft *ft);
void _free_blocknum_unlocked(BLOCKNUM *bp, struct ft *ft, bool for_checkpoint);
void _realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, struct ft *ft);
void _realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, bool for_checkpoint, uint64_t heat);
void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
void _free_blocknum_unlocked(BLOCKNUM *bp,
struct ft *ft,
bool for_checkpoint);
void _realloc_descriptor_on_disk_unlocked(DISKOFF size,
DISKOFF *offset,
struct ft *ft);
void _realloc_on_disk_internal(BLOCKNUM b,
DISKOFF size,
DISKOFF *offset,
struct ft *ft,
bool for_checkpoint);
void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b,
DISKOFF *offset,
DISKOFF *size);
// File management
void _maybe_truncate_file(int fd, uint64_t size_needed_before);
void _ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset);
void _ensure_safe_write_unlocked(int fd,
DISKOFF block_size,
DISKOFF block_offset);
// Verification
bool _is_valid_blocknum(struct translation *t, BLOCKNUM b);
@ -220,29 +269,33 @@ private:
bool _no_data_blocks_except_root(BLOCKNUM root);
bool _blocknum_allocated(BLOCKNUM b);
// Locking
// Locking
//
// TODO: Move the lock to the FT
void _mutex_lock();
void _mutex_unlock();
// The current translation is the one used by client threads.
// The current translation is the one used by client threads.
// It is not represented on disk.
struct translation _current;
// The translation used by the checkpoint currently in progress.
// If the checkpoint thread allocates a block, it must also update the current translation.
// The translation used by the checkpoint currently in progress.
// If the checkpoint thread allocates a block, it must also update the
// current translation.
struct translation _inprogress;
// The translation for the data that shall remain inviolate on disk until the next checkpoint finishes,
// The translation for the data that shall remain inviolate on disk until
// the next checkpoint finishes,
// after which any blocks used only in this translation can be freed.
struct translation _checkpointed;
// The in-memory data structure for block allocation.
// The in-memory data structure for block allocation.
// There is no on-disk data structure for block allocation.
// Note: This is *allocation* not *translation* - the block allocator is unaware of which
// blocks are used for which translation, but simply allocates and deallocates blocks.
block_allocator _bt_block_allocator;
// Note: This is *allocation* not *translation* - the block allocator is
// unaware of which
// blocks are used for which translation, but simply allocates and
// deallocates blocks.
BlockAllocator *_bt_block_allocator;
toku_mutex_t _mutex;
struct nb_mutex _safe_file_size_lock;
bool _checkpoint_skipped;
@ -257,16 +310,16 @@ private:
#include "ft/serialize/wbuf.h"
static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
static inline void wbuf_BLOCKNUM(struct wbuf *w, BLOCKNUM b) {
wbuf_ulonglong(w, b.b);
}
static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
static inline void wbuf_nocrc_BLOCKNUM(struct wbuf *w, BLOCKNUM b) {
wbuf_nocrc_ulonglong(w, b.b);
}
static inline void wbuf_DISKOFF(struct wbuf *wb, DISKOFF off) {
wbuf_ulonglong(wb, (uint64_t) off);
wbuf_ulonglong(wb, (uint64_t)off);
}
#include "ft/serialize/rbuf.h"
@ -280,6 +333,8 @@ static inline BLOCKNUM rbuf_blocknum(struct rbuf *rb) {
return result;
}
static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb, memarena *UU(ma), BLOCKNUM *blocknum) {
static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb,
memarena *UU(ma),
BLOCKNUM *blocknum) {
*blocknum = rbuf_blocknum(rb);
}
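To make the iterator contract above concrete, here is a hedged sketch; the callback body, its size > 0 test for used blocks, and the stop-on-nonzero return convention are assumptions for illustration, not taken from the commit:
static int count_used(BLOCKNUM b, int64_t size, int64_t address, void *extra) {
    (void)b;
    (void)address;
    if (size > 0)  // assumption: used blocks report a positive size
        ++*static_cast<int64_t *>(extra);
    return 0;  // assumed convention: a nonzero return stops the iteration
}
// ...
int64_t n_used = 0;
bt->iterate(block_table::TRANSLATION_CURRENT, count_used, &n_used, true, true);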

View file

@ -235,7 +235,7 @@ void toku_decompress (Bytef *dest, uLongf destLen,
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
char windowBits = source[1];
int8_t windowBits = source[1];
int r = inflateInit2(&strm, windowBits);
lazy_assert(r == Z_OK);
strm.next_out = dest;

View file

@ -217,8 +217,8 @@ int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
// translation table itself won't fit in main memory.
ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
translation_address_on_disk);
assert(readsz >= translation_size_on_disk);
assert(readsz <= (ssize_t)size_to_read);
invariant(readsz >= translation_size_on_disk);
invariant(readsz <= (ssize_t)size_to_read);
}
// Create table and read in data.
r = ft->blocktable.create_from_buffer(fd,
@ -411,73 +411,90 @@ exit:
return r;
}
static size_t
serialize_ft_min_size (uint32_t version) {
static size_t serialize_ft_min_size(uint32_t version) {
size_t size = 0;
switch(version) {
case FT_LAYOUT_VERSION_29:
size += sizeof(uint64_t); // logrows in ft
case FT_LAYOUT_VERSION_28:
size += sizeof(uint32_t); // fanout in ft
case FT_LAYOUT_VERSION_27:
case FT_LAYOUT_VERSION_26:
case FT_LAYOUT_VERSION_25:
case FT_LAYOUT_VERSION_24:
case FT_LAYOUT_VERSION_23:
case FT_LAYOUT_VERSION_22:
case FT_LAYOUT_VERSION_21:
size += sizeof(MSN); // max_msn_in_ft
case FT_LAYOUT_VERSION_20:
case FT_LAYOUT_VERSION_19:
size += 1; // compression method
size += sizeof(MSN); // highest_unused_msn_for_upgrade
case FT_LAYOUT_VERSION_18:
size += sizeof(uint64_t); // time_of_last_optimize_begin
size += sizeof(uint64_t); // time_of_last_optimize_end
size += sizeof(uint32_t); // count_of_optimize_in_progress
size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
size -= 8; // removed num_blocks_to_upgrade_14
size -= 8; // removed num_blocks_to_upgrade_13
case FT_LAYOUT_VERSION_17:
size += 16;
invariant(sizeof(STAT64INFO_S) == 16);
case FT_LAYOUT_VERSION_16:
case FT_LAYOUT_VERSION_15:
size += 4; // basement node size
size += 8; // num_blocks_to_upgrade_14 (previously num_blocks_to_upgrade, now one int each for upgrade from 13, 14
size += 8; // time of last verification
case FT_LAYOUT_VERSION_14:
size += 8; //TXNID that created
case FT_LAYOUT_VERSION_13:
size += ( 4 // build_id
+4 // build_id_original
+8 // time_of_creation
+8 // time_of_last_modification
);
switch (version) {
case FT_LAYOUT_VERSION_29:
size += sizeof(uint64_t); // logrows in ft
case FT_LAYOUT_VERSION_28:
size += sizeof(uint32_t); // fanout in ft
case FT_LAYOUT_VERSION_27:
case FT_LAYOUT_VERSION_26:
case FT_LAYOUT_VERSION_25:
case FT_LAYOUT_VERSION_24:
case FT_LAYOUT_VERSION_23:
case FT_LAYOUT_VERSION_22:
case FT_LAYOUT_VERSION_21:
size += sizeof(MSN); // max_msn_in_ft
case FT_LAYOUT_VERSION_20:
case FT_LAYOUT_VERSION_19:
size += 1; // compression method
size += sizeof(MSN); // highest_unused_msn_for_upgrade
case FT_LAYOUT_VERSION_18:
size += sizeof(uint64_t); // time_of_last_optimize_begin
size += sizeof(uint64_t); // time_of_last_optimize_end
size += sizeof(uint32_t); // count_of_optimize_in_progress
size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
size -= 8; // removed num_blocks_to_upgrade_14
size -= 8; // removed num_blocks_to_upgrade_13
case FT_LAYOUT_VERSION_17:
size += 16;
invariant(sizeof(STAT64INFO_S) == 16);
case FT_LAYOUT_VERSION_16:
case FT_LAYOUT_VERSION_15:
size += 4; // basement node size
size += 8; // num_blocks_to_upgrade_14 (previously
// num_blocks_to_upgrade, now one int each for upgrade
// from 13, 14
size += 8; // time of last verification
case FT_LAYOUT_VERSION_14:
size += 8; // TXNID that created
case FT_LAYOUT_VERSION_13:
size += (4 // build_id
+
4 // build_id_original
+
8 // time_of_creation
+
8 // time_of_last_modification
);
// fall through
case FT_LAYOUT_VERSION_12:
size += (+8 // "tokudata"
+4 // version
+4 // original_version
+4 // size
+8 // byte order verification
+8 // checkpoint_count
+8 // checkpoint_lsn
+4 // tree's nodesize
+8 // translation_size_on_disk
+8 // translation_address_on_disk
+4 // checksum
+8 // Number of blocks in old version.
+8 // diskoff
+4 // flags
);
break;
default:
abort();
case FT_LAYOUT_VERSION_12:
size += (+8 // "tokudata"
+
4 // version
+
4 // original_version
+
4 // size
+
8 // byte order verification
+
8 // checkpoint_count
+
8 // checkpoint_lsn
+
4 // tree's nodesize
+
8 // translation_size_on_disk
+
8 // translation_address_on_disk
+
4 // checksum
+
8 // Number of blocks in old version.
+
8 // diskoff
+
4 // flags
);
break;
default:
abort();
}
lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
return size;
}
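Because the switch falls through, each version's minimum size is the sum of every increment at its case and below; two spot checks read directly off the cases above:
// serialize_ft_min_size(FT_LAYOUT_VERSION_13)
//     == serialize_ft_min_size(FT_LAYOUT_VERSION_12) + 4 + 4 + 8 + 8
// serialize_ft_min_size(FT_LAYOUT_VERSION_14)
//     == serialize_ft_min_size(FT_LAYOUT_VERSION_13) + 8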
@ -486,7 +503,7 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
struct rbuf *rb,
uint64_t *checkpoint_count,
LSN *checkpoint_lsn,
uint32_t * version_p)
uint32_t *version_p)
// Effect: Read and parse the header of a fractal tree
//
// Simply reading the raw bytes of the header into an rbuf is insensitive
@ -496,18 +513,18 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
// file AND the header is useless
{
int r = 0;
const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version
4 + // build_id
4; // size
const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version
4 + // build_id
4; // size
const int64_t read_size = roundup_to_multiple(512, prefix_size);
unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
rb->buf = NULL;
int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
if (n != read_size) {
if (n==0) {
if (n == 0) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else if (n<0) {
} else if (n < 0) {
r = get_error_errno();
} else {
r = EINVAL;
@ -518,95 +535,102 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
rbuf_init(rb, prefix, prefix_size);
//Check magic number
// Check magic number
const void *magic;
rbuf_literal_bytes(rb, &magic, 8);
if (memcmp(magic,"tokudata",8)!=0) {
if ((*(uint64_t*)magic) == 0) {
if (memcmp(magic, "tokudata", 8) != 0) {
if ((*(uint64_t *)magic) == 0) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else {
r = EINVAL; //Not a tokudb file! Do not use.
r = EINVAL; // Not a tokudb file! Do not use.
}
goto exit;
}
//Version MUST be in network order regardless of disk order.
// Version MUST be in network order regardless of disk order.
uint32_t version;
version = rbuf_network_int(rb);
*version_p = version;
if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) {
r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
r = TOKUDB_DICTIONARY_TOO_OLD; // Cannot use
goto exit;
} else if (version > FT_LAYOUT_VERSION) {
r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
r = TOKUDB_DICTIONARY_TOO_NEW; // Cannot use
goto exit;
}
//build_id MUST be in network order regardless of disk order.
// build_id MUST be in network order regardless of disk order.
uint32_t build_id __attribute__((__unused__));
build_id = rbuf_network_int(rb);
int64_t min_header_size;
min_header_size = serialize_ft_min_size(version);
//Size MUST be in network order regardless of disk order.
// Size MUST be in network order regardless of disk order.
uint32_t size;
size = rbuf_network_int(rb);
//If too big, it is corrupt. We would probably notice during checksum
//but may have to do a multi-gigabyte malloc+read to find out.
//If its too small reading rbuf would crash, so verify.
if (size > block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE || size < min_header_size) {
// If too big, it is corrupt. We would probably notice during checksum
// but may have to do a multi-gigabyte malloc+read to find out.
// If it's too small, reading the rbuf would crash, so verify.
if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE ||
size < min_header_size) {
r = TOKUDB_DICTIONARY_NO_HEADER;
goto exit;
}
lazy_assert(rb->ndone==prefix_size);
lazy_assert(rb->ndone == prefix_size);
rb->size = size;
{
toku_free(rb->buf);
uint32_t size_to_read = roundup_to_multiple(512, size);
XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
assert(offset_of_header%512==0);
invariant(offset_of_header % 512 == 0);
n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
if (n != size_to_read) {
if (n < 0) {
r = get_error_errno();
} else {
r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
r = EINVAL; // Header might be useless (wrong size) or could be
// a disk read error.
}
goto exit;
}
}
//It's version 14 or later. Magic looks OK.
//We have an rbuf that represents the header.
//Size is within acceptable bounds.
// It's version 14 or later. Magic looks OK.
// We have an rbuf that represents the header.
// Size is within acceptable bounds.
//Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function changed)
// Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function
// changed)
uint32_t calculated_x1764;
calculated_x1764 = toku_x1764_memory(rb->buf, rb->size-4);
calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4);
uint32_t stored_x1764;
stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4));
if (calculated_x1764 != stored_x1764) {
r = TOKUDB_BAD_CHECKSUM; //Header useless
fprintf(stderr, "Header checksum failure: calc=0x%08x read=0x%08x\n", calculated_x1764, stored_x1764);
r = TOKUDB_BAD_CHECKSUM; // Header useless
fprintf(stderr,
"Header checksum failure: calc=0x%08x read=0x%08x\n",
calculated_x1764,
stored_x1764);
goto exit;
}
//Verify byte order
// Verify byte order
const void *tmp_byte_order_check;
lazy_assert((sizeof toku_byte_order_host) == 8);
rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
rbuf_literal_bytes(
rb, &tmp_byte_order_check, 8); // Must not translate byte order
int64_t byte_order_stored;
byte_order_stored = *(int64_t*)tmp_byte_order_check;
byte_order_stored = *(int64_t *)tmp_byte_order_check;
if (byte_order_stored != toku_byte_order_host) {
r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary
r = TOKUDB_DICTIONARY_NO_HEADER; // Cannot use dictionary
goto exit;
}
//Load checkpoint count
// Load checkpoint count
*checkpoint_count = rbuf_ulonglong(rb);
*checkpoint_lsn = rbuf_LSN(rb);
//Restart at beginning during regular deserialization
// Restart at beginning during regular deserialization
rb->ndone = 0;
exit:
@ -620,11 +644,7 @@ exit:
// Read ft from file into struct. Read both headers and use one.
// We want the latest acceptable header whose checkpoint_lsn is no later
// than max_acceptable_lsn.
int
toku_deserialize_ft_from(int fd,
LSN max_acceptable_lsn,
FT *ft)
{
int toku_deserialize_ft_from(int fd, LSN max_acceptable_lsn, FT *ft) {
struct rbuf rb_0;
struct rbuf rb_1;
uint64_t checkpoint_count_0 = 0;
@ -638,13 +658,23 @@ toku_deserialize_ft_from(int fd,
int r0, r1, r;
toku_off_t header_0_off = 0;
r0 = deserialize_ft_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0);
r0 = deserialize_ft_from_fd_into_rbuf(fd,
header_0_off,
&rb_0,
&checkpoint_count_0,
&checkpoint_lsn_0,
&version_0);
if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) {
h0_acceptable = true;
}
toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
r1 = deserialize_ft_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1);
toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
r1 = deserialize_ft_from_fd_into_rbuf(fd,
header_1_off,
&rb_1,
&checkpoint_count_1,
&checkpoint_lsn_1,
&version_1);
if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) {
h1_acceptable = true;
}
@ -655,24 +685,29 @@ toku_deserialize_ft_from(int fd,
// We were unable to read either header or at least one is too
// new. Certain errors are higher priority than others. Order of
// these if/else if is important.
if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW) {
if (r0 == TOKUDB_DICTIONARY_TOO_NEW ||
r1 == TOKUDB_DICTIONARY_TOO_NEW) {
r = TOKUDB_DICTIONARY_TOO_NEW;
} else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || r1 == TOKUDB_DICTIONARY_TOO_OLD) {
} else if (r0 == TOKUDB_DICTIONARY_TOO_OLD ||
r1 == TOKUDB_DICTIONARY_TOO_OLD) {
r = TOKUDB_DICTIONARY_TOO_OLD;
} else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) {
fprintf(stderr, "Both header checksums failed.\n");
r = TOKUDB_BAD_CHECKSUM;
} else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || r1 == TOKUDB_DICTIONARY_NO_HEADER) {
} else if (r0 == TOKUDB_DICTIONARY_NO_HEADER ||
r1 == TOKUDB_DICTIONARY_NO_HEADER) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else {
r = r0 ? r0 : r1; //Arbitrarily report the error from the
//first header, unless it's readable
r = r0 ? r0 : r1; // Arbitrarily report the error from the
// first header, unless it's readable
}
// it should not be possible for both headers to be later than the max_acceptable_lsn
invariant(!((r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
(r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
invariant(r!=0);
// it should not be possible for both headers to be later than the
// max_acceptable_lsn
invariant(
!((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
(r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
invariant(r != 0);
goto exit;
}
@ -682,8 +717,7 @@ toku_deserialize_ft_from(int fd,
invariant(version_0 >= version_1);
rb = &rb_0;
version = version_0;
}
else {
} else {
invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
invariant(version_1 >= version_0);
rb = &rb_1;
@ -692,14 +726,18 @@ toku_deserialize_ft_from(int fd,
} else if (h0_acceptable) {
if (r1 == TOKUDB_BAD_CHECKSUM) {
// print something reassuring
fprintf(stderr, "Header 2 checksum failed, but header 1 ok. Proceeding.\n");
fprintf(
stderr,
"Header 2 checksum failed, but header 1 ok. Proceeding.\n");
}
rb = &rb_0;
version = version_0;
} else if (h1_acceptable) {
if (r0 == TOKUDB_BAD_CHECKSUM) {
// print something reassuring
fprintf(stderr, "Header 1 checksum failed, but header 2 ok. Proceeding.\n");
fprintf(
stderr,
"Header 1 checksum failed, but header 2 ok. Proceeding.\n");
}
rb = &rb_1;
version = version_1;
@ -718,15 +756,13 @@ exit:
return r;
}
size_t toku_serialize_ft_size (FT_HEADER h) {
size_t toku_serialize_ft_size(FT_HEADER h) {
size_t size = serialize_ft_min_size(h->layout_version);
//There is no dynamic data.
lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
// There is no dynamic data.
lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
return size;
}
void toku_serialize_ft_to_wbuf (
struct wbuf *wbuf,
FT_HEADER h,
@ -771,52 +807,60 @@ void toku_serialize_ft_to_wbuf (
}
void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) {
lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS);
lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS);
struct wbuf w_translation;
int64_t size_translation;
int64_t address_translation;
// Must serialize translation first, to get address,size for header.
bt->serialize_translation_to_wbuf(fd, &w_translation,
&address_translation,
&size_translation);
assert(size_translation == w_translation.ndone);
bt->serialize_translation_to_wbuf(
fd, &w_translation, &address_translation, &size_translation);
invariant(size_translation == w_translation.ndone);
// the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized.
assert(w_translation.size % 512 == 0);
// the number of bytes available in the buffer is 0 mod 512, and those last
// bytes are all initialized.
invariant(w_translation.size % 512 == 0);
struct wbuf w_main;
size_t size_main = toku_serialize_ft_size(h);
size_t size_main = toku_serialize_ft_size(h);
size_t size_main_aligned = roundup_to_multiple(512, size_main);
assert(size_main_aligned<block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
invariant(size_main_aligned <
BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros
for (size_t i = size_main; i < size_main_aligned; i++)
mainbuf[i] = 0; // initialize the end of the buffer with zeros
wbuf_init(&w_main, mainbuf, size_main);
toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation);
toku_serialize_ft_to_wbuf(
&w_main, h, address_translation, size_translation);
lazy_assert(w_main.ndone == size_main);
// Actually write translation table
// This write is guaranteed to read good data at the end of the buffer, since the
// This write is guaranteed to read good data at the end of the buffer,
// since the
// w_translation.buf is padded with zeros to a 512-byte boundary.
toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation);
toku_os_full_pwrite(fd,
w_translation.buf,
roundup_to_multiple(512, size_translation),
address_translation);
//Everything but the header MUST be on disk before header starts.
//Otherwise we will think the header is good and some blocks might not
//yet be on disk.
//If the header has a cachefile we need to do cachefile fsync (to
//prevent crash if we redirected to dev null)
//If there is no cachefile we still need to do an fsync.
// Everything but the header MUST be on disk before header starts.
// Otherwise we will think the header is good and some blocks might not
// yet be on disk.
// If the header has a cachefile we need to do cachefile fsync (to
// prevent crash if we redirected to dev null)
// If there is no cachefile we still need to do an fsync.
if (cf) {
toku_cachefile_fsync(cf);
}
else {
} else {
toku_file_fsync(fd);
}
//Alternate writing header to two locations:
// Alternate writing header to two locations:
// Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
toku_off_t main_offset;
main_offset = (h->checkpoint_count & 0x1) ? 0 : block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
main_offset = (h->checkpoint_count & 0x1)
? 0
: BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
toku_free(w_main.buf);
toku_free(w_translation.buf);
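The parity logic above ping-pongs the header between two fixed slots; summarizing what the code does:
// checkpoint_count odd  -> header written at offset 0
// checkpoint_count even -> header written at BLOCK_ALLOCATOR_HEADER_RESERVE
// A torn header write can thus damage only the slot holding the older
// checkpoint; deserialization then picks the newest acceptable header.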

View file

@ -99,13 +99,11 @@ void toku_ft_serialize_layer_init(void) {
num_cores = toku_os_get_number_active_processors();
int r = toku_thread_pool_create(&ft_pool, num_cores);
lazy_assert_zero(r);
block_allocator::maybe_initialize_trace();
toku_serialize_in_parallel = false;
}
void toku_ft_serialize_layer_destroy(void) {
toku_thread_pool_destroy(&ft_pool);
block_allocator::maybe_close_trace();
}
enum { FILE_CHANGE_INCREMENT = (16 << 20) };
@ -773,19 +771,23 @@ int toku_serialize_ftnode_to_memory(FTNODE node,
return 0;
}
int
toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT ft, bool for_checkpoint) {
int toku_serialize_ftnode_to(int fd,
BLOCKNUM blocknum,
FTNODE node,
FTNODE_DISK_DATA *ndd,
bool do_rebalancing,
FT ft,
bool for_checkpoint) {
size_t n_to_write;
size_t n_uncompressed_bytes;
char *compressed_buf = nullptr;
// because toku_serialize_ftnode_to is only called for
// because toku_serialize_ftnode_to is only called for
// in toku_ftnode_flush_callback, we pass false
// for in_parallel. The reasoning is that when we write
// nodes to disk via toku_ftnode_flush_callback, we
// nodes to disk via toku_ftnode_flush_callback, we
// assume that it is being done on a non-critical
// background thread (probably for checkpointing), and therefore
// background thread (probably for checkpointing), and therefore
// should not hog CPU,
//
// Should the above facts change, we may want to revisit
@ -802,32 +804,32 @@ toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DA
toku_unsafe_fetch(&toku_serialize_in_parallel),
&n_to_write,
&n_uncompressed_bytes,
&compressed_buf
);
&compressed_buf);
if (r != 0) {
return r;
}
// If the node has never been written, then write the whole buffer, including the zeros
invariant(blocknum.b>=0);
// If the node has never been written, then write the whole buffer,
// including the zeros
invariant(blocknum.b >= 0);
DISKOFF offset;
// Dirties the ft
ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
ft, fd, for_checkpoint,
// Allocations for nodes high in the tree are considered 'hot',
// as they are likely to move again in the next checkpoint.
node->height);
ft->blocktable.realloc_on_disk(
blocknum, n_to_write, &offset, ft, fd, for_checkpoint);
tokutime_t t0 = toku_time_now();
toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
tokutime_t t1 = toku_time_now();
tokutime_t io_time = t1 - t0;
toku_ft_status_update_flush_reason(node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint);
toku_ft_status_update_flush_reason(
node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint);
toku_free(compressed_buf);
node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
node->dirty = 0; // See #1957. Must set the node to be clean after
// serializing it so that it doesn't get written again on
// the next checkpoint or eviction.
return 0;
}
@ -994,6 +996,7 @@ BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn) {
bn->seqinsert = orig_bn->seqinsert;
bn->stale_ancestor_messages_applied = orig_bn->stale_ancestor_messages_applied;
bn->stat64_delta = orig_bn->stat64_delta;
bn->logical_rows_delta = orig_bn->logical_rows_delta;
bn->data_buffer.clone(&orig_bn->data_buffer);
return bn;
}
@ -1004,6 +1007,7 @@ BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
bn->seqinsert = 0;
bn->stale_ancestor_messages_applied = false;
bn->stat64_delta = ZEROSTATS;
bn->logical_rows_delta = 0;
bn->data_buffer.init_zero();
return bn;
}
@ -1897,7 +1901,7 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
/* out */ int *layout_version_p);
// This function upgrades a version 14 or 13 ftnode to the current
// verison. NOTE: This code assumes the first field of the rbuf has
// version. NOTE: This code assumes the first field of the rbuf has
// already been read from the buffer (namely the layout_version of the
// ftnode.)
static int
@ -2488,9 +2492,12 @@ toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIAL
serialized->blocknum = log->blocknum;
}
int
toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized,
FT ft, bool for_checkpoint) {
int toku_serialize_rollback_log_to(int fd,
ROLLBACK_LOG_NODE log,
SERIALIZED_ROLLBACK_LOG_NODE serialized_log,
bool is_serialized,
FT ft,
bool for_checkpoint) {
size_t n_to_write;
char *compressed_buf;
struct serialized_rollback_log_node serialized_local;
@ -2511,21 +2518,21 @@ toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBA
serialized_log->n_sub_blocks,
serialized_log->sub_block,
ft->h->compression_method,
&n_to_write, &compressed_buf);
&n_to_write,
&compressed_buf);
// Dirties the ft
DISKOFF offset;
ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
ft, fd, for_checkpoint,
// We consider rollback log flushing the hottest possible allocation,
// since rollback logs are short-lived compared to FT nodes.
INT_MAX);
ft->blocktable.realloc_on_disk(
blocknum, n_to_write, &offset, ft, fd, for_checkpoint);
toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
toku_free(compressed_buf);
if (!is_serialized) {
toku_static_serialized_rollback_log_destroy(&serialized_local);
log->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
log->dirty = 0; // See #1957. Must set the node to be clean after
// serializing it so that it doesn't get written again
// on the next checkpoint or eviction.
}
return 0;
}
@ -2704,7 +2711,7 @@ exit:
}
static int decompress_from_raw_block_into_rbuf_versioned(uint32_t version, uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) {
// This function exists solely to accomodate future changes in compression.
// This function exists solely to accommodate future changes in compression.
int r = 0;
if ((version == FT_LAYOUT_VERSION_13 || version == FT_LAYOUT_VERSION_14) ||
(FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) ||

View file

@ -0,0 +1,833 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "ft/serialize/rbtree_mhs.h"
#include "portability/toku_assert.h"
#include "portability/toku_portability.h"
#include <algorithm>
namespace MhsRbTree {
Tree::Tree() : _root(NULL), _align(1) {}
Tree::Tree(uint64_t align) : _root(NULL), _align(align) {}
Tree::~Tree() { Destroy(); }
void Tree::PreOrder(Node *tree) const {
if (tree != NULL) {
fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
PreOrder(tree->_left);
PreOrder(tree->_right);
}
}
void Tree::PreOrder() { PreOrder(_root); }
void Tree::InOrder(Node *tree) const {
if (tree != NULL) {
InOrder(tree->_left);
fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
InOrder(tree->_right);
}
}
// yeah, i only care about in order visitor. -Jun
void Tree::InOrderVisitor(Node *tree,
void (*f)(void *, Node *, uint64_t),
void *extra,
uint64_t depth) {
if (tree != NULL) {
InOrderVisitor(tree->_left, f, extra, depth + 1);
f(extra, tree, depth);
InOrderVisitor(tree->_right, f, extra, depth + 1);
}
}
void Tree::InOrderVisitor(void (*f)(void *, Node *, uint64_t),
void *extra) {
InOrderVisitor(_root, f, extra, 0);
}
void Tree::InOrder() { InOrder(_root); }
void Tree::PostOrder(Node *tree) const {
if (tree != NULL) {
PostOrder(tree->_left);
PostOrder(tree->_right);
fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
}
}
void Tree::PostOrder() { PostOrder(_root); }
Node *Tree::SearchByOffset(uint64_t offset) {
Node *x = _root;
while ((x != NULL) && (rbn_offset(x).ToInt() != offset)) {
if (offset < rbn_offset(x).ToInt())
x = x->_left;
else
x = x->_right;
}
return x;
}
// mostly for testing
Node *Tree::SearchFirstFitBySize(uint64_t size) {
if (EffectiveSize(_root) < size && rbn_left_mhs(_root) < size &&
rbn_right_mhs(_root) < size) {
return nullptr;
} else {
return SearchFirstFitBySizeHelper(_root, size);
}
}
Node *Tree::SearchFirstFitBySizeHelper(Node *x, uint64_t size) {
if (EffectiveSize(x) >= size) {
// only possible to go left
if (rbn_left_mhs(x) >= size)
return SearchFirstFitBySizeHelper(x->_left, size);
else
return x;
}
if (rbn_left_mhs(x) >= size)
return SearchFirstFitBySizeHelper(x->_left, size);
if (rbn_right_mhs(x) >= size)
return SearchFirstFitBySizeHelper(x->_right, size);
// this is an invalid state
Dump();
ValidateBalance();
ValidateMhs();
invariant(0);
return NULL;
}
Node *Tree::MinNode(Node *tree) {
if (tree == NULL)
return NULL;
while (tree->_left != NULL)
tree = tree->_left;
return tree;
}
Node *Tree::MinNode() { return MinNode(_root); }
Node *Tree::MaxNode(Node *tree) {
if (tree == NULL)
return NULL;
while (tree->_right != NULL)
tree = tree->_right;
return tree;
}
Node *Tree::MaxNode() { return MaxNode(_root); }
Node *Tree::SuccessorHelper(Node *y, Node *x) {
while ((y != NULL) && (x == y->_right)) {
x = y;
y = y->_parent;
}
return y;
}
Node *Tree::Successor(Node *x) {
if (x->_right != NULL)
return MinNode(x->_right);
Node *y = x->_parent;
return SuccessorHelper(y, x);
}
Node *Tree::PredecessorHelper(Node *y, Node *x) {
while ((y != NULL) && (x == y->_left)) {
x = y;
y = y->_parent;
}
return y;
}
Node *Tree::Predecessor(Node *x) {
if (x->_left != NULL)
return MaxNode(x->_left);
Node *y = x->_parent;
return PredecessorHelper(y, x);
}
/*
* px px
* / /
* x y
* / \ --(left rotation)--> / \ #
* lx y x ry
* / \ / \
* ly ry lx ly
* max_hole_size updates are pretty local
*/
void Tree::LeftRotate(Node *&root, Node *x) {
Node *y = x->_right;
x->_right = y->_left;
rbn_right_mhs(x) = rbn_left_mhs(y);
if (y->_left != NULL)
y->_left->_parent = x;
y->_parent = x->_parent;
if (x->_parent == NULL) {
root = y;
} else {
if (x->_parent->_left == x) {
x->_parent->_left = y;
} else {
x->_parent->_right = y;
}
}
y->_left = x;
rbn_left_mhs(y) = mhs_of_subtree(x);
x->_parent = y;
}
/* py py
* / /
* y x
* / \ --(right rotate)--> / \ #
* x ry lx y
* / \ / \ #
* lx rx rx ry
*
*/
void Tree::RightRotate(Node *&root, Node *y) {
Node *x = y->_left;
y->_left = x->_right;
rbn_left_mhs(y) = rbn_right_mhs(x);
if (x->_right != NULL)
x->_right->_parent = y;
x->_parent = y->_parent;
if (y->_parent == NULL) {
root = x;
} else {
if (y == y->_parent->_right)
y->_parent->_right = x;
else
y->_parent->_left = x;
}
x->_right = y;
rbn_right_mhs(x) = mhs_of_subtree(y);
y->_parent = x;
}
// walk from this node up to update the mhs info;
// whenever the left/right mhs or size changes we should recalculate.
// prerequisite: the node's children have up-to-date mhs values.
void Tree::RecalculateMhs(Node *node) {
uint64_t *p_node_mhs = 0;
Node *parent = node->_parent;
if (!parent)
return;
uint64_t max_mhs = mhs_of_subtree(node);
if (node == parent->_left) {
p_node_mhs = &rbn_left_mhs(parent);
} else if (node == parent->_right) {
p_node_mhs = &rbn_right_mhs(parent);
} else {
return;
}
if (*p_node_mhs != max_mhs) {
*p_node_mhs = max_mhs;
RecalculateMhs(parent);
}
}
void Tree::IsNewNodeMergable(Node *pred,
Node *succ,
Node::BlockPair pair,
bool *left_merge,
bool *right_merge) {
if (pred) {
OUUInt64 end_of_pred = rbn_size(pred) + rbn_offset(pred);
if (end_of_pred < pair._offset)
*left_merge = false;
else {
invariant(end_of_pred == pair._offset);
*left_merge = true;
}
}
if (succ) {
OUUInt64 begin_of_succ = rbn_offset(succ);
OUUInt64 end_of_node = pair._offset + pair._size;
if (end_of_node < begin_of_succ) {
*right_merge = false;
} else {
invariant(end_of_node == begin_of_succ);
*right_merge = true;
}
}
}
void Tree::AbsorbNewNode(Node *pred,
Node *succ,
Node::BlockPair pair,
bool left_merge,
bool right_merge,
bool is_right_child) {
invariant(left_merge || right_merge);
if (left_merge && right_merge) {
// merge to the succ
if (!is_right_child) {
rbn_size(succ) += pair._size;
rbn_offset(succ) = pair._offset;
// merge to the pred
rbn_size(pred) += rbn_size(succ);
// to keep the invariant of the tree -no overlapping holes
rbn_offset(succ) += rbn_size(succ);
rbn_size(succ) = 0;
RecalculateMhs(succ);
RecalculateMhs(pred);
// pred dominates succ. this is going to
// update the pred labels separately.
// remove succ
RawRemove(_root, succ);
} else {
rbn_size(pred) += pair._size;
rbn_offset(succ) = rbn_offset(pred);
rbn_size(succ) += rbn_size(pred);
rbn_offset(pred) += rbn_size(pred);
rbn_size(pred) = 0;
RecalculateMhs(pred);
RecalculateMhs(succ);
// now remove pred
RawRemove(_root, pred);
}
} else if (left_merge) {
rbn_size(pred) += pair._size;
RecalculateMhs(pred);
} else if (right_merge) {
rbn_offset(succ) -= pair._size;
rbn_size(succ) += pair._size;
RecalculateMhs(succ);
}
}
// this is the most tedious part, but not complicated:
// 1. find where to insert the pair
// 2. if both pred and succ can merge with the pair, merge with them;
//    either pred or succ is then removed
// 3. if only left-mergeable or right-mergeable, just merge
// 4. non-mergeable case: insert the node and run the fixup
int Tree::Insert(Node *&root, Node::BlockPair pair) {
Node *x = _root;
Node *y = NULL;
bool left_merge = false;
bool right_merge = false;
Node *node = NULL;
while (x != NULL) {
y = x;
if (pair._offset < rbn_key(x))
x = x->_left;
else
x = x->_right;
}
// we found where to insert; let's find out the pred and succ for
// possible merges.
// node->parent = y;
Node *pred, *succ;
if (y != NULL) {
if (pair._offset < rbn_key(y)) {
// as the left child
pred = PredecessorHelper(y->_parent, y);
succ = y;
IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge);
if (left_merge || right_merge) {
AbsorbNewNode(
pred, succ, pair, left_merge, right_merge, false);
} else {
// construct the node
Node::Pair mhsp {0, 0};
node =
new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
if (!node)
return -1;
y->_left = node;
node->_parent = y;
RecalculateMhs(node);
}
} else {
// as the right child
pred = y;
succ = SuccessorHelper(y->_parent, y);
IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge);
if (left_merge || right_merge) {
AbsorbNewNode(
pred, succ, pair, left_merge, right_merge, true);
} else {
// construct the node
Node::Pair mhsp {0, 0};
node =
new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
if (!node)
return -1;
y->_right = node;
node->_parent = y;
RecalculateMhs(node);
}
}
} else {
Node::Pair mhsp {0, 0};
node = new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
if (!node)
return -1;
root = node;
}
if (!left_merge && !right_merge) {
invariant_notnull(node);
node->_color = EColor::RED;
return InsertFixup(root, node);
}
return 0;
}
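A hedged worked example of the merge cases handled above (offsets illustrative):
// existing holes: (0,10) and (20,5); now insert the freed range (10,10):
//   pred (0,10): end_of_pred == 10 == new offset  -> left-mergeable
//   succ (20,5): new end == 20 == succ's offset   -> right-mergeable
// both merge, so the three ranges collapse into the single hole (0,25)
// and the now-redundant neighbor node is removed via RawRemove.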
int Tree::InsertFixup(Node *&root, Node *node) {
Node *parent, *gparent;
while ((parent = rbn_parent(node)) && rbn_is_red(parent)) {
gparent = rbn_parent(parent);
if (parent == gparent->_left) {
{
Node *uncle = gparent->_right;
if (uncle && rbn_is_red(uncle)) {
rbn_set_black(uncle);
rbn_set_black(parent);
rbn_set_red(gparent);
node = gparent;
continue;
}
}
if (parent->_right == node) {
Node *tmp;
LeftRotate(root, parent);
tmp = parent;
parent = node;
node = tmp;
}
rbn_set_black(parent);
rbn_set_red(gparent);
RightRotate(root, gparent);
} else {
{
Node *uncle = gparent->_left;
if (uncle && rbn_is_red(uncle)) {
rbn_set_black(uncle);
rbn_set_black(parent);
rbn_set_red(gparent);
node = gparent;
continue;
}
}
if (parent->_left == node) {
Node *tmp;
RightRotate(root, parent);
tmp = parent;
parent = node;
node = tmp;
}
rbn_set_black(parent);
rbn_set_red(gparent);
LeftRotate(root, gparent);
}
}
rbn_set_black(root);
return 0;
}
int Tree::Insert(Node::BlockPair pair) { return Insert(_root, pair); }
uint64_t Tree::Remove(size_t size) {
Node *node = SearchFirstFitBySize(size);
return Remove(_root, node, size);
}
void Tree::RawRemove(Node *&root, Node *node) {
Node *child, *parent;
EColor color;
if ((node->_left != NULL) && (node->_right != NULL)) {
Node *replace = node;
replace = replace->_right;
while (replace->_left != NULL)
replace = replace->_left;
if (rbn_parent(node)) {
if (rbn_parent(node)->_left == node)
rbn_parent(node)->_left = replace;
else
rbn_parent(node)->_right = replace;
} else {
root = replace;
}
child = replace->_right;
parent = rbn_parent(replace);
color = rbn_color(replace);
if (parent == node) {
parent = replace;
} else {
if (child)
rbn_parent(child) = parent;
parent->_left = child;
rbn_left_mhs(parent) = rbn_right_mhs(replace);
RecalculateMhs(parent);
replace->_right = node->_right;
rbn_set_parent(node->_right, replace);
rbn_right_mhs(replace) = rbn_right_mhs(node);
}
replace->_parent = node->_parent;
replace->_color = node->_color;
replace->_left = node->_left;
rbn_left_mhs(replace) = rbn_left_mhs(node);
node->_left->_parent = replace;
RecalculateMhs(replace);
if (color == EColor::BLACK)
RawRemoveFixup(root, child, parent);
delete node;
return;
}
if (node->_left != NULL)
child = node->_left;
else
child = node->_right;
parent = node->_parent;
color = node->_color;
if (child)
child->_parent = parent;
if (parent) {
if (parent->_left == node) {
parent->_left = child;
rbn_left_mhs(parent) = child ? mhs_of_subtree(child) : 0;
} else {
parent->_right = child;
rbn_right_mhs(parent) = child ? mhs_of_subtree(child) : 0;
}
RecalculateMhs(parent);
} else
root = child;
if (color == EColor::BLACK)
RawRemoveFixup(root, child, parent);
delete node;
}
void Tree::RawRemove(uint64_t offset) {
Node *node = SearchByOffset(offset);
RawRemove(_root, node);
}
static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
}
uint64_t Tree::Remove(Node *&root, Node *node, size_t size) {
OUUInt64 n_offset = rbn_offset(node);
OUUInt64 n_size = rbn_size(node);
OUUInt64 answer_offset(align(rbn_offset(node).ToInt(), _align));
invariant((answer_offset + size) <= (n_offset + n_size));
if (answer_offset == n_offset) {
rbn_offset(node) += size;
rbn_size(node) -= size;
RecalculateMhs(node);
if (rbn_size(node) == 0) {
RawRemove(root, node);
}
} else {
if (answer_offset + size == n_offset + n_size) {
rbn_size(node) -= size;
RecalculateMhs(node);
} else {
// well, cut in the middle...
rbn_size(node) = answer_offset - n_offset;
RecalculateMhs(node);
Insert(_root,
{(answer_offset + size),
(n_offset + n_size) - (answer_offset + size)});
}
}
return answer_offset.ToInt();
}
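Two hedged traces of the shapes an allocation can take in Remove (values illustrative):
// hole (100,50), _align == 1, request 20:
//   aligned offset 100 == hole start -> allocate from the front;
//   the hole shrinks in place to (120,30) and 100 is returned
// hole (1000,10000), _align == 4096, request 2000:
//   aligned offset 4096 -> cut in the middle; (1000,3096) stays,
//   a new hole (6096,4904) is inserted, and 4096 is returned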
void Tree::RawRemoveFixup(Node *&root, Node *node, Node *parent) {
Node *other;
while ((!node || rbn_is_black(node)) && node != root) {
if (parent->_left == node) {
other = parent->_right;
if (rbn_is_red(other)) {
// Case 1: the brother of x, w, is red
rbn_set_black(other);
rbn_set_red(parent);
LeftRotate(root, parent);
other = parent->_right;
}
if ((!other->_left || rbn_is_black(other->_left)) &&
(!other->_right || rbn_is_black(other->_right))) {
// Case 2: w is black and both of w's children are black
rbn_set_red(other);
node = parent;
parent = rbn_parent(node);
} else {
if (!other->_right || rbn_is_black(other->_right)) {
// Case 3: w is black, the left child of w is red,
// and the right child of w is black
rbn_set_black(other->_left);
rbn_set_red(other);
RightRotate(root, other);
other = parent->_right;
}
// Case 4: w is black and right child of w is red,
// regardless of left child's color
rbn_set_color(other, rbn_color(parent));
rbn_set_black(parent);
rbn_set_black(other->_right);
LeftRotate(root, parent);
node = root;
break;
}
} else {
other = parent->_left;
if (rbn_is_red(other)) {
// Case 1: w is red
rbn_set_black(other);
rbn_set_red(parent);
RightRotate(root, parent);
other = parent->_left;
}
if ((!other->_left || rbn_is_black(other->_left)) &&
(!other->_right || rbn_is_black(other->_right))) {
// Case 2: w is black and both children are black
rbn_set_red(other);
node = parent;
parent = rbn_parent(node);
} else {
if (!other->_left || rbn_is_black(other->_left)) {
// Case 3: w is black and left child of w is red whereas
// right child is black
rbn_set_black(other->_right);
rbn_set_red(other);
LeftRotate(root, other);
other = parent->_left;
}
// Case 4: w is black and right child of w is red,
// regardless of the left child's color
rbn_set_color(other, rbn_color(parent));
rbn_set_black(parent);
rbn_set_black(other->_left);
RightRotate(root, parent);
node = root;
break;
}
}
}
if (node)
rbn_set_black(node);
}
void Tree::Destroy(Node *&tree) {
if (tree == NULL)
return;
if (tree->_left != NULL)
Destroy(tree->_left);
if (tree->_right != NULL)
Destroy(tree->_right);
delete tree;
tree = NULL;
}
void Tree::Destroy() { Destroy(_root); }
void Tree::Dump(Node *tree, Node::BlockPair pair, EDirection dir) {
if (tree != NULL) {
if (dir == EDirection::NONE)
fprintf(stderr,
"(%" PRIu64 ",%" PRIu64 ", mhs:(%" PRIu64 ",%" PRIu64
"))(B) is root\n",
rbn_offset(tree).ToInt(),
rbn_size(tree).ToInt(),
rbn_left_mhs(tree),
rbn_right_mhs(tree));
else
fprintf(stderr,
"(%" PRIu64 ",%" PRIu64 ",mhs:(%" PRIu64 ",%" PRIu64
"))(%c) is %" PRIu64 "'s %s\n",
rbn_offset(tree).ToInt(),
rbn_size(tree).ToInt(),
rbn_left_mhs(tree),
rbn_right_mhs(tree),
rbn_is_red(tree) ? 'R' : 'B',
pair._offset.ToInt(),
dir == EDirection::RIGHT ? "right child" : "left child");
Dump(tree->_left, tree->_hole, EDirection::LEFT);
Dump(tree->_right, tree->_hole, EDirection::RIGHT);
}
}
uint64_t Tree::EffectiveSize(Node *node) {
OUUInt64 offset = rbn_offset(node);
OUUInt64 size = rbn_size(node);
OUUInt64 end = offset + size;
OUUInt64 aligned_offset(align(offset.ToInt(), _align));
if (aligned_offset > end) {
return 0;
}
return (end - aligned_offset).ToInt();
}
void Tree::Dump() {
if (_root != NULL)
Dump(_root, _root->_hole, (EDirection)0);
}
static void vis_bal_f(void *extra, Node *node, uint64_t depth) {
uint64_t **p = (uint64_t **)extra;
uint64_t min = *p[0];
uint64_t max = *p[1];
if (node->_left) {
Node *left = node->_left;
invariant(node == left->_parent);
}
if (node->_right) {
Node *right = node->_right;
invariant(node == right->_parent);
}
if (!node->_left || !node->_right) {
if (min > depth) {
*p[0] = depth;
} else if (max < depth) {
*p[1] = depth;
}
}
}
void Tree::ValidateBalance() {
uint64_t min_depth = 0xffffffffffffffff;
uint64_t max_depth = 0;
if (!_root) {
return;
}
uint64_t *p[2] = {&min_depth, &max_depth};
InOrderVisitor(vis_bal_f, (void *)p);
invariant((min_depth + 1) * 2 >= max_depth + 1);
}
static void vis_cmp_f(void *extra, Node *node, uint64_t UU(depth)) {
Node::BlockPair **p = (Node::BlockPair **)extra;
invariant_notnull(*p);
invariant((*p)->_offset == node->_hole._offset);
*p = *p + 1;
}
// validate the input pairs matches with sorted pairs
void Tree::ValidateInOrder(Node::BlockPair *pairs) {
InOrderVisitor(vis_cmp_f, &pairs);
}
uint64_t Tree::ValidateMhs(Node *node) {
if (!node)
return 0;
else {
uint64_t mhs_left = ValidateMhs(node->_left);
uint64_t mhs_right = ValidateMhs(node->_right);
if (mhs_left != rbn_left_mhs(node)) {
printf("assert failure: mhs_left = %" PRIu64 "\n", mhs_left);
Dump(node, node->_hole, (EDirection)0);
}
invariant(mhs_left == rbn_left_mhs(node));
if (mhs_right != rbn_right_mhs(node)) {
printf("assert failure: mhs_right = %" PRIu64 "\n", mhs_right);
Dump(node, node->_hole, (EDirection)0);
}
invariant(mhs_right == rbn_right_mhs(node));
return std::max(EffectiveSize(node), std::max(mhs_left, mhs_right));
}
}
void Tree::ValidateMhs() {
if (!_root)
return;
uint64_t mhs_left = ValidateMhs(_root->_left);
uint64_t mhs_right = ValidateMhs(_root->_right);
invariant(mhs_left == rbn_left_mhs(_root));
invariant(mhs_right == rbn_right_mhs(_root));
}
} // namespace MhsRbTree
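A hedged end-to-end sketch of the public Tree API above; values are illustrative, and the BlockPair brace-initialization mirrors the Insert call inside Remove:
MhsRbTree::Tree tree(512);         // holes are handed out 512-byte aligned
tree.Insert({0, 4096});            // one free hole covering [0, 4096)
uint64_t off = tree.Remove(1024);  // first fit: off == 0, hole shrinks to (1024, 3072)
tree.Insert({off, 1024});          // freeing right-merges back into a single (0, 4096) hole
// ~Tree() runs Destroy() and frees every node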

View file

@ -0,0 +1,351 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include <db.h>
#include "portability/toku_pthread.h"
#include "portability/toku_stdint.h"
#include "portability/toku_stdlib.h"
// RBTree (red-black tree) with max hole sizes for subtrees.
// This is a tentative data structure to improve the block allocation time
// complexity from linear to logarithmic. Note that this DS only supports
// first-fit for now; best-fit would actually be easier to implement (just
// sort by size).
// An RBTree is a classic data structure with O(log(n)) insertion, deletion
// and search, and its efficiency is well proven.
// A *hole* represents an available BlockPair for allocation, written as
// (start_address, size) or (offset, size) interchangeably.
// Each node carries a *label*: the pair of max hole sizes for its left and
// right subtrees.
// We are implementing an RBTree with max hole sizes for subtrees: a red-black
// tree sorted by start_address and additionally labeled with the max hole
// sizes of the subtrees.
// [(6,3)] -> [(offset, size)], the hole
// [{2,5}] -> [{mhs_of_left, mhs_of_right}], the label
/* / \ */
// [(0, 1)] [(10, 5)]
// [{0, 2}] [{0, 0}]
/* \ */
// [(3, 2)]
// [{0, 0}]
// An allocation request of size=2 descends from the root to [(3,2)].
// The example above shows a simplified RBTree_max_holes.
// It is easy to see that the search time is O(log(n)): we make one decision
// on each descent until we reach the target.
// The only question is whether we can keep the maintenance cost low -- and
// that is not a problem, because an insertion/deletion only updates the
// max_hole_sizes of the nodes along the path from the root to the node being
// deleted/inserted. That path can be cached, and the search is O(log(n))
// anyway.
// Unlike a typical rbtree, this Tree has to handle inserts and deletes with
// more care: an allocation that triggers a delete may leave some unused
// space, in which case we can simply update the start_addr and size without
// worrying about overlap. A free may mean not only an insertion but also
// *merging* with the adjacent holes.
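// Illustrative sketch (editorial, not part of this file): how a first-fit
// lookup can descend using the labels. MiniNodeSketch and FirstFitSketch are
// hypothetical stand-ins for the Node/Tree declarations below; the real
// search lives in Tree::SearchFirstFitBySizeHelper().
struct MiniNodeSketch {
    uint64_t _size;                    // this node's own hole size
    uint64_t _left_mhs, _right_mhs;    // max hole size of each subtree
    MiniNodeSketch *_left, *_right;    // children, sorted by offset
};
static MiniNodeSketch *FirstFitSketch(MiniNodeSketch *n, uint64_t want) {
    if (n == NULL)
        return NULL;
    if (n->_left_mhs >= want)          // a fit exists at a lower offset
        return FirstFitSketch(n->_left, want);
    if (n->_size >= want)              // this node's own hole fits
        return n;
    if (n->_right_mhs >= want)         // a fit exists at a higher offset
        return FirstFitSketch(n->_right, want);
    return NULL;                       // no hole can satisfy the request
}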
namespace MhsRbTree {
typedef uint64_t offset_t;
enum class EColor { RED, BLACK };
enum class EDirection { NONE = 0, LEFT, RIGHT };
// Tired of repeatedly fixing overflow/underflow bugs, we quickly craft an
// int class that has an infinity-like max value and prevents overflow and
// underflow. If you have a file offset larger than MHS_MAX_VAL, that is not
// a problem here. :-/ - JYM
class OUUInt64 {
public:
static const uint64_t MHS_MAX_VAL = 0xffffffffffffffff;
OUUInt64() : _value(0) {}
OUUInt64(uint64_t s) : _value(s) {}
bool operator<(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value < r.ToInt();
}
bool operator>(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value > r.ToInt();
}
bool operator<=(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value <= r.ToInt();
}
bool operator>=(const OUUInt64 &r) const {
invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
return _value >= r.ToInt();
}
OUUInt64 operator+(const OUUInt64 &r) const {
if (_value == MHS_MAX_VAL || r.ToInt() == MHS_MAX_VAL) {
OUUInt64 tmp(MHS_MAX_VAL);
return tmp;
} else {
// detecting overflow
invariant((MHS_MAX_VAL - _value) >= r.ToInt());
uint64_t plus = _value + r.ToInt();
OUUInt64 tmp(plus);
return tmp;
}
}
OUUInt64 operator-(const OUUInt64 &r) const {
invariant(r.ToInt() != MHS_MAX_VAL);
if (_value == MHS_MAX_VAL) {
return *this;
} else {
invariant(_value >= r.ToInt());
uint64_t minus = _value - r.ToInt();
OUUInt64 tmp(minus);
return tmp;
}
}
OUUInt64 operator-=(const OUUInt64 &r) {
if (_value != MHS_MAX_VAL) {
invariant(r.ToInt() != MHS_MAX_VAL);
invariant(_value >= r.ToInt());
_value -= r.ToInt();
}
return *this;
}
OUUInt64 operator+=(const OUUInt64 &r) {
if (_value != MHS_MAX_VAL) {
if (r.ToInt() == MHS_MAX_VAL) {
_value = MHS_MAX_VAL;
} else {
invariant((MHS_MAX_VAL - _value) >= r.ToInt());
this->_value += r.ToInt();
}
}
return *this;
}
bool operator==(const OUUInt64 &r) const {
return _value == r.ToInt();
}
bool operator!=(const OUUInt64 &r) const {
return _value != r.ToInt();
}
OUUInt64 operator=(const OUUInt64 &r) {
_value = r.ToInt();
return *this;
}
uint64_t ToInt() const { return _value; }
private:
uint64_t _value;
};
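// Usage sketch (illustrative): MHS_MAX_VAL acts like infinity --
//     OUUInt64 inf(OUUInt64::MHS_MAX_VAL), x(40);
//     invariant((inf + x) == inf);       // addition saturates at the max value
//     invariant((inf - x) == inf);       // "infinity" minus a finite value stays infinite
//     invariant((x + x).ToInt() == 80);  // finite values behave normally
// Comparing two "infinite" values, or overflowing a finite sum, trips an
// invariant instead of silently wrapping.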
class Node {
public:
struct BlockPair {
OUUInt64 _offset;
OUUInt64 _size;
BlockPair() : _offset(0), _size(0) {}
BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {}
BlockPair(OUUInt64 o, OUUInt64 s) : _offset(o), _size(s) {}
int operator<(const struct BlockPair &rhs) const {
return _offset < rhs._offset;
}
int operator<(const uint64_t &o) const { return _offset < o; }
};
struct Pair {
uint64_t _left;
uint64_t _right;
Pair(uint64_t l, uint64_t r) : _left(l), _right(r) {}
};
EColor _color;
struct BlockPair _hole;
struct Pair _label;
Node *_left;
Node *_right;
Node *_parent;
Node(EColor c,
Node::BlockPair h,
struct Pair lb,
Node *l,
Node *r,
Node *p)
: _color(c),
_hole(h),
_label(lb),
_left(l),
_right(r),
_parent(p) {}
};
class Tree {
private:
Node *_root;
uint64_t _align;
public:
Tree();
Tree(uint64_t);
~Tree();
void PreOrder();
void InOrder();
void PostOrder();
// immutable operations
Node *SearchByOffset(uint64_t addr);
Node *SearchFirstFitBySize(uint64_t size);
Node *MinNode();
Node *MaxNode();
Node *Successor(Node *);
Node *Predecessor(Node *);
// mapped from tree_allocator::free_block
int Insert(Node::BlockPair pair);
// mapped from tree_allocator::alloc_block
uint64_t Remove(size_t size);
// mapped from tree_allocator::alloc_block_after
void RawRemove(uint64_t offset);
void Destroy();
// print the tree
void Dump();
// validation
// balance
void ValidateBalance();
void ValidateInOrder(Node::BlockPair *);
void InOrderVisitor(void (*f)(void *, Node *, uint64_t), void *);
void ValidateMhs();
private:
void PreOrder(Node *node) const;
void InOrder(Node *node) const;
void PostOrder(Node *node) const;
Node *SearchByOffset(Node *node, offset_t addr) const;
Node *SearchFirstFitBySize(Node *node, size_t size) const;
Node *MinNode(Node *node);
Node *MaxNode(Node *node);
// rotations for fix-up; we will have to update the labels too.
void LeftRotate(Node *&root, Node *x);
void RightRotate(Node *&root, Node *y);
int Insert(Node *&root, Node::BlockPair pair);
int InsertFixup(Node *&root, Node *node);
void RawRemove(Node *&root, Node *node);
uint64_t Remove(Node *&root, Node *node, size_t size);
void RawRemoveFixup(Node *&root, Node *node, Node *parent);
void Destroy(Node *&tree);
void Dump(Node *tree, Node::BlockPair pair, EDirection dir);
void RecalculateMhs(Node *node);
void IsNewNodeMergable(Node *, Node *, Node::BlockPair, bool *, bool *);
void AbsorbNewNode(Node *, Node *, Node::BlockPair, bool, bool, bool);
Node *SearchFirstFitBySizeHelper(Node *x, uint64_t size);
Node *SuccessorHelper(Node *y, Node *x);
Node *PredecessorHelper(Node *y, Node *x);
void InOrderVisitor(Node *,
void (*f)(void *, Node *, uint64_t),
void *,
uint64_t);
uint64_t ValidateMhs(Node *);
uint64_t EffectiveSize(Node *);
// helper macros for accessing and updating node fields
#define rbn_parent(r) ((r)->_parent)
#define rbn_color(r) ((r)->_color)
#define rbn_is_red(r) ((r)->_color == EColor::RED)
#define rbn_is_black(r) ((r)->_color == EColor::BLACK)
#define rbn_set_black(r) \
do { \
(r)->_color = EColor::BLACK; \
} while (0)
#define rbn_set_red(r) \
do { \
(r)->_color = EColor::RED; \
} while (0)
#define rbn_set_parent(r, p) \
do { \
(r)->_parent = (p); \
} while (0)
#define rbn_set_color(r, c) \
do { \
(r)->_color = (c); \
} while (0)
#define rbn_set_offset(r, c) \
do { \
(r)->_hole._offset = (c); \
} while (0)
#define rbn_set_size(r, c) \
do { \
(r)->_hole._size = (c); \
} while (0)
#define rbn_set_left_mhs(r, c) \
do { \
(r)->_label._left = (c); \
} while (0)
#define rbn_set_right_mhs(r, c) \
do { \
(r)->_label._right = (c); \
} while (0)
#define rbn_size(r) ((r)->_hole._size)
#define rbn_offset(r) ((r)->_hole._offset)
#define rbn_key(r) ((r)->_hole._offset)
#define rbn_left_mhs(r) ((r)->_label._left)
#define rbn_right_mhs(r) ((r)->_label._right)
#define mhs_of_subtree(y) \
(std::max(std::max(rbn_left_mhs(y), rbn_right_mhs(y)), EffectiveSize(y)))
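// That is, a subtree's max hole size is the maximum of the node's own aligned
// effective size and the labels of its two children; RecalculateMhs() is
// expected to restore this recurrence along the root-to-node path after every
// insert, delete and rotation.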
};
} // namespace MhsRbTree

View file

@ -1,126 +0,0 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "ft/tests/test.h"
#include "ft/serialize/block_allocator_strategy.h"
static const uint64_t alignment = 4096;
static void test_first_vs_best_fit(void) {
struct block_allocator::blockpair pairs[] = {
block_allocator::blockpair(1 * alignment, 6 * alignment),
// hole between 7x align -> 8x align
block_allocator::blockpair(8 * alignment, 4 * alignment),
// hole between 12x align -> 16x align
block_allocator::blockpair(16 * alignment, 1 * alignment),
block_allocator::blockpair(17 * alignment, 2 * alignment),
// hole between 19 align -> 21x align
block_allocator::blockpair(21 * alignment, 2 * alignment),
};
const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]);
block_allocator::blockpair *bp;
// first fit
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 100, alignment);
assert(bp == &pairs[0]);
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 4096, alignment);
assert(bp == &pairs[0]);
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 3 * 4096, alignment);
assert(bp == &pairs[1]);
bp = block_allocator_strategy::first_fit(pairs, n_blocks, 5 * 4096, alignment);
assert(bp == nullptr);
// best fit
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 100, alignment);
assert(bp == &pairs[0]);
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 4100, alignment);
assert(bp == &pairs[3]);
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 3 * 4096, alignment);
assert(bp == &pairs[1]);
bp = block_allocator_strategy::best_fit(pairs, n_blocks, 5 * 4096, alignment);
assert(bp == nullptr);
}
static void test_padded_fit(void) {
struct block_allocator::blockpair pairs[] = {
block_allocator::blockpair(1 * alignment, 1 * alignment),
// 4096 byte hole after bp[0]
block_allocator::blockpair(3 * alignment, 1 * alignment),
// 8192 byte hole after bp[1]
block_allocator::blockpair(6 * alignment, 1 * alignment),
// 16384 byte hole after bp[2]
block_allocator::blockpair(11 * alignment, 1 * alignment),
// 32768 byte hole after bp[3]
block_allocator::blockpair(17 * alignment, 1 * alignment),
// 116kb hole after bp[4]
block_allocator::blockpair(113 * alignment, 1 * alignment),
// 256kb hole after bp[5]
block_allocator::blockpair(371 * alignment, 1 * alignment),
};
const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]);
block_allocator::blockpair *bp;
// padding for a 100 byte allocation will be < than standard alignment,
// so it should fit in the first 4096 byte hole.
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 4000, alignment);
assert(bp == &pairs[0]);
// Even padded, a 12kb alloc will fit in a 16kb hole
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 3 * alignment, alignment);
assert(bp == &pairs[2]);
// would normally fit in the 116kb hole but the padding will bring it over
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 116 * alignment, alignment);
assert(bp == &pairs[5]);
bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 127 * alignment, alignment);
assert(bp == &pairs[5]);
}
int test_main(int argc, const char *argv[]) {
(void) argc;
(void) argv;
test_first_vs_best_fit();
test_padded_fit();
return 0;
}

View file

@ -38,253 +38,243 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "test.h"
static void ba_alloc(block_allocator *ba, uint64_t size, uint64_t *answer) {
ba->validate();
static void ba_alloc(BlockAllocator *ba, uint64_t size, uint64_t *answer) {
ba->Validate();
uint64_t actual_answer;
const uint64_t heat = random() % 2;
ba->alloc_block(512 * size, heat, &actual_answer);
ba->validate();
ba->AllocBlock(512 * size, &actual_answer);
ba->Validate();
assert(actual_answer%512==0);
*answer = actual_answer/512;
invariant(actual_answer % 512 == 0);
*answer = actual_answer / 512;
}
static void ba_free(block_allocator *ba, uint64_t offset) {
ba->validate();
ba->free_block(offset * 512);
ba->validate();
static void ba_free(BlockAllocator *ba, uint64_t offset, uint64_t size) {
ba->Validate();
ba->FreeBlock(offset * 512, 512 * size);
ba->Validate();
}
static void ba_check_l(block_allocator *ba, uint64_t blocknum_in_layout_order,
uint64_t expected_offset, uint64_t expected_size) {
static void ba_check_l(BlockAllocator *ba,
uint64_t blocknum_in_layout_order,
uint64_t expected_offset,
uint64_t expected_size) {
uint64_t actual_offset, actual_size;
int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==0);
assert(expected_offset*512 == actual_offset);
assert(expected_size *512 == actual_size);
int r = ba->NthBlockInLayoutOrder(
blocknum_in_layout_order, &actual_offset, &actual_size);
invariant(r == 0);
invariant(expected_offset * 512 == actual_offset);
invariant(expected_size * 512 == actual_size);
}
static void ba_check_none(block_allocator *ba, uint64_t blocknum_in_layout_order) {
static void ba_check_none(BlockAllocator *ba,
uint64_t blocknum_in_layout_order) {
uint64_t actual_offset, actual_size;
int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==-1);
int r = ba->NthBlockInLayoutOrder(
blocknum_in_layout_order, &actual_offset, &actual_size);
invariant(r == -1);
}
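// Note that the helpers above work in 512-byte units: offsets and sizes are
// scaled by 512 on the way into the BlockAllocator and scaled back down in
// the answers the tests compare against.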
// Simple block allocator test
static void test_ba0(block_allocator::allocation_strategy strategy) {
block_allocator allocator;
block_allocator *ba = &allocator;
ba->create(100*512, 1*512);
ba->set_strategy(strategy);
assert(ba->allocated_limit()==100*512);
static void test_ba0() {
BlockAllocator allocator;
BlockAllocator *ba = &allocator;
ba->Create(100 * 512, 1 * 512);
invariant(ba->AllocatedLimit() == 100 * 512);
uint64_t b2, b3, b4, b5, b6, b7;
ba_alloc(ba, 100, &b2);
ba_alloc(ba, 100, &b3);
ba_alloc(ba, 100, &b4);
ba_alloc(ba, 100, &b5);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b2);
ba_alloc(ba, 100, &b2);
ba_free(ba, b4);
ba_free(ba, b6);
ba_alloc(ba, 100, &b2);
ba_alloc(ba, 100, &b3);
ba_alloc(ba, 100, &b4);
ba_alloc(ba, 100, &b5);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b2, 100);
ba_alloc(ba, 100, &b2);
ba_free(ba, b4, 100);
ba_free(ba, b6, 100);
uint64_t b8, b9;
ba_alloc(ba, 100, &b4);
ba_free(ba, b2);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b8);
ba_alloc(ba, 100, &b9);
ba_free(ba, b6);
ba_free(ba, b7);
ba_free(ba, b8);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b4);
ba_alloc(ba, 100, &b4);
ba_alloc(ba, 100, &b4);
ba_free(ba, b2, 100);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b8);
ba_alloc(ba, 100, &b9);
ba_free(ba, b6, 100);
ba_free(ba, b7, 100);
ba_free(ba, b8, 100);
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_free(ba, b4, 100);
ba_alloc(ba, 100, &b4);
ba->destroy();
ba->Destroy();
}
// Manually written to get coverage of all the code in the block allocator.
static void
test_ba1(block_allocator::allocation_strategy strategy, int n_initial) {
block_allocator allocator;
block_allocator *ba = &allocator;
ba->create(0*512, 1*512);
ba->set_strategy(strategy);
static void test_ba1(int n_initial) {
BlockAllocator allocator;
BlockAllocator *ba = &allocator;
ba->Create(0 * 512, 1 * 512);
int n_blocks=0;
int n_blocks = 0;
uint64_t blocks[1000];
for (int i = 0; i < 1000; i++) {
if (i < n_initial || random() % 2 == 0) {
if (n_blocks < 1000) {
ba_alloc(ba, 1, &blocks[n_blocks]);
//printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]);
n_blocks++;
}
} else {
if (n_blocks > 0) {
int blocknum = random()%n_blocks;
//printf("F[%d]%ld\n", blocknum, blocks[blocknum]);
ba_free(ba, blocks[blocknum]);
blocks[blocknum]=blocks[n_blocks-1];
n_blocks--;
}
}
if (i < n_initial || random() % 2 == 0) {
if (n_blocks < 1000) {
ba_alloc(ba, 1, &blocks[n_blocks]);
// printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]);
n_blocks++;
}
} else {
if (n_blocks > 0) {
int blocknum = random() % n_blocks;
// printf("F[%d]=%ld\n", blocknum, blocks[blocknum]);
ba_free(ba, blocks[blocknum], 1);
blocks[blocknum] = blocks[n_blocks - 1];
n_blocks--;
}
}
}
ba->destroy();
ba->Destroy();
}
// Check to see if it is first fit or best fit.
static void
test_ba2 (void)
{
block_allocator allocator;
block_allocator *ba = &allocator;
static void test_ba2(void) {
BlockAllocator allocator;
BlockAllocator *ba = &allocator;
uint64_t b[6];
enum { BSIZE = 1024 };
ba->create(100*512, BSIZE*512);
ba->set_strategy(block_allocator::BA_STRATEGY_FIRST_FIT);
assert(ba->allocated_limit()==100*512);
ba->Create(100 * 512, BSIZE * 512);
invariant(ba->AllocatedLimit() == 100 * 512);
ba_check_l (ba, 0, 0, 100);
ba_check_none (ba, 1);
ba_check_l(ba, 0, 0, 100);
ba_check_none(ba, 1);
ba_alloc (ba, 100, &b[0]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_none (ba, 2);
ba_alloc(ba, 100, &b[0]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_none(ba, 2);
ba_alloc (ba, BSIZE + 100, &b[1]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_none (ba, 3);
ba_alloc(ba, BSIZE + 100, &b[1]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_none(ba, 3);
ba_alloc (ba, 100, &b[2]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_none (ba, 4);
ba_alloc(ba, 100, &b[2]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_none(ba, 4);
ba_alloc (ba, 100, &b[3]);
ba_alloc (ba, 100, &b[4]);
ba_alloc (ba, 100, &b[5]);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
ba_free (ba, 4*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 5*BSIZE, 100);
ba_check_l (ba, 4, 6*BSIZE, 100);
ba_check_l (ba, 5, 7*BSIZE, 100);
ba_check_none (ba, 6);
ba_alloc(ba, 100, &b[3]);
ba_alloc(ba, 100, &b[4]);
ba_alloc(ba, 100, &b[5]);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_none(ba, 7);
ba_free(ba, 4 * BSIZE, 100);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 5 * BSIZE, 100);
ba_check_l(ba, 4, 6 * BSIZE, 100);
ba_check_l(ba, 5, 7 * BSIZE, 100);
ba_check_none(ba, 6);
uint64_t b2;
ba_alloc(ba, 100, &b2);
assert(b2==4*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
invariant(b2 == 4 * BSIZE);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_none(ba, 7);
ba_free (ba, BSIZE);
ba_free (ba, 5*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 2, 4*BSIZE, 100);
ba_check_l (ba, 3, 6*BSIZE, 100);
ba_check_l (ba, 4, 7*BSIZE, 100);
ba_check_none (ba, 5);
ba_free(ba, BSIZE, 100);
ba_free(ba, 5 * BSIZE, 100);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 2, 4 * BSIZE, 100);
ba_check_l(ba, 3, 6 * BSIZE, 100);
ba_check_l(ba, 4, 7 * BSIZE, 100);
ba_check_none(ba, 5);
// This alloc will allocate the first block after the reserve space in the case of first fit.
// This alloc will allocate the first block after the reserve space in the
// case of first fit.
uint64_t b3;
ba_alloc(ba, 100, &b3);
assert(b3== BSIZE); // First fit.
invariant(b3 == BSIZE); // First fit.
// if (b3==5*BSIZE) then it is next fit.
// Now 5*BSIZE is free
uint64_t b5;
ba_alloc(ba, 100, &b5);
assert(b5==5*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_none (ba, 7);
invariant(b5 == 5 * BSIZE);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_none(ba, 7);
// Now all blocks are busy
uint64_t b6, b7, b8;
ba_alloc(ba, 100, &b6);
ba_alloc(ba, 100, &b7);
ba_alloc(ba, 100, &b8);
assert(b6==8*BSIZE);
assert(b7==9*BSIZE);
assert(b8==10*BSIZE);
ba_check_l (ba, 0, 0, 100);
ba_check_l (ba, 1, BSIZE, 100);
ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100);
ba_check_l (ba, 3, 4*BSIZE, 100);
ba_check_l (ba, 4, 5*BSIZE, 100);
ba_check_l (ba, 5, 6*BSIZE, 100);
ba_check_l (ba, 6, 7*BSIZE, 100);
ba_check_l (ba, 7, 8*BSIZE, 100);
ba_check_l (ba, 8, 9*BSIZE, 100);
ba_check_l (ba, 9, 10*BSIZE, 100);
ba_check_none (ba, 10);
ba_free(ba, 9*BSIZE);
ba_free(ba, 7*BSIZE);
invariant(b6 == 8 * BSIZE);
invariant(b7 == 9 * BSIZE);
invariant(b8 == 10 * BSIZE);
ba_check_l(ba, 0, 0, 100);
ba_check_l(ba, 1, BSIZE, 100);
ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
ba_check_l(ba, 3, 4 * BSIZE, 100);
ba_check_l(ba, 4, 5 * BSIZE, 100);
ba_check_l(ba, 5, 6 * BSIZE, 100);
ba_check_l(ba, 6, 7 * BSIZE, 100);
ba_check_l(ba, 7, 8 * BSIZE, 100);
ba_check_l(ba, 8, 9 * BSIZE, 100);
ba_check_l(ba, 9, 10 * BSIZE, 100);
ba_check_none(ba, 10);
ba_free(ba, 9 * BSIZE, 100);
ba_free(ba, 7 * BSIZE, 100);
uint64_t b9;
ba_alloc(ba, 100, &b9);
assert(b9==7*BSIZE);
invariant(b9 == 7 * BSIZE);
ba_free(ba, 5*BSIZE);
ba_free(ba, 2*BSIZE);
ba_free(ba, 5 * BSIZE, 100);
ba_free(ba, 2 * BSIZE, BSIZE + 100);
uint64_t b10, b11;
ba_alloc(ba, 100, &b10);
assert(b10==2*BSIZE);
invariant(b10 == 2 * BSIZE);
ba_alloc(ba, 100, &b11);
assert(b11==3*BSIZE);
invariant(b11 == 3 * BSIZE);
ba_alloc(ba, 100, &b11);
assert(b11==5*BSIZE);
invariant(b11 == 5 * BSIZE);
ba->destroy();
ba->Destroy();
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
enum block_allocator::allocation_strategy strategies[] = {
block_allocator::BA_STRATEGY_FIRST_FIT,
block_allocator::BA_STRATEGY_BEST_FIT,
block_allocator::BA_STRATEGY_PADDED_FIT,
block_allocator::BA_STRATEGY_HEAT_ZONE,
};
for (size_t i = 0; i < sizeof(strategies) / sizeof(strategies[0]); i++) {
test_ba0(strategies[i]);
test_ba1(strategies[i], 0);
test_ba1(strategies[i], 10);
test_ba1(strategies[i], 20);
}
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
test_ba0();
test_ba1(0);
test_ba1(10);
test_ba1(20);
test_ba2();
return 0;
}

View file

@ -45,7 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
// #5978 is fixed. Here is what we do. We have four pairs with
// blocknums and fullhashes of 1,2,3,4. The cachetable has only
// two bucket mutexes, so 1 and 3 share a pair mutex, as do 2 and 4.
// We pin all four with expensive write locks. Then, on backgroud threads,
// We pin all four with expensive write locks. Then, on background threads,
// we call get_and_pin_nonblocking on 3, where the unlocker unpins 2, and
// we call get_and_pin_nonblocking on 4, where the unlocker unpins 1. Run this
// enough times, and we should see a deadlock before the fix, and no deadlock

View file

@ -77,7 +77,7 @@ flush (
//
// test the following things for simple cloning:
// - verifies that after teh checkpoint ends, the PAIR is properly
// - verifies that after the checkpoint ends, the PAIR is properly
// dirty or clean based on the second unpin
//
static void

View file

@ -38,69 +38,72 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "test.h"
static int
int64_key_cmp (DB *db UU(), const DBT *a, const DBT *b) {
int64_t x = *(int64_t *) a->data;
int64_t y = *(int64_t *) b->data;
static int int64_key_cmp(DB *db UU(), const DBT *a, const DBT *b) {
int64_t x = *(int64_t *)a->data;
int64_t y = *(int64_t *)b->data;
if (x<y) return -1;
if (x>y) return 1;
if (x < y)
return -1;
if (x > y)
return 1;
return 0;
}
static void
test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
static void test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
int r;
FT_CURSOR XMALLOC(cursor);
FTNODE dn = NULL;
PAIR_ATTR attr;
// first test that prefetching everything should work
memset(&cursor->range_lock_left_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_right_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_left_key, 0, sizeof(DBT));
memset(&cursor->range_lock_right_key, 0, sizeof(DBT));
cursor->left_is_neg_infty = true;
cursor->right_is_pos_infty = true;
cursor->disable_prefetching = false;
ftnode_fetch_extra bfe;
// quick test to see that we have the right behavior when we set
// disable_prefetching to true
cursor->disable_prefetching = true;
bfe.create_for_prefetch( ft_h, cursor);
bfe.create_for_prefetch(ft_h, cursor);
FTNODE_DISK_DATA ndd = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
// now enable prefetching again
cursor->disable_prefetching = false;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_COMPRESSED);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
@ -108,21 +111,23 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
uint64_t left_key = 150;
toku_fill_dbt(&cursor->range_lock_left_key, &left_key, sizeof(uint64_t));
cursor->left_is_neg_infty = false;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
@ -130,63 +135,69 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
uint64_t right_key = 151;
toku_fill_dbt(&cursor->range_lock_right_key, &right_key, sizeof(uint64_t));
cursor->right_is_pos_infty = false;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_ON_DISK);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
left_key = 100000;
right_key = 100000;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
bfe.destroy();
toku_free(ndd);
toku_ftnode_free(&dn);
left_key = 100;
right_key = 100;
bfe.create_for_prefetch( ft_h, cursor);
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_COMPRESSED);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
bfe.create_for_prefetch(ft_h, cursor);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
bfe.destroy();
toku_ftnode_free(&dn);
toku_free(ndd);
@ -194,20 +205,19 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
toku_free(cursor);
}
static void
test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
static void test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
int r;
FT_CURSOR XMALLOC(cursor);
FTNODE dn = NULL;
FTNODE_DISK_DATA ndd = NULL;
PAIR_ATTR attr;
// first test that prefetching everything should work
memset(&cursor->range_lock_left_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_right_key, 0 , sizeof(DBT));
memset(&cursor->range_lock_left_key, 0, sizeof(DBT));
memset(&cursor->range_lock_right_key, 0, sizeof(DBT));
cursor->left_is_neg_infty = true;
cursor->right_is_pos_infty = true;
uint64_t left_key = 150;
uint64_t right_key = 151;
DBT left, right;
@ -216,101 +226,106 @@ test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
ftnode_fetch_extra bfe;
bfe.create_for_subset_read(
ft_h,
NULL,
&left,
&right,
false,
false,
false,
false
);
ft_h, NULL, &left, &right, false, false, false, false);
// fake the childnum to read
// set disable_prefetching ON
bfe.child_to_read = 2;
bfe.disable_prefetching = true;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
// need to call this twice because we had a subset read before, that touched the clock
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
// need to call this twice because we had a subset read before that touched
// the clock
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_ON_DISK);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_ON_DISK);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_free(&dn);
toku_free(ndd);
// fake the childnum to read
bfe.child_to_read = 2;
bfe.disable_prefetching = false;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
// need to call this twice because we had a subset read before, that touched the clock
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_AVAIL);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_COMPRESSED);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
// need to call this twice because we had a subset read before that touched
// the clock
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_ON_DISK);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_AVAIL);
invariant(BP_STATE(dn, 0) == PT_ON_DISK);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_AVAIL);
toku_ftnode_free(&dn);
toku_free(ndd);
// fake the childnum to read
bfe.child_to_read = 0;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
assert(r==0);
assert(dn->n_children == 3);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
// need to call this twice because we had a subset read before, that touched the clock
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_ON_DISK);
toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(dn,0) == PT_COMPRESSED);
assert(BP_STATE(dn,1) == PT_COMPRESSED);
assert(BP_STATE(dn,2) == PT_ON_DISK);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
invariant(r == 0);
invariant(dn->n_children == 3);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
// need to call this twice because we had a subset read before that touched
// the clock
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_pe_callback(
dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
assert(BP_STATE(dn,0) == PT_AVAIL);
assert(BP_STATE(dn,1) == PT_AVAIL);
assert(BP_STATE(dn,2) == PT_ON_DISK);
invariant(BP_STATE(dn, 0) == PT_AVAIL);
invariant(BP_STATE(dn, 1) == PT_AVAIL);
invariant(BP_STATE(dn, 2) == PT_ON_DISK);
toku_ftnode_free(&dn);
toku_free(ndd);
toku_free(cursor);
}
static void
test_prefetching(void) {
static void test_prefetching(void) {
// struct ft_handle source_ft;
struct ftnode sn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -327,7 +342,7 @@ test_prefetching(void) {
uint64_t key1 = 100;
uint64_t key2 = 200;
MALLOC_N(sn.n_children, sn.bp);
DBT pivotkeys[2];
toku_fill_dbt(&pivotkeys[0], &key1, sizeof(key1));
@ -336,13 +351,13 @@ test_prefetching(void) {
BP_BLOCKNUM(&sn, 0).b = 30;
BP_BLOCKNUM(&sn, 1).b = 35;
BP_BLOCKNUM(&sn, 2).b = 40;
BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL;
BP_STATE(&sn,2) = PT_AVAIL;
BP_STATE(&sn, 0) = PT_AVAIL;
BP_STATE(&sn, 1) = PT_AVAIL;
BP_STATE(&sn, 2) = PT_AVAIL;
set_BNC(&sn, 0, toku_create_empty_nl());
set_BNC(&sn, 1, toku_create_empty_nl());
set_BNC(&sn, 2, toku_create_empty_nl());
//Create XIDS
// Create XIDS
XIDS xids_0 = toku_xids_get_root_xids();
XIDS xids_123;
XIDS xids_234;
@ -352,7 +367,7 @@ test_prefetching(void) {
CKERR(r);
// data in the buffers does not matter in this test
//Cleanup:
// Cleanup:
toku_xids_destroy(&xids_0);
toku_xids_destroy(&xids_123);
toku_xids_destroy(&xids_234);
@ -363,41 +378,48 @@ test_prefetching(void) {
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(int64_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
test_prefetch_read(fd, ft, ft_h);
test_prefetch_read(fd, ft, ft_h);
test_subset_read(fd, ft, ft_h);
toku_destroy_ftnode_internals(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
ft_h->cmp.destroy();
toku_free(ft_h->h);
@ -405,11 +427,12 @@ test_prefetching(void) {
toku_free(ft);
toku_free(ndd);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
test_prefetching();
return 0;

View file

@ -40,38 +40,28 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include "ft/cursor.h"
enum ftnode_verify_type {
read_all=1,
read_compressed,
read_none
};
enum ftnode_verify_type { read_all = 1, read_compressed, read_none };
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
static int
string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
{
static int string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) {
char *CAST_FROM_VOIDP(s, a->data);
char *CAST_FROM_VOIDP(t, b->data);
return strcmp(s, t);
}
static void
le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keylen, const char *val, int vallen)
{
static void le_add_to_bn(bn_data *bn,
uint32_t idx,
const char *key,
int keylen,
const char *val,
int vallen) {
LEAFENTRY r = NULL;
uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
void *maybe_free = nullptr;
bn->get_space_for_insert(
idx,
key,
keylen,
size_needed,
&r,
&maybe_free
);
bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free);
if (maybe_free) {
toku_free(maybe_free);
}
@ -81,70 +71,67 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keylen, const char
memcpy(r->u.clean.val, val, vallen);
}
static void
le_malloc(bn_data* bn, uint32_t idx, const char *key, const char *val)
{
static void le_malloc(bn_data *bn,
uint32_t idx,
const char *key,
const char *val) {
int keylen = strlen(key) + 1;
int vallen = strlen(val) + 1;
le_add_to_bn(bn, idx, key, keylen, val, vallen);
}
static void
test1(int fd, FT ft_h, FTNODE *dn) {
static void test1(int fd, FT ft_h, FTNODE *dn) {
int r;
ftnode_fetch_extra bfe_all;
bfe_all.create_for_full_read(ft_h);
FTNODE_DISK_DATA ndd = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_all);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_all);
bool is_leaf = ((*dn)->height == 0);
assert(r==0);
invariant(r == 0);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
// should sweep and NOT get rid of anything
PAIR_ATTR attr;
memset(&attr,0,sizeof(attr));
memset(&attr, 0, sizeof(attr));
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
// should sweep and get compress all
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
if (!is_leaf) {
assert(BP_STATE(*dn,i) == PT_COMPRESSED);
}
else {
assert(BP_STATE(*dn,i) == PT_ON_DISK);
invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
} else {
invariant(BP_STATE(*dn, i) == PT_ON_DISK);
}
}
PAIR_ATTR size;
bool req = toku_ftnode_pf_req_callback(*dn, &bfe_all);
assert(req);
invariant(req);
toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size);
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
// should sweep and get compress all
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
if (!is_leaf) {
assert(BP_STATE(*dn,i) == PT_COMPRESSED);
invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
} else {
invariant(BP_STATE(*dn, i) == PT_ON_DISK);
}
else {
assert(BP_STATE(*dn,i) == PT_ON_DISK);
}
}
}
req = toku_ftnode_pf_req_callback(*dn, &bfe_all);
assert(req);
invariant(req);
toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size);
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
(*dn)->dirty = 1;
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
@ -152,101 +139,102 @@ test1(int fd, FT ft_h, FTNODE *dn) {
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn,i) == PT_AVAIL);
invariant(BP_STATE(*dn, i) == PT_AVAIL);
}
toku_free(ndd);
toku_ftnode_free(dn);
}
static int search_cmp(const struct ft_search& UU(so), const DBT* UU(key)) {
static int search_cmp(const struct ft_search &UU(so), const DBT *UU(key)) {
return 0;
}
static void
test2(int fd, FT ft_h, FTNODE *dn) {
static void test2(int fd, FT ft_h, FTNODE *dn) {
DBT left, right;
DB dummy_db;
memset(&dummy_db, 0, sizeof(dummy_db));
memset(&left, 0, sizeof(left));
memset(&right, 0, sizeof(right));
ft_search search;
ftnode_fetch_extra bfe_subset;
bfe_subset.create_for_subset_read(
ft_h,
ft_search_init(&search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr),
ft_search_init(
&search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr),
&left,
&right,
true,
true,
false,
false
);
false);
FTNODE_DISK_DATA ndd = NULL;
int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_subset);
assert(r==0);
int r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_subset);
invariant(r == 0);
bool is_leaf = ((*dn)->height == 0);
// at this point, although both partitions are available, only the
// at this point, although both partitions are available, only the
// second basement node should have had its clock
// touched
assert(BP_STATE(*dn, 0) == PT_AVAIL);
assert(BP_STATE(*dn, 1) == PT_AVAIL);
assert(BP_SHOULD_EVICT(*dn, 0));
assert(!BP_SHOULD_EVICT(*dn, 1));
invariant(BP_STATE(*dn, 0) == PT_AVAIL);
invariant(BP_STATE(*dn, 1) == PT_AVAIL);
invariant(BP_SHOULD_EVICT(*dn, 0));
invariant(!BP_SHOULD_EVICT(*dn, 1));
PAIR_ATTR attr;
memset(&attr,0,sizeof(attr));
memset(&attr, 0, sizeof(attr));
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(*dn, 0) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
assert(BP_STATE(*dn, 1) == PT_AVAIL);
assert(BP_SHOULD_EVICT(*dn, 1));
invariant(BP_STATE(*dn, 0) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
invariant(BP_STATE(*dn, 1) == PT_AVAIL);
invariant(BP_SHOULD_EVICT(*dn, 1));
toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
assert(BP_STATE(*dn, 1) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
invariant(BP_STATE(*dn, 1) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
bool req = toku_ftnode_pf_req_callback(*dn, &bfe_subset);
assert(req);
invariant(req);
toku_ftnode_pf_callback(*dn, ndd, &bfe_subset, fd, &attr);
assert(BP_STATE(*dn, 0) == PT_AVAIL);
assert(BP_STATE(*dn, 1) == PT_AVAIL);
assert(BP_SHOULD_EVICT(*dn, 0));
assert(!BP_SHOULD_EVICT(*dn, 1));
invariant(BP_STATE(*dn, 0) == PT_AVAIL);
invariant(BP_STATE(*dn, 1) == PT_AVAIL);
invariant(BP_SHOULD_EVICT(*dn, 0));
invariant(!BP_SHOULD_EVICT(*dn, 1));
toku_free(ndd);
toku_ftnode_free(dn);
}
static void
test3_leaf(int fd, FT ft_h, FTNODE *dn) {
static void test3_leaf(int fd, FT ft_h, FTNODE *dn) {
DBT left, right;
DB dummy_db;
memset(&dummy_db, 0, sizeof(dummy_db));
memset(&left, 0, sizeof(left));
memset(&right, 0, sizeof(right));
ftnode_fetch_extra bfe_min;
bfe_min.create_for_min_read(ft_h);
FTNODE_DISK_DATA ndd = NULL;
int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_min);
assert(r==0);
int r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_min);
invariant(r == 0);
//
// make sure we have a leaf
//
assert((*dn)->height == 0);
invariant((*dn)->height == 0);
for (int i = 0; i < (*dn)->n_children; i++) {
assert(BP_STATE(*dn, i) == PT_ON_DISK);
invariant(BP_STATE(*dn, i) == PT_ON_DISK);
}
toku_ftnode_free(dn);
toku_free(ndd);
}
static void
test_serialize_nonleaf(void) {
static void test_serialize_nonleaf(void) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -265,11 +253,11 @@ test_serialize_nonleaf(void) {
sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "hello", 6), 1);
BP_BLOCKNUM(&sn, 0).b = 30;
BP_BLOCKNUM(&sn, 1).b = 35;
BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL;
BP_STATE(&sn, 0) = PT_AVAIL;
BP_STATE(&sn, 1) = PT_AVAIL;
set_BNC(&sn, 0, toku_create_empty_nl());
set_BNC(&sn, 1, toku_create_empty_nl());
//Create XIDS
// Create XIDS
XIDS xids_0 = toku_xids_get_root_xids();
XIDS xids_123;
XIDS xids_234;
@ -281,11 +269,38 @@ test_serialize_nonleaf(void) {
toku::comparator cmp;
cmp.create(string_key_cmp, nullptr);
toku_bnc_insert_msg(BNC(&sn, 0), "a", 2, "aval", 5, FT_NONE, next_dummymsn(), xids_0, true, cmp);
toku_bnc_insert_msg(BNC(&sn, 0), "b", 2, "bval", 5, FT_NONE, next_dummymsn(), xids_123, false, cmp);
toku_bnc_insert_msg(BNC(&sn, 1), "x", 2, "xval", 5, FT_NONE, next_dummymsn(), xids_234, true, cmp);
toku_bnc_insert_msg(BNC(&sn, 0),
"a",
2,
"aval",
5,
FT_NONE,
next_dummymsn(),
xids_0,
true,
cmp);
toku_bnc_insert_msg(BNC(&sn, 0),
"b",
2,
"bval",
5,
FT_NONE,
next_dummymsn(),
xids_123,
false,
cmp);
toku_bnc_insert_msg(BNC(&sn, 1),
"x",
2,
"xval",
5,
FT_NONE,
next_dummymsn(),
xids_234,
true,
cmp);
//Cleanup:
// Cleanup:
toku_xids_destroy(&xids_0);
toku_xids_destroy(&xids_123);
toku_xids_destroy(&xids_234);
@ -297,35 +312,41 @@ test_serialize_nonleaf(void) {
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(string_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
test1(fd, ft_h, &dn);
test2(fd, ft_h, &dn);
@ -333,22 +354,26 @@ test_serialize_nonleaf(void) {
toku_destroy_ftnode_internals(&sn);
toku_free(ndd);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
toku_free(ft_h->h);
ft_h->cmp.destroy();
toku_free(ft_h);
toku_free(ft);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
static void
test_serialize_leaf(void) {
static void test_serialize_leaf(void) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -364,8 +389,8 @@ test_serialize_leaf(void) {
MALLOC_N(sn.n_children, sn.bp);
DBT pivotkey;
sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "b", 2), 1);
BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL;
BP_STATE(&sn, 0) = PT_AVAIL;
BP_STATE(&sn, 1) = PT_AVAIL;
set_BLB(&sn, 0, toku_create_empty_bn());
set_BLB(&sn, 1, toku_create_empty_bn());
le_malloc(BLB_DATA(&sn, 0), 0, "a", "aval");
@ -378,51 +403,59 @@ test_serialize_leaf(void) {
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
test1(fd, ft_h, &dn);
test3_leaf(fd, ft_h,&dn);
test3_leaf(fd, ft_h, &dn);
toku_destroy_ftnode_internals(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
toku_free(ft_h->h);
toku_free(ft_h);
toku_free(ft);
toku_free(ndd);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
initialize_dummymsn();
test_serialize_nonleaf();
test_serialize_leaf();

View file

@ -41,27 +41,21 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#include <sys/time.h>
#include "test.h"
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
const double USECS_PER_SEC = 1000000.0;
static void
le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int vallen)
{
static void le_add_to_bn(bn_data *bn,
uint32_t idx,
char *key,
int keylen,
char *val,
int vallen) {
LEAFENTRY r = NULL;
uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
void *maybe_free = nullptr;
bn->get_space_for_insert(
idx,
key,
keylen,
size_needed,
&r,
&maybe_free
);
bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free);
if (maybe_free) {
toku_free(maybe_free);
}
@ -71,20 +65,24 @@ le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int va
memcpy(r->u.clean.val, val, vallen);
}
static int
long_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
{
static int long_key_cmp(DB *UU(e), const DBT *a, const DBT *b) {
const long *CAST_FROM_VOIDP(x, a->data);
const long *CAST_FROM_VOIDP(y, b->data);
return (*x > *y) - (*x < *y);
}
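The comparator above uses the (x > y) - (x < y) idiom: unlike returning x - y, it always yields exactly -1, 0, or +1 and cannot overflow for extreme long values. A minimal standalone illustration (three_way is our name for the sketch, not part of the tree):

    #include <cassert>

    // Branchless three-way compare: (x > y) - (x < y) is -1, 0, or +1.
    static int three_way(long x, long y) {
        return (x > y) - (x < y);
    }

    int main(void) {
        assert(three_way(1L, 2L) == -1);
        assert(three_way(2L, 2L) == 0);
        assert(three_way(3L, 2L) == 1);
        return 0;
    }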
static void
test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
static void test_serialize_leaf(int valsize,
int nelts,
double entropy,
int ser_runs,
int deser_runs) {
// struct ft_handle source_ft;
struct ftnode *sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -102,7 +100,7 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
MALLOC_N(sn->n_children, sn->bp);
sn->pivotkeys.create_empty();
for (int i = 0; i < sn->n_children; ++i) {
BP_STATE(sn,i) = PT_AVAIL;
BP_STATE(sn, i) = PT_AVAIL;
set_BLB(sn, i, toku_create_empty_bn());
}
int nperbn = nelts / sn->n_children;
@ -112,24 +110,19 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
k = ck * nperbn + i;
char buf[valsize];
int c;
for (c = 0; c < valsize * entropy; ) {
int *p = (int *) &buf[c];
for (c = 0; c < valsize * entropy;) {
int *p = (int *)&buf[c];
*p = rand();
c += sizeof(*p);
}
memset(&buf[c], 0, valsize - c);
le_add_to_bn(
BLB_DATA(sn,ck),
i,
(char *)&k,
sizeof k,
buf,
sizeof buf
);
BLB_DATA(sn, ck), i, (char *)&k, sizeof k, buf, sizeof buf);
}
if (ck < 7) {
DBT pivotkey;
sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)), ck);
sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)),
ck);
}
}
@ -139,31 +132,36 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(long_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
struct timeval total_start;
@ -176,8 +174,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
gettimeofday(&t[0], NULL);
ndd = NULL;
sn->dirty = 1;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), sn, &ndd, true, ft->ft, false);
invariant(r == 0);
gettimeofday(&t[1], NULL);
total_start.tv_sec += t[0].tv_sec;
total_start.tv_usec += t[0].tv_usec;
@ -186,12 +185,14 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
toku_free(ndd);
}
double dt;
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt = (total_end.tv_sec - total_start.tv_sec) +
((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= ser_runs;
printf("serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs);
printf(
"serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs);
//reset
// reset
total_start.tv_sec = total_start.tv_usec = 0;
total_end.tv_sec = total_end.tv_usec = 0;
@ -200,8 +201,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
bfe.create_for_full_read(ft_h);
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd2 = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
assert(r==0);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe);
invariant(r == 0);
gettimeofday(&t[1], NULL);
total_start.tv_sec += t[0].tv_sec;
@ -212,35 +214,46 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
toku_ftnode_free(&dn);
toku_free(ndd2);
}
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt = (total_end.tv_sec - total_start.tv_sec) +
((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= deser_runs;
printf("deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (average of %d runs)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
printf(
"deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
printf(
"io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf "
"(average of %d runs)\n",
tokutime_to_seconds(bfe.io_time) * 1000,
tokutime_to_seconds(bfe.decompress_time) * 1000,
tokutime_to_seconds(bfe.deserialize_time) * 1000,
deser_runs);
toku_ftnode_free(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
ft_h->cmp.destroy();
toku_free(ft_h->h);
toku_free(ft_h);
toku_free(ft);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
static void
test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
static void test_serialize_nonleaf(int valsize,
int nelts,
double entropy,
int ser_runs,
int deser_runs) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
int fd = open(TOKU_TEST_FILENAME,
O_RDWR | O_CREAT | O_BINARY,
S_IRWXU | S_IRWXG | S_IRWXO);
invariant(fd >= 0);
int r;
@ -257,11 +270,11 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
MALLOC_N(sn.n_children, sn.bp);
sn.pivotkeys.create_empty();
for (int i = 0; i < sn.n_children; ++i) {
BP_BLOCKNUM(&sn, i).b = 30 + (i*5);
BP_STATE(&sn,i) = PT_AVAIL;
BP_BLOCKNUM(&sn, i).b = 30 + (i * 5);
BP_STATE(&sn, i) = PT_AVAIL;
set_BNC(&sn, i, toku_create_empty_nl());
}
//Create XIDS
// Create XIDS
XIDS xids_0 = toku_xids_get_root_xids();
XIDS xids_123;
r = toku_xids_create_child(xids_0, &xids_123, (TXNID)123);
@ -276,14 +289,23 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
k = ck * nperchild + i;
char buf[valsize];
int c;
for (c = 0; c < valsize * entropy; ) {
int *p = (int *) &buf[c];
for (c = 0; c < valsize * entropy;) {
int *p = (int *)&buf[c];
*p = rand();
c += sizeof(*p);
}
memset(&buf[c], 0, valsize - c);
toku_bnc_insert_msg(bnc, &k, sizeof k, buf, valsize, FT_NONE, next_dummymsn(), xids_123, true, cmp);
toku_bnc_insert_msg(bnc,
&k,
sizeof k,
buf,
valsize,
FT_NONE,
next_dummymsn(),
xids_123,
true,
cmp);
}
if (ck < 7) {
DBT pivotkey;
@ -291,7 +313,7 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
}
}
//Cleanup:
// Cleanup:
toku_xids_destroy(&xids_0);
toku_xids_destroy(&xids_123);
cmp.destroy();
@ -302,65 +324,78 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
4 * 1024 * 1024,
128 * 1024,
TOKU_DEFAULT_COMPRESSION_METHOD,
16);
ft_h->cmp.create(long_key_cmp, nullptr);
ft->ft = ft_h;
ft_h->blocktable.create();
{ int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
//Want to use block #20
{
int r_truncate = ftruncate(fd, 0);
CKERR(r_truncate);
}
// Want to use block #20
BLOCKNUM b = make_blocknum(0);
while (b.b < 20) {
ft_h->blocktable.allocate_blocknum(&b, ft_h);
}
assert(b.b == 20);
invariant(b.b == 20);
{
DISKOFF offset;
DISKOFF size;
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
assert(size == 100);
invariant(offset ==
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
invariant(size == 100);
}
struct timeval t[2];
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
assert(r==0);
r = toku_serialize_ftnode_to(
fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
invariant(r == 0);
gettimeofday(&t[1], NULL);
double dt;
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt = (t[1].tv_sec - t[0].tv_sec) +
((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt *= 1000;
printf("serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
printf(
"serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
ftnode_fetch_extra bfe;
bfe.create_for_full_read(ft_h);
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd2 = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
assert(r==0);
r = toku_deserialize_ftnode_from(
fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe);
invariant(r == 0);
gettimeofday(&t[1], NULL);
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt = (t[1].tv_sec - t[0].tv_sec) +
((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
dt *= 1000;
printf("deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (IGNORED RUNS=%d)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
printf(
"deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
printf(
"io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf "
"(IGNORED RUNS=%d)\n",
tokutime_to_seconds(bfe.io_time) * 1000,
tokutime_to_seconds(bfe.decompress_time) * 1000,
tokutime_to_seconds(bfe.deserialize_time) * 1000,
deser_runs);
toku_ftnode_free(&dn);
toku_destroy_ftnode_internals(&sn);
ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
ft_h->blocktable.block_free(
BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
ft_h->blocktable.destroy();
toku_free(ft_h->h);
ft_h->cmp.destroy();
@ -369,17 +404,21 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
toku_free(ndd);
toku_free(ndd2);
r = close(fd); assert(r != -1);
r = close(fd);
invariant(r != -1);
}
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
int test_main(int argc __attribute__((__unused__)),
const char *argv[] __attribute__((__unused__))) {
const int DEFAULT_RUNS = 5;
long valsize, nelts, ser_runs = DEFAULT_RUNS, deser_runs = DEFAULT_RUNS;
double entropy = 0.3;
if (argc != 3 && argc != 5) {
fprintf(stderr, "Usage: %s <valsize> <nelts> [<serialize_runs> <deserialize_runs>]\n", argv[0]);
fprintf(stderr,
"Usage: %s <valsize> <nelts> [<serialize_runs> "
"<deserialize_runs>]\n",
argv[0]);
fprintf(stderr, "Default (and min) runs is %d\n", DEFAULT_RUNS);
return 2;
}
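The elapsed-time arithmetic used throughout this benchmark reduces to a timeval delta converted to milliseconds and averaged over the run count. A self-contained sketch of just that computation (elapsed_ms_per_run is an illustrative helper, not part of the tree):

    #include <cstdio>
    #include <sys/time.h>

    // Convert a start/end timeval pair into average milliseconds per run,
    // mirroring the dt computations in the benchmark above.
    static double elapsed_ms_per_run(struct timeval start, struct timeval end,
                                     int runs) {
        double dt = (end.tv_sec - start.tv_sec) +
                    ((end.tv_usec - start.tv_usec) / 1000000.0);
        return dt * 1000.0 / runs;   // seconds -> ms, averaged over runs
    }

    int main(void) {
        struct timeval t0, t1;
        gettimeofday(&t0, NULL);
        // ... the work being timed would run here ...
        gettimeofday(&t1, NULL);
        printf("%0.05lf ms (1 run)\n", elapsed_ms_per_run(t0, t1, 1));
        return 0;
    }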

File diff suppressed because it is too large

View file

@ -164,17 +164,16 @@ static void test_read_what_was_written (void) {
int r;
const int NVALS=10000;
if (verbose) printf("test_read_what_was_written(): "); fflush(stdout);
if (verbose) {
printf("test_read_what_was_written(): "); fflush(stdout);
}
unlink(fname);
toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
r = toku_open_ft_handle(fname, 1, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); assert(r==0);
r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0);
toku_cachetable_close(&ct);
toku_cachetable_close(&ct);
/* Now see if we can read an empty tree in. */
toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
@ -189,8 +188,6 @@ static void test_read_what_was_written (void) {
r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0);
toku_cachetable_close(&ct);
/* Now see if we can read it in and get the value. */
toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
r = toku_open_ft_handle(fname, 0, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); assert(r==0);

View file

@ -109,7 +109,9 @@ static int run_test(void)
r = pqueue_pop(pq, &node); assert(r==0);
if (verbose) printf("%d : %d\n", i, *(int*)(node->key->data));
if ( *(int*)(node->key->data) != i ) {
if (verbose) printf("FAIL\n"); return -1;
if (verbose)
printf("FAIL\n");
return -1;
}
}
pqueue_free(pq);
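The loop above asserts that popping the pqueue yields keys in ascending order, i.e. min-heap behavior. The same shape of check written against the standard library, purely as an analogy and not the ft pqueue API itself:

    #include <cassert>
    #include <functional>
    #include <queue>
    #include <vector>

    int main(void) {
        // A min-heap must pop keys in ascending order -- the same
        // property the pqueue test checks.
        std::priority_queue<int, std::vector<int>, std::greater<int> > pq;
        int vals[] = {5, 1, 4, 2, 3, 0};
        for (int v : vals)
            pq.push(v);
        for (int i = 0; i < 6; i++) {
            assert(pq.top() == i);
            pq.pop();
        }
        return 0;
    }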

View file

@ -793,7 +793,7 @@ static void test_le_garbage_collection_birdie(void) {
do_garbage_collect = ule_worth_running_garbage_collection(&ule, 200);
invariant(do_garbage_collect);
// It is definately worth doing when the above case is true
// It is definitely worth doing when the above case is true
// and there is more than one provisional entry.
ule.num_cuxrs = 1;
ule.num_puxrs = 2;

View file

@ -72,7 +72,7 @@ static void dummy_update_status(FTNODE UU(child), int UU(dirtied), void* UU(extr
enum { NODESIZE = 1024, KSIZE=NODESIZE-100, TOKU_PSIZE=20 };
static void test_oldest_referenced_xid_gets_propogated(void) {
static void test_oldest_referenced_xid_gets_propagated(void) {
int r;
CACHETABLE ct;
FT_HANDLE t;
@ -166,7 +166,7 @@ static void test_oldest_referenced_xid_gets_propogated(void) {
toku_ft_flush_some_child(t->ft, node, &fa);
// pin the child, verify that oldest referenced xid was
// propogated from parent to child during the flush
// propagated from parent to child during the flush
toku_pin_ftnode(
t->ft,
child_nonleaf_blocknum,
@ -185,6 +185,6 @@ static void test_oldest_referenced_xid_gets_propogated(void) {
int test_main(int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
default_parse_args(argc, argv);
test_oldest_referenced_xid_gets_propogated();
test_oldest_referenced_xid_gets_propagated();
return 0;
}
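To make the invariant under test concrete, here is a deliberately simplified sketch; node_stub and flush_some_child_stub are invented names, not PerconaFT structures. The point is only that a flush must carry the parent's oldest referenced xid down to the child, so the child never garbage-collects entries still visible to live readers:

    #include <algorithm>
    #include <cstdint>

    // Invented stand-in: a node tracks the oldest xid for which it may
    // still need to keep entries.
    struct node_stub {
        uint64_t oldest_referenced_xid_known;
    };

    // A flush refreshes the child's value from the parent (xids only grow
    // over time, so "newer" means a larger lower bound for GC).
    static void flush_some_child_stub(const node_stub &parent,
                                      node_stub &child) {
        child.oldest_referenced_xid_known =
            std::max(child.oldest_referenced_xid_known,
                     parent.oldest_referenced_xid_known);
    }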

View file

@ -36,30 +36,62 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once
#include "ft/serialize/rbtree_mhs.h"
#include "test.h"
#include <algorithm>
#include <vector>
#include <ctime>
#include <cstdlib>
#include <db.h>
static void test_insert_remove(void) {
uint64_t i;
MhsRbTree::Tree *tree = new MhsRbTree::Tree();
verbose = 0;
#include "ft/serialize/block_allocator.h"
tree->Insert({0, 100});
// Block allocation strategy implementations
for (i = 0; i < 10; i++) {
tree->Remove(3);
tree->Remove(2);
}
tree->ValidateBalance();
tree->ValidateMhs();
class block_allocator_strategy {
public:
static struct block_allocator::blockpair *
first_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
for (i = 0; i < 10; i++) {
tree->Insert({5 * i, 3});
}
tree->ValidateBalance();
tree->ValidateMhs();
static struct block_allocator::blockpair *
best_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
uint64_t offset = tree->Remove(2);
invariant(offset == 0);
offset = tree->Remove(10);
invariant(offset == 50);
offset = tree->Remove(3);
invariant(offset == 5);
tree->ValidateBalance();
tree->ValidateMhs();
static struct block_allocator::blockpair *
padded_fit(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment);
tree->Insert({48, 2});
tree->Insert({50, 10});
static struct block_allocator::blockpair *
heat_zone(struct block_allocator::blockpair *blocks_array,
uint64_t n_blocks, uint64_t size, uint64_t alignment,
uint64_t heat);
};
tree->ValidateBalance();
tree->ValidateMhs();
tree->Insert({3, 7});
offset = tree->Remove(10);
invariant(offset == 2);
tree->ValidateBalance();
tree->ValidateMhs();
tree->Dump();
delete tree;
}
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_insert_remove();
if (verbose)
printf("test ok\n");
return 0;
}
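The removed block_allocator_strategy class declared first_fit, best_fit, padded_fit and heat_zone selectors over a sorted blockpair array. As a rough, self-contained illustration of what a first-fit selector does (the blockpair layout and semantics here are assumptions made for the sketch, not PerconaFT's actual types):

    #include <cstddef>
    #include <cstdint>

    // Assumed layout for the sketch: one allocated extent on disk.
    struct blockpair {
        uint64_t offset;
        uint64_t size;
    };

    static uint64_t align_up(uint64_t v, uint64_t alignment) {
        return ((v + alignment - 1) / alignment) * alignment;
    }

    // First fit: walk extents sorted by offset and return the first aligned
    // hole large enough for `size`, or an aligned offset past the last extent.
    static uint64_t first_fit_offset(const blockpair *blocks, size_t n_blocks,
                                     uint64_t size, uint64_t alignment) {
        uint64_t candidate = 0;
        for (size_t i = 0; i < n_blocks; i++) {
            uint64_t aligned = align_up(candidate, alignment);
            if (aligned + size <= blocks[i].offset)
                return aligned;                // fits in the hole before block i
            candidate = blocks[i].offset + blocks[i].size;
        }
        return align_up(candidate, alignment); // append after the last block
    }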

View file

@ -0,0 +1,102 @@
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.
Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
----------------------------------------
PerconaFT is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License, version 3,
as published by the Free Software Foundation.
PerconaFT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */
#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#include "ft/serialize/rbtree_mhs.h"
#include "test.h"
#include <algorithm>
#include <vector>
#include <ctime>
#include <cstdlib>
#define N 1000000
std::vector<MhsRbTree::Node::BlockPair> input_vector;
MhsRbTree::Node::BlockPair old_vector[N];
static int myrandom(int i) { return std::rand() % i; }
static void generate_random_input() {
std::srand(unsigned(std::time(0)));
// set some values:
for (uint64_t i = 1; i < N; ++i) {
input_vector.push_back({i, 0});
old_vector[i] = {i, 0};
}
// using built-in random generator:
std::random_shuffle(input_vector.begin(), input_vector.end(), myrandom);
}
static void test_insert_remove(void) {
int i;
MhsRbTree::Tree *tree = new MhsRbTree::Tree();
verbose = 0;
generate_random_input();
if (verbose) {
printf("\n we are going to insert the following block offsets\n");
for (i = 0; i < N; i++)
printf("%" PRIu64 "\t", input_vector[i]._offset.ToInt());
}
for (i = 0; i < N; i++) {
tree->Insert(input_vector[i]);
// tree->ValidateBalance();
}
tree->ValidateBalance();
MhsRbTree::Node::BlockPair *p_bps = &old_vector[0];
tree->ValidateInOrder(p_bps);
printf("min node of the tree:%" PRIu64 "\n",
rbn_offset(tree->MinNode()).ToInt());
printf("max node of the tree:%" PRIu64 "\n",
rbn_offset(tree->MaxNode()).ToInt());
for (i = 0; i < N; i++) {
// tree->ValidateBalance();
tree->RawRemove(input_vector[i]._offset.ToInt());
}
tree->Destroy();
delete tree;
}
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_insert_remove();
if (verbose)
printf("test ok\n");
return 0;
}
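One portability note on the test above: std::random_shuffle with a custom generator argument is deprecated in C++14 and removed in C++17. If this test is ever built under a newer standard, std::shuffle with an explicit engine is the usual replacement; a minimal sketch:

    #include <algorithm>
    #include <random>
    #include <vector>

    int main(void) {
        std::vector<int> v;
        for (int i = 1; i <= 5; i++)
            v.push_back(i);
        // Replacement for std::random_shuffle(v.begin(), v.end(), myrandom):
        std::mt19937 rng(std::random_device{}());
        std::shuffle(v.begin(), v.end(), rng);
        return 0;
    }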

View file

@ -49,7 +49,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
// functionality provided by roll.c is exposed by an autogenerated
// header file, logheader.h
//
// this (poorly) explains the absense of "roll.h"
// this (poorly) explains the absence of "roll.h"
// these flags control whether or not we send commit messages for
// various operations

View file

@ -169,7 +169,7 @@ int toku_rollback_commit(TOKUTXN txn, LSN lsn) {
txn->roll_info.spilled_rollback_head = ROLLBACK_NONE;
txn->roll_info.spilled_rollback_tail = ROLLBACK_NONE;
}
// if we're commiting a child rollback, put its entries into the parent
// if we're committing a child rollback, put its entries into the parent
// by pinning both child and parent and then linking the child log entry
// list to the end of the parent log entry list.
if (txn_has_current_rollback_log(txn)) {

View file

@ -59,21 +59,18 @@ rollback_log_destroy(ROLLBACK_LOG_NODE log) {
// flush an unused log to disk, by allocating a size 0 blocknum in
// the blocktable
static void
toku_rollback_flush_unused_log(
ROLLBACK_LOG_NODE log,
BLOCKNUM logname,
int fd,
FT ft,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone
)
{
static void toku_rollback_flush_unused_log(ROLLBACK_LOG_NODE log,
BLOCKNUM logname,
int fd,
FT ft,
bool write_me,
bool keep_me,
bool for_checkpoint,
bool is_clone) {
if (write_me) {
DISKOFF offset;
ft->blocktable.realloc_on_disk(logname, 0, &offset, ft, fd, for_checkpoint, INT_MAX);
ft->blocktable.realloc_on_disk(
logname, 0, &offset, ft, fd, for_checkpoint);
}
if (!keep_me && !is_clone) {
toku_free(log);

View file

@ -587,8 +587,8 @@ bool toku_le_worth_running_garbage_collection(
// by new txns.
// 2.) There is only one committed entry, but the outermost
// provisional entry is older than the oldest known referenced
// xid, so it must have commited. Therefor we can promote it to
// committed and get rid of the old commited entry.
// xid, so it must have committed. Therefore we can promote it to
// committed and get rid of the old committed entry.
if (le->type != LE_MVCC) {
return false;
}
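Condition 2 is the one quoted in full above; as a purely illustrative paraphrase (the function and parameter names are invented, not PerconaFT's):

    #include <cstdint>

    // Illustrative only: if the outermost provisional xid is older than the
    // oldest xid any live reader can still reference, that provisional entry
    // must in fact have committed, so it can be promoted and the stale
    // committed entry discarded.
    static bool provisional_entry_must_have_committed(
        uint64_t outermost_provisional_xid, uint64_t oldest_referenced_xid) {
        return outermost_provisional_xid < oldest_referenced_xid;
    }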

Some files were not shown because too many files have changed in this diff