MDEV-5834: Merge Kakao Defragmentation implementation to MariaDB 10.1

Merge https://github.com/kakao/mariadb-10.0 that contains Facebook's
    implementation for defragmentation

    facebook/mysql-5.6@a2d3a74
    facebook/mysql-5.6@def96c8
    facebook/mysql-5.6@9c67c5d
    facebook/mysql-5.6@921a81b
    facebook/mysql-5.6@aa519bd
    facebook/mysql-5.6@fea7d13
    facebook/mysql-5.6@09b29d3
    facebook/mysql-5.6@9284abb
    facebook/mysql-5.6@dbd623d
    facebook/mysql-5.6@aed55dc
    facebook/mysql-5.6@aad5c82

    This version does not add new SQL syntax or a new handler API function.
    Instead, OPTIMIZE TABLE is mapped to defragmentation when
    innodb_defragment=ON; by default the feature is OFF.

    Contains changes authored by Sunguck Lee (Kakao).
This commit is contained in:
Jan Lindström 2014-08-06 15:28:58 +03:00
commit 6dad23f04a
91 changed files with 5772 additions and 168 deletions

View file

@ -0,0 +1,29 @@
include/master-slave.inc
[connection master]
drop table if exists t1;
create table t1(a int not null primary key auto_increment, b varchar(256), key second(b)) engine=innodb;
insert into t1 values (1, REPEAT("a", 256));
insert into t1 values (2, REPEAT("a", 256));
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
drop table t1;
show binlog events in 'master-bin.000001' from 313;
Log_name Pos Event_type Server_id End_log_pos Info
master-bin.000001 313 Gtid 1 351 GTID 0-1-1
master-bin.000001 351 Query 1 465 use `test`; DROP TABLE IF EXISTS `t1`
master-bin.000001 465 Gtid 1 503 GTID 0-1-2
master-bin.000001 503 Query 1 669 use `test`; create table t1(a int not null primary key auto_increment, b varchar(256), key second(b)) engine=innodb
master-bin.000001 669 Gtid 1 707 BEGIN GTID 0-1-3
master-bin.000001 707 Table_map 1 751 table_id: 82 (test.t1)
master-bin.000001 751 Write_rows_v1 1 1043 table_id: 82 flags: STMT_END_F
master-bin.000001 1043 Xid 1 1070 COMMIT
master-bin.000001 1070 Gtid 1 1108 BEGIN GTID 0-1-4
master-bin.000001 1108 Table_map 1 1152 table_id: 82 (test.t1)
master-bin.000001 1152 Write_rows_v1 1 1444 table_id: 82 flags: STMT_END_F
master-bin.000001 1444 Xid 1 1471 COMMIT
master-bin.000001 1471 Gtid 1 1509 GTID 0-1-5
master-bin.000001 1509 Query 1 1589 use `test`; optimize table t1
master-bin.000001 1589 Gtid 1 1627 GTID 0-1-6
master-bin.000001 1627 Query 1 1731 use `test`; DROP TABLE `t1`
include/rpl_end.inc

View file

@ -0,0 +1,73 @@
DROP TABLE if exists t1;
select @@global.innodb_stats_persistent;
@@global.innodb_stats_persistent
0
set global innodb_defragment_stats_accuracy = 80;
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), c INT, KEY second(a, b),KEY third(c)) ENGINE=INNODB;
SET @@global.innodb_defragment_n_pages = 20;
after populate PRIMARY
select count(*) from t1;
count(*)
20000
after populate second
select count(*) from t1 force index (second);
count(*)
20000
after populate third
select count(*) from t1 force index (third);
count(*)
20000
select count(*) from t1;
count(*)
15800
after delete PRIMAY
select count(*) from t1 force index (second);
count(*)
15800
after delete second
select count(*) from t1 force index (third);
count(*)
15800
after delete third
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed');
count(stat_value) > 0
0
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split');
count(stat_value) > 0
1
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag');
count(stat_value) > 0
1
optimize table t1;;
INSERT INTO t1 VALUES (400000, REPEAT('A', 256),300000);;
INSERT INTO t1 VALUES (500000, REPEAT('A', 256),400000);;
DELETE FROM t1 where a between 1 and 100;;
UPDATE t1 SET c = c + 1 where c between 2000 and 8000;;
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
select sleep(5);
sleep(5)
0
select count(*) from t1;
count(*)
15723
after optimize PRIMARY
select count(*) from t1 force index (second);
count(*)
15723
after optimize second
select count(*) from t1 force index (third);
count(*)
15723
after optimize third
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed');
count(stat_value) > 0
1
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split');
count(stat_value) > 0
1
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag');
count(stat_value) > 0
1
drop table t1;

View file

@ -0,0 +1,94 @@
DROP TABLE if exists t1;
select @@global.innodb_stats_persistent;
@@global.innodb_stats_persistent
0
set global innodb_defragment_stats_accuracy = 20;
# Create table.
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
# Populate data
INSERT INTO t1 VALUES(1, REPEAT('A', 256));
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
# Not enough page splits to trigger persistent stats write yet.
select count(*) from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag');
count(*)
0
INSERT INTO t1 (b) SELECT b from t1;
# Persistent stats recorded.
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag');
count(stat_value) > 0
0
# Delete some rows.
delete from t1 where a between 100 * 20 and 100 * 20 + 30;
delete from t1 where a between 100 * 19 and 100 * 19 + 30;
delete from t1 where a between 100 * 18 and 100 * 18 + 30;
delete from t1 where a between 100 * 17 and 100 * 17 + 30;
delete from t1 where a between 100 * 16 and 100 * 16 + 30;
delete from t1 where a between 100 * 15 and 100 * 15 + 30;
delete from t1 where a between 100 * 14 and 100 * 14 + 30;
delete from t1 where a between 100 * 13 and 100 * 13 + 30;
delete from t1 where a between 100 * 12 and 100 * 12 + 30;
delete from t1 where a between 100 * 11 and 100 * 11 + 30;
delete from t1 where a between 100 * 10 and 100 * 10 + 30;
delete from t1 where a between 100 * 9 and 100 * 9 + 30;
delete from t1 where a between 100 * 8 and 100 * 8 + 30;
delete from t1 where a between 100 * 7 and 100 * 7 + 30;
delete from t1 where a between 100 * 6 and 100 * 6 + 30;
delete from t1 where a between 100 * 5 and 100 * 5 + 30;
delete from t1 where a between 100 * 4 and 100 * 4 + 30;
delete from t1 where a between 100 * 3 and 100 * 3 + 30;
delete from t1 where a between 100 * 2 and 100 * 2 + 30;
delete from t1 where a between 100 * 1 and 100 * 1 + 30;
# Server Restarted
# Confirm persistent stats still there after restart.
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag');
count(stat_value) > 0
0
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
# n_page_split should be 0 after defragmentation, n_pages_freed should be non-zero.
select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
stat_value = 0
1
1
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed', 'n_leaf_pages_defrag');
count(stat_value) > 0
1
set global innodb_defragment_stats_accuracy = 40;
INSERT INTO t1 (b) SELECT b from t1;
# Not enough operation to trigger persistent stats write
select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
stat_value = 0
1
1
INSERT INTO t1 (b) SELECT b from t1;
# Persistent stats write triggered
select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
stat_value > 0
0
0
# Table rename should cause stats rename.
rename table t1 to t2;
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
count(stat_value) > 0
1
# Drop index should cause stats drop.
drop index SECOND on t2;
select count(*) from mysql.innodb_index_stats where table_name like '%t2%' and index_name = 'SECOND';
count(*)
4
Server Restarted
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
count(stat_value) > 0
1
# Clean up
DROP TABLE t2;

View file

@ -0,0 +1,36 @@
DROP TABLE if exists t1;
SET @start_table_definition_cache = @@global.table_definition_cache;
SET @@global.table_definition_cache = 400;
SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy;
SET @@global.innodb_defragment_stats_accuracy = 10;
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
INSERT INTO t1 VALUES(1, REPEAT('A', 256));
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
stat_value > 0
Create 405 table to overflow the table cache.
Sleep for a while to make sure t1 is evicted.
select sleep(10);
sleep(10)
0
Reload t1 to get defrag stats from persistent storage
INSERT INTO t1 (b) SELECT b from t1;
make sure the stats thread will wake up and do the write even if there's a race condition between set and reset.
select sleep(12);
sleep(12)
0
select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
stat_value > 0
SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy;
SET @@global.table_definition_cache = @start_table_definition_cache;
DROP TABLE t1;

View file

@ -0,0 +1,81 @@
DROP TABLE if exists t1;
set global innodb_defragment_stats_accuracy = 80;
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
INSERT INTO t1 VALUES (100000, REPEAT('A', 256));
INSERT INTO t1 VALUES (200000, REPEAT('A', 256));
INSERT INTO t1 VALUES (300000, REPEAT('A', 256));
INSERT INTO t1 VALUES (400000, REPEAT('A', 256));
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
create procedure defragment()
begin
set @i = 0;
repeat
set @i = @i + 1;
optimize table t1;
select sleep(5);
until @i = 3 end repeat;
end //
select count(stat_value) = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed');
count(stat_value) = 0
1
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split');
count(stat_value) > 0
1
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag');
count(stat_value) > 0
1
select count(*) from t1;
count(*)
10004
select count(*) from t1 force index (second);
count(*)
10004
call defragment();
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
select sleep(5);
sleep(5)
0
select count(*) from t1;
count(*)
7904
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
count(stat_value) > 0
0
select count(*) from t1 force index (second);
count(*)
7904
SET @@global.innodb_defragment_n_pages = 3;
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
count(stat_value) > 0
0
select count(*) from t1;
count(*)
6904
select count(*) from t1 force index (second);
count(*)
6904
SET @@global.innodb_defragment_n_pages = 10;
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
count(stat_value) > 0
0
select count(*) from t1;
count(*)
6904
select count(*) from t1 force index (second);
count(*)
6904
DROP PROCEDURE defragment;
DROP TABLE t1;

View file

@ -0,0 +1,59 @@
DROP TABLE if exists t1;
DROP TABLE if exists t2;
Testing tables with large records
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
optimize table t1;
Table Op Msg_type Msg_text
test.t1 optimize status OK
select count(*) from t1;
count(*)
790
select count(*) from t1 force index (second);
count(*)
790
# A few more insertions on the page should not cause a page split.
insert into t1 values (81, REPEAT('A', 256));
insert into t1 values (83, REPEAT('A', 256));
insert into t1 values (87, REPEAT('A', 256));
insert into t1 values (82, REPEAT('A', 256));
insert into t1 values (86, REPEAT('A', 256));
# More insertions will cause page splits
insert into t1 values (88, REPEAT('A', 50));
Too much space are reserved on primary index.
Too much space are reserved on second index.
DROP TABLE t1;
Testing table with small records
CREATE TABLE t2 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARchar(16), KEY SECOND(a,b)) ENGINE=INNODB;
optimize table t2;
Table Op Msg_type Msg_text
test.t2 optimize status OK
select count(*) from t2 force index(second);
count(*)
3701
The page should have room for about 20 insertions
insert into t2 values(1181, REPEAT('A', 16));
insert into t2 values(1191, REPEAT('A', 16));
insert into t2 values(1182, REPEAT('A', 16));
insert into t2 values(1192, REPEAT('A', 16));
insert into t2 values(1183, REPEAT('A', 16));
insert into t2 values(1193, REPEAT('A', 16));
insert into t2 values(1184, REPEAT('A', 16));
insert into t2 values(1194, REPEAT('A', 16));
insert into t2 values(1185, REPEAT('A', 16));
insert into t2 values(1195, REPEAT('A', 16));
insert into t2 values(1186, REPEAT('A', 16));
insert into t2 values(1196, REPEAT('A', 16));
insert into t2 values(1187, REPEAT('A', 16));
insert into t2 values(1197, REPEAT('A', 16));
insert into t2 values(1188, REPEAT('A', 16));
insert into t2 values(1198, REPEAT('A', 16));
insert into t2 values(1189, REPEAT('A', 16));
insert into t2 values(1199, REPEAT('A', 16));
insert into t2 values(1190, REPEAT('A', 16));
insert into t2 values(1180, REPEAT('A', 16));
More insertions will cause page split.
insert into t2 values(1280, REPEAT('A', 16));
insert into t2 values(1290, REPEAT('A', 16));
insert into t2 values(1281, REPEAT('A', 16));
insert into t2 values(1291, REPEAT('A', 16));
DROP TABLE t2;

View file

@ -0,0 +1 @@
--innodb-defragment=0

View file

@ -0,0 +1,5 @@
--loose-innodb-buffer-pool-stats
--loose-innodb-buffer-page
--loose-innodb-buffer-page-lru
--binlog-format=row
--innodb-defragment=1

View file

@ -0,0 +1,19 @@
# Purpose: verify that OPTIMIZE TABLE (mapped to InnoDB defragmentation when
# innodb_defragment=ON, see the -master.opt file) is written to the binary
# log and replicates cleanly under row-based replication.
--source include/have_innodb.inc
--source include/master-slave.inc
--disable_warnings
drop table if exists t1;
--enable_warnings
create table t1(a int not null primary key auto_increment, b varchar(256), key second(b)) engine=innodb;
insert into t1 values (1, REPEAT("a", 256));
insert into t1 values (2, REPEAT("a", 256));
optimize table t1;
drop table t1;
# Strip trailing /* ... */ comments from the binlog event info so the
# recorded output stays stable across versions.
--replace_regex /\/\*.*//
show binlog events in 'master-bin.000001' from 313;
--source include/rpl_end.inc

View file

@ -0,0 +1,4 @@
--loose-innodb-buffer-pool-stats
--loose-innodb-buffer-page
--loose-innodb-buffer-page-lru
--innodb-defragment=1

View file

@ -0,0 +1,180 @@
# Purpose: exercise InnoDB defragmentation (OPTIMIZE TABLE with
# innodb_defragment=ON) while concurrent INSERT/DELETE/UPDATE statements run
# on the same table, then verify the defragmentation statistics recorded in
# mysql.innodb_index_stats.
--source include/have_innodb.inc
--disable_warnings
DROP TABLE if exists t1;
--enable_warnings
# Save the defragmentation settings so they can be restored at the end.
--disable_query_log
let $innodb_defragment_n_pages_orig=`select @@innodb_defragment_n_pages`;
let $innodb_defragment_stats_accuracy_orig=`select @@innodb_defragment_stats_accuracy`;
--enable_query_log
select @@global.innodb_stats_persistent;
set global innodb_defragment_stats_accuracy = 80;
# Create table.
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), c INT, KEY second(a, b),KEY third(c)) ENGINE=INNODB;
# Extra connections for the concurrent DML statements sent below.
connect (con1,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK);
connect (con2,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK);
connect (con3,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK);
connect (con4,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK);
connection default;
SET @@global.innodb_defragment_n_pages = 20;
let $data_size = 20000;
let $delete_size = 2000;
# Populate table.
let $i = $data_size;
--disable_query_log
while ($i)
{
eval
INSERT INTO t1 VALUES ($data_size + 1 - $i, REPEAT('A', 256), $i);
dec $i;
}
--enable_query_log
--echo after populate PRIMARY
select count(*) from t1;
# The if-blocks below fire only on failure: they dump the buffer-pool page
# count as a diagnostic when an index occupies an unexpected number of pages.
# (Fixed: the first diagnostic was spelled "aelect", which would have made
# mysqltest abort with a parse error instead of printing the diagnostic.)
if (`select count(*) < 30 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;
}
--echo after populate second
select count(*) from t1 force index (second);
if (`select count(*) < 320 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;
}
--echo after populate third
select count(*) from t1 force index (third);
if (`select count(*) < 20 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;
}
# Delete some data
--disable_query_log
let $size = $delete_size;
while ($size)
{
let $j = 100 * $size;
eval delete from t1 where a between $j - 20 and $j;
dec $size;
}
--enable_query_log
select count(*) from t1;
# NOTE(review): "PRIMAY" is a historical typo, but it must match the text
# recorded in the .result file, so it is kept as-is.
--echo after delete PRIMAY
if (`select count(*) < 30 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;
}
select count(*) from t1 force index (second);
--echo after delete second
if (`select count(*) < 300 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;
}
select count(*) from t1 force index (third);
--echo after delete third
# NOTE(review): this condition uses '>' while the sibling checks use '<';
# presumably intentional (the third index should stay small) -- confirm.
if (`select count(*) > 20 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;
}
# Above delete will free some pages and insert causes page split and these could cause defrag
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed');
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split');
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag');
# Run OPTIMIZE TABLE (defragmentation) concurrently with DML on 5 connections.
connection con1;
--send optimize table t1;
connection default;
--send INSERT INTO t1 VALUES (400000, REPEAT('A', 256),300000);
connection con2;
--send INSERT INTO t1 VALUES (500000, REPEAT('A', 256),400000);
connection con3;
--send DELETE FROM t1 where a between 1 and 100;
connection con4;
--send UPDATE t1 SET c = c + 1 where c between 2000 and 8000;
# Collect the results of all concurrent statements.
connection con1;
--disable_result_log
--reap
--enable_result_log
connection con2;
--reap
connection con3;
--reap
connection con4;
--reap
connection default;
--reap
disconnect con1;
disconnect con2;
disconnect con3;
disconnect con4;
optimize table t1;
select sleep(5);
select count(*) from t1;
--echo after optimize PRIMARY
if (`select count(*) > 62 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;
}
select count(*) from t1 force index (second);
--echo after optimize second
if (`select count(*) > 340 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;
}
select count(*) from t1 force index (third);
--echo after optimize third
if (`select count(*) > 25 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;
}
# Now pages are freed
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed');
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split');
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag');
drop table t1;
# reset system
--disable_query_log
EVAL SET GLOBAL innodb_defragment_n_pages = $innodb_defragment_n_pages_orig;
EVAL SET GLOBAL innodb_defragment_stats_accuracy = $innodb_defragment_stats_accuracy_orig;
--enable_query_log

View file

@ -0,0 +1 @@
--innodb-defragment=1

View file

@ -0,0 +1,87 @@
# Purpose: verify that InnoDB defragmentation statistics (n_page_split,
# n_pages_freed, n_leaf_pages_defrag) are persisted to
# mysql.innodb_index_stats, survive a server restart, and follow the table
# through RENAME TABLE and DROP INDEX.
--source include/have_innodb.inc
--source include/big_test.inc
--disable_warnings
DROP TABLE if exists t1;
--enable_warnings
# Save the accuracy setting so it can be restored at the end.
--disable_query_log
let $innodb_defragment_stats_accuracy_orig=`select @@innodb_defragment_stats_accuracy`;
--enable_query_log
select @@global.innodb_stats_persistent;
set global innodb_defragment_stats_accuracy = 20;
--echo # Create table.
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
--echo # Populate data
# Each INSERT ... SELECT doubles the row count (1 row doubled 10 times).
INSERT INTO t1 VALUES(1, REPEAT('A', 256));
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
--echo # Not enough page splits to trigger persistent stats write yet.
select count(*) from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag');
INSERT INTO t1 (b) SELECT b from t1;
--echo # Persistent stats recorded.
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag');
--echo # Delete some rows.
# Punch holes in the key range so defragmentation has work to do.
let $num_delete = 20;
while ($num_delete)
{
let $j = 100 * $num_delete;
eval delete from t1 where a between $j and $j + 30;
dec $num_delete;
}
--source include/restart_mysqld.inc
--echo # Server Restarted
--echo # Confirm persistent stats still there after restart.
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag');
optimize table t1;
--echo # n_page_split should be 0 after defragmentation, n_pages_freed should be non-zero.
select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed', 'n_leaf_pages_defrag');
# Raise the accuracy threshold so a single insert batch is not enough to
# trigger a stats sync, but a second one is.
set global innodb_defragment_stats_accuracy = 40;
INSERT INTO t1 (b) SELECT b from t1;
--echo # Not enough operation to trigger persistent stats write
select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
INSERT INTO t1 (b) SELECT b from t1;
--echo # Persistent stats write triggered
select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
--echo # Table rename should cause stats rename.
rename table t1 to t2;
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
--echo # Drop index should cause stats drop.
drop index SECOND on t2;
select count(*) from mysql.innodb_index_stats where table_name like '%t2%' and index_name = 'SECOND';
--source include/restart_mysqld.inc
--echo Server Restarted
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
--echo # Clean up
DROP TABLE t2;
# Restore the original setting.
--disable_query_log
EVAL SET GLOBAL innodb_defragment_stats_accuracy = $innodb_defragment_stats_accuracy_orig;
--enable_query_log

View file

@ -0,0 +1 @@
--innodb-defragment=1

View file

@ -0,0 +1,71 @@
# Purpose: verify that defragmentation stats survive eviction of the table
# from the table definition cache and are reloaded from persistent storage
# (mysql.innodb_index_stats) when the table is touched again.
--source include/have_innodb.inc
--source include/big_test.inc
--disable_warnings
DROP TABLE if exists t1;
--enable_warnings
# More tables than the definition cache (set to 400 below) can hold,
# so t1 is eventually evicted.
let $num_tables = 405;
SET @start_table_definition_cache = @@global.table_definition_cache;
SET @@global.table_definition_cache = 400;
# set stats accuracy to be pretty high so stats sync is easily triggered.
SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy;
SET @@global.innodb_defragment_stats_accuracy = 10;
# Create table.
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
# Populate data
# Each INSERT ... SELECT doubles the row count (1 row doubled 11 times).
INSERT INTO t1 VALUES(1, REPEAT('A', 256));
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
INSERT INTO t1 (b) SELECT b from t1;
select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
# Create many tables to over flow the table definition cache
--echo Create $num_tables table to overflow the table cache.
--disable_query_log
let $count = $num_tables;
while ($count)
{
EVAL CREATE TABLE t_$count (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT) ENGINE=INNODB;
EVAL INSERT INTO t_$count VALUES (1), (2);
dec $count;
}
--enable_query_log
--echo Sleep for a while to make sure t1 is evicted.
select sleep(10);
--echo Reload t1 to get defrag stats from persistent storage
INSERT INTO t1 (b) SELECT b from t1;
--echo make sure the stats thread will wake up and do the write even if there's a race condition between set and reset.
select sleep(12);
select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split';
# Clean up
SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy;
SET @@global.table_definition_cache = @start_table_definition_cache;
# Drop the filler tables created above.
--disable_query_log
let $count = $num_tables;
while ($count)
{
EVAL DROP TABLE t_$count;
dec $count;
}
--enable_query_log
DROP TABLE t1;

View file

@ -0,0 +1,2 @@
--innodb_file_per_table
--innodb-defragment=1

View file

@ -0,0 +1,4 @@
--loose-innodb-buffer-pool-stats
--loose-innodb-buffer-page
--loose-innodb-buffer-page-lru
--innodb-defragment=1

View file

@ -0,0 +1,190 @@
# Purpose: basic InnoDB defragmentation coverage -- empty table, single-page
# table, defragmentation concurrent with deletes (via a stored procedure),
# and the effect of different innodb_defragment_n_pages settings.
--source include/have_innodb.inc
--disable_warnings
DROP TABLE if exists t1;
--enable_warnings
# Save the defragmentation settings so they can be restored later.
--disable_query_log
let $innodb_defragment_n_pages_orig=`select @@innodb_defragment_n_pages`;
let $innodb_defragment_stats_accuracy_orig=`select @@innodb_defragment_stats_accuracy`;
--enable_query_log
set global innodb_defragment_stats_accuracy = 80;
# Create table.
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
## Test-1 defragment an empty table
optimize table t1;
## Test-2 defragment a single page table
INSERT INTO t1 VALUES (100000, REPEAT('A', 256));
INSERT INTO t1 VALUES (200000, REPEAT('A', 256));
INSERT INTO t1 VALUES (300000, REPEAT('A', 256));
INSERT INTO t1 VALUES (400000, REPEAT('A', 256));
optimize table t1;
## Test-3 defragment (somewhat) in parallel with delete queries
let $data_size = 10000;
let $delete_size = 100;
# Procedure that runs OPTIMIZE TABLE three times with 5s pauses, so it
# overlaps with the delete loop issued from the default connection.
delimiter //;
create procedure defragment()
begin
set @i = 0;
repeat
set @i = @i + 1;
optimize table t1;
select sleep(5);
until @i = 3 end repeat;
end //
delimiter ;//
# Populate table.
let $i = $data_size;
--disable_query_log
while ($i)
{
eval
INSERT INTO t1 VALUES ($data_size + 1 - $i, REPEAT('A', 256));
dec $i;
}
--enable_query_log
select count(stat_value) = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed');
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split');
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag');
select count(*) from t1;
# The if-blocks below fire only on failure: they dump the buffer-pool page
# count as a diagnostic when an index has an unexpected number of pages.
if (!`select count(*) > 180 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;
}
select count(*) from t1 force index (second);
if (!`select count(*) > 170 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;
}
# Run the defragment procedure on a second connection while deleting rows.
connect (con1,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK);
connection con1;
--send call defragment()
connection default;
--disable_query_log
let $size = $delete_size;
while ($size)
{
let $j = 100 * $size;
eval delete from t1 where a between $j - 20 and $j;
dec $size;
}
--enable_query_log
connection con1;
--disable_result_log
--reap
--enable_result_log
connection default;
disconnect con1;
optimize table t1;
select sleep(5);
--source include/restart_mysqld.inc
select count(*) from t1;
# NOTE(review): the pattern '%t2%' below (and at the two later occurrences)
# looks wrong -- the table in this test is t1, so these always match nothing.
# The recorded .result expects 0, so changing the query text requires
# re-recording the result; flagged rather than changed here.
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
# After deletion & defragmentation, there are 8000 records left
if (!`select count(*) < 180 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;
}
select count(*) from t1 force index (second);
# secondary index is pretty much the same size as primary index so the number of pages should be similar.
if (!`select count(*) < 180 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;
}
## Test-4 defragment with larger n_pages
# delete some more records
--disable_query_log
let $size = $delete_size;
while ($size)
{
let $j = 100 * $size;
eval delete from t1 where a between $j - 30 and $j - 20;
dec $size;
}
--enable_query_log
SET @@global.innodb_defragment_n_pages = 3;
# This will not reduce number of pages by a lot
optimize table t1;
--source include/restart_mysqld.inc
# NOTE(review): '%t2%' pattern -- see note above; likely should be '%t1%'.
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
select count(*) from t1;
# We didn't create large holes with the previous deletion, so if innodb_defragment_n_pages = 3, we won't be able to free up many pages.
if (!`select count(*) > 130 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;
}
select count(*) from t1 force index (second);
# Same holds for secondary index, not many pages are released.
if (!`select count(*) > 100 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;
}
SET @@global.innodb_defragment_n_pages = 10;
optimize table t1;
--source include/restart_mysqld.inc
# NOTE(review): '%t2%' pattern -- see note above; likely should be '%t1%'.
select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag');
select count(*) from t1;
# This time we used innodb_defragment_n_pages = 10, so we should be able to free up some pages.
if (!`select count(*) < 165 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;
}
select count(*) from t1 force index (second);
if (!`select count(*) < 165 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`)
{
select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;
}
DROP PROCEDURE defragment;
DROP TABLE t1;
# reset system
--disable_query_log
EVAL SET GLOBAL innodb_defragment_n_pages = $innodb_defragment_n_pages_orig;
EVAL SET GLOBAL innodb_defragment_stats_accuracy = $innodb_defragment_stats_accuracy_orig;
--enable_query_log

View file

@ -0,0 +1,4 @@
--loose-innodb-buffer-pool-stats
--loose-innodb-buffer-page
--loose-innodb-buffer-page-lru
--innodb-defragment=1

View file

@ -0,0 +1,130 @@
# Test innodb_defragment_fill_factor: after OPTIMIZE TABLE (mapped to
# defragmentation, see the accompanying .opt file with --innodb-defragment=1)
# pages are packed only up to the fill factor, so a few follow-up inserts
# must fit into the reserved space without a page split, while further
# inserts must eventually split a page.
--source include/have_innodb.inc
--disable_warnings
DROP TABLE if exists t1;
DROP TABLE if exists t2;
--enable_warnings
--echo Testing tables with large records
# Create table.
CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB;
# Populate table.
let $i = 1000;
--disable_query_log
while ($i)
{
eval
INSERT INTO t1 VALUES ($i, REPEAT('A', 256));
dec $i;
}
--enable_query_log
# Delete a 21-row band out of every hundred rows to fragment the indexes.
--disable_query_log
let $size = 10;
while ($size)
{
let $j = 100 * $size;
eval delete from t1 where a between $j - 20 and $j;
dec $size;
}
--enable_query_log
optimize table t1;
--source include/restart_mysqld.inc
select count(*) from t1;
# After deletion & defragmentation, there are 800 records left. Each page can hold about 57 records. We fill the page 90% full,
# so there should be less than 16 pages total.
--let $primary_before = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY', Value, 1)
select count(*) from t1 force index (second);
# secondary index is slightly bigger than primary index so the number of pages should be similar.
--let $second_before = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second', Value, 1)
--echo # A few more insertions on the page should not cause a page split.
insert into t1 values (81, REPEAT('A', 256));
insert into t1 values (83, REPEAT('A', 256));
insert into t1 values (87, REPEAT('A', 256));
insert into t1 values (82, REPEAT('A', 256));
insert into t1 values (86, REPEAT('A', 256));
# Page counts must be unchanged: the fill factor left room for these rows.
--let $primary_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY', Value, 1)
--let $second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second', Value, 1)
if ($primary_before != $primary_after) {
--echo Insertion caused page split on primary, which should be avoided by innodb_defragment_fill_factor.
}
if ($second_before != $second_after) {
--echo Insertion caused page split on second, which should be avoided by innodb_defragment_fill_factor.
}
--echo # More insertions will cause page splits
insert into t1 values (88, REPEAT('A', 50));
#insert into t1 values (85, REPEAT('A', 256));
#insert into t1 values (84, REPEAT('A', 256));
#insert into t1 values (89, REPEAT('A', 256));
# Now the page counts must have grown: reserved space is finite.
--let $primary_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY', Value, 1)
--let $second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second', Value, 1)
if ($primary_before == $primary_after) {
--echo Too much space are reserved on primary index.
}
if ($second_before == $second_after) {
--echo Too much space are reserved on second index.
}
DROP TABLE t1;
--echo Testing table with small records
CREATE TABLE t2 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARchar(16), KEY SECOND(a,b)) ENGINE=INNODB;
# Populate table.
# One seed row plus 12 self-doubling inserts = 4096 rows.
--disable_query_log
INSERT INTO t2 VALUES (1, REPEAT('A', 16));
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
INSERT INTO t2 (b) SELECT b from t2;
--enable_query_log
# Fragment the first 4000 rows the same way as for t1.
--disable_query_log
let $size = 40;
while ($size)
{
let $j = 100 * $size;
eval delete from t2 where a between $j - 20 and $j;
dec $size;
}
--enable_query_log
optimize table t2;
--source include/restart_mysqld.inc
select count(*) from t2 force index(second);
--let $second_before = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t2%' and index_name = 'second', Value, 1)
--echo The page should have room for about 20 insertions
insert into t2 values(1181, REPEAT('A', 16));
insert into t2 values(1191, REPEAT('A', 16));
insert into t2 values(1182, REPEAT('A', 16));
insert into t2 values(1192, REPEAT('A', 16));
insert into t2 values(1183, REPEAT('A', 16));
insert into t2 values(1193, REPEAT('A', 16));
insert into t2 values(1184, REPEAT('A', 16));
insert into t2 values(1194, REPEAT('A', 16));
insert into t2 values(1185, REPEAT('A', 16));
insert into t2 values(1195, REPEAT('A', 16));
insert into t2 values(1186, REPEAT('A', 16));
insert into t2 values(1196, REPEAT('A', 16));
insert into t2 values(1187, REPEAT('A', 16));
insert into t2 values(1197, REPEAT('A', 16));
insert into t2 values(1188, REPEAT('A', 16));
insert into t2 values(1198, REPEAT('A', 16));
insert into t2 values(1189, REPEAT('A', 16));
insert into t2 values(1199, REPEAT('A', 16));
insert into t2 values(1190, REPEAT('A', 16));
insert into t2 values(1180, REPEAT('A', 16));
--let $second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t2%' and index_name = 'second', Value, 1)
if ($second_before != $second_after) {
--echo Insertion caused page split on second, which should be avoided by innodb_defragment_fill_factor.
}
--echo More insertions will cause page split.
insert into t2 values(1280, REPEAT('A', 16));
insert into t2 values(1290, REPEAT('A', 16));
insert into t2 values(1281, REPEAT('A', 16));
insert into t2 values(1291, REPEAT('A', 16));
--let $second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t2%' and index_name = 'second', Value, 1)
if ($second_before == $second_after) {
--echo Too much space are reserved on second index.
}
DROP TABLE t2;

View file

@ -0,0 +1,18 @@
SET @orig = @@global.innodb_defragment;
SELECT @orig;
@orig
0
SET GLOBAL innodb_defragment = OFF;
SELECT @@global.innodb_defragment;
@@global.innodb_defragment
0
SET GLOBAL innodb_defragment = ON;
SELECT @@global.innodb_defragment;
@@global.innodb_defragment
1
SET GLOBAL innodb_defragment = 100;
ERROR 42000: Variable 'innodb_defragment' can't be set to the value of '100'
SELECT @@global.innodb_defragment;
@@global.innodb_defragment
1
SET GLOBAL innodb_defragment = @orig;

View file

@ -0,0 +1,37 @@
SET @start_innodb_defragment_fill_factor = @@global.innodb_defragment_fill_factor;
SELECT @start_innodb_defragment_fill_factor;
@start_innodb_defragment_fill_factor
0.9
SELECT COUNT(@@global.innodb_defragment_fill_factor);
COUNT(@@global.innodb_defragment_fill_factor)
1
SET @@global.innodb_defragment_fill_factor = 0.77777777777777;
SELECT @@global.innodb_defragment_fill_factor;
@@global.innodb_defragment_fill_factor
0.777778
SET @@global.innodb_defragment_fill_factor = 1;
SELECT @@global.innodb_defragment_fill_factor;
@@global.innodb_defragment_fill_factor
1.000000
SET @@global.innodb_defragment_fill_factor = 0.7;
SELECT @@global.innodb_defragment_fill_factor;
@@global.innodb_defragment_fill_factor
0.700000
SET @@global.innodb_defragment_fill_factor = -1;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_fill_factor value: '-1'
SELECT @@global.innodb_defragment_fill_factor;
@@global.innodb_defragment_fill_factor
0.700000
SET @@global.innodb_defragment_fill_factor = 2;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_fill_factor value: '2'
SELECT @@global.innodb_defragment_fill_factor;
@@global.innodb_defragment_fill_factor
1.000000
SET @@global.innodb_defragment_fill_factor = "abc";
ERROR 42000: Incorrect argument type to variable 'innodb_defragment_fill_factor'
SELECT @@global.innodb_defragment_fill_factor;
@@global.innodb_defragment_fill_factor
1.000000
SET @@global.innodb_defragment_fill_factor = @start_innodb_defragment_fill_factor;

View file

@ -0,0 +1,42 @@
SET @start_innodb_defragment_fill_factor_n_recs = @@global.innodb_defragment_fill_factor_n_recs;
SELECT @start_innodb_defragment_fill_factor_n_recs;
@start_innodb_defragment_fill_factor_n_recs
20
SELECT COUNT(@@global.innodb_defragment_fill_factor_n_recs);
COUNT(@@global.innodb_defragment_fill_factor_n_recs)
1
SET @@global.innodb_defragment_fill_factor_n_recs = 50;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
@@global.innodb_defragment_fill_factor_n_recs
50
SET @@global.innodb_defragment_fill_factor_n_recs = 100;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
@@global.innodb_defragment_fill_factor_n_recs
100
SET @@global.innodb_defragment_fill_factor_n_recs = 1;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
@@global.innodb_defragment_fill_factor_n_recs
1
SET @@global.innodb_defragment_fill_factor_n_recs = -1;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_fill_factor_n_ value: '-1'
SELECT @@global.innodb_defragment_fill_factor_n_recs;
@@global.innodb_defragment_fill_factor_n_recs
1
SET @@global.innodb_defragment_fill_factor_n_recs = 10000;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_fill_factor_n_ value: '10000'
SELECT @@global.innodb_defragment_fill_factor_n_recs;
@@global.innodb_defragment_fill_factor_n_recs
100
SET @@global.innodb_defragment_fill_factor_n_recs = 10.5;
ERROR 42000: Incorrect argument type to variable 'innodb_defragment_fill_factor_n_recs'
SELECT @@global.innodb_defragment_fill_factor_n_recs;
@@global.innodb_defragment_fill_factor_n_recs
100
SET @@global.innodb_defragment_fill_factor_n_recs = "abc";
ERROR 42000: Incorrect argument type to variable 'innodb_defragment_fill_factor_n_recs'
SELECT @@global.innodb_defragment_fill_factor_n_recs;
@@global.innodb_defragment_fill_factor_n_recs
100
SET @@global.innodb_defragment_fill_factor_n_recs = @start_innodb_defragment_fill_factor_n_recs;

View file

@ -0,0 +1,42 @@
SET @start_innodb_defragment_frequency = @@global.innodb_defragment_frequency;
SELECT @start_innodb_defragment_frequency;
@start_innodb_defragment_frequency
40
SELECT COUNT(@@global.innodb_defragment_frequency);
COUNT(@@global.innodb_defragment_frequency)
1
SET @@global.innodb_defragment_frequency = 200;
SELECT @@global.innodb_defragment_frequency;
@@global.innodb_defragment_frequency
200
SET @@global.innodb_defragment_frequency = 1;
SELECT @@global.innodb_defragment_frequency;
@@global.innodb_defragment_frequency
1
SET @@global.innodb_defragment_frequency = 1000;
SELECT @@global.innodb_defragment_frequency;
@@global.innodb_defragment_frequency
1000
SET @@global.innodb_defragment_frequency = -1;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_frequency value: '-1'
SELECT @@global.innodb_defragment_frequency;
@@global.innodb_defragment_frequency
1
SET @@global.innodb_defragment_frequency = 10000;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_frequency value: '10000'
SELECT @@global.innodb_defragment_frequency;
@@global.innodb_defragment_frequency
1000
SET @@global.innodb_defragment_frequency = 10.5;
ERROR 42000: Incorrect argument type to variable 'innodb_defragment_frequency'
SELECT @@global.innodb_defragment_frequency;
@@global.innodb_defragment_frequency
1000
SET @@global.innodb_defragment_frequency = "abc";
ERROR 42000: Incorrect argument type to variable 'innodb_defragment_frequency'
SELECT @@global.innodb_defragment_frequency;
@@global.innodb_defragment_frequency
1000
SET @@global.innodb_defragment_frequency = @start_innodb_defragment_frequency;

View file

@ -0,0 +1,28 @@
SET @start_innodb_defragment_n_pages = @@global.innodb_defragment_n_pages;
SELECT @start_innodb_defragment_n_pages;
@start_innodb_defragment_n_pages
7
SELECT COUNT(@@global.innodb_defragment_n_pages);
COUNT(@@global.innodb_defragment_n_pages)
1
SET @@global.innodb_defragment_n_pages = 1;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_n_pages value: '1'
SELECT @@global.innodb_defragment_n_pages;
@@global.innodb_defragment_n_pages
2
SET @@global.innodb_defragment_n_pages = 2;
SELECT @@global.innodb_defragment_n_pages;
@@global.innodb_defragment_n_pages
2
SET @@global.innodb_defragment_n_pages = 32;
SELECT @@global.innodb_defragment_n_pages;
@@global.innodb_defragment_n_pages
32
SET @@global.innodb_defragment_n_pages = 64;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_n_pages value: '64'
SELECT @@global.innodb_defragment_n_pages;
@@global.innodb_defragment_n_pages
32
SET @@global.innodb_defragment_n_pages = @start_innodb_defragment_n_pages;

View file

@ -0,0 +1,33 @@
SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy;
SELECT @start_innodb_defragment_stats_accuracy;
@start_innodb_defragment_stats_accuracy
0
SELECT COUNT(@@global.innodb_defragment_stats_accuracy);
COUNT(@@global.innodb_defragment_stats_accuracy)
1
SET @@global.innodb_defragment_stats_accuracy = 1;
SELECT @@global.innodb_defragment_stats_accuracy;
@@global.innodb_defragment_stats_accuracy
1
SET @@global.innodb_defragment_stats_accuracy = 1000;
SELECT @@global.innodb_defragment_stats_accuracy;
@@global.innodb_defragment_stats_accuracy
1000
SET @@global.innodb_defragment_stats_accuracy = -1;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_stats_accuracy value: '-1'
SELECT @@global.innodb_defragment_stats_accuracy;
@@global.innodb_defragment_stats_accuracy
0
SET @@global.innodb_defragment_stats_accuracy = 1000000000000;
Warnings:
Warning 1292 Truncated incorrect innodb_defragment_stats_accuracy value: '1000000000000'
SELECT @@global.innodb_defragment_stats_accuracy;
@@global.innodb_defragment_stats_accuracy
4294967295
SET @@global.innodb_defragment_stats_accuracy = "abc";
ERROR 42000: Incorrect argument type to variable 'innodb_defragment_stats_accuracy'
SELECT @@global.innodb_defragment_stats_accuracy;
@@global.innodb_defragment_stats_accuracy
4294967295
SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy;

View file

@ -0,0 +1,20 @@
-- source include/have_innodb.inc
# Boolean global variable innodb_defragment: verify the default (OFF, see
# the .result file), ON/OFF toggling, and rejection of a value outside the
# boolean domain; restore the saved value at the end.
# Check the default value
SET @orig = @@global.innodb_defragment;
SELECT @orig;
# Turn off
SET GLOBAL innodb_defragment = OFF;
SELECT @@global.innodb_defragment;
# Turn on
SET GLOBAL innodb_defragment = ON;
SELECT @@global.innodb_defragment;
# Wrong value
--error ER_WRONG_VALUE_FOR_VAR
SET GLOBAL innodb_defragment = 100;
SELECT @@global.innodb_defragment;
# Restore the original value so later tests see the configured setting.
SET GLOBAL innodb_defragment = @orig;

View file

@ -0,0 +1,27 @@
--source include/have_innodb.inc
# Global variable innodb_defragment_fill_factor: probe valid fractional
# values and out-of-range values (-1 and 2 are clamped with warning 1292,
# see the .result file); a string assignment fails with
# ER_WRONG_TYPE_FOR_VAR. The original value is restored at the end.
SET @start_innodb_defragment_fill_factor = @@global.innodb_defragment_fill_factor;
SELECT @start_innodb_defragment_fill_factor;
SELECT COUNT(@@global.innodb_defragment_fill_factor);
SET @@global.innodb_defragment_fill_factor = 0.77777777777777;
SELECT @@global.innodb_defragment_fill_factor;
SET @@global.innodb_defragment_fill_factor = 1;
SELECT @@global.innodb_defragment_fill_factor;
SET @@global.innodb_defragment_fill_factor = 0.7;
SELECT @@global.innodb_defragment_fill_factor;
# Out-of-range values are clamped (truncation warning expected).
SET @@global.innodb_defragment_fill_factor = -1;
SELECT @@global.innodb_defragment_fill_factor;
SET @@global.innodb_defragment_fill_factor = 2;
SELECT @@global.innodb_defragment_fill_factor;
# NOTE(review): '--Error' works (mysqltest commands are case-insensitive),
# but the conventional spelling elsewhere in the suite is '--error'.
--Error ER_WRONG_TYPE_FOR_VAR
SET @@global.innodb_defragment_fill_factor = "abc";
SELECT @@global.innodb_defragment_fill_factor;
SET @@global.innodb_defragment_fill_factor = @start_innodb_defragment_fill_factor;

View file

@ -0,0 +1,31 @@
--source include/have_innodb.inc
# Global variable innodb_defragment_fill_factor_n_recs: out-of-range
# values are clamped to [1, 100] with warning 1292 (see the .result file);
# decimal and string assignments fail with ER_WRONG_TYPE_FOR_VAR.
SET @start_innodb_defragment_fill_factor_n_recs = @@global.innodb_defragment_fill_factor_n_recs;
SELECT @start_innodb_defragment_fill_factor_n_recs;
SELECT COUNT(@@global.innodb_defragment_fill_factor_n_recs);
SET @@global.innodb_defragment_fill_factor_n_recs = 50;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
SET @@global.innodb_defragment_fill_factor_n_recs = 100;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
SET @@global.innodb_defragment_fill_factor_n_recs = 1;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
# Below minimum: clamped to 1.
SET @@global.innodb_defragment_fill_factor_n_recs = -1;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
# Above maximum: clamped to 100.
SET @@global.innodb_defragment_fill_factor_n_recs = 10000;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
--Error ER_WRONG_TYPE_FOR_VAR
SET @@global.innodb_defragment_fill_factor_n_recs = 10.5;
SELECT @@global.innodb_defragment_fill_factor_n_recs;
--Error ER_WRONG_TYPE_FOR_VAR
SET @@global.innodb_defragment_fill_factor_n_recs = "abc";
SELECT @@global.innodb_defragment_fill_factor_n_recs;
SET @@global.innodb_defragment_fill_factor_n_recs = @start_innodb_defragment_fill_factor_n_recs;

View file

@ -0,0 +1,37 @@
--source include/have_innodb.inc
# Global variable innodb_defragment_frequency: out-of-range values are
# clamped to [1, 1000] with warning 1292 (see the .result file); decimal
# and string assignments fail with ER_WRONG_TYPE_FOR_VAR.
SET @start_innodb_defragment_frequency = @@global.innodb_defragment_frequency;
SELECT @start_innodb_defragment_frequency;
SELECT COUNT(@@global.innodb_defragment_frequency);
# test valid value
SET @@global.innodb_defragment_frequency = 200;
SELECT @@global.innodb_defragment_frequency;
# test valid min
SET @@global.innodb_defragment_frequency = 1;
SELECT @@global.innodb_defragment_frequency;
# test valid max
SET @@global.innodb_defragment_frequency = 1000;
SELECT @@global.innodb_defragment_frequency;
# test invalid value < min
SET @@global.innodb_defragment_frequency = -1;
SELECT @@global.innodb_defragment_frequency;
# test invalid value > max
SET @@global.innodb_defragment_frequency = 10000;
SELECT @@global.innodb_defragment_frequency;
# test wrong type
--Error ER_WRONG_TYPE_FOR_VAR
SET @@global.innodb_defragment_frequency = 10.5;
SELECT @@global.innodb_defragment_frequency;
--Error ER_WRONG_TYPE_FOR_VAR
SET @@global.innodb_defragment_frequency = "abc";
SELECT @@global.innodb_defragment_frequency;
SET @@global.innodb_defragment_frequency = @start_innodb_defragment_frequency;

View file

@ -0,0 +1,22 @@
--source include/have_innodb.inc
# Global variable innodb_defragment_n_pages: values below 2 and above 32
# are clamped with warning 1292 (see the .result file: 1 -> 2, 64 -> 32).
SET @start_innodb_defragment_n_pages = @@global.innodb_defragment_n_pages;
SELECT @start_innodb_defragment_n_pages;
SELECT COUNT(@@global.innodb_defragment_n_pages);
# Below minimum: clamped up to 2.
SET @@global.innodb_defragment_n_pages = 1;
SELECT @@global.innodb_defragment_n_pages;
SET @@global.innodb_defragment_n_pages = 2;
SELECT @@global.innodb_defragment_n_pages;
SET @@global.innodb_defragment_n_pages = 32;
SELECT @@global.innodb_defragment_n_pages;
# Above maximum: clamped down to 32.
SET @@global.innodb_defragment_n_pages = 64;
SELECT @@global.innodb_defragment_n_pages;
SET @@global.innodb_defragment_n_pages = @start_innodb_defragment_n_pages;

View file

@ -0,0 +1,24 @@
--source include/have_innodb.inc
# Global variable innodb_defragment_stats_accuracy: per the .result file,
# -1 is clamped to 0 and 1000000000000 to 4294967295 (warning 1292), i.e.
# the domain is an unsigned 32-bit range; strings are rejected with
# ER_WRONG_TYPE_FOR_VAR.
SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy;
SELECT @start_innodb_defragment_stats_accuracy;
SELECT COUNT(@@global.innodb_defragment_stats_accuracy);
SET @@global.innodb_defragment_stats_accuracy = 1;
SELECT @@global.innodb_defragment_stats_accuracy;
SET @@global.innodb_defragment_stats_accuracy = 1000;
SELECT @@global.innodb_defragment_stats_accuracy;
# Out-of-range values are clamped (truncation warning expected).
SET @@global.innodb_defragment_stats_accuracy = -1;
SELECT @@global.innodb_defragment_stats_accuracy;
SET @@global.innodb_defragment_stats_accuracy = 1000000000000;
SELECT @@global.innodb_defragment_stats_accuracy;
--Error ER_WRONG_TYPE_FOR_VAR
SET @@global.innodb_defragment_stats_accuracy = "abc";
SELECT @@global.innodb_defragment_stats_accuracy;
SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy;

View file

@ -1 +1,2 @@
--default-storage-engine=MyISAM
--innodb-defragment=0

View file

@ -285,6 +285,7 @@ SET(INNOBASE_SOURCES
btr/btr0cur.cc
btr/btr0pcur.cc
btr/btr0sea.cc
btr/btr0defragment.cc
buf/buf0buddy.cc
buf/buf0buf.cc
buf/buf0dblwr.cc
@ -395,7 +396,8 @@ SET(INNOBASE_SOURCES
ut/ut0rnd.cc
ut/ut0ut.cc
ut/ut0vec.cc
ut/ut0wqueue.cc)
ut/ut0wqueue.cc
ut/ut0timer.cc)
IF(WITH_INNODB)
# Legacy option

View file

@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
@ -1192,6 +1193,32 @@ btr_get_size(
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
ulint used;
if (flag == BTR_N_LEAF_PAGES) {
btr_get_size_and_reserved(index, flag, &used, mtr);
return used;
} else if (flag == BTR_TOTAL_SIZE) {
return btr_get_size_and_reserved(index, flag, &used, mtr);
} else {
ut_error;
}
return (ULINT_UNDEFINED);
}
/**************************************************************//**
Gets the number of reserved and used pages in a B-tree.
@return number of pages reserved, or ULINT_UNDEFINED if the index
is unavailable */
UNIV_INTERN
ulint
btr_get_size_and_reserved(
/*======================*/
dict_index_t* index, /*!< in: index */
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
ulint* used, /*!< out: number of pages used (<= reserved) */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
fseg_header_t* seg_header;
page_t* root;
@ -1201,6 +1228,8 @@ btr_get_size(
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_S_LOCK));
ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
|| *index->name == TEMP_INDEX_PREFIX) {
return(ULINT_UNDEFINED);
@ -1208,21 +1237,16 @@ btr_get_size(
root = btr_root_get(index, mtr);
if (flag == BTR_N_LEAF_PAGES) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
fseg_n_reserved_pages(seg_header, &n, mtr);
n = fseg_n_reserved_pages(seg_header, used, mtr);
} else if (flag == BTR_TOTAL_SIZE) {
if (flag == BTR_TOTAL_SIZE) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
} else {
ut_error;
*used += dummy;
}
return(n);
@ -1971,7 +1995,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
static __attribute__((nonnull))
UNIV_INTERN
bool
btr_page_reorganize_block(
/*======================*/
@ -2923,6 +2947,12 @@ func_start:
new_page_zip = buf_block_get_page_zip(new_block);
btr_page_create(new_block, new_page_zip, cursor->index,
btr_page_get_level(page, mtr), mtr);
/* Only record the leaf level page splits. */
if (btr_page_get_level(page, mtr) == 0) {
cursor->index->stat_defrag_n_page_split ++;
cursor->index->stat_defrag_modified_counter ++;
btr_defragment_save_defrag_stats_if_needed(cursor->index);
}
/* 3. Calculate the first record on the upper half-page, and the
first record (move_limit) on original page which ends up on the
@ -3181,31 +3211,9 @@ func_exit:
return(rec);
}
#ifdef UNIV_SYNC_DEBUG
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,index,mtr)
#else /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,mtr)
#endif /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages. */
static __attribute__((nonnull))
UNIV_INTERN
void
btr_level_list_remove_func(
/*=======================*/
@ -3377,7 +3385,7 @@ btr_node_ptr_delete(
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
static
UNIV_INTERN
buf_block_t*
btr_lift_page_up(
/*=============*/

View file

@ -0,0 +1,814 @@
/*****************************************************************************
Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
@file btr/btr0defragment.cc
Index defragmentation.
Created 05/29/2014 Rongrong Zhong
Modified 16/07/2014 Sunguck Lee
Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
*******************************************************/
#include "btr0defragment.h"
#ifndef UNIV_HOTBACKUP
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "srv0start.h"
#include "ut0timer.h"
#include <list>
/**************************************************//**
Custom nullptr implementation for under g++ 4.6
*******************************************************/
// #pragma once
namespace std
{
// based on SC22/WG21/N2431 = J16/07-0301
struct nullptr_t
{
template<typename any> operator any * () const
{
return 0;
}
template<class any, typename T> operator T any:: * () const
{
return 0;
}
#ifdef _MSC_VER
struct pad {};
pad __[sizeof(void*)/sizeof(pad)];
#else
char __[sizeof(void*)];
#endif
private:
// nullptr_t();// {}
// nullptr_t(const nullptr_t&);
// void operator = (const nullptr_t&);
void operator &() const;
template<typename any> void operator +(any) const
{
/*I Love MSVC 2005!*/
}
template<typename any> void operator -(any) const
{
/*I Love MSVC 2005!*/
}
};
static const nullptr_t __nullptr = {};
}
#ifndef nullptr
#define nullptr std::__nullptr
#endif
/**************************************************//**
End of Custom nullptr implementation for under g++ 4.6
*******************************************************/
/* When there's no work, either because defragment is disabled, or because no
query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
/* Reduce the target page size by this amount when compression failure happens
during defragmentation. 512 is chosen because it's a power of 2 and it is about
3% of the page size. When there are compression failures in defragmentation,
our goal is to get a decent defrag ratio with as few compression failure as
possible. From experimentation it seems that reduce the target size by 512 every
time will make sure the page is compressible within a couple of iterations. */
#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
/* Work queue for defragmentation: one item per index queued for
defragmentation. Items are appended by query threads and consumed by
the single background defragmentation thread. */
typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
static btr_defragment_wq_t btr_defragment_wq;
/* Mutex protecting the defragmentation work queue.*/
ib_mutex_t btr_defragment_mutex;
#ifdef UNIV_PFS_MUTEX
/* Performance-schema key for btr_defragment_mutex. */
UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
#endif /* UNIV_PFS_MUTEX */
/* Number of compression failures caused by defragmentation since server
start. */
ulint btr_defragment_compression_failures = 0;
/* Number of btr_defragment_n_pages calls that altered page but didn't
manage to release any page. */
ulint btr_defragment_failures = 0;
/* Total number of btr_defragment_n_pages calls that altered page.
The difference between btr_defragment_count and btr_defragment_failures shows
the amount of effort wasted. */
ulint btr_defragment_count = 0;
/******************************************************************//**
Constructor for btr_defragment_item_t. Takes ownership of the persistent
cursor; event may be NULL for asynchronous (fire-and-forget) requests. */
btr_defragment_item_t::btr_defragment_item_t(
	btr_pcur_t*	pcur,	/*!< in: persistent cursor on the index */
	os_event_t	event)	/*!< in: event to signal when done, or NULL */
	:
	pcur(pcur),
	event(event),
	removed(false),
	last_processed(0)
{
}
/******************************************************************//**
Destructor for btr_defragment_item_t. Releases the persistent cursor and
wakes any synchronous caller waiting on the completion event. */
btr_defragment_item_t::~btr_defragment_item_t() {
	btr_pcur_t*	cursor = this->pcur;
	os_event_t	notify = this->event;
	if (cursor != NULL) {
		btr_pcur_free_for_mysql(cursor);
	}
	if (notify != NULL) {
		os_event_set(notify);
	}
}
/******************************************************************//**
Initialize defragmentation: derive the per-index processing interval from
the configured frequency, create the work-queue mutex, and start the
background defragmentation thread. */
void
btr_defragment_init()
{
	/* srv_defragment_frequency is "index visits per second"; convert
	to the minimum timer interval between visits to the same index. */
	srv_defragment_interval = ut_microseconds_to_timer(
		1000000.0 / srv_defragment_frequency);
	mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex,
		SYNC_ANY_LATCH);
	os_thread_create(btr_defragment_thread, NULL, NULL);
}
/******************************************************************//**
Shutdown defragmentation. Release all resources. */
void
btr_defragment_shutdown()
{
mutex_enter(&btr_defragment_mutex);
list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
while(iter != btr_defragment_wq.end()) {
btr_defragment_item_t* item = *iter;
iter = btr_defragment_wq.erase(iter);
delete item;
}
mutex_exit(&btr_defragment_mutex);
mutex_free(&btr_defragment_mutex);
}
/******************************************************************//**
Functions used by the query threads: btr_defragment_xxx_index
Query threads find/add/remove index. */
/******************************************************************//**
Check whether the given index is in btr_defragment_wq. Indices are
identified by index->id.
@return true if an item for this index id is currently queued */
bool
btr_defragment_find_index(
	dict_index_t*	index)	/*!< Index to find. */
{
	bool	found = false;
	mutex_enter(&btr_defragment_mutex);
	for (list< btr_defragment_item_t* >::iterator it
		     = btr_defragment_wq.begin();
	     !found && it != btr_defragment_wq.end();
	     ++it) {
		dict_index_t*	queued = btr_cur_get_index(
			btr_pcur_get_btr_cur((*it)->pcur));
		found = (queued->id == index->id);
	}
	mutex_exit(&btr_defragment_mutex);
	return found;
}
/******************************************************************//**
Query thread uses this function to add an index to btr_defragment_wq.
Return a pointer to os_event for the query thread to wait on if this is a
synchronized defragmentation. Returns NULL if the index root is a leaf
(nothing to defragment) or if the request is asynchronous. */
os_event_t
btr_defragment_add_index(
	dict_index_t*	index,	/*!< index to be added */
	bool		async)	/*!< whether this is an async defragmentation */
{
	mtr_t mtr;
	ulint space = dict_index_get_space(index);
	ulint zip_size = dict_table_zip_size(index->table);
	ulint page_no = dict_index_get_page(index);
	mtr_start(&mtr);
	// Load index root page.
	page_t* page = btr_page_get(space, zip_size, page_no,
				    RW_NO_LATCH, index, &mtr);
	if (btr_page_get_level(page, &mtr) == 0) {
		// Index root is a leaf page, no need to defragment.
		mtr_commit(&mtr);
		return NULL;
	}
	btr_pcur_t* pcur = btr_pcur_create_for_mysql();
	os_event_t event = NULL;
	if (!async) {
		/* Synchronous caller blocks on this event; it is signalled
		by ~btr_defragment_item_t when the work item is freed. */
		event = os_event_create();
	}
	/* Position a persistent cursor on the first user record of the
	leftmost leaf page and save the position; the background thread
	restores it when it starts processing this index. */
	btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
				    true, 0, &mtr);
	btr_pcur_move_to_next(pcur, &mtr);
	btr_pcur_store_position(pcur, &mtr);
	mtr_commit(&mtr);
	/* Reset the pages-freed summary before starting a new run. */
	dict_stats_empty_defrag_summary(index);
	btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
	mutex_enter(&btr_defragment_mutex);
	btr_defragment_wq.push_back(item);
	mutex_exit(&btr_defragment_mutex);
	return event;
}
/******************************************************************//**
When a table is dropped, this function is called to mark every work item
for that table as removed in btr_defragment_wq. The difference between this
function and the remove_index function is that this will not NULL the event. */
void
btr_defragment_remove_table(
	dict_table_t*	table)	/*!< Table whose queued items to mark removed. */
{
	mutex_enter(&btr_defragment_mutex);
	for (list< btr_defragment_item_t* >::iterator it
		     = btr_defragment_wq.begin();
	     it != btr_defragment_wq.end();
	     ++it) {
		btr_defragment_item_t*	entry = *it;
		dict_index_t*		idx = btr_cur_get_index(
			btr_pcur_get_btr_cur(entry->pcur));
		/* Mark every index belonging to this table as removed;
		the defragment thread will discard the items later. */
		if (idx->table->id == table->id) {
			entry->removed = true;
		}
	}
	mutex_exit(&btr_defragment_mutex);
}
/******************************************************************//**
Query thread uses this function to mark an index as removed in
btr_defragment_wq. */
void
btr_defragment_remove_index(
	dict_index_t*	index)	/*!< Index to be removed. */
{
	mutex_enter(&btr_defragment_mutex);
	for (list< btr_defragment_item_t* >::iterator it
		     = btr_defragment_wq.begin();
	     it != btr_defragment_wq.end();
	     ++it) {
		btr_defragment_item_t*	entry = *it;
		dict_index_t*		idx = btr_cur_get_index(
			btr_pcur_get_btr_cur(entry->pcur));
		if (idx->id == index->id) {
			/* Mark removed and detach the event so the caller
			is not signalled when the item is later freed. */
			entry->removed = true;
			entry->event = NULL;
			break;
		}
	}
	mutex_exit(&btr_defragment_mutex);
}
/******************************************************************//**
Functions used by defragmentation thread: btr_defragment_xxx_item.
Defragmentation thread operates on the work *item*. It gets/removes
item from the work queue. */
/******************************************************************//**
Defragment thread uses this to remove an item from btr_defragment_wq.
When an item is removed from the work queue, all resources associated with
it are freed as well. */
void
btr_defragment_remove_item(
	btr_defragment_item_t*	item)	/*!< Item to be removed. */
{
	mutex_enter(&btr_defragment_mutex);
	list< btr_defragment_item_t* >::iterator it
		= btr_defragment_wq.begin();
	while (it != btr_defragment_wq.end()) {
		if (*it == item) {
			btr_defragment_wq.erase(it);
			delete item;
			break;
		}
		++it;
	}
	mutex_exit(&btr_defragment_mutex);
}
/******************************************************************//**
Defragment thread uses this to get an item from btr_defragment_wq to work on.
The item is not removed from the work queue so query threads can still access
this item. We keep it this way so query threads can find and kill a
defragmentation even if that index is being worked on. Be aware that while you
work on this item you have no lock protection on it whatsoever. This is OK as
long as the query threads and defragment thread won't modify the same fields
without lock protection.
@return head work item, or nullptr if the queue is empty */
btr_defragment_item_t*
btr_defragment_get_item()
{
	/* Cheap unlocked check to avoid mutex traffic when idle. */
	if (btr_defragment_wq.empty()) {
		return nullptr;
	}
	mutex_enter(&btr_defragment_mutex);
	/* Re-check under the mutex: the queue may have been drained between
	the unlocked check above and acquiring the mutex. The previous code
	dereferenced begin() on a possibly-empty list here, which is
	undefined behavior. */
	if (btr_defragment_wq.empty()) {
		mutex_exit(&btr_defragment_mutex);
		return nullptr;
	}
	btr_defragment_item_t*	item = btr_defragment_wq.front();
	mutex_exit(&btr_defragment_mutex);
	return item;
}
/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.
The stats are flushed to persistent storage once the per-index modification
counter reaches srv_defragment_stats_accuracy updates. */
UNIV_INTERN
void
btr_defragment_save_defrag_stats_if_needed(
	dict_index_t*	index)	/*!< in: index */
{
	if (srv_defragment_stats_accuracy != 0 // a value of 0 disables tracking
	    && dict_index_get_space(index) != 0 // do not track system tables
	    && index->stat_defrag_modified_counter
	       >= srv_defragment_stats_accuracy) {
		/* Queue the index for background stats persistence and
		restart the counter. */
		dict_stats_defrag_pool_add(index);
		index->stat_defrag_modified_counter = 0;
	}
}
/*********************************************************************//**
Main defragment functionalities used by defragment thread.*/
/*************************************************************//**
Calculate number of records from beginning of block that can
fit into size_limit
@return number of records */
UNIV_INTERN
ulint
btr_defragment_calc_n_recs_for_size(
	buf_block_t*	block,		/*!< in: B-tree page */
	dict_index_t*	index,		/*!< in: index of the page */
	ulint		size_limit,	/*!< in: size limit to fit records in */
	ulint*		n_recs_size)	/*!< out: actual size of the records that fit
					in size_limit. */
{
	page_t*	page = buf_block_get_frame(block);
	ulint	n_recs = 0;
	ulint	offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*	offsets = offsets_;
	rec_offs_init(offsets_);
	mem_heap_t*	heap = NULL;
	ulint	size = 0;
	page_cur_t	cur;

	/* Walk user records from the first one, accumulating their
	physical sizes until adding one more would exceed size_limit. */
	page_cur_set_before_first(block, &cur);
	page_cur_move_to_next(&cur);
	while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
		rec_t* cur_rec = page_cur_get_rec(&cur);
		offsets = rec_get_offsets(cur_rec, index, offsets,
					  ULINT_UNDEFINED, &heap);
		ulint rec_size = rec_offs_size(offsets);
		size += rec_size;
		if (size > size_limit) {
			/* This record does not fit; report the size
			without it. */
			size = size - rec_size;
			break;
		}
		n_recs ++;
		page_cur_move_to_next(&cur);
	}
	/* rec_get_offsets() may have allocated a heap for oversized offset
	arrays; free it to avoid leaking memory on every call. */
	if (heap != NULL) {
		mem_heap_free(heap);
	}
	*n_recs_size = size;
	return n_recs;
}
/*************************************************************//**
Merge as many records from the from_block to the to_block. Delete
the from_block if all records are successfully merged to to_block.
@return the to_block to target for next merge operation. */
UNIV_INTERN
buf_block_t*
btr_defragment_merge_pages(
	dict_index_t*	index,		/*!< in: index tree */
	buf_block_t*	from_block,	/*!< in: origin of merge */
	buf_block_t*	to_block,	/*!< in: destination of merge */
	ulint		zip_size,	/*!< in: zip size of the block */
	ulint		reserved_space,	/*!< in: space reserved for future
					insert to avoid immediate page split */
	ulint*		max_data_size,	/*!< in/out: max data size to
					fit in a single compressed page. */
	mem_heap_t*	heap,		/*!< in/out: pointer to memory heap */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	page_t*	from_page = buf_block_get_frame(from_block);
	page_t*	to_page = buf_block_get_frame(to_block);
	ulint	space = dict_index_get_space(index);
	ulint	level = btr_page_get_level(from_page, mtr);
	ulint	n_recs = page_get_n_recs(from_page);
	ulint	new_data_size = page_get_data_size(to_page);
	ulint	max_ins_size =
		page_get_max_insert_size(to_page, n_recs);
	ulint	max_ins_size_reorg =
		page_get_max_insert_size_after_reorganize(
			to_page, n_recs);
	/* Usable insert space on to_page after keeping reserved_space
	free for future inserts. */
	ulint	max_ins_size_to_use = max_ins_size_reorg > reserved_space
		? max_ins_size_reorg - reserved_space : 0;
	ulint	move_size = 0;
	ulint	n_recs_to_move = 0;
	rec_t*	rec = NULL;
	ulint	target_n_recs = 0;
	rec_t*	orig_pred;

	// Estimate how many records can be moved from the from_page to
	// the to_page.
	if (zip_size) {
		/* For compressed pages, shrink the usable space by the gap
		between the uncompressed page size and the estimated max
		compressible data size. */
		ulint page_diff = UNIV_PAGE_SIZE - *max_data_size;
		max_ins_size_to_use = (max_ins_size_to_use > page_diff)
			? max_ins_size_to_use - page_diff : 0;
	}
	n_recs_to_move = btr_defragment_calc_n_recs_for_size(
		from_block, index, max_ins_size_to_use, &move_size);

	// If max_ins_size >= move_size, we can move the records without
	// reorganizing the page, otherwise we need to reorganize the page
	// first to release more space.
	if (move_size > max_ins_size) {
		if (!btr_page_reorganize_block(false, page_zip_level,
					       to_block, index,
					       mtr)) {
			if (!dict_index_is_clust(index)
			    && page_is_leaf(to_page)) {
				ibuf_reset_free_bits(to_block);
			}
			// If reorganization fails, that means page is
			// not compressable. There's no point to try
			// merging into this page. Continue to the
			// next page.
			return from_block;
		}
		ut_ad(page_validate(to_page, index));
		max_ins_size = page_get_max_insert_size(to_page, n_recs);
		ut_a(max_ins_size >= move_size);
	}

	// Move records to pack to_page more full.
	orig_pred = NULL;
	target_n_recs = n_recs_to_move;
	/* Retry loop: on compression failure, shrink the target move size
	and try again with fewer records. */
	while (n_recs_to_move > 0) {
		rec = page_rec_get_nth(from_page,
				       n_recs_to_move + 1);
		orig_pred = page_copy_rec_list_start(
			to_block, from_block, rec, index, mtr);
		if (orig_pred)
			break;
		// If we reach here, that means compression failed after packing
		// n_recs_to_move number of records to to_page. We try to reduce
		// the targeted data size on the to_page by
		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
		os_atomic_increment_ulint(
			&btr_defragment_compression_failures, 1);
		max_ins_size_to_use =
			move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
			? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
			: 0;
		if (max_ins_size_to_use == 0) {
			n_recs_to_move = 0;
			move_size = 0;
			break;
		}
		n_recs_to_move = btr_defragment_calc_n_recs_for_size(
			from_block, index, max_ins_size_to_use, &move_size);
	}
	// If less than target_n_recs are moved, it means there are
	// compression failures during page_copy_rec_list_start. Adjust
	// the max_data_size estimation to reduce compression failures
	// in the following runs.
	if (target_n_recs > n_recs_to_move
	    && *max_data_size > new_data_size + move_size) {
		*max_data_size = new_data_size + move_size;
	}
	// Set ibuf free bits if necessary.
	if (!dict_index_is_clust(index)
	    && page_is_leaf(to_page)) {
		if (zip_size) {
			ibuf_reset_free_bits(to_block);
		} else {
			ibuf_update_free_bits_if_full(
				to_block,
				UNIV_PAGE_SIZE,
				ULINT_UNDEFINED);
		}
	}
	if (n_recs_to_move == n_recs) {
		/* The whole page is merged with the previous page,
		free it. */
		lock_update_merge_left(to_block, orig_pred,
				       from_block);
		btr_search_drop_page_hash_index(from_block);
		btr_level_list_remove(space, zip_size, from_page,
				      index, mtr);
		btr_node_ptr_delete(index, from_block, mtr);
		btr_blob_dbg_remove(from_page, index,
				    "btr_defragment_n_pages");
		btr_page_free(index, from_block, mtr);
	} else {
		// There are still records left on the page, so
		// increment n_defragmented. Node pointer will be changed
		// so remove the old node pointer.
		if (n_recs_to_move > 0) {
			// Part of the page is merged to left, remove
			// the merged records, update record locks and
			// node pointer.
			dtuple_t* node_ptr;
			page_delete_rec_list_start(rec, from_block,
						   index, mtr);
			lock_update_split_and_merge(to_block,
						    orig_pred,
						    from_block);
			btr_node_ptr_delete(index, from_block, mtr);
			rec = page_rec_get_next(
				page_get_infimum_rec(from_page));
			node_ptr = dict_index_build_node_ptr(
				index, rec, page_get_page_no(from_page),
				heap, level + 1);
			btr_insert_on_non_leaf_level(0, index, level+1,
						     node_ptr, mtr);
		}
		to_block = from_block;
	}
	return to_block;
}
/*************************************************************//**
Tries to merge N consecutive pages, starting from the page pointed by the
cursor. Skip space 0. Only consider leaf pages.
This function first loads all N pages into memory, then for each of
the pages other than the first page, it tries to move as many records
as possible to the left sibling to keep the left sibling full. During
the process, if any page becomes empty, that page will be removed from
the level list. Record locks, hash, and node pointers are updated after
page reorganization.
@return pointer to the last block processed, or NULL if reaching end of index */
UNIV_INTERN
buf_block_t*
btr_defragment_n_pages(
	buf_block_t*	block,	/*!< in: starting block for defragmentation */
	dict_index_t*	index,	/*!< in: index tree */
	uint		n_pages,/*!< in: number of pages to defragment */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint		space;
	ulint		zip_size;
	/* We will need to load the n+1 block because if the last page is freed
	and we need to modify the prev_page_no of that block. */
	buf_block_t*	blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
	page_t*		first_page;
	buf_block_t*	current_block;
	ulint		total_data_size = 0;
	ulint		total_n_recs = 0;
	ulint		data_size_per_rec;
	ulint		optimal_page_size;
	ulint		reserved_space;
	ulint		level;
	ulint		max_data_size = 0;
	uint		n_defragmented = 0;
	uint		n_new_slots;
	mem_heap_t*	heap;
	ibool		end_of_index = FALSE;

	/* It doesn't make sense to call this function with n_pages = 1. */
	ut_ad(n_pages > 1);
	/* Caller must hold the index X-lock for the whole operation. */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	space = dict_index_get_space(index);
	if (space == 0) {
		/* Ignore space 0. */
		return NULL;
	}
	if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
		n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
	}
	zip_size = dict_table_zip_size(index->table);
	first_page = buf_block_get_frame(block);
	level = btr_page_get_level(first_page, mtr);
	if (level != 0) {
		/* Only leaf pages are defragmented. */
		return NULL;
	}

	/* 1. Load the pages and calculate the total data size. */
	blocks[0] = block;
	for (uint i = 1; i <= n_pages; i++) {
		page_t* page = buf_block_get_frame(blocks[i-1]);
		ulint page_no = btr_page_get_next(page, mtr);
		total_data_size += page_get_data_size(page);
		total_n_recs += page_get_n_recs(page);
		if (page_no == FIL_NULL) {
			/* No right sibling: we ran off the end of the
			leaf level with fewer than n_pages pages. */
			n_pages = i;
			end_of_index = TRUE;
			break;
		}
		blocks[i] = btr_block_get(space, zip_size, page_no,
					  RW_X_LATCH, index, mtr);
	}

	if (n_pages == 1) {
		if (btr_page_get_prev(first_page, mtr) == FIL_NULL) {
			/* last page in the index */
			if (dict_index_get_page(index)
			    == page_get_page_no(first_page))
				return NULL;
			/* given page is the last page.
			Lift the records to father. */
			btr_lift_page_up(index, block, mtr);
		}
		return NULL;
	}

	/* 2. Calculate how many pages data can fit in. If not compressible,
	return early. */
	ut_a(total_n_recs != 0);
	data_size_per_rec = total_data_size / total_n_recs;
	// For uncompressed pages, the optimal data size is the free space of an
	// empty page.
	optimal_page_size = page_get_free_space_of_empty(
		page_is_comp(first_page));
	// For compressed pages, we take compression failures into account.
	if (zip_size) {
		ulint size = 0;
		int i = 0;
		// We estimate the optimal data size of the index use samples of
		// data size. These samples are taken when pages failed to
		// compress due to insertion on the page. We use the average
		// of all samples we have as the estimation. Different pages of
		// the same index vary in compressibility. Average gives a good
		// enough estimation.
		for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
			if (index->stat_defrag_data_size_sample[i] == 0) {
				break;
			}
			size += index->stat_defrag_data_size_sample[i];
		}
		if (i != 0) {
			size = size / i;
			optimal_page_size = min(optimal_page_size, size);
		}
		max_data_size = optimal_page_size;
	}

	/* Keep a fraction of the page free for future inserts, bounded by
	the equivalent of srv_defragment_fill_factor_n_recs records. */
	reserved_space = min((ulint)(optimal_page_size
			     * (1 - srv_defragment_fill_factor)),
			     (data_size_per_rec
			      * srv_defragment_fill_factor_n_recs));
	optimal_page_size -= reserved_space;
	/* Ceiling division: pages needed to hold all the data. */
	n_new_slots = (total_data_size + optimal_page_size - 1)
		      / optimal_page_size;
	if (n_new_slots >= n_pages) {
		/* Can't defragment. */
		if (end_of_index)
			return NULL;
		return blocks[n_pages-1];
	}

	/* 3. Defragment pages. */
	heap = mem_heap_create(256);
	// First defragmented page will be the first page.
	current_block = blocks[0];
	// Start from the second page.
	for (uint i = 1; i < n_pages; i ++) {
		buf_block_t* new_block = btr_defragment_merge_pages(
			index, blocks[i], current_block, zip_size,
			reserved_space, &max_data_size, heap, mtr);
		if (new_block != current_block) {
			/* blocks[i] still has records left: it becomes
			the new merge target. */
			n_defragmented ++;
			current_block = new_block;
		}
	}
	mem_heap_free(heap);
	n_defragmented ++;
	os_atomic_increment_ulint(
		&btr_defragment_count, 1);
	if (n_pages == n_defragmented) {
		/* No page was freed by this pass. */
		os_atomic_increment_ulint(
			&btr_defragment_failures, 1);
	} else {
		index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
	}
	if (end_of_index)
		return NULL;
	return current_block;
}
/******************************************************************//**
Thread that merges consecutive b-tree pages into fewer pages to defragment
the index. Loops until server shutdown, processing one work item per
iteration, rate-limited by srv_defragment_interval. */
extern "C" UNIV_INTERN
os_thread_ret_t
DECLARE_THREAD(btr_defragment_thread)(
/*==========================================*/
	void*	arg)	/*!< in: work queue */
{
	btr_pcur_t*	pcur;
	btr_cur_t*	cursor;
	dict_index_t*	index;
	mtr_t		mtr;
	buf_block_t*	first_block;
	buf_block_t*	last_block;

	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
		/* If defragmentation is disabled, sleep before
		checking whether it's enabled. */
		if (!srv_defragment) {
			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
			continue;
		}
		/* The following call won't remove the item from work queue.
		We only get a pointer to it to work on. This will make sure
		when user issue a kill command, all indices are in the work
		queue to be searched. This also means that the user thread
		cannot directly remove the item from queue (since we might be
		using it). So user thread only marks index as removed. */
		btr_defragment_item_t* item = btr_defragment_get_item();
		/* If work queue is empty, sleep and check later. */
		if (!item) {
			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
			continue;
		}
		/* If an index is marked as removed, we remove it from the work
		queue. No other thread could be using this item at this point so
		it's safe to remove now. */
		if (item->removed) {
			btr_defragment_remove_item(item);
			continue;
		}
		pcur = item->pcur;
		ulonglong now = ut_timer_now();
		ulonglong elapsed = now - item->last_processed;

		if (elapsed < srv_defragment_interval) {
			/* If we see an index again before the interval
			determined by the configured frequency is reached,
			we just sleep until the interval pass. Since
			defragmentation of all indices queue up on a single
			thread, it's likely other indices that follow this one
			don't need to sleep again. */
			os_thread_sleep(((ulint)ut_timer_to_microseconds(
						srv_defragment_interval - elapsed)));
		}

		now = ut_timer_now();
		mtr_start(&mtr);
		btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
		cursor = btr_pcur_get_btr_cur(pcur);
		index = btr_cur_get_index(cursor);
		first_block = btr_cur_get_block(cursor);
		/* Defragment a batch of up to srv_defragment_n_pages
		consecutive leaf pages starting at the saved position. */
		last_block = btr_defragment_n_pages(first_block, index,
						    srv_defragment_n_pages,
						    &mtr);
		if (last_block) {
			/* If we haven't reached the end of the index,
			place the cursor on the last record of last page,
			store the cursor position, and put back in queue. */
			page_t* last_page = buf_block_get_frame(last_block);
			rec_t* rec = page_rec_get_prev(
				page_get_supremum_rec(last_page));
			ut_a(page_rec_is_user_rec(rec));
			page_cur_position(rec, last_block,
					  btr_cur_get_page_cur(cursor));
			btr_pcur_store_position(pcur, &mtr);
			mtr_commit(&mtr);
			/* Update the last_processed time of this index. */
			item->last_processed = now;
		} else {
			mtr_commit(&mtr);
			/* Reaching the end of the index. */
			dict_stats_empty_defrag_stats(index);
			dict_stats_save_defrag_stats(index);
			dict_stats_save_defrag_summary(index);
			btr_defragment_remove_item(item);
		}
	}

	btr_defragment_shutdown();
	os_thread_exit(NULL);
	OS_THREAD_DUMMY_RETURN;
}
#endif /* !UNIV_HOTBACKUP */

View file

@ -408,7 +408,7 @@ dict_table_try_drop_aborted(
if (table == NULL) {
table = dict_table_open_on_id_low(
table_id, DICT_ERR_IGNORE_NONE);
table_id, DICT_ERR_IGNORE_NONE, FALSE);
} else {
ut_ad(table->id == table_id);
}
@ -795,7 +795,8 @@ dict_table_open_on_id(
table_id,
table_op == DICT_TABLE_OP_LOAD_TABLESPACE
? DICT_ERR_IGNORE_RECOVER_LOCK
: DICT_ERR_IGNORE_NONE);
: DICT_ERR_IGNORE_NONE,
table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
if (table != NULL) {
@ -1313,7 +1314,7 @@ dict_table_move_from_non_lru_to_lru(
/**********************************************************************//**
Looks for an index with the given id given a table instance.
@return index or NULL */
static
UNIV_INTERN
dict_index_t*
dict_table_find_index_on_id(
/*========================*/
@ -2408,6 +2409,13 @@ undo_size_ok:
new_index->stat_index_size = 1;
new_index->stat_n_leaf_pages = 1;
new_index->stat_defrag_n_pages_freed = 0;
new_index->stat_defrag_n_page_split = 0;
new_index->stat_defrag_sample_next_slot = 0;
memset(&new_index->stat_defrag_data_size_sample,
0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE);
/* Add the new index as the last index for the table */
UT_LIST_ADD_LAST(indexes, table->indexes, new_index);

View file

@ -492,6 +492,9 @@ dict_stats_table_clone_create(
heap,
idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
idx->stat_defrag_n_page_split = 0;
idx->stat_defrag_n_pages_freed = 0;
}
ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
@ -520,7 +523,9 @@ static
void
dict_stats_empty_index(
/*===================*/
dict_index_t* index) /*!< in/out: index */
dict_index_t* index, /*!< in/out: index */
bool empty_defrag_stats)
/*!< in: whether to empty defrag stats */
{
ut_ad(!(index->type & DICT_FTS));
ut_ad(!dict_index_is_univ(index));
@ -535,6 +540,34 @@ dict_stats_empty_index(
index->stat_index_size = 1;
index->stat_n_leaf_pages = 1;
if (empty_defrag_stats) {
dict_stats_empty_defrag_stats(index);
dict_stats_empty_defrag_summary(index);
}
}
/**********************************************************************//**
Clear defragmentation summary. */
UNIV_INTERN
void
dict_stats_empty_defrag_summary(
/*==================*/
dict_index_t* index) /*!< in: index to clear defragmentation stats */
{
index->stat_defrag_n_pages_freed = 0;
}
/**********************************************************************//**
Clear defragmentation related index stats. */
UNIV_INTERN
void
dict_stats_empty_defrag_stats(
/*==================*/
dict_index_t* index) /*!< in: index to clear defragmentation stats */
{
index->stat_defrag_modified_counter = 0;
index->stat_defrag_n_page_split = 0;
}
/*********************************************************************//**
@ -544,7 +577,9 @@ static
void
dict_stats_empty_table(
/*===================*/
dict_table_t* table) /*!< in/out: table */
dict_table_t* table, /*!< in/out: table */
bool empty_defrag_stats)
/*!< in: whether to empty defrag stats */
{
/* Zero the stats members */
@ -569,7 +604,7 @@ dict_stats_empty_table(
ut_ad(!dict_index_is_univ(index));
dict_stats_empty_index(index);
dict_stats_empty_index(index, empty_defrag_stats);
}
table->stat_initialized = TRUE;
@ -704,7 +739,7 @@ dict_stats_copy(
}
if (!INDEX_EQ(src_idx, dst_idx)) {
dict_stats_empty_index(dst_idx);
dict_stats_empty_index(dst_idx, true);
continue;
}
@ -715,7 +750,7 @@ dict_stats_copy(
/* Since src is smaller some elements in dst
will remain untouched by the following memmove(),
thus we init all of them here. */
dict_stats_empty_index(dst_idx);
dict_stats_empty_index(dst_idx, true);
} else {
n_copy_el = dst_idx->n_uniq;
}
@ -735,6 +770,13 @@ dict_stats_copy(
dst_idx->stat_index_size = src_idx->stat_index_size;
dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
dst_idx->stat_defrag_modified_counter =
src_idx->stat_defrag_modified_counter;
dst_idx->stat_defrag_n_pages_freed =
src_idx->stat_defrag_n_pages_freed;
dst_idx->stat_defrag_n_page_split =
src_idx->stat_defrag_n_page_split;
}
dst->stat_initialized = TRUE;
@ -758,6 +800,9 @@ dict_index_t::stat_n_sample_sizes[]
dict_index_t::stat_n_non_null_key_vals[]
dict_index_t::stat_index_size
dict_index_t::stat_n_leaf_pages
dict_index_t::stat_defrag_modified_counter
dict_index_t::stat_defrag_n_pages_freed
dict_index_t::stat_defrag_n_page_split
The returned object should be freed with dict_stats_snapshot_free()
when no longer needed.
@return incomplete table object */
@ -807,7 +852,9 @@ dict_stats_snapshot_free(
Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
are not saved on disk. This was the only way to calculate statistics
before the Persistent Statistics feature was introduced. */
before the Persistent Statistics feature was introduced.
This function doesn't update the defragmentation related stats.
Only persistent statistics supports defragmentation stats. */
static
void
dict_stats_update_transient_for_index(
@ -823,10 +870,10 @@ dict_stats_update_transient_for_index(
Initialize some bogus index cardinality
statistics, so that the data can be queried in
various means, also via secondary indexes. */
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
} else if (ibuf_debug && !dict_index_is_clust(index)) {
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
} else {
mtr_t mtr;
@ -847,7 +894,7 @@ dict_stats_update_transient_for_index(
switch (size) {
case ULINT_UNDEFINED:
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
return;
case 0:
/* The root node of the tree is a leaf */
@ -882,7 +929,7 @@ dict_stats_update_transient(
if (dict_table_is_discarded(table)) {
/* Nothing to do. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, false);
return;
} else if (index == NULL) {
/* Table definition is corrupt */
@ -892,7 +939,7 @@ dict_stats_update_transient(
fprintf(stderr, " InnoDB: table %s has no indexes. "
"Cannot calculate statistics.\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
dict_stats_empty_table(table);
dict_stats_empty_table(table, false);
return;
}
@ -904,7 +951,7 @@ dict_stats_update_transient(
continue;
}
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@ -1794,7 +1841,7 @@ dict_stats_analyze_index(
DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name);
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
mtr_start(&mtr);
@ -2059,7 +2106,7 @@ dict_stats_update_persistent(
/* Table definition is corrupt */
dict_table_stats_unlock(table, RW_X_LATCH);
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
return(DB_CORRUPTION);
}
@ -2088,7 +2135,7 @@ dict_stats_update_persistent(
continue;
}
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@ -2657,6 +2704,16 @@ dict_stats_fetch_index_stats_step(
== 0) {
index->stat_n_leaf_pages = (ulint) stat_value;
arg->stats_were_modified = true;
} else if (stat_name_len == 12 /* strlen("n_page_split") */
&& strncasecmp("n_page_split", stat_name, stat_name_len)
== 0) {
index->stat_defrag_n_page_split = (ulint) stat_value;
arg->stats_were_modified = true;
} else if (stat_name_len == 13 /* strlen("n_pages_freed") */
&& strncasecmp("n_pages_freed", stat_name, stat_name_len)
== 0) {
index->stat_defrag_n_pages_freed = (ulint) stat_value;
arg->stats_were_modified = true;
} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
&& strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
@ -2776,7 +2833,7 @@ dict_stats_fetch_from_ps(
the persistent storage contains incomplete stats (e.g. missing stats
for some index) then we would end up with (partially) uninitialized
stats. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
trx = trx_allocate_for_background();
@ -2877,6 +2934,22 @@ dict_stats_fetch_from_ps(
return(ret);
}
/*********************************************************************//**
Clear defragmentation stats modified counter for all indices in table. */
static
void
dict_stats_empty_defrag_modified_counter(
	dict_table_t*	table)	/*!< in: table */
{
	dict_index_t*	index;
	ut_a(table);
	/* Walk every index of the table and reset the counter tracking
	modifications since defrag stats were last persisted. */
	for (index = dict_table_get_first_index(table);
	     index != NULL;
	     index = dict_table_get_next_index(index)) {
		index->stat_defrag_modified_counter = 0;
	}
}
/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
UNIV_INTERN
@ -2949,13 +3022,13 @@ dict_stats_update(
"because the .ibd file is missing. For help, please "
"refer to " REFMAN "innodb-troubleshooting.html\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
return(DB_TABLESPACE_DELETED);
} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
/* If we have set a high innodb_force_recovery level, do
not calculate statistics, as a badly corrupted index can
cause a crash in it. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, false);
return(DB_SUCCESS);
}
@ -3014,7 +3087,7 @@ dict_stats_update(
case DICT_STATS_EMPTY_TABLE:
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
/* If table is using persistent stats,
then save the stats on disk */
@ -3073,6 +3146,7 @@ dict_stats_update(
t->stats_last_recalc = table->stats_last_recalc;
t->stat_modified_counter = 0;
dict_stats_empty_defrag_modified_counter(t);
switch (err) {
case DB_SUCCESS:
@ -3083,7 +3157,7 @@ dict_stats_update(
copying because dict_stats_table_clone_create() does
skip corrupted indexes so our dummy object 't' may
have less indexes than the real object 'table'. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
dict_stats_copy(table, t);
@ -3650,6 +3724,117 @@ dict_stats_rename_table(
return(ret);
}
/*********************************************************************//**
Save defragmentation result.
Persists index->stat_defrag_n_pages_freed as the "n_pages_freed" index
stat, taking dict_operation_lock (X) and then dict_sys->mutex, the order
required by dict_stats_save_index_stat().
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_summary(
dict_index_t* index) /*!< in: index */
{
dberr_t ret;
lint now = (lint) ut_time();
/* The universal (change buffer) index has no per-index stats
to persist. */
if (dict_index_is_univ(index)) {
return DB_SUCCESS;
}
rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&dict_sys->mutex);
ret = dict_stats_save_index_stat(index, now, "n_pages_freed",
index->stat_defrag_n_pages_freed,
NULL,
"Number of pages freed during"
" last defragmentation run.",
NULL);
mutex_exit(&dict_sys->mutex);
rw_lock_x_unlock(&dict_operation_lock);
return (ret);
}
/*********************************************************************//**
Save defragmentation stats for a given index.
Measures the leaf level under the index S-latch inside a
mini-transaction, then persists three stats ("n_page_split",
"n_leaf_pages_defrag", "n_leaf_pages_reserved") while holding
dict_operation_lock (X) and dict_sys->mutex, in that order.
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_stats(
dict_index_t* index) /*!< in: index */
{
dberr_t ret;
/* Nothing can be saved if the tablespace file is gone. */
if (index->table->ibd_file_missing) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Cannot save defragment stats because "
".ibd file is missing.\n");
return (DB_TABLESPACE_DELETED);
}
if (dict_index_is_corrupted(index)) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Cannot save defragment stats because "
"index is corrupted.\n");
return(DB_CORRUPTION);
}
/* The universal (change buffer) index has no per-index stats. */
if (dict_index_is_univ(index)) {
return DB_SUCCESS;
}
lint now = (lint) ut_time();
mtr_t mtr;
ulint n_leaf_pages;
ulint n_leaf_reserved;
/* Read the leaf page counts under the index S-latch only; the
dictionary locks are acquired afterwards, never concurrently
with the mtr, to respect latching order. */
mtr_start(&mtr);
mtr_s_lock(dict_index_get_lock(index), &mtr);
n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
&n_leaf_pages, &mtr);
mtr_commit(&mtr);
if (n_leaf_reserved == ULINT_UNDEFINED) {
// The index name is different during fast index creation,
// so the stats won't be associated with the right index
// for later use. We just return without saving.
return DB_SUCCESS;
}
rw_lock_x_lock(&dict_operation_lock);
mutex_enter(&dict_sys->mutex);
ret = dict_stats_save_index_stat(index, now, "n_page_split",
index->stat_defrag_n_page_split,
NULL,
"Number of new page splits on leaves"
" since last defragmentation.",
NULL);
if (ret != DB_SUCCESS) {
goto end;
}
ret = dict_stats_save_index_stat(
index, now, "n_leaf_pages_defrag",
n_leaf_pages,
NULL,
"Number of leaf pages when this stat is saved to disk",
NULL);
if (ret != DB_SUCCESS) {
goto end;
}
ret = dict_stats_save_index_stat(
index, now, "n_leaf_pages_reserved",
n_leaf_reserved,
NULL,
"Number of pages reserved for this index leaves when this stat "
"is saved to disk",
NULL);
end:
mutex_exit(&dict_sys->mutex);
rw_lock_x_unlock(&dict_operation_lock);
return (ret);
}
/* tests @{ */
#ifdef UNIV_COMPILE_TEST_FUNCS

View file

@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov
#include "row0mysql.h"
#include "srv0start.h"
#include "dict0dict.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL;
/** This mutex protects the "recalc_pool" variable. */
static ib_mutex_t recalc_pool_mutex;
static ib_mutex_t defrag_pool_mutex;
#ifdef HAVE_PSI_INTERFACE
static mysql_pfs_key_t recalc_pool_mutex_key;
static mysql_pfs_key_t defrag_pool_mutex_key;
#endif /* HAVE_PSI_INTERFACE */
/** The number of tables that can be added to "recalc_pool" before
@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool;
typedef recalc_pool_t::iterator recalc_pool_iterator_t;
/** Indices whose defrag stats need to be saved to persistent storage.*/
struct defrag_pool_item_t {
table_id_t table_id;
index_id_t index_id;
};
typedef std::vector<defrag_pool_item_t> defrag_pool_t;
static defrag_pool_t defrag_pool;
typedef defrag_pool_t::iterator defrag_pool_iterator_t;
/*****************************************************************//**
Initialize the recalc pool, called once during thread initialization. */
static
void
dict_stats_recalc_pool_init()
dict_stats_pool_init()
/*=========================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
}
/*****************************************************************//**
@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during
thread de-initialization. */
static
void
dict_stats_recalc_pool_deinit()
/*===========================*/
dict_stats_pool_deinit()
/*====================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.clear();
defrag_pool.clear();
/*
recalc_pool may still have its buffer allocated. It will free it when
its destructor is called.
@ -90,8 +104,12 @@ dict_stats_recalc_pool_deinit()
memory. To avoid that, we force recalc_pool to surrender its buffer
to empty_pool object, which will free it when leaving this function:
*/
recalc_pool_t empty_pool;
recalc_pool.swap(empty_pool);
recalc_pool_t recalc_empty_pool;
defrag_pool_t defrag_empty_pool;
memset(&recalc_empty_pool, 0, sizeof(recalc_pool_t));
memset(&defrag_empty_pool, 0, sizeof(defrag_pool_t));
recalc_pool.swap(recalc_empty_pool);
defrag_pool.swap(defrag_empty_pool);
}
/*****************************************************************//**
@ -187,6 +205,111 @@ dict_stats_recalc_pool_del(
mutex_exit(&recalc_pool_mutex);
}
/*****************************************************************//**
Add an index in a table to the defrag pool, which is processed by the
background stats gathering thread. Only the table id and index id are
stored, so the table can be closed after being enqueued; it will be
re-opened when the entry is processed. If the table or index no longer
exists by then (has been DROPped), the entry is skipped. Duplicate
entries are not added. */
UNIV_INTERN
void
dict_stats_defrag_pool_add(
/*=======================*/
	const dict_index_t*	index)	/*!< in: index to add */
{
	ut_ad(!srv_read_only_mode);

	mutex_enter(&defrag_pool_mutex);

	/* Skip the insert if this index is already queued. */
	bool	found = false;

	for (defrag_pool_iterator_t it = defrag_pool.begin();
	     !found && it != defrag_pool.end();
	     ++it) {
		found = (it->table_id == index->table->id
			 && it->index_id == index->id);
	}

	if (!found) {
		defrag_pool_item_t	item;
		item.table_id = index->table->id;
		item.index_id = index->id;
		defrag_pool.push_back(item);
	}

	mutex_exit(&defrag_pool_mutex);

	if (!found) {
		/* Wake the stats thread only for a newly queued entry. */
		os_event_set(dict_stats_event);
	}
}
/*****************************************************************//**
Get an index from the auto defrag pool. The returned entry is removed
from the pool.
@return true if the pool was non-empty and the out parameters were set,
false otherwise */
static
bool
dict_stats_defrag_pool_get(
/*=======================*/
	table_id_t*	table_id,	/*!< out: table id, or unmodified if
					list is empty */
	index_id_t*	index_id)	/*!< out: index id, or unmodified if
					list is empty */
{
	ut_ad(!srv_read_only_mode);

	mutex_enter(&defrag_pool_mutex);

	const bool	have_item = !defrag_pool.empty();

	if (have_item) {
		/* Take the most recently queued entry. */
		const defrag_pool_item_t&	entry = defrag_pool.back();
		*table_id = entry.table_id;
		*index_id = entry.index_id;
		defrag_pool.pop_back();
	}

	mutex_exit(&defrag_pool_mutex);

	return(have_item);
}
/*****************************************************************//**
Delete entries from the auto defrag pool: either every entry belonging
to "table", or the single entry for "index". Exactly one of the two
arguments must be non-NULL. */
UNIV_INTERN
void
dict_stats_defrag_pool_del(
/*=======================*/
	const dict_table_t*	table,	/*!<in: if given, remove
					all entries for the table */
	const dict_index_t*	index)	/*!< in: if given, remove this index */
{
	ut_a((table && !index) || (!table && index));
	ut_ad(!srv_read_only_mode);
	ut_ad(mutex_own(&dict_sys->mutex));

	mutex_enter(&defrag_pool_mutex);

	for (defrag_pool_iterator_t it = defrag_pool.begin();
	     it != defrag_pool.end();) {

		const bool	matches = table
			? (it->table_id == table->id)
			: (it->table_id == index->table->id
			   && it->index_id == index->id);

		if (!matches) {
			++it;
			continue;
		}

		/* erase() invalidates the iterator; continue from the
		returned position. */
		it = defrag_pool.erase(it);

		if (index) {
			/* A given index occurs at most once in the
			pool (see dict_stats_defrag_pool_add()). */
			break;
		}
	}

	mutex_exit(&defrag_pool_mutex);
}
/*****************************************************************//**
Wait until background stats thread has stopped using the specified table.
The caller must have locked the data dictionary using
@ -237,7 +360,10 @@ dict_stats_thread_init()
mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex,
SYNC_STATS_AUTO_RECALC);
dict_stats_recalc_pool_init();
/* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */
mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex,
SYNC_STATS_DEFRAG);
dict_stats_pool_init();
}
/*****************************************************************//**
@ -251,11 +377,14 @@ dict_stats_thread_deinit()
ut_a(!srv_read_only_mode);
ut_ad(!srv_dict_stats_thread_active);
dict_stats_recalc_pool_deinit();
dict_stats_pool_deinit();
mutex_free(&recalc_pool_mutex);
memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex));
mutex_free(&defrag_pool_mutex);
memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex));
os_event_free(dict_stats_event);
dict_stats_event = NULL;
}
@ -332,6 +461,63 @@ dict_stats_process_entry_from_recalc_pool()
mutex_exit(&dict_sys->mutex);
}
/*****************************************************************//**
Pop one entry from the defrag pool and persist the defragmentation
stats of that index, provided the table is still cached and neither the
table nor the index is corrupted or dropped. */
static
void
dict_stats_process_entry_from_defrag_pool()
/*=======================================*/
{
	table_id_t	table_id;
	index_id_t	index_id;

	ut_ad(!srv_read_only_mode);

	/* pop the first index from the auto defrag pool */
	if (!dict_stats_defrag_pool_get(&table_id, &index_id)) {
		/* no index in defrag pool */
		return;
	}

	dict_table_t*	table;

	mutex_enter(&dict_sys->mutex);

	/* If the table is no longer cached, we've already lost the in
	memory stats so there's nothing really to write to disk. */
	table = dict_table_open_on_id(table_id, TRUE,
				      DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);

	if (table == NULL) {
		mutex_exit(&dict_sys->mutex);
		return;
	}

	/* Check whether table is corrupted */
	if (table->corrupted) {
		dict_table_close(table, TRUE, FALSE);
		mutex_exit(&dict_sys->mutex);
		return;
	}

	mutex_exit(&dict_sys->mutex);

	dict_index_t*	index = dict_table_find_index_on_id(table, index_id);

	if (index == NULL) {
		/* The index was dropped while this entry was queued.
		BUG FIX: the table handle acquired above must still be
		released on this path, otherwise its reference count
		leaks and the table can never be evicted. */
		dict_table_close(table, FALSE, FALSE);
		return;
	}

	/* Check whether index is corrupted */
	if (dict_index_is_corrupted(index)) {
		dict_table_close(table, FALSE, FALSE);
		return;
	}

	dict_stats_save_defrag_stats(index);
	dict_table_close(table, FALSE, FALSE);
}
/*****************************************************************//**
This is the thread for background stats gathering. It pops tables, from
the auto recalc list and proceeds them, eventually recalculating their
@ -364,6 +550,9 @@ DECLARE_THREAD(dict_stats_thread)(
dict_stats_process_entry_from_recalc_pool();
while (defrag_pool.size())
dict_stats_process_entry_from_defrag_pool();
os_event_reset(dict_stats_event);
}

View file

@ -57,6 +57,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "buf0flu.h"
#include "buf0dblwr.h"
#include "btr0sea.h"
#include "btr0defragment.h"
#include "os0file.h"
#include "os0thread.h"
#include "srv0start.h"
@ -65,7 +66,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "trx0trx.h"
#include "trx0sys.h"
#include "mtr0mtr.h"
#include "rem0types.h"
#include "row0ins.h"
#include "row0mysql.h"
@ -86,6 +86,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "dict0stats_bg.h"
#include "ha_prototypes.h"
#include "ut0mem.h"
#include "ut0timer.h"
#include "ibuf0ibuf.h"
#include "dict0dict.h"
#include "srv0mon.h"
@ -752,6 +753,14 @@ static SHOW_VAR innodb_status_variables[]= {
{"have_bzip2",
(char*) &innodb_have_bzip2, SHOW_BOOL},
/* Defragmentation */
{"defragment_compression_failures",
(char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG},
{"defragment_failures",
(char*) &export_vars.innodb_defragment_failures, SHOW_LONG},
{"defragment_count",
(char*) &export_vars.innodb_defragment_count, SHOW_LONG},
{NullS, NullS, SHOW_LONG}
};
@ -2351,7 +2360,8 @@ ha_innobase::ha_innobase(
(srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0 ) |
HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
start_of_scan(0),
num_write_row(0)
num_write_row(0),
ha_partition_stats(NULL)
{}
/*********************************************************************//**
@ -10678,6 +10688,71 @@ ha_innobase::delete_table(
DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
}
/*****************************************************************//**
Defragment table. Enqueues all indexes of the table (or just the one
named by index_name) for defragmentation; when async is false, waits
for each enqueued index to finish.
@return 0 on success, or a mysqld error number */
UNIV_INTERN
int
ha_innobase::defragment_table(
/*==========================*/
	const char*	name,		/*!< in: table name */
	const char*	index_name,	/*!< in: index name, or NULL to
					defragment all indexes */
	bool		async)		/*!< in: true => enqueue only;
					false => wait until each index
					has been processed */
{
	char		norm_name[FN_REFLEN];
	dict_table_t*	table;
	dict_index_t*	index;
	ibool		one_index = (index_name != 0);
	int		ret = 0;

	if (!srv_defragment) {
		return ER_FEATURE_DISABLED;
	}

	normalize_table_name(norm_name, name);

	table = dict_table_open_on_name(norm_name, FALSE,
		FALSE, DICT_ERR_IGNORE_NONE);

	if (table == NULL) {
		/* BUG FIX: the open can fail (e.g. the table was
		dropped concurrently); without this check the loop
		below would dereference a NULL pointer. */
		return ER_NO_SUCH_TABLE;
	}

	for (index = dict_table_get_first_index(table); index;
	     index = dict_table_get_next_index(index)) {

		if (one_index && strcasecmp(index_name, index->name) != 0) {
			continue;
		}

		if (btr_defragment_find_index(index)) {
			// We borrow this error code. When the same index is
			// already in the defragmentation queue, issuing
			// another defragmentation only introduces overhead.
			// We return an error here to let the user know this
			// is not necessary. Note that this will fail a query
			// that's trying to defragment a full table if one of
			// the indices in that table is already in
			// defragmentation. We choose this behavior so the
			// user is aware of this rather than silently
			// defragmenting other indices of that table.
			ret = ER_SP_ALREADY_EXISTS;
			break;
		}

		os_event_t event = btr_defragment_add_index(index, async);

		if (!async && event) {
			/* Poll once a second so the wait can be
			interrupted by KILL QUERY. */
			while (os_event_wait_time(event, 1000000)) {
				if (thd_killed(current_thd)) {
					btr_defragment_remove_index(index);
					ret = ER_QUERY_INTERRUPTED;
					break;
				}
			}
			os_event_free(event);
		}

		if (ret) {
			break;
		}

		if (one_index) {
			/* The named index was found and processed. */
			one_index = FALSE;
			break;
		}
	}

	dict_table_close(table, FALSE, FALSE);

	if (ret == 0 && one_index) {
		/* The named index was never matched. */
		ret = ER_NO_SUCH_INDEX;
	}

	return ret;
}
/*****************************************************************//**
Removes all tables in the named database inside InnoDB. */
@ -11816,6 +11891,27 @@ ha_innobase::optimize(
This works OK otherwise, but MySQL locks the entire table during
calls to OPTIMIZE, which is undesirable. */
if (srv_defragment) {
int err;
err = defragment_table(prebuilt->table->name, NULL, false);
if (err == 0) {
return (HA_ADMIN_OK);
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
err,
"InnoDB: Cannot defragment table %s: returned error code %d\n",
prebuilt->table->name, err);
if(err == ER_SP_ALREADY_EXISTS) {
return (HA_ADMIN_OK);
} else {
return (HA_ADMIN_TRY_ALTER);
}
}
}
if (innodb_optimize_fulltext_only) {
if (prebuilt->table->fts && prebuilt->table->fts->cache
&& !dict_table_is_discarded(prebuilt->table)) {
@ -14520,6 +14616,13 @@ innodb_max_dirty_pages_pct_lwm_update(
srv_max_dirty_pages_pct_lwm = in_val;
}
/*********************************************************************//**
Store a pointer to the statistics of the owning partition handler
(NULL if there is none). */
UNIV_INTERN
void
ha_innobase::set_partition_owner_stats(ha_statistics *stats)
{
	this->ha_partition_stats = stats;
}
/************************************************************//**
Validate the file format name and return its corresponding id.
@return valid file format id */
@ -15773,6 +15876,23 @@ innodb_reset_all_monitor_update(
TRUE);
}
/*********************************************************************//**
Update handler for the innodb_defragment_frequency system variable:
stores the new frequency and recomputes the matching minimum interval
(in timer units) between defragmentation passes on the same index. */
static
void
innodb_defragment_frequency_update(
/*===============================*/
	THD*				thd,	/*!< in: thread handle */
	struct st_mysql_sys_var*	var,	/*!< in: pointer to
						system variable */
	void*				var_ptr,/*!< out: where the
						formal string goes */
	const void*			save)	/*!< in: immediate result
						from check function */
{
	const uint	freq = *static_cast<const uint*>(save);

	srv_defragment_frequency = freq;
	srv_defragment_interval = ut_microseconds_to_timer(
		1000000.0 / freq);
}
/****************************************************************//**
Parse and enable InnoDB monitor counters during server startup.
User can list the monitor counters/groups to be enable by specifying
@ -16631,6 +16751,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st
"Load the buffer pool from a file named @@innodb_buffer_pool_filename",
NULL, NULL, FALSE);
static MYSQL_SYSVAR_BOOL(defragment, srv_defragment,
PLUGIN_VAR_RQCMDARG,
"Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing "
"defragmentation will be paused. And new defragmentation command will fail."
"Paused defragmentation commands will resume when this variable is set to "
"true again.",
NULL, NULL, FALSE);
static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages,
PLUGIN_VAR_RQCMDARG,
"Number of pages considered at once when merging multiple pages to "
"defragment",
NULL, NULL, 7, 2, 32, 0);
static MYSQL_SYSVAR_UINT(defragment_stats_accuracy,
srv_defragment_stats_accuracy,
PLUGIN_VAR_RQCMDARG,
"How many defragment stats changes there are before the stats "
"are written to persistent storage. Set to 0 meaning disable "
"defragment stats tracking.",
NULL, NULL, 0, 0, ~0U, 0);
static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs,
srv_defragment_fill_factor_n_recs,
PLUGIN_VAR_RQCMDARG,
"How many records of space defragmentation should leave on the page. "
"This variable, together with innodb_defragment_fill_factor, is introduced "
"so defragmentation won't pack the page too full and cause page split on "
"the next insert on every page. The variable indicating more defragmentation"
" gain is the one effective.",
NULL, NULL, 20, 1, 100, 0);
static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor,
PLUGIN_VAR_RQCMDARG,
"A number between [0.7, 1] that tells defragmentation how full it should "
"fill a page. Default is 0.9. Number below 0.7 won't make much sense."
"This variable, together with innodb_defragment_fill_factor_n_recs, is "
"introduced so defragmentation won't pack the page too full and cause "
"page split on the next insert on every page. The variable indicating more "
"defragmentation gain is the one effective.",
NULL, NULL, 0.9, 0.7, 1, 0);
static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
PLUGIN_VAR_RQCMDARG,
"Do not defragment a single index more than this number of time per second."
"This controls the number of time defragmentation thread can request X_LOCK "
"on an index. Defragmentation thread will check whether "
"1/defragment_frequency (s) has passed since it worked on this index last "
"time, and put the index back to the queue if not enough time has passed. "
"The actual frequency can only be lower than this given number.",
NULL, innodb_defragment_frequency_update,
SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0);
static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
PLUGIN_VAR_RQCMDARG,
"How deep to scan LRU to keep it clean",
@ -17116,6 +17290,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(buffer_pool_load_now),
MYSQL_SYSVAR(buffer_pool_load_abort),
MYSQL_SYSVAR(buffer_pool_load_at_startup),
MYSQL_SYSVAR(defragment),
MYSQL_SYSVAR(defragment_n_pages),
MYSQL_SYSVAR(defragment_stats_accuracy),
MYSQL_SYSVAR(defragment_fill_factor),
MYSQL_SYSVAR(defragment_fill_factor_n_recs),
MYSQL_SYSVAR(defragment_frequency),
MYSQL_SYSVAR(lru_scan_depth),
MYSQL_SYSVAR(flush_neighbors),
MYSQL_SYSVAR(checksum_algorithm),

View file

@ -105,6 +105,8 @@ class ha_innobase: public handler
or undefined */
uint num_write_row; /*!< number of write_row() calls */
ha_statistics* ha_partition_stats; /*!< stats of the partition owner
handler (if there is one) */
uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
const uchar* record);
inline void update_thd(THD* thd);
@ -206,6 +208,8 @@ class ha_innobase: public handler
int truncate();
int delete_table(const char *name);
int rename_table(const char* from, const char* to);
int defragment_table(const char* name, const char* index_name,
bool async);
int check(THD* thd, HA_CHECK_OPT* check_opt);
char* update_table_comment(const char* comment);
char* get_foreign_key_create_info();
@ -309,6 +313,7 @@ class ha_innobase: public handler
Alter_inplace_info* ha_alter_info,
bool commit);
/** @} */
void set_partition_owner_stats(ha_statistics *stats);
bool check_if_incompatible_data(HA_CREATE_INFO *info,
uint table_changes);
private:

View file

@ -2,6 +2,7 @@
Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -671,6 +672,21 @@ btr_get_size(
is s-latched */
__attribute__((nonnull, warn_unused_result));
/**************************************************************//**
Gets the number of reserved and used pages in a B-tree.
@return number of pages reserved, or ULINT_UNDEFINED if the index
is unavailable */
UNIV_INTERN
ulint
btr_get_size_and_reserved(
/*======================*/
dict_index_t* index, /*!< in: index */
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
ulint* used, /*!< out: number of pages used (<= reserved) */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
__attribute__((nonnull));
/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@ -717,6 +733,33 @@ btr_page_free_low(
ulint level, /*!< in: page level */
mtr_t* mtr) /*!< in: mtr */
__attribute__((nonnull));
/*************************************************************//**
Reorganizes an index page.
IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index. This has to
be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
UNIV_INTERN
bool
btr_page_reorganize_block(
/*======================*/
bool recovery,/*!< in: true if called in recovery:
locks should not be updated, i.e.,
there cannot exist locks on the
page, and a hash index should not be
dropped: it cannot exist */
ulint z_level,/*!< in: compression level to be used
if dealing with compressed page */
buf_block_t* block, /*!< in/out: B-tree page */
dict_index_t* index, /*!< in: the index tree of the page */
mtr_t* mtr) /*!< in/out: mini-transaction */
__attribute__((nonnull));
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
Prints size info of a B-tree. */
@ -762,6 +805,60 @@ btr_validate_index(
const trx_t* trx) /*!< in: transaction or 0 */
__attribute__((nonnull(1), warn_unused_result));
#ifdef UNIV_SYNC_DEBUG
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,index,mtr)
#else /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,mtr)
#endif /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages. */
UNIV_INTERN
void
btr_level_list_remove_func(
/*=======================*/
ulint space, /*!< in: space where removed */
ulint zip_size,/*!< in: compressed page size in bytes
or 0 for uncompressed pages */
page_t* page, /*!< in/out: page to remove */
#ifdef UNIV_SYNC_DEBUG
const dict_index_t* index, /*!< in: index tree */
#endif /* UNIV_SYNC_DEBUG */
mtr_t* mtr) /*!< in/out: mini-transaction */
__attribute__((nonnull));
/*************************************************************//**
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
UNIV_INTERN
buf_block_t*
btr_lift_page_up(
/*=============*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: page which is the only on its level;
must not be empty: use
btr_discard_only_page_on_level if the last
record from the page should be removed */
mtr_t* mtr) /*!< in: mtr */
__attribute__((nonnull));
#define BTR_N_LEAF_PAGES 1
#define BTR_TOTAL_SIZE 2
#endif /* !UNIV_HOTBACKUP */

View file

@ -163,9 +163,10 @@ btr_page_get_next(
/*!< in: mini-transaction handle */
{
ut_ad(page && mtr);
#ifndef UNIV_INNOCHECKSUM
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
|| mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
#endif /* UNIV_INNOCHECKSUM */
return(mach_read_from_4(page + FIL_PAGE_NEXT));
}

View file

@ -0,0 +1,101 @@
/*****************************************************************************
Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
#ifndef btr0defragment_h
#define btr0defragment_h
#include "univ.i"
#ifndef UNIV_HOTBACKUP
#include "btr0pcur.h"
/* Max number of pages to consider at once during defragmentation. */
#define BTR_DEFRAGMENT_MAX_N_PAGES 32
/** stats in btr_defragment */
extern ulint btr_defragment_compression_failures;
extern ulint btr_defragment_failures;
extern ulint btr_defragment_count;
/** Item in the work queue for btr_defragment_thread. */
struct btr_defragment_item_t
{
btr_pcur_t* pcur; /* persistent cursor where
btr_defragment_n_pages should start */
os_event_t event; /* if not null, signal after work
is done */
bool removed; /* Mark an item as removed */
ulonglong last_processed; /* timestamp of last time this index
is processed by defragment thread */
btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
~btr_defragment_item_t();
};
/******************************************************************//**
Initialize defragmentation. */
void
btr_defragment_init(void);
/******************************************************************//**
Shutdown defragmentation. */
void
btr_defragment_shutdown();
/******************************************************************//**
Check whether the given index is in btr_defragment_wq. */
bool
btr_defragment_find_index(
dict_index_t* index); /*!< Index to find. */
/******************************************************************//**
Add an index to btr_defragment_wq. Return a pointer to os_event if this
is a synchronized defragmentation. */
os_event_t
btr_defragment_add_index(
dict_index_t* index, /*!< index to be added */
bool async); /*!< whether this is an async defragmentation */
/******************************************************************//**
When table is dropped, this function is called to mark a table as removed in
btr_defragment_wq. The difference between this function and the remove_index
function is this will not NULL the event. */
void
btr_defragment_remove_table(
dict_table_t* table); /*!< Table to be removed. */
/******************************************************************//**
Mark an index as removed from btr_defragment_wq. */
void
btr_defragment_remove_index(
dict_index_t* index); /*!< Index to be removed. */
/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.*/
UNIV_INTERN
void
btr_defragment_save_defrag_stats_if_needed(
dict_index_t* index); /*!< in: index */
/******************************************************************//**
Thread that merges consecutive b-tree pages into fewer pages to defragment
the index. */
extern "C" UNIV_INTERN
os_thread_ret_t
DECLARE_THREAD(btr_defragment_thread)(
/*==========================================*/
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
#endif /* !UNIV_HOTBACKUP */
#endif

View file

@ -120,7 +120,9 @@ enum dict_table_op_t {
DICT_TABLE_OP_DROP_ORPHAN,
/** Silently load the tablespace if it does not exist,
and do not load the definitions of incomplete indexes. */
DICT_TABLE_OP_LOAD_TABLESPACE
DICT_TABLE_OP_LOAD_TABLESPACE,
/** Open the table only if it's in table cache. */
DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
};
/**********************************************************************//**
@ -1496,6 +1498,16 @@ dict_table_get_index_on_name(
const char* name) /*!< in: name of the index to find */
__attribute__((nonnull, warn_unused_result));
/**********************************************************************//**
Looks for an index with the given id given a table instance.
@return index or NULL */
UNIV_INTERN
dict_index_t*
dict_table_find_index_on_id(
/*========================*/
const dict_table_t* table, /*!< in: table instance */
index_id_t id) /*!< in: index id */
__attribute__((nonnull, warn_unused_result));
/**********************************************************************//**
In case there is more than one index with the same name return the index
with the min(id).
@return index, NULL if does not exist */

View file

@ -588,6 +588,10 @@ struct zip_pad_info_t {
rounds */
};
/** Number of samples of data size kept when page compression fails for
a certain index.*/
#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
/** Data structure for an index. Most fields will be
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
struct dict_index_t{
@ -676,6 +680,23 @@ struct dict_index_t{
/*!< approximate number of leaf pages in the
index tree */
/* @} */
/** Statistics for defragmentation, these numbers are estimations and
could be very inaccurate at certain times, e.g. right after restart,
during defragmentation, etc. */
/* @{ */
ulint stat_defrag_modified_counter;
ulint stat_defrag_n_pages_freed;
/* number of pages freed by defragmentation. */
ulint stat_defrag_n_page_split;
/* number of page splits since last full index
defragmentation. */
ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
/* data size when compression failure happened
the most recent 10 times. */
ulint stat_defrag_sample_next_slot;
/* in which slot the next sample should be
saved. */
/* @} */
rw_lock_t lock; /*!< read-write lock protecting the
upper levels of the index tree */
trx_id_t trx_id; /*!< id of the transaction that created this

View file

@ -53,8 +53,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*=====================*/
table_id_t table_id, /*!< in: table id */
dict_err_ignore_t ignore_err); /*!< in: errors to ignore
dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
ibool open_only_if_in_cache);
#ifndef UNIV_NONINL
#include "dict0priv.ic"

View file

@ -74,8 +74,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*======================*/
table_id_t table_id, /*!< in: table id */
dict_err_ignore_t ignore_err) /*!< in: errors to ignore
dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
ibool open_only_if_in_cache)
{
dict_table_t* table;
ulint fold;
@ -88,7 +89,7 @@ dict_table_open_on_id_low(
HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
dict_table_t*, table, ut_ad(table->cached),
table->id == table_id);
if (table == NULL) {
if (table == NULL && !open_only_if_in_cache) {
table = dict_load_table_on_id(table_id, ignore_err);
}

View file

@ -195,6 +195,39 @@ dict_stats_rename_table(
is returned */
size_t errstr_sz); /*!< in: errstr size */
/*********************************************************************//**
Save defragmentation result.
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_summary(
dict_index_t* index); /*!< in: index */
/*********************************************************************//**
Save defragmentation stats for a given index.
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_stats(
dict_index_t* index); /*!< in: index */
/**********************************************************************//**
Clear defragmentation summary. */
UNIV_INTERN
void
dict_stats_empty_defrag_summary(
/*==================*/
dict_index_t* index); /*!< in: index to clear defragmentation stats */
/**********************************************************************//**
Clear defragmentation related index stats. */
UNIV_INTERN
void
dict_stats_empty_defrag_stats(
/*==================*/
dict_index_t* index); /*!< in: index to clear defragmentation stats */
#ifndef UNIV_NONINL
#include "dict0stats.ic"
#endif

View file

@ -56,6 +56,28 @@ dict_stats_recalc_pool_del(
/*=======================*/
const dict_table_t* table); /*!< in: table to remove */
/*****************************************************************//**
Add an index in a table to the defrag pool, which is processed by the
background stats gathering thread. Only the table id and index id are
added to the list, so the table can be closed after being enqueued and
it will be opened when needed. If the table or index does not exist later
(has been DROPped), then it will be removed from the pool and skipped. */
UNIV_INTERN
void
dict_stats_defrag_pool_add(
/*=======================*/
const dict_index_t* index); /*!< in: table to add */
/*****************************************************************//**
Delete a given index from the auto defrag pool. */
UNIV_INTERN
void
dict_stats_defrag_pool_del(
/*=======================*/
const dict_table_t* table, /*!<in: if given, remove
all entries for the table */
const dict_index_t* index); /*!< in: index to remove */
/** Yield the data dictionary latch when waiting
for the background thread to stop accessing a table.
@param trx transaction holding the data dictionary locks */

View file

@ -181,6 +181,16 @@ lock_update_merge_left(
const buf_block_t* right_block); /*!< in: merged index page
which will be discarded */
/*************************************************************//**
Updates the lock table when a page is split and merged to
two pages. */
UNIV_INTERN
void
lock_update_split_and_merge(
const buf_block_t* left_block, /*!< in: left page to which merged */
const rec_t* orig_pred, /*!< in: original predecessor of
supremum on the left page before merge*/
const buf_block_t* right_block);/*!< in: right page from which merged */
/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN

View file

@ -335,6 +335,15 @@ extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern ulint srv_n_read_io_threads;
extern ulint srv_n_write_io_threads;
/* Defragmentation. Originally Facebook's default value was 100, but it's too high */
#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
extern my_bool srv_defragment;
extern uint srv_defragment_n_pages;
extern uint srv_defragment_stats_accuracy;
extern uint srv_defragment_fill_factor_n_recs;
extern double srv_defragment_fill_factor;
extern uint srv_defragment_frequency;
extern ulonglong srv_defragment_interval;
/* Number of IO operations per second the server can do */
extern ulong srv_io_capacity;
@ -888,7 +897,12 @@ struct export_var_t{
ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */
ulint innodb_num_open_files; /*!< fil_n_file_opened */
ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */
ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */
ulint innodb_available_undo_logs; /*!< srv_available_undo_logs
*/
ulint innodb_defragment_compression_failures;
ulint innodb_defragment_failures;
ulint innodb_defragment_count;
#ifdef UNIV_DEBUG
ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */
ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id

View file

@ -687,6 +687,7 @@ or row lock! */
#define SYNC_EXTERN_STORAGE 500
#define SYNC_FSP 400
#define SYNC_FSP_PAGE 395
#define SYNC_STATS_DEFRAG 390
/*------------------------------------- Change buffer headers */
#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */
/*------------------------------------- Change buffer tree */

View file

@ -0,0 +1,104 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/********************************************************************//**
@file include/ut0timer.h
Timer routines
Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
*************************************************************************/
#ifndef ut0timer_h
#define ut0timer_h
#include "univ.i"
#include "data0type.h"
#include <my_rdtsc.h>
/* Current timer stats */
extern struct my_timer_unit_info ut_timer;
/**************************************************************//**
Function pointer to point selected timer function.
@return timer current value */
extern ulonglong (*ut_timer_now)(void);
/**************************************************************//**
Sets up the data required for use of my_timer_* functions.
Selects the best timer by high frequency, and tight resolution.
Points my_timer_now() to the selected timer function.
Initializes my_timer struct to contain the info for selected timer.*/
UNIV_INTERN
void ut_init_timer(void);
/**************************************************************//**
Return time passed since time then, automatically adjusted
for the estimated timer overhead.
@return time passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since(
/*===========*/
ulonglong then); /*!< in: time where to calculate */
/**************************************************************//**
Get time passed since "then", and update then to now
@return time passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since_and_update(
/*======================*/
ulonglong *then); /*!< in: time where to calculate */
/**************************************************************//**
Convert native timer units in a ulonglong into seconds in a double
@return time in a seconds */
UNIV_INLINE
double
ut_timer_to_seconds(
/*=================*/
ulonglong when); /*!< in: time where to calculate */
/**************************************************************//**
Convert native timer units in a ulonglong into milliseconds in a double
@return time in milliseconds */
UNIV_INLINE
double
ut_timer_to_milliseconds(
/*=====================*/
ulonglong when); /*!< in: time where to calculate */
/**************************************************************//**
Convert native timer units in a ulonglong into microseconds in a double
@return time in microseconds */
UNIV_INLINE
double
ut_timer_to_microseconds(
/*=====================*/
ulonglong when); /*!< in: time where to calculate */
/**************************************************************//**
Convert microseconds in a double to native timer units in a ulonglong
@return time in native timer units */
UNIV_INLINE
ulonglong
ut_microseconds_to_timer(
/*=====================*/
ulonglong when); /*!< in: time where to calculate */
#ifndef UNIV_NONINL
#include "ut0timer.ic"
#endif
#endif

View file

@ -0,0 +1,113 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/********************************************************************//**
@file include/ut0timer.ic
Timer routines
Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
*************************************************************************/
/**************************************************************//**
Return the elapsed time since the timer reading "then", compensating
for the measured overhead of reading the timer itself.
@return native timer units passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since(
/*===========*/
	ulonglong	then)	/*!< in: earlier timer reading */
{
	ulonglong	now = ut_timer_now();

	return(now - then - ut_timer.overhead);
}
/**************************************************************//**
Return the elapsed time since "then" (adjusted for timer overhead)
and advance *then to the current timer value.
@return native timer units passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since_and_update(
/*======================*/
	ulonglong*	then)	/*!< in/out: earlier timer reading,
				updated to the current time */
{
	ulonglong	current = ut_timer_now();
	ulonglong	elapsed = current - *then - ut_timer.overhead;

	*then = current;

	return(elapsed);
}
/**************************************************************//**
Convert a native timer reading into seconds.
@return time in seconds as a double */
UNIV_INLINE
double
ut_timer_to_seconds(
/*=================*/
	ulonglong	when)	/*!< in: value in native timer units */
{
	/* Frequency is the number of native units per second. */
	return((double) when / (double) ut_timer.frequency);
}
/**************************************************************//**
Convert a native timer reading into milliseconds.
@return time in milliseconds as a double */
UNIV_INLINE
double
ut_timer_to_milliseconds(
/*=====================*/
	ulonglong	when)	/*!< in: value in native timer units */
{
	double	scaled = (double) when * 1000.0;

	return(scaled / (double) ut_timer.frequency);
}
/**************************************************************//**
Convert a native timer reading into microseconds.
@return time in microseconds as a double */
UNIV_INLINE
double
ut_timer_to_microseconds(
/*=====================*/
	ulonglong	when)	/*!< in: value in native timer units */
{
	double	scaled = (double) when * 1000000.0;

	return(scaled / (double) ut_timer.frequency);
}
/**************************************************************//**
Convert a microsecond count into native timer units.
@return time in native timer units as a ulonglong */
UNIV_INLINE
ulonglong
ut_microseconds_to_timer(
/*=====================*/
	ulonglong	when)	/*!< in: time in microseconds */
{
	double	scaled = (double) when * (double) ut_timer.frequency;

	return((ulonglong) (scaled / 1000000.0));
}

View file

@ -3267,6 +3267,47 @@ lock_update_merge_left(
lock_mutex_exit();
}
/*************************************************************//**
Updates the lock table when a page is split and merged to
two pages: records were moved from the right page to the left page,
so locks held on the left page's supremum and on the right page's
records must be transferred accordingly. NOTE(review): introduced
with online defragmentation; confirm callers against btr0defragment. */
UNIV_INTERN
void
lock_update_split_and_merge(
	const buf_block_t* left_block,	/*!< in: left page to which merged */
	const rec_t* orig_pred,		/*!< in: original predecessor of
					supremum on the left page before merge*/
	const buf_block_t* right_block)	/*!< in: right page from which merged */
{
	const rec_t* left_next_rec;

	ut_a(left_block && right_block);
	ut_a(orig_pred);

	lock_mutex_enter();

	/* First record that was moved in from the right page: the
	successor of the original last user record of the left page. */
	left_next_rec = page_rec_get_next_const(orig_pred);

	/* Inherit the locks on the supremum of the left page to the
	first record which was moved from the right page */
	lock_rec_inherit_to_gap(
		left_block, left_block,
		page_rec_get_heap_no(left_next_rec),
		PAGE_HEAP_NO_SUPREMUM);

	/* Reset the locks on the supremum of the left page,
	releasing waiting transactions */
	lock_rec_reset_and_release_wait(left_block,
					PAGE_HEAP_NO_SUPREMUM);

	/* Inherit the locks to the supremum of the left page from the
	successor of the infimum on the right page */
	lock_rec_inherit_to_gap(left_block, right_block,
				PAGE_HEAP_NO_SUPREMUM,
				lock_get_min_heap_no(right_block));

	lock_mutex_exit();
}
/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */

View file

@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip(
return(insert_rec);
}
/* Page compress failed. If this happened on a
leaf page, put the data size into the sample
buffer. */
if (page_is_leaf(page)) {
ulint occupied = page_get_data_size(page)
+ page_dir_calc_reserved_space(
page_get_n_recs(page));
index->stat_defrag_data_size_sample[
index->stat_defrag_sample_next_slot] =
occupied;
index->stat_defrag_sample_next_slot =
(index->stat_defrag_sample_next_slot
+ 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE;
}
ut_ad(cursor->rec
== (pos > 1
? page_rec_get_nth(

View file

@ -54,6 +54,7 @@ Created 9/17/2000 Heikki Tuuri
#include "rem0cmp.h"
#include "log0log.h"
#include "btr0sea.h"
#include "btr0defragment.h"
#include "fil0fil.h"
#include "ibuf0ibuf.h"
#include "fts0fts.h"
@ -3843,6 +3844,8 @@ row_drop_table_for_mysql(
if (!dict_table_is_temporary(table)) {
dict_stats_recalc_pool_del(table);
dict_stats_defrag_pool_del(table, NULL);
btr_defragment_remove_table(table);
/* Remove stats for this table and all of its indexes from the
persistent storage if it exists and if there are stats for this
@ -5128,18 +5131,6 @@ end:
trx->error_state = DB_SUCCESS;
trx_rollback_to_savepoint(trx, NULL);
trx->error_state = DB_SUCCESS;
} else {
if (old_is_tmp && !new_is_tmp) {
/* After ALTER TABLE the table statistics
needs to be rebuilt. Even if we close
table below there could be other
transactions using this table (e.g.
SELECT * FROM INFORMATION_SCHEMA.`TABLE_CONSTRAINTS`),
thus we can't remove table from dictionary cache
here. Therefore, we initialize the
transient statistics here. */
dict_stats_update_transient(table);
}
}
}

View file

@ -68,6 +68,7 @@ Created 10/8/1995 Heikki Tuuri
#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
#include "srv0mon.h"
#include "ut0crc32.h"
#include "btr0defragment.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"
@ -396,6 +397,15 @@ UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0;
UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0;
UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0;
/* Defragmentation */
UNIV_INTERN my_bool srv_defragment = FALSE;
UNIV_INTERN uint srv_defragment_n_pages = 7;
UNIV_INTERN uint srv_defragment_stats_accuracy = 0;
UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20;
UNIV_INTERN double srv_defragment_fill_factor = 0.9;
UNIV_INTERN uint srv_defragment_frequency =
SRV_DEFRAGMENT_FREQUENCY_DEFAULT;
UNIV_INTERN ulonglong srv_defragment_interval = 0;
/* Set the following to 0 if you want InnoDB to write messages on
stderr on startup/shutdown. */
@ -1492,6 +1502,11 @@ srv_export_innodb_status(void)
export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved;
export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed;
export_vars.innodb_defragment_compression_failures =
btr_defragment_compression_failures;
export_vars.innodb_defragment_failures = btr_defragment_failures;
export_vars.innodb_defragment_count = btr_defragment_count;
#ifdef UNIV_DEBUG
rw_lock_s_lock(&purge_sys->latch);
trx_id_t done_trx_no = purge_sys->done.trx_no;

View file

@ -43,6 +43,7 @@ Created 2/16/1996 Heikki Tuuri
#include "pars0pars.h"
#include "row0ftsort.h"
#include "ut0mem.h"
#include "ut0timer.h"
#include "mem0mem.h"
#include "data0data.h"
#include "data0type.h"
@ -67,6 +68,8 @@ Created 2/16/1996 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "srv0start.h"
#include "srv0srv.h"
#include "btr0defragment.h"
#ifndef UNIV_HOTBACKUP
# include "trx0rseg.h"
# include "os0proc.h"
@ -1531,6 +1534,9 @@ innobase_start_or_create_for_mysql(void)
char* logfile0 = NULL;
size_t dirnamelen;
/* This should be initialized early */
ut_init_timer();
if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) {
srv_read_only_mode = true;
}
@ -2877,6 +2883,9 @@ files_checked:
fts_optimize_init();
}
/* Initialize online defragmentation. */
btr_defragment_init();
srv_was_started = TRUE;
return(DB_SUCCESS);

View file

@ -1164,6 +1164,7 @@ sync_thread_add_level(
case SYNC_IBUF_MUTEX:
case SYNC_INDEX_ONLINE_LOG:
case SYNC_STATS_AUTO_RECALC:
case SYNC_STATS_DEFRAG:
if (!sync_thread_levels_g(array, level, TRUE)) {
fprintf(stderr,
"InnoDB: sync_thread_levels_g(array, %lu)"

View file

@ -0,0 +1,92 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/********************************************************************//**
@file ut/ut0timer.cc
Timer routines
Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
*************************************************************************/
#include "data0type.h"
#include <my_rdtsc.h>
#include <ut0timer.h>
/**************************************************************//**
Initial timer definition: placeholder used as the timer function
until ut_init_timer() selects a real one; always reports zero.
@return 0 */
static
ulonglong
ut_timer_none(void)
/*===============*/
{
	return 0;
}
/**************************************************************//**
Function pointer to point selected timer function.
@return timer current value */
ulonglong (*ut_timer_now)(void) = &ut_timer_none;
struct my_timer_unit_info ut_timer;
/**************************************************************//**
Sets up the data required for use of my_timer_* functions.
Selects the best timer by high frequency, and tight resolution.
Points my_timer_now() to the selected timer function.
Initializes my_timer struct to contain the info for selected timer.*/
UNIV_INTERN
void
ut_init_timer(void)
/*===============*/
{
	MY_TIMER_INFO all_timer_info;
	my_timer_init(&all_timer_info);

	/* Probe the available timers and pick the best, in order of
	preference: CPU cycles, nanoseconds, microseconds, milliseconds,
	ticks. A candidate must have a high enough frequency and a
	resolution of 1 native unit. */
	if (all_timer_info.cycles.frequency > 1000000 &&
	    all_timer_info.cycles.resolution == 1) {
		ut_timer = all_timer_info.cycles;
		ut_timer_now = &my_timer_cycles;
	} else if (all_timer_info.nanoseconds.frequency > 1000000 &&
		   all_timer_info.nanoseconds.resolution == 1) {
		ut_timer = all_timer_info.nanoseconds;
		ut_timer_now = &my_timer_nanoseconds;
	} else if (all_timer_info.microseconds.frequency >= 1000000 &&
		   all_timer_info.microseconds.resolution == 1) {
		ut_timer = all_timer_info.microseconds;
		ut_timer_now = &my_timer_microseconds;
	} else if (all_timer_info.milliseconds.frequency >= 1000 &&
		   all_timer_info.milliseconds.resolution == 1) {
		ut_timer = all_timer_info.milliseconds;
		ut_timer_now = &my_timer_milliseconds;
	} else if (all_timer_info.ticks.frequency >= 1000 &&
		   /* Will probably be false */
		   all_timer_info.ticks.resolution == 1) {
		ut_timer = all_timer_info.ticks;
		ut_timer_now = &my_timer_ticks;
	} else {
		/* None are acceptable, so leave it as "None", and fill in struct */
		ut_timer.frequency = 1;	/* Avoid div-by-zero */
		ut_timer.overhead = 0;	/* Since it doesn't do anything */
		ut_timer.resolution = 10;	/* Another sign it's bad */
		ut_timer.routine = 0;	/* None */
	}
}

View file

@ -292,6 +292,7 @@ SET(INNOBASE_SOURCES
btr/btr0cur.cc
btr/btr0pcur.cc
btr/btr0sea.cc
btr/btr0defragment.cc
buf/buf0buddy.cc
buf/buf0buf.cc
buf/buf0dblwr.cc
@ -405,7 +406,8 @@ SET(INNOBASE_SOURCES
ut/ut0rnd.cc
ut/ut0ut.cc
ut/ut0vec.cc
ut/ut0wqueue.cc)
ut/ut0wqueue.cc
ut/ut0timer.cc)
IF(NOT XTRADB_OK)
MESSAGE(FATAL_ERROR "Percona XtraDB is not supported on this platform")

View file

@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
@ -1212,6 +1213,32 @@ btr_get_size(
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
ulint used;
if (flag == BTR_N_LEAF_PAGES) {
btr_get_size_and_reserved(index, flag, &used, mtr);
return used;
} else if (flag == BTR_TOTAL_SIZE) {
return btr_get_size_and_reserved(index, flag, &used, mtr);
} else {
ut_error;
}
return (ULINT_UNDEFINED);
}
/**************************************************************//**
Gets the number of reserved and used pages in a B-tree.
@return number of pages reserved, or ULINT_UNDEFINED if the index
is unavailable */
UNIV_INTERN
ulint
btr_get_size_and_reserved(
/*======================*/
dict_index_t* index, /*!< in: index */
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
ulint* used, /*!< out: number of pages used (<= reserved) */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
{
fseg_header_t* seg_header;
page_t* root;
@ -1221,6 +1248,8 @@ btr_get_size(
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_S_LOCK));
ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
|| *index->name == TEMP_INDEX_PREFIX) {
return(ULINT_UNDEFINED);
@ -1228,27 +1257,16 @@ btr_get_size(
root = btr_root_get(index, mtr);
SRV_CORRUPT_TABLE_CHECK(root,
{
mtr_commit(mtr);
return(0);
});
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
if (flag == BTR_N_LEAF_PAGES) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
n = fseg_n_reserved_pages(seg_header, used, mtr);
fseg_n_reserved_pages(seg_header, &n, mtr);
} else if (flag == BTR_TOTAL_SIZE) {
if (flag == BTR_TOTAL_SIZE) {
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
} else {
ut_error;
*used += dummy;
}
return(n);
@ -2013,7 +2031,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization.
@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
static __attribute__((nonnull))
UNIV_INTERN
bool
btr_page_reorganize_block(
/*======================*/
@ -2965,6 +2983,12 @@ func_start:
new_page_zip = buf_block_get_page_zip(new_block);
btr_page_create(new_block, new_page_zip, cursor->index,
btr_page_get_level(page, mtr), mtr);
/* Only record the leaf level page splits. */
if (btr_page_get_level(page, mtr) == 0) {
cursor->index->stat_defrag_n_page_split ++;
cursor->index->stat_defrag_modified_counter ++;
btr_defragment_save_defrag_stats_if_needed(cursor->index);
}
/* 3. Calculate the first record on the upper half-page, and the
first record (move_limit) on original page which ends up on the
@ -3223,31 +3247,9 @@ func_exit:
return(rec);
}
#ifdef UNIV_SYNC_DEBUG
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,index,mtr)
#else /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,mtr)
#endif /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages. */
static __attribute__((nonnull))
UNIV_INTERN
void
btr_level_list_remove_func(
/*=======================*/
@ -3419,7 +3421,7 @@ btr_node_ptr_delete(
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
static
UNIV_INTERN
buf_block_t*
btr_lift_page_up(
/*=============*/

View file

@ -0,0 +1,815 @@
/*****************************************************************************
Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
Copyright (C) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
@file btr/btr0defragment.cc
Index defragmentation.
Created 05/29/2014 Rongrong Zhong
Modified 16/07/2014 Sunguck Lee
Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
*******************************************************/
#include "btr0defragment.h"
#ifndef UNIV_HOTBACKUP
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "srv0start.h"
#include "srv0srv.h"
#include "ut0timer.h"
#include <list>
/**************************************************//**
Custom nullptr implementation for under g++ 4.6
NOTE(review): injecting declarations into namespace std is formally
undefined behavior; this shim is kept only for compilers that predate
the C++11 nullptr keyword and is bypassed by the #ifndef guard below
when a real nullptr exists.
*******************************************************/
// #pragma once
namespace std
{
	// based on SC22/WG21/N2431 = J16/07-0301
	struct nullptr_t
	{
		/* Implicitly converts to any object pointer type as 0. */
		template<typename any> operator any * () const
		{
			return 0;
		}
		/* Implicitly converts to any pointer-to-member type as 0. */
		template<class any, typename T> operator T any:: * () const
		{
			return 0;
		}
#ifdef _MSC_VER
		/* Padding so the object has pointer size on MSVC too. */
		struct pad {};
		pad __[sizeof(void*)/sizeof(pad)];
#else
		char __[sizeof(void*)];
#endif
	private:
		//	nullptr_t();// {}
		//	nullptr_t(const nullptr_t&);
		//	void operator = (const nullptr_t&);
		/* Taking the address of the null constant is disallowed. */
		void operator &() const;
		/* Arithmetic on the null constant is disallowed. */
		template<typename any> void operator +(any) const
		{
			/*I Love MSVC 2005!*/
		}
		template<typename any> void operator -(any) const
		{
			/*I Love MSVC 2005!*/
		}
	};
	static const nullptr_t __nullptr = {};
}

#ifndef nullptr
#define nullptr std::__nullptr
#endif
/**************************************************//**
End of Custom nullptr implementation for under g++ 4.6
*******************************************************/
/* When there's no work, either because defragment is disabled, or because no
query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/
#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
/* Reduce the target page size by this amount when compression failure happens
during defragmentation. 512 is chosen because it's a power of 2 and it is about
3% of the page size. When there are compression failures in defragmentation,
our goal is to get a decent defrag ratio with as few compression failure as
possible. From experimentation it seems that reduce the target size by 512 every
time will make sure the page is compressible within a couple of iterations. */
#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
/* Work queue for defragmentation. */
typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
static btr_defragment_wq_t btr_defragment_wq;
/* Mutex protecting the defragmentation work queue.*/
ib_mutex_t btr_defragment_mutex;
#ifdef UNIV_PFS_MUTEX
UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
#endif /* UNIV_PFS_MUTEX */
/* Number of compression failures caused by defragmentation since server
start. */
ulint btr_defragment_compression_failures = 0;
/* Number of btr_defragment_n_pages calls that altered page but didn't
manage to release any page. */
ulint btr_defragment_failures = 0;
/* Total number of btr_defragment_n_pages calls that altered page.
The difference between btr_defragment_count and btr_defragment_failures shows
the amount of effort wasted. */
ulint btr_defragment_count = 0;
/******************************************************************//**
Constructor for btr_defragment_item_t.
Takes ownership of "pcur" (freed by the destructor). "event" may be
NULL for an asynchronous request, i.e. when no caller is waiting. */
btr_defragment_item_t::btr_defragment_item_t(
	btr_pcur_t* pcur,
	os_event_t event)
{
	this->pcur = pcur;
	this->event = event;
	this->removed = false;
	this->last_processed = 0;
}
/******************************************************************//**
Destructor for btr_defragment_item_t.
Frees the owned persistent cursor and, if an event is attached,
signals it so that a synchronously waiting caller can resume. */
btr_defragment_item_t::~btr_defragment_item_t() {
	if (this->pcur) {
		btr_pcur_free_for_mysql(this->pcur);
	}
	if (this->event) {
		os_event_set(this->event);
	}
}
/******************************************************************//**
Initialize defragmentation: derive the pacing interval (in native
timer units) from srv_defragment_frequency, create the work queue
mutex and start the background defragmentation thread. */
void
btr_defragment_init()
{
	srv_defragment_interval = ut_microseconds_to_timer(
		1000000.0 / srv_defragment_frequency);
	mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex,
		     SYNC_ANY_LATCH);
	os_thread_create(btr_defragment_thread, NULL, NULL);
}
/******************************************************************//**
Shutdown defragmentation. Release all resources.
Drains the work queue under the mutex; deleting each item frees its
cursor and wakes any waiter (see the item destructor), then the
queue mutex itself is destroyed. */
void
btr_defragment_shutdown()
{
	mutex_enter(&btr_defragment_mutex);
	list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
	while(iter != btr_defragment_wq.end()) {
		btr_defragment_item_t* item = *iter;
		iter = btr_defragment_wq.erase(iter);
		delete item;
	}
	mutex_exit(&btr_defragment_mutex);
	mutex_free(&btr_defragment_mutex);
}
/******************************************************************//**
Functions used by the query threads: btr_defragment_xxx_index
Query threads find/add/remove index. */
/******************************************************************//**
Check whether the given index already has a work item queued in
btr_defragment_wq. Indices are identified by index->id.
@return true if a matching item is in the queue */
bool
btr_defragment_find_index(
	dict_index_t*	index)	/*!< Index to find. */
{
	bool	found = false;

	mutex_enter(&btr_defragment_mutex);
	for (btr_defragment_wq_t::iterator it = btr_defragment_wq.begin();
	     it != btr_defragment_wq.end() && !found;
	     ++it) {
		btr_cur_t*	cursor = btr_pcur_get_btr_cur((*it)->pcur);

		if (btr_cur_get_index(cursor)->id == index->id) {
			found = true;
		}
	}
	mutex_exit(&btr_defragment_mutex);
	return found;
}
/******************************************************************//**
Query thread uses this function to add an index to btr_defragment_wq.
Return a pointer to os_event for the query thread to wait on if this is a
synchronized defragmentation (async == false). Returns NULL when the
request is asynchronous, or when the index root is a leaf page and
there is nothing to defragment. */
os_event_t
btr_defragment_add_index(
	dict_index_t*	index,	/*!< index to be added */
	bool	async)	/*!< whether this is an async defragmentation */
{
	mtr_t mtr;
	ulint space = dict_index_get_space(index);
	ulint zip_size = dict_table_zip_size(index->table);
	ulint page_no = dict_index_get_page(index);
	mtr_start(&mtr);
	// Load index root page.
	page_t* page = btr_page_get(space, zip_size, page_no,
				    RW_NO_LATCH, index, &mtr);
	if (btr_page_get_level(page, &mtr) == 0) {
		// Index root is a leaf page, no need to defragment.
		mtr_commit(&mtr);
		return NULL;
	}
	btr_pcur_t* pcur = btr_pcur_create_for_mysql();
	/* A synchronous caller gets an event to wait on; it is signalled
	by the work item's destructor when processing finishes. */
	os_event_t event = NULL;
	if (!async) {
		event = os_event_create();
	}
	/* Open a persistent cursor at the start of the index and store
	its position so processing can later resume from here. */
	btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
				    true, 0, &mtr);
	btr_pcur_move_to_next(pcur, &mtr);
	btr_pcur_store_position(pcur, &mtr);
	mtr_commit(&mtr);
	/* Reset defragmentation counters before a fresh run. */
	dict_stats_empty_defrag_summary(index);
	btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
	mutex_enter(&btr_defragment_mutex);
	btr_defragment_wq.push_back(item);
	mutex_exit(&btr_defragment_mutex);
	return event;
}
/******************************************************************//**
When table is dropped, this function is called to mark every work item
belonging to that table as removed in btr_defragment_wq. The difference
between this function and the remove_index function is that this will
not NULL the event. */
void
btr_defragment_remove_table(
	dict_table_t*	table)	/*!< in: table being removed */
{
	mutex_enter(&btr_defragment_mutex);
	list< btr_defragment_item_t* >::iterator it
		= btr_defragment_wq.begin();
	while (it != btr_defragment_wq.end()) {
		btr_defragment_item_t*	work_item = *it;
		dict_index_t*		queued = btr_cur_get_index(
			btr_pcur_get_btr_cur(work_item->pcur));
		/* Mark (do not erase): the defragment thread may be
		using the item right now; it will discard it later. */
		if (queued->table->id == table->id) {
			work_item->removed = true;
		}
		++it;
	}
	mutex_exit(&btr_defragment_mutex);
}
/******************************************************************//**
Query thread uses this function to mark an index as removed in
btr_defragment_wq. Also detaches the completion event so nobody signals
a waiter for a dead item. */
void
btr_defragment_remove_index(
	dict_index_t*	index)	/*!< in: index to be removed */
{
	mutex_enter(&btr_defragment_mutex);
	list< btr_defragment_item_t* >::iterator it
		= btr_defragment_wq.begin();
	while (it != btr_defragment_wq.end()) {
		btr_defragment_item_t*	work_item = *it;
		dict_index_t*		queued = btr_cur_get_index(
			btr_pcur_get_btr_cur(work_item->pcur));
		if (queued->id == index->id) {
			/* Mark only; the defragment thread owns the
			actual removal from the queue. */
			work_item->removed = true;
			work_item->event = NULL;
			break;
		}
		++it;
	}
	mutex_exit(&btr_defragment_mutex);
}
/******************************************************************//**
Functions used by defragmentation thread: btr_defragment_xxx_item.
Defragmentation thread operates on the work *item*. It gets/removes
item from the work queue. */
/******************************************************************//**
Defragment thread uses this to remove an item from btr_defragment_wq.
When an item is removed from the work queue, all resources associated with it
are freed as well. */
void
btr_defragment_remove_item(
	btr_defragment_item_t*	item)	/*!< in: item to be removed */
{
	mutex_enter(&btr_defragment_mutex);
	list< btr_defragment_item_t* >::iterator it
		= btr_defragment_wq.begin();
	while (it != btr_defragment_wq.end()) {
		if (*it == item) {
			/* Unlink from the queue and release the item. */
			btr_defragment_wq.erase(it);
			delete item;
			break;
		}
		++it;
	}
	mutex_exit(&btr_defragment_mutex);
}
/******************************************************************//**
Defragment thread uses this to get an item from btr_defragment_wq to work on.
The item is not removed from the work queue so query threads can still access
this item. We keep it this way so query threads can find and kill a
defragmentation even if that index is being worked on. Be aware that while you
work on this item you have no lock protection on it whatsoever. This is OK as
long as the query threads and defragment thread won't modify the same fields
without lock protection.
@return first item in the work queue, or NULL if the queue is empty */
btr_defragment_item_t*
btr_defragment_get_item()
{
	btr_defragment_item_t*	item = NULL;

	/* Check emptiness while holding the mutex: reading the std::list
	while a query thread concurrently pushes to it is a data race, and
	the old code could dereference end() of an empty list, which is
	undefined behavior. */
	mutex_enter(&btr_defragment_mutex);
	if (!btr_defragment_wq.empty()) {
		item = btr_defragment_wq.front();
	}
	mutex_exit(&btr_defragment_mutex);
	return item;
}
/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.
Currently we save the stats to persistent storage every
srv_defragment_stats_accuracy updates. */
UNIV_INTERN
void
btr_defragment_save_defrag_stats_if_needed(
	dict_index_t*	index)	/*!< in: index */
{
	if (srv_defragment_stats_accuracy == 0) {
		/* Stats tracking disabled. */
		return;
	}
	if (dict_index_get_space(index) == 0) {
		/* Do not track system tables. */
		return;
	}
	if (index->stat_defrag_modified_counter
	    < srv_defragment_stats_accuracy) {
		/* Not enough modifications accumulated yet. */
		return;
	}
	dict_stats_defrag_pool_add(index);
	index->stat_defrag_modified_counter = 0;
}
/*********************************************************************//**
Main defragment functionalities used by defragment thread.*/
/*************************************************************//**
Calculate number of records from beginning of block that can
fit into size_limit
@return number of records */
UNIV_INTERN
ulint
btr_defragment_calc_n_recs_for_size(
	buf_block_t*	block,		/*!< in: B-tree page */
	dict_index_t*	index,		/*!< in: index of the page */
	ulint		size_limit,	/*!< in: size limit to fit records in */
	ulint*		n_recs_size)	/*!< out: actual size of the records
					that fit in size_limit. */
{
	page_t* page = buf_block_get_frame(block);
	ulint n_recs = 0;
	ulint offsets_[REC_OFFS_NORMAL_SIZE];
	ulint* offsets = offsets_;
	rec_offs_init(offsets_);
	mem_heap_t* heap = NULL;
	ulint size = 0;
	page_cur_t cur;

	page_cur_set_before_first(block, &cur);
	page_cur_move_to_next(&cur);
	/* Walk user records from the start of the page, accumulating
	their physical sizes until adding one more record would exceed
	size_limit. */
	while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
		rec_t* cur_rec = page_cur_get_rec(&cur);
		offsets = rec_get_offsets(cur_rec, index, offsets,
					  ULINT_UNDEFINED, &heap);
		ulint rec_size = rec_offs_size(offsets);
		size += rec_size;
		if (size > size_limit) {
			size = size - rec_size;
			break;
		}
		n_recs ++;
		page_cur_move_to_next(&cur);
	}
	/* rec_get_offsets() may have allocated a heap when the offsets
	did not fit in the stack buffer; free it to avoid a memory leak. */
	if (heap != NULL) {
		mem_heap_free(heap);
	}
	*n_recs_size = size;
	return n_recs;
}
/*************************************************************//**
Merge as many records from the from_block to the to_block. Delete
the from_block if all records are successfully merged to to_block.
@return the to_block to target for next merge operation: to_block if
from_block was fully merged and freed, otherwise from_block. */
UNIV_INTERN
buf_block_t*
btr_defragment_merge_pages(
	dict_index_t*	index,		/*!< in: index tree */
	buf_block_t*	from_block,	/*!< in: origin of merge */
	buf_block_t*	to_block,	/*!< in: destination of merge */
	ulint		zip_size,	/*!< in: zip size of the block */
	ulint		reserved_space,	/*!< in: space reserved for future
					insert to avoid immediate page split */
	ulint*		max_data_size,	/*!< in/out: max data size to
					fit in a single compressed page. */
	mem_heap_t*	heap,		/*!< in/out: pointer to memory heap */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	page_t* from_page = buf_block_get_frame(from_block);
	page_t* to_page = buf_block_get_frame(to_block);
	ulint space = dict_index_get_space(index);
	ulint level = btr_page_get_level(from_page, mtr);
	ulint n_recs = page_get_n_recs(from_page);
	ulint new_data_size = page_get_data_size(to_page);
	ulint max_ins_size =
		page_get_max_insert_size(to_page, n_recs);
	ulint max_ins_size_reorg =
		page_get_max_insert_size_after_reorganize(
			to_page, n_recs);
	// Keep reserved_space free on the destination so a later insert
	// does not immediately split the freshly packed page.
	ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
				    ? max_ins_size_reorg - reserved_space : 0;
	ulint move_size = 0;
	ulint n_recs_to_move = 0;
	rec_t* rec = NULL;
	ulint target_n_recs = 0;
	rec_t* orig_pred;
	// Estimate how many records can be moved from the from_page to
	// the to_page.
	if (zip_size) {
		// For compressed pages, further cap the usable space by
		// the estimated maximum compressible data size.
		ulint page_diff = UNIV_PAGE_SIZE - *max_data_size;
		max_ins_size_to_use = (max_ins_size_to_use > page_diff)
			       ? max_ins_size_to_use - page_diff : 0;
	}
	n_recs_to_move = btr_defragment_calc_n_recs_for_size(
		from_block, index, max_ins_size_to_use, &move_size);
	// If max_ins_size >= move_size, we can move the records without
	// reorganizing the page, otherwise we need to reorganize the page
	// first to release more space.
	if (move_size > max_ins_size) {
		if (!btr_page_reorganize_block(false, page_zip_level,
					       to_block, index,
					       mtr)) {
			if (!dict_index_is_clust(index)
			    && page_is_leaf(to_page)) {
				ibuf_reset_free_bits(to_block);
			}
			// If reorganization fails, that means page is
			// not compressable. There's no point to try
			// merging into this page. Continue to the
			// next page.
			return from_block;
		}
		ut_ad(page_validate(to_page, index));
		max_ins_size = page_get_max_insert_size(to_page, n_recs);
		ut_a(max_ins_size >= move_size);
	}
	// Move records to pack to_page more full.
	orig_pred = NULL;
	target_n_recs = n_recs_to_move;
	// Retry loop: each iteration attempts to copy the first
	// n_recs_to_move records of from_page to to_block; on compression
	// failure the target size is reduced and we try again.
	while (n_recs_to_move > 0) {
		rec = page_rec_get_nth(from_page,
					n_recs_to_move + 1);
		orig_pred = page_copy_rec_list_start(
			to_block, from_block, rec, index, mtr);
		if (orig_pred)
			break;
		// If we reach here, that means compression failed after packing
		// n_recs_to_move number of records to to_page. We try to reduce
		// the targeted data size on the to_page by
		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
		os_atomic_increment_ulint(
			&btr_defragment_compression_failures, 1);
		max_ins_size_to_use =
			move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
			? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
			: 0;
		if (max_ins_size_to_use == 0) {
			n_recs_to_move = 0;
			move_size = 0;
			break;
		}
		n_recs_to_move = btr_defragment_calc_n_recs_for_size(
			from_block, index, max_ins_size_to_use, &move_size);
	}
	// If less than target_n_recs are moved, it means there are
	// compression failures during page_copy_rec_list_start. Adjust
	// the max_data_size estimation to reduce compression failures
	// in the following runs.
	if (target_n_recs > n_recs_to_move
	    && *max_data_size > new_data_size + move_size) {
		*max_data_size = new_data_size + move_size;
	}
	// Set ibuf free bits if necessary.
	if (!dict_index_is_clust(index)
	    && page_is_leaf(to_page)) {
		if (zip_size) {
			ibuf_reset_free_bits(to_block);
		} else {
			ibuf_update_free_bits_if_full(
				to_block,
				UNIV_PAGE_SIZE,
				ULINT_UNDEFINED);
		}
	}
	if (n_recs_to_move == n_recs) {
		/* The whole page is merged with the previous page,
		free it. */
		lock_update_merge_left(to_block, orig_pred,
				       from_block);
		btr_search_drop_page_hash_index(from_block);
		btr_level_list_remove(space, zip_size, from_page,
				      index, mtr);
		btr_node_ptr_delete(index, from_block, mtr);
		btr_blob_dbg_remove(from_page, index,
				    "btr_defragment_n_pages");
		btr_page_free(index, from_block, mtr);
	} else {
		// There are still records left on the page, so
		// increment n_defragmented. Node pointer will be changed
		// so remove the old node pointer.
		if (n_recs_to_move > 0) {
			// Part of the page is merged to left, remove
			// the merged records, update record locks and
			// node pointer.
			dtuple_t* node_ptr;
			page_delete_rec_list_start(rec, from_block,
						   index, mtr);
			lock_update_split_and_merge(to_block,
						    orig_pred,
						    from_block);
			btr_node_ptr_delete(index, from_block, mtr);
			rec = page_rec_get_next(
				page_get_infimum_rec(from_page));
			node_ptr = dict_index_build_node_ptr(
				index, rec, page_get_page_no(from_page),
				heap, level + 1);
			btr_insert_on_non_leaf_level(0, index, level+1,
						     node_ptr, mtr);
		}
		to_block = from_block;
	}
	return to_block;
}
/*************************************************************//**
Tries to merge N consecutive pages, starting from the page pointed by the
cursor. Skip space 0. Only consider leaf pages.
This function first loads all N pages into memory, then for each of
the pages other than the first page, it tries to move as many records
as possible to the left sibling to keep the left sibling full. During
the process, if any page becomes empty, that page will be removed from
the level list. Record locks, hash, and node pointers are updated after
page reorganization.
@return pointer to the last block processed, or NULL if reaching end of index */
UNIV_INTERN
buf_block_t*
btr_defragment_n_pages(
	buf_block_t*	block,	/*!< in: starting block for defragmentation */
	dict_index_t*	index,	/*!< in: index tree */
	uint		n_pages,/*!< in: number of pages to defragment */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint		space;
	ulint		zip_size;
	/* We will need to load the n+1 block because if the last page is freed
	and we need to modify the prev_page_no of that block. */
	buf_block_t*	blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
	page_t*		first_page;
	buf_block_t*	current_block;
	ulint		total_data_size = 0;
	ulint		total_n_recs = 0;
	ulint		data_size_per_rec;
	ulint		optimal_page_size;
	ulint		reserved_space;
	ulint		level;
	ulint		max_data_size = 0;
	uint		n_defragmented = 0;
	uint		n_new_slots;
	mem_heap_t*	heap;
	ibool		end_of_index = FALSE;

	/* It doesn't make sense to call this function with n_pages = 1. */
	ut_ad(n_pages > 1);
	/* Caller must hold the index X-lock for the whole operation. */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	space = dict_index_get_space(index);
	if (space == 0) {
		/* Ignore space 0. */
		return NULL;
	}
	if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
		n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
	}
	zip_size = dict_table_zip_size(index->table);
	first_page = buf_block_get_frame(block);
	level = btr_page_get_level(first_page, mtr);
	if (level != 0) {
		/* Only leaf pages are defragmented. */
		return NULL;
	}
	/* 1. Load the pages and calculate the total data size. */
	blocks[0] = block;
	for (uint i = 1; i <= n_pages; i++) {
		page_t* page = buf_block_get_frame(blocks[i-1]);
		ulint page_no = btr_page_get_next(page, mtr);
		total_data_size += page_get_data_size(page);
		total_n_recs += page_get_n_recs(page);
		if (page_no == FIL_NULL) {
			/* No right sibling: stop early and remember that
			we hit the end of the index. */
			n_pages = i;
			end_of_index = TRUE;
			break;
		}
		blocks[i] = btr_block_get(space, zip_size, page_no,
					  RW_X_LATCH, index, mtr);
	}
	if (n_pages == 1) {
		if (btr_page_get_prev(first_page, mtr) == FIL_NULL) {
			/* last page in the index */
			if (dict_index_get_page(index)
			    == page_get_page_no(first_page))
				return NULL;
			/* given page is the last page.
			Lift the records to father. */
			btr_lift_page_up(index, block, mtr);
		}
		return NULL;
	}
	/* 2. Calculate how many pages data can fit in. If not compressable,
	return early. */
	ut_a(total_n_recs != 0);
	data_size_per_rec = total_data_size / total_n_recs;
	// For uncompressed pages, the optimal data size is the free space of an
	// empty page.
	optimal_page_size = page_get_free_space_of_empty(
		page_is_comp(first_page));
	// For compressed pages, we take compression failures into account.
	if (zip_size) {
		ulint size = 0;
		int i = 0;
		// We estimate the optimal data size of the index use samples of
		// data size. These samples are taken when pages failed to
		// compress due to insertion on the page. We use the average
		// of all samples we have as the estimation. Different pages of
		// the same index vary in compressibility. Average gives a good
		// enough estimation.
		for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
			if (index->stat_defrag_data_size_sample[i] == 0) {
				break;
			}
			size += index->stat_defrag_data_size_sample[i];
		}
		if (i != 0) {
			size = size / i;
			optimal_page_size = min(optimal_page_size, size);
		}
		max_data_size = optimal_page_size;
	}
	// Reserve some free space on each target page (governed by the
	// fill factor settings) so that subsequent inserts do not split
	// the freshly packed pages right away.
	reserved_space = min((ulint)(optimal_page_size
			     * (1 - srv_defragment_fill_factor)),
			     (data_size_per_rec
			      * srv_defragment_fill_factor_n_recs));
	optimal_page_size -= reserved_space;
	n_new_slots = (total_data_size + optimal_page_size - 1)
		      / optimal_page_size;
	if (n_new_slots >= n_pages) {
		/* Can't defragment. */
		if (end_of_index)
			return NULL;
		return blocks[n_pages-1];
	}
	/* 3. Defragment pages. */
	heap = mem_heap_create(256);
	// First defragmented page will be the first page.
	current_block = blocks[0];
	// Start from the second page.
	for (uint i = 1; i < n_pages; i ++) {
		buf_block_t* new_block = btr_defragment_merge_pages(
			index, blocks[i], current_block, zip_size,
			reserved_space, &max_data_size, heap, mtr);
		if (new_block != current_block) {
			/* Page i was not fully merged; it becomes the
			new merge destination. */
			n_defragmented ++;
			current_block = new_block;
		}
	}
	mem_heap_free(heap);
	n_defragmented ++;
	os_atomic_increment_ulint(
		&btr_defragment_count, 1);
	if (n_pages == n_defragmented) {
		/* No page was freed: count as a defragmentation failure. */
		os_atomic_increment_ulint(
			&btr_defragment_failures, 1);
	} else {
		index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
	}
	if (end_of_index)
		return NULL;
	return current_block;
}
/******************************************************************//**
Thread that merges consecutive b-tree pages into fewer pages to defragment
the index. Loops until server shutdown, picking work items from
btr_defragment_wq, throttled by srv_defragment_interval. */
extern "C" UNIV_INTERN
os_thread_ret_t
DECLARE_THREAD(btr_defragment_thread)(
/*==========================================*/
	void*	arg)	/*!< in: work queue */
{
	btr_pcur_t*	pcur;
	btr_cur_t*	cursor;
	dict_index_t*	index;
	mtr_t		mtr;
	buf_block_t*	first_block;
	buf_block_t*	last_block;

	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
		/* If defragmentation is disabled, sleep before
		checking whether it's enabled. */
		if (!srv_defragment) {
			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
			continue;
		}
		/* The following call won't remove the item from work queue.
		We only get a pointer to it to work on. This will make sure
		when user issue a kill command, all indices are in the work
		queue to be searched. This also means that the user thread
		cannot directly remove the item from queue (since we might be
		using it). So user thread only marks index as removed. */
		btr_defragment_item_t* item = btr_defragment_get_item();
		/* If work queue is empty, sleep and check later. */
		if (!item) {
			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
			continue;
		}
		/* If an index is marked as removed, we remove it from the work
		queue. No other thread could be using this item at this point so
		it's safe to remove now. */
		if (item->removed) {
			btr_defragment_remove_item(item);
			continue;
		}
		pcur = item->pcur;
		ulonglong now = ut_timer_now();
		ulonglong elapsed = now - item->last_processed;
		if (elapsed < srv_defragment_interval) {
			/* If we see an index again before the interval
			determined by the configured frequency is reached,
			we just sleep until the interval pass. Since
			defragmentation of all indices queue up on a single
			thread, it's likely other indices that follow this one
			don't need to sleep again. */
			os_thread_sleep(((ulint)ut_timer_to_microseconds(
						srv_defragment_interval - elapsed)));
		}
		now = ut_timer_now();
		mtr_start(&mtr);
		/* Resume from the persisted cursor position and try to
		defragment the next batch of pages. */
		btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
		cursor = btr_pcur_get_btr_cur(pcur);
		index = btr_cur_get_index(cursor);
		first_block = btr_cur_get_block(cursor);
		last_block = btr_defragment_n_pages(first_block, index,
						    srv_defragment_n_pages,
						    &mtr);
		if (last_block) {
			/* If we haven't reached the end of the index,
			place the cursor on the last record of last page,
			store the cursor position, and put back in queue. */
			page_t* last_page = buf_block_get_frame(last_block);
			rec_t* rec = page_rec_get_prev(
				page_get_supremum_rec(last_page));
			ut_a(page_rec_is_user_rec(rec));
			page_cur_position(rec, last_block,
					  btr_cur_get_page_cur(cursor));
			btr_pcur_store_position(pcur, &mtr);
			mtr_commit(&mtr);
			/* Update the last_processed time of this index. */
			item->last_processed = now;
		} else {
			mtr_commit(&mtr);
			/* Reaching the end of the index. Persist the
			defragmentation stats and retire the work item. */
			dict_stats_empty_defrag_stats(index);
			dict_stats_save_defrag_stats(index);
			dict_stats_save_defrag_summary(index);
			btr_defragment_remove_item(item);
		}
	}
	btr_defragment_shutdown();
	os_thread_exit(NULL);
	OS_THREAD_DUMMY_RETURN;
}
#endif /* !UNIV_HOTBACKUP */

View file

@ -408,7 +408,7 @@ dict_table_try_drop_aborted(
if (table == NULL) {
table = dict_table_open_on_id_low(
table_id, DICT_ERR_IGNORE_NONE);
table_id, DICT_ERR_IGNORE_NONE, FALSE);
} else {
ut_ad(table->id == table_id);
}
@ -795,7 +795,8 @@ dict_table_open_on_id(
table_id,
table_op == DICT_TABLE_OP_LOAD_TABLESPACE
? DICT_ERR_IGNORE_RECOVER_LOCK
: DICT_ERR_IGNORE_NONE);
: DICT_ERR_IGNORE_NONE,
table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
if (table != NULL) {
@ -1313,7 +1314,7 @@ dict_table_move_from_non_lru_to_lru(
/**********************************************************************//**
Looks for an index with the given id given a table instance.
@return index or NULL */
static
UNIV_INTERN
dict_index_t*
dict_table_find_index_on_id(
/*========================*/
@ -2408,6 +2409,13 @@ undo_size_ok:
new_index->stat_index_size = 1;
new_index->stat_n_leaf_pages = 1;
new_index->stat_defrag_n_pages_freed = 0;
new_index->stat_defrag_n_page_split = 0;
new_index->stat_defrag_sample_next_slot = 0;
memset(&new_index->stat_defrag_data_size_sample,
0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE);
/* Add the new index as the last index for the table */
UT_LIST_ADD_LAST(indexes, table->indexes, new_index);

View file

@ -492,6 +492,9 @@ dict_stats_table_clone_create(
heap,
idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
idx->stat_defrag_n_page_split = 0;
idx->stat_defrag_n_pages_freed = 0;
}
ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
@ -520,7 +523,9 @@ static
void
dict_stats_empty_index(
/*===================*/
dict_index_t* index) /*!< in/out: index */
dict_index_t* index, /*!< in/out: index */
bool empty_defrag_stats)
/*!< in: whether to empty defrag stats */
{
ut_ad(!(index->type & DICT_FTS));
ut_ad(!dict_index_is_univ(index));
@ -535,6 +540,34 @@ dict_stats_empty_index(
index->stat_index_size = 1;
index->stat_n_leaf_pages = 1;
if (empty_defrag_stats) {
dict_stats_empty_defrag_stats(index);
dict_stats_empty_defrag_summary(index);
}
}
/**********************************************************************//**
Clear defragmentation summary: resets the count of pages freed by the
last defragmentation run for the given index. */
UNIV_INTERN
void
dict_stats_empty_defrag_summary(
/*==================*/
	dict_index_t*	index)	/*!< in: index to clear defragmentation stats */
{
	index->stat_defrag_n_pages_freed = 0;
}
/**********************************************************************//**
Clear defragmentation related index stats: the modified-row counter used
to decide when to persist stats, and the leaf page-split counter. */
UNIV_INTERN
void
dict_stats_empty_defrag_stats(
/*==================*/
	dict_index_t*	index)	/*!< in: index to clear defragmentation stats */
{
	index->stat_defrag_modified_counter = 0;
	index->stat_defrag_n_page_split = 0;
}
/*********************************************************************//**
@ -544,7 +577,9 @@ static
void
dict_stats_empty_table(
/*===================*/
dict_table_t* table) /*!< in/out: table */
dict_table_t* table, /*!< in/out: table */
bool empty_defrag_stats)
/*!< in: whether to empty defrag stats */
{
/* Zero the stats members */
@ -569,7 +604,7 @@ dict_stats_empty_table(
ut_ad(!dict_index_is_univ(index));
dict_stats_empty_index(index);
dict_stats_empty_index(index, empty_defrag_stats);
}
table->stat_initialized = TRUE;
@ -704,7 +739,7 @@ dict_stats_copy(
}
if (!INDEX_EQ(src_idx, dst_idx)) {
dict_stats_empty_index(dst_idx);
dict_stats_empty_index(dst_idx, true);
continue;
}
@ -715,7 +750,7 @@ dict_stats_copy(
/* Since src is smaller some elements in dst
will remain untouched by the following memmove(),
thus we init all of them here. */
dict_stats_empty_index(dst_idx);
dict_stats_empty_index(dst_idx, true);
} else {
n_copy_el = dst_idx->n_uniq;
}
@ -735,6 +770,13 @@ dict_stats_copy(
dst_idx->stat_index_size = src_idx->stat_index_size;
dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
dst_idx->stat_defrag_modified_counter =
src_idx->stat_defrag_modified_counter;
dst_idx->stat_defrag_n_pages_freed =
src_idx->stat_defrag_n_pages_freed;
dst_idx->stat_defrag_n_page_split =
src_idx->stat_defrag_n_page_split;
}
dst->stat_initialized = TRUE;
@ -758,6 +800,9 @@ dict_index_t::stat_n_sample_sizes[]
dict_index_t::stat_n_non_null_key_vals[]
dict_index_t::stat_index_size
dict_index_t::stat_n_leaf_pages
dict_index_t::stat_defrag_modified_counter
dict_index_t::stat_defrag_n_pages_freed
dict_index_t::stat_defrag_n_page_split
The returned object should be freed with dict_stats_snapshot_free()
when no longer needed.
@return incomplete table object */
@ -807,7 +852,9 @@ dict_stats_snapshot_free(
Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
are not saved on disk. This was the only way to calculate statistics
before the Persistent Statistics feature was introduced. */
before the Persistent Statistics feature was introduced.
This function doesn't update the defragmentation related stats.
Only persistent statistics supports defragmentation stats. */
static
void
dict_stats_update_transient_for_index(
@ -823,10 +870,10 @@ dict_stats_update_transient_for_index(
Initialize some bogus index cardinality
statistics, so that the data can be queried in
various means, also via secondary indexes. */
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
} else if (ibuf_debug && !dict_index_is_clust(index)) {
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
} else {
mtr_t mtr;
@ -847,7 +894,7 @@ dict_stats_update_transient_for_index(
switch (size) {
case ULINT_UNDEFINED:
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
return;
case 0:
/* The root node of the tree is a leaf */
@ -882,7 +929,7 @@ dict_stats_update_transient(
if (dict_table_is_discarded(table)) {
/* Nothing to do. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, false);
return;
} else if (index == NULL) {
/* Table definition is corrupt */
@ -892,7 +939,7 @@ dict_stats_update_transient(
fprintf(stderr, " InnoDB: table %s has no indexes. "
"Cannot calculate statistics.\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
dict_stats_empty_table(table);
dict_stats_empty_table(table, false);
return;
}
@ -904,7 +951,7 @@ dict_stats_update_transient(
continue;
}
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@ -1794,7 +1841,7 @@ dict_stats_analyze_index(
DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name);
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
mtr_start(&mtr);
@ -2059,7 +2106,7 @@ dict_stats_update_persistent(
/* Table definition is corrupt */
dict_table_stats_unlock(table, RW_X_LATCH);
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
return(DB_CORRUPTION);
}
@ -2088,7 +2135,7 @@ dict_stats_update_persistent(
continue;
}
dict_stats_empty_index(index);
dict_stats_empty_index(index, false);
if (dict_stats_should_ignore_index(index)) {
continue;
@ -2657,6 +2704,16 @@ dict_stats_fetch_index_stats_step(
== 0) {
index->stat_n_leaf_pages = (ulint) stat_value;
arg->stats_were_modified = true;
} else if (stat_name_len == 12 /* strlen("n_page_split") */
&& strncasecmp("n_page_split", stat_name, stat_name_len)
== 0) {
index->stat_defrag_n_page_split = (ulint) stat_value;
arg->stats_were_modified = true;
} else if (stat_name_len == 13 /* strlen("n_pages_freed") */
&& strncasecmp("n_pages_freed", stat_name, stat_name_len)
== 0) {
index->stat_defrag_n_pages_freed = (ulint) stat_value;
arg->stats_were_modified = true;
} else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
&& strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
@ -2776,7 +2833,7 @@ dict_stats_fetch_from_ps(
the persistent storage contains incomplete stats (e.g. missing stats
for some index) then we would end up with (partially) uninitialized
stats. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
trx = trx_allocate_for_background();
@ -2877,6 +2934,22 @@ dict_stats_fetch_from_ps(
return(ret);
}
/*********************************************************************//**
Clear defragmentation stats modified counter for all indices in table. */
static
void
dict_stats_empty_defrag_modified_counter(
	dict_table_t*	table)	/*!< in: table */
{
	dict_index_t*	index;
	ut_a(table);
	/* Iterate every index of the table and reset its counter. */
	for (index = dict_table_get_first_index(table);
	     index != NULL;
	     index = dict_table_get_next_index(index)) {
		index->stat_defrag_modified_counter = 0;
	}
}
/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
UNIV_INTERN
@ -2949,13 +3022,13 @@ dict_stats_update(
"because the .ibd file is missing. For help, please "
"refer to " REFMAN "innodb-troubleshooting.html\n",
ut_format_name(table->name, TRUE, buf, sizeof(buf)));
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
return(DB_TABLESPACE_DELETED);
} else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
/* If we have set a high innodb_force_recovery level, do
not calculate statistics, as a badly corrupted index can
cause a crash in it. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, false);
return(DB_SUCCESS);
}
@ -3014,7 +3087,7 @@ dict_stats_update(
case DICT_STATS_EMPTY_TABLE:
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
/* If table is using persistent stats,
then save the stats on disk */
@ -3073,6 +3146,7 @@ dict_stats_update(
t->stats_last_recalc = table->stats_last_recalc;
t->stat_modified_counter = 0;
dict_stats_empty_defrag_modified_counter(t);
switch (err) {
case DB_SUCCESS:
@ -3083,7 +3157,7 @@ dict_stats_update(
copying because dict_stats_table_clone_create() does
skip corrupted indexes so our dummy object 't' may
have less indexes than the real object 'table'. */
dict_stats_empty_table(table);
dict_stats_empty_table(table, true);
dict_stats_copy(table, t);
@ -3650,6 +3724,117 @@ dict_stats_rename_table(
return(ret);
}
/*********************************************************************//**
Save defragmentation result: persists the "n_pages_freed" counter of the
index to the persistent stats tables, under the dictionary latches.
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_summary(
	dict_index_t*	index)	/*!< in: index */
{
	dberr_t	ret;
	lint	now = (lint) ut_time();
	if (dict_index_is_univ(index)) {
		/* Universal (internal) indexes carry no persistent stats. */
		return DB_SUCCESS;
	}
	rw_lock_x_lock(&dict_operation_lock);
	mutex_enter(&dict_sys->mutex);
	ret = dict_stats_save_index_stat(index, now, "n_pages_freed",
					 index->stat_defrag_n_pages_freed,
					 NULL,
					 "Number of pages freed during"
					 " last defragmentation run.",
					 NULL);
	mutex_exit(&dict_sys->mutex);
	rw_lock_x_unlock(&dict_operation_lock);
	return (ret);
}
/*********************************************************************//**
Save defragmentation stats for a given index: persists "n_page_split",
"n_leaf_pages_defrag" and "n_leaf_pages_reserved" to the persistent
stats tables. Bails out if the tablespace file is missing or the index
is corrupted.
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_stats(
	dict_index_t*	index)	/*!< in: index */
{
	dberr_t	ret;

	if (index->table->ibd_file_missing) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Cannot save defragment stats because "
			".ibd file is missing.\n");
		return (DB_TABLESPACE_DELETED);
	}
	if (dict_index_is_corrupted(index)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Cannot save defragment stats because "
			"index is corrupted.\n");
		return(DB_CORRUPTION);
	}
	if (dict_index_is_univ(index)) {
		/* Universal (internal) indexes carry no persistent stats. */
		return DB_SUCCESS;
	}

	lint	now = (lint) ut_time();
	mtr_t	mtr;
	ulint	n_leaf_pages;
	ulint	n_leaf_reserved;
	/* Measure the current leaf page usage under an S-lock on the
	index, inside a short mini-transaction. */
	mtr_start(&mtr);
	mtr_s_lock(dict_index_get_lock(index), &mtr);
	n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
						    &n_leaf_pages, &mtr);
	mtr_commit(&mtr);

	if (n_leaf_reserved == ULINT_UNDEFINED) {
		// The index name is different during fast index creation,
		// so the stats won't be associated with the right index
		// for later use. We just return without saving.
		return DB_SUCCESS;
	}

	rw_lock_x_lock(&dict_operation_lock);
	mutex_enter(&dict_sys->mutex);
	ret = dict_stats_save_index_stat(index, now, "n_page_split",
					 index->stat_defrag_n_page_split,
					 NULL,
					 "Number of new page splits on leaves"
					 " since last defragmentation.",
					 NULL);
	if (ret != DB_SUCCESS) {
		goto end;
	}

	ret = dict_stats_save_index_stat(
		index, now, "n_leaf_pages_defrag",
		n_leaf_pages,
		NULL,
		"Number of leaf pages when this stat is saved to disk",
		NULL);
	if (ret != DB_SUCCESS) {
		goto end;
	}

	ret = dict_stats_save_index_stat(
		index, now, "n_leaf_pages_reserved",
		n_leaf_reserved,
		NULL,
		"Number of pages reserved for this index leaves when this stat "
		"is saved to disk",
		NULL);

end:
	mutex_exit(&dict_sys->mutex);
	rw_lock_x_unlock(&dict_operation_lock);
	return (ret);
}
/* tests @{ */
#ifdef UNIV_COMPILE_TEST_FUNCS

View file

@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov
#include "row0mysql.h"
#include "srv0start.h"
#include "dict0dict.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL;
/** This mutex protects the "recalc_pool" variable. */
static ib_mutex_t recalc_pool_mutex;
static ib_mutex_t defrag_pool_mutex;
#ifdef HAVE_PSI_INTERFACE
static mysql_pfs_key_t recalc_pool_mutex_key;
static mysql_pfs_key_t defrag_pool_mutex_key;
#endif /* HAVE_PSI_INTERFACE */
/** The number of tables that can be added to "recalc_pool" before
@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool;
typedef recalc_pool_t::iterator recalc_pool_iterator_t;
/** Indices whose defrag stats need to be saved to persistent storage.*/
struct defrag_pool_item_t {
table_id_t table_id;
index_id_t index_id;
};
typedef std::vector<defrag_pool_item_t> defrag_pool_t;
static defrag_pool_t defrag_pool;
typedef defrag_pool_t::iterator defrag_pool_iterator_t;
/*****************************************************************//**
Initialize the recalc pool, called once during thread initialization. */
static
void
dict_stats_recalc_pool_init()
dict_stats_pool_init()
/*=========================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
}
/*****************************************************************//**
@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during
thread de-initialization. */
static
void
dict_stats_recalc_pool_deinit()
dict_stats_pool_deinit()
/*===========================*/
{
ut_ad(!srv_read_only_mode);
recalc_pool.clear();
defrag_pool.clear();
}
/*****************************************************************//**
@ -177,6 +191,111 @@ dict_stats_recalc_pool_del(
mutex_exit(&recalc_pool_mutex);
}
/*****************************************************************//**
Add an index in a table to the defrag pool, which is processed by the
background stats gathering thread. Only the table id and index id are
added to the list, so the table can be closed after being enqueued and
it will be opened when needed. If the table or index does not exist later
(has been DROPped), then it will be removed from the pool and skipped. */
UNIV_INTERN
void
dict_stats_defrag_pool_add(
/*=======================*/
	const dict_index_t*	index)	/*!< in: table to add */
{
	ut_ad(!srv_read_only_mode);

	mutex_enter(&defrag_pool_mutex);

	/* Skip the insertion when an entry for this exact index is
	already queued, so an index appears at most once in the pool. */
	for (defrag_pool_iterator_t it = defrag_pool.begin();
	     it != defrag_pool.end();
	     ++it) {
		if (it->table_id == index->table->id
		    && it->index_id == index->id) {
			mutex_exit(&defrag_pool_mutex);
			return;
		}
	}

	defrag_pool_item_t	item;
	item.table_id = index->table->id;
	item.index_id = index->id;
	defrag_pool.push_back(item);

	mutex_exit(&defrag_pool_mutex);

	/* Wake the background stats thread so it picks up the new entry. */
	os_event_set(dict_stats_event);
}
/*****************************************************************//**
Get an index from the auto defrag pool. The returned index id is removed
from the pool.
@return true if the pool was non-empty and "id" was set, false otherwise */
static
bool
dict_stats_defrag_pool_get(
/*=======================*/
	table_id_t*	table_id,	/*!< out: table id, or unmodified if
					list is empty */
	index_id_t*	index_id)	/*!< out: index id, or unmodified if
					list is empty */
{
	ut_ad(!srv_read_only_mode);

	mutex_enter(&defrag_pool_mutex);

	const bool	found = !defrag_pool.empty();

	if (found) {
		/* Pop the most recently added entry (LIFO order). */
		const defrag_pool_item_t&	last = defrag_pool.back();
		*table_id = last.table_id;
		*index_id = last.index_id;
		defrag_pool.pop_back();
	}

	mutex_exit(&defrag_pool_mutex);

	return(found);
}
/*****************************************************************//**
Delete a given index from the auto defrag pool. */
UNIV_INTERN
void
dict_stats_defrag_pool_del(
/*=======================*/
	const dict_table_t*	table,	/*!<in: if given, remove
					all entries for the table */
	const dict_index_t*	index)	/*!< in: if given, remove this index */
{
	/* Exactly one of table/index must be given. */
	ut_a((table && !index) || (!table && index));
	ut_ad(!srv_read_only_mode);
	ut_ad(mutex_own(&dict_sys->mutex));

	mutex_enter(&defrag_pool_mutex);

	for (defrag_pool_iterator_t it = defrag_pool.begin();
	     it != defrag_pool.end(); ) {

		const bool	matches = table
			? (it->table_id == table->id)
			: (it->table_id == index->table->id
			   && it->index_id == index->id);

		if (!matches) {
			++it;
			continue;
		}

		/* erase() invalidates the iterator */
		it = defrag_pool.erase(it);

		if (index) {
			/* A single index occurs at most once in the
			pool, so we can stop after the first hit. */
			break;
		}
	}

	mutex_exit(&defrag_pool_mutex);
}
/*****************************************************************//**
Wait until background stats thread has stopped using the specified table.
The caller must have locked the data dictionary using
@ -227,7 +346,10 @@ dict_stats_thread_init()
mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex,
SYNC_STATS_AUTO_RECALC);
dict_stats_recalc_pool_init();
/* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */
mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex,
SYNC_STATS_DEFRAG);
dict_stats_pool_init();
}
/*****************************************************************//**
@ -241,11 +363,14 @@ dict_stats_thread_deinit()
ut_a(!srv_read_only_mode);
ut_ad(!srv_dict_stats_thread_active);
dict_stats_recalc_pool_deinit();
dict_stats_pool_deinit();
mutex_free(&recalc_pool_mutex);
memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex));
mutex_free(&defrag_pool_mutex);
memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex));
os_event_free(dict_stats_event);
dict_stats_event = NULL;
}
@ -322,6 +447,63 @@ dict_stats_process_entry_from_recalc_pool()
mutex_exit(&dict_sys->mutex);
}
/*****************************************************************//**
Get the first index that has been added for updating persistent defrag
stats and eventually save its stats. Pops one (table_id, index_id) pair
from the defrag pool, re-opens the table from the cache (only if still
cached) and persists the defragmentation statistics of the index. */
static
void
dict_stats_process_entry_from_defrag_pool()
/*=======================================*/
{
	table_id_t	table_id;
	index_id_t	index_id;

	ut_ad(!srv_read_only_mode);

	/* pop the first index from the auto defrag pool */
	if (!dict_stats_defrag_pool_get(&table_id, &index_id)) {
		/* no index in defrag pool */
		return;
	}

	dict_table_t*	table;

	mutex_enter(&dict_sys->mutex);

	/* If the table is no longer cached, we've already lost the in
	memory stats so there's nothing really to write to disk. */
	table = dict_table_open_on_id(table_id, TRUE,
				      DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);

	if (table == NULL) {
		mutex_exit(&dict_sys->mutex);
		return;
	}

	/* Check whether table is corrupted */
	if (table->corrupted) {
		dict_table_close(table, TRUE, FALSE);
		mutex_exit(&dict_sys->mutex);
		return;
	}
	mutex_exit(&dict_sys->mutex);

	dict_index_t*	index = dict_table_find_index_on_id(table, index_id);

	if (index == NULL) {
		/* Bug fix: the reference taken by dict_table_open_on_id()
		above must be released even when the index no longer
		exists; the original code returned here without closing
		the table, leaking the reference and preventing the table
		from ever being evicted or dropped. */
		dict_table_close(table, FALSE, FALSE);
		return;
	}

	/* Check whether index is corrupted */
	if (dict_index_is_corrupted(index)) {
		dict_table_close(table, FALSE, FALSE);
		return;
	}

	dict_stats_save_defrag_stats(index);
	dict_table_close(table, FALSE, FALSE);
}
/*****************************************************************//**
This is the thread for background stats gathering. It pops tables, from
the auto recalc list and proceeds them, eventually recalculating their
@ -354,6 +536,9 @@ DECLARE_THREAD(dict_stats_thread)(
dict_stats_process_entry_from_recalc_pool();
while (defrag_pool.size())
dict_stats_process_entry_from_defrag_pool();
os_event_reset(dict_stats_event);
}

View file

@ -58,6 +58,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "buf0flu.h"
#include "buf0dblwr.h"
#include "btr0sea.h"
#include "btr0defragment.h"
#include "os0file.h"
#include "os0thread.h"
#include "srv0start.h"
@ -66,7 +67,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "trx0trx.h"
#include "trx0sys.h"
#include "mtr0mtr.h"
#include "rem0types.h"
#include "row0ins.h"
#include "row0mysql.h"
@ -88,6 +88,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "dict0stats_bg.h"
#include "ha_prototypes.h"
#include "ut0mem.h"
#include "ut0timer.h"
#include "ibuf0ibuf.h"
#include "dict0dict.h"
#include "srv0mon.h"
@ -946,6 +947,14 @@ static SHOW_VAR innodb_status_variables[]= {
{"have_bzip2",
(char*) &innodb_have_bzip2, SHOW_BOOL},
/* Defragment */
{"defragment_compression_failures",
(char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG},
{"defragment_failures",
(char*) &export_vars.innodb_defragment_failures, SHOW_LONG},
{"defragment_count",
(char*) &export_vars.innodb_defragment_count, SHOW_LONG},
{NullS, NullS, SHOW_LONG}
};
@ -2700,7 +2709,8 @@ ha_innobase::ha_innobase(
(srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0 ) |
HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
start_of_scan(0),
num_write_row(0)
num_write_row(0),
ha_partition_stats(NULL)
{}
/*********************************************************************//**
@ -11222,6 +11232,72 @@ ha_innobase::delete_table(
DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
}
/*****************************************************************//**
Defragment table. Queues every index of the table (or the single named
index) for defragmentation in the btr_defragment work queue; when
"async" is false, waits for each index to finish before returning.
@return error number (0 on success) */
UNIV_INTERN
int
ha_innobase::defragment_table(
/*==========================*/
	const char*	name,		/*!< in: table name */
	const char*	index_name,	/*!< in: index name, or NULL to
					defragment all indexes */
	bool		async)		/*!< in: if true, enqueue only and
					return without waiting for the
					defragmentation to finish */
{
	char		norm_name[FN_REFLEN];
	dict_table_t*	table;
	dict_index_t*	index;
	ibool		one_index = (index_name != 0);
	int		ret = 0;

	if (!srv_defragment) {
		return ER_FEATURE_DISABLED;
	}

	normalize_table_name(norm_name, name);

	table = dict_table_open_on_name(norm_name, FALSE,
		FALSE, DICT_ERR_IGNORE_NONE);

	if (table == NULL) {
		/* Bug fix: dict_table_open_on_name() can return NULL
		(e.g. the table was dropped concurrently); the original
		code dereferenced the result unconditionally. */
		return ER_NO_SUCH_TABLE;
	}

	for (index = dict_table_get_first_index(table); index;
	     index = dict_table_get_next_index(index)) {

		if (one_index && strcasecmp(index_name, index->name) != 0)
			continue;

		if (btr_defragment_find_index(index)) {
			// We borrow this error code. When the same index is
			// already in the defragmentation queue, issue another
			// defragmentation only introduces overhead. We return
			// an error here to let the user know this is not
			// necessary. Note that this will fail a query that's
			// trying to defragment a full table if one of the
			// indicies in that table is already in defragmentation.
			// We choose this behavior so user is aware of this
			// rather than silently defragment other indicies of
			// that table.
			ret = ER_SP_ALREADY_EXISTS;
			break;
		}

		os_event_t event = btr_defragment_add_index(index, async);

		if (!async && event) {
			/* Poll once a second so a killed query can abort
			the wait and dequeue the index again. */
			while(os_event_wait_time(event, 1000000)) {
				if (thd_killed(current_thd)) {
					btr_defragment_remove_index(index);
					ret = ER_QUERY_INTERRUPTED;
					break;
				}
			}
			os_event_free(event);
		}

		if (ret) {
			break;
		}

		if (one_index) {
			/* Mark that the named index was found so the
			check below does not report ER_NO_SUCH_INDEX. */
			one_index = FALSE;
			break;
		}
	}

	dict_table_close(table, FALSE, FALSE);

	if (ret == 0 && one_index) {
		ret = ER_NO_SUCH_INDEX;
	}

	return ret;
}
/*****************************************************************//**
Removes all tables in the named database inside InnoDB. */
static
@ -12389,6 +12465,27 @@ ha_innobase::optimize(
This works OK otherwise, but MySQL locks the entire table during
calls to OPTIMIZE, which is undesirable. */
if (srv_defragment) {
int err;
err = defragment_table(prebuilt->table->name, NULL, false);
if (err == 0) {
return (HA_ADMIN_OK);
} else {
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
err,
"InnoDB: Cannot defragment table %s: returned error code %d\n",
prebuilt->table->name, err);
if(err == ER_SP_ALREADY_EXISTS) {
return (HA_ADMIN_OK);
} else {
return (HA_ADMIN_TRY_ALTER);
}
}
}
if (innodb_optimize_fulltext_only) {
if (prebuilt->table->fts && prebuilt->table->fts->cache
&& !dict_table_is_discarded(prebuilt->table)) {
@ -15190,6 +15287,13 @@ innodb_max_dirty_pages_pct_lwm_update(
srv_max_dirty_pages_pct_lwm = in_val;
}
/*****************************************************************//**
Remember the ha_statistics object of the owning partition handler.
Only the pointer is stored (see ha_partition_stats in ha_innodb.h:
"stats of the partition owner handler (if there is one)"); passing
NULL clears it. */
UNIV_INTERN
void
ha_innobase::set_partition_owner_stats(ha_statistics *stats)
{
	ha_partition_stats= stats;
}
/************************************************************//**
Validate the file format name and return its corresponding id.
@return valid file format id */
@ -16448,6 +16552,23 @@ innodb_reset_all_monitor_update(
TRUE);
}
static
void
innodb_defragment_frequency_update(
/*===============================*/
	THD*				thd,	/*!< in: thread handle */
	struct st_mysql_sys_var*	var,	/*!< in: pointer to
						system variable */
	void*				var_ptr,/*!< out: where the
						formal string goes */
	const void*			save)	/*!< in: immediate result
						from check function */
{
	/* Store the new per-second frequency and derive from it the
	minimum interval (in timer units) between two defragmentation
	passes on the same index. */
	const uint	freq = *static_cast<const uint*>(save);

	srv_defragment_frequency = freq;
	srv_defragment_interval = ut_microseconds_to_timer(
		1000000.0 / freq);
}
/****************************************************************//**
Parse and enable InnoDB monitor counters during server startup.
User can list the monitor counters/groups to be enable by specifying
@ -17735,6 +17856,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st
"Load the buffer pool from a file named @@innodb_buffer_pool_filename",
NULL, NULL, FALSE);
static MYSQL_SYSVAR_BOOL(defragment, srv_defragment,
PLUGIN_VAR_RQCMDARG,
"Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing "
"defragmentation will be paused. And new defragmentation command will fail."
"Paused defragmentation commands will resume when this variable is set to "
"true again.",
NULL, NULL, FALSE);
static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages,
PLUGIN_VAR_RQCMDARG,
"Number of pages considered at once when merging multiple pages to "
"defragment",
NULL, NULL, 7, 2, 32, 0);
static MYSQL_SYSVAR_UINT(defragment_stats_accuracy,
srv_defragment_stats_accuracy,
PLUGIN_VAR_RQCMDARG,
"How many defragment stats changes there are before the stats "
"are written to persistent storage. Set to 0 meaning disable "
"defragment stats tracking.",
NULL, NULL, 0, 0, ~0U, 0);
static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs,
srv_defragment_fill_factor_n_recs,
PLUGIN_VAR_RQCMDARG,
"How many records of space defragmentation should leave on the page. "
"This variable, together with innodb_defragment_fill_factor, is introduced "
"so defragmentation won't pack the page too full and cause page split on "
"the next insert on every page. The variable indicating more defragmentation"
" gain is the one effective.",
NULL, NULL, 20, 1, 100, 0);
static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor,
PLUGIN_VAR_RQCMDARG,
"A number between [0.7, 1] that tells defragmentation how full it should "
"fill a page. Default is 0.9. Number below 0.7 won't make much sense."
"This variable, together with innodb_defragment_fill_factor_n_recs, is "
"introduced so defragmentation won't pack the page too full and cause "
"page split on the next insert on every page. The variable indicating more "
"defragmentation gain is the one effective.",
NULL, NULL, 0.9, 0.7, 1, 0);
static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency,
PLUGIN_VAR_RQCMDARG,
"Do not defragment a single index more than this number of time per second."
"This controls the number of time defragmentation thread can request X_LOCK "
"on an index. Defragmentation thread will check whether "
"1/defragment_frequency (s) has passed since it worked on this index last "
"time, and put the index back to the queue if not enough time has passed. "
"The actual frequency can only be lower than this given number.",
NULL, innodb_defragment_frequency_update,
SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0);
static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
PLUGIN_VAR_RQCMDARG,
"How deep to scan LRU to keep it clean",
@ -18291,6 +18466,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(buffer_pool_load_now),
MYSQL_SYSVAR(buffer_pool_load_abort),
MYSQL_SYSVAR(buffer_pool_load_at_startup),
MYSQL_SYSVAR(defragment),
MYSQL_SYSVAR(defragment_n_pages),
MYSQL_SYSVAR(defragment_stats_accuracy),
MYSQL_SYSVAR(defragment_fill_factor),
MYSQL_SYSVAR(defragment_fill_factor_n_recs),
MYSQL_SYSVAR(defragment_frequency),
MYSQL_SYSVAR(lru_scan_depth),
MYSQL_SYSVAR(flush_neighbors),
MYSQL_SYSVAR(checksum_algorithm),

View file

@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -105,6 +105,8 @@ class ha_innobase: public handler
or undefined */
uint num_write_row; /*!< number of write_row() calls */
ha_statistics* ha_partition_stats; /*!< stats of the partition owner
handler (if there is one) */
uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
const uchar* record);
inline void update_thd(THD* thd);
@ -207,6 +209,8 @@ class ha_innobase: public handler
int truncate();
int delete_table(const char *name);
int rename_table(const char* from, const char* to);
int defragment_table(const char* name, const char* index_name,
bool async);
int check(THD* thd, HA_CHECK_OPT* check_opt);
char* update_table_comment(const char* comment);
char* get_foreign_key_create_info();
@ -310,6 +314,7 @@ class ha_innobase: public handler
Alter_inplace_info* ha_alter_info,
bool commit);
/** @} */
void set_partition_owner_stats(ha_statistics *stats);
bool check_if_incompatible_data(HA_CREATE_INFO *info,
uint table_changes);
bool check_if_supported_virtual_columns(void) { return TRUE; }

View file

@ -2,6 +2,7 @@
Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@ -674,6 +675,21 @@ btr_get_size(
is s-latched */
__attribute__((nonnull, warn_unused_result));
/**************************************************************//**
Gets the number of reserved and used pages in a B-tree.
@return number of pages reserved, or ULINT_UNDEFINED if the index
is unavailable */
UNIV_INTERN
ulint
btr_get_size_and_reserved(
/*======================*/
dict_index_t* index, /*!< in: index */
ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
ulint* used, /*!< out: number of pages used (<= reserved) */
mtr_t* mtr) /*!< in/out: mini-transaction where index
is s-latched */
__attribute__((nonnull));
/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@ -720,6 +736,33 @@ btr_page_free_low(
ulint level, /*!< in: page level */
mtr_t* mtr) /*!< in: mtr */
__attribute__((nonnull));
/*************************************************************//**
Reorganizes an index page.
IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index. This has to
be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
UNIV_INTERN
bool
btr_page_reorganize_block(
/*======================*/
bool recovery,/*!< in: true if called in recovery:
locks should not be updated, i.e.,
there cannot exist locks on the
page, and a hash index should not be
dropped: it cannot exist */
ulint z_level,/*!< in: compression level to be used
if dealing with compressed page */
buf_block_t* block, /*!< in/out: B-tree page */
dict_index_t* index, /*!< in: the index tree of the page */
mtr_t* mtr) /*!< in/out: mini-transaction */
__attribute__((nonnull));
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
Prints size info of a B-tree. */
@ -765,6 +808,60 @@ btr_validate_index(
const trx_t* trx) /*!< in: transaction or 0 */
__attribute__((nonnull(1), warn_unused_result));
#ifdef UNIV_SYNC_DEBUG
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,index,mtr)
#else /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages.
@param space in: space where removed
@param zip_size in: compressed page size in bytes, or 0 for uncompressed
@param page in/out: page to remove
@param index in: index tree
@param mtr in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr) \
btr_level_list_remove_func(space,zip_size,page,mtr)
#endif /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages. */
UNIV_INTERN
void
btr_level_list_remove_func(
/*=======================*/
ulint space, /*!< in: space where removed */
ulint zip_size,/*!< in: compressed page size in bytes
or 0 for uncompressed pages */
page_t* page, /*!< in/out: page to remove */
#ifdef UNIV_SYNC_DEBUG
const dict_index_t* index, /*!< in: index tree */
#endif /* UNIV_SYNC_DEBUG */
mtr_t* mtr) /*!< in/out: mini-transaction */
__attribute__((nonnull));
/*************************************************************//**
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
UNIV_INTERN
buf_block_t*
btr_lift_page_up(
/*=============*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: page which is the only on its level;
must not be empty: use
btr_discard_only_page_on_level if the last
record from the page should be removed */
mtr_t* mtr) /*!< in: mtr */
__attribute__((nonnull));
#define BTR_N_LEAF_PAGES 1
#define BTR_TOTAL_SIZE 2
#endif /* !UNIV_HOTBACKUP */

View file

@ -28,7 +28,7 @@ Created 6/2/1994 Heikki Tuuri
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "page0zip.h"
#include "srv0srv.h"
#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level
(not really a hard limit).
Used in debug assertions
@ -59,9 +59,7 @@ btr_block_get_func(
block = buf_page_get_gen(space, zip_size, page_no, mode,
NULL, BUF_GET, file, line, mtr);
SRV_CORRUPT_TABLE_CHECK(block, ; /* do nothing */);
if (block && mode != RW_NO_LATCH) {
if (mode != RW_NO_LATCH) {
buf_block_dbg_add_level(
block, index != NULL && dict_index_is_ibuf(index)
@ -165,9 +163,10 @@ btr_page_get_next(
/*!< in: mini-transaction handle */
{
ut_ad(page && mtr);
#ifndef UNIV_INNOCHECKSUM
ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
|| mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
#endif /* UNIV_INNOCHECKSUM */
return(mach_read_from_4(page + FIL_PAGE_NEXT));
}

View file

@ -0,0 +1,100 @@
/*****************************************************************************
Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
#ifndef btr0defragment_h
#define btr0defragment_h
#include "univ.i"
#ifndef UNIV_HOTBACKUP
#include "btr0pcur.h"
/* Max number of pages to consider at once during defragmentation. */
#define BTR_DEFRAGMENT_MAX_N_PAGES 32
/** stats in btr_defragment */
extern ulint btr_defragment_compression_failures;
extern ulint btr_defragment_failures;
extern ulint btr_defragment_count;
/** Item in the work queue for btr_defragment_thread. */
struct btr_defragment_item_t
{
btr_pcur_t* pcur; /* persistent cursor where
btr_defragment_n_pages should start */
os_event_t event; /* if not null, signal after work
is done */
bool removed; /* Mark an item as removed */
ulonglong last_processed; /* timestamp of last time this index
is processed by defragment thread */
btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
~btr_defragment_item_t();
};
/******************************************************************//**
Initialize defragmentation. */
void
btr_defragment_init(void);
/******************************************************************//**
Shutdown defragmentation. */
void
btr_defragment_shutdown();
/******************************************************************//**
Check whether the given index is in btr_defragment_wq. */
bool
btr_defragment_find_index(
dict_index_t* index); /*!< Index to find. */
/******************************************************************//**
Add an index to btr_defragment_wq. Return a pointer to os_event if this
is a synchronized defragmentation. */
os_event_t
btr_defragment_add_index(
dict_index_t* index, /*!< index to be added */
bool async); /*!< whether this is an async defragmentation */
/******************************************************************//**
When table is dropped, this function is called to mark a table as removed in
btr_defragment_wq. The difference between this function and the remove_index
function is this will not NULL the event. */
void
btr_defragment_remove_table(
dict_table_t* table); /*!< Table to be removed. */
/******************************************************************//**
Mark an index as removed from btr_defragment_wq. */
void
btr_defragment_remove_index(
dict_index_t* index); /*!< Index to be removed. */
/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.*/
UNIV_INTERN
void
btr_defragment_save_defrag_stats_if_needed(
dict_index_t* index); /*!< in: index */
/******************************************************************//**
Thread that merges consecutive b-tree pages into fewer pages to defragment
the index. */
extern "C" UNIV_INTERN
os_thread_ret_t
DECLARE_THREAD(btr_defragment_thread)(
/*==========================================*/
void* arg); /*!< in: a dummy parameter required by
os_thread_create */
#endif /* !UNIV_HOTBACKUP */
#endif

View file

@ -120,7 +120,9 @@ enum dict_table_op_t {
DICT_TABLE_OP_DROP_ORPHAN,
/** Silently load the tablespace if it does not exist,
and do not load the definitions of incomplete indexes. */
DICT_TABLE_OP_LOAD_TABLESPACE
DICT_TABLE_OP_LOAD_TABLESPACE,
/** Open the table only if it's in table cache. */
DICT_TABLE_OP_OPEN_ONLY_IF_CACHED
};
/**********************************************************************//**
@ -1495,6 +1497,16 @@ dict_table_get_index_on_name(
const char* name) /*!< in: name of the index to find */
__attribute__((nonnull, warn_unused_result));
/**********************************************************************//**
Looks for an index with the given id given a table instance.
@return index or NULL */
UNIV_INTERN
dict_index_t*
dict_table_find_index_on_id(
/*========================*/
const dict_table_t* table, /*!< in: table instance */
index_id_t id) /*!< in: index id */
__attribute__((nonnull, warn_unused_result));
/**********************************************************************//**
In case there is more than one index with the same name return the index
with the min(id).
@return index, NULL if does not exist */

View file

@ -597,6 +597,10 @@ struct zip_pad_info_t {
rounds */
};
/** Number of samples of data size kept when page compression fails for
a certain index.*/
#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
/** Data structure for an index. Most fields will be
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
struct dict_index_t{
@ -689,6 +693,23 @@ struct dict_index_t{
/*!< approximate number of leaf pages in the
index tree */
/* @} */
/** Statistics for defragmentation, these numbers are estimations and
could be very inaccurate at certain times, e.g. right after restart,
during defragmentation, etc. */
/* @{ */
ulint stat_defrag_modified_counter;
ulint stat_defrag_n_pages_freed;
/* number of pages freed by defragmentation. */
ulint stat_defrag_n_page_split;
/* number of page splits since last full index
defragmentation. */
ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
/* data size when compression failure happened
the most recent 10 times. */
ulint stat_defrag_sample_next_slot;
/* in which slot the next sample should be
saved. */
/* @} */
prio_rw_lock_t lock; /*!< read-write lock protecting the
upper levels of the index tree */
trx_id_t trx_id; /*!< id of the transaction that created this

View file

@ -53,8 +53,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*=====================*/
table_id_t table_id, /*!< in: table id */
dict_err_ignore_t ignore_err); /*!< in: errors to ignore
dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
ibool open_only_if_in_cache);
#ifndef UNIV_NONINL
#include "dict0priv.ic"

View file

@ -74,8 +74,9 @@ dict_table_t*
dict_table_open_on_id_low(
/*======================*/
table_id_t table_id, /*!< in: table id */
dict_err_ignore_t ignore_err) /*!< in: errors to ignore
dict_err_ignore_t ignore_err, /*!< in: errors to ignore
when loading the table */
ibool open_only_if_in_cache)
{
dict_table_t* table;
ulint fold;
@ -88,7 +89,7 @@ dict_table_open_on_id_low(
HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
dict_table_t*, table, ut_ad(table->cached),
table->id == table_id);
if (table == NULL) {
if (table == NULL && !open_only_if_in_cache) {
table = dict_load_table_on_id(table_id, ignore_err);
}

View file

@ -195,6 +195,39 @@ dict_stats_rename_table(
is returned */
size_t errstr_sz); /*!< in: errstr size */
/*********************************************************************//**
Save defragmentation result.
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_summary(
dict_index_t* index); /*!< in: index */
/*********************************************************************//**
Save defragmentation stats for a given index.
@return DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
dict_stats_save_defrag_stats(
dict_index_t* index); /*!< in: index */
/**********************************************************************//**
Clear defragmentation summary. */
UNIV_INTERN
void
dict_stats_empty_defrag_summary(
/*==================*/
dict_index_t* index); /*!< in: index to clear defragmentation stats */
/**********************************************************************//**
Clear defragmentation related index stats. */
UNIV_INTERN
void
dict_stats_empty_defrag_stats(
/*==================*/
dict_index_t* index); /*!< in: index to clear defragmentation stats */
#ifndef UNIV_NONINL
#include "dict0stats.ic"
#endif

View file

@ -56,6 +56,28 @@ dict_stats_recalc_pool_del(
/*=======================*/
const dict_table_t* table); /*!< in: table to remove */
/*****************************************************************//**
Add an index in a table to the defrag pool, which is processed by the
background stats gathering thread. Only the table id and index id are
added to the list, so the table can be closed after being enqueued and
it will be opened when needed. If the table or index does not exist later
(has been DROPped), then it will be removed from the pool and skipped. */
UNIV_INTERN
void
dict_stats_defrag_pool_add(
/*=======================*/
const dict_index_t* index); /*!< in: table to add */
/*****************************************************************//**
Delete a given index from the auto defrag pool. */
UNIV_INTERN
void
dict_stats_defrag_pool_del(
/*=======================*/
const dict_table_t* table, /*!<in: if given, remove
all entries for the table */
const dict_index_t* index); /*!< in: index to remove */
/** Yield the data dictionary latch when waiting
for the background thread to stop accessing a table.
@param trx transaction holding the data dictionary locks */

View file

@ -183,6 +183,16 @@ lock_update_merge_left(
const buf_block_t* right_block); /*!< in: merged index page
which will be discarded */
/*************************************************************//**
Updates the lock table when a page is split and merged to
two pages. */
UNIV_INTERN
void
lock_update_split_and_merge(
const buf_block_t* left_block, /*!< in: left page to which merged */
const rec_t* orig_pred, /*!< in: original predecessor of
supremum on the left page before merge*/
const buf_block_t* right_block);/*!< in: right page from which merged */
/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */
UNIV_INTERN

View file

@ -397,6 +397,15 @@ extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern ulint srv_n_read_io_threads;
extern ulint srv_n_write_io_threads;
/* Defragmentation. Originally Facebook's default value was 100, but it is too high. */
#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
extern my_bool srv_defragment;
extern uint srv_defragment_n_pages;
extern uint srv_defragment_stats_accuracy;
extern uint srv_defragment_fill_factor_n_recs;
extern double srv_defragment_fill_factor;
extern uint srv_defragment_frequency;
extern ulonglong srv_defragment_interval;
/* Number of IO operations per second the server can do */
extern ulong srv_io_capacity;
@ -1099,6 +1108,9 @@ struct export_var_t{
ib_int64_t innodb_x_lock_os_waits;
ib_int64_t innodb_x_lock_spin_rounds;
ib_int64_t innodb_x_lock_spin_waits;
ulint innodb_defragment_compression_failures;
ulint innodb_defragment_failures;
ulint innodb_defragment_count;
#ifdef UNIV_DEBUG
ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */
ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id

View file

@ -864,6 +864,7 @@ or row lock! */
#define SYNC_EXTERN_STORAGE 500
#define SYNC_FSP 400
#define SYNC_FSP_PAGE 395
#define SYNC_STATS_DEFRAG 390
/*------------------------------------- Change buffer headers */
#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */
/*------------------------------------- Change buffer tree */

View file

@ -0,0 +1,104 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/********************************************************************//**
@file include/ut0timer.h
Timer routines
Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
*************************************************************************/
#ifndef ut0timer_h
#define ut0timer_h
#include "univ.i"
#include "data0type.h"
#include <my_rdtsc.h>
/* Current timer stats */
extern struct my_timer_unit_info ut_timer;
/**************************************************************//**
Function pointer to point selected timer function.
@return timer current value */
extern ulonglong (*ut_timer_now)(void);
/**************************************************************//**
Sets up the data required for use of my_timer_* functions.
Selects the best timer by high frequency, and tight resolution.
Points my_timer_now() to the selected timer function.
Initializes my_timer struct to contain the info for selected timer.*/
UNIV_INTERN
void ut_init_timer(void);
/**************************************************************//**
Return time passed since time then, automatically adjusted
for the estimated timer overhead.
@return time passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since(
/*===========*/
ulonglong then); /*!< in: time where to calculate */
/**************************************************************//**
Get time passed since "then", and update then to now
@return time passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since_and_update(
/*======================*/
ulonglong *then); /*!< in: time where to calculate */
/**************************************************************//**
Convert native timer units in a ulonglong into seconds in a double
@return time in a seconds */
UNIV_INLINE
double
ut_timer_to_seconds(
/*=================*/
ulonglong when); /*!< in: time where to calculate */
/**************************************************************//**
Convert native timer units in a ulonglong into milliseconds in a double
@return time in milliseconds */
UNIV_INLINE
double
ut_timer_to_milliseconds(
/*=====================*/
ulonglong when); /*!< in: time where to calculate */
/**************************************************************//**
Convert native timer units in a ulonglong into microseconds in a double
@return time in microseconds */
UNIV_INLINE
double
ut_timer_to_microseconds(
/*=====================*/
ulonglong when); /*!< in: time where to calculate */
/**************************************************************//**
Convert a time value in microseconds to native timer units
@return time in native timer units */
UNIV_INLINE
ulonglong
ut_microseconds_to_timer(
/*=====================*/
ulonglong when); /*!< in: time where to calculate */
#ifndef UNIV_NONINL
#include "ut0timer.ic"
#endif
#endif

View file

@ -0,0 +1,113 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/********************************************************************//**
@file include/ut0timer.ic
Timer routines
Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
*************************************************************************/
/**************************************************************//**
Return the time elapsed since "then", compensating for the
estimated overhead of reading the timer itself.
@return time passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since(
/*===========*/
	ulonglong	then)	/*!< in: earlier timer reading */
{
	const ulonglong	now = ut_timer_now();

	return(now - then - ut_timer.overhead);
}
/**************************************************************//**
Get the time passed since "then", then advance "then" to the
current timer value so that successive calls measure consecutive
intervals.
@return time passed since "then" */
UNIV_INLINE
ulonglong
ut_timer_since_and_update(
/*======================*/
	ulonglong	*then)	/*!< in/out: earlier timer reading;
				updated to the current time */
{
	const ulonglong	now = ut_timer_now();
	const ulonglong	elapsed = now - *then - ut_timer.overhead;

	*then = now;

	return(elapsed);
}
/**************************************************************//**
Convert native timer units in a ulonglong into seconds in a double.
The frequency of the selected timer is taken from ut_timer.
@return time in seconds */
UNIV_INLINE
double
ut_timer_to_seconds(
/*=================*/
	ulonglong	when)	/*!< in: time in native timer units */
{
	return((double) when / (double) ut_timer.frequency);
}
/**************************************************************//**
Convert native timer units in a ulonglong into milliseconds in a
double, using the frequency of the selected timer.
@return time in milliseconds */
UNIV_INLINE
double
ut_timer_to_milliseconds(
/*=====================*/
	ulonglong	when)	/*!< in: time in native timer units */
{
	return((double) when * 1000.0 / (double) ut_timer.frequency);
}
/**************************************************************//**
Convert native timer units in a ulonglong into microseconds in a
double, using the frequency of the selected timer.
@return time in microseconds */
UNIV_INLINE
double
ut_timer_to_microseconds(
/*=====================*/
	ulonglong	when)	/*!< in: time in native timer units */
{
	return((double) when * 1000000.0 / (double) ut_timer.frequency);
}
/**************************************************************//**
Convert a time value in microseconds to native timer units,
using the frequency of the selected timer. Inverse of
ut_timer_to_microseconds() up to rounding.
@return time in native timer units */
UNIV_INLINE
ulonglong
ut_microseconds_to_timer(
/*=====================*/
	ulonglong	when)	/*!< in: time in microseconds */
{
	/* Compute in double to avoid intermediate integer overflow,
	then truncate back to integer timer units. */
	double ret = when;
	ret *= (double)(ut_timer.frequency);
	ret /= 1000000.0;
	return (ulonglong)ret;
}

View file

@ -3290,6 +3290,47 @@ lock_update_merge_left(
lock_mutex_exit();
}
/*************************************************************//**
Updates the lock table when a page is split and merged to
two pages: gap locks are inherited across the boundary so that
no waiting or granted lock is lost by the record movement. */
UNIV_INTERN
void
lock_update_split_and_merge(
	const buf_block_t*	left_block,	/*!< in: left page to which merged */
	const rec_t*		orig_pred,	/*!< in: original predecessor of
						supremum on the left page before merge*/
	const buf_block_t*	right_block)	/*!< in: right page from which merged */
{
	const rec_t* left_next_rec;

	ut_a(left_block && right_block);
	ut_a(orig_pred);

	/* All lock-table manipulation below must be done atomically
	under the global lock system mutex. */
	lock_mutex_enter();

	/* The first record that was moved in from the right page is
	the successor of the original predecessor of the supremum. */
	left_next_rec = page_rec_get_next_const(orig_pred);

	/* Inherit the locks on the supremum of the left page to the
	first record which was moved from the right page */
	lock_rec_inherit_to_gap(
		left_block, left_block,
		page_rec_get_heap_no(left_next_rec),
		PAGE_HEAP_NO_SUPREMUM);

	/* Reset the locks on the supremum of the left page,
	releasing waiting transactions */
	lock_rec_reset_and_release_wait(left_block,
					PAGE_HEAP_NO_SUPREMUM);

	/* Inherit the locks to the supremum of the left page from the
	successor of the infimum on the right page */
	lock_rec_inherit_to_gap(left_block, right_block,
				PAGE_HEAP_NO_SUPREMUM,
				lock_get_min_heap_no(right_block));

	lock_mutex_exit();
}
/*************************************************************//**
Resets the original locks on heir and replaces them with gap type locks
inherited from rec. */

View file

@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip(
return(insert_rec);
}
/* Page compress failed. If this happened on a
leaf page, put the data size into the sample
buffer. */
if (page_is_leaf(page)) {
ulint occupied = page_get_data_size(page)
+ page_dir_calc_reserved_space(
page_get_n_recs(page));
index->stat_defrag_data_size_sample[
index->stat_defrag_sample_next_slot] =
occupied;
index->stat_defrag_sample_next_slot =
(index->stat_defrag_sample_next_slot
+ 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE;
}
ut_ad(cursor->rec
== (pos > 1
? page_rec_get_nth(

View file

@ -53,6 +53,7 @@ Created 9/17/2000 Heikki Tuuri
#include "rem0cmp.h"
#include "log0log.h"
#include "btr0sea.h"
#include "btr0defragment.h"
#include "fil0fil.h"
#include "ibuf0ibuf.h"
#include "fts0fts.h"
@ -3857,6 +3858,8 @@ row_drop_table_for_mysql(
if (!dict_table_is_temporary(table)) {
dict_stats_recalc_pool_del(table);
dict_stats_defrag_pool_del(table, NULL);
btr_defragment_remove_table(table);
/* Remove stats for this table and all of its indexes from the
persistent storage if it exists and if there are stats for this

View file

@ -70,10 +70,11 @@ Created 10/8/1995 Heikki Tuuri
#include "srv0mon.h"
#include "ut0crc32.h"
#include "os0file.h"
#include "btr0defragment.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"
#include "fil0pagecompress.h"
#include <my_rdtsc.h>
/* prototypes of new functions added to ha_innodb.cc for kill_idle_transaction */
ibool innobase_thd_is_idle(const void* thd);
@ -280,6 +281,16 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0;
UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
/* Defragmentation */
UNIV_INTERN my_bool srv_defragment = FALSE;
UNIV_INTERN uint srv_defragment_n_pages = 7;
UNIV_INTERN uint srv_defragment_stats_accuracy = 0;
UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20;
UNIV_INTERN double srv_defragment_fill_factor = 0.9;
UNIV_INTERN uint srv_defragment_frequency =
SRV_DEFRAGMENT_FREQUENCY_DEFAULT;
UNIV_INTERN ulonglong srv_defragment_interval = 0;
/** Query thread preflush algorithm */
UNIV_INTERN ulong srv_foreground_preflush
= SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF;
@ -1876,6 +1887,11 @@ srv_export_innodb_status(void)
export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved;
export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed;
export_vars.innodb_defragment_compression_failures =
btr_defragment_compression_failures;
export_vars.innodb_defragment_failures = btr_defragment_failures;
export_vars.innodb_defragment_count = btr_defragment_count;
#ifdef UNIV_DEBUG
rw_lock_s_lock(&purge_sys->latch);
trx_id_t done_trx_no = purge_sys->done.trx_no;

View file

@ -69,6 +69,8 @@ Created 2/16/1996 Heikki Tuuri
#include "srv0start.h"
#include "srv0srv.h"
#include "buf0flu.h"
#include "btr0defragment.h"
#include "ut0timer.h"
#ifndef UNIV_HOTBACKUP
# include "trx0rseg.h"
@ -1575,6 +1577,9 @@ innobase_start_or_create_for_mysql(void)
char* logfile0 = NULL;
size_t dirnamelen;
/* This should be initialized early */
ut_init_timer();
if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) {
srv_read_only_mode = true;
}
@ -2960,6 +2965,9 @@ files_checked:
fts_optimize_init();
}
/* Initialize online defragmentation. */
btr_defragment_init();
srv_was_started = TRUE;
return(DB_SUCCESS);

View file

@ -1272,6 +1272,7 @@ sync_thread_add_level(
case SYNC_IBUF_MUTEX:
case SYNC_INDEX_ONLINE_LOG:
case SYNC_STATS_AUTO_RECALC:
case SYNC_STATS_DEFRAG:
if (!sync_thread_levels_g(array, level, TRUE)) {
fprintf(stderr,
"InnoDB: sync_thread_levels_g(array, %lu)"

View file

@ -0,0 +1,92 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved.
Copyright (c) 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/********************************************************************//**
@file ut/ut0timer.cc
Timer routines
Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com
modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6
*************************************************************************/
#include "data0type.h"
#include <my_rdtsc.h>
#include <ut0timer.h>
/**************************************************************//**
Fallback timer implementation used until ut_init_timer() selects
a real timer; always reports zero.
@return 0 */
static
ulonglong
ut_timer_none(void)
/*===============*/
{
	return(0);
}
/**************************************************************//**
Function pointer to point selected timer function.
@return timer current value */
ulonglong (*ut_timer_now)(void) = &ut_timer_none;
struct my_timer_unit_info ut_timer;
/**************************************************************//**
Sets up the data required for use of my_timer_* functions.
Selects the best timer by high frequency, and tight resolution.
Points ut_timer_now to the selected timer function.
Initializes the ut_timer struct to contain the info for the
selected timer.*/
UNIV_INTERN
void
ut_init_timer(void)
/*===============*/
{
	MY_TIMER_INFO all_timer_info;
	my_timer_init(&all_timer_info);

	/* Try candidates in decreasing order of preference; a timer
	is accepted only when its frequency is high enough and its
	resolution is exactly one unit. */
	if (all_timer_info.cycles.frequency > 1000000 &&
	    all_timer_info.cycles.resolution == 1) {
		ut_timer = all_timer_info.cycles;
		ut_timer_now = &my_timer_cycles;
	} else if (all_timer_info.nanoseconds.frequency > 1000000 &&
		   all_timer_info.nanoseconds.resolution == 1) {
		ut_timer = all_timer_info.nanoseconds;
		ut_timer_now = &my_timer_nanoseconds;
	} else if (all_timer_info.microseconds.frequency >= 1000000 &&
		   all_timer_info.microseconds.resolution == 1) {
		ut_timer = all_timer_info.microseconds;
		ut_timer_now = &my_timer_microseconds;
	} else if (all_timer_info.milliseconds.frequency >= 1000 &&
		   all_timer_info.milliseconds.resolution == 1) {
		ut_timer = all_timer_info.milliseconds;
		ut_timer_now = &my_timer_milliseconds;
	} else if (all_timer_info.ticks.frequency >= 1000 &&
		   /* Will probably be false */
		   all_timer_info.ticks.resolution == 1) {
		ut_timer = all_timer_info.ticks;
		ut_timer_now = &my_timer_ticks;
	} else {
		/* None are acceptable, so leave it as "None", and fill in struct */
		ut_timer.frequency = 1; /* Avoid div-by-zero */
		ut_timer.overhead = 0; /* Since it doesn't do anything */
		ut_timer.resolution = 10; /* Another sign it's bad */
		ut_timer.routine = 0; /* None */
	}
}