MDEV-35096: History is stored in different partitions on different nodes when using SYSTEM VERSION

Row-injection updates don’t correctly set the historical partition
for tables with system versioning and system_time partitions. This
results in inconsistencies between the master and slave when
replicating transactions that target such tables (i.e. the primary
server would correctly distribute archived rows amongst its
partitions, whereas the replica would have all archived rows in a
single partition). The function
partition_info::vers_set_hist_part(THD*) is used to set the
partition; however, its initial check for
vers_require_hist_part(THD*) returns false, bypassing the rest of
the function (which sets up the partition to use). This is because
the actual check uses the LEX sql_command (via
LEX::vers_history_generating()) to determine if the command is valid
to generate history. Row injections don’t have sql_commands though.

This patch provides a fix which extends the check in
vers_history_generating() to additionally allow row injections to be
history generating (via the function LEX::is_stmt_row_injection()).

Special thanks to Jan Lindstrom <jan.lindstrom@galeracluster.com>
for his work in reproducing the bug, and providing an initial test
case.

Reviewed By
============
Kristian Nielsen <knielsen@knielsen-hq.org>
Aleksey Midenkov <midenok@mariadb.com>
This commit is contained in:
Brandon Nesterenko 2025-01-13 07:04:53 -07:00 committed by Brandon Nesterenko
parent 133e26fd7d
commit d8c841d0d4
4 changed files with 328 additions and 1 deletions

View file

@ -0,0 +1,71 @@
include/master-slave.inc
[connection master]
#
# Initialize system-versioned and partitioned table and its data
connection master;
SET timestamp=UNIX_TIMESTAMP('2025-01-01 01:00:00.000000');
RESET MASTER;
create table t1 (x int) engine=InnoDB with system versioning partition by system_time limit 3 partitions 5;
insert into t1 values(1);
insert into t1 values(2);
insert into t1 values(3);
insert into t1 values(4);
insert into t1 values(5);
# Verifying master partitions are correct after data insertion..
# .. done
connection slave;
connection slave;
# Verifying partitions of master and slave match on data setup..
# .. done
#
# "Delete" each row -- these are the BINLOG commands generated by
# mysqlbinlog from `delete from t1 where x=<n>` statments. Because the
# table uses system versioning and system_time partition, the actual
# events are updates, with added fields for the `row_start` and `row_end`
# columns.
connection master;
# BINLOG for Format Description event
BINLOG '
APZ0Zw8BAAAA/AAAAAABAAAAAAQAMTAuNi4yMS1NYXJpYURCLWRlYnVnLWxvZwAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAA9nRnEzgNAAgAEgAEBAQEEgAA5AAEGggAAAAICAgCAAAACgoKAAAAAAAA
CgoKAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAEEwQADQgICAoKCgHgiCNP
';
# BINLOG for delete from t1 where x=1;
BINLOG '
APZ0ZxMBAAAAMQAAAAQHAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBvaHPfA==
APZ0ZxgBAAAASAAAAEwHAAAAACEAAAAAAAEAAwcH+AEAAABndPYAAAAAf////w9CP/gBAAAAZ3T2
AAAAAGd09gAAAADnhA23
';
# BINLOG for delete from t1 where x=2;
BINLOG '
APZ0ZxMBAAAAMQAAAPUHAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBwNtQNQ==
APZ0ZxgBAAAASAAAAD0IAAAAACEAAAAAAAEAAwcH+AIAAABndPYAAAAAf////w9CP/gCAAAAZ3T2
AAAAAGd09gAAAABPYZUX
';
# BINLOG for delete from t1 where x=3;
BINLOG '
APZ0ZxMBAAAAMQAAAOYIAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBKWGevg==
APZ0ZxgBAAAASAAAAC4JAAAAACEAAAAAAAEAAwcH+AMAAABndPYAAAAAf////w9CP/gDAAAAZ3T2
AAAAAGd09gAAAAD0hz5S
';
# BINLOG for delete from t1 where x=4;
BINLOG '
APZ0ZxMBAAAAMQAAANcJAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBaT9IZg==
APZ0ZxgBAAAASAAAAB8KAAAAACEAAAAAAAEAAwcH+AQAAABndPYAAAAAf////w9CP/gEAAAAZ3T2
AAAAAGd09gAAAADA4Tdx
';
# BINLOG for delete from t1 where x=5;
BINLOG '
APZ0ZxMBAAAAMQAAAMgKAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBMk64Mw==
APZ0ZxgBAAAASAAAABALAAAAACEAAAAAAAEAAwcH+AUAAABndPYAAAAAf////w9CP/gFAAAAZ3T2
AAAAAGd09gAAAAA5blY6
';
# Verifying master partitions are correct after deletion BINLOG stmts..
# .. done
connection slave;
connection slave;
connection master;
drop table t1;
include/rpl_end.inc

View file

@ -0,0 +1,4 @@
# Pin the server time zone. The accompanying test fixes its timestamp with
# UNIX_TIMESTAMP() of a local datetime literal, whose value depends on the
# time zone -- presumably it must match the timestamps encoded in the test's
# hard-coded BINLOG events (TODO confirm).
!include ../my.cnf
[mysqld]
default_time_zone="-7:00"

View file

@ -0,0 +1,248 @@
#
# Ensure that executing row-injected events (i.e. via BINLOG statements and
# row-based binlog events) uses historical partitions. That is, for tables
# which use system versioning and system_time partitions, MDEV-35096 reported
# that row-injected events would not be stored into the correct historical
# partition. This test considers both use cases of row-injected events.
#
# The test setup creates a system-versioned table with system_time-based
# partitioning and fills the table with more records than a single historical
# partition may hold (its LIMIT), so archiving them must span partitions.
#
# To test BINLOG statements, a series of BINLOG statements is used to delete
# all the records in the test table, and the resulting partitions are analyzed
# to ensure that they match the partition specification. The BINLOG events
# were collected by running an original set of delete statements on the table
# data, and taking their binlog data from mysqlbinlog. Note these binary log
# events are actually Update events, because system versioning just archives
# the rows, rather than deleting them.
#
# To test row-based event replication, a slave replicates the master's
# events, and the partitions are compared between the slave and master for
# consistency.
#
# Note that the TIMESTAMP of this test is fixed so the BINLOG statements can
# identify the correct rows to delete (system versioning adds implicit fields
# `row_start` and `row_end`, which are automatically populated using the current
# timestamp).
#
#
# References:
# MDEV-35096: History is stored in different partitions on different nodes
# when using SYSTEM VERSION
#
--source include/have_binlog_format_row.inc
--source include/master-slave.inc
--source include/have_innodb.inc
--source include/have_partition.inc
--echo #
--echo # Initialize system-versioned and partitioned table and its data
--connection master
# Fix the timestamp for the system versioned row_start and row_end fields, so
# the later hard-coded BINLOG base64 data can find the rows.
SET timestamp=UNIX_TIMESTAMP('2025-01-01 01:00:00.000000');
RESET MASTER;
# History partitions are limited to 3 rows each; 5 rows are inserted so that
# archiving all of them later cannot fit into a single history partition.
create table t1 (x int) engine=InnoDB with system versioning partition by system_time limit 3 partitions 5;
insert into t1 values(1);
insert into t1 values(2);
insert into t1 values(3);
insert into t1 values(4);
insert into t1 values(5);
# Capture the master's row counts: the table as a whole, and the first three
# partitions (p0..p2), which this test expects to receive archived rows only.
--let $master_total_size= `select count(*) from t1`
--let $master_p0_size= `select count(*) from t1 partition (p0)`
--let $master_p1_size= `select count(*) from t1 partition (p1)`
--let $master_p2_size= `select count(*) from t1 partition (p2)`
--echo # Verifying master partitions are correct after data insertion..
# All 5 inserted rows must be visible, and nothing has been archived yet, so
# p0..p2 must all be empty.
if ($master_total_size != 5)
{
--echo # Master t1 count: $master_total_size
--die Master table t1 should have 5 entries
}
if ($master_p0_size)
{
--echo # Master t1,p0 count: $master_p0_size
--die Master t1 partition p0 should be empty
}
if ($master_p1_size)
{
--echo # Master t1,p1 count: $master_p1_size
--die Master t1 partition p1 should be empty
}
if ($master_p2_size)
{
--echo # Master t1,p2 count: $master_p2_size
--die Master t1 partition p2 should be empty
}
--echo # .. done
# Let the slave catch up, then collect the same counts there.
--sync_slave_with_master
--connection slave
--let $slave_total_size= `select count(*) from t1`
--let $slave_p0_size= `select count(*) from t1 partition (p0)`
--let $slave_p1_size= `select count(*) from t1 partition (p1)`
--let $slave_p2_size= `select count(*) from t1 partition (p2)`
--echo # Verifying partitions of master and slave match on data setup..
# Each check below only produces output on failure: it prints the count from
# both servers for diagnosis and then aborts the test.
if ($slave_total_size != $master_total_size)
{
--connection master
select count(*) from t1;
--connection slave
select count(*) from t1;
--die Size of t1 differs between master and slave
}
if ($slave_p0_size != $master_p0_size)
{
--connection master
select count(*) from t1 partition (p0);
--connection slave
select count(*) from t1 partition (p0);
--die Size of t1 partition p0 differs between master and slave
}
if ($slave_p1_size != $master_p1_size)
{
--connection master
select count(*) from t1 partition (p1);
--connection slave
select count(*) from t1 partition (p1);
--die Size of t1 partition p1 differs between master and slave
}
if ($slave_p2_size != $master_p2_size)
{
--connection master
select count(*) from t1 partition (p2);
--connection slave
select count(*) from t1 partition (p2);
--die Size of t1 partition p2 differs between master and slave
}
--echo # .. done
# Replay hard-coded, base64-encoded binlog events on the master through BINLOG
# statements. The payloads were captured with mysqlbinlog from
# `delete from t1 where x=<n>` statements and must not be modified; they match
# rows by the fixed timestamp set during test setup.
--echo #
--echo # "Delete" each row -- these are the BINLOG commands generated by
--echo # mysqlbinlog from `delete from t1 where x=<n>` statments. Because the
--echo # table uses system versioning and system_time partition, the actual
--echo # events are updates, with added fields for the `row_start` and `row_end`
--echo # columns.
--connection master
# The Format Description event is applied first; presumably the row events
# that follow rely on it to be decoded (see the BINLOG statement docs).
--echo # BINLOG for Format Description event
BINLOG '
APZ0Zw8BAAAA/AAAAAABAAAAAAQAMTAuNi4yMS1NYXJpYURCLWRlYnVnLWxvZwAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAA9nRnEzgNAAgAEgAEBAQEEgAA5AAEGggAAAAICAgCAAAACgoKAAAAAAAA
CgoKAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAEEwQADQgICAoKCgHgiCNP
';
# One BINLOG statement per row of t1 (x=1..5). Each archives its row, i.e. it
# is an update event rather than a physical delete.
--echo # BINLOG for delete from t1 where x=1;
BINLOG '
APZ0ZxMBAAAAMQAAAAQHAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBvaHPfA==
APZ0ZxgBAAAASAAAAEwHAAAAACEAAAAAAAEAAwcH+AEAAABndPYAAAAAf////w9CP/gBAAAAZ3T2
AAAAAGd09gAAAADnhA23
';
--echo # BINLOG for delete from t1 where x=2;
BINLOG '
APZ0ZxMBAAAAMQAAAPUHAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBwNtQNQ==
APZ0ZxgBAAAASAAAAD0IAAAAACEAAAAAAAEAAwcH+AIAAABndPYAAAAAf////w9CP/gCAAAAZ3T2
AAAAAGd09gAAAABPYZUX
';
--echo # BINLOG for delete from t1 where x=3;
BINLOG '
APZ0ZxMBAAAAMQAAAOYIAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBKWGevg==
APZ0ZxgBAAAASAAAAC4JAAAAACEAAAAAAAEAAwcH+AMAAABndPYAAAAAf////w9CP/gDAAAAZ3T2
AAAAAGd09gAAAAD0hz5S
';
--echo # BINLOG for delete from t1 where x=4;
BINLOG '
APZ0ZxMBAAAAMQAAANcJAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBaT9IZg==
APZ0ZxgBAAAASAAAAB8KAAAAACEAAAAAAAEAAwcH+AQAAABndPYAAAAAf////w9CP/gEAAAAZ3T2
AAAAAGd09gAAAADA4Tdx
';
--echo # BINLOG for delete from t1 where x=5;
BINLOG '
APZ0ZxMBAAAAMQAAAMgKAAAAACEAAAAAAAEABHRlc3QAAnQxAAMDERECBgYBMk64Mw==
APZ0ZxgBAAAASAAAABALAAAAACEAAAAAAAEAAwcH+AUAAABndPYAAAAAf////w9CP/gFAAAAZ3T2
AAAAAGd09gAAAAA5blY6
';
# Re-read the master's row counts now that every row has been "deleted" by
# the BINLOG statements.
--let $master_total_size= `select count(*) from t1`
--let $master_p0_size= `select count(*) from t1 partition (p0)`
--let $master_p1_size= `select count(*) from t1 partition (p1)`
--let $master_p2_size= `select count(*) from t1 partition (p2)`
--echo # Verifying master partitions are correct after deletion BINLOG stmts..
# No current rows may remain once all 5 rows have been archived.
if ($master_total_size > 0)
{
--echo # Master t1 count: $master_total_size
--die Master table t1 should have 0 count
}
# With a limit of 3 rows per history partition, the 5 archived rows are
# expected as 3 in p0 and the remaining 2 in p1, leaving p2 untouched. Before
# the MDEV-35096 fix, row injections did not select the history partition.
if ($master_p0_size != 3)
{
--echo # Master t1,p0 count: $master_p0_size
--die Master t1 partition p0 should have 3 entries
}
if ($master_p1_size != 2)
{
--echo # Master t1,p1 count: $master_p1_size
--die Master t1 partition p1 should have 2 entries
}
if ($master_p2_size)
{
--echo # Master t1,p2 count: $master_p2_size
--die Master t1 partition p2 should be empty
}
--echo # .. done
# Let the slave apply the replicated row events, then verify its partitions
# hold exactly the same number of rows as the master's.
--sync_slave_with_master
--connection slave
--let $slave_total_size= `select count(*) from t1`
--let $slave_p0_size= `select count(*) from t1 partition (p0)`
--let $slave_p1_size= `select count(*) from t1 partition (p1)`
--let $slave_p2_size= `select count(*) from t1 partition (p2)`
# Each check below only produces output on failure: it prints the count from
# both servers for diagnosis and then aborts the test.
if ($slave_total_size != $master_total_size)
{
--connection master
select count(*) from t1;
--connection slave
select count(*) from t1;
--die Size of t1 differs between master and slave
}
if ($slave_p0_size != $master_p0_size)
{
--connection master
select count(*) from t1 partition (p0);
--connection slave
select count(*) from t1 partition (p0);
--die Size of t1 partition p0 differs between master and slave
}
if ($slave_p1_size != $master_p1_size)
{
--connection master
select count(*) from t1 partition (p1);
--connection slave
select count(*) from t1 partition (p1);
--die Size of t1 partition p1 differs between master and slave
}
if ($slave_p2_size != $master_p2_size)
{
--connection master
select count(*) from t1 partition (p2);
--connection slave
select count(*) from t1 partition (p2);
--die Size of t1 partition p2 differs between master and slave
}
# Cleanup: drop the table on the master and shut replication down.
--connection master
drop table t1;
--source include/rpl_end.inc

View file

@ -4617,7 +4617,11 @@ public:
case SQLCOM_LOAD:
return duplicates == DUP_REPLACE;
default:
return false;
/*
Row injections (i.e. row binlog events and BINLOG statements) should
generate history.
*/
return is_stmt_row_injection();
}
}