mariadb/mysys/mf_iocache2.c
Kristian Nielsen b8f9f796ff MDEV-31273: Precompute binlog checksums
Compute binlog checksums (when enabled) already when writing events
into the statement or transaction caches, where before it was done
when the caches are copied to the real binlog file. This moves the
checksum computation outside of holding LOCK_log, improving
scalability.

At stmt/trx cache write time, the final end_log_pos values are not
known, so with this patch these will be set to 0. Events that are
written directly to the binlog file (not through stmt/trx cache) keep
the correct end_log_pos value. The GTID and COMMIT/XID events at the
start and end of event groups are written directly, so the zero
end_log_pos is only for events in the middle of event groups, which
do not negatively affect replication.

An option --binlog-legacy-event-pos, off by default, is provided to
disable this behavior to provide backwards compatibility with any
external applications that might rely on end_log_pos in events in the
middle of event groups.

Checksums cannot be pre-computed when binlog encryption is enabled, as
encryption relies on correct end_log_pos to provide part of the
nonce/IV.

Checksum pre-computation is also disabled for WSREP/Galera, as it uses
events differently in its write-sets and so on. Extending pre-computation of
checksums to Galera where it makes sense could be added in a future patch.

The current --binlog-checksum configuration is saved in
binlog_cache_data at transaction start and used to pre-compute
checksums in cache, if applicable. When the cache is later copied to
the binlog, a check is made if the saved value still matches the
configured global value; if so, the events are block-copied directly
into the binlog file. If --binlog-checksum was changed during the
transaction, events are re-written to the binlog file one-by-one and
the checksums recomputed/discarded as appropriate.

Reviewed-by: Monty <monty@mariadb.org>
Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
2023-10-27 19:57:43 +02:00

529 lines
14 KiB
C

/* Copyright (c) 2000, 2018, Oracle and/or its affiliates.
Copyright (c) 2009, 2018, MariaDB Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
/*
More functions to be used with IO_CACHE files
*/
#include "mysys_priv.h"
#include <m_string.h>
#include <stdarg.h>
#include <m_ctype.h>
/**
  Stream the unread contents of an IO_CACHE into a stdio FILE.

  At most @c count bytes are transferred; pass SIZE_T_MAX to copy until the
  cache cannot be refilled any more. Data is written in pieces: first what
  is currently buffered, then one refill of the cache buffer at a time.
  If a write fails the function returns immediately, so the destination
  may have received only part of the data.

  TODO
    Make this function solid by handling partial reads from the cache
    in a correct manner: it should be atomic.

  @param cache  IO_CACHE to read from (its read_pos is advanced)
  @param file   FILE to write to
  @param count  maximum number of bytes to copy, or the max of the type
                when the whole cache is to be copied

  @return
    0  All OK
    1  An error occurred (write failure, or cache->error == -1 afterwards)
*/
int
my_b_copy_to_file(IO_CACHE *cache, FILE *file,
                  size_t count)
{
  size_t available, chunk;
  DBUG_ENTER("my_b_copy_to_file");

  available= my_b_bytes_in_cache(cache);
  for (;;)
  {
    /* Never write more than what is buffered or what is still wanted */
    chunk= (available < count) ? available : count;
    if (my_fwrite(file, cache->read_pos, chunk,
                  MYF(MY_WME | MY_NABP)) == (size_t) -1)
      DBUG_RETURN(1);

    cache->read_pos+= chunk;
    count-= chunk;

    if (!count)
      break;                                    /* Requested amount copied */
    available= my_b_fill(cache);
    if (!available)
      break;                                    /* Nothing more to read */
  }

  DBUG_RETURN(cache->error == -1 ? 1 : 0);
}
/**
  Copy an entire IO_CACHE to a stdio FILE.

  The cache is first re-initialised as a READ_CACHE positioned at offset 0,
  then everything is handed to my_b_copy_to_file() with no byte limit.

  @return 0 on success, 1 if the re-init or the copy failed
*/
int my_b_copy_all_to_file(IO_CACHE *cache, FILE *file)
{
  int failed;
  DBUG_ENTER("my_b_copy_all_to_file");

  /* The copy is only attempted when the rewind succeeded */
  failed= reinit_io_cache(cache, READ_CACHE, 0L, FALSE, FALSE) ||
          my_b_copy_to_file(cache, file, SIZE_T_MAX);

  DBUG_RETURN(failed);
}
/**
  Stream the unread contents of one IO_CACHE into another IO_CACHE.

  Works like my_b_copy_to_file(), except that the data is appended to
  @c to_cache with my_b_write() instead of being written to a FILE.

  @param from_cache  IO_CACHE to read from (its read_pos is advanced)
  @param to_cache    IO_CACHE to write to
  @param count       maximum number of bytes to copy

  @return
    0  All OK
    1  An error occurred (write failure, or from_cache->error == -1)
*/
int
my_b_copy_to_cache(IO_CACHE *from_cache, IO_CACHE *to_cache,
                   size_t count)
{
  size_t available, chunk;
  DBUG_ENTER("my_b_copy_to_cache");

  available= my_b_bytes_in_cache(from_cache);
  for (;;)
  {
    /* Never write more than what is buffered or what is still wanted */
    chunk= (available < count) ? available : count;
    if (my_b_write(to_cache, from_cache->read_pos, chunk))
      DBUG_RETURN(1);

    from_cache->read_pos+= chunk;
    count-= chunk;

    if (!count)
      break;                                    /* Requested amount copied */
    available= my_b_fill(from_cache);
    if (!available)
      break;                                    /* Nothing more to read */
  }

  DBUG_RETURN(from_cache->error == -1 ? 1 : 0);
}
/**
  Copy an entire IO_CACHE into another IO_CACHE.

  The source is first re-initialised as a READ_CACHE positioned at offset 0;
  the number of bytes to copy is the source's end_of_file as it stands after
  that re-initialisation.

  @return 0 on success, 1 if the re-init or the copy failed
*/
int my_b_copy_all_to_cache(IO_CACHE *from_cache, IO_CACHE *to_cache)
{
  int failed;
  DBUG_ENTER("my_b_copy_all_to_cache");

  /* end_of_file is only read once the rewind has succeeded */
  failed= reinit_io_cache(from_cache, READ_CACHE, 0L, FALSE, FALSE) ||
          my_b_copy_to_cache(from_cache, to_cache,
                             from_cache->end_of_file);

  DBUG_RETURN(failed);
}
/*
Return the current append position of a cache:
info->end_of_file plus the bytes between append_read_pos and write_pos.
The value is computed while holding info->append_buffer_lock.
In debug builds (DBUG_OFF not defined) the function additionally seeks the
underlying file to its end, asserts that the cache's bookkeeping agrees with
the real file length, and then restores the previous file offset.
NOTE(review): my_b_safe_tell() only calls this for SEQ_READ_APPEND caches;
presumably that is the only cache type it is meant for - confirm.
*/
my_off_t my_b_append_tell(IO_CACHE* info)
{
/*
Sometimes we want to make sure that the variable is not put into
a register in debugging mode so we can see its value in the core
*/
#ifndef DBUG_OFF
# define dbug_volatile volatile
#else
# define dbug_volatile
#endif
/*
Prevent optimizer from putting res in a register when debugging
we need this to be able to see the value of res when the assert fails
*/
dbug_volatile my_off_t res;
/*
We need to lock the append buffer mutex to keep flush_io_cache()
from messing with the variables that we need in order to provide the
answer to the question.
*/
mysql_mutex_lock(&info->append_buffer_lock);
#ifndef DBUG_OFF
/*
Make sure EOF is where we think it is. Note that we cannot just use
my_tell() because we have a reader thread that could have left the
file offset in a non-EOF location
*/
{
/* Remember the current file offset so it can be restored after the check */
volatile my_off_t save_pos;
save_pos= mysql_file_tell(info->file, MYF(0));
mysql_file_seek(info->file, 0, MY_SEEK_END, MYF(0));
/*
Save the value of my_tell in res so we can see it when studying coredump.
The assignment to res happens inside the assert expression; res is
overwritten with the real return value further down.
*/
DBUG_ASSERT(info->end_of_file - (info->append_read_pos-info->write_buffer)
== (res= mysql_file_tell(info->file, MYF(0))));
mysql_file_seek(info->file, save_pos, MY_SEEK_SET, MYF(0));
}
#endif
/* Append position = end_of_file + bytes between append_read_pos and write_pos */
res = info->end_of_file + (info->write_pos-info->append_read_pos);
mysql_mutex_unlock(&info->append_buffer_lock);
return res;
}
/*
  Return the current position of a cache, whatever its type.

  SEQ_READ_APPEND caches are routed to my_b_append_tell(); every other
  cache type uses my_b_tell().
*/
my_off_t my_b_safe_tell(IO_CACHE *info)
{
  return unlikely(info->type == SEQ_READ_APPEND) ?
         my_b_append_tell(info) :
         my_b_tell(info);
}
/*
  Reposition an IO_CACHE.

  For READ_CACHE and SEQ_READ_APPEND the next read happens at 'pos';
  for WRITE_CACHE the next write happens at 'pos'.  When 'pos' falls inside
  the data currently held in the buffer only the buffer pointer is moved.
  Otherwise the buffer is invalidated (read) or flushed (write) and
  pos_in_file / seek_not_done are updated so that the real seek is done
  later.  Other cache types only get pos_in_file / seek_not_done updated.
*/
void my_b_seek(IO_CACHE *info,my_off_t pos)
{
  my_off_t delta;
  DBUG_ENTER("my_b_seek");
  DBUG_PRINT("enter",("pos: %lu", (ulong) pos));

  /*
    TODO:
       Verify that it is OK to do seek in the non-append
       area in SEQ_READ_APPEND cache
       a) see if this always works
       b) see if there is a better way to make it work
  */
  if (info->type == SEQ_READ_APPEND)
    (void) flush_io_cache(info);

  /* Distance from the file offset that the buffer start corresponds to */
  delta= pos - info->pos_in_file;

  switch (info->type) {
  case READ_CACHE:
  case SEQ_READ_APPEND:
    /*
      The comparison is done on values cast to ulonglong, so a 'pos' below
      pos_in_file shows up as a huge number and fails the test as well.
    */
    if ((ulonglong) delta < (ulonglong) (info->read_end - info->buffer))
    {
      /* Target is already buffered; just move the read pointer */
      info->read_pos= info->buffer + delta;
      DBUG_VOID_RETURN;
    }
    /* Empty the buffer so the next my_b_read has to fetch from the file */
    info->read_pos= info->read_end= info->buffer;
    break;

  case WRITE_CACHE:
    if ((ulonglong) delta <
        (ulonglong) (info->write_end - info->write_buffer))
    {
      /* Target is inside the write buffer; just move the write pointer */
      info->write_pos= info->write_buffer + delta;
      DBUG_VOID_RETURN;
    }
    (void) flush_io_cache(info);
    /* Correct buffer end so that we write in increments of IO_SIZE */
    info->write_end= (info->write_buffer + info->buffer_length -
                      (pos & (IO_SIZE-1)));
    break;

  default:
    break;
  }

  info->pos_in_file= pos;
  info->seek_not_done= 1;
  DBUG_VOID_RETURN;
}
/*
  Read 'Count' bytes located at file offset 'pos' into 'Buffer'.

  When the cache has MY_ENCRYPT set in myflags, the read goes through the
  cache itself: my_b_seek() followed by my_b_read().  Otherwise the file is
  read directly with mysql_file_pread(); a failure there sets info->error
  to -1.

  Returns 0 on success; -1 when the direct pread fails; for MY_ENCRYPT
  caches, whatever my_b_read() returns.
*/
int my_b_pread(IO_CACHE *info, uchar *Buffer, size_t Count, my_off_t pos)
{
  if (!(info->myflags & MY_ENCRYPT))
  {
    /* backward compatibility behavior. XXX remove it? */
    if (mysql_file_pread(info->file, Buffer, Count, pos,
                         info->myflags | MY_NABP))
    {
      info->error= -1;
      return -1;
    }
    return 0;
  }

  my_b_seek(info, pos);
  return my_b_read(info, Buffer, Count);
}
/*
  Read a string ended by '\n' into a buffer of 'max_length' size.

  Bytes are copied from the cache into 'to' up to and including the first
  '\n', or until max_length-1 bytes have been stored, whichever comes
  first.  The result is always terminated with '\0'; if the buffer fills up
  the terminator ends up in to[max_length-1].

  Returns the number of characters stored (not counting the '\0').
  Returns 0 if nothing could be read from the cache, if the cache runs dry
  before a '\n' is found or the buffer is full, or if max_length is 0
  (in which case 'to' is not touched at all).
*/
size_t my_b_gets(IO_CACHE *info, char *to, size_t max_length)
{
  char *start = to;
  size_t length;

  /*
    A zero-sized buffer cannot even hold the terminator.  Without this
    check the decrement below would wrap around to SIZE_T_MAX and the copy
    loop would write past the end of 'to'.
  */
  if (!max_length)
    return 0;
  max_length--;                                 /* Save place for end \0 */

  /* Calculate number of characters in buffer */
  if (!(length= my_b_bytes_in_cache(info)) &&
      !(length= my_b_fill(info)))
    return 0;

  for (;;)
  {
    uchar *pos, *end;

    /* Never copy more than what still fits in 'to' */
    if (length > max_length)
      length=max_length;
    for (pos=info->read_pos,end=pos+length ; pos < end ;)
    {
      if ((*to++ = *pos++) == '\n')
      {
        /* End of line: consume it from the cache and return */
        info->read_pos=pos;
        *to='\0';
        return (size_t) (to-start);
      }
    }
    if (!(max_length-=length))
    {
      /* Found enough characters; Return found string */
      info->read_pos=pos;
      *to='\0';
      return (size_t) (to-start);
    }
    /* Buffered data used up without finding '\n'; get more from the file */
    if (!(length=my_b_fill(info)))
      return 0;
  }
}
/*
  Return the length of the data behind an IO_CACHE.

  For a WRITE_CACHE this is the current position as given by my_b_tell().
  For all other cache types the underlying file is seeked to its end and
  the resulting offset is returned; seek_not_done is set first because the
  file offset is moved by this call.
*/
my_off_t my_b_filelength(IO_CACHE *info)
{
  if (info->type != WRITE_CACHE)
  {
    info->seek_not_done= 1;
    return mysql_file_seek(info->file, 0, MY_SEEK_END, MYF(0));
  }
  return my_b_tell(info);
}
/*
  Write 'len' bytes of 'str' to the cache as a backtick-quoted identifier:
  a leading and a trailing backtick are added and every backtick inside the
  string is doubled.

  Returns 0 on success, non-zero if any my_b_write() call failed.
*/
my_bool
my_b_write_backtick_quote(IO_CACHE *info, const char *str, size_t len)
{
  const uchar *cur= (const uchar *)str;
  const uchar *const limit= cur + len;

  /* Opening quote */
  if (my_b_write(info, (uchar *)"`", 1))
    return 1;

  while (cur < limit)
  {
    /* Emit the longest run that contains no backtick in one write */
    const uchar *run_start= cur;
    while (cur < limit && *cur != '`')
      cur++;
    if (cur > run_start &&
        my_b_write(info, run_start, (size_t) (cur - run_start)))
      return 1;

    if (cur < limit)
    {
      /* Stopped on a backtick: write it doubled and step over it */
      if (my_b_write(info, (uchar *)"``", 2))
        return 1;
      cur++;
    }
  }

  /* Closing quote */
  return (my_b_write(info, (uchar *)"`", 1));
}
/*
  Simple printf version.  Supports '%s', '%d', '%u', "%ld" and "%lu"
  Used for logging in MariaDB

  Thin variadic front end: the arguments are packed into a va_list and
  formatted by my_b_vprintf().

  @return 0 ok
          1 error (my_b_vprintf() returned (size_t) -1)
*/
my_bool my_b_printf(IO_CACHE *info, const char* fmt, ...)
{
  va_list ap;
  size_t written;

  va_start(ap, fmt);
  written= my_b_vprintf(info, fmt, ap);
  va_end(ap);

  return (written == (size_t) -1) ? 1 : 0;
}
/*
  Write 'count' copies of the byte 'fill' to the cache.

  A small fixed buffer is written repeatedly, so the amount of stack used
  does not depend on the width requested by the format string.

  Returns 0 on success, 1 if a write failed.
*/
static int vprintf_write_padding(IO_CACHE *info, int fill, size_t count)
{
  uchar chunk[64];

  memset(chunk, fill, sizeof(chunk));
  while (count)
  {
    size_t n= (count < sizeof(chunk)) ? count : sizeof(chunk);
    if (my_b_write(info, chunk, n))
      return 1;
    count-= n;
  }
  return 0;
}


/*
  Minimal vprintf that writes its output to an IO_CACHE.

  Supported conversions:
    %s        NUL-terminated string; a NULL pointer is printed as "(null)"
    %`s       string written as a backtick-quoted identifier
    %c        single character
    %b        sized buffer; the number of bytes is taken from the precision
              (e.g. "%.*b")
    %d %u     int / unsigned int; honours a minimum width, the '0' flag
              (pad with zeros on the left) and the '-' flag (pad with spaces
              on the right)
    %ld %lu   long / unsigned long
  The flags '#', ' ' and '+' are accepted but ignored.  Any other directive,
  including "%%", outputs the directive text up to but not including its
  final character.

  @param info  cache to write to
  @param fmt   format string
  @param args  arguments for the conversions in fmt

  @return number of bytes written, or (size_t) -1 if a write failed
*/
size_t my_b_vprintf(IO_CACHE *info, const char* fmt, va_list args)
{
  size_t out_length= 0;
  uint minimum_width;             /* only honoured by %d and %u */
  uint precision;                 /* only honoured by %b */
  my_bool left_justify;
  my_bool is_zero_padded;
  my_bool backtick_quoting;
  /*
    Store the location of the beginning of a format directive, for the
    case where we learn we shouldn't have been parsing a format string
    at all, and we don't want to lose the flag/precision/width/size
    information.
  */
  const char* backtrack;

  for (; *fmt != '\0'; fmt++)
  {
    /* Copy everything until '%' or end of string */
    const char *start=fmt;
    size_t length;

    for (; (*fmt != '\0') && (*fmt != '%'); fmt++) ;

    length= (size_t) (fmt - start);
    out_length+=length;
    if (my_b_write(info, (const uchar*) start, length))
      goto err;

    if (*fmt == '\0')                           /* End of format */
      return out_length;

    /*
      By this point, *fmt must be a percent;  Keep track of this location and
      skip over the percent character.
    */
    DBUG_ASSERT(*fmt == '%');
    backtrack= fmt;
    fmt++;

    left_justify= FALSE;
    is_zero_padded= FALSE;
    backtick_quoting= FALSE;
    minimum_width= 0;
    precision= 0;

    /* Flags, in any order and any number */
    for (;; fmt++)
    {
      if (*fmt == '-')
        left_justify= TRUE;
      else if (*fmt == '0')
        is_zero_padded= TRUE;
      else if (*fmt == '`')
        backtick_quoting= TRUE;
      else if (*fmt == '#' || *fmt == ' ' || *fmt == '+')
        ;                         /** @todo Implement these conversion flags. */
      else
        break;
    }

    /* Minimum field width */
    if (*fmt == '*')
    {
      /*
        NOTE(review): a '*' in the width position has always been stored
        into the precision here, not the width.  Kept as is because callers
        may depend on it - confirm before changing.
      */
      precision= (int) va_arg(args, int);
      fmt++;
    }
    else
    {
      while (my_isdigit(&my_charset_latin1, *fmt))
      {
        minimum_width=(minimum_width * 10) + (*fmt - '0');
        fmt++;
      }
    }

    /* Precision */
    if (*fmt == '.')
    {
      fmt++;
      if (*fmt == '*')
      {
        precision= (int) va_arg(args, int);
        fmt++;
      }
      else
      {
        while (my_isdigit(&my_charset_latin1, *fmt))
        {
          precision=(precision * 10) + (*fmt - '0');
          fmt++;
        }
      }
    }

    if (*fmt == 's')                            /* String parameter */
    {
      const char *par= va_arg(args, char *);
      size_t length2;

      if (!par)
        par= "(null)";                          /* Don't strlen() a NULL */
      length2= strlen(par);
      /* TODO: implement precision */
      if (backtick_quoting)
      {
        size_t i;
        size_t quoted_length= length2 + 2;      /* opening + closing quote */

        /* The helper only reports success/failure, not a byte count */
        if (my_b_write_backtick_quote(info, par, length2))
          goto err;
        /* Every embedded backtick was doubled on output */
        for (i= 0; i < length2; i++)
        {
          if (par[i] == '`')
            quoted_length++;
        }
        out_length+= quoted_length;
      }
      else
      {
        if (my_b_write(info, (const uchar*) par, length2))
          goto err;
        out_length+= length2;
      }
    }
    else if (*fmt == 'c')                       /* char type parameter */
    {
      uchar ch= (uchar) va_arg(args, int);
      if (my_b_write(info, &ch, 1))
        goto err;
      out_length++;
    }
    else if (*fmt == 'b')  /* Sized buffer parameter, only precision makes sense */
    {
      char *par = va_arg(args, char *);
      if (my_b_write(info, (uchar*) par, precision))
        goto err;
      out_length+= precision;
    }
    else if (*fmt == 'd' || *fmt == 'u')        /* Integer parameter */
    {
      int iarg;
      size_t length2, pad;
      char buff[32];

      iarg = va_arg(args, int);
      if (*fmt == 'd')
        length2= (size_t) (int10_to_str((long) iarg,buff, -10) - buff);
      else
        length2= (size_t) (int10_to_str((long) (uint) iarg,buff,10)- buff);

      /* minimum width padding */
      pad= (minimum_width > length2) ? (size_t) minimum_width - length2 : 0;

      if (pad && !left_justify &&
          vprintf_write_padding(info, is_zero_padded ? '0' : ' ', pad))
        goto err;
      if (my_b_write(info, (uchar*) buff, length2))
        goto err;
      /* With '-' the value comes first and is filled up with spaces */
      if (pad && left_justify &&
          vprintf_write_padding(info, ' ', pad))
        goto err;

      out_length+= length2 + pad;
    }
    else if ((*fmt == 'l' && (fmt[1] == 'd' || fmt[1] == 'u')))
      /* long parameter */
    {
      long iarg;
      size_t length2;
      char buff[32];

      iarg = va_arg(args, long);
      if (*++fmt == 'd')
        length2= (size_t) (int10_to_str(iarg,buff, -10) - buff);
      else
        length2= (size_t) (int10_to_str(iarg,buff,10)- buff);
      out_length+= length2;
      if (my_b_write(info, (uchar*) buff, length2))
        goto err;
    }
    else
    {
      /* %% or unknown code */
      if (my_b_write(info, (uchar*) backtrack, (size_t) (fmt-backtrack)))
        goto err;
      out_length+= fmt-backtrack;
      /*
        The format ended in the middle of a directive (e.g. a trailing '%').
        Stop here; the loop increment would otherwise step past the
        terminating NUL and keep reading.
      */
      if (*fmt == '\0')
        break;
    }
  }
  return out_length;

err:
  return (size_t) -1;
}