Mirror of https://github.com/MariaDB/server.git (synced 2025-01-29 02:05:57 +01:00)
Maria: first version of checkpoint (WL#3071), least-recently-dirtied page flushing (WL#3261), recovery (WL#3072), control file (WL#3234), to serve as a detailed LLD.
It looks like C code, but does not compile (no point in making it compile, as other modules on which I depend are not yet fully specified or written); some pieces are not coded and just marked in comments.
Files' organization (names, directories of C files) does not matter at this point.
I don't think I had to commit so early, but it feels good to publish something, gives me the impression of moving forward :)

storage/maria/checkpoint.c: WL#3071 Maria checkpoint, implementation
storage/maria/checkpoint.h: WL#3071 Maria checkpoint, interface
storage/maria/control_file.c: WL#3234 Maria control file, implementation
storage/maria/control_file.h: WL#3234 Maria control file, interface
storage/maria/least_recently_dirtied.c: WL#3261 Maria background flushing of least-recently-dirtied pages, implementation
storage/maria/least_recently_dirtied.h: WL#3261 Maria background flushing of least-recently-dirtied pages, interface
storage/maria/recovery.c: WL#3072 Maria recovery, implementation
storage/maria/recovery.h: WL#3072 Maria recovery, interface
Parent:  99a86a34bb
Commit:  06f7675b95
8 changed files with 937 additions and 0 deletions
storage/maria/checkpoint.c (new file, 394 lines)
@@ -0,0 +1,394 @@
/*
  WL#3071 Maria checkpoint
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* Here is the implementation of this module */

#include "page_cache.h"
#include "least_recently_dirtied.h"
#include "transaction.h"
#include "share.h"
#include "log.h"

/*
  This transaction is used for any system work (purge, checkpoint writing
  etc), that is, background threads. It will not be declared/initialized here
  in the final version.
*/
st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */, 0, ...};

/*
  The maximum rec_lsn in the LRD when last checkpoint was run, serves for the
  MEDIUM checkpoint.
*/
LSN max_rec_lsn_at_last_checkpoint= 0;
/* Picks a checkpoint request and executes it */
my_bool checkpoint()
{
  CHECKPOINT_LEVEL level;
  int error;
  DBUG_ENTER("checkpoint");

  level= checkpoint_running= checkpoint_request;
  unlock(log_mutex);

  DBUG_ASSERT(level != NONE);

  switch (level)
  {
  case FULL:
    /* flush all pages up to the current end of the LRD */
    flush_all_LRD_to_lsn(MAX_LSN); /* MAX_LSN==ULONGLONG_MAX */
    /* this will go full speed (normal scheduling, no sleep) */
    break;
  case MEDIUM:
    /*
      flush all pages which were already dirty at last checkpoint:
      ensures that recovery will never start from before the next-to-last
      checkpoint (two-checkpoint rule).
      It is max, not min as the WL says (TODO update WL).
    */
    flush_all_LRD_to_lsn(max_rec_lsn_at_last_checkpoint);
    /* this will go full speed (normal scheduling, no sleep) */
    break;
  }

  error= checkpoint_indirect();

  lock(log_mutex);
  /*
    this portion cannot be done as a hook in write_log_record() for the
    LOGREC_CHECKPOINT type because:
    - at that moment we still have not written to the control file so cannot
      mark the request as done; this could be solved by writing to the control
      file in the hook but that would be an I/O under the log's mutex, bad.
    - it would not be nice organisation of code (I tried it :).
  */
  mark_checkpoint_done(error);
  unlock(log_mutex);
  DBUG_RETURN(error);
}
my_bool checkpoint_indirect()
{
  DBUG_ENTER("checkpoint_indirect");

  int error= 0;
  /* checkpoint record data: */
  LSN checkpoint_start_lsn;
  LEX_STRING string1={0,0}, string2={0,0}, string3={0,0};
  LEX_STRING *string_array[4];
  char *ptr;
  LSN checkpoint_lsn;
  LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
  list_element *el; /* to scan lists */

  DBUG_ASSERT(sizeof(byte *) <= 8);
  DBUG_ASSERT(sizeof(LSN) <= 8);

  lock(log_mutex); /* will probably be in log_read_end_lsn() already */
  checkpoint_start_lsn= log_read_end_lsn();
  unlock(log_mutex);

  DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));

  lock(global_LRD_mutex);
  string1.length= 8+8+(8+8)*LRD->count;
  if (NULL == (string1.str= my_malloc(string1.length)))
    goto err;
  ptr= string1.str;
  int8store(ptr, checkpoint_start_lsn);
  ptr+= 8;
  int8store(ptr, LRD->count);
  ptr+= 8;
  if (LRD->count)
  {
    candidate_max_rec_lsn_at_last_checkpoint= LRD->last->rec_lsn;
    for (el= LRD->first; el; el= el->next)
    {
      int8store(ptr, el->page_id);
      ptr+= 8;
      int8store(ptr, el->rec_lsn);
      ptr+= 8;
    }
  }
  unlock(global_LRD_mutex);

  /*
    If trx are in more than one list (e.g. three:
    running transactions, committed transactions, purge queue), we can either
    take mutexes of all three together or do crabbing.
    But if an element can move from list 1 to list 3 without passing through
    list 2, crabbing is dangerous.
    Hopefully it's ok to take 3 mutexes together...
    Otherwise I'll have to make sure I miss no important trx and I handle dups.
  */
  lock(global_transactions_list_mutex); /* or 3 mutexes if there are 3 */
  string2.length= 8+(1+7+2+8+8+8)*trx_list->count;
  if (NULL == (string2.str= my_malloc(string2.length)))
    goto err;
  ptr= string2.str;
  int8store(ptr, trx_list->count);
  ptr+= 8;
  for (el= trx_list->first; el; el= el->next)
  {
    /* possibly latch el.rwlock */
    *ptr= el->state;
    ptr++;
    int7store(ptr, el->long_trans_id);
    ptr+= 7;
    int2store(ptr, el->short_trans_id);
    ptr+= 2;
    int8store(ptr, el->undo_lsn);
    ptr+= 8;
    int8store(ptr, el->undo_purge_lsn);
    ptr+= 8;
    /*
      if no latch, use double variable of type ULONGLONG_CONSISTENT in
      st_transaction, or even no need if Intel >=486
    */
    int8store(ptr, el->first_purge_lsn);
    ptr+= 8;
    /* possibly unlatch el.rwlock */
  }
  unlock(global_transactions_list_mutex);

  lock(global_share_list_mutex);
  string3.length= 8+(8+8)*share_list->count;
  if (NULL == (string3.str= my_malloc(string3.length)))
    goto err;
  ptr= string3.str;
  /* possibly latch each MARIA_SHARE */
  make_copy_of_global_share_list_to_array;
  unlock(global_share_list_mutex);

  /* work on copy */
  int8store(ptr, elements_in_array);
  ptr+= 8;
  for (scan_array)
  {
    int8store(ptr, array[...].file_id);
    ptr+= 8;
    memcpy(ptr, array[...].file_name, ...);
    ptr+= ...;
    /*
      these two are long ops (involving disk I/O) and that's why we copied the
      list:
    */
    flush_bitmap_pages(el);
    /*
      fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
      second, so if you have touched 1000 files it's 7 seconds).
    */
    force_file(el);
  }

  /* now write the record */
  string_array[0]= &string1;
  string_array[1]= &string2;
  string_array[2]= &string3;
  string_array[3]= NULL;

  checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT,
                                   &system_trans, string_array);

  if (0 == checkpoint_lsn) /* maybe 0 is impossible LSN to indicate error ? */
    goto err;

  if (0 != control_file_write_and_force(checkpoint_lsn, NULL))
    goto err;

  max_rec_lsn_at_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;

  DBUG_RETURN(0);

err:
  print_error_to_error_log(the_error_message);
  my_free(string1.str, MYF(MY_ALLOW_ZERO_PTR));
  my_free(string2.str, MYF(MY_ALLOW_ZERO_PTR));
  my_free(string3.str, MYF(MY_ALLOW_ZERO_PTR));

  DBUG_RETURN(1);
}
/*
  Here's what should be put in log_write_record() in the log handler:
*/
log_write_record(...)
{
  ...;
  lock(log_mutex);
  ...;
  write_to_log(length);
  written_since_last_checkpoint+= length;
  if (written_since_last_checkpoint >
      MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS)
  {
    /*
      ask one system thread (the "LRD background flusher and checkpointer
      thread" WL#3261) to do a checkpoint
    */
    request_checkpoint(INDIRECT, 0 /*wait_for_completion*/);
  }
  ...;
  unlock(log_mutex);
  ...;
}
/*
  Call this when you want to request a checkpoint.
  In real life it will be called by log_write_record() and by a client thread
  which explicitly wants to do a checkpoint (ALTER ENGINE CHECKPOINT
  checkpoint_level).
*/
int request_checkpoint(CHECKPOINT_LEVEL level, my_bool wait_for_completion)
{
  int error= 0;
  /*
    If the caller wants to wait for completion we'll have to release the log
    mutex to wait on a condition; if the caller had the log mutex he may not be
    happy that we release it, so we check that the caller didn't have it.
  */
  if (wait_for_completion)
  {
    lock(log_mutex);
  }
  else
    safemutex_assert_owner(log_mutex);

  DBUG_ASSERT(checkpoint_request >= checkpoint_running);
  DBUG_ASSERT(level > NONE);
  if (checkpoint_request < level)
  {
    /* no equal or stronger checkpoint running or to run, we post our request */
    /*
      note that thousands of requests for checkpoints are going to come all
      at the same time (when the log bound is passed), so it may not be a good
      idea for each of them to broadcast a cond. We just don't broadcast a
      cond, the checkpoint thread will wake up in max one second.
    */
    checkpoint_request= level; /* post request */
  }

  if (wait_for_completion)
  {
    uint checkpoints_done_copy= checkpoints_done;
    uint checkpoint_errors_copy= checkpoint_errors;
    /*
      note that the "==done" works when the uint counter wraps too, so the
      counter can even be smaller than uint if we wanted (however it should be
      big enough so that max_the_int_type checkpoints cannot happen between two
      wakeups of our thread below). uint sounds fine.
      Wait for our checkpoint to be done:
    */

    if (checkpoint_running != NONE) /* not ours, let it pass */
    {
      while (1)
      {
        if (checkpoints_done != checkpoints_done_copy)
        {
          if (checkpoints_done == (checkpoints_done_copy+1))
          {
            /* not our checkpoint, forget about it */
            checkpoints_done_copy= checkpoints_done;
          }
          break; /* maybe even ours has been done at this stage! */
        }
        cond_wait(checkpoint_done_cond, log_mutex);
      }
    }

    /* now we come to waiting for our checkpoint */
    while (1)
    {
      if (checkpoints_done != checkpoints_done_copy)
      {
        /* our checkpoint has been done */
        break;
      }
      if (checkpoint_errors != checkpoint_errors_copy)
      {
        /*
          the one which was running a few milliseconds ago (if there was one),
          and/or ours, had an error, just assume it was ours. So there
          is a possibility that we return an error though we succeeded, in
          which case the user will have to retry; but two simultaneous
          checkpoints have high chances to fail together (as the error probably
          comes from a malloc or disk write problem), so the chance of a false
          alarm is low.
          Reporting the error only to the one which caused the error would
          require having a (not fixed size) list of all requests, not worth it.
        */
        error= 1;
        break;
      }
      cond_wait(checkpoint_done_cond, log_mutex);
    }
    unlock(log_mutex);
  } /* ... if (wait_for_completion) */

  /*
    If wait_for_completion was false and there was an error, only an error
    message to the error log will say it; this is normal: for a checkpoint
    triggered by a log write, we probably don't want the client's log write to
    throw an error, as the log write succeeded and a checkpoint failure is not
    critical: the failure in this case is more for the DBA to know than for
    the end user.
  */
  return error;
}
void mark_checkpoint_done(int error)
{
  safemutex_assert_owner(log_mutex);
  if (error)
    checkpoint_errors++;
  /* a checkpoint is said done even if it had an error */
  checkpoints_done++;
  if (checkpoint_request == checkpoint_running)
  {
    /*
      No new request has been posted, so we satisfied all requests, forget
      about them.
    */
    checkpoint_request= NONE;
  }
  checkpoint_running= NONE;
  written_since_last_checkpoint= 0;
  broadcast(checkpoint_done_cond);
}
/*
  Alternative (not to be done, too disturbing):
  do the autocheckpoint in the thread which passed the bound first (and do the
  checkpoint in the client thread which requested it).
  It will give a delay to that client thread which passed the bound (time to
  fsync() for example 1000 files is 16 s on my laptop). Here is code for
  explicit and implicit checkpoints, where the client thread does the job:
*/
#if 0
{
  lock(log_mutex); /* explicit takes it here, implicit already has it */
  while (checkpoint_running != NONE)
  {
    if (checkpoint_running >= my_level) /* always true for auto checkpoints */
      goto end; /* we skip checkpoint */
    /* a less strong one is running, I'll go next */
    wait_on_checkpoint_done_cond();
  }
  checkpoint_running= my_level;
  checkpoint(my_level); /* can gather checkpoint_start_lsn before unlock */
  lock(log_mutex);
  checkpoint_running= NONE;
  written_since_last_checkpoint= 0;
end:
  unlock(log_mutex);
}
#endif
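For reference, here is a minimal, self-contained sketch of the byte layout that checkpoint_indirect() above builds for the LOGREC_CHECKPOINT record. It only illustrates the serialization pattern: store8() and the dirty_page struct are stand-ins for int8store() and the LRD elements, and build_dirty_pages_section() is a hypothetical helper, not code from this commit.

#include <stdint.h>
#include <stdlib.h>

/*
  The checkpoint record is written as three strings:
    string1 (dirty pages):   checkpoint_start_lsn (8), count (8),
                             then per page: page_id (8), rec_lsn (8)
    string2 (transactions):  count (8), then per transaction: state (1),
                             long_trans_id (7), short_trans_id (2),
                             undo_lsn (8), undo_purge_lsn (8), first_purge_lsn (8)
    string3 (open files):    count (8), then per file: file_id (8), file_name
*/

/* stand-in for int8store(): store a 64-bit value in little-endian order */
static void store8(unsigned char *dst, uint64_t v)
{
  int i;
  for (i= 0; i < 8; i++)
    dst[i]= (unsigned char) (v >> (8 * i));
}

struct dirty_page { uint64_t page_id, rec_lsn; }; /* stand-in for an LRD element */

static unsigned char *build_dirty_pages_section(uint64_t checkpoint_start_lsn,
                                                const struct dirty_page *pages,
                                                uint64_t count, size_t *length)
{
  unsigned char *buf, *ptr;
  uint64_t i;
  *length= 8 + 8 + (8 + 8) * (size_t) count;
  if (!(buf= ptr= malloc(*length)))
    return NULL;
  store8(ptr, checkpoint_start_lsn); ptr+= 8;
  store8(ptr, count);                ptr+= 8;
  for (i= 0; i < count; i++)
  {
    store8(ptr, pages[i].page_id);   ptr+= 8;
    store8(ptr, pages[i].rec_lsn);   ptr+= 8;
  }
  return buf;
}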
storage/maria/checkpoint.h (new file, 23 lines)
@@ -0,0 +1,23 @@
/*
  WL#3071 Maria checkpoint
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* This is the interface of this module. */

typedef enum enum_checkpoint_level {
  NONE=-1,
  INDIRECT, /* just write dirty_pages, transactions table and sync files */
  MEDIUM,   /* also flush all dirty pages which were already dirty at prev checkpoint */
  FULL      /* also flush all dirty pages */
} CHECKPOINT_LEVEL;

/*
  Call this when you want to request a checkpoint.
  In real life it will be called by log_write_record() and by a client thread
  which explicitly wants to do a checkpoint (ALTER ENGINE CHECKPOINT
  checkpoint_level).
*/
int request_checkpoint(CHECKPOINT_LEVEL level, my_bool wait_for_completion);
/* that's all that's needed in the interface */
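As a small illustration of the request-posting rule in request_checkpoint() (a request only overwrites a weaker pending one, and the strongest pending level is what the checkpoint thread will execute), here is a self-contained toy model; it reuses the level names of the enum above but is a sketch, not part of the commit.

#include <assert.h>

typedef enum { NONE= -1, INDIRECT, MEDIUM, FULL } CHECKPOINT_LEVEL;

static CHECKPOINT_LEVEL checkpoint_request= NONE;

/* post a request only if it is stronger than what is already pending */
static void post_request(CHECKPOINT_LEVEL level)
{
  if (checkpoint_request < level)
    checkpoint_request= level;
}

int main(void)
{
  post_request(INDIRECT);   /* log handler passes the log-size bound */
  post_request(MEDIUM);     /* a client asks for a stronger checkpoint */
  post_request(INDIRECT);   /* a weaker request does not downgrade it */
  assert(checkpoint_request == MEDIUM);
  return 0;
}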
storage/maria/control_file.c (new file, 77 lines)
@@ -0,0 +1,77 @@
/*
  WL#3234 Maria control file
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* Here is the implementation of this module */

/* Control file is 512 bytes (a disk sector), to be as atomic as possible */

int control_file_fd;

/*
  Looks for the control file. If absent, it's a fresh start, create file.
  If present, read it to find out last checkpoint's LSN and last log.
  Called at engine's start.
*/
int control_file_create_or_open()
{
  char buffer[8];
  /* name is concatenation of Maria's home dir and "control" */
  if ((control_file_fd= my_open(name, O_RDWR)) < 0)
  {
    /* failure, try to create it */
    if ((control_file_fd= my_create(name, O_RDWR)) < 0)
      return 1;
    /*
      So this is a start from scratch; to be safer we should make sure that
      there are no logs or data/index files around (indeed it could be that
      the control file alone was deleted or not restored, and we should not
      go on with life at this point).

      For now we trust (this is the alpha version), but for beta it would be
      great to verify.

      We could have a tool which can rebuild the control file, by reading the
      directory of logs, finding the newest log, reading it to find the last
      checkpoint... Slow but can save your db.
    */
    last_checkpoint_lsn_at_startup= 0;
    last_log_name_at_startup= NULL;
    return 0;
  }
  /* Already existing file, read it */
  if (my_read(control_file_fd, buffer, 8, MYF(MY_FNABP)))
    return 1;
  last_checkpoint_lsn_at_startup= uint8korr(buffer);
  if (!(last_log_name_at_startup= my_malloc(512-8+1)))
    return 1;
  if (my_read(control_file_fd, last_log_name_at_startup, 512-8, MYF(MY_FNABP)))
    return 1;
  last_log_name_at_startup[512-8]= 0; /* end zero to be nice */
  return 0;
}

/*
  Write information durably to the control file.
  Called when we have created a new log (after syncing this log's creation)
  and when we have written a checkpoint (after syncing this log record).
*/
int control_file_write_and_force(LSN lsn, char *log_name)
{
  char buffer[512];
  uint start=8, end=8;
  if (lsn != 0) /* LSN was specified */
  {
    start= 0;
    int8store(buffer, lsn);
  }
  if (log_name != NULL) /* log name was specified */
  {
    end= 512;
    memcpy(buffer+8, log_name, 512-8);
  }
  DBUG_ASSERT(start != end);
  return (my_pwrite(control_file_fd, buffer+start, end-start, start,
                    MYF(MY_FNABP)) ||
          my_sync(control_file_fd));
}
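For illustration, a self-contained sketch of the 512-byte control file image that control_file_write_and_force() maintains: bytes 0-7 hold the last checkpoint LSN (stored little-endian as int8store() does), bytes 8-511 hold the name of the last log, zero-padded. build_control_file_image() and its byte loop are hypothetical stand-ins, not code from this commit.

#include <stdint.h>
#include <string.h>

#define CONTROL_FILE_SIZE 512

static void build_control_file_image(unsigned char buf[CONTROL_FILE_SIZE],
                                     uint64_t last_checkpoint_lsn,
                                     const char *last_log_name)
{
  int i;
  memset(buf, 0, CONTROL_FILE_SIZE);
  for (i= 0; i < 8; i++)            /* little-endian, like int8store() */
    buf[i]= (unsigned char) (last_checkpoint_lsn >> (8 * i));
  /* remaining 504 bytes: log name, zero-padded (kept zero-terminated) */
  strncpy((char *) buf + 8, last_log_name, CONTROL_FILE_SIZE - 8 - 1);
}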
storage/maria/control_file.h (new file, 24 lines)
@@ -0,0 +1,24 @@
/*
  WL#3234 Maria control file
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* Here is the interface of this module */

LSN last_checkpoint_lsn_at_startup;
char *last_log_name_at_startup;

/*
  Looks for the control file. If absent, it's a fresh start, create file.
  If present, read it to find out last checkpoint's LSN and last log.
  Called at engine's start.
*/
int control_file_create_or_open();

/*
  Write information durably to the control file.
  Called when we have created a new log (after syncing this log's creation)
  and when we have written a checkpoint (after syncing this log record).
*/
int control_file_write_and_force(LSN lsn, char *log_name);
storage/maria/least_recently_dirtied.c (new file, 175 lines)
@@ -0,0 +1,175 @@
/*
  WL#3261 Maria - background flushing of the least-recently-dirtied pages
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/*
  To be part of the page cache.
  The pseudocode below is dependent on the page cache
  which is being designed in WL#3134. It is not clear if I need to do page
  copies, as the page cache already keeps page copies.
  So, this code will move to the page cache and take inspiration from its
  methods. Below is just to give the idea of what could be done.
  And I should compare my imaginations to WL#3134.
*/

/* Here is the implementation of this module */

#include "page_cache.h"
#include "least_recently_dirtied.h"

/*
  When we flush a page, we should pin the page.
  This "pin" is to protect against the following scenario:
  I make a copy,
  you modify in memory and flush to disk and remove from LRD and from cache,
  I write my copy to disk,
  checkpoint happens.
  Result: the old page is on disk, the page is absent from the LRD, your REDO
  will be wrongly ignored.

  Pin: there can be multiple pins, flushing imposes that there are zero pins.
  For example, the pin could be a uint counter protected by the page's latch.

  Maybe it's ok if, when there is a page replacement, the replacer does not
  remove the page from the LRD (it would save the global mutex); for that, the
  background flusher should be prepared to see pages in the LRD which are not
  in the page cache (then just ignore them). However the checkpoint will then
  contain superfluous entries and so do more work.
*/

#define PAGE_SIZE (16*1024) /* just as an example */
/*
  Optimization:
  The LRD flusher should not flush pages one by one: to be fast, it flushes a
  group of pages in sequential disk order if possible; a group of pages is
  just FLUSH_GROUP_SIZE pages.
  The key cache has grouping already somehow, Monty said (investigate that).
*/
#define FLUSH_GROUP_SIZE 512 /* 8 MB */
/*
  This thread does background flush of pieces of the LRD, and all checkpoints.
  Just launch it when the engine starts.
*/
pthread_handler_decl background_flush_and_checkpoint_thread()
{
  char *flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE);
  while (this_thread_not_killed)
  {
    lock(log_mutex);
    if (checkpoint_request)
      checkpoint(); /* will unlock mutex */
    else
    {
      unlock(log_mutex);
      lock(global_LRD_mutex);
      flush_one_group_from_LRD();
      safemutex_assert_not_owner(global_LRD_mutex);
    }
    my_sleep(1000000); /* one second ? */
  }
  my_free(flush_group_buffer);
}
/*
  Flushes only the first FLUSH_GROUP_SIZE pages of the LRD.
*/
flush_one_group_from_LRD()
{
  char *ptr;
  safe_mutex_assert_owner(global_LRD_mutex);

  for (page= 0; page<FLUSH_GROUP_SIZE; page++)
  {
    copy_element_to_array;
  }
  /*
    One rule to better observe is "a page must be flushed to disk before it is
    removed from the LRD" (otherwise the checkpoint is incomplete info,
    corruption).
  */
  unlock(global_LRD_mutex);
  /* page id is concatenation of "file id" and "number of page in file" */
  qsort(array, FLUSH_GROUP_SIZE, sizeof(*element), by_page_id);
  for (scan_array)
  {
    if (page_cache_latch(page_id, READ) == PAGE_ABSENT)
    {
      /*
        page disappeared since we made the copy (it was flushed to be
        replaced), remove from array (memcpy tail of array over it)...
      */
      continue;
    }
    memcpy(flush_group_buffer+..., page->data, PAGE_SIZE);
    pin_page;
    page_cache_unlatch(page_id, KEEP_PINNED); /* but keep pinned */
  }
  for (scan_the_array)
  {
    /*
      As an optimization, we try to identify contiguous-in-the-file segments
      (to issue one big write()).
      In the non-optimized version, a contiguous segment is always only one
      page.
    */
    if ((next_page.page_id - this_page.page_id) == 1)
    {
      /*
        this page and the next page are in the same file and are contiguous in
        the file: add the page to the contiguous segment...
      */
      continue; /* defer write() to next pages */
    }
    /* contiguous segment ends */
    my_pwrite(file, contiguous_segment_start_offset, contiguous_segment_size);

    /*
      note that if we had doublewrite, the doublewrite buffer might prevent us
      from doing this write() grouping (if the doublewrite space is shorter).
    */
  }
  /*
    Now remove pages from the LRD. As we have pinned them, all pages that we
    managed to pin are still in the LRD, in the same order; we can just cut
    the LRD at the last element of "array". This is more efficient than
    removing element by element (which would take the LRD mutex many times)
    in the loop above.
  */
  lock(global_LRD_mutex);
  /* cut LRD by bending LRD->first, free cut portion... */
  unlock(global_LRD_mutex);
  for (scan_array)
  {
    /*
      if the page has a property "modified since last flush" (which is
      redundant with the presence of the page in the LRD, so this property can
      just be a pointer to the LRD element) we should reset it
      (note that then the property would live slightly longer than
      the presence in the LRD).
    */
    page_cache_unpin(page_id);
    /*
      order between unpin and removal from LRD is not clear, depends on what
      pin actually is.
    */
  }
  free(array);
}
/* flushes all pages from the LRD up to approximately rec_lsn >= max_lsn */
int flush_all_LRD_to_lsn(LSN max_lsn)
{
  lock(global_LRD_mutex);
  if (max_lsn == MAX_LSN) /* don't want to flush forever, so make it fixed: */
    max_lsn= LRD->first->prev->rec_lsn;
  while (LRD->first->rec_lsn < max_lsn)
  {
    if (flush_one_group_from_LRD()) /* will unlock mutex */
      return 1;
    /* scheduler may preempt us here so that we don't take full CPU */
    lock(global_LRD_mutex);
  }
  unlock(global_LRD_mutex);
  return 0;
}
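The contiguous-segment optimisation in flush_one_group_from_LRD() above can be illustrated with a small self-contained sketch: after sorting by page id, runs of consecutive ids are merged so that one write() can cover each run. The flattened ids and the by_page_id() comparator below are hypothetical, not code from this commit.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* hypothetical flattened page id: high bits = file id, low bits = page number */
static int by_page_id(const void *a, const void *b)
{
  uint64_t x= *(const uint64_t *) a, y= *(const uint64_t *) b;
  return (x > y) - (x < y);
}

int main(void)
{
  uint64_t ids[]= { 7, 3, 4, 5, 10, 11, 20 };
  size_t n= sizeof(ids) / sizeof(ids[0]);
  size_t i, start= 0;

  qsort(ids, n, sizeof(ids[0]), by_page_id);

  /* emit one "write" per run of consecutive page ids */
  for (i= 0; i < n; i++)
  {
    if (i + 1 < n && ids[i + 1] - ids[i] == 1)
      continue;                      /* still inside a contiguous segment */
    printf("one write() for pages %llu..%llu\n",
           (unsigned long long) ids[start], (unsigned long long) ids[i]);
    start= i + 1;
  }
  return 0;
}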
storage/maria/least_recently_dirtied.h (new file, 10 lines)
@@ -0,0 +1,10 @@
/*
  WL#3261 Maria - background flushing of the least-recently-dirtied pages
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* This is the interface of this module. */

/* flushes all pages from the LRD up to approximately rec_lsn >= max_lsn */
int flush_all_LRD_to_lsn(LSN max_lsn);
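A tiny model of the pin counter sketched in least_recently_dirtied.c above: a page can carry several pins and must not be flushed or replaced while any pin is held. The struct and helpers are hypothetical; only the rule comes from the design notes.

#include <assert.h>

/* hypothetical page descriptor; in real code the counter would be protected
   by the page's latch */
struct page { unsigned int pins; };

static void pin_page(struct page *p)   { p->pins++; }
static void unpin_page(struct page *p) { assert(p->pins > 0); p->pins--; }
static int  page_flushable(const struct page *p) { return p->pins == 0; }

int main(void)
{
  struct page p= { 0 };
  pin_page(&p);                 /* someone copied the page and keeps it pinned */
  assert(!page_flushable(&p));  /* flusher/replacer must wait */
  unpin_page(&p);
  assert(page_flushable(&p));
  return 0;
}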
storage/maria/recovery.c (new file, 224 lines)
@@ -0,0 +1,224 @@
/*
  WL#3072 Maria recovery
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* Here is the implementation of this module */

#include "page_cache.h"
#include "least_recently_dirtied.h"
#include "transaction.h"
#include "share.h"
#include "log.h"

typedef struct st_record_type_properties {
  /* used for debug error messages or "maria_read_log" command-line tool: */
  char *name;
  my_bool record_ends_group;
  int (*record_execute)(RECORD *); /* param will be record header instead later */
} RECORD_TYPE_PROPERTIES;

RECORD_TYPE_PROPERTIES all_record_type_properties[]=
{
  /* listed here in the order of the "log records type" enumeration */
  {"REDO_INSERT_HEAD", 0, redo_insert_head_execute},
  ...,
  {"UNDO_INSERT",      1, undo_insert_execute},
  {"COMMIT",           1, commit_execute},
  ...
};

int redo_insert_head_execute(RECORD *record)
{
  /* write the data to the proper page */
}

int undo_insert_execute(RECORD *record)
{
  trans_table[short_trans_id].undo_lsn= record.lsn;
  /* restore the old version of the row */
}

int commit_execute(RECORD *record)
{
  trans_table[short_trans_id].state= COMMITTED;
  /*
    and that's all: the delete/update handler should not be woken up, as there
    may be REDO for purge further in the log.
  */
}

#define record_ends_group(R)                                          \
  all_record_type_properties[(R)->type].record_ends_group

#define execute_log_record(R)                                         \
  all_record_type_properties[(R).type].record_execute(R)
int recovery()
{
  control_file_create_or_open();
  /*
    init log handler: tell it that we are going to do large reads of the
    log, sequential and backward. The log handler could decide to alloc a big
    read-only IO_CACHE for this, or use its usual page cache.
  */

  /* read checkpoint log record from log handler */
  RECORD *checkpoint_record= log_read_record(last_checkpoint_lsn_at_startup);

  /* parse this record, build structs (dirty_pages, transactions table, file_map) */
  /*
    read log records (note: sometimes only the header is needed, for example
    during the REDO phase only the header of an UNDO is needed, not the 4G blob
    in the variable-length part, so I could use that; however for PREPARE
    (which is a variable-length record) I'll need to read the full record in
    the REDO phase):
  */

  record= log_read_record(min(rec_lsn, ...));
  /*
    if the log handler knows the end LSN of the log, we could print here how
    many MB of log we have to read (to give an idea of the time), and print
    progress notes.
  */

  while (record != NULL)
  {
    /*
      A complete group is a set of log records with an "end mark" record
      (e.g. a set of REDOs for an operation, terminated by an UNDO for this
      operation); if there is no "end mark" record the group is incomplete
      and won't be executed.
    */
    if (record_ends_group(record))
    {
      /*
        such end events can always be executed immediately (they don't touch
        the disk).
      */
      execute_log_record(record);
      if (trans_table[record.short_trans_id].group_start_lsn != 0)
      {
        /*
          There is a complete group for this transaction.
          We're going to re-read recently-read log records:
          for this log_read_record() to be efficient (not touch the disk),
          the log handler could cache recently read pages
          (can just use an IO_CACHE of 10 MB to read the log, or the normal
          log handler page cache).
          Without it only the OS file cache will help.
        */
        record2= log_read_record(trans_table[record.short_trans_id].group_start_lsn);
        while (record2.lsn < record.lsn)
        {
          if (record2.short_trans_id == record.short_trans_id)
            execute_log_record(record2); /* it's in our group */
          record2= log_read_next_record();
        }
        trans_table[record.short_trans_id].group_start_lsn= 0; /* group finished */
        /* we're now at the UNDO, re-read it to advance the log pointer */
        record2= log_read_next_record(); /* and throw it away */
      }
    }
    else /* record does not end group */
    {
      /* just record the fact, can't know if we can execute yet */
      if (trans_table[short_trans_id].group_start_lsn == 0) /* group not yet started */
        trans_table[short_trans_id].group_start_lsn= record.lsn;
    }

    /*
      Later we can optimize: instead of "execute_log_record(record2)", do
      copy_record_into_exec_buffer(record2):
      this will just copy the record into a multi-record (10 MB?) memory
      buffer, and when the buffer is full, will sort the REDOs per
      page id and execute them.
      This sorting will enable us to do more sequential reads of the
      data/index pages.
      Note that updating bitmap pages (when we have executed a REDO for a page
      we update its bitmap page) may break the sequential read of pages,
      so maybe we should read and cache bitmap pages in the beginning.
      Or ok, the sequence will be broken, but quickly all bitmap pages will be
      in memory and so the sequence will not be broken anymore.
      Sorting could even determine, based on the physical device of files
      ("st_dev" in stat()), that some files should be taken by
      different threads, if we want to do parallelism.
    */
    /*
      Here's how to read a complete variable-length record if needed:
      <sanja> read the header, allocate a buffer of the record length, read the
      whole record.
    */
    record= log_read_next_record();
  }

  /*
    Earlier or here, create true transactions in TM.
    If done earlier, note that TM should not wake up the delete/update handler
    when it receives a commit info, as existing REDO for purge may exist in
    the log, and so the delete/update handler may do changes which conflict
    with these REDOs.
    Even if done here, better not to wake it up now as we're going to free the
    page cache:
  */

  /*
    We want to have two steps:
    engine->recover_with_max_memory();
    next_engine->recover_with_max_memory();
    engine->init_with_normal_memory();
    next_engine->init_with_normal_memory();
    So: in recover_with_max_memory() allocate a giant page cache, do the REDO
    phase, then the whole page cache is flushed, emptied and freed (only
    retain small structures like TM): take a full checkpoint, which is useful
    if the next engine crashes in its recovery the next second.
    Destroy all shares (maria_close()), then at init_with_normal_memory() we
    do this:
  */

  print_information_to_error_log(nb of trans to roll back, nb of prepared trans);

  /*
    Launch one or more threads to do the background rollback. Don't wait for
    them to complete their rollback (background rollback; for debugging, we
    can have an option which waits).

    Note that InnoDB's rollback-in-background works as long as InnoDB is the
    last engine to recover, otherwise MySQL will refuse new connections until
    the last engine has recovered, so it's not "background" from the user's
    point of view. InnoDB is near the top of sys_table_types so all others
    (e.g. BDB) recover after it... So it's really "online rollback" only if
    InnoDB is the only engine.
  */

  /* wake up delete/update handler */
  /* tell the TM that it can now accept new transactions */

  /*
    mark that checkpoint requests are now allowed.
  */
  /*
    when all rollback threads have terminated, somebody should print "rollback
    finished" to the error log.
  */
}
pthread_handler_decl rollback_background_thread()
{
  /*
    execute the normal runtime-rollback code for a bunch of transactions.
  */
  while (trans in list_of_trans_to_rollback_by_this_thread)
  {
    while (trans->undo_lsn != 0)
    {
      /* this is the normal runtime-rollback code: */
      record= log_read_record(trans->undo_lsn);
      execute_log_record(record);
      trans->undo_lsn= record.prev_undo_lsn;
    }
    /* remove trans from list */
  }
}
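To make the grouping rule of recovery() above concrete, here is a self-contained toy model: records of a transaction are remembered (via the group's start LSN) and only replayed once an end-of-group record for that transaction is seen, while incomplete groups are ignored. The toy log, its types and the LSNs (array indexes) are hypothetical, not code from this commit.

#include <stdio.h>

#define MAX_TRANS 4

/* toy log record; its LSN is simply its index in the toy_log array */
struct rec { int short_trans_id; int ends_group; const char *what; };

static struct rec toy_log[]= {
  { 0, 0, "REDO t0 page 1" },
  { 1, 0, "REDO t1 page 7" },
  { 0, 0, "REDO t0 page 2" },
  { 0, 1, "UNDO t0 (ends group)" },  /* complete group for t0: replay it */
  { 1, 0, "REDO t1 page 8" }         /* t1 never ends its group: ignored */
};

int main(void)
{
  /* group_start_lsn[t] is 1 + LSN of the group's first record, 0 = no group */
  int group_start_lsn[MAX_TRANS]= { 0 };
  int n= (int) (sizeof(toy_log) / sizeof(toy_log[0]));
  int lsn, lsn2;

  for (lsn= 0; lsn < n; lsn++)
  {
    struct rec *r= &toy_log[lsn];
    if (!r->ends_group)
    {
      if (group_start_lsn[r->short_trans_id] == 0)
        group_start_lsn[r->short_trans_id]= lsn + 1;
      continue;
    }
    /* end-of-group record: execute it, then replay the group it closes */
    printf("execute: %s\n", r->what);
    if (group_start_lsn[r->short_trans_id] != 0)
    {
      for (lsn2= group_start_lsn[r->short_trans_id] - 1; lsn2 < lsn; lsn2++)
        if (toy_log[lsn2].short_trans_id == r->short_trans_id)
          printf("execute: %s\n", toy_log[lsn2].what);
      group_start_lsn[r->short_trans_id]= 0;
    }
  }
  return 0;
}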
storage/maria/recovery.h (new file, 10 lines)
@@ -0,0 +1,10 @@
/*
  WL#3072 Maria recovery
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* This is the interface of this module. */

/* Performs recovery of the engine at start */
int recovery();