mariadb/storage/maria/ma_recovery.c
unknown cdf831cf94 WL#3071 Maria checkpoint:
changing pseudocode to use the structures of the Maria pagecache
    ("pagecache->changed_blocks" etc) and other Maria structures
    inherited from MyISAM (THR_LOCK_maria etc).


mysys/mf_pagecache.c:
  comment
storage/maria/ma_checkpoint.c:
  changing pseudocode to use the structures of the Maria pagecache
  ("pagecache->changed_blocks" etc) and other Maria structures
  inherited from MyISAM (THR_LOCK_maria etc).
storage/maria/ma_checkpoint.h:
  copyright
storage/maria/ma_control_file.c:
  copyright
storage/maria/ma_control_file.h:
  copyright
storage/maria/ma_least_recently_dirtied.c:
  copyright
storage/maria/ma_least_recently_dirtied.h:
  copyright
storage/maria/ma_recovery.c:
  copyright
storage/maria/ma_recovery.h:
  copyright
storage/maria/unittest/Makefile.am:
  copyright
2006-09-14 19:06:51 +02:00

268 lines
9.9 KiB
C

/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
/*
WL#3072 Maria recovery
First version written by Guilhem Bichot on 2006-04-27.
Does not compile yet.
*/
/* Here is the implementation of this module */
#include "page_cache.h"
#include "least_recently_dirtied.h"
#include "transaction.h"
#include "share.h"
#include "log.h"
typedef struct st_record_type_properties {
/* used for debug error messages or "maria_read_log" command-line tool: */
char *name,
my_bool record_ends_group;
/* a function to execute when we see the record during the REDO phase */
int (*record_execute_in_redo_phase)(RECORD *); /* param will be record header instead later */
/* a function to execute when we see the record during the UNDO phase */
int (*record_execute_in_undo_phase)(RECORD *); /* param will be record header instead later */
} RECORD_TYPE_PROPERTIES;
/*
  Handler used in all_record_type_properties[] for record types which need
  no action in a given phase. Ignores its argument and always returns 0.
*/
int no_op(RECORD *record)
{
  (void) record; /* intentionally unused */
  return 0;
}
/*
  Properties of every log record type; the table is indexed by the record's
  type (see record_ends_group() and execute_log_record_in_redo_phase()).

  The "..." lines are pseudocode placeholders for record types which are not
  listed yet; they have to be replaced by real entries before this compiles.

  NOTE(review): the handlers named here are defined further down in this
  file, so their prototypes must be visible before this table.
*/
RECORD_TYPE_PROPERTIES all_record_type_properties[]=
{
  /* listed here in the order of the "log records type" enumeration */
  {"REDO_INSERT_HEAD", FALSE, redo_insert_head_execute_in_redo_phase, no_op},
  ...,
  {"UNDO_INSERT", TRUE, undo_insert_execute_in_redo_phase,
   undo_insert_execute_in_undo_phase},
  {"COMMIT", TRUE, commit_execute_in_redo_phase, no_op},
  ...
};
/*
  REDO-phase handler for REDO_INSERT_HEAD records.

  Not implemented yet: it is meant to write the data to the proper page.

  Returns 0. NOTE(review): 0 is assumed to mean success, like no_op() does;
  confirm once the error convention of the handlers is decided.
*/
int redo_insert_head_execute_in_redo_phase(RECORD *record)
{
  /* TODO: write the data to the proper page */
  (void) record; /* unused until the TODO above is implemented */
  return 0;
}
/*
  REDO-phase handler for UNDO_INSERT records.

  Only remembers this record's LSN as the undo_lsn of the record's
  transaction in trans_table; the row itself is not touched in this phase.

  Returns 0. NOTE(review): 0 is assumed to mean success, like no_op() does.
*/
int undo_insert_execute_in_redo_phase(RECORD *record)
{
  /* the transaction is identified by the short id stored in the record */
  trans_table[record->short_trans_id].undo_lsn= record->lsn;
  /* don't restore the old version of the row */
  return 0;
}
/*
  UNDO-phase handler for UNDO_INSERT records.

  Is meant to restore the old version of the row (not implemented yet), then
  moves the undo_lsn of the record's transaction back to the previous UNDO
  record of this transaction (record->prev_undo_lsn).

  Returns 0. NOTE(review): 0 is assumed to mean success, like no_op() does.
*/
int undo_insert_execute_in_undo_phase(RECORD *record)
{
  /* TODO: restore the old version of the row */
  trans_table[record->short_trans_id].undo_lsn= record->prev_undo_lsn;
  return 0;
}
/*
  REDO-phase handler for COMMIT records: marks the record's transaction as
  COMMITTED in trans_table.

  Returns 0. NOTE(review): 0 is assumed to mean success, like no_op() does.
*/
int commit_execute_in_redo_phase(RECORD *record)
{
  trans_table[record->short_trans_id].state= COMMITTED;
  /*
    and that's all: the delete/update handler should not be woken up! as there
    may be REDO for purge further in the log.
  */
  return 0;
}
#define record_ends_group(R) \
all_record_type_properties[(R)->type].record_ends_group)
#define execute_log_record_in_redo_phase(R) \
all_record_type_properties[(R).type].record_execute_in_redo_phase(R)
int recovery()
{
control_file_create_or_open();
/*
init log handler: tell it that we are going to do large reads of the
log, sequential and backward. Log handler could decide to alloc a big
read-only IO_CACHE for this, or use its usual page cache.
*/
/* read checkpoint log record from log handler */
RECORD *checkpoint_record= log_read_record(last_checkpoint_lsn_at_start);
/* parse this record, build structs (dirty_pages, transactions table, file_map) */
/*
read log records (note: sometimes only the header is needed, for ex during
REDO phase only the header of UNDO is needed, not the 4G blob in the
variable-length part, so I could use that; however for PREPARE (which is a
variable-length record) I'll need to read the full record in the REDO
phase):
*/
/**** REDO PHASE *****/
record= log_read_record(min(rec_lsn, ...)); /* later, read only header */
/*
if log handler knows the end LSN of the log, we could print here how many
MB of log we have to read (to give an idea of the time), and print
progress notes.
*/
while (record != NULL)
{
/*
A complete group is a set of log records with an "end mark" record
(e.g. a set of REDOs for an operation, terminated by an UNDO for this
operation); if there is no "end mark" record the group is incomplete
and won't be executed.
*/
if (record_ends_group(record)
{
if (trans_table[record.short_trans_id].group_start_lsn != 0)
{
/*
There is a complete group for this transaction, containing more than
this event.
We're going to read recently read log records:
for this log_read_record() to be efficient (not touch the disk),
log handler could cache recently read pages
(can just use an IO_CACHE of 10 MB to read the log, or the normal
log handler page cache).
Without it only OS file cache will help.
*/
record2=
log_read_record(trans_table[record.short_trans_id].group_start_lsn);
do
{
if (record2.short_trans_id == record.short_trans_id)
execute_log_record_in_redo_phase(record2); /* it's in our group */
record2= log_read_next_record();
}
while (record2.lsn < record.lsn);
trans_table[record.short_trans_id].group_start_lsn= 0; /* group finished */
}
execute_log_record_in_redo_phase(record);
}
else /* record does not end group */
{
/* just record the fact, can't know if can execute yet */
if (trans_table[short_trans_id].group_start_lsn == 0) /* group not yet started */
trans_table[short_trans_id].group_start_lsn= record.lsn;
}
/*
Later we can optimize: instead of "execute_log_record(record2)", do
copy_record_into_exec_buffer(record2):
this will just copy record into a multi-record (10 MB?) memory buffer,
and when buffer is full, will do sorting of REDOs per
page id and execute them.
This sorting will enable us to do more sequential reads of the
data/index pages.
Note that updating bitmap pages (when we have executed a REDO for a page
we update its bitmap page) may break the sequential read of pages,
so maybe we should read and cache bitmap pages in the beginning.
Or ok the sequence will be broken, but quickly all bitmap pages will be
in memory and so the sequence will not be broken anymore.
Sorting could even determine, based on physical device of files
("st_dev" in stat()), that some files should be should be taken by
different threads, if we want to do parallism.
*/
/*
Here's how to read a complete variable-length record if needed:
<sanja> read the header, allocate buffer of record length, read whole
record.
*/
record= log_read_next_record();
}
/*
Earlier or here, create true transactions in TM.
If done earlier, note that TM should not wake up the delete/update handler
when it receives a commit info, as existing REDO for purge may exist in
the log, and so the delete/update handler may do changes which conflict
with these REDOs.
Even if done here, better to not wake it up now as we're going to free the
page cache.
MikaelR suggests: support checkpoints during REDO phase too: do checkpoint
after a certain amount of log records have been executed. This helps
against repeated crashes. Those checkpoints could not be user-requested
(as engine is not communicating during the REDO phase), so they would be
automatic: this changes the original assumption that we don't write to the
log while in the REDO phase, but why not. How often should we checkpoint?
*/
/*
We want to have two steps:
engine->recover_with_max_memory();
next_engine->recover_with_max_memory();
engine->init_with_normal_memory();
next_engine->init_with_normal_memory();
So: in recover_with_max_memory() allocate a giant page cache, do REDO
phase, then all page cache is flushed and emptied and freed (only retain
small structures like TM): take full checkpoint, which is useful if
next engine crashes in its recovery the next second.
Destroy all shares (maria_close()), then at init_with_normal_memory() we
do this:
*/
/**** UNDO PHASE *****/
print_information_to_error_log(nb of trans to roll back, nb of prepared trans);
/*
Launch one or more threads to do the background rollback. Don't wait for
them to complete their rollback (background rollback; for debugging, we
can have an option which waits). Set a counter (total_of_rollback_threads)
to the number of threads to lauch.
Note that InnoDB's rollback-in-background works as long as InnoDB is the
last engine to recover, otherwise MySQL will refuse new connections until
the last engine has recovered so it's not "background" from the user's
point of view. InnoDB is near top of sys_table_types so all others
(e.g. BDB) recover after it... So it's really "online rollback" only if
InnoDB is the only engine.
*/
/* wake up delete/update handler */
/* tell the TM that it can now accept new transactions */
/*
mark that checkpoint requests are now allowed.
*/
}
/*
  Body of a background rollback thread (still partly pseudocode).

  Rolls back every transaction assigned to this thread by following the
  transaction's chain of UNDO records, starting at trans->undo_lsn and going
  back through prev_undo_lsn until it reaches 0. Then decrements
  total_of_rollback_threads under the rollback_threads mutex; the thread
  which brings the counter to 0 is the last one to finish.
*/
pthread_handler_decl rollback_background_thread()
{
  /*
    execute the normal runtime-rollback code for a bunch of transactions.
  */
  while (trans in list_of_trans_to_rollback_by_this_thread)
  {
    while (trans->undo_lsn != 0)
    {
      /* this is the normal runtime-rollback code: */
      record= log_read_record(trans->undo_lsn);
      execute_log_record_in_undo_phase(record);
      /* step to the previous UNDO record of this transaction */
      trans->undo_lsn= record->prev_undo_lsn;
    }
    /* remove trans from list */
  }
  lock_mutex(rollback_threads); /* or atomic counter */
  if (--total_of_rollback_threads == 0)
  {
    /*
      All rollback threads are done. Print "rollback finished" to the error
      log and take a full checkpoint.
    */
  }
  unlock_mutex(rollback_threads);
  /* pthread_exit() requires the thread's exit value; nothing to report */
  pthread_exit(NULL);
}