mirror of
https://github.com/MariaDB/server.git
synced 2025-01-22 06:44:16 +01:00
refs #5671 implement join_timeout with a portable 'crash and dump core' function
git-svn-id: file:///svn/toku/tokudb@50308 c7de825b-a66e-492c-adef-691d508d4ae1
This commit is contained in:
parent
27d87f64c4
commit
15e0df36fb
3 changed files with 193 additions and 62 deletions
112
portability/toku_crash.h
Normal file
112
portability/toku_crash.h
Normal file
|
@ -0,0 +1,112 @@
|
|||
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
||||
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
|
||||
#ident "$Id$"
|
||||
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
|
||||
|
||||
#ifndef PORTABILITY_TOKU_CRASH_H
|
||||
#define PORTABILITY_TOKU_CRASH_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <signal.h>
|
||||
|
||||
//Simulate as hard a crash as possible.
//Choices:
//  raise(SIGABRT)
//  kill -SIGKILL $pid
//  divide by 0
//  null dereference
//  abort()
//  assert(false) (from <assert.h>)
//  assert(false) (from <toku_assert.h>)
//
//Linux:
//  abort() and both assert(false) cause FILE buffers to be flushed and written to disk: Unacceptable
//Windows:
//  None of them cause file buffers to be flushed/written to disk, however
//  abort(), assert(false) <assert.h>, null dereference, and divide by 0 cause popups requiring user intervention during tests: Unacceptable
//
//raise(SIGABRT) has the downside that perhaps it could be caught?
//
//Decision: TerminateProcess() on Windows and raise(SIGKILL) everywhere else (no buffer flush, cannot be caught),
//followed by divide by 0, then null dereference, then abort(), just in case an earlier step did not end the process.
static void __attribute__((unused, noreturn))
toku_hard_crash_on_purpose(void) {
#if TOKU_WINDOWS
    // NOTE(review): exit code 137 is presumably 128 + 9, mirroring a SIGKILL exit status -- confirm.
    TerminateProcess(GetCurrentProcess(), 137);
#else
    raise(SIGKILL); //Does not flush buffers on linux; cannot be caught.
#endif
    // Everything below is a fallback that only runs if the process is still alive.
    {
        // volatile forces the division to be performed at run time; with plain
        // ints the compiler may fold or delete it because it is undefined behaviour.
        volatile int zero = 0;
        volatile int infinity = 1/zero;
        fprintf(stderr, "Force use of %d\n", infinity);
        fflush(stderr); //Make certain the string is calculated.
    }
    {
        // volatile forces the load and the store through the null pointer to be emitted.
        volatile int *intothevoid = NULL;
        (*intothevoid)++;
        fprintf(stderr, "Force use of *(%p) = %d\n", (void *)intothevoid, *intothevoid);
        fflush(stderr);
    }
    // Last resort. abort() does not return, so nothing may follow it.
    abort();
}
|
||||
|
||||
// Similar to toku_hard_crash_on_purpose, but the goal isn't to crash hard, the primary goal is to get a corefile, the secondary goal is to terminate in any way possible.
// We don't really care if buffers get flushed etc, in fact they may as well flush since there may be useful output in stdout or stderr.
//
// By default, the following signals generate cores:
//  Linux, from signal(7):
//     SIGQUIT       3       Core
//     SIGILL        4       Core
//     SIGABRT       6       Core
//     SIGFPE        8       Core
//     SIGSEGV      11       Core
//
//  Darwin and FreeBSD, from signal(3):
//     3     SIGQUIT      create core image
//     4     SIGILL       create core image
//     5     SIGTRAP      create core image
//     6     SIGABRT      create core image
//     7     SIGEMT       create core image
//     8     SIGFPE       create core image
//     10    SIGBUS       create core image
//     11    SIGSEGV      create core image
//     12    SIGSYS       create core image
//
// We raise these in sequence (common ones first) and then call abort().
// abort() does not return, so the steps after it (emulating the faults that would generate these signals,
// then SIGKILL in a loop) are purely defensive and only matter if abort() were somehow to come back.
static void __attribute__((unused, noreturn))
toku_crash_and_dump_core_on_purpose(void) {
    // Each raise() only returns if the signal was caught, ignored or blocked,
    // in which case we fall through to the next one.
    raise(SIGQUIT);
    raise(SIGILL);
    raise(SIGABRT);
    raise(SIGFPE);
    raise(SIGSEGV);
#if defined(__FreeBSD__) || defined(__APPLE__)
    raise(SIGTRAP);
    raise(SIGEMT);
    raise(SIGBUS);
    raise(SIGSYS);
#endif
    abort();
    {
        // volatile forces the division to be performed at run time; with plain
        // ints the compiler may fold or delete it because it is undefined behaviour.
        volatile int zero = 0;
        volatile int infinity = 1/zero;
        fprintf(stderr, "Force use of %d\n", infinity);
        fflush(stderr); //Make certain the string is calculated.
    }
    {
        // volatile forces the load and the store through the null pointer to be emitted.
        volatile int *intothevoid = NULL;
        (*intothevoid)++;
        fprintf(stderr, "Force use of *(%p) = %d\n", (void *)intothevoid, *intothevoid);
        fflush(stderr);
    }
    // Keep trying to die. The call inside the loop gives it a side effect, so it
    // cannot be optimized away the way an empty infinite loop can be.
    for (;;) {
        raise(SIGKILL);
    }
}
|
||||
|
||||
#endif // PORTABILITY_TOKU_CRASH_H
|
|
@ -17,6 +17,7 @@
|
|||
#include <limits.h>
|
||||
#include <errno.h>
|
||||
#include <toku_htonl.h>
|
||||
#include <portability/toku_crash.h>
|
||||
#include "toku_assert.h"
|
||||
#include <signal.h>
|
||||
#include <time.h>
|
||||
|
@ -272,50 +273,6 @@ void print_time_now(void) {
|
|||
printf("%s", timestr);
|
||||
}
|
||||
|
||||
//Simulate as hard a crash as possible.
|
||||
//Choices:
|
||||
// raise(SIGABRT)
|
||||
// kill -SIGKILL $pid
|
||||
// divide by 0
|
||||
// null dereference
|
||||
// abort()
|
||||
// assert(false) (from <assert.h>)
|
||||
// assert(false) (from <toku_assert.h>)
|
||||
//
|
||||
//Linux:
|
||||
// abort() and both assert(false) cause FILE buffers to be flushed and written to disk: Unacceptable
|
||||
//Windows:
|
||||
// None of them cause file buffers to be flushed/written to disk, however
|
||||
// abort(), assert(false) <assert.h>, null dereference, and divide by 0 cause popups requiring user intervention during tests: Unacceptable
|
||||
//
|
||||
//kill -SIGKILL $pid is annoying (and so far untested)
|
||||
//
|
||||
//raise(SIGABRT) has the downside that perhaps it could be caught?
|
||||
//I'm choosing raise(SIGABRT), followed by divide by 0, followed by null dereference, followed by all the others just in case one gets caught.
|
||||
static void UU()
|
||||
toku_hard_crash_on_purpose(void) {
|
||||
#if TOKU_WINDOWS
|
||||
TerminateProcess(GetCurrentProcess(), 137);
|
||||
#else
|
||||
raise(SIGKILL); //Does not flush buffers on linux; cannot be caught.
|
||||
#endif
|
||||
{
|
||||
int zero = 0;
|
||||
int infinity = 1/zero;
|
||||
fprintf(stderr, "Force use of %d\n", infinity);
|
||||
fflush(stderr); //Make certain the string is calculated.
|
||||
}
|
||||
{
|
||||
void * intothevoid = NULL;
|
||||
(*(int*)intothevoid)++;
|
||||
fprintf(stderr, "Force use of *(%p) = %d\n", intothevoid, *(int*)intothevoid);
|
||||
fflush(stderr);
|
||||
}
|
||||
abort();
|
||||
fprintf(stderr, "This line should never be printed\n");
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
static void UU()
|
||||
multiply_locks_for_n_dbs(DB_ENV *env, int num_dbs) {
|
||||
#ifdef USE_TDB
|
||||
|
|
|
@ -13,7 +13,6 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <signal.h>
|
||||
#include <locale.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
|
@ -1432,9 +1431,44 @@ static void *test_time(void *arg) {
|
|||
return arg;
|
||||
}
|
||||
|
||||
static void crashing_alarm_handler(int sig) {
|
||||
assert(sig == SIGALRM);
|
||||
toku_hard_crash_on_purpose();
|
||||
struct sleep_and_crash_extra {
|
||||
toku_mutex_t mutex;
|
||||
toku_cond_t cond;
|
||||
int seconds;
|
||||
bool is_setup;
|
||||
bool threads_have_joined;
|
||||
};
|
||||
static void *sleep_and_crash(void *extra) {
|
||||
sleep_and_crash_extra *e = static_cast<sleep_and_crash_extra *>(extra);
|
||||
toku_mutex_lock(&e->mutex);
|
||||
struct timeval tv;
|
||||
toku_timespec_t ts;
|
||||
gettimeofday(&tv, nullptr);
|
||||
ts.tv_sec = tv.tv_sec + e->seconds;
|
||||
ts.tv_nsec = 0;
|
||||
e->is_setup = true;
|
||||
if (verbose) {
|
||||
printf("Waiting %d seconds for other threads to join.\n", e->seconds);
|
||||
fflush(stdout);
|
||||
}
|
||||
int r = toku_cond_timedwait(&e->cond, &e->mutex, &ts);
|
||||
toku_mutex_assert_locked(&e->mutex);
|
||||
if (r == ETIMEDOUT) {
|
||||
invariant(!e->threads_have_joined);
|
||||
if (verbose) {
|
||||
printf("Some thread didn't join on time, crashing.\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
toku_crash_and_dump_core_on_purpose();
|
||||
} else {
|
||||
assert(r == 0);
|
||||
assert(e->threads_have_joined);
|
||||
if (verbose) {
|
||||
printf("Other threads joined on time, exiting cleanly.\n");
|
||||
}
|
||||
}
|
||||
toku_mutex_unlock(&e->mutex);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static int run_workers(
|
||||
|
@ -1484,20 +1518,48 @@ static int run_workers(
|
|||
void *ret;
|
||||
r = toku_pthread_join(time_tid, &ret); assert_zero(r);
|
||||
if (verbose) printf("%lu joined\n", (unsigned long) time_tid);
|
||||
sighandler_t old_alarm = signal(SIGALRM, crashing_alarm_handler);
|
||||
assert(old_alarm != SIG_ERR);
|
||||
// Set an alarm that will kill us if it takes too long to join all the
|
||||
// threads (i.e. there is some runaway thread).
|
||||
unsigned int remaining = alarm(cli_args->join_timeout);
|
||||
assert_zero(remaining);
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
r = toku_pthread_join(tids[i], &ret); assert_zero(r);
|
||||
if (verbose)
|
||||
printf("%lu joined\n", (unsigned long) tids[i]);
|
||||
|
||||
{
|
||||
// Set an alarm that will kill us if it takes too long to join all the
|
||||
// threads (i.e. there is some runaway thread).
|
||||
struct sleep_and_crash_extra sac_extra;
|
||||
ZERO_STRUCT(sac_extra);
|
||||
toku_mutex_init(&sac_extra.mutex, nullptr);
|
||||
toku_cond_init(&sac_extra.cond, nullptr);
|
||||
sac_extra.seconds = cli_args->join_timeout;
|
||||
sac_extra.is_setup = false;
|
||||
sac_extra.threads_have_joined = false;
|
||||
|
||||
toku_mutex_lock(&sac_extra.mutex);
|
||||
toku_pthread_t sac_thread;
|
||||
r = toku_pthread_create(&sac_thread, nullptr, sleep_and_crash, &sac_extra);
|
||||
assert_zero(r);
|
||||
// Wait for sleep_and_crash thread to get set up, spinning is ok, this should be quick.
|
||||
while (!sac_extra.is_setup) {
|
||||
toku_mutex_unlock(&sac_extra.mutex);
|
||||
r = toku_pthread_yield();
|
||||
assert_zero(r);
|
||||
toku_mutex_lock(&sac_extra.mutex);
|
||||
}
|
||||
toku_mutex_unlock(&sac_extra.mutex);
|
||||
|
||||
// Timeout thread has started, join everyone
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
r = toku_pthread_join(tids[i], &ret); assert_zero(r);
|
||||
if (verbose)
|
||||
printf("%lu joined\n", (unsigned long) tids[i]);
|
||||
}
|
||||
|
||||
// Signal timeout thread not to crash.
|
||||
toku_mutex_lock(&sac_extra.mutex);
|
||||
sac_extra.threads_have_joined = true;
|
||||
toku_cond_signal(&sac_extra.cond);
|
||||
toku_mutex_unlock(&sac_extra.mutex);
|
||||
r = toku_pthread_join(sac_thread, nullptr);
|
||||
assert_zero(r);
|
||||
toku_cond_destroy(&sac_extra.cond);
|
||||
toku_mutex_destroy(&sac_extra.mutex);
|
||||
}
|
||||
// All threads joined, deschedule the alarm.
|
||||
remaining = alarm(0);
|
||||
assert(remaining > 0);
|
||||
|
||||
if (cli_args->print_performance) {
|
||||
uint64_t *counters[num_threads];
|
||||
|
@ -2244,7 +2306,6 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct
|
|||
INT32_ARG_NONNEG("--num_elements", num_elements, ""),
|
||||
INT32_ARG_NONNEG("--num_DBs", num_DBs, ""),
|
||||
INT32_ARG_NONNEG("--num_seconds", num_seconds, "s"),
|
||||
INT32_ARG_NONNEG("--join_timeout", join_timeout, "s"),
|
||||
INT32_ARG_NONNEG("--node_size", env_args.node_size, " bytes"),
|
||||
INT32_ARG_NONNEG("--basement_node_size", env_args.basement_node_size, " bytes"),
|
||||
INT32_ARG_NONNEG("--rollback_node_size", env_args.rollback_node_size, " bytes"),
|
||||
|
@ -2259,6 +2320,7 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct
|
|||
UINT32_ARG("--txn_size", txn_size, " rows"),
|
||||
UINT32_ARG("--num_bucket_mutexes", env_args.num_bucket_mutexes, " mutexes"),
|
||||
|
||||
INT32_ARG_R("--join_timeout", join_timeout, "s", 1, INT32_MAX),
|
||||
INT32_ARG_R("--performance_period", performance_period, "s", 1, INT32_MAX),
|
||||
|
||||
// TODO: John thinks the cachetable size should be in megabytes
|
||||
|
|
Loading…
Add table
Reference in a new issue