From 15e0df36fb9e5fbda12923c225d20148ae59d884 Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Wed, 17 Apr 2013 00:01:18 -0400 Subject: [PATCH] refs #5671 implement join_timeout with a portable 'crash and dump core' function git-svn-id: file:///svn/toku/tokudb@50308 c7de825b-a66e-492c-adef-691d508d4ae1 --- portability/toku_crash.h | 112 +++++++++++++++++++++++ src/tests/test.h | 45 +-------- src/tests/threaded_stress_test_helpers.h | 98 ++++++++++++++++---- 3 files changed, 193 insertions(+), 62 deletions(-) create mode 100644 portability/toku_crash.h diff --git a/portability/toku_crash.h b/portability/toku_crash.h new file mode 100644 index 00000000000..962e87c1a61 --- /dev/null +++ b/portability/toku_crash.h @@ -0,0 +1,112 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
+
+#ifndef PORTABILITY_TOKU_CRASH_H
+#define PORTABILITY_TOKU_CRASH_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+
+//Simulate as hard a crash as possible.
+//Choices:
+//  raise(SIGABRT)
+//  kill -SIGKILL $pid
+//  divide by 0
+//  null dereference
+//  abort()
+//  assert(false) (from <assert.h>)
+//  assert(false) (from <toku_assert.h>)
+//
+//Linux:
+//  abort() and both assert(false) cause FILE buffers to be flushed and written to disk: Unacceptable
+//Windows:
+//  None of them cause file buffers to be flushed/written to disk, however
+//  abort(), assert(false), null dereference, and divide by 0 cause popups requiring user intervention during tests: Unacceptable
+//
+//kill -SIGKILL $pid is annoying (and so far untested)
+//
+//raise(SIGABRT) has the downside that perhaps it could be caught?
+//The function below uses raise(SIGKILL) (TerminateProcess on Windows), followed by divide by 0, followed by null dereference, followed by abort(), just in case an earlier attempt fails to end the process.
+static void __attribute__((unused, noreturn))
+toku_hard_crash_on_purpose(void) {
+#if TOKU_WINDOWS
+    TerminateProcess(GetCurrentProcess(), 137);
+#else
+    raise(SIGKILL); //Does not flush buffers on linux; cannot be caught.
+#endif
+    { // Fallback if execution somehow continues past the kill above: integer divide by zero.
+        int zero = 0;
+        int infinity = 1/zero;
+        fprintf(stderr, "Force use of %d\n", infinity);
+        fflush(stderr); //Make certain the string is calculated.
+    }
+    { // Next fallback: read-modify-write through a null pointer.
+        void * intothevoid = NULL;
+        (*(int*)intothevoid)++;
+        fprintf(stderr, "Force use of *(%p) = %d\n", intothevoid, *(int*)intothevoid);
+        fflush(stderr);
+    }
+    abort(); // Last fallback; the function is declared noreturn, so nothing below is expected to run.
+    fprintf(stderr, "This line should never be printed\n");
+    fflush(stderr);
+}
+
+// Similar to toku_hard_crash_on_purpose, but the goal isn't to crash hard, the primary goal is to get a corefile, the secondary goal is to terminate in any way possible.
+// We don't really care if buffers get flushed etc, in fact they may as well flush since there may be useful output in stdout or stderr.
+//
+// By default, the following signals generate cores:
+//  Linux, from signal(7):
+//     SIGQUIT       3       Core
+//     SIGILL        4       Core
+//     SIGABRT       6       Core
+//     SIGFPE        8       Core
+//     SIGSEGV      11       Core
+//
+//  Darwin and FreeBSD, from signal(3):
+//     3     SIGQUIT      create core image
+//     4     SIGILL       create core image
+//     5     SIGTRAP      create core image
+//     6     SIGABRT      create core image
+//     7     SIGEMT       create core image
+//     8     SIGFPE       create core image
+//     10    SIGBUS       create core image
+//     11    SIGSEGV      create core image
+//     12    SIGSYS       create core image
+//
+// We'll raise these in some sequence (common ones first), then try emulating the things that would cause these signals to be raised, then eventually just try to die normally and then loop like abort does.
+static void __attribute__((unused, noreturn))
+toku_crash_and_dump_core_on_purpose(void) {
+    raise(SIGQUIT); // First the signals listed above as dumping core by default on Linux, Darwin and FreeBSD.
+    raise(SIGILL);
+    raise(SIGABRT);
+    raise(SIGFPE);
+    raise(SIGSEGV);
+#if defined(__FreeBSD__) || defined(__APPLE__)
+    raise(SIGTRAP); // Then the extra core-dumping signals listed for Darwin/FreeBSD only.
+    raise(SIGEMT);
+    raise(SIGBUS);
+    raise(SIGSYS);
+#endif
+    abort();
+    { // Only reached if nothing above ended the process: integer divide by zero.
+        int zero = 0;
+        int infinity = 1/zero;
+        fprintf(stderr, "Force use of %d\n", infinity);
+        fflush(stderr); //Make certain the string is calculated.
+    }
+    { // Then a read-modify-write through a null pointer.
+        void * intothevoid = NULL;
+        (*(int*)intothevoid)++;
+        fprintf(stderr, "Force use of *(%p) = %d\n", intothevoid, *(int*)intothevoid);
+        fflush(stderr);
+    }
+    while (true) {
+        // Don't return. The raise lives inside the loop so the loop has a side effect: an empty infinite loop is undefined behavior in C++ and could be optimized away.
+        raise(SIGKILL);
+    }
+}
+
+#endif // PORTABILITY_TOKU_CRASH_H
diff --git a/src/tests/test.h b/src/tests/test.h index f9ca39ab5d6..109b9392f92 100644 --- a/src/tests/test.h +++ b/src/tests/test.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "toku_assert.h" #include #include @@ -272,50 +273,6 @@ void print_time_now(void) { printf("%s", timestr); } -//Simulate as hard a crash as possible. -//Choices: -// raise(SIGABRT) -// kill -SIGKILL $pid -// divide by 0 -// null dereference -// abort() -// assert(false) (from ) -// assert(false) (from ) -// -//Linux: -// abort() and both assert(false) cause FILE buffers to be flushed and written to disk: Unacceptable -//Windows: -// None of them cause file buffers to be flushed/written to disk, however -// abort(), assert(false) , null dereference, and divide by 0 cause popups requiring user intervention during tests: Unacceptable -// -//kill -SIGKILL $pid is annoying (and so far untested) -// -//raise(SIGABRT) has the downside that perhaps it could be caught? -//I'm choosing raise(SIGABRT), followed by divide by 0, followed by null dereference, followed by all the others just in case one gets caught. 
-static void UU() -toku_hard_crash_on_purpose(void) { -#if TOKU_WINDOWS - TerminateProcess(GetCurrentProcess(), 137); -#else - raise(SIGKILL); //Does not flush buffers on linux; cannot be caught. -#endif - { - int zero = 0; - int infinity = 1/zero; - fprintf(stderr, "Force use of %d\n", infinity); - fflush(stderr); //Make certain the string is calculated. - } - { - void * intothevoid = NULL; - (*(int*)intothevoid)++; - fprintf(stderr, "Force use of *(%p) = %d\n", intothevoid, *(int*)intothevoid); - fflush(stderr); - } - abort(); - fprintf(stderr, "This line should never be printed\n"); - fflush(stderr); -} - static void UU() multiply_locks_for_n_dbs(DB_ENV *env, int num_dbs) { #ifdef USE_TDB diff --git a/src/tests/threaded_stress_test_helpers.h b/src/tests/threaded_stress_test_helpers.h index 2d8585297dc..20207823321 100644 --- a/src/tests/threaded_stress_test_helpers.h +++ b/src/tests/threaded_stress_test_helpers.h @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -1432,9 +1431,44 @@ static void *test_time(void *arg) { return arg; } -static void crashing_alarm_handler(int sig) { - assert(sig == SIGALRM); - toku_hard_crash_on_purpose();
+struct sleep_and_crash_extra {
+    toku_mutex_t mutex;  // held while is_setup / threads_have_joined are read or written
+    toku_cond_t cond;  // signaled by run_workers after it has joined every worker
+    int seconds;  // join timeout; filled in by run_workers before the thread starts
+    bool is_setup;  // set by sleep_and_crash once its deadline is computed
+    bool threads_have_joined;  // set by run_workers just before it signals cond
+};
+static void *sleep_and_crash(void *extra) {  // watchdog thread body: wait up to e->seconds for the join signal, else dump core
+    sleep_and_crash_extra *e = static_cast<sleep_and_crash_extra *>(extra);
+    toku_mutex_lock(&e->mutex);
+    struct timeval tv;
+    toku_timespec_t ts;
+    gettimeofday(&tv, nullptr);
+    ts.tv_sec = tv.tv_sec + e->seconds;
+    ts.tv_nsec = tv.tv_usec * 1000;  // keep the sub-second part; zeroing it cut the wait short by up to one second
+    e->is_setup = true;
+    if (verbose) {
+        printf("Waiting %d seconds for other threads to join.\n", e->seconds);
+        fflush(stdout);
+    }
+    int r = 0;
+    while (r == 0 && !e->threads_have_joined) {  // re-check the predicate: a condition wait may wake up spuriously
+        r = toku_cond_timedwait(&e->cond, &e->mutex, &ts);
+    }
+    toku_mutex_assert_locked(&e->mutex);
+    if (!e->threads_have_joined) {  // decide on the predicate, not on r: a join that lands exactly at the deadline still counts
+        if (verbose) {
+            printf("Some thread didn't join on time, crashing.\n");
+            fflush(stdout);
+        }
+        toku_crash_and_dump_core_on_purpose();
+    } else {
+        if (verbose) {
+            printf("Other threads joined on time, exiting cleanly.\n");
+        }
+    }
+    toku_mutex_unlock(&e->mutex);
+    return nullptr;
 } static int run_workers( @@ -1484,20 +1518,48 @@ static int run_workers( void *ret; r = toku_pthread_join(time_tid, &ret); assert_zero(r); if (verbose) printf("%lu joined\n", (unsigned long) time_tid); - sighandler_t old_alarm = signal(SIGALRM, crashing_alarm_handler); - assert(old_alarm != SIG_ERR); - // Set an alarm that will kill us if it takes too long to join all the - // threads (i.e. there is some runaway thread). - unsigned int remaining = alarm(cli_args->join_timeout); - assert_zero(remaining); - for (int i = 0; i < num_threads; ++i) { - r = toku_pthread_join(tids[i], &ret); assert_zero(r); - if (verbose) - printf("%lu joined\n", (unsigned long) tids[i]); + + { + // Set an alarm that will kill us if it takes too long to join all the + // threads (i.e. there is some runaway thread). + struct sleep_and_crash_extra sac_extra; + ZERO_STRUCT(sac_extra); + toku_mutex_init(&sac_extra.mutex, nullptr); + toku_cond_init(&sac_extra.cond, nullptr); + sac_extra.seconds = cli_args->join_timeout; + sac_extra.is_setup = false; + sac_extra.threads_have_joined = false; + + toku_mutex_lock(&sac_extra.mutex); + toku_pthread_t sac_thread; + r = toku_pthread_create(&sac_thread, nullptr, sleep_and_crash, &sac_extra); + assert_zero(r); + // Wait for sleep_and_crash thread to get set up, spinning is ok, this should be quick. 
+ while (!sac_extra.is_setup) { + toku_mutex_unlock(&sac_extra.mutex); + r = toku_pthread_yield(); + assert_zero(r); + toku_mutex_lock(&sac_extra.mutex); + } + toku_mutex_unlock(&sac_extra.mutex); + + // Timeout thread has started, join everyone + for (int i = 0; i < num_threads; ++i) { + r = toku_pthread_join(tids[i], &ret); assert_zero(r); + if (verbose) + printf("%lu joined\n", (unsigned long) tids[i]); + } + + // Signal timeout thread not to crash. + toku_mutex_lock(&sac_extra.mutex); + sac_extra.threads_have_joined = true; + toku_cond_signal(&sac_extra.cond); + toku_mutex_unlock(&sac_extra.mutex); + r = toku_pthread_join(sac_thread, nullptr); + assert_zero(r); + toku_cond_destroy(&sac_extra.cond); + toku_mutex_destroy(&sac_extra.mutex); } - // All threads joined, deschedule the alarm. - remaining = alarm(0); - assert(remaining > 0); if (cli_args->print_performance) { uint64_t *counters[num_threads]; @@ -2244,7 +2306,6 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct INT32_ARG_NONNEG("--num_elements", num_elements, ""), INT32_ARG_NONNEG("--num_DBs", num_DBs, ""), INT32_ARG_NONNEG("--num_seconds", num_seconds, "s"), - INT32_ARG_NONNEG("--join_timeout", join_timeout, "s"), INT32_ARG_NONNEG("--node_size", env_args.node_size, " bytes"), INT32_ARG_NONNEG("--basement_node_size", env_args.basement_node_size, " bytes"), INT32_ARG_NONNEG("--rollback_node_size", env_args.rollback_node_size, " bytes"), @@ -2259,6 +2320,7 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct UINT32_ARG("--txn_size", txn_size, " rows"), UINT32_ARG("--num_bucket_mutexes", env_args.num_bucket_mutexes, " mutexes"), + INT32_ARG_R("--join_timeout", join_timeout, "s", 1, INT32_MAX), INT32_ARG_R("--performance_period", performance_period, "s", 1, INT32_MAX), // TODO: John thinks the cachetable size should be in megabytes