From b846739abfc80f4852c8751e7d6cd995b7bb9d3a Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Wed, 18 Sep 2013 13:29:05 -0400 Subject: [PATCH] #69 measure long tail lock tree and cache table stalls --- ft/cachetable-internal.h | 4 ++++ ft/cachetable.cc | 28 +++++++++++++++++++++++++++- ft/cachetable.h | 4 ++++ locktree/locktree.h | 10 ++++++++++ locktree/manager.cc | 27 ++++++++++++++++++++++++++- 5 files changed, 71 insertions(+), 2 deletions(-) diff --git a/ft/cachetable-internal.h b/ft/cachetable-internal.h index 4b6c8e02a22..d7a72ed4aab 100644 --- a/ft/cachetable-internal.h +++ b/ft/cachetable-internal.h @@ -558,6 +558,10 @@ private: PARTITIONED_COUNTER m_size_leaf; PARTITIONED_COUNTER m_size_rollback; PARTITIONED_COUNTER m_size_cachepressure; + PARTITIONED_COUNTER m_wait_pressure_count; + PARTITIONED_COUNTER m_wait_pressure_time; + PARTITIONED_COUNTER m_long_wait_pressure_count; + PARTITIONED_COUNTER m_long_wait_pressure_time; KIBBUTZ m_kibbutz; diff --git a/ft/cachetable.cc b/ft/cachetable.cc index 3c97cc97695..01620a92029 100644 --- a/ft/cachetable.cc +++ b/ft/cachetable.cc @@ -146,6 +146,11 @@ status_init(void) { STATUS_INIT(CT_CLEANER_EXECUTIONS, CACHETABLE_CLEANER_EXECUTIONS, UINT64, "cleaner executions", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CT_CLEANER_PERIOD, CACHETABLE_CLEANER_PERIOD, UINT64, "cleaner period", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CT_CLEANER_ITERATIONS, CACHETABLE_CLEANER_ITERATIONS, UINT64, "cleaner iterations", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + + STATUS_INIT(CT_WAIT_PRESSURE_COUNT, CACHETABLE_WAIT_PRESSURE_COUNT, UINT64, "number of waits on cache pressure", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(CT_WAIT_PRESSURE_TIME, CACHETABLE_WAIT_PRESSURE_COUNT, UINT64, "time waiting on cache pressure", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(CT_LONG_WAIT_PRESSURE_COUNT, CACHETABLE_WAIT_PRESSURE_COUNT, UINT64, "number of long waits on cache pressure", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(CT_LONG_WAIT_PRESSURE_TIME, CACHETABLE_WAIT_PRESSURE_COUNT, UINT64, "long time waiting on cache pressure", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); ct_status.initialized = true; } #undef STATUS_INIT @@ -3652,6 +3657,10 @@ void evictor::init(long _size_limit, pair_list* _pl, KIBBUTZ _kibbutz, uint32_t m_size_leaf = create_partitioned_counter(); m_size_rollback = create_partitioned_counter(); m_size_cachepressure = create_partitioned_counter(); + m_wait_pressure_count = create_partitioned_counter(); + m_wait_pressure_time = create_partitioned_counter(); + m_long_wait_pressure_count = create_partitioned_counter(); + m_long_wait_pressure_time = create_partitioned_counter(); m_pl = _pl; m_kibbutz = _kibbutz; @@ -3706,6 +3715,11 @@ void evictor::destroy() { destroy_partitioned_counter(m_size_cachepressure); m_size_cachepressure = NULL; + destroy_partitioned_counter(m_wait_pressure_count); m_wait_pressure_count = NULL; + destroy_partitioned_counter(m_wait_pressure_time); m_wait_pressure_time = NULL; + destroy_partitioned_counter(m_long_wait_pressure_count); m_long_wait_pressure_count = NULL; + destroy_partitioned_counter(m_long_wait_pressure_time); m_long_wait_pressure_time = NULL; + toku_cond_destroy(&m_flow_control_cond); toku_cond_destroy(&m_ev_thread_cond); toku_mutex_destroy(&m_ev_thread_lock); @@ -4173,13 +4187,21 @@ void evictor::decrease_size_evicting(long size_evicting_estimate) { // size_evicting is number of bytes queued up to be evicted // void evictor::wait_for_cache_pressure_to_subside() { + uint64_t t0 = toku_current_time_microsec(); toku_mutex_lock(&m_ev_thread_lock); m_num_sleepers++; this->signal_eviction_thread(); toku_cond_wait(&m_flow_control_cond, &m_ev_thread_lock); m_num_sleepers--; toku_mutex_unlock(&m_ev_thread_lock); - + uint64_t t1 = toku_current_time_microsec(); + increment_partitioned_counter(m_wait_pressure_count, 1); + uint64_t tdelta = t1 - t0; + increment_partitioned_counter(m_wait_pressure_time, tdelta); + if (tdelta > 1000000) { + increment_partitioned_counter(m_long_wait_pressure_count, 1); + increment_partitioned_counter(m_long_wait_pressure_time, tdelta); + } } // @@ -4267,6 +4289,10 @@ void evictor::fill_engine_status() { STATUS_VALUE(CT_SIZE_LEAF) = read_partitioned_counter(m_size_leaf); STATUS_VALUE(CT_SIZE_ROLLBACK) = read_partitioned_counter(m_size_rollback); STATUS_VALUE(CT_SIZE_CACHEPRESSURE) = read_partitioned_counter(m_size_cachepressure); + STATUS_VALUE(CT_WAIT_PRESSURE_COUNT) = read_partitioned_counter(m_wait_pressure_count); + STATUS_VALUE(CT_WAIT_PRESSURE_TIME) = read_partitioned_counter(m_wait_pressure_time); + STATUS_VALUE(CT_LONG_WAIT_PRESSURE_COUNT) = read_partitioned_counter(m_long_wait_pressure_count); + STATUS_VALUE(CT_LONG_WAIT_PRESSURE_TIME) = read_partitioned_counter(m_long_wait_pressure_time); } //////////////////////////////////////////////////////////////////////////////// diff --git a/ft/cachetable.h b/ft/cachetable.h index d8f8703b81e..d59bd3fc96d 100644 --- a/ft/cachetable.h +++ b/ft/cachetable.h @@ -609,6 +609,10 @@ typedef enum { CT_CLEANER_EXECUTIONS, // number of times the cleaner thread's loop has executed CT_CLEANER_PERIOD, CT_CLEANER_ITERATIONS, // number of times the cleaner thread runs the cleaner per period + CT_WAIT_PRESSURE_COUNT, + CT_WAIT_PRESSURE_TIME, + CT_LONG_WAIT_PRESSURE_COUNT, + CT_LONG_WAIT_PRESSURE_TIME, CT_STATUS_NUM_ROWS } ct_status_entry; diff --git a/locktree/locktree.h b/locktree/locktree.h index c929880635d..91b197a1d5d 100644 --- a/locktree/locktree.h +++ b/locktree/locktree.h @@ -120,6 +120,10 @@ enum { LTM_LONG_WAIT_COUNT, LTM_LONG_WAIT_TIME, LTM_TIMEOUT_COUNT, + LTM_WAIT_ESCALATION_COUNT, + LTM_WAIT_ESCALATION_TIME, + LTM_LONG_WAIT_ESCALATION_COUNT, + LTM_LONG_WAIT_ESCALATION_TIME, LTM_STATUS_NUM_ROWS // must be last }; @@ -306,6 +310,8 @@ public: void *extra); int iterate_pending_lock_requests(lock_request_iterate_callback cb, void *extra); + void escalation_wait(uint64_t t); + private: static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024; static const uint64_t DEFAULT_LOCK_WAIT_TIME = 0; @@ -319,6 +325,10 @@ public: uint64_t m_escalation_count; tokutime_t m_escalation_time; uint64_t m_escalation_latest_result; + uint64_t m_wait_escalation_count; + uint64_t m_wait_escalation_time; + uint64_t m_long_wait_escalation_count; + uint64_t m_long_wait_escalation_time; struct lt_counters m_lt_counters; diff --git a/locktree/manager.cc b/locktree/manager.cc index 2ba27a73dd3..c46f1aad5fe 100644 --- a/locktree/manager.cc +++ b/locktree/manager.cc @@ -104,6 +104,10 @@ void locktree::manager::create(lt_create_cb create_cb, lt_destroy_cb destroy_cb, m_current_lock_memory = 0; m_escalation_count = 0; m_escalation_time = 0; + m_wait_escalation_count = 0; + m_wait_escalation_time = 0; + m_long_wait_escalation_count = 0; + m_long_wait_escalation_time = 0; m_escalation_latest_result = 0; m_lock_wait_time_ms = DEFAULT_LOCK_WAIT_TIME; m_mem_tracker.set_manager(this); @@ -350,6 +354,7 @@ int locktree::manager::memory_tracker::check_current_lock_constraints(void) { // mutex and check again. if we're still out of locks, run escalation. // return an error if we're still out of locks after escalation. if (out_of_locks()) { + uint64_t t0 = toku_current_time_microsec(); m_mgr->mutex_lock(); if (out_of_locks()) { m_mgr->run_escalation(); @@ -357,6 +362,8 @@ int locktree::manager::memory_tracker::check_current_lock_constraints(void) { r = TOKUDB_OUT_OF_LOCKS; } } + uint64_t t1 = toku_current_time_microsec(); + m_mgr->escalation_wait(t1 - t0); m_mgr->mutex_unlock(); } return r; @@ -404,6 +411,15 @@ int locktree::manager::iterate_pending_lock_requests( return r; } +void locktree::manager::escalation_wait(uint64_t t) { + m_wait_escalation_count += 1; + m_wait_escalation_time += t; + if (t >= 1000000) { + m_long_wait_escalation_count += 1; + m_long_wait_escalation_time += t; + } +} + #define STATUS_INIT(k,c,t,l,inc) TOKUDB_STATUS_INIT(status, k, c, t, "locktree: " l, inc) void locktree::manager::status_init(void) { @@ -420,10 +436,15 @@ void locktree::manager::status_init(void) { STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, UINT64, "number of wait locks", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, UINT64, "time waiting for locks", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); - STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, UINT64, "number of long wait locks ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, UINT64, "number of long wait locks", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, UINT64, "long time waiting for locks", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(LTM_TIMEOUT_COUNT, LOCKTREE_TIMEOUT_COUNT, UINT64, "number of lock timeouts", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(LTM_WAIT_ESCALATION_COUNT, LOCKTREE_WAIT_ESCALATION_COUNT, UINT64, "number of waits on lock escalation", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(LTM_WAIT_ESCALATION_TIME, LOCKTREE_WAIT_ESCALATION_TIME, UINT64, "time waiting on lock escalation", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(LTM_LONG_WAIT_ESCALATION_COUNT, LOCKTREE_LONG_WAIT_ESCALATION_COUNT, UINT64, "number of long waits on lock escalation", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(LTM_LONG_WAIT_ESCALATION_TIME, LOCKTREE_LONG_WAIT_ESCALATION_TIME, UINT64, "long time waiting on lock escalation", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + status.initialized = true; } @@ -441,6 +462,10 @@ void locktree::manager::get_status(LTM_STATUS statp) { STATUS_VALUE(LTM_ESCALATION_COUNT) = m_escalation_count; STATUS_VALUE(LTM_ESCALATION_TIME) = m_escalation_time; STATUS_VALUE(LTM_ESCALATION_LATEST_RESULT) = m_escalation_latest_result; + STATUS_VALUE(LTM_WAIT_ESCALATION_COUNT) = m_wait_escalation_count; + STATUS_VALUE(LTM_WAIT_ESCALATION_TIME) = m_wait_escalation_time; + STATUS_VALUE(LTM_LONG_WAIT_ESCALATION_COUNT) = m_long_wait_escalation_count; + STATUS_VALUE(LTM_LONG_WAIT_ESCALATION_TIME) = m_long_wait_escalation_time; mutex_lock();