author     Jeff Darcy <jdarcy@fb.com>           2017-06-02 14:02:02 -0700
committer  Shreyas Siravara <sshreyas@fb.com>   2017-09-08 04:34:13 +0000
commit     8b6804f75cda612d13e3a691b3f9698028b2577d
tree       9ce4705785c2e5b3d2c6eab01645ac74b2fc7f09
parent     49d0f911bdc34913836f7014d6b70fd5d58367ca
performance/io-threads: Add watchdog to cover up a possible thread leak
Summary:
There appears to be a thread leak somewhere, which causes io-threads to
run out of threads to process a particular (priority-based) queue.
The leak should obviously be fixed, but that might take a while
and the consequences until then are severe - a brick essentially going
offline without the courtesy of actually dying. This patch adds a
watchdog that checks for stuck queues, and adds threads to break the deadlock.
The same thing done manually on one afflicted cluster caused brick CPU usage
to drop from 2600% to 400%, with latency quickly returning to normal.
The controlling option is performance.iot-watchdog-secs, which is the
number of seconds we'll watch for a non-empty queue with no items being
dequeued. That's our signal to add a thread. A value of zero (the
default) disables this watchdog feature.
This is a port of D5177414 to 3.8.
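For readers who want the gist without reading the full patch below, here is a
minimal, self-contained sketch of the detection idea. It is illustrative only:
all names here are hypothetical, and the real code operates on iot_conf_t with
one mark per priority queue. The watchdog marks a queue whenever it sees it
non-empty, every successful dequeue clears the mark, and a queue that stays
marked across several consecutive checks has made no progress and is declared
stalled.

#include <pthread.h>
#include <stdio.h>

struct queue_state {
        pthread_mutex_t lock;
        int             size;    /* items currently queued */
        int             marked;  /* was non-empty at the last check */
};

/* Called by a worker on every successful dequeue. */
static void
on_dequeue (struct queue_state *q)
{
        pthread_mutex_lock (&q->lock);
        q->size--;
        q->marked = 0;           /* progress was made; clear the mark */
        pthread_mutex_unlock (&q->lock);
}

/* One watchdog pass; returns 1 once the queue looks stalled. */
static int
watchdog_check (struct queue_state *q, int *bad_times)
{
        int stalled = 0;

        pthread_mutex_lock (&q->lock);
        if (q->marked) {
                /* Still marked: non-empty earlier, and no dequeue since. */
                if (++(*bad_times) >= 5)
                        stalled = 1;
        } else {
                *bad_times = 0;
        }
        q->marked = (q->size > 0);
        pthread_mutex_unlock (&q->lock);

        return stalled;
}

int
main (void)
{
        struct queue_state q = { PTHREAD_MUTEX_INITIALIZER, 1, 0 };
        int bad_times = 0;
        int i;

        /* One item sits in the queue and nothing dequeues it... */
        for (i = 0; i < 6; i++)
                if (watchdog_check (&q, &bad_times))
                        printf ("check %d: queue stalled\n", i);

        /* ...until now; a dequeue clears the mark and the bad count. */
        on_dequeue (&q);
        printf ("after dequeue: stalled=%d\n",
                watchdog_check (&q, &bad_times));
        return 0;
}

In the patch itself this per-queue mark is conf->queue_marked[i], cleared in
__iot_dequeue(), and on a stall the watchdog bumps ac_iot_limit[i] to add a
thread instead of just reporting.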
Test Plan: All the usual tests to determine safety.
Use gdb to hack priv->queue_sizes to a non-zero value. This will make it
look like the queue is non-empty, but since it does in fact have zero
items there will be no dequeues. After watchdog-secs seconds, this should
add a thread, with a corresponding entry in the brick log.
Change-Id: Ic051e411d3e9351e1cf5e233bad8bbb5078cb259
Reviewed-on: https://review.gluster.org/18239
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-volume-set.c |   5
-rw-r--r--  xlators/performance/io-threads/src/io-threads.c | 217
-rw-r--r--  xlators/performance/io-threads/src/io-threads.h |   7
3 files changed, 213 insertions(+), 16 deletions(-)
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 64fa7e77c1b..857c494e039 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1541,6 +1541,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .voltype    = "performance/io-threads",
           .op_version = 2
         },
+        { .key        = "performance.iot-watchdog-secs",
+          .voltype    = "performance/io-threads",
+          .option     = "watchdog-secs",
+          .op_version = 2
+        },
 
         /* Other perf xlators' options */
         { .key        = "performance.cache-size",
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index 7f9dc5f82a8..97d2dea1b71 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -15,11 +15,13 @@
 #include "dict.h"
 #include "xlator.h"
 #include "io-threads.h"
+#include <signal.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include <time.h>
 #include "locking.h"
 #include "io-threads-messages.h"
+#include "timespec.h"
 
 void *iot_worker (void *arg);
 int iot_workers_scale (iot_conf_t *conf);
@@ -114,6 +116,7 @@ __iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep)
 
                 stub = list_entry (conf->reqs[i].next, call_stub_t, list);
                 conf->ac_iot_count[i]++;
+                conf->queue_marked[i] = _gf_false;
                 *pri = i;
                 break;
         }
@@ -935,6 +938,165 @@ iot_priv_dump (xlator_t *this)
         return 0;
 }
 
+/*
+ * We use a decay model to keep track of events and make sure we're not
+ * spawning new threads too often.  Each increment adds a large value to a
+ * counter, and that counter keeps ticking back down to zero over a fairly
+ * long period.  For example, let's use ONE_WEEK=604800 seconds, and we want
+ * to detect when we have N=3 increments during that time.  Thus, our
+ * threshold is (N-1)*ONE_WEEK.  To see how it works, look at three examples.
+ *
+ *   (a) Two events close together, then one more almost a week later.  The
+ *   first two events push our counter to 2*ONE_WEEK plus a bit.  At the
+ *   third event, we decay down to ONE_WEEK plus a bit and then add ONE_WEEK
+ *   for the new event, exceeding our threshold.
+ *
+ *   (b) One event, then two more almost a week later.  At the time of the
+ *   second and third events, the counter is already non-zero, so when we
+ *   add 2*ONE_WEEK we exceed again.
+ *
+ *   (c) Three events, spaced three days apart.  At the time of the second
+ *   event, we decay down to approximately ONE_WEEK*4/7 and then add another
+ *   ONE_WEEK.  At the third event, we decay again down to ONE_WEEK*8/7 and
+ *   add another ONE_WEEK, so boom.
+ *
+ * Note that in all three cases if that last event came a day later our
+ * counter would have decayed a bit more and we would *not* exceed our
+ * threshold.  It's not exactly the same as a precise "three in one week"
+ * limit, but it's very close and it allows the same kind of tweaking while
+ * requiring only constant space - no arrays of variable length N to
+ * allocate or maintain.  All we need (for each queue) is the value plus the
+ * time of the last update.
+ */
+
+typedef struct {
+        uint32_t value;
+        time_t   update_time;
+} threshold_t;
+
+/*
+ * Variables so that I can hack these for testing.
+ * TBD: make these tunable?
+ */
+static uint32_t THRESH_SECONDS = 604800;
+static uint32_t THRESH_EVENTS  = 3;
+static uint32_t THRESH_LIMIT   = 1209600;       /* SECONDS * (EVENTS-1) */
+
+static void
+iot_apply_event (xlator_t *this, threshold_t *thresh)
+{
+        struct timespec now;
+        time_t          delta;
+
+        /* Refresh for manual testing/debugging.  It's cheap. */
+        THRESH_LIMIT = THRESH_SECONDS * (THRESH_EVENTS - 1);
+
+        timespec_now (&now);
+
+        if (thresh->value && thresh->update_time) {
+                delta = now.tv_sec - thresh->update_time;
+                /* Be careful about underflow. */
+                if (thresh->value <= delta) {
+                        thresh->value = 0;
+                } else {
+                        thresh->value -= delta;
+                }
+        }
+
+        thresh->value += THRESH_SECONDS;
+        if (thresh->value >= THRESH_LIMIT) {
+                gf_log (this->name, GF_LOG_EMERG,
+                        "watchdog firing too often");
+                /*
+                 * The default action for SIGTRAP is to dump core, but the
+                 * fact that it's distinct from other signals we use means
+                 * that there are other possibilities as well (e.g. drop
+                 * into gdb or invoke a special handler).
+                 */
+                kill (getpid (), SIGTRAP);
+        }
+
+        thresh->update_time = now.tv_sec;
+}
+
+static void *
+iot_watchdog (void *arg)
+{
+        xlator_t    *this = arg;
+        iot_conf_t  *priv = this->private;
+        int          i;
+        int          bad_times[IOT_PRI_MAX] = { 0, };
+        threshold_t  thresholds[IOT_PRI_MAX] = { { 0, } };
+
+        for (;;) {
+                sleep (max (priv->watchdog_secs / 5, 1));
+                pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+                pthread_mutex_lock (&priv->mutex);
+                for (i = 0; i < IOT_PRI_MAX; ++i) {
+                        if (priv->queue_marked[i]) {
+                                if (++bad_times[i] >= 5) {
+                                        gf_log (this->name, GF_LOG_WARNING,
+                                                "queue %d stalled", i);
+                                        iot_apply_event (this,
+                                                         &thresholds[i]);
+                                        /*
+                                         * We might not get here if the
+                                         * event put us over our threshold.
+                                         */
+                                        ++(priv->ac_iot_limit[i]);
+                                        bad_times[i] = 0;
+                                }
+                        } else {
+                                bad_times[i] = 0;
+                        }
+                        priv->queue_marked[i] = (priv->queue_sizes[i] > 0);
+                }
+                pthread_mutex_unlock (&priv->mutex);
+                pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
+        }
+
+        /* NOTREACHED */
+        return NULL;
+}
+
+static void
+start_iot_watchdog (xlator_t *this)
+{
+        iot_conf_t  *priv = this->private;
+        int          ret;
+
+        if (priv->watchdog_running) {
+                return;
+        }
+
+        ret = pthread_create (&priv->watchdog_thread, NULL,
+                              iot_watchdog, this);
+        if (ret == 0) {
+                priv->watchdog_running = _gf_true;
+        } else {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "pthread_create(iot_watchdog) failed");
+        }
+}
+
+static void
+stop_iot_watchdog (xlator_t *this)
+{
+        iot_conf_t  *priv = this->private;
+
+        if (!priv->watchdog_running) {
+                return;
+        }
+
+        if (pthread_cancel (priv->watchdog_thread) != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "pthread_cancel(iot_watchdog) failed");
+        }
+
+        if (pthread_join (priv->watchdog_thread, NULL) != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "pthread_join(iot_watchdog) failed");
+        }
+
+        /* Failure probably means it's already dead. */
+        priv->watchdog_running = _gf_false;
+}
+
 int
 reconfigure (xlator_t *this, dict_t *options)
 {
@@ -966,8 +1128,16 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("enable-least-priority", conf->least_priority,
                           options, bool, out);
 
-        GF_OPTION_RECONF("least-rate-limit", conf->throttle.rate_limit, options,
-                         int32, out);
+        GF_OPTION_RECONF ("least-rate-limit", conf->throttle.rate_limit,
+                          options, int32, out);
+
+        GF_OPTION_RECONF ("watchdog-secs", conf->watchdog_secs, options,
+                          int32, out);
+        if (conf->watchdog_secs > 0) {
+                start_iot_watchdog (this);
+        } else {
+                stop_iot_watchdog (this);
+        }
 
         ret = 0;
 out:
@@ -1043,8 +1213,9 @@ init (xlator_t *this)
         GF_OPTION_INIT ("enable-least-priority", conf->least_priority,
                         bool, out);
 
-        GF_OPTION_INIT("least-rate-limit", conf->throttle.rate_limit, int32,
-                        out);
+        GF_OPTION_INIT ("least-rate-limit", conf->throttle.rate_limit, int32,
+                        out);
+
         if ((ret = pthread_mutex_init(&conf->throttle.lock, NULL)) != 0) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
                         IO_THREADS_MSG_INIT_FAILED,
@@ -1068,6 +1239,13 @@ init (xlator_t *this)
         }
 
         this->private = conf;
+
+        conf->watchdog_secs = 0;
+        GF_OPTION_INIT ("watchdog-secs", conf->watchdog_secs, int32, out);
+        if (conf->watchdog_secs > 0) {
+                start_iot_watchdog (this);
+        }
+
         ret = 0;
 out:
         if (ret)
@@ -1120,6 +1298,8 @@ fini (xlator_t *this)
         if (conf->mutex_inited)
                 pthread_mutex_destroy (&conf->mutex);
 
+        stop_iot_watchdog (this);
+
         GF_FREE (conf);
 
         this->private = NULL;
@@ -1243,20 +1423,27 @@ struct volume_options options[] = {
           .default_value = "on",
           .description = "Enable/Disable least priority"
         },
-        {.key  = {"idle-time"},
-         .type = GF_OPTION_TYPE_INT,
-         .min  = 1,
-         .max  = 0x7fffffff,
-         .default_value = "120",
+        { .key  = {"idle-time"},
+          .type = GF_OPTION_TYPE_INT,
+          .min  = 1,
+          .max  = 0x7fffffff,
+          .default_value = "120",
         },
-        {.key  = {"least-rate-limit"},
-         .type = GF_OPTION_TYPE_INT,
-         .min  = 0,
-         .max  = INT_MAX,
-         .default_value = "0",
-         .description = "Max number of least priority operations to handle "
+        { .key  = {"least-rate-limit"},
+          .type = GF_OPTION_TYPE_INT,
+          .min  = 0,
+          .max  = INT_MAX,
+          .default_value = "0",
+          .description = "Max number of least priority operations to handle "
                         "per-second"
         },
+        { .key  = {"watchdog-secs"},
+          .type = GF_OPTION_TYPE_INT,
+          .min  = 0,
+          .default_value = 0,
+          .description = "Number of seconds a queue must be stalled before "
+                         "starting an 'emergency' thread."
+        },
         { .key  = {NULL},
         },
 };
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
index 011d4a00f7f..4056eb5fe09 100644
--- a/xlators/performance/io-threads/src/io-threads.h
+++ b/xlators/performance/io-threads/src/io-threads.h
@@ -77,7 +77,12 @@ struct iot_conf {
         gf_boolean_t        down;         /*PARENT_DOWN event is notified*/
         gf_boolean_t        mutex_inited;
         gf_boolean_t        cond_inited;
-        struct iot_least_throttle throttle;
+        struct iot_least_throttle throttle;
+
+        int32_t             watchdog_secs;
+        gf_boolean_t        watchdog_running;
+        pthread_t           watchdog_thread;
+        gf_boolean_t        queue_marked[IOT_PRI_MAX];
 };
 
 typedef struct iot_conf iot_conf_t;
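As a sanity check on the decay arithmetic described in the big comment above,
the following standalone program (illustrative only, not part of the patch)
replays scenario (c): three events spaced three days apart. It mirrors the
logic of iot_apply_event() but substitutes a caller-supplied timestamp for
timespec_now() and a return value for the SIGTRAP.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define ONE_WEEK 604800         /* THRESH_SECONDS in the patch */
#define LIMIT    (2 * ONE_WEEK) /* THRESH_SECONDS * (THRESH_EVENTS - 1) */

typedef struct {
        uint32_t value;
        time_t   update_time;
} threshold_t;

/* Same decay-then-add logic as iot_apply_event(), minus the signal. */
static int
apply_event (threshold_t *t, time_t now)
{
        if (t->value && t->update_time) {
                time_t delta = now - t->update_time;
                /* Saturate at zero instead of underflowing. */
                t->value = (t->value <= (uint32_t)delta)
                           ? 0 : t->value - (uint32_t)delta;
        }

        t->value += ONE_WEEK;
        t->update_time = now;
        return t->value >= LIMIT;
}

int
main (void)
{
        threshold_t t = { 0, 0 };
        time_t base = time (NULL);  /* any non-zero start time works */
        int day;

        /* Scenario (c) from the comment: events on days 0, 3 and 6. */
        for (day = 0; day <= 6; day += 3) {
                int trip = apply_event (&t, base + day * 86400);
                printf ("day %d: counter=%u -> %s\n",
                        day, t.value, trip ? "TRIP" : "ok");
        }
        return 0;
}

Running it prints counters 604800, 950400 and 1296000; only the last one
crosses the 1209600 limit, which matches case (c): decay to roughly
ONE_WEEK*4/7 and then ONE_WEEK*8/7 before the third event trips the
threshold.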