summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--xlators/features/bit-rot/src/bitd/Makefile.am4
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-tbf.c306
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-tbf.h70
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c47
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h11
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h3
6 files changed, 432 insertions, 9 deletions
diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am
index 160e3b6..f67fa1a 100644
--- a/xlators/features/bit-rot/src/bitd/Makefile.am
+++ b/xlators/features/bit-rot/src/bitd/Makefile.am
@@ -9,11 +9,11 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(CONTRIBDIR)/timer-wheel \
-I$(top_srcdir)/xlators/features/bit-rot/src/stub
-bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c
+bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c bit-rot-tbf.c
bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
-noinst_HEADERS = bit-rot.h bit-rot-scrub.h
+noinst_HEADERS = bit-rot.h bit-rot-scrub.h bit-rot-tbf.h
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c
new file mode 100644
index 0000000..d954341
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c
@@ -0,0 +1,306 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/**
+ *
+ * Basic token bucket implementation for rate limiting. As of now interfaces
+ * to throttle disk read request, directory entry scan and hash calculation
+ * are available. To throttle a particular request (operation), the call needs
+ * to be wrapped in-between throttling APIs, for e.g.
+ *
+ * TBF_THROTTLE_BEGIN (...); <-- induces "delays" if required
+ * {
+ * call (...);
+ * }
+ * TBF_THROTTLE_END (...); <-- not used atm, maybe needed later
+ *
+ */
+
+#include "mem-pool.h"
+#include "bit-rot-tbf.h"
+#include "bit-rot-stub-mem-types.h"
+
+typedef struct br_tbf_throttle {
+ char done;
+
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+
+ unsigned long tokens;
+
+ struct list_head list;
+} br_tbf_throttle_t;
+
+/**
+ * OK. Most implementations of TBF I've come across generate tokens
+ * every second (UML, etc..) and some chose sub-second granularity
+ * (blk-iothrottle cgroups). TBF algorithm itself does not enforce
+ * any logic for choosing generation interval and it seems pretty
+ * logical as one could jack up token count per interval w.r.t.
+ * generation rate.
+ *
+ * Value used here is chosen based on a series of test(s) performed
+ * to balance object signing time and not maxing out on all available
+ * CPU cores. It's obvious to have seconds granularity and jack up
+ * token count per interval, thereby achieving close to similar
+ * results. Let's stick to this as it seems to be working fine for
+ * the set of ops that are throttled.
+ */
+#define BR_TBF_TOKENGEN_INTERVAL_USEC 600000
+
+static inline br_tbf_throttle_t *
+br_tbf_init_throttle (unsigned long tokens_required)
+{
+ br_tbf_throttle_t *throttle = NULL;
+
+ throttle = GF_CALLOC (1, sizeof (*throttle),
+ gf_br_mt_br_tbf_throttle_t);
+ if (!throttle)
+ return NULL;
+
+ throttle->done = 0;
+ throttle->tokens = tokens_required;
+ INIT_LIST_HEAD (&throttle->list);
+
+ (void) pthread_mutex_init (&throttle->mutex, NULL);
+ (void) pthread_cond_init (&throttle->cond, NULL);
+
+ return throttle;
+}
+
+void
+_br_tbf_dispatch_queued (br_tbf_bucket_t *bucket)
+{
+ gf_boolean_t xcont = _gf_false;
+ br_tbf_throttle_t *tmp = NULL;
+ br_tbf_throttle_t *throttle = NULL;
+
+ list_for_each_entry_safe (throttle, tmp, &bucket->queued, list) {
+
+ pthread_mutex_lock (&throttle->mutex);
+ {
+ if (bucket->tokens < throttle->tokens) {
+ xcont = _gf_true;
+ goto unblock;
+ }
+
+ /* this request can now be serviced */
+ throttle->done = 1;
+ list_del_init (&throttle->list);
+
+ bucket->tokens -= throttle->tokens;
+ pthread_cond_signal (&throttle->cond);
+ }
+ unblock:
+ pthread_mutex_unlock (&throttle->mutex);
+ if (xcont)
+ break;
+ }
+}
+
+void *br_tbf_tokengenerator (void *arg)
+{
+ unsigned long tokenrate = 0;
+ unsigned long maxtokens = 0;
+ br_tbf_bucket_t *bucket = arg;
+
+ tokenrate = bucket->tokenrate;
+ maxtokens = bucket->maxtokens;
+
+ while (1) {
+ usleep (BR_TBF_TOKENGEN_INTERVAL_USEC);
+
+ LOCK (&bucket->lock);
+ {
+ bucket->tokens += tokenrate;
+ if (bucket->tokens > maxtokens)
+ bucket->tokens = maxtokens;
+
+ if (!list_empty (&bucket->queued))
+ _br_tbf_dispatch_queued (bucket);
+ }
+ UNLOCK (&bucket->lock);
+ }
+
+ return NULL;
+}
+
+/**
+ * There is lazy synchronization between this routine (when invoked
+ * under br_tbf_mod() context) and br_tbf_throttle(). *bucket is
+ * updated _after_ all the required variables are initialized.
+ */
+static inline int32_t
+br_tbf_init_bucket (br_tbf_t *tbf, br_tbf_opspec_t *spec)
+{
+ int ret = 0;
+ br_tbf_bucket_t *curr = NULL;
+ br_tbf_bucket_t **bucket = NULL;
+
+ GF_ASSERT (spec->op >= BR_TBF_OP_MIN);
+ GF_ASSERT (spec->op <= BR_TBF_OP_MAX);
+
+ /* no rate? no throttling. */
+ if (!spec->rate)
+ return 0;
+
+ bucket = tbf->bucket + spec->op;
+
+ curr = GF_CALLOC (1, sizeof (*curr), gf_br_mt_br_tbf_bucket_t);
+ if (!curr)
+ goto error_return;
+
+ LOCK_INIT (&curr->lock);
+ INIT_LIST_HEAD (&curr->queued);
+
+ curr->tokens = 0;
+ curr->tokenrate = spec->rate;
+ curr->maxtokens = spec->maxlimit;
+
+ ret = gf_thread_create (&curr->tokener,
+ NULL, br_tbf_tokengenerator, curr);
+ if (ret != 0)
+ goto freemem;
+
+ *bucket = curr;
+ return 0;
+
+ freemem:
+ LOCK_DESTROY (&curr->lock);
+ GF_FREE (curr);
+ error_return:
+ return -1;
+}
+
+#define BR_TBF_ALLOC_SIZE \
+ (sizeof (br_tbf_t) + (BR_TBF_OP_MAX * sizeof (br_tbf_bucket_t)))
+
+br_tbf_t *
+br_tbf_init (br_tbf_opspec_t *tbfspec, unsigned int count)
+{
+ int32_t i = 0;
+ int32_t ret = 0;
+ br_tbf_t *tbf = NULL;
+ br_tbf_opspec_t *opspec = NULL;
+
+ tbf = GF_CALLOC (1, BR_TBF_ALLOC_SIZE, gf_br_mt_br_tbf_t);
+ if (!tbf)
+ goto error_return;
+
+ tbf->bucket = (br_tbf_bucket_t **) ((char *)tbf + sizeof (*tbf));
+ for (i = 0; i < BR_TBF_OP_MAX; i++) {
+ *(tbf->bucket + i) = NULL;
+ }
+
+ for (i = 0; i < count; i++) {
+ opspec = tbfspec + i;
+
+ ret = br_tbf_init_bucket (tbf, opspec);
+ if (ret)
+ break;
+ }
+
+ if (ret)
+ goto error_return;
+
+ return tbf;
+
+ error_return:
+ return NULL;
+}
+
+static void
+br_tbf_mod_bucket (br_tbf_bucket_t *bucket, br_tbf_opspec_t *spec)
+{
+ LOCK (&bucket->lock);
+ {
+ bucket->tokens = 0;
+ bucket->tokenrate = spec->rate;
+ bucket->maxtokens = spec->maxlimit;
+ }
+ UNLOCK (&bucket->lock);
+
+ /* next token tick would unqueue pending operations */
+}
+
+int
+br_tbf_mod (br_tbf_t *tbf, br_tbf_opspec_t *tbfspec)
+{
+ int ret = 0;
+ br_tbf_bucket_t *bucket = NULL;
+ br_tbf_ops_t op = BR_TBF_OP_MIN;
+
+ if (!tbf || !tbfspec)
+ return -1;
+
+ op = tbfspec->op;
+
+ GF_ASSERT (op >= BR_TBF_OP_MIN);
+ GF_ASSERT (op <= BR_TBF_OP_MAX);
+
+ bucket = *(tbf->bucket + op);
+ if (bucket) {
+ br_tbf_mod_bucket (bucket, tbfspec);
+ } else {
+ ret = br_tbf_init_bucket (tbf, tbfspec);
+ }
+
+ return ret;
+}
+
+void
+br_tbf_throttle (br_tbf_t *tbf, br_tbf_ops_t op, unsigned long tokens_requested)
+{
+ char waitq = 0;
+ br_tbf_bucket_t *bucket = NULL;
+ br_tbf_throttle_t *throttle = NULL;
+
+ GF_ASSERT (op >= BR_TBF_OP_MIN);
+ GF_ASSERT (op <= BR_TBF_OP_MAX);
+
+ bucket = *(tbf->bucket + op);
+ if (!bucket)
+ return;
+
+ LOCK (&bucket->lock);
+ {
+ /**
+ * if there are enough tokens in the bucket there is no need
+ * to throttle the request: therefore, consume the required
+ * number of tokens and continue.
+ */
+ if (tokens_requested <= bucket->tokens) {
+ bucket->tokens -= tokens_requested;
+ } else {
+ throttle = br_tbf_init_throttle (tokens_requested);
+ if (!throttle) /* let it slip through for now.. */
+ goto unblock;
+
+ waitq = 1;
+ pthread_mutex_lock (&throttle->mutex);
+ list_add_tail (&throttle->list, &bucket->queued);
+ }
+ }
+ unblock:
+ UNLOCK (&bucket->lock);
+
+ if (waitq) {
+ while (!throttle->done) {
+ pthread_cond_wait (&throttle->cond, &throttle->mutex);
+ }
+
+ pthread_mutex_unlock (&throttle->mutex);
+
+ pthread_mutex_destroy (&throttle->mutex);
+ pthread_cond_destroy (&throttle->cond);
+
+ GF_FREE (throttle);
+ }
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h
new file mode 100644
index 0000000..5a41be4
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h
@@ -0,0 +1,70 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "list.h"
+#include "xlator.h"
+#include "locking.h"
+
+#ifndef __BIT_ROT_TBF_H__
+#define __BIT_ROT_TBF_H__
+
+typedef enum br_tbf_ops {
+ BR_TBF_OP_MIN = -1,
+ BR_TBF_OP_HASH = 0, /* checksum calculation */
+ BR_TBF_OP_READ = 1, /* inode read(s) */
+ BR_TBF_OP_READDIR = 2, /* dentry read(s) */
+ BR_TBF_OP_MAX = 3,
+} br_tbf_ops_t;
+
+/**
+ * Operation rate specification
+ */
+typedef struct br_tbf_opspec {
+ br_tbf_ops_t op;
+
+ unsigned long rate;
+
+ unsigned long maxlimit;
+} br_tbf_opspec_t;
+
+/**
+ * Token bucket for each operation type
+ */
+typedef struct br_tbf_bucket {
+ gf_lock_t lock;
+
+ pthread_t tokener; /* token generator thread */
+
+ unsigned long tokenrate; /* token generation rate */
+
+ unsigned long tokens; /* number of current tokens */
+
+ unsigned long maxtokens; /* maximum token in the bucket */
+
+ struct list_head queued; /* list of non-conformant requests */
+} br_tbf_bucket_t;
+
+typedef struct br_tbf {
+ br_tbf_bucket_t **bucket;
+} br_tbf_t;
+
+br_tbf_t *
+br_tbf_init (br_tbf_opspec_t *, unsigned int);
+
+int
+br_tbf_mod (br_tbf_t *, br_tbf_opspec_t *);
+
+void
+br_tbf_throttle (br_tbf_t *, br_tbf_ops_t, unsigned long);
+
+#define TBF_THROTTLE_BEGIN(tbf, op, tokens) (br_tbf_throttle (tbf, op, tokens))
+#define TBF_THROTTLE_END(tbf, op, tokens) (void)
+
+#endif /** __BIT_ROT_TBF_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index d985cc4..880b16e 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -27,6 +27,17 @@
#include "tw.h"
+#define BR_HASH_CALC_READ_SIZE (128 * 1024)
+
+br_tbf_opspec_t opthrottle[] = {
+ {
+ .op = BR_TBF_OP_HASH,
+ .rate = BR_HASH_CALC_READ_SIZE,
+ .maxlimit = (2 * BR_WORKERS * BR_HASH_CALC_READ_SIZE),
+ },
+ /** TODO: throttle getdents(), read() request(s) */
+};
+
static int
br_find_child_index (xlator_t *this, xlator_t *child)
{
@@ -288,8 +299,10 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
off_t offset, size_t size, SHA256_CTX *sha256)
{
int32_t ret = -1;
+ br_tbf_t *tbf = NULL;
struct iovec *iovec = NULL;
struct iobref *iobref = NULL;
+ br_private_t *priv = NULL;
int count = 0;
int i = 0;
@@ -297,6 +310,12 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
GF_VALIDATE_OR_GOTO (this->name, fd, out);
GF_VALIDATE_OR_GOTO (this->name, fd->inode, out);
GF_VALIDATE_OR_GOTO (this->name, child, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+
+ priv = this->private;
+
+ GF_VALIDATE_OR_GOTO (this->name, priv->tbf, out);
+ tbf = priv->tbf;
ret = syncop_readv (child->xl, fd,
size, offset, 0, &iovec, &count, &iobref, NULL,
@@ -313,9 +332,12 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
goto out;
for (i = 0; i < count; i++) {
- SHA256_Update (sha256,
- (const unsigned char *) (iovec[i].iov_base),
- iovec[i].iov_len);
+ TBF_THROTTLE_BEGIN (tbf, BR_TBF_OP_HASH, iovec[i].iov_len);
+ {
+ SHA256_Update (sha256, (const unsigned char *)
+ (iovec[i].iov_base), iovec[i].iov_len);
+ }
+ TBF_THROTTLE_BEGIN (tbf, BR_TBF_OP_HASH, iovec[i].iov_len);
}
out:
@@ -334,7 +356,7 @@ br_calculate_obj_checksum (unsigned char *md,
{
int32_t ret = -1;
off_t offset = 0;
- size_t block = 128 * 1024; /* 128K block size */
+ size_t block = BR_HASH_CALC_READ_SIZE;
xlator_t *this = NULL;
SHA256_CTX sha256;
@@ -1358,6 +1380,16 @@ br_init_signer (xlator_t *this, br_private_t *priv)
}
int32_t
+br_init_rate_limiter (br_private_t *priv)
+{
+ br_tbf_opspec_t *spec = opthrottle;
+ priv->tbf = br_tbf_init (spec, sizeof (opthrottle)
+ / sizeof (br_tbf_opspec_t));
+
+ return priv->tbf ? 0 : -1;
+}
+
+int32_t
init (xlator_t *this)
{
int i = 0;
@@ -1411,12 +1443,16 @@ init (xlator_t *this)
INIT_LIST_HEAD (&priv->children[i].list);
INIT_LIST_HEAD (&priv->bricks);
+ ret = br_init_rate_limiter (priv);
+ if (ret)
+ goto cleanup_mutex;
+
this->private = priv;
if (!priv->iamscrubber) {
ret = br_init_signer (this, priv);
if (ret)
- goto cleanup_mutex;
+ goto cleanup_tbf;
}
ret = gf_thread_create (&priv->thread, NULL, br_handle_events, this);
@@ -1433,6 +1469,7 @@ init (xlator_t *this)
return 0;
}
+ cleanup_tbf:
cleanup_mutex:
(void) pthread_cond_destroy (&priv->cond);
(void) pthread_mutex_destroy (&priv->lock);
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index a634a1f..5b64180 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -25,13 +25,18 @@
#include "changelog.h"
#include "timer-wheel.h"
+#include "bit-rot-tbf.h"
+
#include "bit-rot-common.h"
#include "bit-rot-stub-mem-types.h"
#include <openssl/sha.h>
-/* TODO: make this configurable */
-#define BR_WORKERS 8
+/**
+ * TODO: make this configurable. As a best practice, set this to the
+ * number of processor cores.
+ */
+#define BR_WORKERS 4
#define signature_size(hl) (sizeof (br_isignature_t) + hl + 1)
@@ -92,6 +97,8 @@ struct br_private {
the objects */
int32_t expiry_time; /* objects "wait" time */
+ br_tbf_t *tbf; /* token bucket filter */
+
gf_boolean_t iamscrubber; /* function as a fs scrubber */
};
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index 4922786..bb40304 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -22,6 +22,9 @@ enum br_mem_types {
gf_br_mt_br_child_t,
gf_br_mt_br_object_t,
gf_br_mt_br_ob_n_wk_t,
+ gf_br_mt_br_tbf_t,
+ gf_br_mt_br_tbf_bucket_t,
+ gf_br_mt_br_tbf_throttle_t,
gf_br_stub_mt_end
};