-rw-r--r--  xlators/features/bit-rot/src/bitd/Makefile.am               |   4
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot-tbf.c             | 306
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot-tbf.h             |  70
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot.c                 |  47
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot.h                 |  11
-rw-r--r--  xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h  |   3
6 files changed, 432 insertions(+), 9 deletions(-)
diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am
index 160e3b653df..f67fa1a3acd 100644
--- a/xlators/features/bit-rot/src/bitd/Makefile.am
+++ b/xlators/features/bit-rot/src/bitd/Makefile.am
@@ -9,11 +9,11 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(CONTRIBDIR)/timer-wheel \
-I$(top_srcdir)/xlators/features/bit-rot/src/stub
-bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c
+bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c bit-rot-tbf.c
bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
-noinst_HEADERS = bit-rot.h bit-rot-scrub.h
+noinst_HEADERS = bit-rot.h bit-rot-scrub.h bit-rot-tbf.h
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c
new file mode 100644
index 00000000000..d9543416540
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c
@@ -0,0 +1,306 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/**
+ *
+ * Basic token bucket implementation for rate limiting. As of now interfaces
+ * to throttle disk read request, directory entry scan and hash calculation
+ * are available. To throttle a particular request (operation), the call needs
+ * to be wrapped in-between throttling APIs, for e.g.
+ *
+ * TBF_THROTTLE_BEGIN (...); <-- induces "delays" if required
+ * {
+ * call (...);
+ * }
+ * TBF_THROTTLE_END (...); <-- not used atm, maybe needed later
+ *
+ */
+
+#include "mem-pool.h"
+#include "bit-rot-tbf.h"
+#include "bit-rot-stub-mem-types.h"
+
+typedef struct br_tbf_throttle {
+ char done;
+
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+
+ unsigned long tokens;
+
+ struct list_head list;
+} br_tbf_throttle_t;
+
+/**
+ * OK. Most implementations of TBF I've come across generate tokens
+ * every second (UML, etc..) and some chose sub-second granularity
+ * (blk-iothrottle cgroups). TBF algorithm itself does not enforce
+ * any logic for choosing generation interval and it seems pretty
+ * logical as one could jack up token count per interval w.r.t.
+ * generation rate.
+ *
+ * Value used here is chosen based on a series of test(s) performed
+ * to balance object signing time and not maxing out on all available
+ * CPU cores. It's obvious to have seconds granularity and jack up
+ * token count per interval, thereby achieving close to similar
+ * results. Let's stick to this as it seems to be working fine for
+ * the set of ops that are throttled.
+ */
+#define BR_TBF_TOKENGEN_INTERVAL_USEC 600000
+
+static inline br_tbf_throttle_t *
+br_tbf_init_throttle (unsigned long tokens_required)
+{
+ br_tbf_throttle_t *throttle = NULL;
+
+ throttle = GF_CALLOC (1, sizeof (*throttle),
+ gf_br_mt_br_tbf_throttle_t);
+ if (!throttle)
+ return NULL;
+
+ throttle->done = 0;
+ throttle->tokens = tokens_required;
+ INIT_LIST_HEAD (&throttle->list);
+
+ (void) pthread_mutex_init (&throttle->mutex, NULL);
+ (void) pthread_cond_init (&throttle->cond, NULL);
+
+ return throttle;
+}
+
+void
+_br_tbf_dispatch_queued (br_tbf_bucket_t *bucket)
+{
+ gf_boolean_t xcont = _gf_false;
+ br_tbf_throttle_t *tmp = NULL;
+ br_tbf_throttle_t *throttle = NULL;
+
+ list_for_each_entry_safe (throttle, tmp, &bucket->queued, list) {
+
+ pthread_mutex_lock (&throttle->mutex);
+ {
+ if (bucket->tokens < throttle->tokens) {
+ xcont = _gf_true;
+ goto unblock;
+ }
+
+ /* this request can now be serviced */
+ throttle->done = 1;
+ list_del_init (&throttle->list);
+
+ bucket->tokens -= throttle->tokens;
+ pthread_cond_signal (&throttle->cond);
+ }
+ unblock:
+ pthread_mutex_unlock (&throttle->mutex);
+ if (xcont)
+ break;
+ }
+}
+
+void *br_tbf_tokengenerator (void *arg)
+{
+ unsigned long tokenrate = 0;
+ unsigned long maxtokens = 0;
+ br_tbf_bucket_t *bucket = arg;
+
+ tokenrate = bucket->tokenrate;
+ maxtokens = bucket->maxtokens;
+
+ while (1) {
+ usleep (BR_TBF_TOKENGEN_INTERVAL_USEC);
+
+ LOCK (&bucket->lock);
+ {
+ bucket->tokens += tokenrate;
+ if (bucket->tokens > maxtokens)
+ bucket->tokens = maxtokens;
+
+ if (!list_empty (&bucket->queued))
+ _br_tbf_dispatch_queued (bucket);
+ }
+ UNLOCK (&bucket->lock);
+ }
+
+ return NULL;
+}
+
+/**
+ * There is lazy synchronization between this routine (when invoked
+ * under br_tbf_mod() context) and br_tbf_throttle(). *bucket is
+ * updated _after_ all the required variables are initialized.
+ */
+static inline int32_t
+br_tbf_init_bucket (br_tbf_t *tbf, br_tbf_opspec_t *spec)
+{
+ int ret = 0;
+ br_tbf_bucket_t *curr = NULL;
+ br_tbf_bucket_t **bucket = NULL;
+
+ GF_ASSERT (spec->op >= BR_TBF_OP_MIN);
+ GF_ASSERT (spec->op <= BR_TBF_OP_MAX);
+
+ /* no rate? no throttling. */
+ if (!spec->rate)
+ return 0;
+
+ bucket = tbf->bucket + spec->op;
+
+ curr = GF_CALLOC (1, sizeof (*curr), gf_br_mt_br_tbf_bucket_t);
+ if (!curr)
+ goto error_return;
+
+ LOCK_INIT (&curr->lock);
+ INIT_LIST_HEAD (&curr->queued);
+
+ curr->tokens = 0;
+ curr->tokenrate = spec->rate;
+ curr->maxtokens = spec->maxlimit;
+
+ ret = gf_thread_create (&curr->tokener,
+ NULL, br_tbf_tokengenerator, curr);
+ if (ret != 0)
+ goto freemem;
+
+ *bucket = curr;
+ return 0;
+
+ freemem:
+ LOCK_DESTROY (&curr->lock);
+ GF_FREE (curr);
+ error_return:
+ return -1;
+}
+
+#define BR_TBF_ALLOC_SIZE \
+ (sizeof (br_tbf_t) + (BR_TBF_OP_MAX * sizeof (br_tbf_bucket_t)))
+
+br_tbf_t *
+br_tbf_init (br_tbf_opspec_t *tbfspec, unsigned int count)
+{
+ int32_t i = 0;
+ int32_t ret = 0;
+ br_tbf_t *tbf = NULL;
+ br_tbf_opspec_t *opspec = NULL;
+
+ tbf = GF_CALLOC (1, BR_TBF_ALLOC_SIZE, gf_br_mt_br_tbf_t);
+ if (!tbf)
+ goto error_return;
+
+ tbf->bucket = (br_tbf_bucket_t **) ((char *)tbf + sizeof (*tbf));
+ for (i = 0; i < BR_TBF_OP_MAX; i++) {
+ *(tbf->bucket + i) = NULL;
+ }
+
+ for (i = 0; i < count; i++) {
+ opspec = tbfspec + i;
+
+ ret = br_tbf_init_bucket (tbf, opspec);
+ if (ret)
+ break;
+ }
+
+ if (ret)
+ goto error_return;
+
+ return tbf;
+
+ error_return:
+ return NULL;
+}
+
+static void
+br_tbf_mod_bucket (br_tbf_bucket_t *bucket, br_tbf_opspec_t *spec)
+{
+ LOCK (&bucket->lock);
+ {
+ bucket->tokens = 0;
+ bucket->tokenrate = spec->rate;
+ bucket->maxtokens = spec->maxlimit;
+ }
+ UNLOCK (&bucket->lock);
+
+ /* next token tick would unqueue pending operations */
+}
+
+int
+br_tbf_mod (br_tbf_t *tbf, br_tbf_opspec_t *tbfspec)
+{
+ int ret = 0;
+ br_tbf_bucket_t *bucket = NULL;
+ br_tbf_ops_t op = BR_TBF_OP_MIN;
+
+ if (!tbf || !tbfspec)
+ return -1;
+
+ op = tbfspec->op;
+
+ GF_ASSERT (op >= BR_TBF_OP_MIN);
+ GF_ASSERT (op <= BR_TBF_OP_MAX);
+
+ bucket = *(tbf->bucket + op);
+ if (bucket) {
+ br_tbf_mod_bucket (bucket, tbfspec);
+ } else {
+ ret = br_tbf_init_bucket (tbf, tbfspec);
+ }
+
+ return ret;
+}
+
+void
+br_tbf_throttle (br_tbf_t *tbf, br_tbf_ops_t op, unsigned long tokens_requested)
+{
+ char waitq = 0;
+ br_tbf_bucket_t *bucket = NULL;
+ br_tbf_throttle_t *throttle = NULL;
+
+ GF_ASSERT (op >= BR_TBF_OP_MIN);
+ GF_ASSERT (op <= BR_TBF_OP_MAX);
+
+ bucket = *(tbf->bucket + op);
+ if (!bucket)
+ return;
+
+ LOCK (&bucket->lock);
+ {
+ /**
+ * if there are enough tokens in the bucket there is no need
+ * to throttle the request: therefore, consume the required
+ * number of tokens and continue.
+ */
+ if (tokens_requested <= bucket->tokens) {
+ bucket->tokens -= tokens_requested;
+ } else {
+ throttle = br_tbf_init_throttle (tokens_requested);
+ if (!throttle) /* let it slip through for now.. */
+ goto unblock;
+
+ waitq = 1;
+ pthread_mutex_lock (&throttle->mutex);
+ list_add_tail (&throttle->list, &bucket->queued);
+ }
+ }
+ unblock:
+ UNLOCK (&bucket->lock);
+
+ if (waitq) {
+ while (!throttle->done) {
+ pthread_cond_wait (&throttle->cond, &throttle->mutex);
+ }
+
+ pthread_mutex_unlock (&throttle->mutex);
+
+ pthread_mutex_destroy (&throttle->mutex);
+ pthread_cond_destroy (&throttle->cond);
+
+ GF_FREE (throttle);
+ }
+}
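
A worked example of the dispatch logic above (illustrative, not part of the patch): _br_tbf_dispatch_queued() services waiters in arrival order and stops at the first request the bucket cannot cover, so a small request queued behind a large one is not let through early. With 64 KiB of tokens available, a 128 KiB request at the head of the queue and a 4 KiB request behind it, both keep waiting until enough 600 ms ticks have accumulated tokens for the 128 KiB head; dispatch stays FIFO at the cost of head-of-line blocking.
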
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h
new file mode 100644
index 00000000000..5a41be4fd95
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h
@@ -0,0 +1,70 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "list.h"
+#include "xlator.h"
+#include "locking.h"
+
+#ifndef __BIT_ROT_TBF_H__
+#define __BIT_ROT_TBF_H__
+
+typedef enum br_tbf_ops {
+ BR_TBF_OP_MIN = -1,
+ BR_TBF_OP_HASH = 0, /* checksum calculation */
+ BR_TBF_OP_READ = 1, /* inode read(s) */
+ BR_TBF_OP_READDIR = 2, /* dentry read(s) */
+ BR_TBF_OP_MAX = 3,
+} br_tbf_ops_t;
+
+/**
+ * Operation rate specification
+ */
+typedef struct br_tbf_opspec {
+ br_tbf_ops_t op;
+
+ unsigned long rate;
+
+ unsigned long maxlimit;
+} br_tbf_opspec_t;
+
+/**
+ * Token bucket for each operation type
+ */
+typedef struct br_tbf_bucket {
+ gf_lock_t lock;
+
+ pthread_t tokener; /* token generator thread */
+
+ unsigned long tokenrate; /* token generation rate */
+
+ unsigned long tokens; /* number of current tokens */
+
+ unsigned long maxtokens; /* maximum token in the bucket */
+
+ struct list_head queued; /* list of non-conformant requests */
+} br_tbf_bucket_t;
+
+typedef struct br_tbf {
+ br_tbf_bucket_t **bucket;
+} br_tbf_t;
+
+br_tbf_t *
+br_tbf_init (br_tbf_opspec_t *, unsigned int);
+
+int
+br_tbf_mod (br_tbf_t *, br_tbf_opspec_t *);
+
+void
+br_tbf_throttle (br_tbf_t *, br_tbf_ops_t, unsigned long);
+
+#define TBF_THROTTLE_BEGIN(tbf, op, tokens) (br_tbf_throttle (tbf, op, tokens))
+#define TBF_THROTTLE_END(tbf, op, tokens) (void) 0
+
+#endif /** __BIT_ROT_TBF_H__ */
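
To make the interface above concrete, here is a minimal usage sketch, not part of the patch: a hypothetical consumer throttling directory scans through BR_TBF_OP_READDIR, which the patch itself only lists as a TODO. The names scan_spec, scan_tbf_setup and scan_one_directory, and the rate/maxlimit values, are invented for illustration.

#include "bit-rot-tbf.h"

/* Hypothetical spec: the rate and burst ceiling are illustrative values. */
static br_tbf_opspec_t scan_spec[] = {
        {
                .op       = BR_TBF_OP_READDIR,
                .rate     = 1024,        /* tokens added per generator tick */
                .maxlimit = 8 * 1024,    /* burst ceiling of the bucket */
        },
};

static br_tbf_t *
scan_tbf_setup (void)
{
        /* one bucket (plus token-generator thread) per opspec entry */
        return br_tbf_init (scan_spec,
                            sizeof (scan_spec) / sizeof (scan_spec[0]));
}

static void
scan_one_directory (br_tbf_t *tbf, unsigned long nr_entries)
{
        /* blocks if the READDIR bucket cannot cover nr_entries tokens */
        TBF_THROTTLE_BEGIN (tbf, BR_TBF_OP_READDIR, nr_entries);
        {
                /* ... perform the directory scan here ... */
        }
        TBF_THROTTLE_END (tbf, BR_TBF_OP_READDIR, nr_entries);
}

The same spec could later be passed to br_tbf_mod() to retune the rate at runtime; an op with no configured rate gets no bucket, and br_tbf_throttle() returns immediately in that case, so unthrottled operations pass through untouched.
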
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index d985cc4442c..880b16edfa8 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -27,6 +27,17 @@
#include "tw.h"
+#define BR_HASH_CALC_READ_SIZE (128 * 1024)
+
+br_tbf_opspec_t opthrottle[] = {
+ {
+ .op = BR_TBF_OP_HASH,
+ .rate = BR_HASH_CALC_READ_SIZE,
+ .maxlimit = (2 * BR_WORKERS * BR_HASH_CALC_READ_SIZE),
+ },
+ /** TODO: throttle getdents(), read() request(s) */
+};
+
static int
br_find_child_index (xlator_t *this, xlator_t *child)
{
@@ -288,8 +299,10 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
off_t offset, size_t size, SHA256_CTX *sha256)
{
int32_t ret = -1;
+ br_tbf_t *tbf = NULL;
struct iovec *iovec = NULL;
struct iobref *iobref = NULL;
+ br_private_t *priv = NULL;
int count = 0;
int i = 0;
@@ -297,6 +310,12 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
GF_VALIDATE_OR_GOTO (this->name, fd, out);
GF_VALIDATE_OR_GOTO (this->name, fd->inode, out);
GF_VALIDATE_OR_GOTO (this->name, child, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+
+ priv = this->private;
+
+ GF_VALIDATE_OR_GOTO (this->name, priv->tbf, out);
+ tbf = priv->tbf;
ret = syncop_readv (child->xl, fd,
size, offset, 0, &iovec, &count, &iobref, NULL,
@@ -313,9 +332,12 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
goto out;
for (i = 0; i < count; i++) {
- SHA256_Update (sha256,
- (const unsigned char *) (iovec[i].iov_base),
- iovec[i].iov_len);
+ TBF_THROTTLE_BEGIN (tbf, BR_TBF_OP_HASH, iovec[i].iov_len);
+ {
+ SHA256_Update (sha256, (const unsigned char *)
+ (iovec[i].iov_base), iovec[i].iov_len);
+ }
+ TBF_THROTTLE_END (tbf, BR_TBF_OP_HASH, iovec[i].iov_len);
}
out:
@@ -334,7 +356,7 @@ br_calculate_obj_checksum (unsigned char *md,
{
int32_t ret = -1;
off_t offset = 0;
- size_t block = 128 * 1024; /* 128K block size */
+ size_t block = BR_HASH_CALC_READ_SIZE;
xlator_t *this = NULL;
SHA256_CTX sha256;
@@ -1358,6 +1380,16 @@ br_init_signer (xlator_t *this, br_private_t *priv)
}
int32_t
+br_init_rate_limiter (br_private_t *priv)
+{
+ br_tbf_opspec_t *spec = opthrottle;
+ priv->tbf = br_tbf_init (spec, sizeof (opthrottle)
+ / sizeof (br_tbf_opspec_t));
+
+ return priv->tbf ? 0 : -1;
+}
+
+int32_t
init (xlator_t *this)
{
int i = 0;
@@ -1411,12 +1443,16 @@ init (xlator_t *this)
INIT_LIST_HEAD (&priv->children[i].list);
INIT_LIST_HEAD (&priv->bricks);
+ ret = br_init_rate_limiter (priv);
+ if (ret)
+ goto cleanup_mutex;
+
this->private = priv;
if (!priv->iamscrubber) {
ret = br_init_signer (this, priv);
if (ret)
- goto cleanup_mutex;
+ goto cleanup_tbf;
}
ret = gf_thread_create (&priv->thread, NULL, br_handle_events, this);
@@ -1433,6 +1469,7 @@ init (xlator_t *this)
return 0;
}
+ cleanup_tbf:
cleanup_mutex:
(void) pthread_cond_destroy (&priv->cond);
(void) pthread_mutex_destroy (&priv->lock);
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index a634a1fa76f..5b641801916 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -25,13 +25,18 @@
#include "changelog.h"
#include "timer-wheel.h"
+#include "bit-rot-tbf.h"
+
#include "bit-rot-common.h"
#include "bit-rot-stub-mem-types.h"
#include <openssl/sha.h>
-/* TODO: make this configurable */
-#define BR_WORKERS 8
+/**
+ * TODO: make this configurable. As a best practice, set this to the
+ * number of processor cores.
+ */
+#define BR_WORKERS 4
#define signature_size(hl) (sizeof (br_isignature_t) + hl + 1)
@@ -92,6 +97,8 @@ struct br_private {
the objects */
int32_t expiry_time; /* objects "wait" time */
+ br_tbf_t *tbf; /* token bucket filter */
+
gf_boolean_t iamscrubber; /* function as a fs scrubber */
};
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index 492278639b4..bb4030493db 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -22,6 +22,9 @@ enum br_mem_types {
gf_br_mt_br_child_t,
gf_br_mt_br_object_t,
gf_br_mt_br_ob_n_wk_t,
+ gf_br_mt_br_tbf_t,
+ gf_br_mt_br_tbf_bucket_t,
+ gf_br_mt_br_tbf_throttle_t,
gf_br_stub_mt_end
};
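
As a back-of-envelope check on the signer spec installed above (assuming one token per hashed byte, which is what the SHA256_Update call site passes): the BR_TBF_OP_HASH bucket earns BR_HASH_CALC_READ_SIZE = 128 KiB of tokens every BR_TBF_TOKENGEN_INTERVAL_USEC = 600 ms, so sustained hashing across all BR_WORKERS signer threads is capped at roughly 128 KiB / 0.6 s ≈ 213 KiB/s, with bursts of up to maxlimit = 2 × 4 × 128 KiB = 1 MiB once the bucket has filled.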