-rw-r--r--  xlators/features/bit-rot/src/bitd/Makefile.am               |   4
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot-tbf.c             | 306
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot-tbf.h             |  70
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot.c                 |  47
-rw-r--r--  xlators/features/bit-rot/src/bitd/bit-rot.h                 |  11
-rw-r--r--  xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h  |   3
6 files changed, 432 insertions, 9 deletions
diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am
index 160e3b653df..f67fa1a3acd 100644
--- a/xlators/features/bit-rot/src/bitd/Makefile.am
+++ b/xlators/features/bit-rot/src/bitd/Makefile.am
@@ -9,11 +9,11 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
                   -I$(CONTRIBDIR)/timer-wheel \
                   -I$(top_srcdir)/xlators/features/bit-rot/src/stub
 
-bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c
+bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c bit-rot-tbf.c
 
 bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
                     $(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
 
-noinst_HEADERS = bit-rot.h bit-rot-scrub.h
+noinst_HEADERS = bit-rot.h bit-rot-scrub.h bit-rot-tbf.h
 
 AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c
new file mode 100644
index 00000000000..d9543416540
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.c
@@ -0,0 +1,306 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+/**
+ *
+ * Basic token bucket implementation for rate limiting. As of now interfaces
+ * to throttle disk read request, directory entry scan and hash calculation
+ * are available. To throttle a particular request (operation), the call needs
+ * to be wrapped in-between throttling APIs, for e.g.
+ *
+ *  TBF_THROTTLE_BEGIN (...);  <-- induces "delays" if required
+ *  {
+ *      call (...);
+ *  }
+ *  TBF_THROTTLE_END (...);  <-- not used atm, maybe needed later
+ *
+ */
+
+#include "mem-pool.h"
+#include "bit-rot-tbf.h"
+#include "bit-rot-stub-mem-types.h"
+
+typedef struct br_tbf_throttle {
+        char done;
+
+        pthread_mutex_t mutex;
+        pthread_cond_t  cond;
+
+        unsigned long tokens;
+
+        struct list_head list;
+} br_tbf_throttle_t;
+
+/**
+ * OK. Most implementations of TBF I've come across generate tokens
+ * every second (UML, etc..) and some chose sub-second granularity
+ * (blk-iothrottle cgroups). TBF algorithm itself does not enforce
+ * any logic for choosing generation interval and it seems pretty
+ * logical as one could jack up token count per interval w.r.t.
+ * generation rate.
+ *
+ * Value used here is chosen based on a series of test(s) performed
+ * to balance object signing time and not maxing out on all available
+ * CPU cores. It's obvious to have seconds granularity and jack up
+ * token count per interval, thereby achieving close to similar
+ * results. Let's stick to this as it seems to be working fine for
+ * the set of ops that are throttled.
+ */
+#define BR_TBF_TOKENGEN_INTERVAL_USEC  600000
+
+static inline br_tbf_throttle_t *
+br_tbf_init_throttle (unsigned long tokens_required)
+{
+        br_tbf_throttle_t *throttle = NULL;
+
+        throttle = GF_CALLOC (1, sizeof (*throttle),
+                              gf_br_mt_br_tbf_throttle_t);
+        if (!throttle)
+                return NULL;
+
+        throttle->done = 0;
+        throttle->tokens = tokens_required;
+        INIT_LIST_HEAD (&throttle->list);
+
+        (void) pthread_mutex_init (&throttle->mutex, NULL);
+        (void) pthread_cond_init (&throttle->cond, NULL);
+
+        return throttle;
+}
+
+void
+_br_tbf_dispatch_queued (br_tbf_bucket_t *bucket)
+{
+        gf_boolean_t xcont = _gf_false;
+        br_tbf_throttle_t *tmp = NULL;
+        br_tbf_throttle_t *throttle = NULL;
+
+        list_for_each_entry_safe (throttle, tmp, &bucket->queued, list) {
+
+                pthread_mutex_lock (&throttle->mutex);
+                {
+                        if (bucket->tokens < throttle->tokens) {
+                                xcont = _gf_true;
+                                goto unblock;
+                        }
+
+                        /* this request can now be serviced */
+                        throttle->done = 1;
+                        list_del_init (&throttle->list);
+
+                        bucket->tokens -= throttle->tokens;
+                        pthread_cond_signal (&throttle->cond);
+                }
+        unblock:
+                pthread_mutex_unlock (&throttle->mutex);
+                if (xcont)
+                        break;
+        }
+}
+
+void *br_tbf_tokengenerator (void *arg)
+{
+        unsigned long tokenrate = 0;
+        unsigned long maxtokens = 0;
+        br_tbf_bucket_t *bucket = arg;
+
+        tokenrate = bucket->tokenrate;
+        maxtokens = bucket->maxtokens;
+
+        while (1) {
+                usleep (BR_TBF_TOKENGEN_INTERVAL_USEC);
+
+                LOCK (&bucket->lock);
+                {
+                        bucket->tokens += tokenrate;
+                        if (bucket->tokens > maxtokens)
+                                bucket->tokens = maxtokens;
+
+                        if (!list_empty (&bucket->queued))
+                                _br_tbf_dispatch_queued (bucket);
+                }
+                UNLOCK (&bucket->lock);
+        }
+
+        return NULL;
+}
+
+/**
+ * There is lazy synchronization between this routine (when invoked
+ * under br_tbf_mod() context) and br_tbf_throttle(). *bucket is
+ * updated _after_ all the required variables are initialized.
+ */
+static inline int32_t
+br_tbf_init_bucket (br_tbf_t *tbf, br_tbf_opspec_t *spec)
+{
+        int ret = 0;
+        br_tbf_bucket_t *curr = NULL;
+        br_tbf_bucket_t **bucket = NULL;
+
+        GF_ASSERT (spec->op >= BR_TBF_OP_MIN);
+        GF_ASSERT (spec->op <= BR_TBF_OP_MAX);
+
+        /* no rate? no throttling. */
+        if (!spec->rate)
+                return 0;
+
+        bucket = tbf->bucket + spec->op;
+
+        curr = GF_CALLOC (1, sizeof (*curr), gf_br_mt_br_tbf_bucket_t);
+        if (!curr)
+                goto error_return;
+
+        LOCK_INIT (&curr->lock);
+        INIT_LIST_HEAD (&curr->queued);
+
+        curr->tokens = 0;
+        curr->tokenrate = spec->rate;
+        curr->maxtokens = spec->maxlimit;
+
+        ret = gf_thread_create (&curr->tokener,
+                                NULL, br_tbf_tokengenerator, curr);
+        if (ret != 0)
+                goto freemem;
+
+        *bucket = curr;
+        return 0;
+
+ freemem:
+        LOCK_DESTROY (&curr->lock);
+        GF_FREE (curr);
+ error_return:
+        return -1;
+}
+
+#define BR_TBF_ALLOC_SIZE                                               \
+        (sizeof (br_tbf_t) + (BR_TBF_OP_MAX * sizeof (br_tbf_bucket_t)))
+
+br_tbf_t *
+br_tbf_init (br_tbf_opspec_t *tbfspec, unsigned int count)
+{
+        int32_t i = 0;
+        int32_t ret = 0;
+        br_tbf_t *tbf = NULL;
+        br_tbf_opspec_t *opspec = NULL;
+
+        tbf = GF_CALLOC (1, BR_TBF_ALLOC_SIZE, gf_br_mt_br_tbf_t);
+        if (!tbf)
+                goto error_return;
+
+        tbf->bucket = (br_tbf_bucket_t **) ((char *)tbf + sizeof (*tbf));
+        for (i = 0; i < BR_TBF_OP_MAX; i++) {
+                *(tbf->bucket + i) = NULL;
+        }
+
+        for (i = 0; i < count; i++) {
+                opspec = tbfspec + i;
+
+                ret = br_tbf_init_bucket (tbf, opspec);
+                if (ret)
+                        break;
+        }
+
+        if (ret)
+                goto error_return;
+
+        return tbf;
+
+ error_return:
+        return NULL;
+}
+
+static void
+br_tbf_mod_bucket (br_tbf_bucket_t *bucket, br_tbf_opspec_t *spec)
+{
+        LOCK (&bucket->lock);
+        {
+                bucket->tokens = 0;
+                bucket->tokenrate = spec->rate;
+                bucket->maxtokens = spec->maxlimit;
+        }
+        UNLOCK (&bucket->lock);
+
+        /* next token tick would unqueue pending operations */
+}
+
+int
+br_tbf_mod (br_tbf_t *tbf, br_tbf_opspec_t *tbfspec)
+{
+        int              ret    = 0;
+        br_tbf_bucket_t *bucket = NULL;
+        br_tbf_ops_t     op     = BR_TBF_OP_MIN;
+
+        if (!tbf || !tbfspec)
+                return -1;
+
+        op = tbfspec->op;
+
+        GF_ASSERT (op >= BR_TBF_OP_MIN);
+        GF_ASSERT (op <= BR_TBF_OP_MAX);
+
+        bucket = *(tbf->bucket + op);
+        if (bucket) {
+                br_tbf_mod_bucket (bucket, tbfspec);
+        } else {
+                ret = br_tbf_init_bucket (tbf, tbfspec);
+        }
+
+        return ret;
+}
+
+void
+br_tbf_throttle (br_tbf_t *tbf, br_tbf_ops_t op, unsigned long tokens_requested)
+{
+        char waitq = 0;
+        br_tbf_bucket_t *bucket = NULL;
+        br_tbf_throttle_t *throttle = NULL;
+
+        GF_ASSERT (op >= BR_TBF_OP_MIN);
+        GF_ASSERT (op <= BR_TBF_OP_MAX);
+
+        bucket = *(tbf->bucket + op);
+        if (!bucket)
+                return;
+
+        LOCK (&bucket->lock);
+        {
+                /**
+                 * if there are enough tokens in the bucket there is no need
+                 * to throttle the request: therefore, consume the required
+                 * number of tokens and continue.
+                 */
+                if (tokens_requested <= bucket->tokens) {
+                        bucket->tokens -= tokens_requested;
+                } else {
+                        throttle = br_tbf_init_throttle (tokens_requested);
+                        if (!throttle) /* let it slip through for now.. */
+                                goto unblock;
+
+                        waitq = 1;
+                        pthread_mutex_lock (&throttle->mutex);
+                        list_add_tail (&throttle->list, &bucket->queued);
+                }
+        }
+ unblock:
+        UNLOCK (&bucket->lock);
+
+        if (waitq) {
+                while (!throttle->done) {
+                        pthread_cond_wait (&throttle->cond, &throttle->mutex);
+                }
+
+                pthread_mutex_unlock (&throttle->mutex);
+
+                pthread_mutex_destroy (&throttle->mutex);
+                pthread_cond_destroy (&throttle->cond);
+
+                GF_FREE (throttle);
+        }
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h
new file mode 100644
index 00000000000..5a41be4fd95
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-tbf.h
@@ -0,0 +1,70 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "list.h"
+#include "xlator.h"
+#include "locking.h"
+
+#ifndef __BIT_ROT_TBF_H__
+#define __BIT_ROT_TBF_H__
+
+typedef enum br_tbf_ops {
+        BR_TBF_OP_MIN     = -1,
+        BR_TBF_OP_HASH    = 0,    /* checksum calculation  */
+        BR_TBF_OP_READ    = 1,    /* inode read(s)         */
+        BR_TBF_OP_READDIR = 2,    /* dentry read(s)        */
+        BR_TBF_OP_MAX     = 3,
+} br_tbf_ops_t;
+
+/**
+ * Operation rate specification
+ */
+typedef struct br_tbf_opspec {
+        br_tbf_ops_t op;
+
+        unsigned long rate;
+
+        unsigned long maxlimit;
+} br_tbf_opspec_t;
+
+/**
+ * Token bucket for each operation type
+ */
+typedef struct br_tbf_bucket {
+        gf_lock_t lock;
+
+        pthread_t tokener;         /* token generator thread          */
+
+        unsigned long tokenrate;   /* token generation rate           */
+
+        unsigned long tokens;      /* number of current tokens        */
+
+        unsigned long maxtokens;   /* maximum token in the bucket     */
+
+        struct list_head queued;   /* list of non-conformant requests */
+} br_tbf_bucket_t;
+
+typedef struct br_tbf {
+        br_tbf_bucket_t **bucket;
+} br_tbf_t;
+
+br_tbf_t *
+br_tbf_init (br_tbf_opspec_t *, unsigned int);
+
+int
+br_tbf_mod (br_tbf_t *, br_tbf_opspec_t *);
+
+void
+br_tbf_throttle (br_tbf_t *, br_tbf_ops_t, unsigned long);
+
+#define TBF_THROTTLE_BEGIN(tbf, op, tokens) (br_tbf_throttle (tbf, op, tokens))
+#define TBF_THROTTLE_END(tbf, op, tokens) (void)
+
+#endif /** __BIT_ROT_TBF_H__ */
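The header above is the entire public surface of the throttle: describe each operation with a br_tbf_opspec_t, hand the array to br_tbf_init (), and wrap the hot path in TBF_THROTTLE_BEGIN (). The sketch below shows how a caller might wire that up. It is only an illustration compiled against the declarations in bit-rot-tbf.h — setup_throttle (), hash_block_throttled () and retune_hash_rate () are names invented here, not part of the patch; the real consumer is the signer path in bit-rot.c further down.

    #include <stddef.h>
    #include "bit-rot-tbf.h"

    /* One bucket for checksum calculation: refill with 128KB worth of
     * tokens per tick, never hold more than 1MB. */
    static br_tbf_t *
    setup_throttle (void)
    {
            br_tbf_opspec_t spec[] = {
                    {
                            .op       = BR_TBF_OP_HASH,
                            .rate     = 128 * 1024,
                            .maxlimit = 8 * 128 * 1024,
                    },
            };

            /* spawns one token generator thread per configured op */
            return br_tbf_init (spec, sizeof (spec) / sizeof (spec[0]));
    }

    /* Checksum a block, paying one token per byte. br_tbf_throttle ()
     * blocks until enough tokens accumulate, and is a no-op when no
     * bucket was configured for the op. */
    static unsigned long
    hash_block_throttled (br_tbf_t *tbf, const unsigned char *buf, size_t len)
    {
            size_t i = 0;
            unsigned long sum = 0;

            TBF_THROTTLE_BEGIN (tbf, BR_TBF_OP_HASH, len);
            {
                    for (i = 0; i < len; i++)
                            sum += buf[i];    /* stand-in for SHA256_Update() */
            }
            /* TBF_THROTTLE_END is only a placeholder in this patch */

            return sum;
    }

    /* Rates can be retuned at runtime: br_tbf_mod () adjusts an existing
     * bucket or creates one if the op was not throttled before. */
    static int
    retune_hash_rate (br_tbf_t *tbf, unsigned long rate)
    {
            br_tbf_opspec_t spec = {
                    .op       = BR_TBF_OP_HASH,
                    .rate     = rate,
                    .maxlimit = 8 * rate,
            };

            return br_tbf_mod (tbf, &spec);
    }

Tokens have no fixed unit — br_tbf_throttle () simply charges whatever quantity the caller passes — so the granularity of the throttle is up to the consumer; the signer below charges bytes.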
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index d985cc4442c..880b16edfa8 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -27,6 +27,17 @@
 
 #include "tw.h"
 
+#define BR_HASH_CALC_READ_SIZE  (128 * 1024)
+
+br_tbf_opspec_t opthrottle[] = {
+        {
+                .op       = BR_TBF_OP_HASH,
+                .rate     = BR_HASH_CALC_READ_SIZE,
+                .maxlimit = (2 * BR_WORKERS * BR_HASH_CALC_READ_SIZE),
+        },
+        /** TODO: throttle getdents(), read() request(s) */
+};
+
 static int
 br_find_child_index (xlator_t *this, xlator_t *child)
 {
@@ -288,8 +299,10 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
                                off_t offset, size_t size, SHA256_CTX *sha256)
 {
         int32_t        ret    = -1;
+        br_tbf_t      *tbf    = NULL;
         struct iovec  *iovec  = NULL;
         struct iobref *iobref = NULL;
+        br_private_t  *priv   = NULL;
         int            count  = 0;
         int            i      = 0;
 
@@ -297,6 +310,12 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
         GF_VALIDATE_OR_GOTO (this->name, fd, out);
         GF_VALIDATE_OR_GOTO (this->name, fd->inode, out);
         GF_VALIDATE_OR_GOTO (this->name, child, out);
+        GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+
+        priv = this->private;
+
+        GF_VALIDATE_OR_GOTO (this->name, priv->tbf, out);
+        tbf = priv->tbf;
 
         ret = syncop_readv (child->xl, fd,
                             size, offset, 0, &iovec, &count, &iobref, NULL,
@@ -313,9 +332,12 @@
                 goto out;
 
         for (i = 0; i < count; i++) {
-                SHA256_Update (sha256,
-                               (const unsigned char *) (iovec[i].iov_base),
-                               iovec[i].iov_len);
+                TBF_THROTTLE_BEGIN (tbf, BR_TBF_OP_HASH, iovec[i].iov_len);
+                {
+                        SHA256_Update (sha256, (const unsigned char *)
+                                       (iovec[i].iov_base), iovec[i].iov_len);
+                }
+                TBF_THROTTLE_BEGIN (tbf, BR_TBF_OP_HASH, iovec[i].iov_len);
         }
 
 out:
@@ -334,7 +356,7 @@ br_calculate_obj_checksum (unsigned char *md,
 {
         int32_t   ret    = -1;
         off_t     offset = 0;
-        size_t    block  = 128 * 1024;  /* 128K block size */
+        size_t    block  = BR_HASH_CALC_READ_SIZE;
         xlator_t *this   = NULL;
 
         SHA256_CTX       sha256;
@@ -1358,6 +1380,16 @@ br_init_signer (xlator_t *this, br_private_t *priv)
 }
 
 int32_t
+br_init_rate_limiter (br_private_t *priv)
+{
+        br_tbf_opspec_t *spec = opthrottle;
+        priv->tbf = br_tbf_init (spec, sizeof (opthrottle)
+                                           / sizeof (br_tbf_opspec_t));
+
+        return priv->tbf ? 0 : -1;
+}
+
+int32_t
 init (xlator_t *this)
 {
         int            i    = 0;
@@ -1411,12 +1443,16 @@ init (xlator_t *this)
                 INIT_LIST_HEAD (&priv->children[i].list);
 
         INIT_LIST_HEAD (&priv->bricks);
+        ret = br_init_rate_limiter (priv);
+        if (ret)
+                goto cleanup_mutex;
+
 	this->private = priv;
 
         if (!priv->iamscrubber) {
                 ret = br_init_signer (this, priv);
                 if (ret)
-                        goto cleanup_mutex;
+                        goto cleanup_tbf;
         }
 
         ret = gf_thread_create (&priv->thread, NULL, br_handle_events, this);
@@ -1433,6 +1469,7 @@ init (xlator_t *this)
                 return 0;
         }
 
+ cleanup_tbf:
  cleanup_mutex:
         (void) pthread_cond_destroy (&priv->cond);
         (void) pthread_mutex_destroy (&priv->lock);
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index a634a1fa76f..5b641801916 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -25,13 +25,18 @@
 
 #include "changelog.h"
 #include "timer-wheel.h"
 
+#include "bit-rot-tbf.h"
+
 #include "bit-rot-common.h"
 #include "bit-rot-stub-mem-types.h"
 
 #include <openssl/sha.h>
 
-/* TODO: make this configurable */
-#define BR_WORKERS 8
+/**
+ * TODO: make this configurable. As a best practice, set this to the
+ * number of processor cores.
+ */
+#define BR_WORKERS 4
 
 #define signature_size(hl) (sizeof (br_isignature_t) + hl + 1)
@@ -92,6 +97,8 @@ struct br_private {
                                              the objects */
         int32_t expiry_time;              /* objects "wait" time */
 
+        br_tbf_t *tbf;                    /* token bucket filter */
+
         gf_boolean_t iamscrubber;         /* function as a fs scrubber */
 };
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index 492278639b4..bb4030493db 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -22,6 +22,9 @@ enum br_mem_types {
         gf_br_mt_br_child_t,
         gf_br_mt_br_object_t,
         gf_br_mt_br_ob_n_wk_t,
+        gf_br_mt_br_tbf_t,
+        gf_br_mt_br_tbf_bucket_t,
+        gf_br_mt_br_tbf_throttle_t,
         gf_br_stub_mt_end
 };
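To put numbers on the defaults — this is editorial arithmetic, not part of the commit — opthrottle[] refills the hash bucket with BR_HASH_CALC_READ_SIZE (128KB) worth of tokens every BR_TBF_TOKENGEN_INTERVAL_USEC (600ms), and br_object_read_block_and_sign () charges one token per byte it hashes:

    #include <stdio.h>

    /* constants copied from the patch above */
    #define BR_TBF_TOKENGEN_INTERVAL_USEC  600000
    #define BR_HASH_CALC_READ_SIZE         (128 * 1024)
    #define BR_WORKERS                     4

    int
    main (void)
    {
            double tick = BR_TBF_TOKENGEN_INTERVAL_USEC / 1000000.0;

            /* refill rate: 131072 / 0.6 ~= 218453 bytes/sec (~213KB/s); this
             * bounds how much data all signer workers can hash per second */
            printf ("refill rate : %.0f bytes/sec\n",
                    BR_HASH_CALC_READ_SIZE / tick);

            /* burst ceiling: 2 * 4 * 128KB = 1MB, i.e. at most eight full
             * read blocks are hashed back-to-back before callers queue up */
            printf ("burst limit : %d bytes\n",
                    2 * BR_WORKERS * BR_HASH_CALC_READ_SIZE);

            return 0;
    }

So the defaults cap background hashing at roughly 213KB/s with at most 1MB of burst, which matches the stated goal of the interval comment in bit-rot-tbf.c: keep object signing moving without saturating the available CPU cores.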
