10 files changed, 5064 insertions, 0 deletions
diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am
new file mode 100644
index 00000000000..6db800e6565
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/Makefile.am
@@ -0,0 +1,23 @@
+if WITH_SERVER
+xlator_LTLIBRARIES = bit-rot.la
+endif
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+bit_rot_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	-I$(top_srcdir)/rpc/xdr/src/ -I$(top_builddir)/rpc/xdr/src/ \
+	-I$(top_srcdir)/rpc/rpc-lib/src -I$(CONTRIBDIR)/timer-wheel \
+	-I$(top_srcdir)/xlators/features/bit-rot/src/stub
+
+bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c bit-rot-ssm.c \
+		     bit-rot-scrub-status.c
+bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
+
+noinst_HEADERS = bit-rot.h bit-rot-scrub.h bit-rot-bitd-messages.h bit-rot-ssm.h \
+		 bit-rot-scrub-status.h
+
+AM_CFLAGS = -Wall -DBR_RATE_LIMIT_SIGNER $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h
new file mode 100644
index 00000000000..5bc5103a27c
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h
@@ -0,0 +1,101 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+ */
+
+#ifndef _BITROT_BITD_MESSAGES_H_
+#define _BITROT_BITD_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(BITROT_BITD, BRB_MSG_FD_CREATE_FAILED, BRB_MSG_READV_FAILED,
+           BRB_MSG_BLOCK_READ_FAILED, BRB_MSG_CALC_CHECKSUM_FAILED,
+           BRB_MSG_NO_MEMORY, BRB_MSG_GET_SIGN_FAILED, BRB_MSG_SET_SIGN_FAILED,
+           BRB_MSG_OP_FAILED, BRB_MSG_READ_AND_SIGN_FAILED, BRB_MSG_SIGN_FAILED,
+           BRB_MSG_GET_SUBVOL_FAILED, BRB_MSG_SET_TIMER_FAILED,
+           BRB_MSG_GET_INFO_FAILED, BRB_MSG_PATH_FAILED, BRB_MSG_MARK_BAD_FILE,
+           BRB_MSG_TRIGGER_SIGN, BRB_MSG_REGISTER_FAILED,
+           BRB_MSG_CRAWLING_START, BRB_MSG_SPAWN_FAILED,
+           BRB_MSG_INVALID_SUBVOL_CHILD, BRB_MSG_SKIP_OBJECT, BRB_MSG_NO_CHILD,
+           BRB_MSG_CHECKSUM_MISMATCH, BRB_MSG_MARK_CORRUPTED,
+           BRB_MSG_CRAWLING_FINISH, BRB_MSG_CALC_ERROR, BRB_MSG_LOOKUP_FAILED,
+           BRB_MSG_PARTIAL_VERSION_PRESENCE, BRB_MSG_MEM_ACNT_FAILED,
+           BRB_MSG_TIMER_WHEEL_UNAVAILABLE, BRB_MSG_BITROT_LOADED,
+           BRB_MSG_SCALE_DOWN_FAILED, BRB_MSG_SCALE_UP_FAILED,
+           BRB_MSG_SCALE_DOWN_SCRUBBER, BRB_MSG_SCALING_UP_SCRUBBER,
+           BRB_MSG_UNKNOWN_THROTTLE, BRB_MSG_RATE_LIMIT_INFO,
+           BRB_MSG_SCRUB_INFO, BRB_MSG_CONNECTED_TO_BRICK, BRB_MSG_BRICK_INFO,
+           BRB_MSG_SUBVOL_CONNECT_FAILED, BRB_MSG_INVALID_SUBVOL,
+           BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, BRB_MSG_SCRUB_START,
+           BRB_MSG_SCRUB_FINISH, BRB_MSG_SCRUB_RUNNING,
+           BRB_MSG_SCRUB_RESCHEDULED, BRB_MSG_SCRUB_TUNABLE,
+           BRB_MSG_SCRUB_THREAD_CLEANUP, BRB_MSG_SCRUBBER_CLEANED,
+           BRB_MSG_GENERIC_SSM_INFO, BRB_MSG_ZERO_TIMEOUT_BUG,
+           BRB_MSG_BAD_OBJ_READDIR_FAIL, BRB_MSG_SSM_FAILED,
+           BRB_MSG_SCRUB_WAIT_FAILED, BRB_MSG_TRIGGER_SIGN_FAILED,
+           BRB_MSG_EVENT_UNHANDLED, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB,
+           BRB_MSG_THREAD_CREATION_FAILED, BRB_MSG_MEM_POOL_ALLOC,
+           BRB_MSG_SAVING_HASH_FAILED);
+
+#define BRB_MSG_FD_CREATE_FAILED_STR "failed to create fd for the inode"
+#define BRB_MSG_READV_FAILED_STR "readv failed"
+#define BRB_MSG_BLOCK_READ_FAILED_STR "reading block failed"
+#define BRB_MSG_NO_MEMORY_STR "failed to allocate memory"
+#define BRB_MSG_CALC_CHECKSUM_FAILED_STR "calculating checksum failed"
+#define BRB_MSG_GET_SIGN_FAILED_STR "failed to get the signature"
+#define BRB_MSG_SET_SIGN_FAILED_STR "signing failed"
+#define BRB_MSG_OP_FAILED_STR "failed on object"
+#define BRB_MSG_TRIGGER_SIGN_FAILED_STR "Could not trigger signing"
+#define BRB_MSG_READ_AND_SIGN_FAILED_STR "reading and signing of object failed"
+#define BRB_MSG_SET_TIMER_FAILED_STR "Failed to allocate object expiry timer"
+#define BRB_MSG_GET_SUBVOL_FAILED_STR                                          \
+    "failed to get the subvolume for the brick"
+#define BRB_MSG_PATH_FAILED_STR "path failed"
+#define BRB_MSG_SKIP_OBJECT_STR "Entry is marked corrupted. skipping"
+#define BRB_MSG_PARTIAL_VERSION_PRESENCE_STR                                   \
+    "PArtial version xattr presence detected, ignoring"
+#define BRB_MSG_TRIGGER_SIGN_STR "Triggering signing"
+#define BRB_MSG_CRAWLING_START_STR                                             \
+    "Crawling brick, scanning for unsigned objects"
+#define BRB_MSG_CRAWLING_FINISH_STR "Completed crawling brick"
+#define BRB_MSG_REGISTER_FAILED_STR "Register to changelog failed"
+#define BRB_MSG_SPAWN_FAILED_STR "failed to spawn"
+#define BRB_MSG_CONNECTED_TO_BRICK_STR "Connected to brick"
+#define BRB_MSG_LOOKUP_FAILED_STR "lookup on root failed"
+#define BRB_MSG_GET_INFO_FAILED_STR "failed to get stub info"
+#define BRB_MSG_SCRUB_THREAD_CLEANUP_STR "Error cleaning up scanner thread"
+#define BRB_MSG_SCRUBBER_CLEANED_STR "clened up scrubber for brick"
+#define BRB_MSG_SUBVOL_CONNECT_FAILED_STR                                      \
+    "callback handler for subvolume failed"
+#define BRB_MSG_MEM_ACNT_FAILED_STR "Memory accounting init failed"
+#define BRB_MSG_EVENT_UNHANDLED_STR "Event unhandled for child"
+#define BRB_MSG_INVALID_SUBVOL_STR "Got event from invalid subvolume"
+#define BRB_MSG_RESCHEDULE_SCRUBBER_FAILED_STR                                 \
+    "on demand scrub schedule failed. Scrubber is not in pending state."
+#define BRB_MSG_COULD_NOT_SCHEDULE_SCRUB_STR                                   \
+    "Could not schedule ondemand scrubbing. Scrubbing will continue "          \
+    "according to old frequency."
+#define BRB_MSG_THREAD_CREATION_FAILED_STR "thread creation failed"
+#define BRB_MSG_RATE_LIMIT_INFO_STR "Rate Limit Info"
+#define BRB_MSG_MEM_POOL_ALLOC_STR "failed to allocate mem-pool for timer"
+#define BRB_MSG_NO_CHILD_STR "FATAL: no children"
+#define BRB_MSG_TIMER_WHEEL_UNAVAILABLE_STR "global timer wheel unavailable"
+#define BRB_MSG_BITROT_LOADED_STR "bit-rot xlator loaded"
+#define BRB_MSG_SAVING_HASH_FAILED_STR                                         \
+    "failed to allocate memory for saving hash of the object"
+#endif /* !_BITROT_BITD_MESSAGES_H_ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c
new file mode 100644
index 00000000000..5cef2ffa5e5
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c
@@ -0,0 +1,78 @@
+/*
+  Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+#include <stdio.h>
+
+#include "bit-rot-scrub-status.h"
+
+void
+br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat)
+{
+    if (!scrub_stat)
+        return;
+
+    pthread_mutex_lock(&scrub_stat->lock);
+    {
+        scrub_stat->unsigned_files++;
+    }
+    pthread_mutex_unlock(&scrub_stat->lock);
+}
+
+void
+br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat)
+{
+    if (!scrub_stat)
+        return;
+
+    pthread_mutex_lock(&scrub_stat->lock);
+    {
+        scrub_stat->scrubbed_files++;
+    }
+    pthread_mutex_unlock(&scrub_stat->lock);
+}
+
+void
+br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time)
+{
+    if (!scrub_stat)
+        return;
+
+    pthread_mutex_lock(&scrub_stat->lock);
+    {
+        scrub_stat->scrub_start_time = time;
+    }
+    pthread_mutex_unlock(&scrub_stat->lock);
+}
+
+void
+br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr,
+                            time_t time)
+{
+    int lst_size = 0;
+
+    if (!scrub_stat)
+        return;
+
+    lst_size = sizeof(scrub_stat->last_scrub_time);
+    if (strlen(timestr) >= lst_size)
+        return;
+
+    pthread_mutex_lock(&scrub_stat->lock);
+    {
+        scrub_stat->scrub_end_time = time;
+
+        scrub_stat->scrub_duration = scrub_stat->scrub_end_time -
+                                     scrub_stat->scrub_start_time;
+
+        snprintf(scrub_stat->last_scrub_time, lst_size, "%s", timestr);
+    }
+    pthread_mutex_unlock(&scrub_stat->lock);
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h
new file mode 100644
index 00000000000..f022aa831eb
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h
@@ -0,0 +1,50 @@
+/*
+   Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_SCRUB_STATUS_H__
+#define __BIT_ROT_SCRUB_STATUS_H__
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#include <glusterfs/common-utils.h>
+
+struct br_scrub_stats {
+    uint64_t scrubbed_files; /* Total number of scrubbed files. */
+
+    uint64_t unsigned_files; /* Total number of unsigned files. */
+
+    uint64_t scrub_duration; /* Duration of last scrub. */
+
+    char last_scrub_time[GF_TIMESTR_SIZE]; /* Last scrub completion time. */
+
+    time_t scrub_start_time; /* Scrubbing starting time. */
+
+    time_t scrub_end_time; /* Scrubbing finishing time. */
+
+    int8_t scrub_running; /* Whether scrub running or not. */
+
+    pthread_mutex_t lock;
+};
+
+typedef struct br_scrub_stats br_scrub_stats_t;
+
+void
+br_inc_unsigned_file_count(br_scrub_stats_t *scrub_stat);
+void
+br_inc_scrubbed_file(br_scrub_stats_t *scrub_stat);
+void
+br_update_scrub_start_time(br_scrub_stats_t *scrub_stat, time_t time);
+void
+br_update_scrub_finish_time(br_scrub_stats_t *scrub_stat, char *timestr,
+                            time_t time);
+
+#endif /* __BIT_ROT_SCRUB_STATUS_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
new file mode 100644
index 00000000000..289dd53f610
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.c
@@ -0,0 +1,2070 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <math.h>
+#include <ctype.h>
+#include <sys/uio.h>
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/common-utils.h>
+
+#include "bit-rot-scrub.h"
+#include <pthread.h>
+#include "bit-rot-bitd-messages.h"
+#include "bit-rot-scrub-status.h"
+#include <glusterfs/events.h>
+
+struct br_scrubbers {
+    pthread_t scrubthread;
+
+    struct list_head list;
+};
+
+struct br_fsscan_entry {
+    void *data;
+
+    loc_t parent;
+
+    gf_dirent_t *entry;
+
+    struct br_scanfs *fsscan; /* backpointer to subvolume scanner */
+
+    struct list_head list;
+};
+
+/**
+ * fetch signature extended attribute from an object's fd.
+ * NOTE: On success @xattr is not unref'd as @sign points
+ * to the dictionary value.
+ */
+static int32_t
+bitd_fetch_signature(xlator_t *this, br_child_t *child, fd_t *fd,
+                     dict_t **xattr, br_isignature_out_t **sign)
+{
+    int32_t ret = -1;
+
+    ret = syncop_fgetxattr(child->xl, fd, xattr, GLUSTERFS_GET_OBJECT_SIGNATURE,
+                           NULL, NULL);
+    if (ret < 0) {
+        br_log_object(this, "fgetxattr", fd->inode->gfid, -ret);
+        goto out;
+    }
+
+    ret = dict_get_ptr(*xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)sign);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+               "failed to extract signature info [GFID: %s]",
+               uuid_utoa(fd->inode->gfid));
+        goto unref_dict;
+    }
+
+    return 0;
+
+unref_dict:
+    dict_unref(*xattr);
+out:
+    return -1;
+}
+
+/**
+ * POST COMPUTE CHECK
+ *
+ * Checks to be performed before verifying calculated signature
+ * Object is skipped if:
+ *  - has stale signature
+ *  - mismatches versions caches in pre-compute check
+ */
+
+int32_t
+bitd_scrub_post_compute_check(xlator_t *this, br_child_t *child, fd_t *fd,
+                              unsigned long version,
+                              br_isignature_out_t **signature,
+                              br_scrub_stats_t *scrub_stat,
+                              gf_boolean_t skip_stat)
+{
+    int32_t ret = 0;
+    size_t signlen = 0;
+    dict_t *xattr = NULL;
+    br_isignature_out_t *signptr = NULL;
+
+    ret = bitd_fetch_signature(this, child, fd, &xattr, &signptr);
+    if (ret < 0) {
+        if (!skip_stat)
+            br_inc_unsigned_file_count(scrub_stat);
+        goto out;
+    }
+
+    /**
+     * Either the object got dirtied during the time the signature was
+     * calculated OR the version we saved during pre-compute check does
+     * not match now, implying that the object got dirtied and signed in
+     * between scrubs pre & post compute checks (checksum window).
+     *
+     * The log entry looks pretty ugly, but helps in debugging..
+     */
+    if (signptr->stale || (signptr->version != version)) {
+        if (!skip_stat)
+            br_inc_unsigned_file_count(scrub_stat);
+        gf_msg_debug(this->name, 0,
+                     "<STAGE: POST> Object [GFID: %s] "
+                     "either has a stale signature OR underwent "
+                     "signing during checksumming {Stale: %d | "
+                     "Version: %lu,%lu}",
+                     uuid_utoa(fd->inode->gfid), (signptr->stale) ? 1 : 0,
+                     version, signptr->version);
+        ret = -1;
+        goto unref_dict;
+    }
+
+    signlen = signptr->signaturelen;
+    *signature = GF_MALLOC(sizeof(br_isignature_out_t) + signlen,
+                           gf_common_mt_char);
+
+    (void)memcpy(*signature, signptr, sizeof(br_isignature_out_t) + signlen);
+
+    (*signature)->signaturelen = signlen;
+
+unref_dict:
+    dict_unref(xattr);
+out:
+    return ret;
+}
+
+static int32_t
+bitd_signature_staleness(xlator_t *this, br_child_t *child, fd_t *fd,
+                         int *stale, unsigned long *version,
+                         br_scrub_stats_t *scrub_stat, gf_boolean_t skip_stat)
+{
+    int32_t ret = -1;
+    dict_t *xattr = NULL;
+    br_isignature_out_t *signptr = NULL;
+
+    ret = bitd_fetch_signature(this, child, fd, &xattr, &signptr);
+    if (ret < 0) {
+        if (!skip_stat)
+            br_inc_unsigned_file_count(scrub_stat);
+        goto out;
+    }
+
+    /**
+     * save version for validation in post compute stage
+     * c.f. bitd_scrub_post_compute_check()
+     */
+    *stale = signptr->stale ? 1 : 0;
+    *version = signptr->version;
+
+    dict_unref(xattr);
+
+out:
+    return ret;
+}
+
+/**
+ * PRE COMPUTE CHECK
+ *
+ * Checks to be performed before initiating object signature calculation.
+ * An object is skipped if:
+ *  - it's already marked corrupted
+ *  - has stale signature
+ */
+int32_t
+bitd_scrub_pre_compute_check(xlator_t *this, br_child_t *child, fd_t *fd,
+                             unsigned long *version,
+                             br_scrub_stats_t *scrub_stat,
+                             gf_boolean_t skip_stat)
+{
+    int stale = 0;
+    int32_t ret = -1;
+
+    if (bitd_is_bad_file(this, child, NULL, fd)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT,
+               "Object [GFID: %s] is marked corrupted, skipping..",
+               uuid_utoa(fd->inode->gfid));
+        goto out;
+    }
+
+    ret = bitd_signature_staleness(this, child, fd, &stale, version, scrub_stat,
+                                   skip_stat);
+    if (!ret && stale) {
+        if (!skip_stat)
+            br_inc_unsigned_file_count(scrub_stat);
+        gf_msg_debug(this->name, 0,
+                     "<STAGE: PRE> Object [GFID: %s] "
+                     "has stale signature",
+                     uuid_utoa(fd->inode->gfid));
+        ret = -1;
+    }
+
+out:
+    return ret;
+}
+
+/* static int */
+int
+bitd_compare_ckum(xlator_t *this, br_isignature_out_t *sign, unsigned char *md,
+                  inode_t *linked_inode, gf_dirent_t *entry, fd_t *fd,
+                  br_child_t *child, loc_t *loc)
+{
+    int ret = -1;
+    dict_t *xattr = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, sign, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, linked_inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, md, out);
+    GF_VALIDATE_OR_GOTO(this->name, entry, out);
+
+    if (strncmp(sign->signature, (char *)md, sign->signaturelen) == 0) {
+        gf_msg_debug(this->name, 0,
+                     "%s [GFID: %s | Brick: %s] "
+                     "matches calculated checksum",
+                     loc->path, uuid_utoa(linked_inode->gfid),
+                     child->brick_path);
+        return 0;
+    }
+
+    gf_msg(this->name, GF_LOG_DEBUG, 0, BRB_MSG_CHECKSUM_MISMATCH,
+           "Object checksum mismatch: %s [GFID: %s | Brick: %s]", loc->path,
+           uuid_utoa(linked_inode->gfid), child->brick_path);
+    gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_CHECKSUM_MISMATCH,
+           "CORRUPTION DETECTED: Object %s {Brick: %s | GFID: %s}", loc->path,
+           child->brick_path, uuid_utoa(linked_inode->gfid));
+
+    /* Perform bad-file marking */
+    xattr = dict_new();
+    if (!xattr) {
+        ret = -1;
+        goto out;
+    }
+
+    ret = dict_set_int32(xattr, BITROT_OBJECT_BAD_KEY, _gf_true);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE,
+               "Error setting bad-file marker for %s [GFID: %s | "
+               "Brick: %s]",
+               loc->path, uuid_utoa(linked_inode->gfid), child->brick_path);
+        goto dictfree;
+    }
+
+    gf_msg(this->name, GF_LOG_ALERT, 0, BRB_MSG_MARK_CORRUPTED,
+           "Marking"
+           " %s [GFID: %s | Brick: %s] as corrupted..",
+           loc->path, uuid_utoa(linked_inode->gfid), child->brick_path);
+    gf_event(EVENT_BITROT_BAD_FILE, "gfid=%s;path=%s;brick=%s",
+             uuid_utoa(linked_inode->gfid), loc->path, child->brick_path);
+    ret = syncop_fsetxattr(child->xl, fd, xattr, 0, NULL, NULL);
+    if (ret)
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_MARK_BAD_FILE,
+               "Error marking object %s [GFID: %s] as corrupted", loc->path,
+               uuid_utoa(linked_inode->gfid));
+
+dictfree:
+    dict_unref(xattr);
+out:
+    return ret;
+}
+
+/**
+ * "The Scrubber"
+ *
+ * Perform signature validation for a given object with the assumption
+ * that the signature is SHA256 (because signer as of now _always_
+ * signs with SHA256).
+ */
+int
+br_scrubber_scrub_begin(xlator_t *this, struct br_fsscan_entry *fsentry)
+{
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+    struct iatt parent_buf = {
+        0,
+    };
+    pid_t pid = 0;
+    br_child_t *child = NULL;
+    unsigned char *md = NULL;
+    inode_t *linked_inode = NULL;
+    br_isignature_out_t *sign = NULL;
+    unsigned long signedversion = 0;
+    gf_dirent_t *entry = NULL;
+    br_private_t *priv = NULL;
+    loc_t *parent = NULL;
+    gf_boolean_t skip_stat = _gf_false;
+    uuid_t shard_root_gfid = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("bit-rot", fsentry, out);
+
+    entry = fsentry->entry;
+    parent = &fsentry->parent;
+    child = fsentry->data;
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", entry, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", parent, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", child, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", priv, out);
+
+    pid = GF_CLIENT_PID_SCRUB;
+
+    ret = br_prepare_loc(this, child, parent, entry, &loc);
+    if (!ret)
+        goto out;
+
+    syncopctx_setfspid(&pid);
+
+    ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL);
+    if (ret) {
+        br_log_object_path(this, "lookup", loc.path, -ret);
+        goto out;
+    }
+
+    linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt);
+    if (linked_inode)
+        inode_lookup(linked_inode);
+
+    gf_msg_debug(this->name, 0, "Scrubbing object %s [GFID: %s]", entry->d_name,
+                 uuid_utoa(linked_inode->gfid));
+
+    if (iatt.ia_type != IA_IFREG) {
+        gf_msg_debug(this->name, 0, "%s is not a regular file", entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    if (IS_DHT_LINKFILE_MODE((&iatt))) {
+        gf_msg_debug(this->name, 0, "%s is a dht sticky bit file",
+                     entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    /* skip updating scrub statistics for shard entries */
+    gf_uuid_parse(SHARD_ROOT_GFID, shard_root_gfid);
+    if (gf_uuid_compare(loc.pargfid, shard_root_gfid) == 0)
+        skip_stat = _gf_true;
+
+    /**
+     * open() an fd for subsequent operations
+     */
+    fd = fd_create(linked_inode, 0);
+    if (!fd) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+               "failed to create fd for inode %s",
+               uuid_utoa(linked_inode->gfid));
+        goto unref_inode;
+    }
+
+    ret = syncop_open(child->xl, &loc, O_RDWR, fd, NULL, NULL);
+    if (ret) {
+        br_log_object(this, "open", linked_inode->gfid, -ret);
+        ret = -1;
+        goto unrefd;
+    }
+
+    fd_bind(fd);
+
+    /**
+     * perform pre compute checks before initiating checksum
+     * computation
+     *  - presence of bad object
+     *  - signature staleness
+     */
+    ret = bitd_scrub_pre_compute_check(this, child, fd, &signedversion,
+                                       &priv->scrub_stat, skip_stat);
+    if (ret)
+        goto unrefd; /* skip this object */
+
+    /* if all's good, proceed to calculate the hash */
+    md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char);
+    if (!md)
+        goto unrefd;
+
+    ret = br_calculate_obj_checksum(md, child, fd, &iatt);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_ERROR,
+               "error calculating hash for object [GFID: %s]",
+               uuid_utoa(fd->inode->gfid));
+        ret = -1;
+        goto free_md;
+    }
+
+    /**
+     * perform post compute checks as an object's signature may have
+     * become stale while scrubber calculated checksum.
+     */
+    ret = bitd_scrub_post_compute_check(this, child, fd, signedversion, &sign,
+                                        &priv->scrub_stat, skip_stat);
+    if (ret)
+        goto free_md;
+
+    ret = bitd_compare_ckum(this, sign, md, linked_inode, entry, fd, child,
+                            &loc);
+
+    if (!skip_stat)
+        br_inc_scrubbed_file(&priv->scrub_stat);
+
+    GF_FREE(sign); /* allocated on post-compute */
+
+    /** fd_unref() takes care of closing fd.. like syncop_close() */
+
+free_md:
+    GF_FREE(md);
+unrefd:
+    fd_unref(fd);
+unref_inode:
+    inode_unref(linked_inode);
+out:
+    loc_wipe(&loc);
+    return ret;
+}
+
+static void
+_br_lock_cleaner(void *arg)
+{
+    pthread_mutex_t *mutex = arg;
+
+    pthread_mutex_unlock(mutex);
+}
+
+static void
+wait_for_scrubbing(xlator_t *this, struct br_scanfs *fsscan)
+{
+    br_private_t *priv = NULL;
+    struct br_scrubber *fsscrub = NULL;
+
+    priv = this->private;
+    fsscrub = &priv->fsscrub;
+
+    pthread_cleanup_push(_br_lock_cleaner, &fsscan->waitlock);
+    pthread_mutex_lock(&fsscan->waitlock);
+    {
+        pthread_cleanup_push(_br_lock_cleaner, &fsscrub->mutex);
+        pthread_mutex_lock(&fsscrub->mutex);
+        {
+            list_replace_init(&fsscan->queued, &fsscan->ready);
+
+            /* wake up scrubbers */
+            pthread_cond_broadcast(&fsscrub->cond);
+        }
+        pthread_mutex_unlock(&fsscrub->mutex);
+        pthread_cleanup_pop(0);
+
+        while (fsscan->entries != 0)
+            pthread_cond_wait(&fsscan->waitcond, &fsscan->waitlock);
+    }
+    pthread_mutex_unlock(&fsscan->waitlock);
+    pthread_cleanup_pop(0);
+}
+
+static void
+_br_fsscan_inc_entry_count(struct br_scanfs *fsscan)
+{
+    fsscan->entries++;
+}
+
+static void
+_br_fsscan_dec_entry_count(struct br_scanfs *fsscan)
+{
+    if (--fsscan->entries == 0) {
+        pthread_mutex_lock(&fsscan->waitlock);
+        {
+            pthread_cond_signal(&fsscan->waitcond);
+        }
+        pthread_mutex_unlock(&fsscan->waitlock);
+    }
+}
+
+static void
+_br_fsscan_collect_entry(struct br_scanfs *fsscan,
+                         struct br_fsscan_entry *fsentry)
+{
+    list_add_tail(&fsentry->list, &fsscan->queued);
+    _br_fsscan_inc_entry_count(fsscan);
+}
+
+#define NR_ENTRIES (1 << 7) /* ..bulk scrubbing */
+
+int
+br_fsscanner_handle_entry(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                          void *data)
+{
+    int32_t ret = -1;
+    int scrub = 0;
+    br_child_t *child = NULL;
+    xlator_t *this = NULL;
+    struct br_scanfs *fsscan = NULL;
+    struct br_fsscan_entry *fsentry = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", subvol, error_return);
+    GF_VALIDATE_OR_GOTO("bit-rot", data, error_return);
+
+    child = data;
+    this = child->this;
+    fsscan = &child->fsscan;
+
+    _mask_cancellation();
+
+    fsentry = GF_CALLOC(1, sizeof(*fsentry), gf_br_mt_br_fsscan_entry_t);
+    if (!fsentry)
+        goto error_return;
+
+    {
+        fsentry->data = data;
+        fsentry->fsscan = &child->fsscan;
+
+        /* copy parent loc */
+        ret = loc_copy(&fsentry->parent, parent);
+        if (ret)
+            goto dealloc;
+
+        /* copy child entry */
+        fsentry->entry = entry_copy(entry);
+        if (!fsentry->entry)
+            goto locwipe;
+
+        INIT_LIST_HEAD(&fsentry->list);
+    }
+
+    LOCK(&fsscan->entrylock);
+    {
+        _br_fsscan_collect_entry(fsscan, fsentry);
+
+        /**
+         * need not be a equality check as entries may be pushed
+         * back onto the scanned queue when thread(s) are cleaned.
+         */
+        if (fsscan->entries >= NR_ENTRIES)
+            scrub = 1;
+    }
+    UNLOCK(&fsscan->entrylock);
+
+    _unmask_cancellation();
+
+    if (scrub)
+        wait_for_scrubbing(this, fsscan);
+
+    return 0;
+
+locwipe:
+    loc_wipe(&fsentry->parent);
+dealloc:
+    GF_FREE(fsentry);
+error_return:
+    return -1;
+}
+
+int32_t
+br_fsscan_deactivate(xlator_t *this)
+{
+    int ret = 0;
+    br_private_t *priv = NULL;
+    br_scrub_state_t nstate = 0;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    ret = gf_tw_del_timer(priv->timer_wheel, scrub_monitor->timer);
+    if (ret == 0) {
+        nstate = BR_SCRUB_STATE_STALLED;
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Volume is under active scrubbing. Pausing scrub..");
+    } else {
+        nstate = BR_SCRUB_STATE_PAUSED;
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubber paused");
+    }
+
+    _br_monitor_set_scrub_state(scrub_monitor, nstate);
+
+    return 0;
+}
+
+static void
+br_scrubber_log_time(xlator_t *this, const char *sfx)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    br_private_t *priv = NULL;
+    time_t now = 0;
+
+    now = gf_time();
+    priv = this->private;
+
+    gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT);
+
+    if (strcasecmp(sfx, "started") == 0) {
+        br_update_scrub_start_time(&priv->scrub_stat, now);
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_START,
+               "Scrubbing %s at %s", sfx, timestr);
+    } else {
+        br_update_scrub_finish_time(&priv->scrub_stat, timestr, now);
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_FINISH,
+               "Scrubbing %s at %s", sfx, timestr);
+    }
+}
+
+static void
+br_fsscanner_log_time(xlator_t *this, br_child_t *child, const char *sfx)
+{
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    time_t now = 0;
+
+    now = gf_time();
+    gf_time_fmt(timestr, sizeof(timestr), now, gf_timefmt_FT);
+
+    if (strcasecmp(sfx, "started") == 0) {
+        gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s",
+                     child->brick_path, sfx, timestr);
+    } else {
+        gf_msg_debug(this->name, 0, "Scrubbing \"%s\" %s at %s",
+                     child->brick_path, sfx, timestr);
+    }
+}
+
+void
+br_child_set_scrub_state(br_child_t *child, gf_boolean_t state)
+{
+    child->active_scrubbing = state;
+}
+
+static void
+br_fsscanner_wait_until_kicked(xlator_t *this, br_child_t *child)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->wakelock);
+    pthread_mutex_lock(&scrub_monitor->wakelock);
+    {
+        while (!scrub_monitor->kick)
+            pthread_cond_wait(&scrub_monitor->wakecond,
+                              &scrub_monitor->wakelock);
+
+        /* Child lock is to synchronize with disconnect events */
+        pthread_cleanup_push(_br_lock_cleaner, &child->lock);
+        pthread_mutex_lock(&child->lock);
+        {
+            scrub_monitor->active_child_count++;
+            br_child_set_scrub_state(child, _gf_true);
+        }
+        pthread_mutex_unlock(&child->lock);
+        pthread_cleanup_pop(0);
+    }
+    pthread_mutex_unlock(&scrub_monitor->wakelock);
+    pthread_cleanup_pop(0);
+}
+
+static void
+br_scrubber_entry_control(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    LOCK(&scrub_monitor->lock);
+    {
+        /* Move the state to BR_SCRUB_STATE_ACTIVE */
+        if (scrub_monitor->state == BR_SCRUB_STATE_PENDING)
+            scrub_monitor->state = BR_SCRUB_STATE_ACTIVE;
+        br_scrubber_log_time(this, "started");
+        priv->scrub_stat.scrub_running = 1;
+    }
+    UNLOCK(&scrub_monitor->lock);
+}
+
+static void
+br_scrubber_exit_control(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    LOCK(&scrub_monitor->lock);
+    {
+        br_scrubber_log_time(this, "finished");
+        priv->scrub_stat.scrub_running = 0;
+
+        if (scrub_monitor->state == BR_SCRUB_STATE_ACTIVE) {
+            (void)br_fsscan_activate(this);
+        } else {
+            UNLOCK(&scrub_monitor->lock);
+            gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+                   "Volume waiting to get rescheduled..");
+            return;
+        }
+    }
+    UNLOCK(&scrub_monitor->lock);
+}
+
+static void
+br_fsscanner_entry_control(xlator_t *this, br_child_t *child)
+{
+    br_fsscanner_log_time(this, child, "started");
+}
+
+static void
+br_fsscanner_exit_control(xlator_t *this, br_child_t *child)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    if (!_br_is_child_connected(child)) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCRUB_INFO,
+               "Brick [%s] disconnected while scrubbing. Scrubbing "
+               "might be incomplete",
+               child->brick_path);
+    }
+
+    br_fsscanner_log_time(this, child, "finished");
+
+    pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->wakelock);
+    pthread_mutex_lock(&scrub_monitor->wakelock);
+    {
+        scrub_monitor->active_child_count--;
+        pthread_cleanup_push(_br_lock_cleaner, &child->lock);
+        pthread_mutex_lock(&child->lock);
+        {
+            br_child_set_scrub_state(child, _gf_false);
+        }
+        pthread_mutex_unlock(&child->lock);
+        pthread_cleanup_pop(0);
+
+        if (scrub_monitor->active_child_count == 0) {
+            /* The last child has finished scrubbing.
+             * Set the kick to false and  wake up other
+             * children who are waiting for the last
+             * child to complete scrubbing.
+             */
+            scrub_monitor->kick = _gf_false;
+            pthread_cond_broadcast(&scrub_monitor->wakecond);
+
+            /* Signal monitor thread waiting for the all
+             * the children to finish scrubbing.
+             */
+            pthread_cleanup_push(_br_lock_cleaner, &scrub_monitor->donelock);
+            pthread_mutex_lock(&scrub_monitor->donelock);
+            {
+                scrub_monitor->done = _gf_true;
+                pthread_cond_signal(&scrub_monitor->donecond);
+            }
+            pthread_mutex_unlock(&scrub_monitor->donelock);
+            pthread_cleanup_pop(0);
+        } else {
+            while (scrub_monitor->active_child_count)
+                pthread_cond_wait(&scrub_monitor->wakecond,
+                                  &scrub_monitor->wakelock);
+        }
+    }
+    pthread_mutex_unlock(&scrub_monitor->wakelock);
+    pthread_cleanup_pop(0);
+}
+
+void *
+br_fsscanner(void *arg)
+{
+    loc_t loc = {
+        0,
+    };
+    br_child_t *child = NULL;
+    xlator_t *this = NULL;
+    struct br_scanfs *fsscan = NULL;
+
+    child = arg;
+    this = child->this;
+    fsscan = &child->fsscan;
+
+    THIS = this;
+    loc.inode = child->table->root;
+
+    while (1) {
+        br_fsscanner_wait_until_kicked(this, child);
+        {
+            /* precursor for scrub */
+            br_fsscanner_entry_control(this, child);
+
+            /* scrub */
+            (void)syncop_ftw(child->xl, &loc, GF_CLIENT_PID_SCRUB, child,
+                             br_fsscanner_handle_entry);
+            if (!list_empty(&fsscan->queued))
+                wait_for_scrubbing(this, fsscan);
+
+            /* scrub exit criteria */
+            br_fsscanner_exit_control(this, child);
+        }
+    }
+
+    return NULL;
+}
+
+/**
+ * Keep this routine extremely simple and do not ever try to acquire
+ * child->lock here: it may lead to deadlock. Scrubber state is
+ * modified in br_fsscanner(). An intermediate state change to pause
+ * changes the scrub state to the _correct_ state by identifying a
+ * non-pending timer.
+ */
+void
+br_kickstart_scanner(struct gf_tw_timer_list *timer, void *data,
+                     unsigned long calltime)
+{
+    xlator_t *this = NULL;
+    struct br_monitor *scrub_monitor = data;
+    br_private_t *priv = NULL;
+
+    THIS = this = scrub_monitor->this;
+    priv = this->private;
+
+    /* Reset scrub statistics */
+    priv->scrub_stat.scrubbed_files = 0;
+    priv->scrub_stat.unsigned_files = 0;
+
+    /* Moves state from PENDING to ACTIVE */
+    (void)br_scrubber_entry_control(this);
+
+    /* kickstart scanning.. */
+    pthread_mutex_lock(&scrub_monitor->wakelock);
+    {
+        scrub_monitor->kick = _gf_true;
+        GF_ASSERT(scrub_monitor->active_child_count == 0);
+        pthread_cond_broadcast(&scrub_monitor->wakecond);
+    }
+    pthread_mutex_unlock(&scrub_monitor->wakelock);
+
+    return;
+}
+
+static uint32_t
+br_fsscan_calculate_delta(uint32_t times)
+{
+    return times;
+}
+
+#define BR_SCRUB_ONDEMAND (1)
+#define BR_SCRUB_MINUTE (60)
+#define BR_SCRUB_HOURLY (60 * 60)
+#define BR_SCRUB_DAILY (1 * 24 * 60 * 60)
+#define BR_SCRUB_WEEKLY (7 * 24 * 60 * 60)
+#define BR_SCRUB_BIWEEKLY (14 * 24 * 60 * 60)
+#define BR_SCRUB_MONTHLY (30 * 24 * 60 * 60)
+
+static unsigned int
+br_fsscan_calculate_timeout(scrub_freq_t freq)
+{
+    uint32_t timo = 0;
+
+    switch (freq) {
+        case BR_FSSCRUB_FREQ_MINUTE:
+            timo = br_fsscan_calculate_delta(BR_SCRUB_MINUTE);
+            break;
+        case BR_FSSCRUB_FREQ_HOURLY:
+            timo = br_fsscan_calculate_delta(BR_SCRUB_HOURLY);
+            break;
+        case BR_FSSCRUB_FREQ_DAILY:
+            timo = br_fsscan_calculate_delta(BR_SCRUB_DAILY);
+            break;
+        case BR_FSSCRUB_FREQ_WEEKLY:
+            timo = br_fsscan_calculate_delta(BR_SCRUB_WEEKLY);
+            break;
+        case BR_FSSCRUB_FREQ_BIWEEKLY:
+            timo = br_fsscan_calculate_delta(BR_SCRUB_BIWEEKLY);
+            break;
+        case BR_FSSCRUB_FREQ_MONTHLY:
+            timo = br_fsscan_calculate_delta(BR_SCRUB_MONTHLY);
+            break;
+        default:
+            timo = 0;
+    }
+
+    return timo;
+}
+
+int32_t
+br_fsscan_schedule(xlator_t *this)
+{
+    uint32_t timo = 0;
+    br_private_t *priv = NULL;
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    struct br_scrubber *fsscrub = NULL;
+    struct gf_tw_timer_list *timer = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    fsscrub = &priv->fsscrub;
+    scrub_monitor = &priv->scrub_monitor;
+
+    scrub_monitor->boot = gf_time();
+
+    timo = br_fsscan_calculate_timeout(fsscrub->frequency);
+    if (timo == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
+               "BUG: Zero schedule timeout");
+        goto error_return;
+    }
+
+    scrub_monitor->timer = GF_CALLOC(1, sizeof(*scrub_monitor->timer),
+                                     gf_br_stub_mt_br_scanner_freq_t);
+    if (!scrub_monitor->timer)
+        goto error_return;
+
+    timer = scrub_monitor->timer;
+    INIT_LIST_HEAD(&timer->entry);
+
+    timer->data = scrub_monitor;
+    timer->expires = timo;
+    timer->function = br_kickstart_scanner;
+
+    gf_tw_add_timer(priv->timer_wheel, timer);
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING);
+
+    gf_time_fmt(timestr, sizeof(timestr), (scrub_monitor->boot + timo),
+                gf_timefmt_FT);
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+           "Scrubbing is "
+           "scheduled to run at %s",
+           timestr);
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+int32_t
+br_fsscan_activate(xlator_t *this)
+{
+    uint32_t timo = 0;
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    time_t now = 0;
+    br_private_t *priv = NULL;
+    struct br_scrubber *fsscrub = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    fsscrub = &priv->fsscrub;
+    scrub_monitor = &priv->scrub_monitor;
+
+    now = gf_time();
+    timo = br_fsscan_calculate_timeout(fsscrub->frequency);
+    if (timo == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
+               "BUG: Zero schedule timeout");
+        return -1;
+    }
+
+    pthread_mutex_lock(&scrub_monitor->donelock);
+    {
+        scrub_monitor->done = _gf_false;
+    }
+    pthread_mutex_unlock(&scrub_monitor->donelock);
+
+    gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
+    (void)gf_tw_mod_timer(priv->timer_wheel, scrub_monitor->timer, timo);
+
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING);
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+           "Scrubbing is "
+           "rescheduled to run at %s",
+           timestr);
+
+    return 0;
+}
+
+int32_t
+br_fsscan_reschedule(xlator_t *this)
+{
+    int32_t ret = 0;
+    uint32_t timo = 0;
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    time_t now = 0;
+    br_private_t *priv = NULL;
+    struct br_scrubber *fsscrub = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    fsscrub = &priv->fsscrub;
+    scrub_monitor = &priv->scrub_monitor;
+
+    if (!fsscrub->frequency_reconf)
+        return 0;
+
+    now = gf_time();
+    timo = br_fsscan_calculate_timeout(fsscrub->frequency);
+    if (timo == 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_ZERO_TIMEOUT_BUG,
+               "BUG: Zero schedule timeout");
+        return -1;
+    }
+
+    gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
+
+    pthread_mutex_lock(&scrub_monitor->donelock);
+    {
+        scrub_monitor->done = _gf_false;
+    }
+    pthread_mutex_unlock(&scrub_monitor->donelock);
+
+    ret = gf_tw_mod_timer_pending(priv->timer_wheel, scrub_monitor->timer,
+                                  timo);
+    if (ret == 0)
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubber is currently running and would be "
+               "rescheduled after completion");
+    else {
+        _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING);
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubbing rescheduled to run at %s", timestr);
+    }
+
+    return 0;
+}
+
+int32_t
+br_fsscan_ondemand(xlator_t *this)
+{
+    int32_t ret = 0;
+    uint32_t timo = 0;
+    char timestr[GF_TIMESTR_SIZE] = {
+        0,
+    };
+    time_t now = 0;
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    now = gf_time();
+    timo = BR_SCRUB_ONDEMAND;
+    gf_time_fmt(timestr, sizeof(timestr), now + timo, gf_timefmt_FT);
+
+    pthread_mutex_lock(&scrub_monitor->donelock);
+    {
+        scrub_monitor->done = _gf_false;
+    }
+    pthread_mutex_unlock(&scrub_monitor->donelock);
+
+    ret = gf_tw_mod_timer_pending(priv->timer_wheel, scrub_monitor->timer,
+                                  timo);
+    if (ret == 0)
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Scrubber is currently running and would be "
+               "rescheduled after completion");
+    else {
+        _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PENDING);
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+               "Ondemand Scrubbing scheduled to run at %s", timestr);
+    }
+
+    return 0;
+}
+
+#define BR_SCRUB_THREAD_SCALE_LAZY 0
+#define BR_SCRUB_THREAD_SCALE_NORMAL 0.4
+#define BR_SCRUB_THREAD_SCALE_AGGRESSIVE 1.0
+
+#ifndef M_E
+#define M_E 2.718
+#endif
+
+/**
+ * This is just a simple exponential scale to a fixed value selected
+ * per throttle config. We probably need to be more smart and select
+ * the scale based on the number of processor cores too.
+ */
+static unsigned int
+br_scrubber_calc_scale(xlator_t *this, br_private_t *priv,
+                       scrub_throttle_t throttle)
+{
+    unsigned int scale = 0;
+
+    switch (throttle) {
+        case BR_SCRUB_THROTTLE_VOID:
+        case BR_SCRUB_THROTTLE_STALLED:
+            scale = 0;
+            break;
+        case BR_SCRUB_THROTTLE_LAZY:
+            scale = priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_LAZY);
+            break;
+        case BR_SCRUB_THROTTLE_NORMAL:
+            scale = priv->child_count * pow(M_E, BR_SCRUB_THREAD_SCALE_NORMAL);
+            break;
+        case BR_SCRUB_THROTTLE_AGGRESSIVE:
+            scale = priv->child_count *
+                    pow(M_E, BR_SCRUB_THREAD_SCALE_AGGRESSIVE);
+            break;
+        default:
+            gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_UNKNOWN_THROTTLE,
+                   "Unknown throttle %d", throttle);
+    }
+
+    return scale;
+}
+
+static br_child_t *
+_br_scrubber_get_next_child(struct br_scrubber *fsscrub)
+{
+    br_child_t *child = NULL;
+
+    child = list_first_entry(&fsscrub->scrublist, br_child_t, list);
+    list_rotate_left(&fsscrub->scrublist);
+
+    return child;
+}
+
+static void
+_br_scrubber_get_entry(br_child_t *child, struct br_fsscan_entry **fsentry)
+{
+    struct br_scanfs *fsscan = &child->fsscan;
+
+    if (list_empty(&fsscan->ready))
+        return;
+    *fsentry = list_first_entry(&fsscan->ready, struct br_fsscan_entry, list);
+    list_del_init(&(*fsentry)->list);
+}
+
+static void
+_br_scrubber_find_scrubbable_entry(struct br_scrubber *fsscrub,
+                                   struct br_fsscan_entry **fsentry)
+{
+    br_child_t *child = NULL;
+    br_child_t *firstchild = NULL;
+
+    while (1) {
+        while (list_empty(&fsscrub->scrublist))
+            pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex);
+
+        firstchild = NULL;
+        for (child = _br_scrubber_get_next_child(fsscrub); child != firstchild;
+             child = _br_scrubber_get_next_child(fsscrub)) {
+            if (!firstchild)
+                firstchild = child;
+
+            _br_scrubber_get_entry(child, fsentry);
+            if (*fsentry)
+                break;
+        }
+
+        if (*fsentry)
+            break;
+
+        /* nothing to work on.. wait till available */
+        pthread_cond_wait(&fsscrub->cond, &fsscrub->mutex);
+    }
+}
+
+static void
+br_scrubber_pick_entry(struct br_scrubber *fsscrub,
+                       struct br_fsscan_entry **fsentry)
+{
+    pthread_cleanup_push(_br_lock_cleaner, &fsscrub->mutex);
+
+    pthread_mutex_lock(&fsscrub->mutex);
+    {
+        *fsentry = NULL;
+        _br_scrubber_find_scrubbable_entry(fsscrub, fsentry);
+    }
+    pthread_mutex_unlock(&fsscrub->mutex);
+
+    pthread_cleanup_pop(0);
+}
+
+struct br_scrub_entry {
+    gf_boolean_t scrubbed;
+    struct br_fsscan_entry *fsentry;
+};
+
+/**
+ * We need to be a bit careful here. These thread(s) are prone to cancellations
+ * when threads are scaled down (depending on the thottling value configured)
+ * and pausing scrub. A thread can get cancelled while it's waiting for entries
+ * in the ->pending queue or when an object is undergoing scrubbing.
+ */
+static void
+br_scrubber_entry_handle(void *arg)
+{
+    struct br_scanfs *fsscan = NULL;
+    struct br_scrub_entry *sentry = NULL;
+    struct br_fsscan_entry *fsentry = NULL;
+
+    sentry = arg;
+
+    fsentry = sentry->fsentry;
+    fsscan = fsentry->fsscan;
+
+    LOCK(&fsscan->entrylock);
+    {
+        if (sentry->scrubbed) {
+            _br_fsscan_dec_entry_count(fsscan);
+
+            /* cleanup ->entry */
+            fsentry->data = NULL;
+            fsentry->fsscan = NULL;
+            loc_wipe(&fsentry->parent);
+            gf_dirent_entry_free(fsentry->entry);
+
+            GF_FREE(sentry->fsentry);
+        } else {
+            /* (re)queue the entry again for scrub */
+            _br_fsscan_collect_entry(fsscan, sentry->fsentry);
+        }
+    }
+    UNLOCK(&fsscan->entrylock);
+}
+
+static void
+br_scrubber_scrub_entry(xlator_t *this, struct br_fsscan_entry *fsentry)
+{
+    struct br_scrub_entry sentry = {
+        0,
+    };
+
+    sentry.scrubbed = 0;
+    sentry.fsentry = fsentry;
+
+    pthread_cleanup_push(br_scrubber_entry_handle, &sentry);
+    {
+        (void)br_scrubber_scrub_begin(this, fsentry);
+        sentry.scrubbed = 1;
+    }
+    pthread_cleanup_pop(1);
+}
+
+void *
+br_scrubber_proc(void *arg)
+{
+    xlator_t *this = NULL;
+    struct br_scrubber *fsscrub = NULL;
+    struct br_fsscan_entry *fsentry = NULL;
+
+    fsscrub = arg;
+    THIS = this = fsscrub->this;
+
+    while (1) {
+        br_scrubber_pick_entry(fsscrub, &fsentry);
+        br_scrubber_scrub_entry(this, fsentry);
+        sleep(1);
+    }
+
+    return NULL;
+}
+
+static int32_t
+br_scrubber_scale_up(xlator_t *this, struct br_scrubber *fsscrub,
+                     unsigned int v1, unsigned int v2)
+{
+    int i = 0;
+    int32_t ret = -1;
+    int diff = 0;
+    struct br_scrubbers *scrub = NULL;
+
+    diff = (int)(v2 - v1);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALING_UP_SCRUBBER,
+           "Scaling up scrubbers [%d => %d]", v1, v2);
+
+    for (i = 0; i < diff; i++) {
+        scrub = GF_CALLOC(diff, sizeof(*scrub), gf_br_mt_br_scrubber_t);
+        if (!scrub)
+            break;
+
+        INIT_LIST_HEAD(&scrub->list);
+        ret = gf_thread_create(&scrub->scrubthread, NULL, br_scrubber_proc,
+                               fsscrub, "brsproc");
+        if (ret)
+            break;
+
+        fsscrub->nr_scrubbers++;
+        list_add_tail(&scrub->list, &fsscrub->scrubbers);
+    }
+
+    if ((i != diff) && !scrub)
+        goto error_return;
+
+    if (i != diff) /* degraded scaling.. */
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_UP_FAILED,
+               "Could not fully scale up to %d scrubber(s). Spawned "
+               "%d/%d [total scrubber(s): %d]",
+               v2, i, diff, (v1 + i));
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+static int32_t
+br_scrubber_scale_down(xlator_t *this, struct br_scrubber *fsscrub,
+                       unsigned int v1, unsigned int v2)
+{
+    int i = 0;
+    int diff = 0;
+    int32_t ret = -1;
+    struct br_scrubbers *scrub = NULL;
+
+    diff = (int)(v1 - v2);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCALE_DOWN_SCRUBBER,
+           "Scaling down scrubbers [%d => %d]", v1, v2);
+
+    for (i = 0; i < diff; i++) {
+        scrub = list_first_entry(&fsscrub->scrubbers, struct br_scrubbers,
+                                 list);
+
+        list_del_init(&scrub->list);
+        ret = gf_thread_cleanup_xint(scrub->scrubthread);
+        if (ret)
+            break;
+        GF_FREE(scrub);
+
+        fsscrub->nr_scrubbers--;
+    }
+
+    if (ret) {
+        gf_msg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SCALE_DOWN_FAILED,
+               "Could not fully scale down "
+               "to %d scrubber(s). Terminated %d/%d [total "
+               "scrubber(s): %d]",
+               v1, i, diff, (v2 - i));
+        ret = 0;
+    }
+
+    return ret;
+}
+
+static int32_t
+br_scrubber_configure(xlator_t *this, br_private_t *priv,
+                      struct br_scrubber *fsscrub, scrub_throttle_t nthrottle)
+{
+    int32_t ret = 0;
+    unsigned int v1 = 0;
+    unsigned int v2 = 0;
+
+    v1 = fsscrub->nr_scrubbers;
+    v2 = br_scrubber_calc_scale(this, priv, nthrottle);
+
+    if (v1 == v2)
+        return 0;
+
+    if (v1 > v2)
+        ret = br_scrubber_scale_down(this, fsscrub, v1, v2);
+    else
+        ret = br_scrubber_scale_up(this, fsscrub, v1, v2);
+
+    return ret;
+}
+
+static int32_t
+br_scrubber_fetch_option(xlator_t *this, char *opt, dict_t *options,
+                         char **value)
+{
+    if (options)
+        GF_OPTION_RECONF(opt, *value, options, str, error_return);
+    else
+        GF_OPTION_INIT(opt, *value, str, error_return);
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+/* internal "throttle" override */
+#define BR_SCRUB_STALLED "STALLED"
+
+/* TODO: token buket spec */
+static int32_t
+br_scrubber_handle_throttle(xlator_t *this, br_private_t *priv, dict_t *options,
+                            gf_boolean_t scrubstall)
+{
+    int32_t ret = 0;
+    char *tmp = NULL;
+    struct br_scrubber *fsscrub = NULL;
+    scrub_throttle_t nthrottle = BR_SCRUB_THROTTLE_VOID;
+
+    fsscrub = &priv->fsscrub;
+    fsscrub->throttle_reconf = _gf_false;
+
+    ret = br_scrubber_fetch_option(this, "scrub-throttle", options, &tmp);
+    if (ret)
+        goto error_return;
+
+    if (scrubstall)
+        tmp = BR_SCRUB_STALLED;
+
+    if (strcasecmp(tmp, "lazy") == 0)
+        nthrottle = BR_SCRUB_THROTTLE_LAZY;
+    else if (strcasecmp(tmp, "normal") == 0)
+        nthrottle = BR_SCRUB_THROTTLE_NORMAL;
+    else if (strcasecmp(tmp, "aggressive") == 0)
+        nthrottle = BR_SCRUB_THROTTLE_AGGRESSIVE;
+    else if (strcasecmp(tmp, BR_SCRUB_STALLED) == 0)
+        nthrottle = BR_SCRUB_THROTTLE_STALLED;
+    else
+        goto error_return;
+
+    /* on failure old throttling value is preserved */
+    ret = br_scrubber_configure(this, priv, fsscrub, nthrottle);
+    if (ret)
+        goto error_return;
+
+    if (fsscrub->throttle != nthrottle)
+        fsscrub->throttle_reconf = _gf_true;
+
+    fsscrub->throttle = nthrottle;
+    return 0;
+
+error_return:
+    return -1;
+}
+
+static int32_t
+br_scrubber_handle_stall(xlator_t *this, br_private_t *priv, dict_t *options,
+                         gf_boolean_t *scrubstall)
+{
+    int32_t ret = 0;
+    char *tmp = NULL;
+
+    ret = br_scrubber_fetch_option(this, "scrub-state", options, &tmp);
+    if (ret)
+        goto error_return;
+
+    if (strcasecmp(tmp, "pause") == 0) /* anything else is active */
+        *scrubstall = _gf_true;
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+static int32_t
+br_scrubber_handle_freq(xlator_t *this, br_private_t *priv, dict_t *options,
+                        gf_boolean_t scrubstall)
+{
+    int32_t ret = -1;
+    char *tmp = NULL;
+    scrub_freq_t frequency = BR_FSSCRUB_FREQ_HOURLY;
+    struct br_scrubber *fsscrub = NULL;
+
+    fsscrub = &priv->fsscrub;
+    fsscrub->frequency_reconf = _gf_true;
+
+    ret = br_scrubber_fetch_option(this, "scrub-freq", options, &tmp);
+    if (ret)
+        goto error_return;
+
+    if (scrubstall)
+        tmp = BR_SCRUB_STALLED;
+
+    if (strcasecmp(tmp, "hourly") == 0) {
+        frequency = BR_FSSCRUB_FREQ_HOURLY;
+    } else if (strcasecmp(tmp, "daily") == 0) {
+        frequency = BR_FSSCRUB_FREQ_DAILY;
+    } else if (strcasecmp(tmp, "weekly") == 0) {
+        frequency = BR_FSSCRUB_FREQ_WEEKLY;
+    } else if (strcasecmp(tmp, "biweekly") == 0) {
+        frequency = BR_FSSCRUB_FREQ_BIWEEKLY;
+    } else if (strcasecmp(tmp, "monthly") == 0) {
+        frequency = BR_FSSCRUB_FREQ_MONTHLY;
+    } else if (strcasecmp(tmp, "minute") == 0) {
+        frequency = BR_FSSCRUB_FREQ_MINUTE;
+    } else if (strcasecmp(tmp, BR_SCRUB_STALLED) == 0) {
+        frequency = BR_FSSCRUB_FREQ_STALLED;
+    } else
+        goto error_return;
+
+    if (fsscrub->frequency == frequency)
+        fsscrub->frequency_reconf = _gf_false;
+    else
+        fsscrub->frequency = frequency;
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+static void
+br_scrubber_log_option(xlator_t *this, br_private_t *priv,
+                       gf_boolean_t scrubstall)
+{
+    struct br_scrubber *fsscrub = &priv->fsscrub;
+    char *scrub_throttle_str[] = {
+        [BR_SCRUB_THROTTLE_LAZY] = "lazy",
+        [BR_SCRUB_THROTTLE_NORMAL] = "normal",
+        [BR_SCRUB_THROTTLE_AGGRESSIVE] = "aggressive",
+        [BR_SCRUB_THROTTLE_STALLED] = "stalled",
+    };
+
+    char *scrub_freq_str[] = {
+        [0] = "",
+        [BR_FSSCRUB_FREQ_HOURLY] = "hourly",
+        [BR_FSSCRUB_FREQ_DAILY] = "daily",
+        [BR_FSSCRUB_FREQ_WEEKLY] = "weekly",
+        [BR_FSSCRUB_FREQ_BIWEEKLY] = "biweekly",
+        [BR_FSSCRUB_FREQ_MONTHLY] = "monthly (30 days)",
+        [BR_FSSCRUB_FREQ_MINUTE] = "every minute",
+    };
+
+    if (scrubstall)
+        return; /* logged as pause */
+
+    if (fsscrub->frequency_reconf || fsscrub->throttle_reconf) {
+        if (fsscrub->throttle == BR_SCRUB_THROTTLE_VOID)
+            return;
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_TUNABLE,
+               "SCRUB TUNABLES:: [Frequency: %s, Throttle: %s]",
+               scrub_freq_str[fsscrub->frequency],
+               scrub_throttle_str[fsscrub->throttle]);
+    }
+}
+
+int32_t
+br_scrubber_handle_options(xlator_t *this, br_private_t *priv, dict_t *options)
+{
+    int32_t ret = 0;
+    gf_boolean_t scrubstall = _gf_false; /* not as dangerous as it sounds */
+
+    ret = br_scrubber_handle_stall(this, priv, options, &scrubstall);
+    if (ret)
+        goto error_return;
+
+    ret = br_scrubber_handle_throttle(this, priv, options, scrubstall);
+    if (ret)
+        goto error_return;
+
+    ret = br_scrubber_handle_freq(this, priv, options, scrubstall);
+    if (ret)
+        goto error_return;
+
+    br_scrubber_log_option(this, priv, scrubstall);
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+inode_t *
+br_lookup_bad_obj_dir(xlator_t *this, br_child_t *child, uuid_t gfid)
+{
+    struct iatt statbuf = {
+        0,
+    };
+    inode_table_t *table = NULL;
+    int32_t ret = -1;
+    loc_t loc = {
+        0,
+    };
+    inode_t *linked_inode = NULL;
+    int32_t op_errno = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    table = child->table;
+
+    loc.inode = inode_new(table);
+    if (!loc.inode) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+               "failed to allocate a new inode for"
+               "bad object directory");
+        goto out;
+    }
+
+    gf_uuid_copy(loc.gfid, gfid);
+
+    ret = syncop_lookup(child->xl, &loc, &statbuf, NULL, NULL, NULL);
+    if (ret < 0) {
+        op_errno = -ret;
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_LOOKUP_FAILED,
+               "failed to lookup the bad "
+               "objects directory (gfid: %s (%s))",
+               uuid_utoa(gfid), strerror(op_errno));
+        goto out;
+    }
+
+    linked_inode = inode_link(loc.inode, NULL, NULL, &statbuf);
+    if (linked_inode)
+        inode_lookup(linked_inode);
+
+out:
+    loc_wipe(&loc);
+    return linked_inode;
+}
+
+int32_t
+br_read_bad_object_dir(xlator_t *this, br_child_t *child, fd_t *fd,
+                       dict_t *dict)
+{
+    gf_dirent_t entries;
+    gf_dirent_t *entry = NULL;
+    int32_t ret = -1;
+    off_t offset = 0;
+    int32_t count = 0;
+    char key[32] = {
+        0,
+    };
+    dict_t *out_dict = NULL;
+
+    INIT_LIST_HEAD(&entries.list);
+
+    while ((ret = syncop_readdir(child->xl, fd, 131072, offset, &entries, NULL,
+                                 &out_dict))) {
+        if (ret < 0)
+            goto out;
+
+        list_for_each_entry(entry, &entries.list, list)
+        {
+            offset = entry->d_off;
+
+            snprintf(key, sizeof(key), "quarantine-%d", count);
+
+            /*
+             * ignore the dict_set errors for now. The intention is
+             * to get as many bad objects as possible instead of
+             * erroring out at the first failure.
+             */
+            ret = dict_set_dynstr_with_alloc(dict, key, entry->d_name);
+            if (!ret)
+                count++;
+
+            if (out_dict) {
+                dict_copy(out_dict, dict);
+                dict_unref(out_dict);
+                out_dict = NULL;
+            }
+        }
+
+        gf_dirent_free(&entries);
+    }
+
+    ret = count;
+    ret = dict_set_int32_sizen(dict, "count", count);
+
+out:
+    return ret;
+}
+
+int32_t
+br_get_bad_objects_from_child(xlator_t *this, dict_t *dict, br_child_t *child)
+{
+    inode_t *inode = NULL;
+    inode_table_t *table = NULL;
+    fd_t *fd = NULL;
+    int32_t ret = -1;
+    loc_t loc = {
+        0,
+    };
+    int32_t op_errno = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot-scrubber", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    table = child->table;
+
+    inode = inode_find(table, BR_BAD_OBJ_CONTAINER);
+    if (!inode) {
+        inode = br_lookup_bad_obj_dir(this, child, BR_BAD_OBJ_CONTAINER);
+        if (!inode)
+            goto out;
+    }
+
+    fd = fd_create(inode, 0);
+    if (!fd) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_FD_CREATE_FAILED,
+               "fd creation for the bad "
+               "objects directory failed (gfid: %s)",
+               uuid_utoa(BR_BAD_OBJ_CONTAINER));
+        goto out;
+    }
+
+    loc.inode = inode;
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    ret = syncop_opendir(child->xl, &loc, fd, NULL, NULL);
+    if (ret < 0) {
+        op_errno = -ret;
+        fd_unref(fd);
+        fd = NULL;
+        gf_msg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_FD_CREATE_FAILED,
+               "failed to open the bad "
+               "objects directory %s",
+               uuid_utoa(BR_BAD_OBJ_CONTAINER));
+        goto out;
+    }
+
+    fd_bind(fd);
+
+    ret = br_read_bad_object_dir(this, child, fd, dict);
+    if (ret < 0) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BAD_OBJ_READDIR_FAIL,
+               "readdir of the bad "
+               "objects directory (%s) failed ",
+               uuid_utoa(BR_BAD_OBJ_CONTAINER));
+        goto out;
+    }
+
+    ret = 0;
+
+out:
+    loc_wipe(&loc);
+    if (fd)
+        fd_unref(fd);
+    return ret;
+}
+
+int32_t
+br_collect_bad_objects_of_child(xlator_t *this, br_child_t *child, dict_t *dict,
+                                dict_t *child_dict, int32_t total_count)
+{
+    int32_t ret = -1;
+    int32_t count = 0;
+    char key[32] = {
+        0,
+    };
+    char main_key[32] = {
+        0,
+    };
+    int32_t j = 0;
+    int32_t tmp_count = 0;
+    char *entry = NULL;
+    char tmp[PATH_MAX] = {
+        0,
+    };
+    char *path = NULL;
+    int32_t len = 0;
+
+    ret = dict_get_int32_sizen(child_dict, "count", &count);
+    if (ret)
+        goto out;
+
+    tmp_count = total_count;
+
+    for (j = 0; j < count; j++) {
+        len = snprintf(key, sizeof(key), "quarantine-%d", j);
+        ret = dict_get_strn(child_dict, key, len, &entry);
+        if (ret)
+            continue;
+
+        ret = dict_get_str(child_dict, entry, &path);
+        len = snprintf(tmp, PATH_MAX, "%s ==> BRICK: %s\n path: %s", entry,
+                       child->brick_path, path);
+        if ((len < 0) || (len >= PATH_MAX)) {
+            continue;
+        }
+        snprintf(main_key, sizeof(main_key), "quarantine-%d", tmp_count);
+
+        ret = dict_set_dynstr_with_alloc(dict, main_key, tmp);
+        if (!ret)
+            tmp_count++;
+        path = NULL;
+    }
+
+    ret = tmp_count;
+
+out:
+    return ret;
+}
+
+int32_t
+br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict)
+{
+    int32_t ret = -1;
+    dict_t *child_dict = NULL;
+    int32_t i = 0;
+    int32_t total_count = 0;
+    br_child_t *child = NULL;
+    br_private_t *priv = NULL;
+    dict_t *tmp_dict = NULL;
+
+    priv = this->private;
+    tmp_dict = dict;
+
+    for (i = 0; i < priv->child_count; i++) {
+        child = &priv->children[i];
+        GF_ASSERT(child);
+        if (!_br_is_child_connected(child))
+            continue;
+
+        child_dict = dict_new();
+        if (!child_dict) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+                   "failed to allocate dict");
+            continue;
+        }
+        ret = br_get_bad_objects_from_child(this, child_dict, child);
+        /*
+         * Continue asking the remaining children for the list of
+         * bad objects even though getting the list from one of them
+         * fails.
+         */
+        if (ret) {
+            dict_unref(child_dict);
+            continue;
+        }
+
+        ret = br_collect_bad_objects_of_child(this, child, tmp_dict, child_dict,
+                                              total_count);
+        if (ret < 0) {
+            dict_unref(child_dict);
+            continue;
+        }
+
+        total_count = ret;
+        dict_unref(child_dict);
+        child_dict = NULL;
+    }
+
+    ret = dict_set_int32(tmp_dict, "total-count", total_count);
+
+    return ret;
+}
+
+int32_t
+br_get_bad_objects_list(xlator_t *this, dict_t **dict)
+{
+    int32_t ret = -1;
+    dict_t *tmp_dict = NULL;
+
+    GF_VALIDATE_OR_GOTO("bir-rot-scrubber", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, dict, out);
+
+    tmp_dict = *dict;
+    if (!tmp_dict) {
+        tmp_dict = dict_new();
+        if (!tmp_dict) {
+            gf_msg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+                   "failed to allocate dict");
+            goto out;
+        }
+        *dict = tmp_dict;
+    }
+
+    ret = br_collect_bad_objects_from_children(this, tmp_dict);
+
+out:
+    return ret;
+}
+
+static int
+wait_for_scrub_to_finish(xlator_t *this)
+{
+    int ret = -1;
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", scrub_monitor, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_INFO,
+           "Waiting for all children to start and finish scrub");
+
+    pthread_mutex_lock(&scrub_monitor->donelock);
+    {
+        while (!scrub_monitor->done)
+            pthread_cond_wait(&scrub_monitor->donecond,
+                              &scrub_monitor->donelock);
+    }
+    pthread_mutex_unlock(&scrub_monitor->donelock);
+    ret = 0;
+out:
+    return ret;
+}
+
+/**
+ * This function is executed in a separate thread. This is scrubber monitor
+ * thread that takes care of state machine.
+ */
+void *
+br_monitor_thread(void *arg)
+{
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    this = arg;
+    priv = this->private;
+
+    /*
+     * Since, this is the topmost xlator, THIS has to be set by bit-rot
+     * xlator itself (STACK_WIND won't help in this case). Also it has
+     * to be done for each thread that gets spawned. Otherwise, a new
+     * thread will get global_xlator's pointer when it does "THIS".
+     */
+    THIS = this;
+
+    scrub_monitor = &priv->scrub_monitor;
+
+    pthread_mutex_lock(&scrub_monitor->mutex);
+    {
+        while (!scrub_monitor->inited)
+            pthread_cond_wait(&scrub_monitor->cond, &scrub_monitor->mutex);
+    }
+    pthread_mutex_unlock(&scrub_monitor->mutex);
+
+    /* this needs to be serialized with reconfigure() */
+    pthread_mutex_lock(&priv->lock);
+    {
+        ret = br_scrub_state_machine(this, _gf_false);
+    }
+    pthread_mutex_unlock(&priv->lock);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SSM_FAILED,
+               "Scrub state machine failed");
+        goto out;
+    }
+
+    while (1) {
+        /* Wait for all children to finish scrubbing */
+        ret = wait_for_scrub_to_finish(this);
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SCRUB_WAIT_FAILED,
+                   "Scrub wait failed");
+            goto out;
+        }
+
+        /* scrub exit criteria: Move the state to PENDING */
+        br_scrubber_exit_control(this);
+    }
+
+out:
+    return NULL;
+}
+
+static void
+br_set_scrub_state(struct br_monitor *scrub_monitor, br_scrub_state_t state)
+{
+    LOCK(&scrub_monitor->lock);
+    {
+        _br_monitor_set_scrub_state(scrub_monitor, state);
+    }
+    UNLOCK(&scrub_monitor->lock);
+}
+
+int32_t
+br_scrubber_monitor_init(xlator_t *this, br_private_t *priv)
+{
+    struct br_monitor *scrub_monitor = NULL;
+    int ret = 0;
+
+    scrub_monitor = &priv->scrub_monitor;
+
+    LOCK_INIT(&scrub_monitor->lock);
+    scrub_monitor->this = this;
+
+    scrub_monitor->inited = _gf_false;
+    pthread_mutex_init(&scrub_monitor->mutex, NULL);
+    pthread_cond_init(&scrub_monitor->cond, NULL);
+
+    scrub_monitor->kick = _gf_false;
+    scrub_monitor->active_child_count = 0;
+    pthread_mutex_init(&scrub_monitor->wakelock, NULL);
+    pthread_cond_init(&scrub_monitor->wakecond, NULL);
+
+    scrub_monitor->done = _gf_false;
+    pthread_mutex_init(&scrub_monitor->donelock, NULL);
+    pthread_cond_init(&scrub_monitor->donecond, NULL);
+
+    /* Set the state to INACTIVE */
+    br_set_scrub_state(&priv->scrub_monitor, BR_SCRUB_STATE_INACTIVE);
+
+    /* Start the monitor thread */
+    ret = gf_thread_create(&scrub_monitor->thread, NULL, br_monitor_thread,
+                           this, "brmon");
+    if (ret != 0) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_SPAWN_FAILED,
+               "monitor thread creation failed");
+        ret = -1;
+        goto err;
+    }
+
+    return 0;
+err:
+    pthread_mutex_destroy(&scrub_monitor->mutex);
+    pthread_cond_destroy(&scrub_monitor->cond);
+
+    pthread_mutex_destroy(&scrub_monitor->wakelock);
+    pthread_cond_destroy(&scrub_monitor->wakecond);
+
+    pthread_mutex_destroy(&scrub_monitor->donelock);
+    pthread_cond_destroy(&scrub_monitor->donecond);
+
+    LOCK_DESTROY(&scrub_monitor->lock);
+
+    return ret;
+}
+
+int32_t
+br_scrubber_init(xlator_t *this, br_private_t *priv)
+{
+    struct br_scrubber *fsscrub = NULL;
+    int ret = 0;
+
+    priv->tbf = tbf_init(NULL, 0);
+    if (!priv->tbf)
+        return -1;
+
+    ret = br_scrubber_monitor_init(this, priv);
+    if (ret)
+        return -1;
+
+    fsscrub = &priv->fsscrub;
+
+    fsscrub->this = this;
+    fsscrub->throttle = BR_SCRUB_THROTTLE_VOID;
+
+    pthread_mutex_init(&fsscrub->mutex, NULL);
+    pthread_cond_init(&fsscrub->cond, NULL);
+
+    fsscrub->nr_scrubbers = 0;
+    INIT_LIST_HEAD(&fsscrub->scrubbers);
+    INIT_LIST_HEAD(&fsscrub->scrublist);
+
+    return 0;
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
new file mode 100644
index 00000000000..4e5f67bc021
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-scrub.h
@@ -0,0 +1,46 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_SCRUB_H__
+#define __BIT_ROT_SCRUB_H__
+
+#include <glusterfs/xlator.h>
+#include "bit-rot.h"
+
+void *
+br_fsscanner(void *);
+
+int32_t
+br_fsscan_schedule(xlator_t *);
+int32_t
+br_fsscan_reschedule(xlator_t *);
+int32_t
+br_fsscan_activate(xlator_t *);
+int32_t
+br_fsscan_deactivate(xlator_t *);
+int32_t
+br_fsscan_ondemand(xlator_t *);
+
+int32_t
+br_scrubber_handle_options(xlator_t *, br_private_t *, dict_t *);
+
+int32_t
+br_scrubber_monitor_init(xlator_t *, br_private_t *);
+
+int32_t
+br_scrubber_init(xlator_t *, br_private_t *);
+
+int32_t
+br_collect_bad_objects_from_children(xlator_t *this, dict_t *dict);
+
+void
+br_child_set_scrub_state(br_child_t *, gf_boolean_t);
+
+#endif /* __BIT_ROT_SCRUB_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c
new file mode 100644
index 00000000000..753e31a3b23
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.c
@@ -0,0 +1,124 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include "bit-rot-ssm.h"
+#include "bit-rot-scrub.h"
+#include "bit-rot-bitd-messages.h"
+
+int
+br_scrub_ssm_noop(xlator_t *this)
+{
+    return 0;
+}
+
+int
+br_scrub_ssm_state_pause(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Scrubber paused");
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_PAUSED);
+    return 0;
+}
+
+int
+br_scrub_ssm_state_ipause(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Scrubber paused");
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_IPAUSED);
+    return 0;
+}
+
+int
+br_scrub_ssm_state_active(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    if (scrub_monitor->done) {
+        (void)br_fsscan_activate(this);
+    } else {
+        gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+               "Scrubbing resumed");
+        _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_ACTIVE);
+    }
+
+    return 0;
+}
+
+int
+br_scrub_ssm_state_stall(xlator_t *this)
+{
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg(this->name, GF_LOG_INFO, 0, BRB_MSG_GENERIC_SSM_INFO,
+           "Volume is under active scrubbing. Pausing scrub..");
+    _br_monitor_set_scrub_state(scrub_monitor, BR_SCRUB_STATE_STALLED);
+    return 0;
+}
+
+static br_scrub_ssm_call *br_scrub_ssm[BR_SCRUB_MAXSTATES][BR_SCRUB_MAXEVENTS] =
+    {
+        /* INACTIVE */
+        {br_fsscan_schedule, br_scrub_ssm_state_ipause, br_scrub_ssm_noop},
+        /* PENDING  */
+        {br_fsscan_reschedule, br_fsscan_deactivate, br_fsscan_ondemand},
+        /* ACTIVE   */
+        {br_scrub_ssm_noop, br_scrub_ssm_state_stall, br_scrub_ssm_noop},
+        /* PAUSED   */
+        {br_fsscan_activate, br_scrub_ssm_noop, br_scrub_ssm_noop},
+        /* IPAUSED  */
+        {br_fsscan_schedule, br_scrub_ssm_noop, br_scrub_ssm_noop},
+        /* STALLED  */
+        {br_scrub_ssm_state_active, br_scrub_ssm_noop, br_scrub_ssm_noop},
+};
+
+int32_t
+br_scrub_state_machine(xlator_t *this, gf_boolean_t scrub_ondemand)
+{
+    br_private_t *priv = NULL;
+    br_scrub_ssm_call *call = NULL;
+    struct br_scrubber *fsscrub = NULL;
+    br_scrub_state_t currstate = 0;
+    br_scrub_event_t event = 0;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    fsscrub = &priv->fsscrub;
+    scrub_monitor = &priv->scrub_monitor;
+
+    currstate = scrub_monitor->state;
+    if (scrub_ondemand)
+        event = BR_SCRUB_EVENT_ONDEMAND;
+    else
+        event = _br_child_get_scrub_event(fsscrub);
+
+    call = br_scrub_ssm[currstate][event];
+    return call(this);
+}
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
new file mode 100644
index 00000000000..37b45a42eac
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot-ssm.h
@@ -0,0 +1,38 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_SSM_H__
+#define __BIT_ROT_SSM_H__
+
+#include <glusterfs/xlator.h>
+
+typedef enum br_scrub_state {
+    BR_SCRUB_STATE_INACTIVE = 0,
+    BR_SCRUB_STATE_PENDING,
+    BR_SCRUB_STATE_ACTIVE,
+    BR_SCRUB_STATE_PAUSED,
+    BR_SCRUB_STATE_IPAUSED,
+    BR_SCRUB_STATE_STALLED,
+    BR_SCRUB_MAXSTATES,
+} br_scrub_state_t;
+
+typedef enum br_scrub_event {
+    BR_SCRUB_EVENT_SCHEDULE = 0,
+    BR_SCRUB_EVENT_PAUSE,
+    BR_SCRUB_EVENT_ONDEMAND,
+    BR_SCRUB_MAXEVENTS,
+} br_scrub_event_t;
+
+struct br_monitor;
+
+int32_t
+br_scrub_state_machine(xlator_t *, gf_boolean_t);
+
+#endif /* __BIT_ROT_SSM_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
new file mode 100644
index 00000000000..a2f1c343a1d
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -0,0 +1,2232 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#include <ctype.h>
+
+#include <glusterfs/logging.h>
+#include <glusterfs/compat-errno.h>
+
+#include "bit-rot.h"
+#include "bit-rot-scrub.h"
+#include <pthread.h>
+#include "bit-rot-bitd-messages.h"
+
+#define BR_HASH_CALC_READ_SIZE (128 * 1024)
+
+typedef int32_t(br_child_handler)(xlator_t *, br_child_t *);
+
+struct br_child_event {
+    xlator_t *this;
+
+    br_child_t *child;
+
+    br_child_handler *call;
+
+    struct list_head list;
+};
+
+static int
+br_find_child_index(xlator_t *this, xlator_t *child)
+{
+    br_private_t *priv = NULL;
+    int i = -1;
+    int index = -1;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    priv = this->private;
+
+    for (i = 0; i < priv->child_count; i++) {
+        if (child == priv->children[i].xl) {
+            index = i;
+            break;
+        }
+    }
+
+out:
+    return index;
+}
+
+br_child_t *
+br_get_child_from_brick_path(xlator_t *this, char *brick_path)
+{
+    br_private_t *priv = NULL;
+    br_child_t *child = NULL;
+    br_child_t *tmp = NULL;
+    int i = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+    GF_VALIDATE_OR_GOTO(this->name, brick_path, out);
+
+    priv = this->private;
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        for (i = 0; i < priv->child_count; i++) {
+            tmp = &priv->children[i];
+            if (!strcmp(tmp->brick_path, brick_path)) {
+                child = tmp;
+                break;
+            }
+        }
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+out:
+    return child;
+}
+
+/**
+ * probably we'll encapsulate brick inside our own structure when
+ * needed -- later.
+ */
+void *
+br_brick_init(void *xl, struct gf_brick_spec *brick)
+{
+    return brick;
+}
+
+/**
+ * and cleanup things here when allocated br_brick_init().
+ */
+void
+br_brick_fini(void *xl, char *brick, void *data)
+{
+    return;
+}
+
+/**
+ * TODO: Signature can contain null terminators which causes bitrot
+ * stub to store truncated hash as it depends on string length of
+ * the hash.
+ *
+ * FIX: Send the string length as part of the signature struct and
+ *      change stub to handle this change.
+ */
+static br_isignature_t *
+br_prepare_signature(const unsigned char *sign, unsigned long hashlen,
+                     int8_t hashtype, br_object_t *object)
+{
+    br_isignature_t *signature = NULL;
+
+    /* TODO: use mem-pool */
+    signature = GF_CALLOC(1, signature_size(hashlen + 1),
+                          gf_br_stub_mt_signature_t);
+    if (!signature)
+        return NULL;
+
+    /* object version */
+    signature->signedversion = object->signedversion;
+
+    /* signature length & type */
+    signature->signaturelen = hashlen;
+    signature->signaturetype = hashtype;
+
+    /* signature itself */
+    memcpy(signature->signature, (char *)sign, hashlen);
+    signature->signature[hashlen + 1] = '\0';
+
+    return signature;
+}
+
+gf_boolean_t
+bitd_is_bad_file(xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd)
+{
+    int32_t ret = -1;
+    dict_t *xattr = NULL;
+    inode_t *inode = NULL;
+    gf_boolean_t bad_file = _gf_false;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+
+    inode = (loc) ? loc->inode : fd->inode;
+
+    if (fd)
+        ret = syncop_fgetxattr(child->xl, fd, &xattr, BITROT_OBJECT_BAD_KEY,
+                               NULL, NULL);
+    else if (loc)
+        ret = syncop_getxattr(child->xl, loc, &xattr, BITROT_OBJECT_BAD_KEY,
+                              NULL, NULL);
+
+    if (!ret) {
+        gf_msg_debug(this->name, 0, "[GFID: %s] is marked corrupted",
+                     uuid_utoa(inode->gfid));
+        bad_file = _gf_true;
+    }
+
+    if (xattr)
+        dict_unref(xattr);
+
+out:
+    return bad_file;
+}
+
+/**
+ * Do a lookup on the gfid present within the object.
+ */
+static int32_t
+br_object_lookup(xlator_t *this, br_object_t *object, struct iatt *iatt,
+                 inode_t **linked_inode)
+{
+    int ret = -EINVAL;
+    loc_t loc = {
+        0,
+    };
+    inode_t *inode = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, object, out);
+
+    inode = inode_find(object->child->table, object->gfid);
+
+    if (inode)
+        loc.inode = inode;
+    else
+        loc.inode = inode_new(object->child->table);
+
+    if (!loc.inode) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    gf_uuid_copy(loc.gfid, object->gfid);
+
+    ret = syncop_lookup(object->child->xl, &loc, iatt, NULL, NULL, NULL);
+    if (ret < 0)
+        goto out;
+
+    /*
+     * The file might have been deleted by the application
+     * after getting the event, but before doing a lookup.
+     * So use linked_inode after inode_link is done.
+     */
+    *linked_inode = inode_link(loc.inode, NULL, NULL, iatt);
+    if (*linked_inode)
+        inode_lookup(*linked_inode);
+
+out:
+    loc_wipe(&loc);
+    return ret;
+}
+
+/**
+ * open the object with O_RDONLY flags and return the fd. How to let brick
+ * know that open is being done by bitd because syncop framework does not allow
+ * passing xdata -- may be use frame->root->pid itself.
+ */
+static int32_t
+br_object_open(xlator_t *this, br_object_t *object, inode_t *inode,
+               fd_t **openfd)
+{
+    int32_t ret = -1;
+    fd_t *fd = NULL;
+    loc_t loc = {
+        0,
+    };
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, object, out);
+    GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+    ret = -EINVAL;
+    fd = fd_create(inode, 0);
+    if (!fd) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+                "gfid=%s", uuid_utoa(inode->gfid), NULL);
+        goto out;
+    }
+
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+
+    ret = syncop_open(object->child->xl, &loc, O_RDONLY, fd, NULL, NULL);
+    if (ret) {
+        br_log_object(this, "open", inode->gfid, -ret);
+        fd_unref(fd);
+        fd = NULL;
+    } else {
+        fd_bind(fd);
+        *openfd = fd;
+    }
+
+    loc_wipe(&loc);
+
+out:
+    return ret;
+}
+
+/**
+ * read 128k block from the object @object from the offset @offset
+ * and return the buffer.
+ */
+static int32_t
+br_object_read_block_and_sign(xlator_t *this, fd_t *fd, br_child_t *child,
+                              off_t offset, size_t size, SHA256_CTX *sha256)
+{
+    int32_t ret = -1;
+    tbf_t *tbf = NULL;
+    struct iovec *iovec = NULL;
+    struct iobref *iobref = NULL;
+    br_private_t *priv = NULL;
+    int count = 0;
+    int i = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd, out);
+    GF_VALIDATE_OR_GOTO(this->name, fd->inode, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO(this->name, priv->tbf, out);
+    tbf = priv->tbf;
+
+    ret = syncop_readv(child->xl, fd, size, offset, 0, &iovec, &count, &iobref,
+                       NULL, NULL, NULL);
+
+    if (ret < 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_READV_FAILED,
+                "gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        ret = -1;
+        goto out;
+    }
+
+    if (ret == 0)
+        goto out;
+
+    for (i = 0; i < count; i++) {
+        TBF_THROTTLE_BEGIN(tbf, TBF_OP_HASH, iovec[i].iov_len);
+        {
+            SHA256_Update(sha256, (const unsigned char *)(iovec[i].iov_base),
+                          iovec[i].iov_len);
+        }
+        TBF_THROTTLE_BEGIN(tbf, TBF_OP_HASH, iovec[i].iov_len);
+    }
+
+out:
+    if (iovec)
+        GF_FREE(iovec);
+
+    if (iobref)
+        iobref_unref(iobref);
+
+    return ret;
+}
+
+int32_t
+br_calculate_obj_checksum(unsigned char *md, br_child_t *child, fd_t *fd,
+                          struct iatt *iatt)
+{
+    int32_t ret = -1;
+    off_t offset = 0;
+    size_t block = BR_HASH_CALC_READ_SIZE;
+    xlator_t *this = NULL;
+
+    SHA256_CTX sha256;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", child, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", iatt, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", fd, out);
+
+    this = child->this;
+
+    SHA256_Init(&sha256);
+
+    while (1) {
+        ret = br_object_read_block_and_sign(this, fd, child, offset, block,
+                                            &sha256);
+        if (ret < 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_BLOCK_READ_FAILED,
+                    "offset=%" PRIu64, offset, "object-gfid=%s",
+                    uuid_utoa(fd->inode->gfid), NULL);
+            break;
+        }
+
+        if (ret == 0)
+            break;
+
+        offset += ret;
+    }
+
+    if (ret == 0)
+        SHA256_Final(md, &sha256);
+
+out:
+    return ret;
+}
+
+static int32_t
+br_object_checksum(unsigned char *md, br_object_t *object, fd_t *fd,
+                   struct iatt *iatt)
+{
+    return br_calculate_obj_checksum(md, object->child, fd, iatt);
+}
+
+static int32_t
+br_object_read_sign(inode_t *linked_inode, fd_t *fd, br_object_t *object,
+                    struct iatt *iatt)
+{
+    int32_t ret = -1;
+    xlator_t *this = NULL;
+    dict_t *xattr = NULL;
+    unsigned char *md = NULL;
+    br_isignature_t *sign = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", object, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", linked_inode, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", fd, out);
+
+    this = object->this;
+
+    md = GF_MALLOC(SHA256_DIGEST_LENGTH, gf_common_mt_char);
+    if (!md) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_SAVING_HASH_FAILED,
+                "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto out;
+    }
+
+    ret = br_object_checksum(md, object, fd, iatt);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_CALC_CHECKSUM_FAILED,
+                "object-gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
+        goto free_signature;
+    }
+
+    sign = br_prepare_signature(md, SHA256_DIGEST_LENGTH,
+                                BR_SIGNATURE_TYPE_SHA256, object);
+    if (!sign) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+                "object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto free_signature;
+    }
+
+    xattr = dict_for_key_value(GLUSTERFS_SET_OBJECT_SIGNATURE, (void *)sign,
+                               signature_size(SHA256_DIGEST_LENGTH), _gf_true);
+
+    if (!xattr) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+                "dict-allocation object-gfid=%s", uuid_utoa(fd->inode->gfid),
+                NULL);
+        goto free_isign;
+    }
+
+    ret = syncop_fsetxattr(object->child->xl, fd, xattr, 0, NULL, NULL);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+                "fsetxattr object-gfid=%s", uuid_utoa(fd->inode->gfid), NULL);
+        goto unref_dict;
+    }
+
+    ret = 0;
+
+unref_dict:
+    dict_unref(xattr);
+free_isign:
+    GF_FREE(sign);
+free_signature:
+    GF_FREE(md);
+out:
+    return ret;
+}
+
+static int
+br_object_sign_softerror(int32_t op_errno)
+{
+    return ((op_errno == ENOENT) || (op_errno == ESTALE) ||
+            (op_errno == ENODATA));
+}
+
+void
+br_log_object(xlator_t *this, char *op, uuid_t gfid, int32_t op_errno)
+{
+    int softerror = br_object_sign_softerror(op_errno);
+    if (softerror) {
+        gf_msg_debug(this->name, 0,
+                     "%s() failed on object %s "
+                     "[reason: %s]",
+                     op, uuid_utoa(gfid), strerror(op_errno));
+    } else {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s",
+                op, "gfid=%s", uuid_utoa(gfid), NULL);
+    }
+}
+
+void
+br_log_object_path(xlator_t *this, char *op, const char *path, int32_t op_errno)
+{
+    int softerror = br_object_sign_softerror(op_errno);
+    if (softerror) {
+        gf_msg_debug(this->name, 0,
+                     "%s() failed on object %s "
+                     "[reason: %s]",
+                     op, path, strerror(op_errno));
+    } else {
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_OP_FAILED, "op=%s",
+                op, "path=%s", path, NULL);
+    }
+}
+
+static void
+br_trigger_sign(xlator_t *this, br_child_t *child, inode_t *linked_inode,
+                loc_t *loc, gf_boolean_t need_reopen)
+{
+    fd_t *fd = NULL;
+    int32_t ret = -1;
+    uint32_t val = 0;
+    dict_t *dict = NULL;
+    pid_t pid = GF_CLIENT_PID_BITD;
+
+    syncopctx_setfspid(&pid);
+
+    val = (need_reopen == _gf_true) ? BR_OBJECT_REOPEN : BR_OBJECT_RESIGN;
+
+    dict = dict_new();
+    if (!dict)
+        goto out;
+
+    ret = dict_set_uint32(dict, BR_REOPEN_SIGN_HINT_KEY, val);
+    if (ret)
+        goto cleanup_dict;
+
+    ret = -1;
+    fd = fd_create(linked_inode, 0);
+    if (!fd) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_FD_CREATE_FAILED,
+                "gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
+        goto cleanup_dict;
+    }
+
+    ret = syncop_open(child->xl, loc, O_RDWR, fd, NULL, NULL);
+    if (ret) {
+        br_log_object(this, "open", linked_inode->gfid, -ret);
+        goto unref_fd;
+    }
+
+    fd_bind(fd);
+
+    ret = syncop_fsetxattr(child->xl, fd, dict, 0, NULL, NULL);
+    if (ret)
+        br_log_object(this, "fsetxattr", linked_inode->gfid, -ret);
+
+    /* passthough: fd_unref() */
+
+unref_fd:
+    fd_unref(fd);
+cleanup_dict:
+    dict_unref(dict);
+out:
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_TRIGGER_SIGN_FAILED,
+                "gfid=%s", uuid_utoa(linked_inode->gfid), "reopen-hint-val=%d",
+                val, NULL);
+    }
+}
+
+static void
+br_object_resign(xlator_t *this, br_object_t *object, inode_t *linked_inode)
+{
+    loc_t loc = {
+        0,
+    };
+
+    loc.inode = inode_ref(linked_inode);
+    gf_uuid_copy(loc.gfid, linked_inode->gfid);
+
+    br_trigger_sign(this, object->child, linked_inode, &loc, _gf_false);
+
+    loc_wipe(&loc);
+}
+
+/**
+ * Sign a given object. This routine runs full throttle. There needs to be
+ * some form of priority scheduling and/or read burstness to avoid starving
+ * (or kicking) client I/O's.
+ */
+static int32_t
+br_sign_object(br_object_t *object)
+{
+    int32_t ret = -1;
+    inode_t *linked_inode = NULL;
+    xlator_t *this = NULL;
+    fd_t *fd = NULL;
+    struct iatt iatt = {
+        0,
+    };
+    pid_t pid = GF_CLIENT_PID_BITD;
+    br_sign_state_t sign_info = BR_SIGN_NORMAL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", object, out);
+
+    this = object->this;
+
+    /**
+     * FIXME: This is required as signing an object is restricted to
+     * clients with special frame->root->pid. Change the way client
+     * pid is set.
+     */
+    syncopctx_setfspid(&pid);
+
+    ret = br_object_lookup(this, object, &iatt, &linked_inode);
+    if (ret) {
+        br_log_object(this, "lookup", object->gfid, -ret);
+        goto out;
+    }
+
+    /**
+     * For fd's that have notified for reopening, we send an explicit
+     * open() followed by a dummy write() call. This triggers the
+     * actual signing of the object.
+     */
+    sign_info = ntohl(object->sign_info);
+    if (sign_info == BR_SIGN_REOPEN_WAIT) {
+        br_object_resign(this, object, linked_inode);
+        goto unref_inode;
+    }
+
+    ret = br_object_open(this, object, linked_inode, &fd);
+    if (!fd) {
+        br_log_object(this, "open", object->gfid, -ret);
+        goto unref_inode;
+    }
+
+    /**
+     * we have an open file descriptor on the object. from here on,
+     * do not be generous to file operation errors.
+     */
+    gf_msg_debug(this->name, 0, "Signing object [%s]",
+                 uuid_utoa(linked_inode->gfid));
+
+    ret = br_object_read_sign(linked_inode, fd, object, &iatt);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_READ_AND_SIGN_FAILED,
+                "gfid=%s", uuid_utoa(linked_inode->gfid), NULL);
+        goto unref_fd;
+    }
+
+    ret = 0;
+
+unref_fd:
+    fd_unref(fd);
+unref_inode:
+    inode_unref(linked_inode);
+out:
+    return ret;
+}
+
+static br_object_t *
+__br_pick_object(br_private_t *priv)
+{
+    br_object_t *object = NULL;
+
+    while (list_empty(&priv->obj_queue->objects)) {
+        pthread_cond_wait(&priv->object_cond, &priv->lock);
+    }
+
+    object = list_first_entry(&priv->obj_queue->objects, br_object_t, list);
+    list_del_init(&object->list);
+
+    return object;
+}
+
+/**
+ * This is the place where the signing of the objects is triggered.
+ */
+void *
+br_process_object(void *arg)
+{
+    xlator_t *this = NULL;
+    br_object_t *object = NULL;
+    br_private_t *priv = NULL;
+    int32_t ret = -1;
+
+    this = arg;
+    priv = this->private;
+
+    THIS = this;
+
+    for (;;) {
+        pthread_mutex_lock(&priv->lock);
+        {
+            object = __br_pick_object(priv);
+        }
+        pthread_mutex_unlock(&priv->lock);
+
+        ret = br_sign_object(object);
+        if (ret && !br_object_sign_softerror(-ret))
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_SIGN_FAILED,
+                    "gfid=%s", uuid_utoa(object->gfid), NULL);
+        GF_FREE(object);
+    }
+
+    return NULL;
+}
+
+/**
+ * This function gets kicked in once the object is expired from the
+ * timer wheel. This actually adds the object received via notification
+ * from the changelog to the queue from where the objects gets picked
+ * up for signing.
+ *
+ * This routine can be made lightweight by introducing an alternate
+ * timer-wheel API that dispatches _all_ expired objects in one-shot
+ * rather than an object at-a-time. This routine can then just simply
+ * be a call to list_splice_tail().
+ *
+ * NOTE: use call_time to instrument signing time in br_sign_object().
+ */
+void
+br_add_object_to_queue(struct gf_tw_timer_list *timer, void *data,
+                       unsigned long call_time)
+{
+    br_object_t *object = NULL;
+    xlator_t *this = NULL;
+    br_private_t *priv = NULL;
+
+    object = data;
+    this = object->this;
+    priv = this->private;
+
+    THIS = this;
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        list_add_tail(&object->list, &priv->obj_queue->objects);
+        pthread_cond_broadcast(&priv->object_cond);
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+    if (timer)
+        mem_put(timer);
+    return;
+}
+
+static br_object_t *
+br_initialize_object(xlator_t *this, br_child_t *child, changelog_event_t *ev)
+{
+    br_object_t *object = NULL;
+
+    object = GF_CALLOC(1, sizeof(*object), gf_br_mt_br_object_t);
+    if (!object)
+        goto out;
+    INIT_LIST_HEAD(&object->list);
+
+    object->this = this;
+    object->child = child;
+    gf_uuid_copy(object->gfid, ev->u.releasebr.gfid);
+
+    /* NOTE: it's BE, but no worry */
+    object->signedversion = ev->u.releasebr.version;
+    object->sign_info = ev->u.releasebr.sign_info;
+
+out:
+    return object;
+}
+
+static struct gf_tw_timer_list *
+br_initialize_timer(xlator_t *this, br_object_t *object, br_child_t *child,
+                    changelog_event_t *ev)
+{
+    br_private_t *priv = NULL;
+    struct gf_tw_timer_list *timer = NULL;
+
+    priv = this->private;
+
+    timer = mem_get0(child->timer_pool);
+    if (!timer)
+        goto out;
+    INIT_LIST_HEAD(&timer->entry);
+
+    timer->expires = priv->expiry_time;
+    if (!timer->expires)
+        timer->expires = 1;
+
+    timer->data = object;
+    timer->function = br_add_object_to_queue;
+    gf_tw_add_timer(priv->timer_wheel, timer);
+
+out:
+    return timer;
+}
+
+static int32_t
+br_schedule_object_reopen(xlator_t *this, br_object_t *object,
+                          br_child_t *child, changelog_event_t *ev)
+{
+    struct gf_tw_timer_list *timer = NULL;
+
+    timer = br_initialize_timer(this, object, child, ev);
+    if (!timer)
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SET_TIMER_FAILED,
+                "gfid=%s", uuid_utoa(object->gfid), NULL);
+    return timer ? 0 : -1;
+}
+
+static int32_t
+br_object_quicksign(xlator_t *this, br_object_t *object)
+{
+    br_add_object_to_queue(NULL, object, 0ULL);
+    return 0;
+}
+
+/**
+ * This callback function registered with the changelog is executed
+ * whenever a notification from the changelog is received. This should
+ * add the object (or the gfid) on which the notification has come to
+ * the timer-wheel with some expiry time.
+ *
+ * TODO: use mem-pool for allocations and maybe allocate timer and
+ * object as a single alloc and bifurcate their respective pointers.
+ */
+void
+br_brick_callback(void *xl, char *brick, void *data, changelog_event_t *ev)
+{
+    int32_t ret = 0;
+    uuid_t gfid = {
+        0,
+    };
+    xlator_t *this = NULL;
+    br_object_t *object = NULL;
+    br_child_t *child = NULL;
+    br_sign_state_t sign_info = BR_SIGN_INVALID;
+
+    this = xl;
+
+    GF_VALIDATE_OR_GOTO(this->name, ev, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    GF_ASSERT(ev->ev_type == CHANGELOG_OP_TYPE_BR_RELEASE);
+    GF_ASSERT(!gf_uuid_is_null(ev->u.releasebr.gfid));
+
+    gf_uuid_copy(gfid, ev->u.releasebr.gfid);
+
+    gf_msg_debug(this->name, 0, "RELEASE EVENT [GFID %s]", uuid_utoa(gfid));
+
+    child = br_get_child_from_brick_path(this, brick);
+    if (!child) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SUBVOL_FAILED,
+                "brick=%s", brick, NULL);
+        goto out;
+    }
+
+    object = br_initialize_object(this, child, ev);
+    if (!object) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
+                "object-gfid=%s", uuid_utoa(gfid), NULL);
+        goto out;
+    }
+
+    /* sanity check */
+    sign_info = ntohl(object->sign_info);
+    GF_ASSERT(sign_info != BR_SIGN_NORMAL);
+
+    if (sign_info == BR_SIGN_REOPEN_WAIT)
+        ret = br_schedule_object_reopen(this, object, child, ev);
+    else
+        ret = br_object_quicksign(this, object);
+
+    if (ret)
+        goto free_object;
+
+    gf_msg_debug(this->name, 0, "->callback: brick [%s], type [%d]\n", brick,
+                 ev->ev_type);
+    return;
+
+free_object:
+    GF_FREE(object);
+out:
+    return;
+}
+
+void
+br_fill_brick_spec(struct gf_brick_spec *brick, char *path)
+{
+    brick->brick_path = gf_strdup(path);
+    brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE;
+
+    brick->init = br_brick_init;
+    brick->fini = br_brick_fini;
+    brick->callback = br_brick_callback;
+    brick->connected = NULL;
+    brick->disconnected = NULL;
+}
+
+static gf_boolean_t
+br_check_object_need_sign(xlator_t *this, dict_t *xattr, br_child_t *child)
+{
+    int32_t ret = -1;
+    gf_boolean_t need_sign = _gf_false;
+    br_isignature_out_t *sign = NULL;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, xattr, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+
+    ret = dict_get_ptr(xattr, GLUSTERFS_GET_OBJECT_SIGNATURE, (void **)&sign);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_SIGN_FAILED,
+                "object-info", NULL);
+        goto out;
+    }
+
+    /* Object has been opened and hence dirty. Do not sign it */
+    if (sign->stale)
+        need_sign = _gf_true;
+
+out:
+    return need_sign;
+}
+
+int32_t
+br_prepare_loc(xlator_t *this, br_child_t *child, loc_t *parent,
+               gf_dirent_t *entry, loc_t *loc)
+{
+    int32_t ret = -1;
+    inode_t *inode = NULL;
+
+    inode = inode_grep(child->table, parent->inode, entry->d_name);
+    if (!inode)
+        loc->inode = inode_new(child->table);
+    else {
+        loc->inode = inode;
+        if (loc->inode->ia_type != IA_IFREG) {
+            gf_msg_debug(this->name, 0,
+                         "%s is not a regular "
+                         "file",
+                         entry->d_name);
+            ret = 0;
+            goto out;
+        }
+    }
+
+    loc->parent = inode_ref(parent->inode);
+    gf_uuid_copy(loc->pargfid, parent->inode->gfid);
+
+    ret = inode_path(parent->inode, entry->d_name, (char **)&loc->path);
+    if (ret < 0 || !loc->path) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_PATH_FAILED,
+                "inode_path=%s", entry->d_name, "parent-gfid=%s",
+                uuid_utoa(parent->inode->gfid), NULL);
+        goto out;
+    }
+
+    loc->name = strrchr(loc->path, '/');
+    if (loc->name)
+        loc->name++;
+
+    ret = 1;
+
+out:
+    return ret;
+}
+
+/**
+ * Oneshot crawler
+ * ---------------
+ * This is a catchup mechanism. Objects that remained unsigned from the
+ * last run for whatever reason (node crashes, reboots, etc..) become
+ * candidates for signing. This allows the signature to "catch up" with
+ * the current state of the object. Triggering signing is easy: perform
+ * an open() followed by a close() thereby resulting in call boomerang.
+ * (though not back to itself :))
+ */
+int
+bitd_oneshot_crawl(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                   void *data)
+{
+    int op_errno = 0;
+    br_child_t *child = NULL;
+    xlator_t *this = NULL;
+    loc_t loc = {
+        0,
+    };
+    struct iatt iatt = {
+        0,
+    };
+    struct iatt parent_buf = {
+        0,
+    };
+    dict_t *xattr = NULL;
+    int32_t ret = -1;
+    inode_t *linked_inode = NULL;
+    gf_boolean_t need_signing = _gf_false;
+    gf_boolean_t need_reopen = _gf_true;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", subvol, out);
+    GF_VALIDATE_OR_GOTO("bit-rot", data, out);
+
+    child = data;
+    this = child->this;
+
+    ret = br_prepare_loc(this, child, parent, entry, &loc);
+    if (!ret)
+        goto out;
+
+    ret = syncop_lookup(child->xl, &loc, &iatt, &parent_buf, NULL, NULL);
+    if (ret) {
+        br_log_object_path(this, "lookup", loc.path, -ret);
+        goto out;
+    }
+
+    linked_inode = inode_link(loc.inode, parent->inode, loc.name, &iatt);
+    if (linked_inode)
+        inode_lookup(linked_inode);
+
+    if (iatt.ia_type != IA_IFREG) {
+        gf_msg_debug(this->name, 0,
+                     "%s is not a regular file, "
+                     "skipping..",
+                     entry->d_name);
+        ret = 0;
+        goto unref_inode;
+    }
+
+    /**
+     * As of now, 2 cases  are possible and handled.
+     * 1) GlusterFS is upgraded from a previous version which does not
+     *    have any idea about bit-rot and have data in the filesystem.
+     *    In this case syncop_getxattr fails with ENODATA and the object
+     *    is signed. (In real, when crawler sends lookup, bit-rot-stub
+     *    creates the xattrs before returning lookup reply)
+     * 2) Bit-rot was not enabled or BitD was does for some reasons, during
+     *    which some files were created, but since BitD was down, were not
+     *    signed.
+     * If the file was just created and was being written some data when
+     * the down BitD came up, then bit-rot stub should be intelligent to
+     * identify this case (by comparing the ongoing version or by checking
+     * if there are any fds present for that inode) and handle properly.
+     */
+
+    if (bitd_is_bad_file(this, child, &loc, NULL)) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SKIP_OBJECT, "path=%s",
+                loc.path, NULL);
+        goto unref_inode;
+    }
+
+    ret = syncop_getxattr(child->xl, &loc, &xattr,
+                          GLUSTERFS_GET_OBJECT_SIGNATURE, NULL, NULL);
+    if (ret < 0) {
+        op_errno = -ret;
+        br_log_object(this, "getxattr", linked_inode->gfid, op_errno);
+
+        /**
+         * No need to sign the zero byte objects as the signing
+         * happens upon first modification of the object.
+         */
+        if (op_errno == ENODATA && (iatt.ia_size != 0))
+            need_signing = _gf_true;
+        if (op_errno == EINVAL)
+            gf_smsg(this->name, GF_LOG_WARNING, 0,
+                    BRB_MSG_PARTIAL_VERSION_PRESENCE, "gfid=%s",
+                    uuid_utoa(linked_inode->gfid), NULL);
+    } else {
+        need_signing = br_check_object_need_sign(this, xattr, child);
+
+        /*
+         * If we are here means, bitrot daemon has started. Is it just
+         * a simple restart of the daemon or is it started because the
+         * feature is enabled is something hard to determine. Hence,
+         * if need_signing is false (because bit-rot version and signature
+         * are present), then still go ahead and sign it.
+         */
+        if (!need_signing) {
+            need_signing = _gf_true;
+            need_reopen = _gf_true;
+        }
+    }
+
+    if (!need_signing)
+        goto unref_dict;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_TRIGGER_SIGN, "path=%s",
+            loc.path, "gfid=%s", uuid_utoa(linked_inode->gfid), "Brick-path=%s",
+            child->brick_path, NULL);
+    br_trigger_sign(this, child, linked_inode, &loc, need_reopen);
+
+    ret = 0;
+
+unref_dict:
+    if (xattr)
+        dict_unref(xattr);
+unref_inode:
+    inode_unref(linked_inode);
+out:
+    loc_wipe(&loc);
+
+    return ret;
+}
+
+#define BR_CRAWL_THROTTLE_COUNT 50
+#define BR_CRAWL_THROTTLE_ZZZ 5
+
+void *
+br_oneshot_signer(void *arg)
+{
+    loc_t loc = {
+        0,
+    };
+    xlator_t *this = NULL;
+    br_child_t *child = NULL;
+
+    child = arg;
+    this = child->this;
+
+    THIS = this;
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_START, "brick-path=%s",
+            child->brick_path, NULL);
+
+    loc.inode = child->table->root;
+    (void)syncop_ftw_throttle(child->xl, &loc, GF_CLIENT_PID_BITD, child,
+                              bitd_oneshot_crawl, BR_CRAWL_THROTTLE_COUNT,
+                              BR_CRAWL_THROTTLE_ZZZ);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CRAWLING_FINISH,
+            "brick-path=%s", child->brick_path, NULL);
+
+    return NULL;
+}
+
+static void
+br_set_child_state(br_child_t *child, br_child_state_t state)
+{
+    pthread_mutex_lock(&child->lock);
+    {
+        _br_set_child_state(child, state);
+    }
+    pthread_mutex_unlock(&child->lock);
+}
+
+/**
+ * At this point a thread is spawned to crawl the filesystem (in
+ * tortoise pace) to sign objects that were not signed in previous run(s).
+ * Such objects are identified by examining it's dirtyness and timestamp.
+ *
+ *    pick object:
+ *       signature_is_stale() && (object_timestamp() <= stub_init_time())
+ *
+ * Also, we register to the changelog library to subscribe for event
+ * notifications.
+ */
+static int32_t
+br_enact_signer(xlator_t *this, br_child_t *child, br_stub_init_t *stub)
+{
+    int32_t ret = 0;
+    br_private_t *priv = NULL;
+    struct gf_brick_spec *brick = NULL;
+
+    priv = this->private;
+
+    brick = GF_CALLOC(1, sizeof(struct gf_brick_spec),
+                      gf_common_mt_gf_brick_spec_t);
+    if (!brick)
+        goto error_return;
+
+    br_fill_brick_spec(brick, stub->export);
+    ret = gf_changelog_register_generic(brick, 1, 1,
+                                        this->ctx->cmd_args.log_file, -1, this);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, errno, BRB_MSG_REGISTER_FAILED, NULL);
+        goto dealloc;
+    }
+
+    child->threadrunning = 0;
+    ret = gf_thread_create(&child->thread, NULL, br_oneshot_signer, child,
+                           "brosign");
+    if (ret)
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_SPAWN_FAILED,
+                "FS-crawler-thread", NULL);
+    else
+        child->threadrunning = 1;
+
+    /* it's OK to continue, "old" objects would be signed when modified */
+    list_add_tail(&child->list, &priv->signing);
+    return 0;
+
+dealloc:
+    GF_FREE(brick);
+error_return:
+    return -1;
+}
+
+static int32_t
+br_launch_scrubber(xlator_t *this, br_child_t *child, struct br_scanfs *fsscan,
+                   struct br_scrubber *fsscrub)
+{
+    int32_t ret = -1;
+    br_private_t *priv = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+
+    scrub_monitor = &priv->scrub_monitor;
+    ret = gf_thread_create(&child->thread, NULL, br_fsscanner, child,
+                           "brfsscan");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ALERT, 0, BRB_MSG_SPAWN_FAILED,
+                "bitrot-scrubber-daemon Brick-path=%s", child->brick_path,
+                NULL);
+        goto error_return;
+    }
+
+    /* Signal monitor to kick off state machine*/
+    pthread_mutex_lock(&scrub_monitor->mutex);
+    {
+        if (!scrub_monitor->inited)
+            pthread_cond_signal(&scrub_monitor->cond);
+        scrub_monitor->inited = _gf_true;
+    }
+    pthread_mutex_unlock(&scrub_monitor->mutex);
+
+    /**
+     * Everything has been setup.. add this subvolume to scrubbers
+     * list.
+     */
+    pthread_mutex_lock(&fsscrub->mutex);
+    {
+        list_add_tail(&child->list, &fsscrub->scrublist);
+        pthread_cond_broadcast(&fsscrub->cond);
+    }
+    pthread_mutex_unlock(&fsscrub->mutex);
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+static int32_t
+br_enact_scrubber(xlator_t *this, br_child_t *child)
+{
+    int32_t ret = 0;
+    br_private_t *priv = NULL;
+    struct br_scanfs *fsscan = NULL;
+    struct br_scrubber *fsscrub = NULL;
+
+    priv = this->private;
+
+    fsscan = &child->fsscan;
+    fsscrub = &priv->fsscrub;
+
+    /**
+     * if this child already witnesses a successful connection earlier
+     * there's no need to initialize mutexes, condvars, etc..
+     */
+    if (_br_child_witnessed_connection(child))
+        return br_launch_scrubber(this, child, fsscan, fsscrub);
+
+    LOCK_INIT(&fsscan->entrylock);
+    pthread_mutex_init(&fsscan->waitlock, NULL);
+    pthread_cond_init(&fsscan->waitcond, NULL);
+
+    fsscan->entries = 0;
+    INIT_LIST_HEAD(&fsscan->queued);
+    INIT_LIST_HEAD(&fsscan->ready);
+
+    ret = br_launch_scrubber(this, child, fsscan, fsscrub);
+    if (ret)
+        goto error_return;
+
+    return 0;
+
+error_return:
+    LOCK_DESTROY(&fsscan->entrylock);
+    pthread_mutex_destroy(&fsscan->waitlock);
+    pthread_cond_destroy(&fsscan->waitcond);
+
+    return -1;
+}
+
+static int32_t
+br_child_enaction(xlator_t *this, br_child_t *child, br_stub_init_t *stub)
+{
+    int32_t ret = -1;
+    br_private_t *priv = this->private;
+
+    pthread_mutex_lock(&child->lock);
+    {
+        if (priv->iamscrubber)
+            ret = br_enact_scrubber(this, child);
+        else
+            ret = br_enact_signer(this, child, stub);
+
+        if (!ret) {
+            child->witnessed = 1;
+            _br_set_child_state(child, BR_CHILD_STATE_CONNECTED);
+            gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_CONNECTED_TO_BRICK,
+                    "brick-path=%s", child->brick_path, NULL);
+        }
+    }
+    pthread_mutex_unlock(&child->lock);
+
+    return ret;
+}
+
+/**
+ * This routine fetches various attributes associated with a child which
+ * is basically a subvolume. Attributes include brick path and the stub
+ * birth time. This is done by performing a lookup on the root followed
+ * by getxattr() on a virtual key. Depending on the configuration, the
+ * process either acts as a signer or a scrubber.
+ */
+int32_t
+br_brick_connect(xlator_t *this, br_child_t *child)
+{
+    int32_t ret = -1;
+    loc_t loc = {
+        0,
+    };
+    struct iatt buf = {
+        0,
+    };
+    struct iatt parent = {
+        0,
+    };
+    br_stub_init_t *stub = NULL;
+    dict_t *xattr = NULL;
+    int op_errno = 0;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", this, out);
+    GF_VALIDATE_OR_GOTO(this->name, child, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    br_child_set_scrub_state(child, _gf_false);
+    br_set_child_state(child, BR_CHILD_STATE_INITIALIZING);
+
+    loc.inode = inode_ref(child->table->root);
+    gf_uuid_copy(loc.gfid, loc.inode->gfid);
+    loc.path = gf_strdup("/");
+
+    ret = syncop_lookup(child->xl, &loc, &buf, &parent, NULL, NULL);
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_LOOKUP_FAILED,
+                NULL);
+        goto wipeloc;
+    }
+
+    ret = syncop_getxattr(child->xl, &loc, &xattr,
+                          GLUSTERFS_GET_BR_STUB_INIT_TIME, NULL, NULL);
+    if (ret) {
+        op_errno = -ret;
+        ret = -1;
+        gf_smsg(this->name, GF_LOG_ERROR, op_errno, BRB_MSG_GET_INFO_FAILED,
+                NULL);
+        goto wipeloc;
+    }
+
+    ret = dict_get_ptr(xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME, (void **)&stub);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_GET_INFO_FAILED, NULL);
+        goto free_dict;
+    }
+
+    memcpy(child->brick_path, stub->export, strlen(stub->export) + 1);
+    child->tv.tv_sec = ntohl(stub->timebuf[0]);
+    child->tv.tv_usec = ntohl(stub->timebuf[1]);
+
+    ret = br_child_enaction(this, child, stub);
+
+free_dict:
+    dict_unref(xattr);
+wipeloc:
+    loc_wipe(&loc);
+out:
+    if (ret)
+        br_set_child_state(child, BR_CHILD_STATE_CONNFAILED);
+    return ret;
+}
+
+/* TODO: cleanup signer */
+static int32_t
+br_cleanup_signer(xlator_t *this, br_child_t *child)
+{
+    return 0;
+}
+
+static int32_t
+br_cleanup_scrubber(xlator_t *this, br_child_t *child)
+{
+    int32_t ret = 0;
+    br_private_t *priv = NULL;
+    struct br_scrubber *fsscrub = NULL;
+    struct br_monitor *scrub_monitor = NULL;
+
+    priv = this->private;
+    fsscrub = &priv->fsscrub;
+    scrub_monitor = &priv->scrub_monitor;
+
+    if (_br_is_child_scrub_active(child)) {
+        scrub_monitor->active_child_count--;
+        br_child_set_scrub_state(child, _gf_false);
+    }
+
+    /**
+     * 0x0: child (brick) goes out of rotation
+     *
+     * This is fully safe w.r.t. entries for this child being actively
+     * scrubbed. Each of the scrubber thread(s) would finish scrubbing
+     * the entry (probably failing due to disconnection) and either
+     * putting the entry back into the queue or continuing further.
+     * Either way, pending entries for this child's queue need not be
+     * drained; entries just sit there in the queued/ready list to be
+     * consumed later upon re-connection.
+     */
+    pthread_mutex_lock(&fsscrub->mutex);
+    {
+        list_del_init(&child->list);
+    }
+    pthread_mutex_unlock(&fsscrub->mutex);
+
+    /**
+     * 0x1: cleanup scanner thread
+     *
+     * The pending timer needs to be removed _after_ cleaning up the
+     * filesystem scanner (scheduling the next scrub time is not a
+     * cancellation point).
+     */
+    ret = gf_thread_cleanup_xint(child->thread);
+    if (ret)
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUB_THREAD_CLEANUP, NULL);
+
+    gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_SCRUBBER_CLEANED,
+            "brick-path=%s", child->brick_path, NULL);
+
+    return 0;
+}
+
+/**
+ * OK.. this child has made it's mind to go down the drain. So,
+ * let's clean up what it touched. (NOTE: there's no need to clean
+ * the inode table, it's just reused taking care of stale inodes)
+ */
+int32_t
+br_brick_disconnect(xlator_t *this, br_child_t *child)
+{
+    int32_t ret = 0;
+    struct br_monitor *scrub_monitor = NULL;
+    br_private_t *priv = this->private;
+
+    scrub_monitor = &priv->scrub_monitor;
+
+    /* Lock order should be wakelock and then child lock to
+     * dead locks.
+     */
+    pthread_mutex_lock(&scrub_monitor->wakelock);
+    {
+        pthread_mutex_lock(&child->lock);
+        {
+            if (!_br_is_child_connected(child))
+                goto unblock;
+
+            /* child is on death row.. */
+            _br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED);
+
+            if (priv->iamscrubber)
+                ret = br_cleanup_scrubber(this, child);
+            else
+                ret = br_cleanup_signer(this, child);
+        }
+    unblock:
+        pthread_mutex_unlock(&child->lock);
+    }
+    pthread_mutex_unlock(&scrub_monitor->wakelock);
+
+    return ret;
+}
+
+/**
+ * This function is executed in a separate thread. The thread gets the
+ * brick from where CHILD_UP has received from the queue and gets the
+ * information regarding that brick (such as brick path).
+ */
+void *
+br_handle_events(void *arg)
+{
+    int32_t ret = 0;
+    xlator_t *this = NULL;
+    br_private_t *priv = NULL;
+    br_child_t *child = NULL;
+    struct br_child_event *childev = NULL;
+
+    this = arg;
+    priv = this->private;
+
+    /*
+     * Since, this is the topmost xlator, THIS has to be set by bit-rot
+     * xlator itself (STACK_WIND won't help in this case). Also it has
+     * to be done for each thread that gets spawned. Otherwise, a new
+     * thread will get global_xlator's pointer when it does "THIS".
+     */
+    THIS = this;
+
+    while (1) {
+        pthread_mutex_lock(&priv->lock);
+        {
+            while (list_empty(&priv->bricks))
+                pthread_cond_wait(&priv->cond, &priv->lock);
+
+            childev = list_first_entry(&priv->bricks, struct br_child_event,
+                                       list);
+            list_del_init(&childev->list);
+        }
+        pthread_mutex_unlock(&priv->lock);
+
+        child = childev->child;
+        ret = childev->call(this, child);
+        if (ret)
+            gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_SUBVOL_CONNECT_FAILED,
+                    "name=%s", child->xl->name, NULL);
+        GF_FREE(childev);
+    }
+
+    return NULL;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+    int32_t ret = -1;
+
+    if (!this)
+        return ret;
+
+    ret = xlator_mem_acct_init(this, gf_br_stub_mt_end + 1);
+
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_WARNING, 0, BRB_MSG_MEM_ACNT_FAILED, NULL);
+        return ret;
+    }
+
+    return ret;
+}
+
+static void
+_br_qchild_event(xlator_t *this, br_child_t *child, br_child_handler *call)
+{
+    br_private_t *priv = NULL;
+    struct br_child_event *childev = NULL;
+
+    priv = this->private;
+
+    childev = GF_CALLOC(1, sizeof(*childev), gf_br_mt_br_child_event_t);
+    if (!childev) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_EVENT_UNHANDLED,
+                "Brick-name=%s", child->xl->name, NULL);
+        return;
+    }
+
+    INIT_LIST_HEAD(&childev->list);
+    childev->this = this;
+    childev->child = child;
+    childev->call = call;
+
+    list_add_tail(&childev->list, &priv->bricks);
+}
+
+int
+br_scrubber_status_get(xlator_t *this, dict_t **dict)
+{
+    int ret = -1;
+    br_private_t *priv = NULL;
+    struct br_scrub_stats *scrub_stats = NULL;
+
+    priv = this->private;
+
+    GF_VALIDATE_OR_GOTO("bit-rot", priv, out);
+
+    scrub_stats = &priv->scrub_stat;
+
+    ret = br_get_bad_objects_list(this, dict);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to collect corrupt "
+                     "files");
+    }
+
+    ret = dict_set_int8(*dict, "scrub-running", scrub_stats->scrub_running);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed setting scrub_running "
+                     "entry to the dictionary");
+    }
+
+    ret = dict_set_uint64(*dict, "scrubbed-files", scrub_stats->scrubbed_files);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to setting scrubbed file "
+                     "entry to the dictionary");
+    }
+
+    ret = dict_set_uint64(*dict, "unsigned-files", scrub_stats->unsigned_files);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set unsigned file count"
+                     " entry to the dictionary");
+    }
+
+    ret = dict_set_uint64(*dict, "scrub-duration", scrub_stats->scrub_duration);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set scrub duration"
+                     " entry to the dictionary");
+    }
+
+    ret = dict_set_dynstr_with_alloc(*dict, "last-scrub-time",
+                                     scrub_stats->last_scrub_time);
+    if (ret) {
+        gf_msg_debug(this->name, 0,
+                     "Failed to set "
+                     "last scrub time value");
+    }
+
+out:
+    return ret;
+}
+
+int
+notify(xlator_t *this, int32_t event, void *data, ...)
+{
+    int idx = -1;
+    int ret = -1;
+    xlator_t *subvol = NULL;
+    br_child_t *child = NULL;
+    br_private_t *priv = NULL;
+    dict_t *output = NULL;
+    va_list ap;
+    struct br_monitor *scrub_monitor = NULL;
+
+    subvol = (xlator_t *)data;
+    priv = this->private;
+    scrub_monitor = &priv->scrub_monitor;
+
+    gf_msg_trace(this->name, 0, "Notification received: %d", event);
+
+    idx = br_find_child_index(this, subvol);
+
+    switch (event) {
+        case GF_EVENT_CHILD_UP:
+            if (idx < 0) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL,
+                        "event=%d", event, NULL);
+                goto out;
+            }
+
+            pthread_mutex_lock(&priv->lock);
+            {
+                child = &priv->children[idx];
+                if (child->child_up == 1)
+                    goto unblock_0;
+                priv->up_children++;
+
+                child->child_up = 1;
+                child->xl = subvol;
+                if (!child->table)
+                    child->table = inode_table_new(4096, subvol);
+
+                _br_qchild_event(this, child, br_brick_connect);
+                pthread_cond_signal(&priv->cond);
+            }
+        unblock_0:
+            pthread_mutex_unlock(&priv->lock);
+
+            if (priv->up_children == priv->child_count)
+                default_notify(this, event, data);
+            break;
+
+        case GF_EVENT_CHILD_DOWN:
+            if (idx < 0) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_INVALID_SUBVOL,
+                        "event=%d", event, NULL);
+                goto out;
+            }
+
+            pthread_mutex_lock(&priv->lock);
+            {
+                child = &priv->children[idx];
+                if (child->child_up == 0)
+                    goto unblock_1;
+
+                child->child_up = 0;
+                priv->up_children--;
+
+                _br_qchild_event(this, child, br_brick_disconnect);
+                pthread_cond_signal(&priv->cond);
+            }
+        unblock_1:
+            pthread_mutex_unlock(&priv->lock);
+
+            if (priv->up_children == 0)
+                default_notify(this, event, data);
+            break;
+
+        case GF_EVENT_SCRUB_STATUS:
+            gf_msg_debug(this->name, GF_LOG_INFO,
+                         "BitRot scrub status "
+                         "called");
+            va_start(ap, data);
+            output = va_arg(ap, dict_t *);
+            va_end(ap);
+
+            ret = br_scrubber_status_get(this, &output);
+            gf_msg_debug(this->name, 0, "returning %d", ret);
+            break;
+
+        case GF_EVENT_SCRUB_ONDEMAND:
+            gf_log(this->name, GF_LOG_INFO,
+                   "BitRot scrub ondemand "
+                   "called");
+
+            if (scrub_monitor->state != BR_SCRUB_STATE_PENDING) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRB_MSG_RESCHEDULE_SCRUBBER_FAILED, "Current-state=%d",
+                        scrub_monitor->state, NULL);
+                return -2;
+            }
+
+            /* Needs synchronization with reconfigure thread */
+            pthread_mutex_lock(&priv->lock);
+            {
+                ret = br_scrub_state_machine(this, _gf_true);
+            }
+            pthread_mutex_unlock(&priv->lock);
+
+            if (ret) {
+                gf_smsg(this->name, GF_LOG_ERROR, 0,
+                        BRB_MSG_COULD_NOT_SCHEDULE_SCRUB, NULL);
+            }
+            gf_msg_debug(this->name, 0, "returning %d", ret);
+            break;
+        default:
+            default_notify(this, event, data);
+    }
+
+out:
+    return 0;
+}
+
+static void
+br_fini_signer(xlator_t *this, br_private_t *priv)
+{
+    int i = 0;
+
+    if (priv == NULL)
+        return;
+
+    for (; i < priv->signer_th_count; i++) {
+        (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]);
+    }
+    GF_FREE(priv->obj_queue->workers);
+
+    pthread_cond_destroy(&priv->object_cond);
+}
+
+/**
+ * Initialize signer specific structures, spawn worker threads.
+ */
+
+static int32_t
+br_init_signer(xlator_t *this, br_private_t *priv)
+{
+    int i = 0;
+    int32_t ret = -1;
+
+    /* initialize gfchangelog xlator context */
+    ret = gf_changelog_init(this);
+    if (ret)
+        goto out;
+
+    pthread_cond_init(&priv->object_cond, NULL);
+
+    priv->obj_queue = GF_CALLOC(1, sizeof(*priv->obj_queue),
+                                gf_br_mt_br_ob_n_wk_t);
+    if (!priv->obj_queue)
+        goto cleanup_cond;
+    INIT_LIST_HEAD(&priv->obj_queue->objects);
+
+    priv->obj_queue->workers = GF_CALLOC(
+        priv->signer_th_count, sizeof(pthread_t), gf_br_mt_br_worker_t);
+    if (!priv->obj_queue->workers)
+        goto cleanup_obj_queue;
+
+    for (i = 0; i < priv->signer_th_count; i++) {
+        ret = gf_thread_create(&priv->obj_queue->workers[i], NULL,
+                               br_process_object, this, "brpobj");
+        if (ret != 0) {
+            gf_smsg(this->name, GF_LOG_ERROR, -ret,
+                    BRB_MSG_THREAD_CREATION_FAILED, NULL);
+            ret = -1;
+            goto cleanup_threads;
+        }
+    }
+
+    return 0;
+
+cleanup_threads:
+    for (i--; i >= 0; i--) {
+        (void)gf_thread_cleanup_xint(priv->obj_queue->workers[i]);
+    }
+    GF_FREE(priv->obj_queue->workers);
+
+cleanup_obj_queue:
+    GF_FREE(priv->obj_queue);
+
+cleanup_cond:
+    /* that's explicit */
+    pthread_cond_destroy(&priv->object_cond);
+out:
+    return -1;
+}
+
+/**
+ * For signer, only rate limit CPU usage (during hash calculation) when
+ * compiled with -DBR_RATE_LIMIT_SIGNER cflags, else let it run full
+ * throttle.
+ */
+static int32_t
+br_rate_limit_signer(xlator_t *this, int child_count, int numbricks)
+{
+    br_private_t *priv = NULL;
+    tbf_opspec_t spec = {
+        0,
+    };
+
+    priv = this->private;
+
+    spec.op = TBF_OP_HASH;
+    spec.rate = 0;
+    spec.maxlimit = 0;
+
+    /**
+     * OK. Most implementations of TBF I've come across generate tokens
+     * every second (UML, etc..) and some chose sub-second granularity
+     * (blk-iothrottle cgroups). TBF algorithm itself does not enforce
+     * any logic for choosing generation interval and it seems pretty
+     * logical as one could jack up token count per interval w.r.t.
+     * generation rate.
+     *
+     * Value used here is chosen based on a series of test(s) performed
+     * to balance object signing time and not maxing out on all available
+     * CPU cores. It's obvious to have seconds granularity and jack up
+     * token count per interval, thereby achieving close to similar
+     * results. Let's stick to this as it seems to be working fine for
+     * the set of ops that are throttled.
+     **/
+    spec.token_gen_interval = 600000; /* In usec */
+
+#ifdef BR_RATE_LIMIT_SIGNER
+
+    double contribution = 0;
+    contribution = ((double)1 - ((double)child_count / (double)numbricks));
+    if (contribution == 0)
+        contribution = 1;
+    spec.rate = BR_HASH_CALC_READ_SIZE * contribution;
+    spec.maxlimit = priv->signer_th_count * BR_HASH_CALC_READ_SIZE;
+
+#endif
+
+    if (!spec.rate)
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
+                "FULL THROTTLE", NULL);
+    else
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_RATE_LIMIT_INFO,
+                "tokens/sec-rate=%lu", spec.rate, "maxlimit=%lu", spec.maxlimit,
+                NULL);
+
+    priv->tbf = tbf_init(&spec, 1);
+    return priv->tbf ? 0 : -1;
+}
+
+static int32_t
+br_signer_handle_options(xlator_t *this, br_private_t *priv, dict_t *options)
+{
+    if (options) {
+        GF_OPTION_RECONF("expiry-time", priv->expiry_time, options, uint32,
+                         error_return);
+        GF_OPTION_RECONF("signer-threads", priv->signer_th_count, options,
+                         uint32, error_return);
+    } else {
+        GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return);
+        GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32,
+                       error_return);
+    }
+
+    return 0;
+
+error_return:
+    return -1;
+}
+
+static int32_t
+br_signer_init(xlator_t *this, br_private_t *priv)
+{
+    int32_t ret = 0;
+    int numbricks = 0;
+
+    GF_OPTION_INIT("expiry-time", priv->expiry_time, uint32, error_return);
+    GF_OPTION_INIT("brick-count", numbricks, int32, error_return);
+    GF_OPTION_INIT("signer-threads", priv->signer_th_count, uint32,
+                   error_return);
+
+    ret = br_rate_limit_signer(this, priv->child_count, numbricks);
+    if (ret)
+        goto error_return;
+
+    ret = br_init_signer(this, priv);
+    if (ret)
+        goto cleanup_tbf;
+
+    return 0;
+
+cleanup_tbf:
+    /* cleanup TBF */
+error_return:
+    return -1;
+}
+
+static void
+br_free_scrubber_monitor(xlator_t *this, br_private_t *priv)
+{
+    struct br_monitor *scrub_monitor = &priv->scrub_monitor;
+
+    if (scrub_monitor->timer) {
+        (void)gf_tw_del_timer(priv->timer_wheel, scrub_monitor->timer);
+
+        GF_FREE(scrub_monitor->timer);
+        scrub_monitor->timer = NULL;
+    }
+
+    (void)gf_thread_cleanup_xint(scrub_monitor->thread);
+
+    /* Clean up cond and mutex variables */
+    pthread_mutex_destroy(&scrub_monitor->mutex);
+    pthread_cond_destroy(&scrub_monitor->cond);
+
+    pthread_mutex_destroy(&scrub_monitor->wakelock);
+    pthread_cond_destroy(&scrub_monitor->wakecond);
+
+    pthread_mutex_destroy(&scrub_monitor->donelock);
+    pthread_cond_destroy(&scrub_monitor->donecond);
+
+    LOCK_DESTROY(&scrub_monitor->lock);
+}
+
+static void
+br_free_children(xlator_t *this, br_private_t *priv, int count)
+{
+    br_child_t *child = NULL;
+
+    for (--count; count >= 0; count--) {
+        child = &priv->children[count];
+        mem_pool_destroy(child->timer_pool);
+        pthread_mutex_destroy(&child->lock);
+    }
+
+    GF_FREE(priv->children);
+    priv->children = NULL;
+}
+
+static int
+br_init_children(xlator_t *this, br_private_t *priv)
+{
+    int i = 0;
+    br_child_t *child = NULL;
+    xlator_list_t *trav = NULL;
+
+    priv->child_count = xlator_subvolume_count(this);
+    priv->children = GF_CALLOC(priv->child_count, sizeof(*priv->children),
+                               gf_br_mt_br_child_t);
+    if (!priv->children)
+        goto err;
+
+    trav = this->children;
+    while (trav) {
+        child = &priv->children[i];
+
+        pthread_mutex_init(&child->lock, NULL);
+        child->witnessed = 0;
+
+        br_set_child_state(child, BR_CHILD_STATE_DISCONNECTED);
+
+        child->this = this;
+        child->xl = trav->xlator;
+
+        child->timer_pool = mem_pool_new(struct gf_tw_timer_list, 4096);
+        if (!child->timer_pool) {
+            gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_MEM_POOL_ALLOC,
+                    NULL);
+            errno = ENOMEM;
+            goto freechild;
+        }
+
+        INIT_LIST_HEAD(&child->list);
+
+        i++;
+        trav = trav->next;
+    }
+
+    return 0;
+
+freechild:
+    br_free_children(this, priv, i);
+err:
+    return -1;
+}
+
+int32_t
+init(xlator_t *this)
+{
+    int32_t ret = -1;
+    br_private_t *priv = NULL;
+
+    if (!this->children) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_CHILD, NULL);
+        goto out;
+    }
+
+    priv = GF_CALLOC(1, sizeof(*priv), gf_br_mt_br_private_t);
+    if (!priv) {
+        gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY, NULL);
+        goto out;
+    }
+
+    GF_OPTION_INIT("scrubber", priv->iamscrubber, bool, free_priv);
+
+    ret = br_init_children(this, priv);
+    if (ret)
+        goto free_priv;
+
+    pthread_mutex_init(&priv->lock, NULL);
+    pthread_cond_init(&priv->cond, NULL);
+
+    INIT_LIST_HEAD(&priv->bricks);
+    INIT_LIST_HEAD(&priv->signing);
+
+    priv->timer_wheel = glusterfs_ctx_tw_get(this->ctx);
+    if (!priv->timer_wheel) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_TIMER_WHEEL_UNAVAILABLE,
+                NULL);
+        goto cleanup;
+    }
+
+    this->private = priv;
+
+    if (!priv->iamscrubber) {
+        ret = br_signer_init(this, priv);
+        if (!ret)
+            ret = br_signer_handle_options(this, priv, NULL);
+    } else {
+        ret = br_scrubber_init(this, priv);
+        if (!ret)
+            ret = br_scrubber_handle_options(this, priv, NULL);
+    }
+
+    if (ret)
+        goto cleanup;
+
+    ret = gf_thread_create(&priv->thread, NULL, br_handle_events, this,
+                           "brhevent");
+    if (ret != 0) {
+        gf_smsg(this->name, GF_LOG_ERROR, -ret, BRB_MSG_THREAD_CREATION_FAILED,
+                NULL);
+        ret = -1;
+    }
+
+    if (!ret) {
+        gf_smsg(this->name, GF_LOG_INFO, 0, BRB_MSG_BITROT_LOADED, "mode=%s",
+                (priv->iamscrubber) ? "SCRUBBER" : "SIGNER", NULL);
+        return 0;
+    }
+
+cleanup:
+    (void)pthread_cond_destroy(&priv->cond);
+    (void)pthread_mutex_destroy(&priv->lock);
+
+    br_free_children(this, priv, priv->child_count);
+
+free_priv:
+    GF_FREE(priv);
+out:
+    this->private = NULL;
+    return -1;
+}
+
+void
+fini(xlator_t *this)
+{
+    br_private_t *priv = this->private;
+
+    if (!priv)
+        return;
+
+    if (!priv->iamscrubber)
+        br_fini_signer(this, priv);
+    else
+        (void)br_free_scrubber_monitor(this, priv);
+
+    br_free_children(this, priv, priv->child_count);
+
+    this->private = NULL;
+    GF_FREE(priv);
+
+    glusterfs_ctx_tw_put(this->ctx);
+
+    return;
+}
+
+static void
+br_reconfigure_monitor(xlator_t *this)
+{
+    int32_t ret = 0;
+
+    ret = br_scrub_state_machine(this, _gf_false);
+    if (ret) {
+        gf_smsg(this->name, GF_LOG_ERROR, 0, BRB_MSG_COULD_NOT_SCHEDULE_SCRUB,
+                NULL);
+    }
+}
+
+static int
+br_reconfigure_scrubber(xlator_t *this, dict_t *options)
+{
+    int32_t ret = -1;
+    br_private_t *priv = NULL;
+
+    priv = this->private;
+
+    pthread_mutex_lock(&priv->lock);
+    {
+        ret = br_scrubber_handle_options(this, priv, options);
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+    if (ret)
+        goto err;
+
+    /* change state for all _up_ subvolume(s) */
+    pthread_mutex_lock(&priv->lock);
+    {
+        br_reconfigure_monitor(this);
+    }
+    pthread_mutex_unlock(&priv->lock);
+
+err:
+    return ret;
+}
+
+static int
+br_reconfigure_signer(xlator_t *this, dict_t *options)
+{
+    br_private_t *priv = this->private;
+
+    return br_signer_handle_options(this, priv, options);
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+    int ret = 0;
+    br_private_t *priv = NULL;
+
+    priv = this->private;
+
+    if (priv->iamscrubber)
+        ret = br_reconfigure_scrubber(this, options);
+    else
+        ret = br_reconfigure_signer(this, options);
+
+    return ret;
+}
+
+struct xlator_fops fops;
+
+struct xlator_cbks cbks;
+
+struct volume_options options[] = {
+    {
+        .key = {"expiry-time"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = SIGNING_TIMEOUT,
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Waiting time for an object on which it waits "
+                       "before it is signed",
+    },
+    {
+        .key = {"brick-count"},
+        .type = GF_OPTION_TYPE_STR,
+        .description = "Total number of bricks for the current node for "
+                       "all volumes in the trusted storage pool.",
+    },
+    {
+        .key = {"scrubber", "scrub"},
+        .type = GF_OPTION_TYPE_BOOL,
+        .default_value = "false",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE | OPT_FLAG_FORCE,
+        .description = "option to run as a scrubber",
+    },
+    {
+        .key = {"scrub-throttle"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "lazy",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Scrub-throttle value is a measure of how fast "
+                       "or slow the scrubber scrubs the filesystem for "
+                       "volume <VOLNAME>",
+    },
+    {
+        .key = {"scrub-freq"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "biweekly",
+        .op_version = {GD_OP_VERSION_3_7_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Scrub frequency for volume <VOLNAME>",
+    },
+    {
+        .key = {"scrub-state"},
+        .type = GF_OPTION_TYPE_STR,
+        .default_value = "active",
+        .op_version = {GD_OP_VERSION_4_0_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Pause/Resume scrub. Upon resume, scrubber "
+                       "continues from where it left off.",
+    },
+    {
+        .key = {"signer-threads"},
+        .type = GF_OPTION_TYPE_INT,
+        .default_value = BR_WORKERS,
+        .op_version = {GD_OP_VERSION_8_0},
+        .flags = OPT_FLAG_SETTABLE,
+        .description = "Number of signing process threads. As a best "
+                       "practice, set this to the number of processor cores",
+    },
+    {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+    .init = init,
+    .fini = fini,
+    .notify = notify,
+    .reconfigure = reconfigure,
+    .mem_acct_init = mem_acct_init,
+    .op_version = {1}, /* Present from the initial version */
+    .fops = &fops,
+    .cbks = &cbks,
+    .options = options,
+    .identifier = "bit-rot",
+    .category = GF_MAINTAINED,
+};
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
new file mode 100644
index 00000000000..8ac7dcdac3d
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -0,0 +1,302 @@
+/*
+   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+   This file is part of GlusterFS.
+
+   This file is licensed to you under your choice of the GNU Lesser
+   General Public License, version 3 or any later version (LGPLv3 or
+   later), or the GNU General Public License, version 2 (GPLv2), in all
+   cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BIT_ROT_H__
+#define __BIT_ROT_H__
+
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/xlator.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include "changelog.h"
+#include "timer-wheel.h"
+
+#include <glusterfs/throttle-tbf.h>
+#include "bit-rot-ssm.h"
+
+#include "bit-rot-common.h"
+#include "bit-rot-stub-mem-types.h"
+#include "bit-rot-scrub-status.h"
+
+#include <openssl/sha.h>
+
+typedef enum scrub_throttle {
+    BR_SCRUB_THROTTLE_VOID = -1,
+    BR_SCRUB_THROTTLE_LAZY = 0,
+    BR_SCRUB_THROTTLE_NORMAL = 1,
+    BR_SCRUB_THROTTLE_AGGRESSIVE = 2,
+    BR_SCRUB_THROTTLE_STALLED = 3,
+} scrub_throttle_t;
+
+typedef enum scrub_freq {
+    BR_FSSCRUB_FREQ_HOURLY = 1,
+    BR_FSSCRUB_FREQ_DAILY,
+    BR_FSSCRUB_FREQ_WEEKLY,
+    BR_FSSCRUB_FREQ_BIWEEKLY,
+    BR_FSSCRUB_FREQ_MONTHLY,
+    BR_FSSCRUB_FREQ_MINUTE,
+    BR_FSSCRUB_FREQ_STALLED,
+} scrub_freq_t;
+
+#define signature_size(hl) (sizeof(br_isignature_t) + hl + 1)
+
+struct br_scanfs {
+    gf_lock_t entrylock;
+
+    pthread_mutex_t waitlock;
+    pthread_cond_t waitcond;
+
+    unsigned int entries;
+    struct list_head queued;
+    struct list_head ready;
+};
+
+/* just need three states to track child status */
+typedef enum br_child_state {
+    BR_CHILD_STATE_CONNECTED = 1,
+    BR_CHILD_STATE_INITIALIZING,
+    BR_CHILD_STATE_CONNFAILED,
+    BR_CHILD_STATE_DISCONNECTED,
+} br_child_state_t;
+
+struct br_child {
+    pthread_mutex_t lock;     /* protects child state */
+    char witnessed;           /* witnessed at least one successful
+                                 connection */
+    br_child_state_t c_state; /* current state of this child */
+
+    char child_up;             /* Indicates whether this child is
+                                  up or not */
+    xlator_t *xl;              /* client xlator corresponding to
+                                  this child */
+    inode_table_t *table;      /* inode table for this child */
+    char brick_path[PATH_MAX]; /* brick export directory of this
+                                  child */
+    struct list_head list;     /* hook to attach to the list of
+                                  UP children */
+    xlator_t *this;            /* Bit rot xlator */
+
+    pthread_t thread;  /* initial crawler for unsigned
+                          object(s) or scrub crawler */
+    int threadrunning; /* active thread */
+
+    struct mem_pool *timer_pool; /* timer-wheel's timer mem-pool */
+
+    struct timeval tv;
+
+    struct br_scanfs fsscan; /* per subvolume FS scanner */
+
+    gf_boolean_t active_scrubbing; /* Actively scrubbing or not */
+};
+
+typedef struct br_child br_child_t;
+
+struct br_obj_n_workers {
+    struct list_head objects; /* queue of objects expired from the
+                                 timer wheel and ready to be picked
+                                 up for signing */
+    pthread_t *workers;       /* Threads which pick up the objects
+                                 from the above queue and start
+                                 signing each object */
+};
+
+struct br_scrubber {
+    xlator_t *this;
+
+    scrub_throttle_t throttle;
+
+    /**
+     * frequency of scanning for this subvolume. this should
+     * normally be per-child, but since all children follow the
+     * same frequency for a volume, this option ends up here
+     * instead of br_child_t.
+     */
+    scrub_freq_t frequency;
+
+    gf_boolean_t frequency_reconf;
+    gf_boolean_t throttle_reconf;
+
+    pthread_mutex_t mutex;
+    pthread_cond_t cond;
+
+    unsigned int nr_scrubbers;
+    struct list_head scrubbers;
+
+    /**
+     * list of "rotatable" subvolume(s) undergoing scrubbing
+     */
+    struct list_head scrublist;
+};
+
+struct br_monitor {
+    gf_lock_t lock;
+    pthread_t thread; /* Monitor thread */
+
+    gf_boolean_t inited;
+    pthread_mutex_t mutex;
+    pthread_cond_t cond; /* Thread starts and will be waiting on cond.
+                            First child which is up wakes this up */
+
+    xlator_t *this;
+    /* scheduler */
+    uint32_t boot;
+
+    int32_t active_child_count; /* Number of children currently scrubbing */
+    gf_boolean_t kick;          /* This variable tracks the scrubber is
+                                 * kicked or not. Both 'kick' and
+                                 * 'active_child_count' uses the same pair
+                                 * of mutex-cond variable, i.e, wakelock and
+                                 * wakecond. */
+
+    pthread_mutex_t wakelock;
+    pthread_cond_t wakecond;
+
+    gf_boolean_t done;
+    pthread_mutex_t donelock;
+    pthread_cond_t donecond;
+
+    struct gf_tw_timer_list *timer;
+    br_scrub_state_t state; /* current scrub state */
+};
+
+typedef struct br_obj_n_workers br_obj_n_workers_t;
+
+typedef struct br_private br_private_t;
+
+typedef void (*br_scrubbed_file_update)(br_private_t *priv);
+
+struct br_private {
+    pthread_mutex_t lock;
+
+    struct list_head bricks; /* list of bricks from which enents
+                                have been received */
+
+    struct list_head signing;
+
+    pthread_cond_t object_cond; /* handling signing of objects */
+    int child_count;
+    br_child_t *children; /* list of subvolumes */
+    int up_children;
+
+    pthread_cond_t cond; /* handling CHILD_UP notifications */
+    pthread_t thread;    /* thread for connecting each UP
+                            child with changelog */
+
+    struct tvec_base *timer_wheel; /* timer wheel where the objects which
+                                      changelog has sent sits and waits
+                                      for expiry */
+    br_obj_n_workers_t *obj_queue; /* place holder for all the objects
+                                      that are expired from timer wheel
+                                      and ready to be picked up for
+                                      signing and the workers which sign
+                                      the objects */
+
+    uint32_t expiry_time; /* objects "wait" time */
+
+    uint32_t signer_th_count; /* Number of signing process threads */
+
+    tbf_t *tbf; /* token bucket filter */
+
+    gf_boolean_t iamscrubber; /* function as a fs scrubber */
+
+    struct br_scrub_stats scrub_stat; /* statistics of scrub*/
+
+    struct br_scrubber fsscrub; /* scrubbers for this subvolume */
+
+    struct br_monitor scrub_monitor; /* scrubber monitor */
+};
+
+struct br_object {
+    xlator_t *this;
+
+    uuid_t gfid;
+
+    unsigned long signedversion; /* version against which this object will
+                                    be signed */
+    br_child_t *child;           /* object's subvolume */
+
+    int sign_info;
+
+    struct list_head list; /* hook to add to the queue once the
+                              object is expired from timer wheel */
+    void *data;
+};
+
+typedef struct br_object br_object_t;
+typedef int32_t(br_scrub_ssm_call)(xlator_t *);
+
+void
+br_log_object(xlator_t *, char *, uuid_t, int32_t);
+
+void
+br_log_object_path(xlator_t *, char *, const char *, int32_t);
+
+int32_t
+br_calculate_obj_checksum(unsigned char *, br_child_t *, fd_t *, struct iatt *);
+
+int32_t
+br_prepare_loc(xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
+
+gf_boolean_t
+bitd_is_bad_file(xlator_t *, br_child_t *, loc_t *, fd_t *);
+
+static inline void
+_br_set_child_state(br_child_t *child, br_child_state_t state)
+{
+    child->c_state = state;
+}
+
+static inline int
+_br_is_child_connected(br_child_t *child)
+{
+    return (child->c_state == BR_CHILD_STATE_CONNECTED);
+}
+
+static inline int
+_br_is_child_scrub_active(br_child_t *child)
+{
+    return child->active_scrubbing;
+}
+
+static inline int
+_br_child_failed_conn(br_child_t *child)
+{
+    return (child->c_state == BR_CHILD_STATE_CONNFAILED);
+}
+
+static inline int
+_br_child_witnessed_connection(br_child_t *child)
+{
+    return (child->witnessed == 1);
+}
+
+/* scrub state */
+static inline void
+_br_monitor_set_scrub_state(struct br_monitor *scrub_monitor,
+                            br_scrub_state_t state)
+{
+    scrub_monitor->state = state;
+}
+
+static inline br_scrub_event_t
+_br_child_get_scrub_event(struct br_scrubber *fsscrub)
+{
+    return (fsscrub->frequency == BR_FSSCRUB_FREQ_STALLED)
+               ? BR_SCRUB_EVENT_PAUSE
+               : BR_SCRUB_EVENT_SCHEDULE;
+}
+
+int32_t
+br_get_bad_objects_list(xlator_t *this, dict_t **dict);
+
+#endif /* __BIT_ROT_H__ */