summaryrefslogtreecommitdiffstats
path: root/xlators/features
diff options
context:
space:
mode:
authorVenky Shankar <vshankar@redhat.com>2015-02-15 15:05:19 +0530
committerVijay Bellur <vbellur@redhat.com>2015-03-24 10:55:32 -0700
commit7927e8747c731dbb105e93ae66c336338f48f0e6 (patch)
treec7d4ce47ee90ef2483e1baf81327c3d2f2a545ea /xlators/features
parent31f841d6b35c242942b6bdcbfdc83cf548d5235a (diff)
features/bit-rot: Implementation of bit-rot xlator
This is the "Signer" -- responsible for signing files with their checksums upon last file descriptor close (last release()). The event notification facility provided by the changelog xlator is made use of. Moreover, checksums are as of now SHA256 hash of the object data and is the only available hash at this point of time. Therefore, there is no special "what hash to use" type check, although it's does not take much to add various hashing algorithms to sign objects with. Signatures are stored in extended attributes of the objects along with the the type of hashing used to calculate the signature. This makes thing future proof when other hash types are added. The signature infrastructure is provided by bitrot stub: a little piece of code that sits over the POSIX xlator providing interfaces to "get or set" objects signature and it's staleness. Since objects are signed upon receiving release() notification, pre-existing data which are "never" modified would never be signed. To counter this, an initial crawler thread is spawned The crawler scans the entire brick for objects that are unsigned or "missed" signing due to the server going offline (node reboots, crashes, etc..) and triggers an explicit sign. This would also sign objects when bit-rot is enabled for a volume and/or after upgrade. Change-Id: I1d9a98bee6cad1c39c35c53c8fb0fc4bad2bf67b BUG: 1170075 Original-Author: Raghavendra Bhat <raghavendra@redhat.com> Signed-off-by: Venky Shankar <vshankar@redhat.com> Reviewed-on: http://review.gluster.org/9711 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/features')
-rw-r--r--xlators/features/bit-rot/src/Makefile.am19
-rw-r--r--xlators/features/bit-rot/src/bit-rot-mem-types.h24
-rw-r--r--xlators/features/bit-rot/src/bit-rot.c89
-rw-r--r--xlators/features/bit-rot/src/bit-rot.h33
-rw-r--r--xlators/features/bit-rot/src/bitd/Makefile.am20
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c1351
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h126
-rw-r--r--xlators/features/bit-rot/src/stub/Makefile.am4
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h10
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am2
-rw-r--r--xlators/features/changelog/lib/src/changelog.h116
11 files changed, 1506 insertions, 288 deletions
diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am
index 1f59a71ebea..b5e4a7d62a0 100644
--- a/xlators/features/bit-rot/src/Makefile.am
+++ b/xlators/features/bit-rot/src/Makefile.am
@@ -1,18 +1 @@
-
-SUBDIRS = stub
-
-xlator_LTLIBRARIES = bit-rot.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-
-bit_rot_la_LDFLAGS = -module -avoid-version
-
-bit_rot_la_SOURCES = bit-rot.c
-bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = bit-rot.h bit-rot-mem-types.h
-
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
-
-AM_CFLAGS = -Wall $(GF_CFLAGS)
-
-CLEANFILES =
+SUBDIRS = stub bitd
diff --git a/xlators/features/bit-rot/src/bit-rot-mem-types.h b/xlators/features/bit-rot/src/bit-rot-mem-types.h
deleted file mode 100644
index 19c2aca0f8a..00000000000
--- a/xlators/features/bit-rot/src/bit-rot-mem-types.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _BR_MEM_TYPES_H
-#define _BR_MEM_TYPES_H
-
-#include "mem-types.h"
-
-enum br_mem_types {
- gf_br_mt_br_private_t = gf_common_mt_end + 1,
- gf_br_mt_br_local_t,
- gf_br_mt_br_inode_t,
- gf_br_mt_br_fd_t,
- gf_br_mt_end
-};
-
-#endif
diff --git a/xlators/features/bit-rot/src/bit-rot.c b/xlators/features/bit-rot/src/bit-rot.c
deleted file mode 100644
index 0ba8b80825b..00000000000
--- a/xlators/features/bit-rot/src/bit-rot.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include <ctype.h>
-#include <sys/uio.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-
-#include "bit-rot.h"
-#include "bit-rot-mem-types.h"
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int32_t ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_br_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
- " init failed");
- return ret;
- }
-
- return ret;
-}
-
-int32_t
-init (xlator_t *this)
-{
- br_private_t *priv = NULL;
- int32_t ret = -1;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "FATAL: no children");
- goto out;
- }
-
- priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t);
- if (!priv)
- goto out;
-
- this->private = priv;
-
- ret = 0;
-
-out:
- gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded");
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- br_private_t *priv = this->private;
-
- if (!priv)
- return;
- this->private = NULL;
- GF_FREE (priv);
-
- return;
-}
-
-struct xlator_fops fops;
-
-struct xlator_cbks cbks;
-
-struct volume_options options[] = {
- { .key = {NULL} },
-};
diff --git a/xlators/features/bit-rot/src/bit-rot.h b/xlators/features/bit-rot/src/bit-rot.h
deleted file mode 100644
index b275c0e9535..00000000000
--- a/xlators/features/bit-rot/src/bit-rot.h
+++ /dev/null
@@ -1,33 +0,0 @@
- /*
- Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-#ifndef __BIT_ROT_H__
-#define __BIT_ROT_H__
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "defaults.h"
-#include "bit-rot-mem-types.h"
-#include "syncop.h"
-
-struct br_private {
- xlator_t *xl;
- gf_lock_t lock;
-};
-
-typedef struct br_private br_private_t;
-
-#endif /* __BIR_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am
new file mode 100644
index 00000000000..d94a70dc97f
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/Makefile.am
@@ -0,0 +1,20 @@
+xlator_LTLIBRARIES = bit-rot.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+bit_rot_la_LDFLAGS = -module -avoid-version
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src/ \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(CONTRIBDIR)/timer-wheel \
+ -I$(top_srcdir)/xlators/features/bit-rot/src/stub
+
+bit_rot_la_SOURCES = bit-rot.c
+bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
+
+noinst_HEADERS = bit-rot.h
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
new file mode 100644
index 00000000000..6234dd83864
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -0,0 +1,1351 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <ctype.h>
+#include <sys/uio.h>
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "logging.h"
+#include "compat-errno.h"
+
+#include "bit-rot.h"
+#include <pthread.h>
+
+static int
+br_find_child_index (xlator_t *this, xlator_t *child)
+{
+ br_private_t *priv = NULL;
+ int i = -1;
+ int index = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (child == priv->children[i].xl) {
+ index = i;
+ break;
+ }
+ }
+
+out:
+ return index;
+}
+
+static void
+br_free_children (xlator_t *this)
+{
+ br_private_t *priv = NULL;
+ int32_t i = 0;
+ br_child_t *child = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ child = &priv->children[i];
+ mem_pool_destroy (child->timer_pool);
+ list_del_init (&priv->children[i].list);
+ }
+
+ GF_FREE (priv->children);
+
+ priv->children = NULL;
+}
+
+br_child_t *
+br_get_child_from_brick_path (xlator_t *this, char *brick_path)
+{
+ br_private_t *priv = NULL;
+ br_child_t *child = NULL;
+ br_child_t *tmp = NULL;
+ int i = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO (this->name, brick_path, out);
+
+ priv = this->private;
+
+ pthread_mutex_lock (&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ tmp = &priv->children[i];
+ if (!strcmp (tmp->brick_path, brick_path)) {
+ child = tmp;
+ break;
+ }
+ }
+ }
+ pthread_mutex_unlock (&priv->lock);
+
+out:
+ return child;
+}
+
+/**
+ * probably we'll encapsulate brick inside our own structure when
+ * needed -- later.
+ */
+void *
+br_brick_init (void *xl, struct gf_brick_spec *brick)
+{
+ return brick;
+}
+
+/**
+ * and cleanup things here when allocated br_brick_init().
+ */
+void
+br_brick_fini (void *xl, char *brick, void *data)
+{
+ return;
+}
+
+/**
+ * TODO: Signature can contain null terminators which causes bitrot
+ * stub to store truncated hash as it depends on string length of
+ * the hash.
+ *
+ * FIX: Send the string length as part of the signature struct and
+ * change stub to handle this change.
+ */
+static inline br_isignature_t *
+br_prepare_signature (const unsigned char *sign,
+ unsigned long hashlen,
+ int8_t hashtype, br_object_t *object)
+{
+ br_isignature_t *signature = NULL;
+
+ /* TODO: use mem-pool */
+ signature = GF_CALLOC (1, signature_size (hashlen + 1),
+ gf_br_stub_mt_signature_t);
+ if (!signature)
+ return NULL;
+
+ signature->signedversion = object->signedversion;
+ signature->signaturetype = hashtype;
+ memcpy (signature->signature, (char *)sign, hashlen);
+ signature->signature[hashlen+1] = '\0';
+
+ return signature;
+}
+
+/**
+ * Do a lookup on the gfid present within the object.
+ */
+static inline int32_t
+br_object_lookup (xlator_t *this, br_object_t *object,
+ struct iatt *iatt, inode_t **linked_inode)
+{
+ int ret = -EINVAL;
+ loc_t loc = {0, };
+ inode_t *inode = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, object, out);
+
+ inode = inode_find (object->child->table, object->gfid);
+
+ if (inode)
+ loc.inode = inode;
+ else
+ loc.inode = inode_new (object->child->table);
+
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ uuid_copy (loc.gfid, object->gfid);
+
+ ret = syncop_lookup (object->child->xl, &loc, NULL, iatt, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * The file might have been deleted by the application
+ * after getting the event, but before doing a lookup.
+ * So use linked_inode after inode_link is done.
+ */
+ *linked_inode = inode_link (loc.inode, NULL, NULL, iatt);
+ if (*linked_inode)
+ inode_lookup (*linked_inode);
+
+out:
+ loc_wipe (&loc);
+ return ret;
+}
+
+/**
+ * open the object with O_RDONLY flags and return the fd. How to let brick
+ * know that open is being done by bitd because syncop framework does not allow
+ * passing xdata -- may be use frame->root->pid itself.
+ */
+static inline int32_t
+br_object_open (xlator_t *this,
+ br_object_t *object, inode_t *inode, fd_t **openfd)
+{
+ int32_t ret = -1;
+ fd_t *fd = NULL;
+ loc_t loc = {0, };
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, object, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+
+ ret = -EINVAL;
+ fd = fd_create (inode, 0);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to create fd for the "
+ "inode %s", uuid_utoa (inode->gfid));
+ goto out;
+ }
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ ret = syncop_open (object->child->xl, &loc, O_RDONLY, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ *openfd = fd;
+ }
+
+ loc_wipe (&loc);
+
+out:
+ return ret;
+}
+
+/**
+ * read 128k block from the object @object from the offset @offset
+ * and return the buffer.
+ */
+static int32_t
+br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
+ off_t offset, size_t size, SHA256_CTX *sha256)
+{
+ int32_t ret = -1;
+ struct iovec *iovec = NULL;
+ struct iobref *iobref = NULL;
+ int count = 0;
+ int i = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd->inode, out);
+ GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+ ret = syncop_readv (child->xl, fd,
+ size, offset, 0, &iovec, &count, &iobref);
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "readv on %s failed (%s)",
+ uuid_utoa (fd->inode->gfid), strerror (errno));
+ ret = -1;
+ goto out;
+ }
+
+ if (ret == 0)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ SHA256_Update (sha256,
+ (const unsigned char *) (iovec[i].iov_base),
+ iovec[i].iov_len);
+ }
+
+ out:
+ if (iovec)
+ GF_FREE (iovec);
+
+ if (iobref)
+ iobref_unref (iobref);
+
+ return ret;
+}
+
+int32_t
+br_object_checksum (unsigned char *md,
+ br_object_t *object, fd_t *fd, struct iatt *iatt)
+{
+ int32_t ret = -1;
+ off_t offset = 0;
+ size_t block = 128 * 1024; /* 128K block size */
+ xlator_t *this = NULL;
+
+ SHA256_CTX sha256;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
+ GF_VALIDATE_OR_GOTO ("bit-rot", iatt, out);
+ GF_VALIDATE_OR_GOTO ("bit-rot", fd, out);
+
+ this = object->this;
+
+ SHA256_Init (&sha256);
+
+ while (1) {
+ ret = br_object_read_block_and_sign (this, fd, object->child,
+ offset, block, &sha256);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "reading block with "
+ "offset %lu of object %s failed", offset,
+ uuid_utoa (fd->inode->gfid));
+ break;
+ }
+
+ if (ret == 0)
+ break;
+
+ offset += ret;
+ }
+
+ if (ret == 0)
+ SHA256_Final (md, &sha256);
+
+ out:
+ return ret;
+}
+
+static inline int32_t
+br_object_read_sign (inode_t *linked_inode, fd_t *fd, br_object_t *object,
+ struct iatt *iatt)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ dict_t *xattr = NULL;
+ unsigned char *md = NULL;
+ br_isignature_t *sign = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
+ GF_VALIDATE_OR_GOTO ("bit-rot", linked_inode, out);
+ GF_VALIDATE_OR_GOTO ("bit-rot", fd, out);
+
+ this = object->this;
+
+ md = GF_CALLOC (SHA256_DIGEST_LENGTH, sizeof (*md), gf_common_mt_char);
+ if (!md) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate memory "
+ "for saving hash of the object %s",
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
+ ret = br_object_checksum (md, object, fd, iatt);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "calculating checksum for "
+ "the object %s failed", uuid_utoa (linked_inode->gfid));
+ goto free_signature;
+ }
+
+ sign = br_prepare_signature (md, SHA256_DIGEST_LENGTH,
+ BR_SIGNATURE_TYPE_SHA256, object);
+ if (!sign) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the signature "
+ "for the object %s", uuid_utoa (fd->inode->gfid));
+ goto free_signature;
+ }
+
+ xattr = dict_for_key_value
+ (GLUSTERFS_SET_OBJECT_SIGNATURE,
+ (void *)sign, signature_size (SHA256_DIGEST_LENGTH));
+
+ if (!xattr) {
+ gf_log (this->name, GF_LOG_ERROR, "dict allocation for signing"
+ " failed for the object %s",
+ uuid_utoa (fd->inode->gfid));
+ goto free_isign;
+ }
+
+ ret = syncop_fsetxattr (object->child->xl, fd, xattr, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "fsetxattr of signature to "
+ "the object %s failed", uuid_utoa (fd->inode->gfid));
+ goto unref_dict;
+ }
+
+ ret = 0;
+
+ unref_dict:
+ dict_unref (xattr);
+ free_isign:
+ GF_FREE (sign);
+ free_signature:
+ GF_FREE (md);
+ out:
+ return ret;
+}
+
+static inline int br_object_sign_softerror (int32_t op_errno)
+{
+ return ((op_errno == ENOENT) || (op_errno = ESTALE));
+}
+
+void
+br_log_object (xlator_t *this, char *op, uuid_t gfid, int32_t op_errno)
+{
+ int softerror = br_object_sign_softerror (op_errno);
+ gf_log (this->name, (softerror) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+ "%s() failed on object %s [reason: %s]",
+ op, uuid_utoa (gfid), strerror (op_errno));
+}
+
+void
+br_log_object_path (xlator_t *this, char *op,
+ const char *path, int32_t op_errno)
+{
+ int softerror = br_object_sign_softerror (op_errno);
+ gf_log (this->name, (softerror) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+ "%s() failed on object %s [reason: %s]",
+ op, path, strerror (op_errno));
+}
+
+/**
+ * Sign a given object. This routine runs full throttle. There needs to be
+ * some form of priority scheduling and/or read burstness to avoid starving
+ * (or kicking) client I/O's.
+ */
+static inline int32_t br_sign_object (br_object_t *object)
+{
+ int32_t ret = -1;
+ inode_t *linked_inode = NULL;
+ xlator_t *this = NULL;
+ fd_t *fd = NULL;
+ struct iatt iatt = {0, };
+ pid_t pid = GF_CLIENT_PID_BITD;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
+
+ this = object->this;
+
+ /**
+ * FIXME: This is required as signing an object is restricted to
+ * clients with special frame->root->pid. Change the way client
+ * pid is set.
+ */
+ syncopctx_setfspid (&pid);
+
+ ret = br_object_lookup (this, object, &iatt, &linked_inode);
+ if (ret) {
+ br_log_object (this, "lookup", object->gfid, -ret);
+ goto out;
+ }
+
+ ret = br_object_open (this, object, linked_inode, &fd);
+ if (!fd) {
+ br_log_object (this, "open", object->gfid, -ret);
+ goto unref_inode;
+ }
+
+ /**
+ * we have an open file descriptor on the object. from here on,
+ * do not be generous to file operation errors.
+ */
+
+ /* change this to DEBUG log level later */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Signing object [%s]", uuid_utoa (linked_inode->gfid));
+
+ ret = br_object_read_sign (linked_inode, fd, object, &iatt);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "reading and signing of the "
+ "object %s failed", uuid_utoa (linked_inode->gfid));
+ goto unref_fd;
+ }
+
+ ret = 0;
+
+ unref_fd:
+ fd_unref (fd);
+ unref_inode:
+ inode_unref (linked_inode);
+ out:
+ return ret;
+}
+
+static inline br_object_t *__br_pick_object (br_private_t *priv)
+{
+ br_object_t *object = NULL;
+
+ while (list_empty (&priv->obj_queue->objects)) {
+ pthread_cond_wait (&priv->object_cond, &priv->lock);
+ }
+
+ object = list_first_entry
+ (&priv->obj_queue->objects, br_object_t, list);
+ list_del_init (&object->list);
+
+ return object;
+}
+
+/**
+ * This is the place where the signing of the objects is triggered.
+ */
+void *
+br_process_object (void *arg)
+{
+ xlator_t *this = NULL;
+ br_object_t *object = NULL;
+ br_private_t *priv = NULL;
+ int32_t ret = -1;
+
+ this = arg;
+ priv = this->private;
+
+ THIS = this;
+
+ for (;;) {
+ pthread_mutex_lock (&priv->lock);
+ {
+ object = __br_pick_object (priv);
+ }
+ pthread_mutex_unlock (&priv->lock);
+
+ ret = br_sign_object (object);
+ if (ret && !br_object_sign_softerror (-ret))
+ gf_log (this->name, GF_LOG_ERROR,
+ "SIGNING FAILURE [%s]",
+ uuid_utoa (object->gfid));
+ GF_FREE (object);
+ }
+
+ return NULL;
+}
+
+/**
+ * This function gets kicked in once the object is expired from the
+ * timer wheel. This actually adds the object received via notification
+ * from the changelog to the queue from where the objects gets picked
+ * up for signing.
+ *
+ * This routine can be made lightweight by introducing an alternate
+ * timer-wheel API that dispatches _all_ expired objects in one-shot
+ * rather than an object at-a-time. This routine can then just simply
+ * be a call to list_splice_tail().
+ *
+ * NOTE: use call_time to instrument signing time in br_sign_object().
+ */
+void
+br_add_object_to_queue (struct gf_tw_timer_list *timer,
+ void *data, unsigned long call_time)
+{
+ br_object_t *object = NULL;
+ xlator_t *this = NULL;
+ br_private_t *priv = NULL;
+
+ object = data;
+ this = object->this;
+ priv = this->private;
+
+ pthread_mutex_lock (&priv->lock);
+ {
+ list_add_tail (&object->list, &priv->obj_queue->objects);
+ pthread_cond_broadcast (&priv->object_cond);
+ }
+ pthread_mutex_unlock (&priv->lock);
+
+ mem_put (timer);
+ return;
+}
+
+static inline br_object_t *
+br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev)
+{
+ br_object_t *object = NULL;
+
+ object = GF_CALLOC (1, sizeof (*object), gf_br_mt_br_object_t);
+ if (!object)
+ goto out;
+ INIT_LIST_HEAD (&object->list);
+
+ object->this = this;
+ object->child = child;
+ uuid_copy (object->gfid, ev->u.releasebr.gfid);
+
+ /* NOTE: it's BE, but no worry */
+ object->signedversion = ev->u.releasebr.version;
+
+out:
+ return object;
+}
+
+static inline struct gf_tw_timer_list *
+br_initialize_timer (xlator_t *this, br_object_t *object, br_child_t *child,
+ changelog_event_t *ev)
+{
+ br_private_t *priv = NULL;
+ struct gf_tw_timer_list *timer = NULL;
+
+ priv = this->private;
+
+ timer = mem_get0 (child->timer_pool);
+ if (!timer)
+ goto out;
+ INIT_LIST_HEAD (&timer->entry);
+
+ timer->data = object;
+ timer->expires = priv->expiry_time;
+ timer->function = br_add_object_to_queue;
+ gf_tw_add_timer (priv->timer_wheel, timer);
+
+out:
+ return timer;
+}
+
+/**
+ * This callback function registered with the changelog is executed
+ * whenever a notification from the changelog is received. This should
+ * add the object (or the gfid) on which the notification has come to
+ * the timer-wheel with some expiry time.
+ *
+ * TODO: use mem-pool for allocations and maybe allocate timer and
+ * object as a single alloc and bifurcate their respective pointers.
+ */
+void
+br_brick_callback (void *xl, char *brick,
+ void *data, changelog_event_t *ev)
+{
+ uuid_t gfid = {0,};
+ xlator_t *this = NULL;
+ br_object_t *object = NULL;
+ br_child_t *child = NULL;
+ int32_t flags = 0;
+ struct gf_tw_timer_list *timer = NULL;
+
+ this = xl;
+
+ GF_VALIDATE_OR_GOTO (this->name, ev, out);
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+
+ GF_ASSERT (ev->ev_type == CHANGELOG_OP_TYPE_BR_RELEASE);
+ GF_ASSERT (!uuid_is_null (ev->u.releasebr.gfid));
+
+ uuid_copy (gfid, ev->u.releasebr.gfid);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "RELEASE EVENT [GFID %s]", uuid_utoa (gfid));
+
+ flags = (int32_t)ntohl (ev->u.releasebr.flags);
+ if (flags == O_RDONLY) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Read only fd [GFID: %s], ignoring signing..",
+ uuid_utoa (gfid));
+ goto out;
+ }
+
+ child = br_get_child_from_brick_path (this, brick);
+ if (!child) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume "
+ "for the brick %s", brick);
+ goto out;
+ }
+
+ object = br_initialize_object (this, child, ev);
+ if (!object) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate "
+ "object memory [GFID: %s]", uuid_utoa (gfid));
+ goto out;
+ }
+
+ timer = br_initialize_timer (this, object, child, ev);
+ if (!timer) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate "
+ "object expiry timer [GFID: %s]", uuid_utoa (gfid));
+ goto free_object;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "->callback: brick [%s], type [%d]\n",
+ brick, ev->ev_type);
+
+ return;
+
+ free_object:
+ GF_FREE (object);
+out:
+ return;
+}
+
+void
+br_fill_brick_spec (struct gf_brick_spec *brick, char *path)
+{
+ brick->brick_path = gf_strdup (path);
+ brick->filter = CHANGELOG_OP_TYPE_BR_RELEASE;
+
+ brick->init = br_brick_init;
+ brick->fini = br_brick_fini;
+ brick->callback = br_brick_callback;
+ brick->connected = NULL;
+ brick->disconnected = NULL;
+}
+
+static inline gf_boolean_t
+br_time_equal (br_child_t *child, struct timeval *tv)
+{
+ if ((child->tv.tv_sec == tv->tv_sec) &&
+ (child->tv.tv_usec == tv->tv_usec))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+static inline gf_boolean_t
+br_check_object_need_sign (xlator_t *this, dict_t *xattr, br_child_t *child)
+{
+ int32_t ret = -1;
+ gf_boolean_t need_sign = _gf_false;
+ struct timeval tv = {0,};
+ br_isignature_out_t *sign = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, xattr, out);
+ GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+ ret = dict_get_ptr (xattr, GLUSTERFS_GET_OBJECT_SIGNATURE,
+ (void **)&sign);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get object signature info");
+ goto out;
+ }
+
+ tv.tv_sec = ntohl (sign->time[0]);
+ tv.tv_usec = ntohl (sign->time[1]);
+
+ /* Object has been opened and hence dirty. Do not sign it */
+ if (sign->stale && !br_time_equal (child, &tv))
+ need_sign = _gf_true;
+
+out:
+ return need_sign;
+}
+
+static inline void
+br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
+ loc_t *loc)
+{
+ fd_t *fd = NULL;
+ int32_t ret = -1;
+
+ fd = fd_create (linked_inode, 0);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create fd [GFID %s]",
+ uuid_utoa (linked_inode->gfid));
+ goto out;
+ }
+
+ ret = syncop_open (child->xl, loc, O_RDWR, fd);
+ if (ret) {
+ br_log_object (this, "open", linked_inode->gfid, -ret);
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ }
+
+ if (fd)
+ syncop_close (fd);
+
+out:
+ return;
+}
+
+int32_t
+br_prepare_loc (xlator_t *this, br_child_t *child, loc_t *parent,
+ gf_dirent_t *entry, loc_t *loc)
+{
+ int32_t ret = -1;
+ inode_t *inode = NULL;
+
+ inode = inode_grep (child->table, parent->inode, entry->d_name);
+ if (!inode)
+ loc->inode = inode_new (child->table);
+ else {
+ loc->inode = inode;
+ if (loc->inode->ia_type != IA_IFREG) {
+ gf_log (this->name, GF_LOG_DEBUG, "%s is not a regular "
+ "file", entry->d_name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ loc->parent = inode_ref (parent->inode);
+ uuid_copy (loc->pargfid, parent->inode->gfid);
+
+ ret = inode_path (parent->inode, entry->d_name, (char **)&loc->path);
+ if (ret < 0 || !loc->path) {
+ gf_log (this->name, GF_LOG_ERROR, "inode_path on %s "
+ "(parent: %s) failed", entry->d_name,
+ uuid_utoa (parent->inode->gfid));
+ goto out;
+ }
+
+ loc->name = strrchr (loc->path, '/');
+ if (loc->name)
+ loc->name++;
+
+ ret = 1;
+
+out:
+ return ret;
+}
+
+/**
+ * Oneshot crawler
+ * ---------------
+ * This is a catchup mechanism. Objects that remained unsigned from the
+ * last run for whatever reason (node crashes, reboots, etc..) become
+ * candidates for signing. This allows the signature to "catch up" with
+ * the current state of the object. Triggering signing is easy: perform
+ * an open() followed by a close() therby resulting in call boomerang.
+ * (though not back to itself :))
+ */
+int
+bitd_oneshot_crawl (xlator_t *subvol,
+ gf_dirent_t *entry, loc_t *parent, void *data)
+{
+ int op_errno = 0;
+ br_child_t *child = NULL;
+ xlator_t *this = NULL;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+ struct iatt parent_buf = {0, };
+ dict_t *xattr = NULL;
+ int32_t ret = -1;
+ inode_t *linked_inode = NULL;
+ gf_boolean_t need_signing = _gf_false;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", subvol, out);
+ GF_VALIDATE_OR_GOTO ("bit-rot", data, out);
+
+ child = data;
+ this = child->this;
+
+ ret = br_prepare_loc (this, child, parent, entry, &loc);
+ if (!ret)
+ goto out;
+
+ ret = syncop_lookup (child->xl, &loc, NULL, &iatt, NULL, &parent_buf);
+ if (ret) {
+ br_log_object_path (this, "lookup", loc.path, -ret);
+ goto out;
+ }
+
+ linked_inode = inode_link (loc.inode, parent->inode, loc.name, &iatt);
+ if (linked_inode)
+ inode_lookup (linked_inode);
+
+ if (iatt.ia_type != IA_IFREG) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "%s is not a regular file, skipping..", entry->d_name);
+ ret = 0;
+ goto unref_inode;
+ }
+
+ /**
+ * As of now, 2 cases are possible and handled.
+ * 1) GlusterFS is upgraded from a previous version which does not
+ * have any idea about bit-rot and have data in the filesystem.
+ * In this case syncop_getxattr fails with ENODATA and the object
+ * is signed. (In real, when crawler sends lookup, bit-rot-stub
+ * creates the xattrs before returning lookup reply)
+ * 2) Bit-rot was not enabled or BitD was dows for some reasons, during
+ * which some files were created, but since BitD was down, were not
+ * signed.
+ * If the file was just created and was being written some data when
+ * the down BitD came up, then bit-rot stub should be intelligent to
+ * identify this case (by comparing the ongoing version or by checking
+ * if there are any fds present for that inode) and handle properly.
+ */
+
+ ret = syncop_getxattr (child->xl, &loc, &xattr,
+ GLUSTERFS_GET_OBJECT_SIGNATURE, NULL);
+ if (ret < 0) {
+ op_errno = -ret;
+ br_log_object (this, "getxattr", linked_inode->gfid, op_errno);
+
+ if (op_errno == ENODATA)
+ need_signing = _gf_true;
+ if (op_errno == EINVAL)
+ gf_log (this->name, GF_LOG_WARNING, "Partial version "
+ "xattr presence detected, ignoring [GFID: %s]",
+ uuid_utoa (linked_inode->gfid));
+ } else {
+ need_signing = br_check_object_need_sign (this, xattr, child);
+ }
+
+ if (!need_signing)
+ goto unref_dict;
+
+ gf_log (this->name, GF_LOG_INFO,
+ "Triggering signing for %s [GFID: %s | Brick: %s]",
+ loc.path, uuid_utoa (linked_inode->gfid), child->brick_path);
+ br_trigger_sign (this, child, linked_inode, &loc);
+
+ ret = 0;
+
+ unref_dict:
+ if (xattr)
+ dict_unref (xattr);
+ unref_inode:
+ inode_unref (linked_inode);
+ out:
+ loc_wipe (&loc);
+
+ return ret;
+}
+
+#define BR_CRAWL_THROTTLE_COUNT 50
+#define BR_CRAWL_THROTTLE_ZZZ 5
+
+void *
+br_oneshot_signer (void *arg)
+{
+ loc_t loc = {0,};
+ xlator_t *this = NULL;
+ br_child_t *child = NULL;
+
+ child = arg;
+ this = child->this;
+
+ THIS = this;
+
+ gf_log (this->name, GF_LOG_INFO, "Crawling brick [%s], scanning "
+ "for unsigned objects", child->brick_path);
+
+ loc.inode = child->table->root;
+ (void) syncop_ftw_throttle
+ (child->xl, &loc,
+ GF_CLIENT_PID_BITD, child, bitd_oneshot_crawl,
+ BR_CRAWL_THROTTLE_COUNT, BR_CRAWL_THROTTLE_ZZZ);
+
+ gf_log (this->name, GF_LOG_INFO,
+ "Completed crawling brick [%s]", child->brick_path);
+
+ return NULL;
+}
+
+/**
+ * At this point a thread is spawned to crawl the filesystem (in
+ * tortoise pace) to sign objects that were not signed in previous run(s).
+ * Such objects are identified by examining it's dirtyness and timestamp.
+ *
+ * pick object:
+ * signature_is_stale() && (object_timestamp() <= stub_init_time())
+ *
+ * Also, we register to the changelog library to subscribe for event
+ * notifications.
+ */
+static inline int32_t
+br_enact_signer (xlator_t *this, br_child_t *child, br_stub_init_t *stub)
+{
+ int32_t ret = 0;
+ struct gf_brick_spec *brick = NULL;
+
+ brick = GF_CALLOC (1, sizeof (struct gf_brick_spec),
+ gf_common_mt_gf_brick_spec_t);
+ if (!brick)
+ goto error_return;
+
+ br_fill_brick_spec (brick, stub->export);
+ ret = gf_changelog_register_generic
+ (brick, 1, 1, this->ctx->cmd_args.log_file, -1, this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Register to changelog failed"
+ " [Reason: %s]", strerror (errno));
+ goto dealloc;
+ }
+
+ child->threadrunning = 0;
+ ret = gf_thread_create (&child->thread, NULL, br_oneshot_signer, child);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to spawn FS crawler thread");
+ else
+ child->threadrunning = 1;
+
+ /* it's OK to continue, "old" objects would be signed when modified */
+ return 0;
+
+ dealloc:
+ GF_FREE (brick);
+ error_return:
+ return -1;
+}
+
+/**
+ * This routine fetches various attributes associated with a child which
+ * is basically a subvolume. Attributes include brick path and the stub
+ * birth time. This is done by performing a lookup on the root followed
+ * by getxattr() on a virtual key.
+ */
+static inline int32_t
+br_brick_connect (xlator_t *this, br_child_t *child)
+{
+ int32_t ret = -1;
+ loc_t loc = {0, };
+ struct iatt buf = {0, };
+ struct iatt parent = {0, };
+ br_stub_init_t *stub = NULL;
+ dict_t *xattr = NULL;
+ int op_errno = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, child, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+
+ loc.inode = inode_ref (child->table->root);
+ uuid_copy (loc.gfid, loc.inode->gfid);
+ loc.path = gf_strdup ("/");
+
+ ret = syncop_lookup (child->xl, &loc, NULL, &buf, NULL, &parent);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "lookup on root failed "
+ "[Reason: %s]", strerror (op_errno));
+ goto wipeloc;
+ }
+
+ ret = syncop_getxattr (child->xl, &loc, &xattr,
+ GLUSTERFS_GET_BR_STUB_INIT_TIME, NULL);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "failed to get stub info "
+ "[Reason: %s]", strerror (op_errno));
+ goto wipeloc;
+ }
+
+ ret = dict_get_ptr (xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME,
+ (void **)&stub);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to extract stub information");
+ goto free_dict;
+ }
+
+ memcpy (child->brick_path, stub->export, strlen (stub->export) + 1);
+ child->tv.tv_sec = ntohl (stub->timebuf[0]);
+ child->tv.tv_usec = ntohl (stub->timebuf[0]);
+
+ ret = br_enact_signer (this, child, stub);
+
+ free_dict:
+ dict_unref (xattr);
+ wipeloc:
+ loc_wipe (&loc);
+ out:
+ return ret;
+}
+
+/**
+ * This function is executed in a separate thread. The thread gets the
+ * brick from where CHILD_UP has received from the queue and gets the
+ * information regarding that brick (such as brick path).
+ */
+void *
+br_handle_events (void *arg)
+{
+ xlator_t *this = NULL;
+ br_private_t *priv = NULL;
+ br_child_t *child = NULL;
+ int32_t ret = -1;
+
+ this = arg;
+ priv = this->private;
+
+ /*
+ * Since, this is the topmost xlator, THIS has to be set by bit-rot
+ * xlator itself (STACK_WIND wont help in this case). Also it has
+ * to be done for each thread that gets spawned. Otherwise, a new
+ * thread will get global_xlator's pointer when it does "THIS".
+ */
+ THIS = this;
+
+ while (1) {
+ pthread_mutex_lock (&priv->lock);
+ {
+ while (list_empty (&priv->bricks)) {
+ pthread_cond_wait (&priv->cond,
+ &priv->lock);
+ }
+
+ child = list_entry (priv->bricks.next, br_child_t,
+ list);
+ if (child && child->child_up) {
+ ret = br_brick_connect (this, child);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to connect to the "
+ "child (subvolume: %s)",
+ child->xl->name);
+ else
+ list_del_init (&child->list);
+ }
+
+ }
+ pthread_mutex_unlock (&priv->lock);
+ }
+
+ return NULL;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int32_t ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_br_stub_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
+ " init failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int
+notify (xlator_t *this, int32_t event, void *data, ...)
+{
+ xlator_t *subvol = NULL;
+ br_private_t *priv = NULL;
+ int idx = -1;
+ br_child_t *child = NULL;
+
+ subvol = (xlator_t *)data;
+ priv = this->private;
+
+ gf_log (this->name, GF_LOG_TRACE, "Notification received: %d",
+ event);
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ /* should this be done under lock? or is it ok to do it
+ without lock? */
+ idx = br_find_child_index (this, subvol);
+
+ pthread_mutex_lock (&priv->lock);
+ {
+ if (idx < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "got child "
+ "up from invalid subvolume");
+ } else {
+ child = &priv->children[idx];
+ if (child->child_up != 1)
+ child->child_up = 1;
+ if (!child->xl)
+ child->xl = subvol;
+ if (!child->table)
+ child->table = inode_table_new (4096,
+ subvol);
+ priv->up_children++;
+ list_add_tail (&child->list, &priv->bricks);
+ pthread_cond_signal (&priv->cond);
+ }
+ }
+ pthread_mutex_unlock (&priv->lock);
+ break;
+
+ case GF_EVENT_CHILD_MODIFIED:
+ idx = br_find_child_index (this, subvol);
+ if (idx < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "received child up "
+ "from invalid subvolume");
+ goto out;
+ }
+ priv = this->private;
+ /* ++(priv->generation); */
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ idx = br_find_child_index (this, subvol);
+ if (idx < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "received child down "
+ "from invalid subvolume");
+ goto out;
+ }
+
+ pthread_mutex_lock (&priv->lock);
+ {
+ if (priv->children[idx].child_up == 1) {
+ priv->children[idx].child_up = 0;
+ priv->up_children--;
+ }
+ }
+ pthread_mutex_unlock (&priv->lock);
+ break;
+ case GF_EVENT_PARENT_UP:
+ default_notify (this, GF_EVENT_PARENT_UP, data);
+ break;
+ }
+
+out:
+ return 0;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ int i = 0;
+ int32_t ret = -1;
+ br_private_t *priv = NULL;
+ xlator_list_t *trav = NULL;
+
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_ERROR, "FATAL: no children");
+ goto out;
+ }
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t);
+ if (!priv) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to allocate memory (->priv)");
+ goto out;
+ }
+
+ /* initialize gfchangelog xlator context */
+ ret = gf_changelog_init (this);
+ if (ret)
+ goto out;
+
+ GF_OPTION_INIT ("expiry-time", priv->expiry_time, int32, out);
+
+ priv->child_count = xlator_subvolume_count (this);
+ priv->children = GF_CALLOC (priv->child_count, sizeof (*priv->children),
+ gf_br_mt_br_child_t);
+ if (!priv->children)
+ goto out;
+
+ trav = this->children;
+ while (trav) {
+ priv->children[i].this = this;
+ priv->children[i].xl = trav->xlator;
+
+ priv->children[i].timer_pool =
+ mem_pool_new (struct gf_tw_timer_list, 4096);
+ if (!priv->children[i].timer_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to allocate mem-pool for timer");
+ errno = ENOMEM;
+ goto out;
+ }
+
+ i++;
+ trav = trav->next;
+ }
+
+ pthread_mutex_init (&priv->lock, NULL);
+ pthread_cond_init (&priv->cond, NULL);
+
+ for (i = 0; i < priv->child_count; i++)
+ INIT_LIST_HEAD (&priv->children[i].list);
+ INIT_LIST_HEAD (&priv->bricks);
+
+ this->private = priv;
+
+ ret = gf_thread_create (&priv->thread, NULL, br_handle_events,
+ this);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "thread creation failed (%s)", strerror (errno));
+ goto out;
+ }
+
+ priv->timer_wheel = gf_tw_init_timers ();
+ if (!priv->timer_wheel) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to initialize the "
+ "timer wheel");
+ goto out;
+ }
+
+ pthread_cond_init (&priv->object_cond, NULL);
+ priv->obj_queue = GF_CALLOC (1, sizeof (*priv->obj_queue),
+ gf_br_mt_br_ob_n_wk_t);
+ if (!priv->obj_queue) {
+ gf_log (this->name, GF_LOG_ERROR, "memory allocation failed");
+ goto out;
+ }
+
+ INIT_LIST_HEAD (&priv->obj_queue->objects);
+
+ for (i = 0; i < BR_WORKERS; i++) {
+ gf_thread_create (&priv->obj_queue->workers[i], NULL,
+ br_process_object, this);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "thread creation failed (%s)",
+ strerror (errno));
+ goto out;
+ }
+ }
+
+ ret = 0;
+
+out:
+ if (ret) {
+ if (priv->children)
+ GF_FREE (priv->children);
+ if (priv->timer_wheel)
+ gf_tw_cleanup_timers (priv->timer_wheel);
+ GF_FREE (priv);
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded");
+ return ret;
+}
+
+void
+fini (xlator_t *this)
+{
+ br_private_t *priv = this->private;
+
+ if (!priv)
+ return;
+
+ br_free_children (this);
+ if (priv->timer_wheel)
+ gf_tw_cleanup_timers (priv->timer_wheel);
+ this->private = NULL;
+ GF_FREE (priv);
+
+ return;
+}
+
+struct xlator_fops fops;
+
+struct xlator_cbks cbks;
+
+struct volume_options options[] = {
+ { .key = {"expiry-time"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "120",
+ .description = "default time duration for which an object waits "
+ "before it is signed",
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
new file mode 100644
index 00000000000..ab9fd806232
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -0,0 +1,126 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __BIT_ROT_H__
+#define __BIT_ROT_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "defaults.h"
+#include "syncop.h"
+#include "syncop-utils.h"
+#include "changelog.h"
+#include "timer-wheel.h"
+
+#include "bit-rot-common.h"
+#include "bit-rot-stub-mem-types.h"
+
+#include <openssl/sha.h>
+
+/* TODO: make this configurable */
+#define BR_WORKERS 8
+
+#define signature_size(hl) (sizeof (br_isignature_t) + hl + 1)
+
+struct br_child {
+ char child_up; /* Indicates whether this child is
+ up or not */
+ xlator_t *xl; /* client xlator corresponding to
+ this child */
+ inode_table_t *table; /* inode table for this child */
+ char brick_path[PATH_MAX]; /* brick export directory of this
+ child */
+ struct list_head list; /* hook to attach to the list of
+ UP children */
+ xlator_t *this; /* Bit rot xlator */
+
+ pthread_t thread; /* initial crawler for unsigned
+ object(s) */
+ int threadrunning; /* active thread */
+
+ struct mem_pool *timer_pool; /* timer-wheel's timer mem-pool */
+
+ struct timeval tv;
+};
+
+typedef struct br_child br_child_t;
+
+struct br_obj_n_workers {
+ struct list_head objects; /* queue of objects expired from the
+ timer wheel and ready to be picked
+ up for signing */
+ pthread_t workers[BR_WORKERS]; /* Threads which pick up the objects
+ from the above queue and start
+ signing each object */
+};
+
+typedef struct br_obj_n_workers br_obj_n_workers_t;
+
+struct br_private {
+ pthread_mutex_t lock;
+
+ struct list_head bricks; /* list of bricks from which CHILD_UP
+ has been received */
+
+ pthread_cond_t cond; /* handling CHILD_UP notifications */
+ pthread_cond_t object_cond; /* handling signing of objects */
+ int child_count;
+ br_child_t *children; /* list of subvolumes */
+ int up_children;
+ pthread_t thread; /* thread for connecting each UP
+ child with changelog */
+ struct tvec_base *timer_wheel; /* timer wheel where the objects which
+ changelog has sent sits and waits
+ for expiry */
+ br_obj_n_workers_t *obj_queue; /* place holder for all the objects
+ that are expired from timer wheel
+ and ready to be picked up for
+ signing and the workers which sign
+ the objects */
+ int32_t expiry_time; /* objects "wait" time */
+};
+
+typedef struct br_private br_private_t;
+
+struct br_object {
+ xlator_t *this;
+
+ uuid_t gfid;
+
+ unsigned long signedversion; /* version aginst which this object will
+ be signed */
+ br_child_t *child; /* object's subvolume */
+
+ struct list_head list; /* hook to add to the queue once the
+ object is expired from timer wheel */
+ void *data;
+};
+
+typedef struct br_object br_object_t;
+
+void
+br_log_object (xlator_t *, char *, uuid_t, int32_t);
+
+void
+br_log_object_path (xlator_t *, char *, const char *, int32_t);
+
+int32_t
+br_object_checksum (unsigned char *, br_object_t *, fd_t *, struct iatt *);
+
+int32_t
+br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
+
+#endif /* __BIT_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am
index 9abcbb76db2..ec6b1ef4506 100644
--- a/xlators/features/bit-rot/src/stub/Makefile.am
+++ b/xlators/features/bit-rot/src/stub/Makefile.am
@@ -8,9 +8,7 @@ bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/xlators/features/changelog/lib/src
-
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index 64779923fd6..492278639b4 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -15,9 +15,13 @@
enum br_mem_types {
gf_br_stub_mt_private_t = gf_common_mt_end + 1,
- gf_br_stub_mt_version_t = gf_common_mt_end + 2,
- gf_br_stub_mt_inode_ctx_t = gf_common_mt_end + 3,
- gf_br_stub_mt_signature_t = gf_common_mt_end + 4,
+ gf_br_stub_mt_version_t,
+ gf_br_stub_mt_inode_ctx_t,
+ gf_br_stub_mt_signature_t,
+ gf_br_mt_br_private_t,
+ gf_br_mt_br_child_t,
+ gf_br_mt_br_object_t,
+ gf_br_mt_br_ob_n_wk_t,
gf_br_stub_mt_end
};
diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am
index 306306bd585..456e211b89d 100644
--- a/xlators/features/changelog/lib/src/Makefile.am
+++ b/xlators/features/changelog/lib/src/Makefile.am
@@ -32,8 +32,6 @@ noinst_HEADERS = gf-changelog-helpers.h gf-changelog-rpc.h gf-changelog-journal.
$(CONTRIBDIR)/uuid/uuidd.h $(CONTRIBDIR)/uuid/uuid.h \
$(CONTRIBDIR)/uuid/uuidP.h $(CONTRIB_BUILDDIR)/uuid/uuid_types.h
-libgfchangelog_HEADERS = changelog.h
-
CLEANFILES =
CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h
diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h
deleted file mode 100644
index 08307810704..00000000000
--- a/xlators/features/changelog/lib/src/changelog.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _GF_CHANGELOG_H
-#define _GF_CHANGELOG_H
-
-struct gf_brick_spec;
-
-/**
- * Max bit shiter for event selection
- */
-#define CHANGELOG_EV_SELECTION_RANGE 5
-
-#define CHANGELOG_OP_TYPE_JOURNAL (1<<0)
-#define CHANGELOG_OP_TYPE_OPEN (1<<1)
-#define CHANGELOG_OP_TYPE_CREATE (1<<2)
-#define CHANGELOG_OP_TYPE_RELEASE (1<<3)
-#define CHANGELOG_OP_TYPE_BR_RELEASE (1<<4) /* logical release (last close()),
- sent by bitrot stub */
-#define CHANGELOG_OP_TYPE_MAX (1<<CHANGELOG_EV_SELECTION_RANGE)
-
-
-struct ev_open {
- unsigned char gfid[16];
- int32_t flags;
-};
-
-struct ev_creat {
- unsigned char gfid[16];
- int32_t flags;
-};
-
-struct ev_release {
- unsigned char gfid[16];
-};
-
-struct ev_release_br {
- int32_t flags;
- unsigned long version;
- unsigned char gfid[16];
-};
-
-struct ev_changelog {
- char path[PATH_MAX];
-};
-
-typedef struct changelog_event {
- unsigned int ev_type;
-
- union {
- struct ev_open open;
- struct ev_creat create;
- struct ev_release release;
- struct ev_changelog journal;
- struct ev_release_br releasebr;
- } u;
-} changelog_event_t;
-
-#define CHANGELOG_EV_SIZE (sizeof (changelog_event_t))
-
-/**
- * event callback, connected & disconnection defs
- */
-typedef void (CALLBACK) (void *, char *,
- void *, changelog_event_t *);
-typedef void *(INIT) (void *, struct gf_brick_spec *);
-typedef void (FINI) (void *, char *, void *);
-typedef void (CONNECT) (void *, char *, void *);
-typedef void (DISCONNECT) (void *, char *, void *);
-
-struct gf_brick_spec {
- char *brick_path;
- unsigned int filter;
-
- INIT *init;
- FINI *fini;
- CALLBACK *callback;
- CONNECT *connected;
- DISCONNECT *disconnected;
-
- void *ptr;
-};
-
-/* API set */
-
-int
-gf_changelog_register (char *brick_path, char *scratch_dir,
- char *log_file, int log_levl, int max_reconnects);
-ssize_t
-gf_changelog_scan ();
-
-int
-gf_changelog_start_fresh ();
-
-ssize_t
-gf_changelog_next_change (char *bufptr, size_t maxlen);
-
-int
-gf_changelog_done (char *file);
-
-/* newer flexible API */
-int
-gf_changelog_init (void *xl);
-
-int
-gf_changelog_register_generic (struct gf_brick_spec *bricks, int count,
- int ordered, char *logfile, int lvl, void *xl);
-
-#endif