summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVenky Shankar <vshankar@redhat.com>2015-02-15 15:05:19 +0530
committerVijay Bellur <vbellur@redhat.com>2015-03-24 10:55:32 -0700
commit7927e8747c731dbb105e93ae66c336338f48f0e6 (patch)
treec7d4ce47ee90ef2483e1baf81327c3d2f2a545ea
parent31f841d6b35c242942b6bdcbfdc83cf548d5235a (diff)
features/bit-rot: Implementation of bit-rot xlator
This is the "Signer" -- responsible for signing files with their checksums upon last file descriptor close (last release()). The event notification facility provided by the changelog xlator is made use of. Moreover, checksums are, as of now, the SHA256 hash of the object data, which is the only available hash at this point in time. Therefore, there is no special "what hash to use" type check, although it does not take much to add various hashing algorithms to sign objects with. Signatures are stored in extended attributes of the objects along with the type of hashing used to calculate the signature. This makes things future-proof when other hash types are added. The signature infrastructure is provided by the bitrot stub: a little piece of code that sits over the POSIX xlator providing interfaces to "get or set" an object's signature and its staleness. Since objects are signed upon receiving release() notification, pre-existing data which are "never" modified would never be signed. To counter this, an initial crawler thread is spawned. The crawler scans the entire brick for objects that are unsigned or "missed" signing due to the server going offline (node reboots, crashes, etc.) and triggers an explicit sign. This would also sign objects when bit-rot is enabled for a volume and/or after upgrade. Change-Id: I1d9a98bee6cad1c39c35c53c8fb0fc4bad2bf67b BUG: 1170075 Original-Author: Raghavendra Bhat <raghavendra@redhat.com> Signed-off-by: Venky Shankar <vshankar@redhat.com> Reviewed-on: http://review.gluster.org/9711 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r--api/src/glfs-fops.c21
-rw-r--r--configure.ac1
-rw-r--r--libglusterfs/src/Makefile.am3
-rw-r--r--libglusterfs/src/changelog.h (renamed from xlators/features/changelog/lib/src/changelog.h)0
-rw-r--r--libglusterfs/src/common-utils.c23
-rw-r--r--libglusterfs/src/common-utils.h4
-rw-r--r--libglusterfs/src/dict.c19
-rw-r--r--libglusterfs/src/dict.h4
-rw-r--r--libglusterfs/src/mem-types.h2
-rw-r--r--libglusterfs/src/syncop-utils.c86
-rw-r--r--libglusterfs/src/syncop-utils.h6
-rw-r--r--libglusterfs/src/xlator.c10
-rw-r--r--libglusterfs/src/xlator.h3
-rw-r--r--xlators/cluster/afr/src/afr-common.c12
-rw-r--r--xlators/features/bit-rot/src/Makefile.am19
-rw-r--r--xlators/features/bit-rot/src/bit-rot-mem-types.h24
-rw-r--r--xlators/features/bit-rot/src/bit-rot.c89
-rw-r--r--xlators/features/bit-rot/src/bit-rot.h33
-rw-r--r--xlators/features/bit-rot/src/bitd/Makefile.am20
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c1351
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h126
-rw-r--r--xlators/features/bit-rot/src/stub/Makefile.am4
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h10
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am2
24 files changed, 1667 insertions, 205 deletions
diff --git a/api/src/glfs-fops.c b/api/src/glfs-fops.c
index f0c769def29..94b31ef076a 100644
--- a/api/src/glfs-fops.c
+++ b/api/src/glfs-fops.c
@@ -2839,27 +2839,6 @@ out:
GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_flistxattr, 3.4.0);
-
-dict_t *
-dict_for_key_value (const char *name, const char *value, size_t size)
-{
- dict_t *xattr = NULL;
- int ret = 0;
-
- xattr = dict_new ();
- if (!xattr)
- return NULL;
-
- ret = dict_set_static_bin (xattr, (char *)name, (void *)value, size);
- if (ret) {
- dict_destroy (xattr);
- xattr = NULL;
- }
-
- return xattr;
-}
-
-
int
glfs_setxattr_common (struct glfs *fs, const char *path, const char *name,
const void *value, size_t size, int flags, int follow)
diff --git a/configure.ac b/configure.ac
index ee89ce99167..89ea35ce6f1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -165,6 +165,7 @@ AC_CONFIG_FILES([Makefile
xlators/features/bit-rot/Makefile
xlators/features/bit-rot/src/Makefile
xlators/features/bit-rot/src/stub/Makefile
+ xlators/features/bit-rot/src/bitd/Makefile
xlators/playground/Makefile
xlators/playground/template/Makefile
xlators/playground/template/src/Makefile
diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am
index 818de91cf36..33de0a287c7 100644
--- a/libglusterfs/src/Makefile.am
+++ b/libglusterfs/src/Makefile.am
@@ -11,6 +11,7 @@ libglusterfs_la_LIBADD = @LEXLIB@ $(ZLIB_LIBS) $(MATH_LIB)
libglusterfs_la_LDFLAGS = -version-info $(LIBGLUSTERFS_LT_VERSION)
lib_LTLIBRARIES = libglusterfs.la
+libgfchangelogdir = $(includedir)/glusterfs/gfchangelog
CONTRIB_BUILDDIR = $(top_builddir)/contrib
@@ -53,6 +54,8 @@ noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h timespec.
unittest/unittest.h quota-common-utils.h rot-buffs.h \
$(CONTRIBDIR)/timer-wheel/timer-wheel.h
+libgfchangelog_HEADERS = changelog.h
+
EXTRA_DIST = graph.l graph.y
graph.lex.c: graph.l y.tab.h
diff --git a/xlators/features/changelog/lib/src/changelog.h b/libglusterfs/src/changelog.h
index 08307810704..08307810704 100644
--- a/xlators/features/changelog/lib/src/changelog.h
+++ b/libglusterfs/src/changelog.h
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 751dc8a2e50..1adfdaa1673 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -3245,6 +3245,29 @@ gf_set_log_ident (cmd_args_t *cmd_args)
}
+/**
+ * Cancel @thread and wait for it to terminate.
+ *
+ * Returns 0 only when the thread was cancelled, joined, and its exit
+ * status is PTHREAD_CANCELED. A failure of pthread_cancel() or
+ * pthread_join() is returned as that call's error number. A thread that
+ * was joined but had already exited on its own (exit status other than
+ * PTHREAD_CANCELED) is reported as -1 instead of being mistaken for a
+ * successful cancellation.
+ */
int
+gf_thread_cleanup_xint (pthread_t thread)
+{
+        int   ret = 0;
+        void *res = NULL;
+
+        ret = pthread_cancel (thread);
+        if (ret != 0)
+                goto error_return;
+
+        ret = pthread_join (thread, &res);
+        if (ret != 0)
+                goto error_return;
+
+        /* joined fine, but the thread was not terminated by our cancel */
+        if (res != PTHREAD_CANCELED)
+                ret = -1;
+
+ error_return:
+        return ret;
+}
+
+int
gf_thread_create (pthread_t *thread, const pthread_attr_t *attr,
void *(*start_routine)(void *), void *arg)
{
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
index c1deeef3c9d..6ac1442b0bf 100644
--- a/libglusterfs/src/common-utils.h
+++ b/libglusterfs/src/common-utils.h
@@ -707,4 +707,8 @@ gf_get_index_by_elem (char **array, char *elem);
int
glusterfs_is_local_pathinfo (char *pathinfo, gf_boolean_t *local);
+
+int
+gf_thread_cleanup_xint (pthread_t thread);
+
#endif /* _COMMON_UTILS_H */
diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c
index 81db64dfd40..b8b6aeab248 100644
--- a/libglusterfs/src/dict.c
+++ b/libglusterfs/src/dict.c
@@ -2926,3 +2926,22 @@ dict_dump_to_statedump (dict_t *dict, char *dict_name, char *domain)
return;
}
+
+/**
+ * Create a new dict holding the single pair @name -> @value (@size bytes),
+ * stored with dict_set_static_bin().
+ *
+ * Returns the new dict, or NULL when the dict could not be created or the
+ * pair could not be set (the dict is destroyed in that case).
+ */
+dict_t *
+dict_for_key_value (const char *name, const char *value, size_t size)
+{
+        dict_t *dict = dict_new ();
+
+        if (dict == NULL)
+                return NULL;
+
+        if (dict_set_static_bin (dict, (char *)name,
+                                 (void *)value, size) != 0) {
+                dict_destroy (dict);
+                return NULL;
+        }
+
+        return dict;
+}
diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h
index a1a4c85f711..3708eede06d 100644
--- a/libglusterfs/src/dict.h
+++ b/libglusterfs/src/dict.h
@@ -260,4 +260,8 @@ int
dict_dump_to_str (dict_t *dict, char *dump, int dumpsize, char *format);
gf_boolean_t
dict_match_everything (dict_t *d, char *k, data_t *v, void *data);
+
+dict_t *
+dict_for_key_value (const char *name, const char *value, size_t size);
+
#endif
diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h
index a24e5731114..fc06d52239b 100644
--- a/libglusterfs/src/mem-types.h
+++ b/libglusterfs/src/mem-types.h
@@ -148,6 +148,8 @@ enum gf_common_mem_types_ {
/* glusterd can load the nfs-xlator dynamically and needs these two */
gf_common_mt_nfs_netgroups = 130,
gf_common_mt_nfs_exports = 131,
+ gf_common_mt_gf_brick_spec_t = 132,
+ gf_common_mt_gf_timer_entry_t = 133,
gf_common_mt_end
};
#endif
diff --git a/libglusterfs/src/syncop-utils.c b/libglusterfs/src/syncop-utils.c
index 53768acd0ac..2fc95fa3e70 100644
--- a/libglusterfs/src/syncop-utils.c
+++ b/libglusterfs/src/syncop-utils.c
@@ -133,6 +133,92 @@ out:
return ret;
}
+/**
+ * Syncop_ftw_throttle can be used in a configurable way to control
+ * the speed at which crawling is done. It takes 2 more arguments
+ * compared to syncop_ftw.
+ * After every @count entries handled in a directory ("." and ".." are
+ * not counted) sleep for @sleep_time seconds.
+ * If either @count or @sleep_time is <=0, then it behaves similar to
+ * syncop_ftw.
+ *
+ * @fn is called for each entry with @loc as the parent; directories for
+ * which @fn returned 0 are descended into recursively with the same
+ * throttle settings. A non-zero status from @fn or from the recursion
+ * moves on to the next entry. Reading of the directory stops when the
+ * last entry of a readdirp batch left a non-zero status, when readdirp
+ * fails, or when it returns no more entries.
+ *
+ * Returns the last status recorded by the walk.
+ */
+int
+syncop_ftw_throttle (xlator_t *subvol, loc_t *loc, int pid, void *data,
+                     int (*fn) (xlator_t *subvol, gf_dirent_t *entry,
+                                loc_t *parent, void *data),
+                     int count, int sleep_time)
+{
+        loc_t        child_loc = {0, };
+        fd_t        *fd        = NULL;
+        uint64_t     offset    = 0;
+        gf_dirent_t *entry     = NULL;
+        int          ret       = 0;
+        gf_dirent_t  entries;
+        int          tmp       = 0;
+
+        /* nothing to throttle with: fall back to the plain walker */
+        if ((count <= 0) || (sleep_time <= 0)) {
+                ret = syncop_ftw (subvol, loc, pid, data, fn);
+                goto out;
+        }
+
+        ret = syncop_dirfd (subvol, loc, &fd, pid);
+        if (ret)
+                goto out;
+
+        INIT_LIST_HEAD (&entries.list);
+
+        while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0,
+                                       &entries))) {
+                if (ret < 0)
+                        break;
+
+                if (ret > 0) {
+                        /* If the entries are only '.', and '..' then ret
+                         * value will be non-zero. so set it to zero here. */
+                        ret = 0;
+                }
+
+                list_for_each_entry (entry, &entries.list, list) {
+                        offset = entry->d_off;
+
+                        if (!strcmp (entry->d_name, ".") ||
+                            !strcmp (entry->d_name, ".."))
+                                continue;
+
+                        /* pause once per @count entries; the counter spans
+                         * readdirp batches and restarts after each pause */
+                        if (++tmp >= count) {
+                                tmp = 0;
+                                sleep (sleep_time);
+                        }
+
+                        gf_link_inode_from_dirent (NULL, fd->inode, entry);
+
+                        ret = fn (subvol, entry, loc, data);
+                        if (ret)
+                                continue;
+
+                        if (entry->d_stat.ia_type == IA_IFDIR) {
+                                child_loc.inode = inode_ref (entry->inode);
+                                uuid_copy (child_loc.gfid, entry->inode->gfid);
+                                ret = syncop_ftw_throttle (subvol, &child_loc,
+                                                           pid, data, fn, count,
+                                                           sleep_time);
+                                loc_wipe (&child_loc);
+                                if (ret)
+                                        continue;
+                        }
+                }
+
+                gf_dirent_free (&entries);
+                if (ret)
+                        break;
+        }
+
+out:
+        if (fd)
+                fd_unref (fd);
+        return ret;
+}
+
int
syncop_dir_scan (xlator_t *subvol, loc_t *loc, int pid, void *data,
int (*fn) (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
diff --git a/libglusterfs/src/syncop-utils.h b/libglusterfs/src/syncop-utils.h
index 918b3b7c666..7a9ccacb285 100644
--- a/libglusterfs/src/syncop-utils.h
+++ b/libglusterfs/src/syncop-utils.h
@@ -30,4 +30,10 @@ syncop_is_subvol_local (xlator_t *this, loc_t *loc, gf_boolean_t *is_local);
int
syncop_gfid_to_path (inode_table_t *itable, xlator_t *subvol, uuid_t gfid,
char **path_p);
+
+int
+syncop_ftw_throttle (xlator_t *subvol, loc_t *loc, int pid, void *data,
+ int (*fn) (xlator_t *subvol, gf_dirent_t *entry,
+ loc_t *parent, void *data),
+ int count, int sleep_time);
#endif /* _SYNCOP_H */
diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c
index cc4726e0ea5..00f411e275b 100644
--- a/libglusterfs/src/xlator.c
+++ b/libglusterfs/src/xlator.c
@@ -1024,3 +1024,13 @@ glusterd_check_log_level (const char *value)
return log_level;
}
+/**
+ * Count the entries on the children list of @this.
+ */
+int
+xlator_subvolume_count (xlator_t *this)
+{
+        xlator_list_t *child = this->children;
+        int            count = 0;
+
+        while (child != NULL) {
+                count++;
+                child = child->next;
+        }
+
+        return count;
+}
diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h
index 5a0b114d6a8..9bea950d720 100644
--- a/libglusterfs/src/xlator.h
+++ b/libglusterfs/src/xlator.h
@@ -989,4 +989,7 @@ glusterfs_leaf_position(xlator_t *tgt);
int
glusterfs_reachable_leaves(xlator_t *base, dict_t *leaves);
+int
+xlator_subvolume_count (xlator_t *this);
+
#endif /* _XLATOR_H */
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 0af46993a34..6c06fd9b7b5 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4164,18 +4164,6 @@ out:
return;
}
-int
-xlator_subvolume_count (xlator_t *this)
-{
- int i = 0;
- xlator_list_t *list = NULL;
-
- for (list = this->children; list; list = list->next)
- i++;
- return i;
-}
-
-
void
afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
{
diff --git a/xlators/features/bit-rot/src/Makefile.am b/xlators/features/bit-rot/src/Makefile.am
index 1f59a71ebea..b5e4a7d62a0 100644
--- a/xlators/features/bit-rot/src/Makefile.am
+++ b/xlators/features/bit-rot/src/Makefile.am
@@ -1,18 +1 @@
-
-SUBDIRS = stub
-
-xlator_LTLIBRARIES = bit-rot.la
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-
-bit_rot_la_LDFLAGS = -module -avoid-version
-
-bit_rot_la_SOURCES = bit-rot.c
-bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = bit-rot.h bit-rot-mem-types.h
-
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
-
-AM_CFLAGS = -Wall $(GF_CFLAGS)
-
-CLEANFILES =
+SUBDIRS = stub bitd
diff --git a/xlators/features/bit-rot/src/bit-rot-mem-types.h b/xlators/features/bit-rot/src/bit-rot-mem-types.h
deleted file mode 100644
index 19c2aca0f8a..00000000000
--- a/xlators/features/bit-rot/src/bit-rot-mem-types.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _BR_MEM_TYPES_H
-#define _BR_MEM_TYPES_H
-
-#include "mem-types.h"
-
-enum br_mem_types {
- gf_br_mt_br_private_t = gf_common_mt_end + 1,
- gf_br_mt_br_local_t,
- gf_br_mt_br_inode_t,
- gf_br_mt_br_fd_t,
- gf_br_mt_end
-};
-
-#endif
diff --git a/xlators/features/bit-rot/src/bit-rot.c b/xlators/features/bit-rot/src/bit-rot.c
deleted file mode 100644
index 0ba8b80825b..00000000000
--- a/xlators/features/bit-rot/src/bit-rot.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include <ctype.h>
-#include <sys/uio.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-
-#include "bit-rot.h"
-#include "bit-rot-mem-types.h"
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int32_t ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_br_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
- " init failed");
- return ret;
- }
-
- return ret;
-}
-
-int32_t
-init (xlator_t *this)
-{
- br_private_t *priv = NULL;
- int32_t ret = -1;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_ERROR,
- "FATAL: no children");
- goto out;
- }
-
- priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t);
- if (!priv)
- goto out;
-
- this->private = priv;
-
- ret = 0;
-
-out:
- gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded");
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- br_private_t *priv = this->private;
-
- if (!priv)
- return;
- this->private = NULL;
- GF_FREE (priv);
-
- return;
-}
-
-struct xlator_fops fops;
-
-struct xlator_cbks cbks;
-
-struct volume_options options[] = {
- { .key = {NULL} },
-};
diff --git a/xlators/features/bit-rot/src/bit-rot.h b/xlators/features/bit-rot/src/bit-rot.h
deleted file mode 100644
index b275c0e9535..00000000000
--- a/xlators/features/bit-rot/src/bit-rot.h
+++ /dev/null
@@ -1,33 +0,0 @@
- /*
- Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-#ifndef __BIT_ROT_H__
-#define __BIT_ROT_H__
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "logging.h"
-#include "dict.h"
-#include "xlator.h"
-#include "defaults.h"
-#include "bit-rot-mem-types.h"
-#include "syncop.h"
-
-struct br_private {
- xlator_t *xl;
- gf_lock_t lock;
-};
-
-typedef struct br_private br_private_t;
-
-#endif /* __BIR_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/bitd/Makefile.am b/xlators/features/bit-rot/src/bitd/Makefile.am
new file mode 100644
index 00000000000..d94a70dc97f
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/Makefile.am
@@ -0,0 +1,20 @@
+xlator_LTLIBRARIES = bit-rot.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+bit_rot_la_LDFLAGS = -module -avoid-version
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src/ \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(CONTRIBDIR)/timer-wheel \
+ -I$(top_srcdir)/xlators/features/bit-rot/src/stub
+
+bit_rot_la_SOURCES = bit-rot.c
+bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
+
+noinst_HEADERS = bit-rot.h
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
new file mode 100644
index 00000000000..6234dd83864
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -0,0 +1,1351 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <ctype.h>
+#include <sys/uio.h>
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "logging.h"
+#include "compat-errno.h"
+
+#include "bit-rot.h"
+#include <pthread.h>
+
+/**
+ * Return the position of @child in the private children array of @this,
+ * or -1 when it is not found or an argument check fails.
+ */
+static int
+br_find_child_index (xlator_t *this, xlator_t *child)
+{
+        br_private_t *priv  = NULL;
+        int           pos   = 0;
+        int           found = -1;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+        GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+        priv = this->private;
+
+        while (pos < priv->child_count) {
+                if (priv->children[pos].xl == child) {
+                        found = pos;
+                        break;
+                }
+                pos++;
+        }
+
+out:
+        return found;
+}
+
+/**
+ * Tear down the children array of @this: destroy each child's timer
+ * mem-pool, unlink the child from its list, then free the array and
+ * reset the pointer.
+ */
+static void
+br_free_children (xlator_t *this)
+{
+        br_private_t *priv = this->private;
+        br_child_t   *cur  = NULL;
+        int32_t       idx  = 0;
+
+        while (idx < priv->child_count) {
+                cur = &priv->children[idx];
+
+                mem_pool_destroy (cur->timer_pool);
+                list_del_init (&cur->list);
+
+                idx++;
+        }
+
+        GF_FREE (priv->children);
+        priv->children = NULL;
+}
+
+/**
+ * Find the child whose brick_path equals @brick_path. The children array
+ * is scanned under priv->lock.
+ *
+ * Returns the matching child, or NULL when there is none or an argument
+ * check fails.
+ */
+br_child_t *
+br_get_child_from_brick_path (xlator_t *this, char *brick_path)
+{
+        br_private_t *priv  = NULL;
+        br_child_t   *found = NULL;
+        br_child_t   *cur   = NULL;
+        int           idx   = 0;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+        GF_VALIDATE_OR_GOTO (this->name, brick_path, out);
+
+        priv = this->private;
+
+        pthread_mutex_lock (&priv->lock);
+        {
+                while (idx < priv->child_count) {
+                        cur = &priv->children[idx];
+                        if (strcmp (cur->brick_path, brick_path) == 0) {
+                                found = cur;
+                                break;
+                        }
+                        idx++;
+                }
+        }
+        pthread_mutex_unlock (&priv->lock);
+
+out:
+        return found;
+}
+
+/**
+ * Brick init routine: currently hands @brick back unchanged and does not
+ * use @xl (presumably registered as the per-brick init callback with the
+ * changelog library -- TODO confirm against the registration code).
+ * Probably we'll encapsulate brick inside our own structure when
+ * needed -- later.
+ */
+void *
+br_brick_init (void *xl, struct gf_brick_spec *brick)
+{
+ return brick;
+}
+
+/**
+ * Brick fini routine: currently a no-op, none of its arguments are used.
+ * Clean up here whatever br_brick_init() starts allocating.
+ */
+void
+br_brick_fini (void *xl, char *brick, void *data)
+{
+ return;
+}
+
+/**
+ * Build the signature structure that is sent to the bitrot stub for
+ * @object.
+ *
+ * @sign:     raw hash bytes
+ * @hashlen:  number of bytes in @sign
+ * @hashtype: stored in the signaturetype field
+ * @object:   supplies the signed version
+ *
+ * The allocation asks for @hashlen + 1 signature bytes; the extra byte
+ * holds a NUL terminator placed right after the hash (assumes
+ * signature_size (n) reserves n bytes for the signature array -- TODO
+ * confirm against the stub header).
+ *
+ * Returns the new signature (released by the caller with GF_FREE) or
+ * NULL on allocation failure.
+ *
+ * TODO: Signature can contain null terminators which causes bitrot
+ * stub to store truncated hash as it depends on string length of
+ * the hash.
+ *
+ * FIX: Send the string length as part of the signature struct and
+ * change stub to handle this change.
+ */
+static inline br_isignature_t *
+br_prepare_signature (const unsigned char *sign,
+                      unsigned long hashlen,
+                      int8_t hashtype, br_object_t *object)
+{
+        br_isignature_t *signature = NULL;
+
+        /* TODO: use mem-pool */
+        signature = GF_CALLOC (1, signature_size (hashlen + 1),
+                               gf_br_stub_mt_signature_t);
+        if (!signature)
+                return NULL;
+
+        signature->signedversion = object->signedversion;
+        signature->signaturetype = hashtype;
+        memcpy (signature->signature, (char *)sign, hashlen);
+        /* last valid index of the hashlen + 1 bytes requested above */
+        signature->signature[hashlen] = '\0';
+
+        return signature;
+}
+
+/**
+ * Do a lookup on the gfid present within the object.
+ *
+ * The inode for object->gfid is taken from the child's inode table when
+ * inode_find() has it, otherwise a new inode is allocated. After a
+ * successful syncop_lookup() on the child subvolume the inode is linked
+ * and the result is stored in @linked_inode (inode_lookup() is called on
+ * it when it is non-NULL). The local loc is wiped before returning.
+ *
+ * Returns the syncop_lookup() status on the normal path, -ENOMEM when no
+ * inode could be obtained and -EINVAL when a GF_VALIDATE_OR_GOTO check on
+ * @this or @object fails.
+ *
+ * NOTE(review): the return value is left untouched when inode_link()
+ * returns NULL, so *linked_inode may need checking by the caller --
+ * confirm whether that case can happen here.
+ */
+static inline int32_t
+br_object_lookup (xlator_t *this, br_object_t *object,
+ struct iatt *iatt, inode_t **linked_inode)
+{
+ int ret = -EINVAL;
+ loc_t loc = {0, };
+ inode_t *inode = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, object, out);
+
+ inode = inode_find (object->child->table, object->gfid);
+
+ if (inode)
+ loc.inode = inode;
+ else
+ loc.inode = inode_new (object->child->table);
+
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ uuid_copy (loc.gfid, object->gfid);
+
+ ret = syncop_lookup (object->child->xl, &loc, NULL, iatt, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * The file might have been deleted by the application
+ * after getting the event, but before doing a lookup.
+ * So use linked_inode after inode_link is done.
+ */
+ *linked_inode = inode_link (loc.inode, NULL, NULL, iatt);
+ if (*linked_inode)
+ inode_lookup (*linked_inode);
+
+out:
+ loc_wipe (&loc);
+ return ret;
+}
+
+/**
+ * open the object with O_RDONLY flags and return the fd. How to let brick
+ * know that open is being done by bitd because syncop framework does not allow
+ * passing xdata -- may be use frame->root->pid itself.
+ *
+ * An fd is created on @inode and opened through the child subvolume with
+ * syncop_open(). On success the fd is bound (fd_bind) and handed back via
+ * @openfd; on failure the fd is unref'd and @openfd is left untouched.
+ *
+ * Returns the syncop_open() status, -EINVAL when fd_create() fails and -1
+ * when one of the GF_VALIDATE_OR_GOTO argument checks fails.
+ */
+static inline int32_t
+br_object_open (xlator_t *this,
+ br_object_t *object, inode_t *inode, fd_t **openfd)
+{
+ int32_t ret = -1;
+ fd_t *fd = NULL;
+ loc_t loc = {0, };
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, object, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+
+ ret = -EINVAL;
+ fd = fd_create (inode, 0);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to create fd for the "
+ "inode %s", uuid_utoa (inode->gfid));
+ goto out;
+ }
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ ret = syncop_open (object->child->xl, &loc, O_RDONLY, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ *openfd = fd;
+ }
+
+ loc_wipe (&loc);
+
+out:
+ return ret;
+}
+
+/**
+ * Read up to @size bytes of the object behind @fd starting at @offset
+ * (through the child subvolume) and feed what was read into the running
+ * SHA256 context @sha256. The iovec and iobref handed out by the read are
+ * released before returning.
+ *
+ * Returns the syncop_readv() result when it is >= 0 (the caller advances
+ * its offset by it; 0 means nothing more was read), or -1 when the read
+ * or an argument check failed.
+ */
+static int32_t
+br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
+                               off_t offset, size_t size, SHA256_CTX *sha256)
+{
+        int32_t        ret    = -1;
+        struct iovec  *iovec  = NULL;
+        struct iobref *iobref = NULL;
+        int            count  = 0;
+        int            i      = 0;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd->inode, out);
+        GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+        ret = syncop_readv (child->xl, fd,
+                            size, offset, 0, &iovec, &count, &iobref);
+
+        if (ret < 0) {
+                /* the failure reason travels in the (negative) return
+                 * value, errno is not set by the syncop call */
+                gf_log (this->name, GF_LOG_ERROR, "readv on %s failed (%s)",
+                        uuid_utoa (fd->inode->gfid), strerror (-ret));
+                ret = -1;
+                goto out;
+        }
+
+        if (ret == 0)
+                goto out;
+
+        for (i = 0; i < count; i++) {
+                SHA256_Update (sha256,
+                               (const unsigned char *) (iovec[i].iov_base),
+                               iovec[i].iov_len);
+        }
+
+ out:
+        if (iovec)
+                GF_FREE (iovec);
+
+        if (iobref)
+                iobref_unref (iobref);
+
+        return ret;
+}
+
+/**
+ * Compute the SHA256 digest of the object open on @fd by reading it
+ * sequentially in 128K blocks, and store the digest in @md.
+ * @iatt is only checked for being non-NULL.
+ *
+ * Returns 0 with @md filled in once a read reports no more data, a
+ * negative value when a block read failed, or -1 when an argument check
+ * fails; @md is not written in the failure cases.
+ */
+int32_t
+br_object_checksum (unsigned char *md,
+                    br_object_t *object, fd_t *fd, struct iatt *iatt)
+{
+        int32_t   ret    = -1;
+        off_t     offset = 0;
+        size_t    block  = 128 * 1024; /* 128K block size */
+        xlator_t *this   = NULL;
+
+        SHA256_CTX sha256;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", iatt, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", fd, out);
+
+        this = object->this;
+
+        SHA256_Init (&sha256);
+
+        while (1) {
+                ret = br_object_read_block_and_sign (this, fd, object->child,
+                                                     offset, block, &sha256);
+                if (ret < 0) {
+                        /* off_t is not guaranteed to be a long: widen it
+                         * explicitly to match the conversion */
+                        gf_log (this->name, GF_LOG_ERROR, "reading block with "
+                                "offset %llu of object %s failed",
+                                (unsigned long long) offset,
+                                uuid_utoa (fd->inode->gfid));
+                        break;
+                }
+
+                if (ret == 0)
+                        break;
+
+                offset += ret;
+        }
+
+        if (ret == 0)
+                SHA256_Final (md, &sha256);
+
+ out:
+        return ret;
+}
+
+/**
+ * Checksum the object open on @fd and store the result as its signature.
+ *
+ * The SHA256 digest is computed with br_object_checksum(), wrapped into a
+ * signature by br_prepare_signature(), placed in a dict under
+ * GLUSTERFS_SET_OBJECT_SIGNATURE and sent to the child subvolume with
+ * syncop_fsetxattr(). All temporaries are released before returning.
+ *
+ * Returns 0 on success, -1 when an argument check fails, -ENOMEM when
+ * the digest buffer, signature or dict cannot be allocated, or the
+ * status of the failing checksum / fsetxattr step.
+ */
+static inline int32_t
+br_object_read_sign (inode_t *linked_inode, fd_t *fd, br_object_t *object,
+                     struct iatt *iatt)
+{
+        int32_t          ret   = -1;
+        xlator_t        *this  = NULL;
+        dict_t          *xattr = NULL;
+        unsigned char   *md    = NULL;
+        br_isignature_t *sign  = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", linked_inode, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", fd, out);
+
+        this = object->this;
+
+        md = GF_CALLOC (SHA256_DIGEST_LENGTH, sizeof (*md), gf_common_mt_char);
+        if (!md) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to allocate memory "
+                        "for saving hash of the object %s",
+                        uuid_utoa (fd->inode->gfid));
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = br_object_checksum (md, object, fd, iatt);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "calculating checksum for "
+                        "the object %s failed", uuid_utoa (linked_inode->gfid));
+                goto free_signature;
+        }
+
+        sign = br_prepare_signature (md, SHA256_DIGEST_LENGTH,
+                                     BR_SIGNATURE_TYPE_SHA256, object);
+        if (!sign) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to get the signature "
+                        "for the object %s", uuid_utoa (fd->inode->gfid));
+                /* ret still holds the checksum's 0: report the failure */
+                ret = -ENOMEM;
+                goto free_signature;
+        }
+
+        xattr = dict_for_key_value
+                (GLUSTERFS_SET_OBJECT_SIGNATURE,
+                 (void *)sign, signature_size (SHA256_DIGEST_LENGTH));
+
+        if (!xattr) {
+                gf_log (this->name, GF_LOG_ERROR, "dict allocation for signing"
+                        " failed for the object %s",
+                        uuid_utoa (fd->inode->gfid));
+                /* same as above: do not return the stale 0 */
+                ret = -ENOMEM;
+                goto free_isign;
+        }
+
+        ret = syncop_fsetxattr (object->child->xl, fd, xattr, 0);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "fsetxattr of signature to "
+                        "the object %s failed", uuid_utoa (fd->inode->gfid));
+                goto unref_dict;
+        }
+
+        ret = 0;
+
+ unref_dict:
+        dict_unref (xattr);
+ free_isign:
+        GF_FREE (sign);
+ free_signature:
+        GF_FREE (md);
+ out:
+        return ret;
+}
+
+/**
+ * Tell whether @op_errno is a "soft" signing error, i.e. ENOENT or
+ * ESTALE. Returns non-zero for those two values and 0 for anything else.
+ */
+static inline int br_object_sign_softerror (int32_t op_errno)
+{
+        /* comparison, not assignment: only ENOENT/ESTALE are soft */
+        return ((op_errno == ENOENT) || (op_errno == ESTALE));
+}
+
+/**
+ * Log a failed operation @op on the object identified by @gfid together
+ * with the reason for @op_errno. Errors classified as soft by
+ * br_object_sign_softerror() go to DEBUG, everything else to ERROR.
+ */
+void
+br_log_object (xlator_t *this, char *op, uuid_t gfid, int32_t op_errno)
+{
+        int level = GF_LOG_ERROR;
+
+        if (br_object_sign_softerror (op_errno))
+                level = GF_LOG_DEBUG;
+
+        gf_log (this->name, level,
+                "%s() failed on object %s [reason: %s]",
+                op, uuid_utoa (gfid), strerror (op_errno));
+}
+
+/**
+ * Same as br_log_object(), but the object is named by @path instead of
+ * by gfid: soft errors are logged at DEBUG, the rest at ERROR.
+ */
+void
+br_log_object_path (xlator_t *this, char *op,
+                    const char *path, int32_t op_errno)
+{
+        int level = GF_LOG_ERROR;
+
+        if (br_object_sign_softerror (op_errno))
+                level = GF_LOG_DEBUG;
+
+        gf_log (this->name, level,
+                "%s() failed on object %s [reason: %s]",
+                op, path, strerror (op_errno));
+}
+
+/**
+ * Sign a given object. This routine runs full throttle. There needs to be
+ * some form of priority scheduling and/or read burstness to avoid starving
+ * (or kicking) client I/O's.
+ *
+ * Steps: set the syncop fs pid to GF_CLIENT_PID_BITD, look up the object,
+ * open it read-only, then checksum and sign it (br_object_read_sign). The
+ * fd and the linked inode are unref'd on the way out.
+ *
+ * Returns 0 on success, -1 when @object is NULL, otherwise the status of
+ * the step that failed.
+ *
+ * NOTE(review): the unref_inode label is reached with whatever
+ * br_object_lookup() stored in linked_inode -- confirm it cannot be NULL
+ * there.
+ */
+static inline int32_t br_sign_object (br_object_t *object)
+{
+ int32_t ret = -1;
+ inode_t *linked_inode = NULL;
+ xlator_t *this = NULL;
+ fd_t *fd = NULL;
+ struct iatt iatt = {0, };
+ pid_t pid = GF_CLIENT_PID_BITD;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
+
+ this = object->this;
+
+ /**
+ * FIXME: This is required as signing an object is restricted to
+ * clients with special frame->root->pid. Change the way client
+ * pid is set.
+ */
+ syncopctx_setfspid (&pid);
+
+ ret = br_object_lookup (this, object, &iatt, &linked_inode);
+ if (ret) {
+ br_log_object (this, "lookup", object->gfid, -ret);
+ goto out;
+ }
+
+ ret = br_object_open (this, object, linked_inode, &fd);
+ if (!fd) {
+ br_log_object (this, "open", object->gfid, -ret);
+ goto unref_inode;
+ }
+
+ /**
+ * we have an open file descriptor on the object. from here on,
+ * do not be generous to file operation errors.
+ */
+
+ /* per-object trace, logged at DEBUG level */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Signing object [%s]", uuid_utoa (linked_inode->gfid));
+
+ ret = br_object_read_sign (linked_inode, fd, object, &iatt);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "reading and signing of the "
+ "object %s failed", uuid_utoa (linked_inode->gfid));
+ goto unref_fd;
+ }
+
+ ret = 0;
+
+ unref_fd:
+ fd_unref (fd);
+ unref_inode:
+ inode_unref (linked_inode);
+ out:
+ return ret;
+}
+
+/**
+ * Take the first object off the signing queue, waiting on
+ * priv->object_cond (with priv->lock) for as long as the queue is empty.
+ * The caller is expected to hold priv->lock.
+ */
+static inline br_object_t *__br_pick_object (br_private_t *priv)
+{
+        br_object_t *picked = NULL;
+
+        for (;;) {
+                if (!list_empty (&priv->obj_queue->objects))
+                        break;
+                pthread_cond_wait (&priv->object_cond, &priv->lock);
+        }
+
+        picked = list_first_entry (&priv->obj_queue->objects,
+                                   br_object_t, list);
+        list_del_init (&picked->list);
+
+        return picked;
+}
+
+/**
+ * This is the place where the signing of the objects is triggered.
+ *
+ * Thread routine (@arg is the xlator): loops forever, taking one object
+ * at a time off the queue under priv->lock (blocking inside
+ * __br_pick_object() while the queue is empty), signing it with
+ * br_sign_object() and freeing it afterwards. A failure that
+ * br_object_sign_softerror() does not classify as soft is logged at
+ * ERROR level. The trailing return is never reached.
+ */
+void *
+br_process_object (void *arg)
+{
+ xlator_t *this = NULL;
+ br_object_t *object = NULL;
+ br_private_t *priv = NULL;
+ int32_t ret = -1;
+
+ this = arg;
+ priv = this->private;
+
+ THIS = this;
+
+ for (;;) {
+ pthread_mutex_lock (&priv->lock);
+ {
+ object = __br_pick_object (priv);
+ }
+ pthread_mutex_unlock (&priv->lock);
+
+ ret = br_sign_object (object);
+ if (ret && !br_object_sign_softerror (-ret))
+ gf_log (this->name, GF_LOG_ERROR,
+ "SIGNING FAILURE [%s]",
+ uuid_utoa (object->gfid));
+ GF_FREE (object);
+ }
+
+ return NULL;
+}
+
+/**
+ * This function gets kicked in once the object is expired from the
+ * timer wheel. This actually adds the object received via notification
+ * from the changelog to the queue from where the objects gets picked
+ * up for signing.
+ *
+ * The object carried in @data is appended to priv->obj_queue under
+ * priv->lock, waiters on priv->object_cond are woken, and @timer is
+ * handed back with mem_put(). @call_time is currently unused.
+ *
+ * This routine can be made lightweight by introducing an alternate
+ * timer-wheel API that dispatches _all_ expired objects in one-shot
+ * rather than an object at-a-time. This routine can then just simply
+ * be a call to list_splice_tail().
+ *
+ * NOTE: use call_time to instrument signing time in br_sign_object().
+ */
+void
+br_add_object_to_queue (struct gf_tw_timer_list *timer,
+ void *data, unsigned long call_time)
+{
+ br_object_t *object = NULL;
+ xlator_t *this = NULL;
+ br_private_t *priv = NULL;
+
+ object = data;
+ this = object->this;
+ priv = this->private;
+
+ pthread_mutex_lock (&priv->lock);
+ {
+ list_add_tail (&object->list, &priv->obj_queue->objects);
+ pthread_cond_broadcast (&priv->object_cond);
+ }
+ pthread_mutex_unlock (&priv->lock);
+
+ mem_put (timer);
+ return;
+}
+
+/**
+ * Allocate an object for the release event @ev received for @child and
+ * fill in its xlator, child, gfid and signed version.
+ *
+ * Returns the new object, or NULL when the allocation fails.
+ */
+static inline br_object_t *
+br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev)
+{
+        br_object_t *obj = GF_CALLOC (1, sizeof (*obj), gf_br_mt_br_object_t);
+
+        if (obj == NULL)
+                return NULL;
+
+        INIT_LIST_HEAD (&obj->list);
+
+        obj->this  = this;
+        obj->child = child;
+        uuid_copy (obj->gfid, ev->u.releasebr.gfid);
+
+        /* NOTE: it's BE, but no worry */
+        obj->signedversion = ev->u.releasebr.version;
+
+        return obj;
+}
+
+/**
+ * Take a timer from the child's timer pool, point it at @object with
+ * br_add_object_to_queue() as its function and priv->expiry_time as its
+ * expiry, and add it to the timer wheel. @ev is not used.
+ *
+ * Returns the timer, or NULL when none could be obtained from the pool.
+ */
+static inline struct gf_tw_timer_list *
+br_initialize_timer (xlator_t *this, br_object_t *object, br_child_t *child,
+                     changelog_event_t *ev)
+{
+        br_private_t            *priv     = this->private;
+        struct gf_tw_timer_list *tw_timer = mem_get0 (child->timer_pool);
+
+        if (tw_timer == NULL)
+                return NULL;
+
+        INIT_LIST_HEAD (&tw_timer->entry);
+
+        tw_timer->data     = object;
+        tw_timer->expires  = priv->expiry_time;
+        tw_timer->function = br_add_object_to_queue;
+
+        gf_tw_add_timer (priv->timer_wheel, tw_timer);
+
+        return tw_timer;
+}
+
+/**
+ * This callback function registered with the changelog is executed
+ * whenever a notification from the changelog is received. This should
+ * add the object (or the gfid) on which the notification has come to
+ * the timer-wheel with some expiry time.
+ *
+ * @xl:    the bit-rot xlator
+ * @brick: brick path, used to find the matching child
+ * @data:  not used here
+ * @ev:    the release event; events whose flags equal O_RDONLY are ignored
+ *
+ * Nothing is returned; failures are logged and the event is dropped. An
+ * object whose timer could not be set up is freed here.
+ *
+ * TODO: use mem-pool for allocations and maybe allocate timer and
+ * object as a single alloc and bifurcate their respective pointers.
+ */
+void
+br_brick_callback (void *xl, char *brick,
+                   void *data, changelog_event_t *ev)
+{
+        uuid_t                   gfid   = {0,};
+        xlator_t                *this   = NULL;
+        br_object_t             *object = NULL;
+        br_child_t              *child  = NULL;
+        int32_t                  flags  = 0;
+        struct gf_tw_timer_list *timer  = NULL;
+
+        this = xl;
+
+        /* validate the xlator before this->name is used by later checks */
+        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+        GF_VALIDATE_OR_GOTO (this->name, ev, out);
+
+        GF_ASSERT (ev->ev_type == CHANGELOG_OP_TYPE_BR_RELEASE);
+        GF_ASSERT (!uuid_is_null (ev->u.releasebr.gfid));
+
+        uuid_copy (gfid, ev->u.releasebr.gfid);
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "RELEASE EVENT [GFID %s]", uuid_utoa (gfid));
+
+        flags = (int32_t)ntohl (ev->u.releasebr.flags);
+        if (flags == O_RDONLY) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Read only fd [GFID: %s], ignoring signing..",
+                        uuid_utoa (gfid));
+                goto out;
+        }
+
+        child = br_get_child_from_brick_path (this, brick);
+        if (!child) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume "
+                        "for the brick %s", brick);
+                goto out;
+        }
+
+        object = br_initialize_object (this, child, ev);
+        if (!object) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to allocate "
+                        "object memory [GFID: %s]", uuid_utoa (gfid));
+                goto out;
+        }
+
+        timer = br_initialize_timer (this, object, child, ev);
+        if (!timer) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to allocate "
+                        "object expiry timer [GFID: %s]", uuid_utoa (gfid));
+                goto free_object;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "->callback: brick [%s], type [%d]\n",
+                brick, ev->ev_type);
+
+        return;
+
+ free_object:
+        GF_FREE (object);
+out:
+        return;
+}
+
+/**
+ * Populate a changelog brick specification for @path: subscribe to
+ * release events only and install the bit-rot init/fini/event callbacks.
+ * No connect/disconnect handlers are installed.
+ */
+void
+br_fill_brick_spec (struct gf_brick_spec *brick, char *path)
+{
+        /* lifecycle and event callbacks */
+        brick->init         = br_brick_init;
+        brick->fini         = br_brick_fini;
+        brick->callback     = br_brick_callback;
+        brick->connected    = NULL;
+        brick->disconnected = NULL;
+
+        /* what to listen to, and where */
+        brick->filter     = CHANGELOG_OP_TYPE_BR_RELEASE;
+        brick->brick_path = gf_strdup (path);
+}
+
+/**
+ * Return _gf_true when the time recorded in @child (child->tv) matches
+ * @tv in both seconds and microseconds, _gf_false otherwise.
+ */
+static inline gf_boolean_t
+br_time_equal (br_child_t *child, struct timeval *tv)
+{
+        if (child->tv.tv_sec != tv->tv_sec)
+                return _gf_false;
+
+        if (child->tv.tv_usec != tv->tv_usec)
+                return _gf_false;
+
+        return _gf_true;
+}
+
+/**
+ * Decide from the signature information in @xattr whether an object
+ * should be signed by the crawler. Returns _gf_false on invalid
+ * arguments or when the signature information cannot be extracted.
+ */
+static inline gf_boolean_t
+br_check_object_need_sign (xlator_t *this, dict_t *xattr, br_child_t *child)
+{
+        gf_boolean_t         need_sign = _gf_false;
+        struct timeval       sign_tv   = {0,};
+        br_isignature_out_t *sign      = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, xattr, out);
+        GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+        if (dict_get_ptr (xattr, GLUSTERFS_GET_OBJECT_SIGNATURE,
+                          (void **)&sign)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get object signature info");
+                goto out;
+        }
+
+        /* the signature time is converted from network byte order */
+        sign_tv.tv_sec  = ntohl (sign->time[0]);
+        sign_tv.tv_usec = ntohl (sign->time[1]);
+
+        /* sign only when the signature is marked stale and its timestamp
+         * differs from the time recorded for the child (child->tv) */
+        need_sign = (sign->stale && !br_time_equal (child, &sign_tv))
+                        ? _gf_true : _gf_false;
+
+out:
+        return need_sign;
+}
+
+/**
+ * Trigger signing of an object by opening it read-write on the child
+ * and closing it again. Failures are logged and otherwise ignored.
+ */
+static inline void
+br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
+                 loc_t *loc)
+{
+        int32_t  status = -1;
+        fd_t    *fd     = fd_create (linked_inode, 0);
+
+        if (!fd) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to create fd [GFID %s]",
+                        uuid_utoa (linked_inode->gfid));
+                return;
+        }
+
+        status = syncop_open (child->xl, loc, O_RDWR, fd);
+        if (status) {
+                /* open failed: log it and drop our fd reference */
+                br_log_object (this, "open", linked_inode->gfid, -status);
+                fd_unref (fd);
+                return;
+        }
+
+        fd_bind (fd);
+        syncop_close (fd);
+}
+
+/**
+ * Build @loc for directory entry @entry under @parent.
+ *
+ * Returns 1 when @loc is ready for use, 0 when the entry is already
+ * known in the child's inode table and is not a regular file, and the
+ * inode_path() result when the path could not be constructed. @loc may
+ * be partially filled on the 0 and failure returns.
+ */
+int32_t
+br_prepare_loc (xlator_t *this, br_child_t *child, loc_t *parent,
+                gf_dirent_t *entry, loc_t *loc)
+{
+        int32_t  ret    = -1;
+        inode_t *cached = NULL;
+
+        cached = inode_grep (child->table, parent->inode, entry->d_name);
+        if (cached) {
+                loc->inode = cached;
+                if (cached->ia_type != IA_IFREG) {
+                        gf_log (this->name, GF_LOG_DEBUG, "%s is not a regular "
+                                "file", entry->d_name);
+                        return 0;
+                }
+        } else {
+                /* not in the inode table yet: start with a fresh inode */
+                loc->inode = inode_new (child->table);
+        }
+
+        loc->parent = inode_ref (parent->inode);
+        uuid_copy (loc->pargfid, parent->inode->gfid);
+
+        ret = inode_path (parent->inode, entry->d_name, (char **)&loc->path);
+        if (ret < 0 || !loc->path) {
+                gf_log (this->name, GF_LOG_ERROR, "inode_path on %s "
+                        "(parent: %s) failed", entry->d_name,
+                        uuid_utoa (parent->inode->gfid));
+                return ret;
+        }
+
+        /* the basename is whatever follows the last '/' of the path */
+        loc->name = strrchr (loc->path, '/');
+        if (loc->name)
+                loc->name++;
+
+        return 1;
+}
+
+/**
+ * Oneshot crawler
+ * ---------------
+ * This is a catchup mechanism. Objects that remained unsigned from the
+ * last run for whatever reason (node crashes, reboots, etc..) become
+ * candidates for signing. This allows the signature to "catch up" with
+ * the current state of the object. Triggering signing is easy: perform
+ * an open() followed by a close() thereby resulting in call boomerang.
+ * (though not back to itself :))
+ *
+ * Invoked once per directory entry by the tree walk; @data is the
+ * br_child_t being crawled. Returns 0 when the entry was handled or
+ * skipped, non-zero when preparing, looking up or linking the entry
+ * failed (or when getxattr failed and no signing was triggered).
+ */
+int
+bitd_oneshot_crawl (xlator_t *subvol,
+                    gf_dirent_t *entry, loc_t *parent, void *data)
+{
+        int           op_errno     = 0;
+        br_child_t   *child        = NULL;
+        xlator_t     *this         = NULL;
+        loc_t         loc          = {0, };
+        struct iatt   iatt         = {0, };
+        struct iatt   parent_buf   = {0, };
+        dict_t       *xattr        = NULL;
+        int32_t       ret          = -1;
+        inode_t      *linked_inode = NULL;
+        gf_boolean_t  need_signing = _gf_false;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", subvol, out);
+        GF_VALIDATE_OR_GOTO ("bit-rot", data, out);
+
+        child = data;
+        this = child->this;
+
+        /* 0: nothing to do for this entry; < 0: loc could not be built.
+         * Either way do not go on with a half-initialized loc. */
+        ret = br_prepare_loc (this, child, parent, entry, &loc);
+        if (ret <= 0)
+                goto out;
+
+        ret = syncop_lookup (child->xl, &loc, NULL, &iatt, NULL, &parent_buf);
+        if (ret) {
+                br_log_object_path (this, "lookup", loc.path, -ret);
+                goto out;
+        }
+
+        /* everything below dereferences the linked inode */
+        linked_inode = inode_link (loc.inode, parent->inode, loc.name, &iatt);
+        if (!linked_inode) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to link inode for %s", entry->d_name);
+                ret = -1;
+                goto out;
+        }
+        inode_lookup (linked_inode);
+
+        if (iatt.ia_type != IA_IFREG) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "%s is not a regular file, skipping..", entry->d_name);
+                ret = 0;
+                goto unref_inode;
+        }
+
+        /**
+         * As of now, 2 cases are possible and handled.
+         * 1) GlusterFS is upgraded from a previous version which does not
+         *    have any idea about bit-rot and has data in the filesystem.
+         *    In this case syncop_getxattr fails with ENODATA and the object
+         *    is signed. (In reality, when the crawler sends a lookup,
+         *    bit-rot-stub creates the xattrs before returning the lookup
+         *    reply)
+         * 2) Bit-rot was not enabled or BitD was down for some reason,
+         *    during which some files were created, but since BitD was down,
+         *    were not signed.
+         * If the file was just created and was being written some data when
+         * the down BitD came up, then bit-rot stub should be intelligent to
+         * identify this case (by comparing the ongoing version or by checking
+         * if there are any fds present for that inode) and handle properly.
+         */
+
+        ret = syncop_getxattr (child->xl, &loc, &xattr,
+                               GLUSTERFS_GET_OBJECT_SIGNATURE, NULL);
+        if (ret < 0) {
+                op_errno = -ret;
+                br_log_object (this, "getxattr", linked_inode->gfid, op_errno);
+
+                if (op_errno == ENODATA)
+                        need_signing = _gf_true;
+                if (op_errno == EINVAL)
+                        gf_log (this->name, GF_LOG_WARNING, "Partial version "
+                                "xattr presence detected, ignoring [GFID: %s]",
+                                uuid_utoa (linked_inode->gfid));
+        } else {
+                need_signing = br_check_object_need_sign (this, xattr, child);
+        }
+
+        if (!need_signing)
+                goto unref_dict;
+
+        gf_log (this->name, GF_LOG_INFO,
+                "Triggering signing for %s [GFID: %s | Brick: %s]",
+                loc.path, uuid_utoa (linked_inode->gfid), child->brick_path);
+        br_trigger_sign (this, child, linked_inode, &loc);
+
+        ret = 0;
+
+ unref_dict:
+        if (xattr)
+                dict_unref (xattr);
+ unref_inode:
+        inode_unref (linked_inode);
+ out:
+        loc_wipe (&loc);
+
+        return ret;
+}
+
+/* throttle parameters handed to syncop_ftw_throttle() by the crawler */
+#define BR_CRAWL_THROTTLE_COUNT 50
+#define BR_CRAWL_THROTTLE_ZZZ 5
+
+/**
+ * Thread entry point of the one-shot crawler: walks the brick of the
+ * br_child_t passed as @arg starting at the root of its inode table,
+ * calling bitd_oneshot_crawl() for the entries visited. The result of
+ * the walk is ignored. Always returns NULL.
+ */
+void *
+br_oneshot_signer (void *arg)
+{
+        br_child_t *child = arg;
+        xlator_t   *this  = child->this;
+        loc_t       root  = {0,};
+
+        /* every spawned thread has to set THIS on its own */
+        THIS = this;
+
+        gf_log (this->name, GF_LOG_INFO, "Crawling brick [%s], scanning "
+                "for unsigned objects", child->brick_path);
+
+        root.inode = child->table->root;
+        (void) syncop_ftw_throttle (child->xl, &root, GF_CLIENT_PID_BITD,
+                                    child, bitd_oneshot_crawl,
+                                    BR_CRAWL_THROTTLE_COUNT,
+                                    BR_CRAWL_THROTTLE_ZZZ);
+
+        gf_log (this->name, GF_LOG_INFO,
+                "Completed crawling brick [%s]", child->brick_path);
+
+        return NULL;
+}
+
+/**
+ * At this point a thread is spawned to crawl the filesystem (in
+ * tortoise pace) to sign objects that were not signed in previous run(s).
+ * Such objects are identified by examining their dirtiness and timestamp.
+ *
+ * pick object:
+ *     signature_is_stale() && (object_timestamp() <= stub_init_time())
+ *
+ * Also, we register to the changelog library to subscribe for event
+ * notifications.
+ *
+ * Returns 0 once registration succeeded (even if the crawler thread
+ * could not be spawned), -1 on allocation or registration failure.
+ */
+static inline int32_t
+br_enact_signer (xlator_t *this, br_child_t *child, br_stub_init_t *stub)
+{
+        int32_t               ret   = 0;
+        struct gf_brick_spec *brick = NULL;
+
+        brick = GF_CALLOC (1, sizeof (struct gf_brick_spec),
+                           gf_common_mt_gf_brick_spec_t);
+        if (!brick)
+                goto error_return;
+
+        br_fill_brick_spec (brick, stub->export);
+        ret = gf_changelog_register_generic
+                         (brick, 1, 1, this->ctx->cmd_args.log_file, -1, this);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Register to changelog failed"
+                        " [Reason: %s]", strerror (errno));
+                goto dealloc;
+        }
+
+        child->threadrunning = 0;
+        ret = gf_thread_create (&child->thread, NULL, br_oneshot_signer, child);
+        if (ret)
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to spawn FS crawler thread");
+        else
+                child->threadrunning = 1;
+
+        /* it's OK to continue, "old" objects would be signed when modified */
+        return 0;
+
+ dealloc:
+        /* br_fill_brick_spec() duplicated the export path into the spec;
+         * release it together with the spec itself.
+         * NOTE(review): assumes a failed registration keeps no reference
+         * to brick->brick_path -- confirm against the changelog library */
+        if (brick->brick_path)
+                GF_FREE (brick->brick_path);
+        GF_FREE (brick);
+ error_return:
+        return -1;
+}
+
+/**
+ * This routine fetches various attributes associated with a child which
+ * is basically a subvolume. Attributes include brick path and the stub
+ * birth time. This is done by performing a lookup on the root followed
+ * by getxattr() on a virtual key. On success the signer is enacted for
+ * the child.
+ *
+ * Returns the result of br_enact_signer() on success, -1 on any failure.
+ */
+static inline int32_t
+br_brick_connect (xlator_t *this, br_child_t *child)
+{
+        int32_t         ret        = -1;
+        loc_t           loc        = {0, };
+        struct iatt     buf        = {0, };
+        struct iatt     parent     = {0, };
+        br_stub_init_t *stub       = NULL;
+        dict_t         *xattr      = NULL;
+        int             op_errno   = 0;
+        size_t          export_len = 0;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, child, out);
+        GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+
+        loc.inode = inode_ref (child->table->root);
+        uuid_copy (loc.gfid, loc.inode->gfid);
+        loc.path = gf_strdup ("/");
+        if (!loc.path) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to allocate memory for root path");
+                goto wipeloc;
+        }
+
+        ret = syncop_lookup (child->xl, &loc, NULL, &buf, NULL, &parent);
+        if (ret) {
+                op_errno = -ret;
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "lookup on root failed "
+                        "[Reason: %s]", strerror (op_errno));
+                goto wipeloc;
+        }
+
+        ret = syncop_getxattr (child->xl, &loc, &xattr,
+                               GLUSTERFS_GET_BR_STUB_INIT_TIME, NULL);
+        if (ret) {
+                op_errno = -ret;
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "failed to get stub info "
+                        "[Reason: %s]", strerror (op_errno));
+                goto wipeloc;
+        }
+
+        ret = dict_get_ptr (xattr, GLUSTERFS_GET_BR_STUB_INIT_TIME,
+                            (void **)&stub);
+        if (ret) {
+                /* callers test for exactly -1 */
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to extract stub information");
+                goto free_dict;
+        }
+
+        /* never copy more than brick_path can hold (terminator included) */
+        export_len = strlen (stub->export);
+        if (export_len >= sizeof (child->brick_path)) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "brick export path too long");
+                goto free_dict;
+        }
+        memcpy (child->brick_path, stub->export, export_len + 1);
+
+        /* timebuf[0] holds seconds, timebuf[1] microseconds (network order) */
+        child->tv.tv_sec = ntohl (stub->timebuf[0]);
+        child->tv.tv_usec = ntohl (stub->timebuf[1]);
+
+        ret = br_enact_signer (this, child, stub);
+
+ free_dict:
+        dict_unref (xattr);
+ wipeloc:
+        loc_wipe (&loc);
+ out:
+        return ret;
+}
+
+/**
+ * This function is executed in a separate thread. The thread waits for
+ * bricks (children) to be queued on priv->bricks, picks the first one
+ * and, if it is marked up, connects to it via br_brick_connect() to
+ * fetch information regarding that brick (such as the brick path).
+ * A child is removed from the queue only after a successful connect.
+ *
+ * @arg is the bit-rot xlator. The loop never terminates on its own.
+ *
+ * NOTE(review): a child at the head of the queue that is not up, or
+ * whose connect keeps failing, stays at the head; the loop then retries
+ * immediately without waiting. br_brick_connect() is also called with
+ * priv->lock held. Confirm whether both are intended.
+ */
+void *
+br_handle_events (void *arg)
+{
+ xlator_t *this = NULL;
+ br_private_t *priv = NULL;
+ br_child_t *child = NULL;
+ int32_t ret = -1;
+
+ this = arg;
+ priv = this->private;
+
+ /*
+ * Since, this is the topmost xlator, THIS has to be set by bit-rot
+ * xlator itself (STACK_WIND wont help in this case). Also it has
+ * to be done for each thread that gets spawned. Otherwise, a new
+ * thread will get global_xlator's pointer when it does "THIS".
+ */
+ THIS = this;
+
+ while (1) {
+ pthread_mutex_lock (&priv->lock);
+ {
+ /* sleep until at least one brick is queued */
+ while (list_empty (&priv->bricks)) {
+ pthread_cond_wait (&priv->cond,
+ &priv->lock);
+ }
+
+ /* always look at the head of the queue */
+ child = list_entry (priv->bricks.next, br_child_t,
+ list);
+ if (child && child->child_up) {
+ ret = br_brick_connect (this, child);
+ if (ret == -1)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to connect to the "
+ "child (subvolume: %s)",
+ child->xl->name);
+ else
+ /* connected: take the child off the queue */
+ list_del_init (&child->list);
+ }
+
+ }
+ pthread_mutex_unlock (&priv->lock);
+ }
+
+ return NULL;
+}
+
+/**
+ * Set up memory accounting for this xlator with room for all bit-rot
+ * memory types. Returns -1 when @this is NULL, otherwise the result of
+ * xlator_mem_acct_init().
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int32_t status = -1;
+
+        if (this) {
+                status = xlator_mem_acct_init (this, gf_br_stub_mt_end + 1);
+                if (status != 0)
+                        gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
+                                " init failed");
+        }
+
+        return status;
+}
+
+/**
+ * Event notification entry point of the xlator.
+ *
+ * CHILD_UP      : mark the child up, make sure it has an inode table and
+ *                 queue it for the event thread (br_handle_events).
+ * CHILD_DOWN    : mark the child down.
+ * CHILD_MODIFIED: only validates the subvolume.
+ * PARENT_UP     : passed on through default_notify().
+ * Any other event is ignored. Always returns 0.
+ */
+int
+notify (xlator_t *this, int32_t event, void *data, ...)
+{
+        xlator_t     *subvol = NULL;
+        br_private_t *priv   = NULL;
+        int           idx    = -1;
+        br_child_t   *child  = NULL;
+
+        subvol = (xlator_t *)data;
+        priv = this->private;
+
+        gf_log (this->name, GF_LOG_TRACE, "Notification received: %d",
+                event);
+
+        switch (event) {
+        case GF_EVENT_CHILD_UP:
+                /* should this be done under lock? or is it ok to do it
+                   without lock? */
+                idx = br_find_child_index (this, subvol);
+
+                pthread_mutex_lock (&priv->lock);
+                {
+                        if (idx < 0) {
+                                gf_log (this->name, GF_LOG_ERROR, "got child "
+                                        "up from invalid subvolume");
+                        } else {
+                                child = &priv->children[idx];
+                                /* count only down -> up transitions so the
+                                 * counter stays in step with CHILD_DOWN */
+                                if (child->child_up != 1) {
+                                        child->child_up = 1;
+                                        priv->up_children++;
+                                }
+                                if (!child->xl)
+                                        child->xl = subvol;
+                                if (!child->table)
+                                        child->table = inode_table_new (4096,
+                                                                       subvol);
+                                /* a child that is still queued must not be
+                                 * linked a second time */
+                                if (list_empty (&child->list))
+                                        list_add_tail (&child->list,
+                                                       &priv->bricks);
+                                pthread_cond_signal (&priv->cond);
+                        }
+                }
+                pthread_mutex_unlock (&priv->lock);
+                break;
+
+        case GF_EVENT_CHILD_MODIFIED:
+                idx = br_find_child_index (this, subvol);
+                if (idx < 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "received child "
+                                "modified from invalid subvolume");
+                        goto out;
+                }
+                priv = this->private;
+                /* ++(priv->generation); */
+                break;
+        case GF_EVENT_CHILD_DOWN:
+                idx = br_find_child_index (this, subvol);
+                if (idx < 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "received child down "
+                                "from invalid subvolume");
+                        goto out;
+                }
+
+                pthread_mutex_lock (&priv->lock);
+                {
+                        if (priv->children[idx].child_up == 1) {
+                                priv->children[idx].child_up = 0;
+                                priv->up_children--;
+                        }
+                }
+                pthread_mutex_unlock (&priv->lock);
+                break;
+        case GF_EVENT_PARENT_UP:
+                default_notify (this, GF_EVENT_PARENT_UP, data);
+                break;
+        default:
+                /* other events are not handled here */
+                break;
+        }
+
+out:
+        return 0;
+}
+
+/**
+ * Initialize the bit-rot xlator: allocate the private structure and the
+ * per-child state, start the event thread, set up the timer wheel and
+ * the object queue and spawn the signer worker threads.
+ *
+ * Returns 0 on success and non-zero on any failure.
+ *
+ * NOTE(review): threads that were already spawned are not stopped when a
+ * later step fails, and the per-child timer mem-pools are not released
+ * on failure -- both need a proper teardown path.
+ */
+int32_t
+init (xlator_t *this)
+{
+        int            i    = 0;
+        int32_t        ret  = -1;
+        br_private_t  *priv = NULL;
+        xlator_list_t *trav = NULL;
+
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_ERROR, "FATAL: no children");
+                goto out;
+        }
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to allocate memory (->priv)");
+                goto out;
+        }
+
+        /* initialize gfchangelog xlator context */
+        ret = gf_changelog_init (this);
+        if (ret)
+                goto out;
+
+        /* from here on every jump to "out" must carry a failure code;
+         * this also covers a jump taken inside GF_OPTION_INIT */
+        ret = -1;
+
+        GF_OPTION_INIT ("expiry-time", priv->expiry_time, int32, out);
+
+        priv->child_count = xlator_subvolume_count (this);
+        priv->children = GF_CALLOC (priv->child_count, sizeof (*priv->children),
+                                    gf_br_mt_br_child_t);
+        if (!priv->children)
+                goto out;
+
+        trav = this->children;
+        while (trav) {
+                priv->children[i].this = this;
+                priv->children[i].xl = trav->xlator;
+
+                priv->children[i].timer_pool =
+                                  mem_pool_new (struct gf_tw_timer_list, 4096);
+                if (!priv->children[i].timer_pool) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to allocate mem-pool for timer");
+                        errno = ENOMEM;
+                        goto out;
+                }
+
+                i++;
+                trav = trav->next;
+        }
+
+        pthread_mutex_init (&priv->lock, NULL);
+        pthread_cond_init (&priv->cond, NULL);
+
+        for (i = 0; i < priv->child_count; i++)
+                INIT_LIST_HEAD (&priv->children[i].list);
+        INIT_LIST_HEAD (&priv->bricks);
+
+        this->private = priv;
+
+        ret = gf_thread_create (&priv->thread, NULL, br_handle_events,
+                                this);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "thread creation failed (%s)", strerror (errno));
+                goto out;
+        }
+
+        /* thread creation left ret at 0; re-arm the failure code */
+        ret = -1;
+
+        priv->timer_wheel = gf_tw_init_timers ();
+        if (!priv->timer_wheel) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to initialize the "
+                        "timer wheel");
+                goto out;
+        }
+
+        pthread_cond_init (&priv->object_cond, NULL);
+        priv->obj_queue = GF_CALLOC (1, sizeof (*priv->obj_queue),
+                                     gf_br_mt_br_ob_n_wk_t);
+        if (!priv->obj_queue) {
+                gf_log (this->name, GF_LOG_ERROR, "memory allocation failed");
+                goto out;
+        }
+
+        INIT_LIST_HEAD (&priv->obj_queue->objects);
+
+        for (i = 0; i < BR_WORKERS; i++) {
+                /* capture the result: it was previously discarded, so a
+                 * failed spawn went unnoticed */
+                ret = gf_thread_create (&priv->obj_queue->workers[i], NULL,
+                                        br_process_object, this);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "thread creation failed (%s)",
+                                strerror (errno));
+                        goto out;
+                }
+        }
+
+        ret = 0;
+
+out:
+        if (ret) {
+                /* priv is NULL when its own allocation failed */
+                if (priv) {
+                        if (priv->children)
+                                GF_FREE (priv->children);
+                        if (priv->timer_wheel)
+                                gf_tw_cleanup_timers (priv->timer_wheel);
+                        GF_FREE (priv);
+                }
+                /* do not leave a dangling pointer behind */
+                this->private = NULL;
+                return ret;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded");
+        return ret;
+}
+
+/**
+ * Tear down the xlator: release the per-child state, clean up the timer
+ * wheel (if one was created) and free the private structure. Does
+ * nothing when the xlator has no private data.
+ */
+void
+fini (xlator_t *this)
+{
+        br_private_t *priv = this->private;
+
+        if (priv) {
+                br_free_children (this);
+
+                if (priv->timer_wheel)
+                        gf_tw_cleanup_timers (priv->timer_wheel);
+
+                /* detach before freeing so no stale pointer remains */
+                this->private = NULL;
+                GF_FREE (priv);
+        }
+}
+
+/* No file operations are defined here: the table is left zero-initialized. */
+struct xlator_fops fops;
+
+/* No callbacks are defined here: the table is left zero-initialized. */
+struct xlator_cbks cbks;
+
+/*
+ * Options understood by this xlator. "expiry-time" is read in init() into
+ * priv->expiry_time and used as the expiry of the per-object timers.
+ * The table is terminated by an entry whose key is NULL.
+ */
+struct volume_options options[] = {
+ { .key = {"expiry-time"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "120",
+ .description = "default time duration for which an object waits "
+ "before it is signed",
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
new file mode 100644
index 00000000000..ab9fd806232
--- /dev/null
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -0,0 +1,126 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __BIT_ROT_H__
+#define __BIT_ROT_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "defaults.h"
+#include "syncop.h"
+#include "syncop-utils.h"
+#include "changelog.h"
+#include "timer-wheel.h"
+
+#include "bit-rot-common.h"
+#include "bit-rot-stub-mem-types.h"
+
+#include <openssl/sha.h>
+
+/* TODO: make this configurable */
+#define BR_WORKERS 8
+
+/* signature header size plus hl plus one extra byte; the argument is
+ * parenthesized so that expressions can be passed safely */
+#define signature_size(hl) (sizeof (br_isignature_t) + (hl) + 1)
+
+/* Per-subvolume (brick) state kept by the bit-rot daemon. */
+struct br_child {
+ char child_up; /* Indicates whether this child is
+ up or not */
+ xlator_t *xl; /* client xlator corresponding to
+ this child */
+ inode_table_t *table; /* inode table for this child */
+ char brick_path[PATH_MAX]; /* brick export directory of this
+ child */
+ struct list_head list; /* hook to attach to the list of
+ UP children */
+ xlator_t *this; /* Bit rot xlator */
+
+ pthread_t thread; /* initial crawler for unsigned
+ object(s) */
+ int threadrunning; /* active thread */
+
+ struct mem_pool *timer_pool; /* timer-wheel's timer mem-pool */
+
+ struct timeval tv; /* time taken from the stub's init-time
+ information; compared against object
+ signature times */
+};
+
+typedef struct br_child br_child_t;
+
+/* Queue of expired objects together with the threads that sign them. */
+struct br_obj_n_workers {
+ struct list_head objects; /* queue of objects expired from the
+ timer wheel and ready to be picked
+ up for signing */
+ pthread_t workers[BR_WORKERS]; /* Threads which pick up the objects
+ from the above queue and start
+ signing each object */
+};
+
+typedef struct br_obj_n_workers br_obj_n_workers_t;
+
+/* Private data of the bit-rot xlator. */
+struct br_private {
+ pthread_mutex_t lock; /* taken around the bricks list, the
+ object queue and the child up/down
+ state */
+
+ struct list_head bricks; /* list of bricks from which CHILD_UP
+ has been received */
+
+ pthread_cond_t cond; /* handling CHILD_UP notifications */
+ pthread_cond_t object_cond; /* handling signing of objects */
+ int child_count; /* number of entries in children */
+ br_child_t *children; /* list of subvolumes */
+ int up_children; /* number of children currently up */
+ pthread_t thread; /* thread for connecting each UP
+ child with changelog */
+ struct tvec_base *timer_wheel; /* timer wheel where the objects which
+ changelog has sent sits and waits
+ for expiry */
+ br_obj_n_workers_t *obj_queue; /* place holder for all the objects
+ that are expired from timer wheel
+ and ready to be picked up for
+ signing and the workers which sign
+ the objects */
+ int32_t expiry_time; /* objects "wait" time */
+};
+
+typedef struct br_private br_private_t;
+
+/* An object (identified by gfid) waiting to be signed. */
+struct br_object {
+ xlator_t *this; /* Bit rot xlator */
+
+ uuid_t gfid; /* gfid of the object */
+
+ unsigned long signedversion; /* version against which this object will
+ be signed */
+ br_child_t *child; /* object's subvolume */
+
+ struct list_head list; /* hook to add to the queue once the
+ object is expired from timer wheel */
+ void *data;
+};
+
+typedef struct br_object br_object_t;
+
+/* Log a failed operation (named by the string argument) on the object
+ * with the given gfid; the last argument is the error number. */
+void
+br_log_object (xlator_t *, char *, uuid_t, int32_t);
+
+/* Same as br_log_object(), but the object is identified by its path. */
+void
+br_log_object_path (xlator_t *, char *, const char *, int32_t);
+
+/* NOTE(review): presumably computes the checksum of the object into the
+ * supplied buffer -- the definition is not part of this header */
+int32_t
+br_object_checksum (unsigned char *, br_object_t *, fd_t *, struct iatt *);
+
+/* Build a loc_t (last argument) for a directory entry under a parent loc. */
+int32_t
+br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
+
+#endif /* __BIT_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/stub/Makefile.am b/xlators/features/bit-rot/src/stub/Makefile.am
index 9abcbb76db2..ec6b1ef4506 100644
--- a/xlators/features/bit-rot/src/stub/Makefile.am
+++ b/xlators/features/bit-rot/src/stub/Makefile.am
@@ -8,9 +8,7 @@ bitrot_stub_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = bit-rot-stub.h bit-rot-common.h bit-rot-stub-mem-types.h
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/xlators/features/changelog/lib/src
-
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index 64779923fd6..492278639b4 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -15,9 +15,13 @@
enum br_mem_types {
gf_br_stub_mt_private_t = gf_common_mt_end + 1,
- gf_br_stub_mt_version_t = gf_common_mt_end + 2,
- gf_br_stub_mt_inode_ctx_t = gf_common_mt_end + 3,
- gf_br_stub_mt_signature_t = gf_common_mt_end + 4,
+ gf_br_stub_mt_version_t,
+ gf_br_stub_mt_inode_ctx_t,
+ gf_br_stub_mt_signature_t,
+ gf_br_mt_br_private_t,
+ gf_br_mt_br_child_t,
+ gf_br_mt_br_object_t,
+ gf_br_mt_br_ob_n_wk_t,
gf_br_stub_mt_end
};
diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am
index 306306bd585..456e211b89d 100644
--- a/xlators/features/changelog/lib/src/Makefile.am
+++ b/xlators/features/changelog/lib/src/Makefile.am
@@ -32,8 +32,6 @@ noinst_HEADERS = gf-changelog-helpers.h gf-changelog-rpc.h gf-changelog-journal.
$(CONTRIBDIR)/uuid/uuidd.h $(CONTRIBDIR)/uuid/uuid.h \
$(CONTRIBDIR)/uuid/uuidP.h $(CONTRIB_BUILDDIR)/uuid/uuid_types.h
-libgfchangelog_HEADERS = changelog.h
-
CLEANFILES =
CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h