author    Krutika Dhananjay <kdhananj@redhat.com>    2014-11-21 11:47:23 +0530
committer Vijay Bellur <vbellur@redhat.com>          2015-03-19 10:59:31 -0700
commit    6f389fbb812b384bdd9bf4a20e86930505531996
tree      7304eeab269d1a0226274cf439fb4f6c49bfcec9
parent    32ed7aa5ad7049a9d85c795f997336c0366151a8
features/shard: Introducing sharding translator
Based on the high-level design by Anand V. Avati which can be found at
https://gist.github.com/avati/af04f1030dcf52e16535#sharding-xlator-stripe-20

Still to-do:
* complete implementation of inode write fops - [f]truncate, zerofill,
  fallocate, discard
* introduce transaction mechanism in inode write fops
* complete readv
* Handle open with O_TRUNC
* Handle unlinking of all shards during unlink/rename
* Compute total ia_size and ia_blocks in lookup, readdirp, etc
* wind fsync/flush on all shards

Note: Most of the items above are related. Once we come up with a clean way
to determine the last shard/shard count for a file/file size and the mgmt of
sparse regions of the file, implementing them becomes trivial.

Change-Id: Id871379b53a4a916e4baa2e06f197dd8c0043b0f
BUG: 1200082
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/9841
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
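(For reference: once merged, the feature would presumably be enabled per
volume through the volume-set options this patch adds, along the lines of:

    gluster volume set <volname> features.shard on
    gluster volume set <volname> features.shard-block-size 64MB

The exact values are illustrative; the default block size is 4MB and the
minimum is 128KB.)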
-rw-r--r--   configure.ac                                     |    2
-rw-r--r--   xlators/features/Makefile.am                     |    2
-rw-r--r--   xlators/features/shard/Makefile.am               |    3
-rw-r--r--   xlators/features/shard/src/Makefile.am           |   16
-rw-r--r--   xlators/features/shard/src/shard-mem-types.h     |   22
-rw-r--r--   xlators/features/shard/src/shard.c               | 1791
-rw-r--r--   xlators/features/shard/src/shard.h               |  118
-rw-r--r--   xlators/mgmt/glusterd/src/glusterd-volgen.c      |   13
-rw-r--r--   xlators/mgmt/glusterd/src/glusterd-volume-set.c  |   11
9 files changed, 1976 insertions, 2 deletions
diff --git a/configure.ac b/configure.ac
index 16d32db7958..81faaf46549 100644
--- a/configure.ac
+++ b/configure.ac
@@ -160,6 +160,8 @@ AC_CONFIG_FILES([Makefile
xlators/features/snapview-client/src/Makefile
xlators/features/upcall/Makefile
xlators/features/upcall/src/Makefile
+ xlators/features/shard/Makefile
+ xlators/features/shard/src/Makefile
xlators/playground/Makefile
xlators/playground/template/Makefile
xlators/playground/template/src/Makefile
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index 5f35b37bc8e..8dc7051a303 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -1,5 +1,5 @@
SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier arbiter\
protect compress changelog changetimerecorder ganesha gfid-access $(GLUPY_SUBDIR) qemu-block \
- upcall snapview-client snapview-server trash #path-converter # filter
+ upcall snapview-client snapview-server trash shard #path-converter # filter
CLEANFILES =
diff --git a/xlators/features/shard/Makefile.am b/xlators/features/shard/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/features/shard/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/shard/src/Makefile.am b/xlators/features/shard/src/Makefile.am
new file mode 100644
index 00000000000..b589cafe924
--- /dev/null
+++ b/xlators/features/shard/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = shard.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+shard_la_LDFLAGS = -module -avoid-version
+
+shard_la_SOURCES = shard.c
+
+shard_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = shard.h shard-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/shard/src/shard-mem-types.h b/xlators/features/shard/src/shard-mem-types.h
new file mode 100644
index 00000000000..4bdcbba787c
--- /dev/null
+++ b/xlators/features/shard/src/shard-mem-types.h
@@ -0,0 +1,22 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __SHARD_MEM_TYPES_H__
+#define __SHARD_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_shard_mem_types_ {
+ gf_shard_mt_priv_t = gf_common_mt_end + 1,
+ gf_shard_mt_inode_list,
+ gf_shard_mt_inode_ctx_t,
+ gf_shard_mt_iovec,
+ gf_shard_mt_end
+};
+#endif
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
new file mode 100644
index 00000000000..fd003776f4e
--- /dev/null
+++ b/xlators/features/shard/src/shard.c
@@ -0,0 +1,1791 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+
+#include "shard.h"
+#include "shard-mem-types.h"
+#include "byte-order.h"
+#include "defaults.h"
+
+static gf_boolean_t
+__is_shard_dir (uuid_t gfid)
+{
+ shard_priv_t *priv = THIS->private;
+
+ if (uuid_compare (gfid, priv->dot_shard_gfid) == 0)
+ return _gf_true;
+
+ return _gf_false;
+}
+
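+/* Shards of a file are named "<gfid-of-the-file>.<block-number>" and are
+ * stored under the hidden /.shard directory; the two helpers below build
+ * the basename and the absolute path for a given block number. */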
+void
+shard_make_block_bname (int block_num, uuid_t gfid, char *buf, size_t len)
+{
+ char gfid_str[GF_UUID_BUF_SIZE] = {0,};
+
+ uuid_unparse (gfid, gfid_str);
+ snprintf (buf, len, "%s.%d", gfid_str, block_num);
+}
+
+void
+shard_make_block_abspath (int block_num, uuid_t gfid, char *filepath,
+ size_t len)
+{
+ char gfid_str[GF_UUID_BUF_SIZE] = {0,};
+
+ uuid_unparse (gfid, gfid_str);
+ snprintf (filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str,
+ block_num);
+}
+
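+/* Get-or-create the shard context on @inode. The __ prefix follows the
+ * convention that the caller already holds inode->lock. */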
+int
+__shard_inode_ctx_get (inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx)
+{
+ int ret = -1;
+ uint64_t ctx_uint = 0;
+ shard_inode_ctx_t *ctx_p = NULL;
+
+ ret = __inode_ctx_get (inode, this, &ctx_uint);
+ if (ret == 0) {
+ *ctx = (shard_inode_ctx_t *) ctx_uint;
+ return ret;
+ }
+
+ ctx_p = GF_CALLOC (1, sizeof (*ctx_p), gf_shard_mt_inode_ctx_t);
+ if (!ctx_p)
+ return ret;
+
+ ret = __inode_ctx_set (inode, this, (uint64_t *)&ctx_p);
+ if (ret < 0) {
+ GF_FREE (ctx_p);
+ return ret;
+ }
+
+ *ctx = ctx_p;
+
+ return ret;
+}
+
+
+int
+__shard_inode_ctx_set (inode_t *inode, xlator_t *this,
+ shard_inode_ctx_t *ctx_in)
+{
+ int ret = -1;
+ shard_inode_ctx_t *ctx = NULL;
+
+ ret = __shard_inode_ctx_get (inode, this, &ctx);
+ if (ret)
+ return ret;
+
+ ctx->block_size = ctx_in->block_size;
+ ctx->mode = ctx_in->mode;
+ ctx->rdev = ctx_in->rdev;
+
+ return 0;
+}
+
+int
+shard_inode_ctx_set_all (inode_t *inode, xlator_t *this,
+ shard_inode_ctx_t *ctx_in)
+{
+ int ret = -1;
+
+ LOCK (&inode->lock);
+ {
+ ret = __shard_inode_ctx_set (inode, this, ctx_in);
+ }
+ UNLOCK (&inode->lock);
+
+ return ret;
+
+}
+
+int
+__shard_inode_ctx_get_block_size (inode_t *inode, xlator_t *this,
+ uint64_t *block_size)
+{
+ int ret = -1;
+ uint64_t ctx_uint = 0;
+ shard_inode_ctx_t *ctx = NULL;
+
+ ret = __inode_ctx_get (inode, this, &ctx_uint);
+ if (ret < 0)
+ return ret;
+
+ ctx = (shard_inode_ctx_t *) ctx_uint;
+
+ *block_size = ctx->block_size;
+
+ return 0;
+}
+
+int
+shard_inode_ctx_get_block_size (inode_t *inode, xlator_t *this,
+ uint64_t *block_size)
+{
+ int ret = -1;
+
+ LOCK (&inode->lock);
+ {
+ ret = __shard_inode_ctx_get_block_size (inode, this,
+ block_size);
+ }
+ UNLOCK (&inode->lock);
+
+ return ret;
+}
+
+int
+__shard_inode_ctx_get_all (inode_t *inode, xlator_t *this,
+ shard_inode_ctx_t *ctx_out)
+{
+ int ret = -1;
+ uint64_t ctx_uint = 0;
+ shard_inode_ctx_t *ctx = NULL;
+
+ ret = __inode_ctx_get (inode, this, &ctx_uint);
+ if (ret < 0)
+ return ret;
+
+ ctx = (shard_inode_ctx_t *) ctx_uint;
+
+ ctx_out->block_size = ctx->block_size;
+ ctx_out->mode = ctx->mode;
+ ctx_out->rdev = ctx->rdev;
+
+ return 0;
+}
+
+int
+shard_inode_ctx_get_all (inode_t *inode, xlator_t *this,
+ shard_inode_ctx_t *ctx_out)
+{
+ int ret = -1;
+
+ LOCK (&inode->lock);
+ {
+ ret = __shard_inode_ctx_get_all (inode, this, ctx_out);
+ }
+ UNLOCK (&inode->lock);
+
+ return ret;
+}
+
+void
+shard_local_wipe (shard_local_t *local)
+{
+ int i = 0;
+ int count = 0;
+
+ count = local->num_blocks;
+
+ loc_wipe (&local->loc);
+ loc_wipe (&local->dot_shard_loc);
+
+ if (local->fd)
+ fd_unref (local->fd);
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+
+ for (i = 0; i < count; i++) {
+ if (local->inode_list[i])
+ inode_unref (local->inode_list[i]);
+ }
+
+ GF_FREE (local->inode_list);
+
+ GF_FREE (local->vector);
+ if (local->iobref)
+ iobref_unref (local->iobref);
+}
+
+int
+shard_call_count_return (call_frame_t *frame)
+{
+ int call_count = 0;
+ shard_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK (&frame->lock);
+
+ return call_count;
+}
+
+int
+shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ int ret = 0;
+ uint64_t size = 0;
+ shard_inode_ctx_t ctx_tmp = {0,};
+
+ if (op_ret < 0)
+ goto unwind;
+
+ if ((op_ret == 0) && (!IA_ISDIR (buf->ia_type))) {
+ ret = dict_get_uint64 (xdata, GF_XATTR_SHARD_BLOCK_SIZE, &size);
+ if (!ret && size) {
+ ctx_tmp.block_size = ntoh64 (size);
+ ctx_tmp.mode = st_mode_from_ia (buf->ia_prot,
+ buf->ia_type);
+ ctx_tmp.rdev = buf->ia_rdev;
+ ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "Failed to "
+ "set inode ctx for %s",
+ uuid_utoa (inode->gfid));
+ }
+ }
+
+ /* To-Do: return the call with aggregated values of ia_size and
+ * ia_blocks
+ */
+
+unwind:
+ SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
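+/* If the block size is not yet cached in the inode ctx, request the shard
+ * block-size xattr in lookup so the cbk above can cache it. */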
+int
+shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xattr_req)
+{
+ int ret = -1;
+ int32_t op_errno = ENOMEM;
+ uint64_t block_size = 0;
+ shard_local_t *local = NULL;
+
+ SHARD_ENTRY_FOP_CHECK (loc, op_errno, err);
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto err;
+
+ frame->local = local;
+
+ loc_copy (&local->loc, loc);
+
+ local->xattr_req = xattr_req ? dict_ref (xattr_req) : dict_new ();
+ if (!local->xattr_req)
+ goto err;
+
+ if ((shard_inode_ctx_get_block_size (loc->inode, this, &block_size) ||
+ !block_size)) {
+ ret = dict_set_uint64 (local->xattr_req,
+ GF_XATTR_SHARD_BLOCK_SIZE, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to set dict"
+ " value: key:%s for path %s",
+ GF_XATTR_SHARD_BLOCK_SIZE, loc->path);
+ goto err;
+ }
+ }
+
+ STACK_WIND (frame, shard_lookup_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup, loc, local->xattr_req);
+
+ return 0;
+
+
+err:
+ SHARD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL,
+ NULL, NULL);
+ return 0;
+
+}
+
+int
+shard_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ /* To-Do: Update ia_size and ia_blocks in @buf before presenting it
+ * to the parent.
+ */
+
+ SHARD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+shard_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ STACK_WIND (frame, shard_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+
+ return 0;
+}
+
+int
+shard_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ /* To-Do: Update ia_size and ia_blocks in @buf before presenting it
+ * to the parent.
+ */
+ SHARD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+shard_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ STACK_WIND (frame, shard_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ return 0;
+}
+
+int
+shard_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ /* TBD - once the ability to determine the size/number of shards for a
+ * file is in place, this can be implemented.
+ */
+ SHARD_STACK_UNWIND (truncate, frame, -1, ENOTCONN, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+shard_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ /* TBD - once the ability to determine the size/number of shards for a
+ * file is in place, this can be implemented.
+ */
+ SHARD_STACK_UNWIND (ftruncate, frame, -1, ENOTCONN, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+shard_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ int ret = -1;
+ shard_local_t *local = NULL;
+ shard_inode_ctx_t ctx_tmp = {0,};
+
+ local = frame->local;
+
+ if (op_ret == -1)
+ goto unwind;
+
+ ctx_tmp.block_size = ntoh64 (local->block_size);
+ ctx_tmp.mode = st_mode_from_ia (buf->ia_prot, buf->ia_type);
+ ctx_tmp.rdev = buf->ia_rdev;
+ ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "Failed to set inode ctx "
+ "for %s", uuid_utoa (inode->gfid));
+
+unwind:
+ SHARD_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+
+ return 0;
+}
+
+int
+shard_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ int32_t op_errno = ENOMEM;
+ shard_local_t *local = NULL;
+ shard_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto err;
+
+ frame->local = local;
+
+ local->block_size = hton64 (priv->block_size);
+
+ ret = dict_set_static_bin (xdata, GF_XATTR_SHARD_BLOCK_SIZE,
+ &local->block_size, sizeof (uint64_t));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to set key: %s "
+ "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, loc->path);
+ goto err;
+ }
+
+ STACK_WIND (frame, shard_mknod_cbk, FIRST_CHILD (this),
+ FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask,
+ xdata);
+ return 0;
+
+
+err:
+ SHARD_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
+
+}
+
+int
+shard_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ /* To-Do: Unlink all the shards associated with the file */
+
+ SHARD_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent,
+ postparent, xdata);
+
+ return 0;
+}
+
+int
+shard_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
+{
+ int op_errno = ENOMEM;
+ shard_local_t *local = NULL;
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto err;
+
+ frame->local = local;
+
+ loc_copy (&local->loc, loc);
+ local->xflag = xflag;
+
+ if (xdata)
+ local->xattr_req = dict_ref (xdata);
+
+ STACK_WIND (frame, shard_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+
+ return 0;
+err:
+ SHARD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+
+}
+
+
+int
+shard_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ /* To-Do: When both src and dst names exist, and src is renamed to
+ * dst, all the shards associated with the dst file must be unlinked.
+ */
+ SHARD_STACK_UNWIND (rename, frame, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
+ return 0;
+}
+
+int
+shard_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ int32_t op_errno = ENOMEM;
+ shard_local_t *local = NULL;
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto err;
+
+ frame->local = local;
+
+ STACK_WIND (frame, shard_rename_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ return 0;
+
+err:
+ SHARD_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
+
+}
+
+
+int
+shard_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ int ret = -1;
+ shard_local_t *local = NULL;
+ shard_inode_ctx_t ctx_tmp = {0,};
+
+ local = frame->local;
+
+ if (op_ret == -1)
+ goto unwind;
+
+ ctx_tmp.block_size = ntoh64 (local->block_size);
+ ctx_tmp.mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type);
+ ctx_tmp.rdev = stbuf->ia_rdev;
+ ret = shard_inode_ctx_set_all (inode, this, &ctx_tmp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to set block size "
+ "for %s in inode ctx", uuid_utoa (inode->gfid));
+ goto unwind;
+ }
+
+unwind:
+ SHARD_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int
+shard_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ int ret = -1;
+ int32_t op_errno = ENOMEM;
+ shard_local_t *local = NULL;
+ shard_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto err;
+
+ frame->local = local;
+
+ local->block_size = hton64 (priv->block_size);
+
+ ret = dict_set_static_bin (xdata, GF_XATTR_SHARD_BLOCK_SIZE,
+ &local->block_size, sizeof (uint64_t));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to set key: %s "
+ "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, loc->path);
+ goto err;
+ }
+
+ STACK_WIND (frame, shard_create_cbk, FIRST_CHILD (this),
+ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask,
+ fd, xdata);
+ return 0;
+
+err:
+ SHARD_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
+
+}
+
+int
+shard_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ /* To-Do: Handle open with O_TRUNC under locks */
+ SHARD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+int
+shard_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ STACK_WIND (frame, shard_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+}
+
+/* - Incomplete - TBD */
+int
+shard_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+/*
+ int i = 0;
+ int32_t op_errno = ENOMEM;
+ uint64_t block_size = 0;
+ int highest_block = 0;
+ int num_blocks = 0;
+ int cur_block = 0;
+ char shard_abspath[PATH_MAX] = {0,};
+ off_t cur_offset = 0;
+ size_t total_size = 0;
+ fd_t *cur_fd = NULL;
+ inode_t *inode = NULL;
+ shard_local_t *local = NULL;
+
+ ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get inode ctx for "
+ "%s", uuid_utoa(fd->inode->gfid));
+ goto out;
+ }
+
+ if (!block_size) {
+ STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset,
+ flags, xdata);
+ return 0;
+ }
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto err;
+
+ frame->local = local;
+
+ local->block_size = block_size;
+ local->offset = offset;
+ local->len = size;
+ local->first_block = get_lowest_block (offset, block_size);
+ highest_block = get_highest_block (offset, size, block_size);
+ num_blocks = local->num_blocks = highest_block - local->first_block + 1;
+
+ while (num_blocks--) {
+ cur_fd = (local->first_block == 0) ? fd_ref (fd) :
+ fd_anonymous (inode);
+ cur_offset = (cur_block == local->first_block) ?
+ get_shard_offset(offset, block_size):0;
+ STACK_WIND_COOKIE (frame, shard_readv_cbk, cur_fd,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, cur_fd,
+ cur_size, cur_offset, flags, xdata);
+ }
+
+ return 0;
+
+err:
+ SHARD_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL,
+ NULL);
+*/
+ SHARD_STACK_UNWIND (readv, frame, -1, ENOTCONN, NULL, 0, NULL, NULL,
+ NULL);
+ return 0;
+
+}
+
+int
+shard_writev_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int call_count = 0;
+ fd_t *anon_fd = cookie;
+ shard_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+
+ if (anon_fd)
+ fd_unref (anon_fd);
+
+ call_count = shard_call_count_return (frame);
+ if (call_count == 0)
+ SHARD_STACK_UNWIND (writev, frame, local->op_ret,
+ local->op_errno, prebuf, postbuf, xdata);
+
+ return 0;
+}
+
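+/* Fan the write out over the participating shards: block 0 is written
+ * through the original fd, every other shard through an anonymous fd, with
+ * the iovec split per shard via iov_subset(). */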
+int
+shard_writev_do (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int count = 0;
+ int call_count = 0;
+ int last_block = 0;
+ uint32_t cur_block = 0;
+ fd_t *fd = NULL;
+ fd_t *anon_fd = NULL;
+ shard_local_t *local = NULL;
+ struct iovec *vec = NULL;
+ gf_boolean_t wind_failed = _gf_false;
+ off_t orig_offset = 0;
+ off_t shard_offset = 0;
+ off_t vec_offset = 0;
+ size_t remaining_size = 0;
+ size_t write_size = 0;
+
+ local = frame->local;
+ fd = local->fd;
+
+ orig_offset = local->offset;
+ remaining_size = local->total_size;
+ cur_block = local->first_block;
+ local->call_count = call_count = local->num_blocks;
+ last_block = local->last_block;
+
+ dict_del (local->xattr_req, "gfid-req");
+
+ while (cur_block <= last_block) {
+ if (wind_failed) {
+ shard_writev_do_cbk (frame, (void *) (long) 0, this, -1,
+ ENOMEM, NULL, NULL, NULL);
+ goto next;
+ }
+
+ shard_offset = orig_offset % local->block_size;
+ write_size = local->block_size - shard_offset;
+ if (write_size > remaining_size)
+ write_size = remaining_size;
+
+ remaining_size -= write_size;
+
+ count = iov_subset (local->vector, local->count, vec_offset,
+ vec_offset + write_size, NULL);
+
+ vec = GF_CALLOC (count, sizeof (struct iovec),
+ gf_shard_mt_iovec);
+ if (!vec) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ wind_failed = _gf_true;
+ GF_FREE (vec);
+ shard_writev_do_cbk (frame, (void *) (long) 0, this, -1,
+ ENOMEM, NULL, NULL, NULL);
+ goto next;
+ }
+
+ count = iov_subset (local->vector, local->count, vec_offset,
+ vec_offset + write_size, vec);
+
+ if (cur_block == 0) {
+ anon_fd = fd_ref (fd);
+ } else {
+ anon_fd = fd_anonymous (local->inode_list[i]);
+ if (!anon_fd) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ wind_failed = _gf_true;
+ GF_FREE (vec);
+ shard_writev_do_cbk (frame,
+ (void *) (long) anon_fd,
+ this, -1, ENOMEM, NULL,
+ NULL, NULL);
+ goto next;
+ }
+ }
+
+ STACK_WIND_COOKIE (frame, shard_writev_do_cbk, anon_fd,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, anon_fd,
+ vec, count, shard_offset, local->flags,
+ local->iobref, local->xattr_req);
+ GF_FREE (vec);
+ vec = NULL;
+ orig_offset += write_size;
+ vec_offset += write_size;
+next:
+ cur_block++;
+ i++;
+ call_count--;
+ }
+ return 0;
+}
+
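+/* Link a looked-up/created shard inode under /.shard in the inode table and
+ * stash it in local->inode_list for the write phase. */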
+void
+shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
+ struct iatt *buf)
+{
+ char block_bname[256] = {0,};
+ inode_t *linked_inode = NULL;
+ shard_priv_t *priv = NULL;
+
+ priv = THIS->private;
+
+ shard_make_block_bname (block_num, local->fd->inode->gfid, block_bname,
+ sizeof (block_bname));
+
+ linked_inode = inode_link (inode, priv->dot_shard_inode, block_bname,
+ buf);
+ inode_lookup (linked_inode);
+ local->inode_list[block_num - local->first_block] = linked_inode;
+ /* Defer unref'ing the inodes until write is complete to prevent
+ * them from getting purged. These inodes are unref'd in the event of
+ * a failure or after successful fop completion in shard_local_wipe().
+ */
+}
+
+int
+shard_writev_lookup_shards_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ int call_count = 0;
+ int shard_block_num = (long) cookie;
+ shard_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ goto done;
+ }
+
+ shard_link_block_inode (local, shard_block_num, inode, buf);
+
+done:
+ call_count = shard_call_count_return (frame);
+ if (call_count == 0) {
+ if (local->op_ret < 0)
+ goto unwind;
+ else
+ shard_writev_do (frame, this);
+ }
+ return 0;
+
+unwind:
+ SHARD_STACK_UNWIND (writev, frame, local->op_ret, local->op_errno, NULL,
+ NULL, NULL);
+ return 0;
+}
+
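+/* Make a copy of @dict carrying a freshly generated "gfid-req" so that each
+ * newly created shard gets its own GFID. */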
+dict_t*
+shard_create_gfid_dict (dict_t *dict)
+{
+ int ret = 0;
+ dict_t *new = NULL;
+ uuid_t *gfid = NULL;
+
+ new = dict_copy_with_ref (dict, NULL);
+ if (!new)
+ return NULL;
+
+ gfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
+ if (!gfid) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_generate (*gfid);
+
+ ret = dict_set_dynptr (new, "gfid-req", gfid, sizeof (uuid_t));
+
+out:
+ if (ret) {
+ dict_unref (new);
+ new = NULL;
+ GF_FREE (gfid);
+ }
+
+ return new;
+}
+
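+/* Resolve shards that are not yet in memory (e.g. those whose mknod returned
+ * EEXIST) by looking them up under /.shard before writing. */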
+int
+shard_writev_lookup_shards (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ int32_t shard_idx_iter = 0;
+ int last_block = 0;
+ char path[PATH_MAX] = {0,};
+ char *bname = NULL;
+ loc_t loc = {0,};
+ fd_t *fd = NULL;
+ shard_local_t *local = NULL;
+ shard_priv_t *priv = NULL;
+ gf_boolean_t wind_failed = _gf_false;
+ dict_t *xattr_req = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ fd = local->fd;
+ call_count = local->call_count;
+ shard_idx_iter = local->first_block;
+ last_block = local->last_block;
+
+ while (shard_idx_iter <= last_block) {
+ if (local->inode_list[i]) {
+ i++;
+ shard_idx_iter++;
+ continue;
+ }
+
+ if (wind_failed) {
+ shard_writev_lookup_shards_cbk (frame,
+ (void *) (long) shard_idx_iter,
+ this, -1, ENOMEM, NULL,
+ NULL, NULL, NULL);
+ goto next;
+ }
+
+ shard_make_block_abspath (shard_idx_iter, fd->inode->gfid,
+ path, sizeof(path));
+
+ bname = strrchr (path, '/') + 1;
+ loc.inode = inode_new (priv->inode_table);
+ loc.parent = inode_ref (priv->dot_shard_inode);
+ ret = inode_path (loc.parent, bname, (char **) &(loc.path));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Inode path failed on"
+ " %s", bname);
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ loc_wipe (&loc);
+ shard_writev_lookup_shards_cbk (frame,
+ (void *) (long) shard_idx_iter,
+ this, -1, ENOMEM, NULL,
+ NULL, NULL, NULL);
+ goto next;
+ }
+
+ loc.name = strrchr (loc.path, '/');
+ if (loc.name)
+ loc.name++;
+
+ xattr_req = shard_create_gfid_dict (local->xattr_req);
+ if (!xattr_req) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ wind_failed = _gf_true;
+ loc_wipe (&loc);
+ shard_writev_lookup_shards_cbk (frame,
+ (void *) (long) shard_idx_iter,
+ this, -1, ENOMEM, NULL,
+ NULL, NULL, NULL);
+ goto next;
+ }
+
+ STACK_WIND_COOKIE (frame, shard_writev_lookup_shards_cbk,
+ (void *) (long) shard_idx_iter,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, &loc,
+ xattr_req);
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
+next:
+ shard_idx_iter++;
+ i++;
+
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+}
+
+int
+shard_writev_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ int shard_block_num = (long) cookie;
+ int call_count = 0;
+ shard_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ if (op_errno == EEXIST) {
+ local->eexist_count++;
+ } else {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "SHARD WRITEV: mknod of "
+ "shard %d failed: %s", shard_block_num,
+ strerror (op_errno));
+ goto done;
+ }
+
+ shard_link_block_inode (local, shard_block_num, inode, buf);
+
+done:
+ call_count = shard_call_count_return (frame);
+ if (call_count == 0) {
+ if (local->op_ret < 0) {
+ goto unwind;
+ } else {
+ if (!local->eexist_count) {
+ shard_writev_do (frame, this);
+ } else {
+ local->call_count = local->eexist_count;
+ shard_writev_lookup_shards (frame, this);
+ }
+ }
+ }
+ return 0;
+
+unwind:
+ SHARD_STACK_UNWIND (writev, frame, local->op_ret, local->op_errno, NULL,
+ NULL, NULL);
+ return 0;
+}
+
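+/* Create the missing shards under /.shard with mknod, reusing the mode and
+ * rdev cached in the inode ctx of the main file. */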
+int
+shard_writev_resume_mknod (call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int shard_idx_iter = 0;
+ int last_block = 0;
+ int ret = 0;
+ int call_count = 0;
+ char path[PATH_MAX] = {0,};
+ char *bname = NULL;
+ shard_priv_t *priv = NULL;
+ shard_inode_ctx_t ctx_tmp = {0,};
+ shard_local_t *local = NULL;
+ gf_boolean_t wind_failed = _gf_false;
+ fd_t *fd = NULL;
+ loc_t loc = {0,};
+ dict_t *xattr_req = NULL;
+
+ local = frame->local;
+ priv = this->private;
+ fd = local->fd;
+ shard_idx_iter = local->first_block;
+ last_block = local->last_block;
+ call_count = local->call_count;
+
+ ret = shard_inode_ctx_get_all (fd->inode, this, &ctx_tmp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get inode ctx for"
+ " %s", uuid_utoa (fd->inode->gfid));
+ goto err;
+ }
+
+ while (shard_idx_iter <= last_block) {
+ if (local->inode_list[i]) {
+ shard_idx_iter++;
+ i++;
+ continue;
+ }
+
+ if (wind_failed) {
+ shard_writev_mknod_cbk (frame,
+ (void *) (long) shard_idx_iter,
+ this, -1, ENOMEM, NULL, NULL,
+ NULL, NULL, NULL);
+ goto next;
+ }
+
+ shard_make_block_abspath (shard_idx_iter, fd->inode->gfid,
+ path, sizeof(path));
+
+ xattr_req = shard_create_gfid_dict (local->xattr_req);
+ if (!xattr_req) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ wind_failed = _gf_true;
+ shard_writev_mknod_cbk (frame,
+ (void *) (long) shard_idx_iter,
+ this, -1, ENOMEM, NULL, NULL,
+ NULL, NULL, NULL);
+ goto next;
+ }
+
+ bname = strrchr (path, '/') + 1;
+ loc.inode = inode_new (priv->inode_table);
+ loc.parent = inode_ref (priv->dot_shard_inode);
+ ret = inode_path (loc.parent, bname,
+ (char **) &(loc.path));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Inode path failed on"
+ " %s", bname);
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ wind_failed = _gf_true;
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
+ shard_writev_mknod_cbk (frame,
+ (void *) (long) shard_idx_iter,
+ this, -1, ENOMEM, NULL, NULL,
+ NULL, NULL, NULL);
+ goto next;
+ }
+
+ loc.name = strrchr (loc.path, '/');
+ if (loc.name)
+ loc.name++;
+
+ STACK_WIND_COOKIE (frame, shard_writev_mknod_cbk,
+ (void *) (long) shard_idx_iter,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mknod, &loc,
+ ctx_tmp.mode, ctx_tmp.rdev, 0, xattr_req);
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
+
+next:
+ shard_idx_iter++;
+ i++;
+ if (!--call_count)
+ break;
+ }
+
+ return 0;
+err:
+ /*
+ * This block is for handling failure in shard_inode_ctx_get_all().
+ * Failures in the while-loop are handled within the loop.
+ */
+ SHARD_STACK_UNWIND (writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
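+/* Walk the blocks touched by this write: block 0 is the file itself,
+ * already-known shards are picked out of the inode table, and the remaining
+ * ones are left for shard_writev_resume_mknod() to create. */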
+int
+shard_writev_create_write_shards (call_frame_t *frame, xlator_t *this)
+{
+ int i = -1;
+ uint32_t shard_idx_iter = 0;
+ char path[PATH_MAX] = {0,};
+ fd_t *fd = NULL;
+ inode_t *inode = NULL;
+ shard_local_t *local = NULL;
+ shard_priv_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ fd = local->fd;
+ shard_idx_iter = local->first_block;
+
+ while (shard_idx_iter <= local->last_block) {
+ i++;
+ if (shard_idx_iter == 0) {
+ local->inode_list[i] = inode_ref (fd->inode);
+ shard_idx_iter++;
+ continue;
+ }
+
+ shard_make_block_abspath (shard_idx_iter, fd->inode->gfid,
+ path, sizeof(path));
+
+ inode = NULL;
+ inode = inode_resolve (priv->inode_table, path);
+ if (inode) {
+ gf_log (this->name, GF_LOG_DEBUG, "Shard %d already "
+ "present. gfid=%s. Saving inode for future.",
+ shard_idx_iter, uuid_utoa(inode->gfid));
+ shard_idx_iter++;
+ local->inode_list[i] = inode;
+ /* Let the ref on the inodes that are already present
+ * in inode table still be held so that they don't get
+ * forgotten by the time the fop reaches the actual
+ * write stage.
+ */
+ continue;
+ }
+ local->call_count++;
+ }
+
+ if (local->call_count)
+ shard_writev_resume_mknod (frame, this);
+ else
+ shard_writev_do (frame, this);
+
+ return 0;
+}
+
+static void
+shard_link_dot_shard_inode (shard_local_t *local, inode_t *inode,
+ struct iatt *buf)
+{
+ inode_t *linked_inode = NULL;
+ shard_priv_t *priv = NULL;
+
+ priv = THIS->private;
+
+ linked_inode = inode_link (inode, local->dot_shard_loc.parent,
+ local->dot_shard_loc.name, buf);
+ inode_lookup (linked_inode);
+ priv->dot_shard_inode = linked_inode;
+}
+
+int
+shard_lookup_dot_shard_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ shard_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret)
+ goto unwind;
+
+ shard_link_dot_shard_inode (local, inode, buf);
+ shard_writev_create_write_shards (frame, this);
+ return 0;
+
+unwind:
+ SHARD_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+shard_lookup_dot_shard (call_frame_t *frame, xlator_t *this)
+{
+ shard_local_t *local = NULL;
+
+ local = frame->local;
+
+ STACK_WIND (frame, shard_lookup_dot_shard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, &local->dot_shard_loc,
+ local->xattr_req);
+
+ return 0;
+}
+
+int
+shard_writev_mkdir_dot_shard_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ shard_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ if (op_errno != EEXIST) {
+ goto unwind;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG, "mkdir on /.shard "
+ "failed with EEXIST. Attempting lookup now");
+ shard_lookup_dot_shard (frame, this);
+ return 0;
+ }
+ }
+
+ shard_link_dot_shard_inode (local, inode, buf);
+ shard_writev_create_write_shards (frame, this);
+ return 0;
+
+unwind:
+ SHARD_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
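+/* The /.shard directory is created lazily on the first sharded write, with
+ * a fixed gfid (SHARD_ROOT_GFID) requested via "gfid-req". */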
+int
+shard_writev_mkdir_dot_shard (call_frame_t *frame, xlator_t *this)
+{
+ int ret = -1;
+ shard_local_t *local = NULL;
+ shard_priv_t *priv = NULL;
+ loc_t *dot_shard_loc = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ dot_shard_loc = &local->dot_shard_loc;
+
+ dot_shard_loc->inode = inode_new (priv->inode_table);
+ dot_shard_loc->parent = inode_ref (priv->inode_table->root);
+ ret = inode_path (dot_shard_loc->parent, GF_SHARD_DIR,
+ (char **)&dot_shard_loc->path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Inode path failed on"
+ " %s", GF_SHARD_DIR);
+ goto err;
+ }
+
+ dot_shard_loc->name = strrchr (dot_shard_loc->path, '/');
+ if (dot_shard_loc->name)
+ dot_shard_loc->name++;
+
+ ret = dict_set_static_bin (local->xattr_req, "gfid-req",
+ priv->dot_shard_gfid, 16);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set gfid-req for "
+ "/.shard");
+ goto err;
+ }
+
+ STACK_WIND (frame, shard_writev_mkdir_dot_shard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, &local->dot_shard_loc,
+ 0755, 0, local->xattr_req);
+ return 0;
+
+err:
+ SHARD_STACK_UNWIND (writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+/* shard_writev - still a WIP */
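+/* Write path so far: ensure /.shard exists (mkdir or lookup), resolve or
+ * create the shards spanned by [offset, offset+size), then wind the writes
+ * in shard_writev_do(). Writes confined to block 0 pass straight through. */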
+int
+shard_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ int i = 0;
+ uint64_t block_size = 0;
+ uint32_t first_block = 0;
+ uint32_t last_block = 0;
+ uint32_t num_blocks = 0;
+ size_t total_size = 0;
+ shard_local_t *local = NULL;
+ shard_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ if (shard_inode_ctx_get_block_size (fd->inode, this, &block_size)) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get inode ctx for "
+ "%s", uuid_utoa(fd->inode->gfid));
+ goto out;
+ }
+
+ for (i = 0; i < count; i++)
+ total_size += vector[i].iov_len;
+
+ first_block = get_lowest_block (offset, block_size);
+ last_block = get_highest_block (offset, total_size, block_size);
+ num_blocks = last_block - first_block + 1;
+
+ gf_log (this->name, GF_LOG_TRACE, "gfid=%s first_block=%"PRIu32" "
+ "last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64" "
+ "total_size=%lu", uuid_utoa (fd->inode->gfid), first_block,
+ last_block, num_blocks, offset, total_size);
+
+ if (!block_size ||
+ ((first_block == 0) && (first_block == last_block))) {
+ /* To-Do: Replace default_writev_cbk with a specific cbk
+ * that would collect total size and block count before unwind
+ */
+ STACK_WIND (frame, default_writev_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+ fd, vector, count, offset, flags, iobref, xdata);
+ return 0;
+ }
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto out;
+
+ frame->local = local;
+
+ local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new ();
+ if (!local->xattr_req)
+ goto out;
+
+ local->vector = iov_dup (vector, count);
+ if (!local->vector)
+ goto out;
+
+ local->count = count;
+ local->offset = offset;
+ local->flags = flags;
+ local->iobref = iobref_ref (iobref);
+ local->fd = fd_ref (fd);
+ local->first_block = first_block;
+ local->last_block = last_block;
+ local->total_size = total_size;
+ local->block_size = block_size;
+ local->num_blocks = num_blocks;
+ local->inode_list = GF_CALLOC (num_blocks, sizeof (inode_t *),
+ gf_shard_mt_inode_list);
+ if (!local->inode_list)
+ goto out;
+
+ local->dot_shard_loc.inode = inode_find (priv->inode_table,
+ priv->dot_shard_gfid);
+ if (!local->dot_shard_loc.inode)
+ shard_writev_mkdir_dot_shard (frame, this);
+ else
+ shard_writev_create_write_shards (frame, this);
+
+ return 0;
+out:
+ SHARD_STACK_UNWIND (writev, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+shard_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ /* To-Do: Wind flush on all shards of the file */
+ SHARD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int
+shard_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ STACK_WIND (frame, shard_flush_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ return 0;
+}
+
+int
+shard_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ /* To-Do: Wind fsync on all shards of the file */
+ SHARD_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
+shard_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ STACK_WIND (frame, shard_fsync_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
+ return 0;
+}
+
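+/* Strip the internal /.shard directory from readdir(p) results on the root
+ * so it stays invisible to applications. */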
+int32_t
+shard_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ fd_t *fd = NULL;
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ shard_local_t *local = NULL;
+ gf_dirent_t skipped;
+
+ INIT_LIST_HEAD (&skipped.list);
+
+ local = frame->local;
+ fd = local->fd;
+
+ if (op_ret < 0)
+ goto unwind;
+
+ list_for_each_entry_safe (entry, tmp, &entries->list, list) {
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !(strcmp (entry->d_name, GF_SHARD_DIR))) {
+ list_del_init (&entry->list);
+ list_add_tail (&entry->list, &skipped.list);
+ break;
+ }
+ }
+
+unwind:
+ SHARD_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, xdata);
+ gf_dirent_free (&skipped);
+ return 0;
+}
+
+
+int
+shard_readdir_do (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, int whichop, dict_t *xdata)
+{
+ int op_errno = ENOMEM;
+ shard_local_t *local = NULL;
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto err;
+
+ frame->local = local;
+
+ local->fd = fd_ref (fd);
+
+ if (whichop == GF_FOP_READDIR)
+ STACK_WIND (frame, shard_readdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdir, fd, size, offset,
+ xdata);
+ else
+ STACK_WIND (frame, shard_readdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, offset,
+ xdata);
+
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT (readdir, frame, -1, op_errno, NULL, NULL);
+ return 0;
+
+}
+
+
+int32_t
+shard_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ shard_readdir_do (frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
+ return 0;
+}
+
+
+int32_t
+shard_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ shard_readdir_do (frame, this, fd, size, offset, GF_FOP_READDIRP,
+ xdata);
+ return 0;
+}
+
+int
+shard_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ /* To-Do: Call fsetattr on all shards. */
+ SHARD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
+shard_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ STACK_WIND (frame, shard_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
+
+ return 0;
+}
+
+int
+shard_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ /* To-Do: Call fsetattr on all shards. */
+ SHARD_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
+shard_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ STACK_WIND (frame, shard_fsetattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
+
+ return 0;
+}
+
+int
+shard_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t keep_size, off_t offset, size_t len, dict_t *xdata)
+{
+ /* TBD */
+ SHARD_STACK_UNWIND (fallocate, frame, -1, ENOTCONN, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+shard_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ /* TBD */
+ SHARD_STACK_UNWIND (discard, frame, -1, ENOTCONN, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+shard_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ /* TBD */
+ SHARD_STACK_UNWIND (zerofill, frame, -1, ENOTCONN, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+init (xlator_t *this)
+{
+ int ret = -1;
+ shard_priv_t *priv = NULL;
+
+ if (!this) {
+ gf_log ("shard", GF_LOG_ERROR, "this is NULL. init() failed");
+ goto out;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_ERROR, "Dangling volume. "
+ "Check volfile");
+ goto out;
+ }
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR, "shard not configured with "
+ "exactly one sub-volume. Check volfile");
+ goto out;
+ }
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_shard_mt_priv_t);
+ if (!priv)
+ goto out;
+
+ GF_OPTION_INIT ("shard-block-size", priv->block_size, size_uint64, out);
+
+ this->local_pool = mem_pool_new (shard_local_t, 128);
+ if (!this->local_pool) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Failed to allocate locals "
+ "from mempool");
+ goto out;
+ }
+ priv->inode_table = inode_table_new (SHARD_INODE_LRU_LIMIT, this);
+ if (!priv->inode_table)
+ goto out;
+
+ uuid_parse (SHARD_ROOT_GFID, priv->dot_shard_gfid);
+
+ this->private = priv;
+ ret = 0;
+out:
+ if (ret) {
+ if (priv && priv->inode_table)
+ inode_table_destroy (priv->inode_table);
+ GF_FREE (priv);
+ if (this && this->local_pool)
+ mem_pool_destroy (this->local_pool);
+ }
+
+ return ret;
+
+}
+
+void
+fini (xlator_t *this)
+{
+ shard_priv_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO ("shard", this, out);
+
+ mem_pool_destroy (this->local_pool);
+ this->local_pool = NULL;
+
+ priv = this->private;
+ if (!priv)
+ goto out;
+
+ this->private = NULL;
+ inode_table_destroy (priv->inode_table);
+ GF_FREE (priv);
+
+out:
+ return;
+}
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ int ret = -1;
+ shard_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ GF_OPTION_RECONF ("shard-block-size", priv->block_size, options,
+ size_uint64, out);
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int
+shard_forget (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_uint = 0;
+ shard_inode_ctx_t *ctx = NULL;
+
+ inode_ctx_del (inode, this, &ctx_uint);
+ if (!ctx_uint)
+ return 0;
+
+ ctx = (shard_inode_ctx_t *)ctx_uint;
+
+ GF_FREE (ctx);
+
+ return 0;
+}
+
+int
+shard_release (xlator_t *this, fd_t *fd)
+{
+ /* TBD */
+ return 0;
+}
+
+int
+shard_priv_dump (xlator_t *this)
+{
+ /* TBD */
+ return 0;
+}
+
+int
+shard_releasedir (xlator_t *this, fd_t *fd)
+{
+ return 0;
+}
+
+struct xlator_fops fops = {
+ .lookup = shard_lookup,
+ .open = shard_open,
+ .flush = shard_flush,
+ .fsync = shard_fsync,
+ .stat = shard_stat,
+ .fstat = shard_fstat,
+ .readv = shard_readv,
+ .writev = shard_writev,
+ .truncate = shard_truncate,
+ .ftruncate = shard_ftruncate,
+ .setattr = shard_setattr,
+ .fsetattr = shard_fsetattr,
+ .fallocate = shard_fallocate,
+ .discard = shard_discard,
+ .zerofill = shard_zerofill,
+ .readdir = shard_readdir,
+ .readdirp = shard_readdirp,
+ .create = shard_create,
+ .mknod = shard_mknod,
+ .unlink = shard_unlink,
+ .rename = shard_rename,
+};
+
+struct xlator_cbks cbks = {
+ .forget = shard_forget,
+ .release = shard_release,
+ .releasedir = shard_releasedir,
+};
+
+struct xlator_dumpops dumpops = {
+ .priv = shard_priv_dump,
+};
+
+struct volume_options options[] = {
+ { .key = {"shard-block-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .default_value = "4MB",
+ .min = SHARD_MIN_BLOCK_SIZE,
+ .description = "Size of the shard unit that would be read from or "
+ "written to the shard servers.",
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
new file mode 100644
index 00000000000..03d881079e6
--- /dev/null
+++ b/xlators/features/shard/src/shard.h
@@ -0,0 +1,118 @@
+/*
+ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __SHARD_H__
+#define __SHARD_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "compat-errno.h"
+
+#define GF_SHARD_DIR ".shard"
+#define SHARD_MIN_BLOCK_SIZE (128*GF_UNIT_KB)
+#define GF_XATTR_SHARD_BLOCK_SIZE "trusted.glusterfs.shard.block-size"
+#define SHARD_ROOT_GFID "be318638-e8a0-4c6d-977d-7a937aa84806"
+#define SHARD_INODE_LRU_LIMIT 4096
+
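+/* Map a byte range on to shard block indices. For example, with a 4MB
+ * shard-block-size, offset=6291456 (6MB) and len=3145728 (3MB) span
+ * blocks 1 and 2. */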
+#define get_lowest_block(off, shard_size) (off / shard_size)
+#define get_highest_block(off, len, shard_size) ((off+len-1) / shard_size)
+
+#define SHARD_ENTRY_FOP_CHECK(loc, op_errno, label) do { \
+ if ((loc->name && !strcmp (GF_SHARD_DIR, loc->name)) && \
+ (((loc->parent) && \
+ __is_root_gfid (loc->parent->gfid)) || \
+ __is_root_gfid (loc->pargfid))) { \
+ op_errno = EPERM; \
+ goto label; \
+ } \
+ \
+ if ((loc->parent && \
+ __is_shard_dir (loc->parent->gfid)) || \
+ __is_shard_dir (loc->pargfid)) { \
+ op_errno = EPERM; \
+ goto label; \
+ } \
+} while (0)
+
+#define SHARD_INODE_OP_CHECK(gfid, err, label) do { \
+ if (__is_shard_dir(gfid)) { \
+ err = EPERM; \
+ goto label; \
+ } \
+} while (0)
+
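+/* Unwind wrapper that detaches and wipes the frame's shard_local_t so every
+ * fop path releases its resources in one place. */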
+#define SHARD_STACK_UNWIND(fop, frame, params ...) do { \
+ shard_local_t *__local = NULL; \
+ if (frame) { \
+ __local = frame->local; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT (fop, frame, params); \
+ if (__local) { \
+ shard_local_wipe (__local); \
+ mem_put (__local); \
+ } \
+} while (0) \
+
+typedef struct shard_priv {
+ uint64_t block_size;
+ uuid_t dot_shard_gfid;
+ inode_table_t *inode_table;
+ inode_t *dot_shard_inode;
+} shard_priv_t;
+
+typedef struct {
+ loc_t *loc;
+ short type;
+ char *domain;
+} shard_lock_t;
+
+typedef struct shard_local {
+ int op_ret;
+ int op_errno;
+ int first_block;
+ int last_block;
+ int num_blocks;
+ int call_count;
+ int eexist_count;
+ int xflag;
+ int count;
+ uint32_t flags;
+ uint64_t block_size;
+ off_t offset;
+ size_t total_size;
+ uuid_t shard_gfid;
+ loc_t loc;
+ loc_t dot_shard_loc;
+ fd_t *fd;
+ dict_t *xattr_req;
+ inode_t **inode_list;
+ struct iovec *vector;
+ struct iobref *iobref;
+ struct {
+ int lock_count;
+ fop_inodelk_cbk_t inodelk_cbk;
+ shard_lock_t *shard_lock;
+ } lock;
+} shard_local_t;
+
+typedef struct shard_inode_ctx {
+ uint32_t rdev;
+ uint64_t block_size; /* The block size with which this inode is
+ sharded */
+ mode_t mode;
+} shard_inode_ctx_t;
+
+#endif /* __SHARD_H__ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 6f6d1095edb..b098a297132 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -3353,6 +3353,18 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret == -1)
goto out;
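+ /* Load the shard xlator into the client graph when features.shard is on */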
+ ret = dict_get_str_boolean (set_dict, "features.shard", _gf_false);
+ if (ret == -1)
+ goto out;
+
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/shard", volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ }
+
/* As of now snapshot volume is read-only. Read-only xlator is loaded
* in client graph so that AFR & DHT healing can be done in server.
*/
@@ -3388,7 +3400,6 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
goto out;
if (ret) {
xl = volgen_graph_add (graph, "encryption/crypt", volname);
-
if (!xl) {
ret = -1;
goto out;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index ada814bb25d..68e3fb7d46b 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1763,6 +1763,17 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = NO_DOC,
.op_version = GD_OP_VERSION_3_7_0,
},
+ { .key = "features.shard",
+ .voltype = "features/shard",
+ .value = "off",
+ .op_version = GD_OP_VERSION_3_7_0,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "features.shard-block-size",
+ .voltype = "features/shard",
+ .op_version = GD_OP_VERSION_3_7_0,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = NULL
}
};