Diffstat (limited to 'xlators/features/shard/src')
-rw-r--r--  xlators/features/shard/src/Makefile.am        |   17
-rw-r--r--  xlators/features/shard/src/shard-mem-types.h  |   24
-rw-r--r--  xlators/features/shard/src/shard-messages.h   |   39
-rw-r--r--  xlators/features/shard/src/shard.c            | 7382
-rw-r--r--  xlators/features/shard/src/shard.h            |  348
5 files changed, 7810 insertions, 0 deletions
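
Two conventions recur throughout shard.c below and are easy to lose in the diff: each shard block of a file lives at /<shard-dir>/<base-gfid>.<block-number> (built by shard_make_block_abspath()), and the base file's authoritative size and block count are stored in an xattr as an array of four network-order 64-bit integers, of which only slots 0 (size) and 2 (block count) are currently used (see shard_set_size_attrs() and its inverse, shard_modify_size_and_block_count()). The standalone sketch below is illustrative only and not part of the patch; the ".shard" directory name, the example gfid, and the put_be64() helper are assumptions standing in for GF_SHARD_DIR and hton64().

#include <stdio.h>
#include <stdint.h>

/* Hypothetical helper standing in for hton64(): store v in big-endian
 * (network) byte order, matching what shard_set_size_attrs() writes. */
static void
put_be64(uint64_t v, unsigned char *out)
{
    for (int i = 7; i >= 0; i--) {
        out[i] = (unsigned char)(v & 0xff);
        v >>= 8;
    }
}

int
main(void)
{
    /* 1. Path of shard block 7 of a file, following the "/%s/%s.%d"
     *    format used by shard_make_block_abspath(). GF_SHARD_DIR is
     *    assumed to expand to ".shard" here. */
    const char *gfid_str = "f4a91dc8-0ba1-4ac5-b7f4-85e6b6ae7856"; /* example */
    char path[4096];

    snprintf(path, sizeof(path), "/%s/%s.%d", ".shard", gfid_str, 7);
    printf("block 7 -> %s\n", path);

    /* 2. The size xattr payload: 4 x int64 in network byte order.
     *    Slot 0 carries the file size, slot 2 the block count; slots 1
     *    and 3 are reserved, which is why the xattr is set with length
     *    8 * 4 in shard_update_file_size(). */
    unsigned char size_attr[8 * 4] = {0};
    put_be64(1048576, size_attr + 0 * 8); /* ia_size in bytes  */
    put_be64(2048, size_attr + 2 * 8);    /* ia_blocks         */
    printf("xattr payload: %zu bytes\n", sizeof(size_attr));
    return 0;
}

Keeping the logical size in an xattr on the base file lets stat-style fops be answered without touching any shard: shard_lookup() requests GF_XATTR_SHARD_FILE_SIZE with length 8 * 4, and shard_modify_size_and_block_count() decodes slots 0 and 2 back into ia_size and ia_blocks.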
diff --git a/xlators/features/shard/src/Makefile.am b/xlators/features/shard/src/Makefile.am
new file mode 100644
index 00000000000..bf5700d4bcc
--- /dev/null
+++ b/xlators/features/shard/src/Makefile.am
@@ -0,0 +1,17 @@
+xlator_LTLIBRARIES = shard.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+shard_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+
+shard_la_SOURCES = shard.c
+
+shard_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = shard.h shard-mem-types.h shard-messages.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+              -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/shard/src/shard-mem-types.h b/xlators/features/shard/src/shard-mem-types.h
new file mode 100644
index 00000000000..1fe7e2e2798
--- /dev/null
+++ b/xlators/features/shard/src/shard-mem-types.h
@@ -0,0 +1,24 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+#ifndef __SHARD_MEM_TYPES_H__
+#define __SHARD_MEM_TYPES_H__
+
+#include <glusterfs/mem-types.h>
+
+enum gf_shard_mem_types_ {
+    gf_shard_mt_priv_t = gf_common_mt_end + 1,
+    gf_shard_mt_inode_list,
+    gf_shard_mt_inode_ctx_t,
+    gf_shard_mt_iovec,
+    gf_shard_mt_int64_t,
+    gf_shard_mt_uint64_t,
+    gf_shard_mt_end
+};
+#endif
diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h
new file mode 100644
index 00000000000..2d0867eb136
--- /dev/null
+++ b/xlators/features/shard/src/shard-messages.h
@@ -0,0 +1,39 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+ */
+
+#ifndef _SHARD_MESSAGES_H_
+#define _SHARD_MESSAGES_H_
+
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(SHARD, SHARD_MSG_BASE_FILE_LOOKUP_FAILED, SHARD_MSG_DICT_OP_FAILED,
+           SHARD_MSG_DOT_SHARD_NODIR, SHARD_MSG_FD_CTX_SET_FAILED,
+           SHARD_MSG_INODE_CTX_GET_FAILED, SHARD_MSG_INODE_CTX_SET_FAILED,
+           SHARD_MSG_INODE_PATH_FAILED, SHARD_MSG_INTERNAL_XATTR_MISSING,
+           SHARD_MSG_INVALID_VOLFILE, SHARD_MSG_LOOKUP_SHARD_FAILED,
+           SHARD_MSG_MEM_ACCT_INIT_FAILED, SHARD_MSG_NULL_THIS,
+           SHARD_MSG_SIZE_SET_FAILED, SHARD_MSG_STAT_FAILED,
+           SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED,
+           SHARD_MSG_UPDATE_FILE_SIZE_FAILED, SHARD_MSG_FOP_NOT_SUPPORTED,
+           SHARD_MSG_INVALID_FOP, SHARD_MSG_MEMALLOC_FAILED,
+           SHARD_MSG_FOP_FAILED, SHARD_MSG_SHARDS_DELETION_FAILED,
+           SHARD_MSG_SHARD_DELETION_COMPLETED);
+
+#endif /* !_SHARD_MESSAGES_H_ */
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
new file mode 100644
index 00000000000..e5f93063943
--- /dev/null
+++ b/xlators/features/shard/src/shard.c
@@ -0,0 +1,7382 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+
+#include "shard.h"
+#include "shard-mem-types.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/statedump.h>
+
+static gf_boolean_t
+__is_shard_dir(uuid_t gfid)
+{
+    shard_priv_t *priv = THIS->private;
+
+    if (gf_uuid_compare(gfid, priv->dot_shard_gfid) == 0)
+        return _gf_true;
+
+    return _gf_false;
+}
+
+static gf_boolean_t
+__is_gsyncd_on_shard_dir(call_frame_t *frame, loc_t *loc)
+{
+    if (frame->root->pid == GF_CLIENT_PID_GSYNCD &&
+        (__is_shard_dir(loc->pargfid) ||
+         (loc->parent && __is_shard_dir(loc->parent->gfid))))
+        return _gf_true;
+
+    return _gf_false;
+}
+
+void
+shard_make_block_bname(int block_num, uuid_t gfid, char *buf, size_t len)
+{
+    char gfid_str[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+
+    gf_uuid_unparse(gfid, gfid_str);
+    snprintf(buf, len, "%s.%d", gfid_str, block_num);
+}
+
+void
+shard_make_block_abspath(int block_num, uuid_t gfid, char *filepath,
+                         size_t len)
+{
+    char gfid_str[GF_UUID_BUF_SIZE] = {
+        0,
+    };
+
+    gf_uuid_unparse(gfid, gfid_str);
+    snprintf(filepath, len, "/%s/%s.%d", GF_SHARD_DIR, gfid_str, block_num);
+}
+
+int
+__shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx)
+{
+    int ret = -1;
+    uint64_t ctx_uint = 0;
+    shard_inode_ctx_t *ctx_p = NULL;
+
+    ret = __inode_ctx_get(inode, this, &ctx_uint);
+    if (ret == 0) {
+        *ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint;
+        return ret;
+    }
+
+    ctx_p = GF_CALLOC(1, sizeof(*ctx_p), gf_shard_mt_inode_ctx_t);
+    if (!ctx_p)
+        return ret;
+
+    INIT_LIST_HEAD(&ctx_p->ilist);
+    INIT_LIST_HEAD(&ctx_p->to_fsync_list);
+
+    ctx_uint = (uint64_t)(uintptr_t)ctx_p;
+    ret = __inode_ctx_set(inode, this, &ctx_uint);
+    if (ret < 0) {
+        GF_FREE(ctx_p);
+        return ret;
+    }
+
+    *ctx = ctx_p;
+
+    return ret;
+}
+
+int
+shard_inode_ctx_get(inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx)
+{
+    int ret = 0;
+
+    LOCK(&inode->lock);
+    {
+        ret = __shard_inode_ctx_get(inode, this, ctx);
+    }
+    UNLOCK(&inode->lock);
+
+    return ret;
+}
+
+int
+__shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf,
+                      uint64_t block_size, int32_t valid)
+{
+    int ret = -1;
+    shard_inode_ctx_t *ctx = NULL;
+
+    ret = __shard_inode_ctx_get(inode, this, &ctx);
+    if (ret)
return ret; + + if (valid & SHARD_MASK_BLOCK_SIZE) + ctx->block_size = block_size; + + if (valid & SHARD_MASK_PROT) + ctx->stat.ia_prot = stbuf->ia_prot; + + if (valid & SHARD_MASK_NLINK) + ctx->stat.ia_nlink = stbuf->ia_nlink; + + if (valid & SHARD_MASK_UID) + ctx->stat.ia_uid = stbuf->ia_uid; + + if (valid & SHARD_MASK_GID) + ctx->stat.ia_gid = stbuf->ia_gid; + + if (valid & SHARD_MASK_SIZE) + ctx->stat.ia_size = stbuf->ia_size; + + if (valid & SHARD_MASK_BLOCKS) + ctx->stat.ia_blocks = stbuf->ia_blocks; + + if (valid & SHARD_MASK_TIMES) { + SHARD_TIME_UPDATE(ctx->stat.ia_mtime, ctx->stat.ia_mtime_nsec, + stbuf->ia_mtime, stbuf->ia_mtime_nsec); + SHARD_TIME_UPDATE(ctx->stat.ia_ctime, ctx->stat.ia_ctime_nsec, + stbuf->ia_ctime, stbuf->ia_ctime_nsec); + SHARD_TIME_UPDATE(ctx->stat.ia_atime, ctx->stat.ia_atime_nsec, + stbuf->ia_atime, stbuf->ia_atime_nsec); + } + + if (valid & SHARD_MASK_OTHERS) { + ctx->stat.ia_ino = stbuf->ia_ino; + gf_uuid_copy(ctx->stat.ia_gfid, stbuf->ia_gfid); + ctx->stat.ia_dev = stbuf->ia_dev; + ctx->stat.ia_type = stbuf->ia_type; + ctx->stat.ia_rdev = stbuf->ia_rdev; + ctx->stat.ia_blksize = stbuf->ia_blksize; + } + + if (valid & SHARD_MASK_REFRESH_RESET) + ctx->refresh = _gf_false; + + return 0; +} + +int +shard_inode_ctx_set(inode_t *inode, xlator_t *this, struct iatt *stbuf, + uint64_t block_size, int32_t valid) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_set(inode, this, stbuf, block_size, valid); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + if (ret) + return ret; + + ctx->refresh = _gf_true; + + return 0; +} +int +shard_inode_ctx_set_refresh_flag(inode_t *inode, xlator_t *this) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_set_refresh_flag(inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + if (ret) + return ret; + + ctx->refreshed = _gf_true; + return 0; +} + +int +shard_inode_ctx_mark_dir_refreshed(inode_t *inode, xlator_t *this) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_mark_dir_refreshed(inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, + inode_t *shard_inode) +{ + int ret = -1; + shard_inode_ctx_t *base_ictx = NULL; + shard_inode_ctx_t *shard_ictx = NULL; + + ret = __shard_inode_ctx_get(base_inode, this, &base_ictx); + if (ret) + return ret; + + ret = __shard_inode_ctx_get(shard_inode, this, &shard_ictx); + if (ret) + return ret; + + if (shard_ictx->fsync_needed) { + shard_ictx->fsync_needed++; + return 1; + } + + list_add_tail(&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); + shard_ictx->inode = shard_inode; + shard_ictx->fsync_needed++; + base_ictx->fsync_count++; + shard_ictx->base_inode = base_inode; + + return 0; +} + +int +shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this, + inode_t *shard_inode) +{ + int ret = -1; + + /* This ref acts as a refkeepr on the base inode. We + * need to keep this inode alive as it holds the head + * of the to_fsync_list. 
+ */ + inode_ref(base_inode); + inode_ref(shard_inode); + + LOCK(&base_inode->lock); + LOCK(&shard_inode->lock); + { + ret = __shard_inode_ctx_add_to_fsync_list(base_inode, this, + shard_inode); + } + UNLOCK(&shard_inode->lock); + UNLOCK(&base_inode->lock); + + /* Unref the base inode corresponding to the ref above, if the shard is + * found to be already part of the fsync list. + */ + if (ret != 0) { + inode_unref(base_inode); + inode_unref(shard_inode); + } + return ret; +} + +gf_boolean_t +__shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + /* If inode ctx get fails, better to err on the side of caution and + * try again? Unless the failure is due to mem-allocation. + */ + if (ret) + return _gf_true; + + return !ctx->refreshed; +} + +gf_boolean_t +shard_inode_ctx_needs_lookup(inode_t *inode, xlator_t *this) +{ + gf_boolean_t flag = _gf_false; + + LOCK(&inode->lock); + { + flag = __shard_inode_ctx_needs_lookup(inode, this); + } + UNLOCK(&inode->lock); + + return flag; +} +int +__shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) +{ + int ret = -1; + shard_inode_ctx_t *ctx = NULL; + + ret = __shard_inode_ctx_get(inode, this, &ctx); + if (ret) + return ret; + + if ((stbuf->ia_size != ctx->stat.ia_size) || + (stbuf->ia_blocks != ctx->stat.ia_blocks)) + ctx->refresh = _gf_true; + + return 0; +} + +int +shard_inode_ctx_invalidate(inode_t *inode, xlator_t *this, struct iatt *stbuf) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_invalidate(inode, this, stbuf); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, + uint64_t *block_size) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + *block_size = ctx->block_size; + + return 0; +} + +int +shard_inode_ctx_get_block_size(inode_t *inode, xlator_t *this, + uint64_t *block_size) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_get_block_size(inode, this, block_size); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +__shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, + int *fsync_count) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + *fsync_count = ctx->fsync_needed; + + return 0; +} + +int +shard_inode_ctx_get_fsync_count(inode_t *inode, xlator_t *this, + int *fsync_count) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_get_fsync_count(inode, this, fsync_count); + } + UNLOCK(&inode->lock); + + return ret; +} +int +__shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + memcpy(ctx_out, ctx, sizeof(shard_inode_ctx_t)); + return 0; +} + +int +shard_inode_ctx_get_all(inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_get_all(inode, this, ctx_out); + } + UNLOCK(&inode->lock); + + return ret; +} + +int 
+__shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, + struct iatt *buf, + gf_boolean_t *need_refresh) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + if (ctx->refresh == _gf_false) + *buf = ctx->stat; + else + *need_refresh = _gf_true; + + return 0; +} + +int +shard_inode_ctx_fill_iatt_from_cache(inode_t *inode, xlator_t *this, + struct iatt *buf, + gf_boolean_t *need_refresh) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_inode_ctx_fill_iatt_from_cache(inode, this, buf, + need_refresh); + } + UNLOCK(&inode->lock); + + return ret; +} + +void +shard_local_wipe(shard_local_t *local) +{ + int i = 0; + int count = 0; + + count = local->num_blocks; + + syncbarrier_destroy(&local->barrier); + loc_wipe(&local->loc); + loc_wipe(&local->dot_shard_loc); + loc_wipe(&local->dot_shard_rm_loc); + loc_wipe(&local->loc2); + loc_wipe(&local->tmp_loc); + loc_wipe(&local->int_inodelk.loc); + loc_wipe(&local->int_entrylk.loc); + loc_wipe(&local->newloc); + + if (local->name) + GF_FREE(local->name); + + if (local->int_entrylk.basename) + GF_FREE(local->int_entrylk.basename); + if (local->fd) + fd_unref(local->fd); + + if (local->xattr_req) + dict_unref(local->xattr_req); + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); + + for (i = 0; i < count; i++) { + if (!local->inode_list) + break; + + if (local->inode_list[i]) + inode_unref(local->inode_list[i]); + } + + GF_FREE(local->inode_list); + + GF_FREE(local->vector); + if (local->iobref) + iobref_unref(local->iobref); + if (local->list_inited) + gf_dirent_free(&local->entries_head); + if (local->inodelk_frame) + SHARD_STACK_DESTROY(local->inodelk_frame); + if (local->entrylk_frame) + SHARD_STACK_DESTROY(local->entrylk_frame); +} + +int +shard_modify_size_and_block_count(struct iatt *stbuf, dict_t *dict) +{ + int ret = -1; + void *size_attr = NULL; + uint64_t size_array[4]; + + ret = dict_get_ptr(dict, GF_XATTR_SHARD_FILE_SIZE, &size_attr); + if (ret) { + gf_msg_callingfn(THIS->name, GF_LOG_ERROR, 0, + SHARD_MSG_INTERNAL_XATTR_MISSING, + "Failed to " + "get " GF_XATTR_SHARD_FILE_SIZE " for %s", + uuid_utoa(stbuf->ia_gfid)); + return ret; + } + + memcpy(size_array, size_attr, sizeof(size_array)); + + stbuf->ia_size = ntoh64(size_array[0]); + stbuf->ia_blocks = ntoh64(size_array[2]); + + return 0; +} + +int +shard_call_count_return(call_frame_t *frame) +{ + int call_count = 0; + shard_local_t *local = NULL; + + local = frame->local; + + LOCK(&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + return call_count; +} + +static char * +shard_internal_dir_string(shard_internal_dir_type_t type) +{ + char *str = NULL; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + str = GF_SHARD_DIR; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + str = GF_SHARD_REMOVE_ME_DIR; + break; + default: + break; + } + return str; +} + +static int +shard_init_internal_dir_loc(xlator_t *this, shard_local_t *local, + shard_internal_dir_type_t type) +{ + int ret = -1; + char *bname = NULL; + inode_t *parent = NULL; + loc_t *internal_dir_loc = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + if (!local) + return -1; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + internal_dir_loc = &local->dot_shard_loc; + bname = GF_SHARD_DIR; + parent = inode_ref(this->itable->root); + break; + case 
SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + internal_dir_loc = &local->dot_shard_rm_loc; + bname = GF_SHARD_REMOVE_ME_DIR; + parent = inode_ref(priv->dot_shard_inode); + break; + default: + break; + } + + internal_dir_loc->inode = inode_new(this->itable); + internal_dir_loc->parent = parent; + ret = inode_path(internal_dir_loc->parent, bname, + (char **)&internal_dir_loc->path); + if (ret < 0 || !(internal_dir_loc->inode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", bname); + goto out; + } + + internal_dir_loc->name = strrchr(internal_dir_loc->path, '/'); + if (internal_dir_loc->name) + internal_dir_loc->name++; + + ret = 0; +out: + return ret; +} + +inode_t * +__shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this, + inode_t *base_inode, int block_num, + uuid_t gfid) +{ + char block_bname[256] = { + 0, + }; + inode_t *lru_inode = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *lru_inode_ctx = NULL; + shard_inode_ctx_t *lru_base_inode_ctx = NULL; + inode_t *fsync_inode = NULL; + inode_t *lru_base_inode = NULL; + gf_boolean_t do_fsync = _gf_false; + + priv = this->private; + + shard_inode_ctx_get(linked_inode, this, &ctx); + + if (list_empty(&ctx->ilist)) { + if (priv->inode_count + 1 <= priv->lru_limit) { + /* If this inode was linked here for the first time (indicated + * by empty list), and if there is still space in the priv list, + * add this ctx to the tail of the list. + */ + /* For as long as an inode is in lru list, we try to + * keep it alive by holding a ref on it. + */ + inode_ref(linked_inode); + if (base_inode) + gf_uuid_copy(ctx->base_gfid, base_inode->gfid); + else + gf_uuid_copy(ctx->base_gfid, gfid); + ctx->block_num = block_num; + list_add_tail(&ctx->ilist, &priv->ilist_head); + priv->inode_count++; + ctx->base_inode = inode_ref(base_inode); + } else { + /*If on the other hand there is no available slot for this inode + * in the list, delete the lru inode from the head of the list, + * unlink it. And in its place add this new inode into the list. + */ + lru_inode_ctx = list_first_entry(&priv->ilist_head, + shard_inode_ctx_t, ilist); + GF_ASSERT(lru_inode_ctx->block_num > 0); + lru_base_inode = lru_inode_ctx->base_inode; + list_del_init(&lru_inode_ctx->ilist); + lru_inode = inode_find(linked_inode->table, + lru_inode_ctx->stat.ia_gfid); + /* If the lru inode was part of the pending-fsync list, + * the base inode needs to be unref'd, the lru inode + * deleted from fsync list and fsync'd in a new frame, + * and then unlinked in memory and forgotten. + */ + if (!lru_base_inode) + goto after_fsync_check; + LOCK(&lru_base_inode->lock); + LOCK(&lru_inode->lock); + { + if (!list_empty(&lru_inode_ctx->to_fsync_list)) { + list_del_init(&lru_inode_ctx->to_fsync_list); + lru_inode_ctx->fsync_needed = 0; + do_fsync = _gf_true; + __shard_inode_ctx_get(lru_base_inode, this, + &lru_base_inode_ctx); + lru_base_inode_ctx->fsync_count--; + } + } + UNLOCK(&lru_inode->lock); + UNLOCK(&lru_base_inode->lock); + + after_fsync_check: + if (!do_fsync) { + shard_make_block_bname(lru_inode_ctx->block_num, + lru_inode_ctx->base_gfid, block_bname, + sizeof(block_bname)); + /* The following unref corresponds to the ref held at + * the time the shard was added to the lru list. 
+ */ + inode_unref(lru_inode); + inode_unlink(lru_inode, priv->dot_shard_inode, block_bname); + inode_forget(lru_inode, 0); + } else { + /* The following unref corresponds to the ref + * held when the shard was added to fsync list. + */ + inode_unref(lru_inode); + fsync_inode = lru_inode; + if (lru_base_inode) + inode_unref(lru_base_inode); + } + /* The following unref corresponds to the ref + * held by inode_find() above. + */ + inode_unref(lru_inode); + + /* The following unref corresponds to the ref held on the base shard + * at the time of adding shard inode to lru list + */ + if (lru_base_inode) + inode_unref(lru_base_inode); + + /* For as long as an inode is in lru list, we try to + * keep it alive by holding a ref on it. + */ + inode_ref(linked_inode); + if (base_inode) + gf_uuid_copy(ctx->base_gfid, base_inode->gfid); + else + gf_uuid_copy(ctx->base_gfid, gfid); + ctx->block_num = block_num; + ctx->base_inode = inode_ref(base_inode); + list_add_tail(&ctx->ilist, &priv->ilist_head); + } + } else { + /* If this is not the first time this inode is being operated on, move + * it to the most recently used end of the list. + */ + list_move_tail(&ctx->ilist, &priv->ilist_head); + } + return fsync_inode; +} + +int +shard_common_failure_unwind(glusterfs_fop_t fop, call_frame_t *frame, + int32_t op_ret, int32_t op_errno) +{ + switch (fop) { + case GF_FOP_LOOKUP: + SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, NULL, NULL, + NULL, NULL); + break; + case GF_FOP_STAT: + SHARD_STACK_UNWIND(stat, frame, op_ret, op_errno, NULL, NULL); + break; + case GF_FOP_FSTAT: + SHARD_STACK_UNWIND(fstat, frame, op_ret, op_errno, NULL, NULL); + break; + case GF_FOP_TRUNCATE: + SHARD_STACK_UNWIND(truncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_FTRUNCATE: + SHARD_STACK_UNWIND(ftruncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_MKNOD: + SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL); + break; + case GF_FOP_LINK: + SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, NULL, NULL, NULL, + NULL, NULL); + break; + case GF_FOP_CREATE: + SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, NULL, NULL, + NULL, NULL, NULL, NULL); + break; + case GF_FOP_UNLINK: + SHARD_STACK_UNWIND(unlink, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_RENAME: + SHARD_STACK_UNWIND(rename, frame, op_ret, op_errno, NULL, NULL, + NULL, NULL, NULL, NULL); + break; + case GF_FOP_WRITE: + SHARD_STACK_UNWIND(writev, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_FALLOCATE: + SHARD_STACK_UNWIND(fallocate, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_ZEROFILL: + SHARD_STACK_UNWIND(zerofill, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_DISCARD: + SHARD_STACK_UNWIND(discard, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_READ: + SHARD_STACK_UNWIND(readv, frame, op_ret, op_errno, NULL, -1, NULL, + NULL, NULL); + break; + case GF_FOP_FSYNC: + SHARD_STACK_UNWIND(fsync, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_REMOVEXATTR: + SHARD_STACK_UNWIND(removexattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_FREMOVEXATTR: + SHARD_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_FGETXATTR: + SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, NULL, NULL); + break; + case GF_FOP_GETXATTR: + SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, NULL, NULL); + break; + case 
GF_FOP_FSETXATTR: + SHARD_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_SETXATTR: + SHARD_STACK_UNWIND(setxattr, frame, op_ret, op_errno, NULL); + break; + case GF_FOP_SETATTR: + SHARD_STACK_UNWIND(setattr, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_FSETATTR: + SHARD_STACK_UNWIND(fsetattr, frame, op_ret, op_errno, NULL, NULL, + NULL); + break; + case GF_FOP_SEEK: + SHARD_STACK_UNWIND(seek, frame, op_ret, op_errno, 0, NULL); + break; + default: + gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", fop); + break; + } + return 0; +} + +int +shard_common_inode_write_success_unwind(glusterfs_fop_t fop, + call_frame_t *frame, int32_t op_ret) +{ + shard_local_t *local = frame->local; + + /* the below 3 variables are required because, in SHARD_STACK_UNWIND() + macro, there is a check for local being null. So many static analyzers + backtrace the code with assumption of possible (local == NULL) case, + and complains for below lines. By handling it like below, we overcome + the warnings */ + + struct iatt *prebuf = ((local) ? &local->prebuf : NULL); + struct iatt *postbuf = ((local) ? &local->postbuf : NULL); + dict_t *xattr_rsp = ((local) ? local->xattr_rsp : NULL); + + switch (fop) { + case GF_FOP_WRITE: + SHARD_STACK_UNWIND(writev, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + case GF_FOP_FALLOCATE: + SHARD_STACK_UNWIND(fallocate, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + case GF_FOP_ZEROFILL: + SHARD_STACK_UNWIND(zerofill, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + case GF_FOP_DISCARD: + SHARD_STACK_UNWIND(discard, frame, op_ret, 0, prebuf, postbuf, + xattr_rsp); + break; + default: + gf_msg(THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", fop); + break; + } + return 0; +} + +int +shard_evicted_inode_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + char block_bname[256] = { + 0, + }; + fd_t *anon_fd = cookie; + inode_t *shard_inode = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + + if (anon_fd == NULL || op_ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, op_errno, SHARD_MSG_MEMALLOC_FAILED, + "fsync failed on shard"); + goto out; + } + shard_inode = anon_fd->inode; + + LOCK(&priv->lock); + LOCK(&shard_inode->lock); + { + __shard_inode_ctx_get(shard_inode, this, &ctx); + if ((list_empty(&ctx->to_fsync_list)) && (list_empty(&ctx->ilist))) { + shard_make_block_bname(ctx->block_num, shard_inode->gfid, + block_bname, sizeof(block_bname)); + inode_unlink(shard_inode, priv->dot_shard_inode, block_bname); + /* The following unref corresponds to the ref held by + * inode_link() at the time the shard was created or + * looked up + */ + inode_unref(shard_inode); + inode_forget(shard_inode, 0); + } + } + UNLOCK(&shard_inode->lock); + UNLOCK(&priv->lock); + +out: + if (anon_fd) + fd_unref(anon_fd); + STACK_DESTROY(frame->root); + return 0; +} + +int +shard_initiate_evicted_inode_fsync(xlator_t *this, inode_t *inode) +{ + fd_t *anon_fd = NULL; + call_frame_t *fsync_frame = NULL; + + fsync_frame = create_frame(this, this->ctx->pool); + if (!fsync_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create new frame " + "to fsync shard"); + return -1; + } + + anon_fd = fd_anonymous(inode); + if (!anon_fd) { + gf_msg(this->name, 
GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create anon fd to" + " fsync shard"); + STACK_DESTROY(fsync_frame->root); + return -1; + } + + STACK_WIND_COOKIE(fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + anon_fd, 1, NULL); + return 0; +} + +int +shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, + xlator_t *this); + +int +shard_common_resolve_shards(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler) +{ + int i = -1; + uint32_t shard_idx_iter = 0; + char path[PATH_MAX] = { + 0, + }; + uuid_t gfid = { + 0, + }; + inode_t *inode = NULL; + inode_t *res_inode = NULL; + inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + uint64_t resolve_count = 0; + + priv = this->private; + local = frame->local; + local->call_count = 0; + shard_idx_iter = local->first_block; + res_inode = local->resolver_base_inode; + + if ((local->op_ret < 0) || (local->resolve_not)) + goto out; + + /* If this prealloc FOP is for fresh file creation, then the size of the + * file will be 0. Then there will be no shards associated with this file. + * So we can skip the lookup process for the shards which do not exists + * and directly issue mknod to crete shards. + * + * In case the prealloc fop is to extend the preallocated file to bigger + * size then just lookup and populate inodes of existing shards and + * update the create count + */ + if (local->fop == GF_FOP_FALLOCATE) { + if (!local->prebuf.ia_size) { + local->inode_list[0] = inode_ref(res_inode); + local->create_count = local->last_block; + shard_common_inode_write_post_lookup_shards_handler(frame, this); + return 0; + } + if (local->prebuf.ia_size < local->total_size) + local->create_count = local->last_block - + ((local->prebuf.ia_size - 1) / + local->block_size); + } + + resolve_count = local->last_block - local->create_count; + + if (res_inode) + gf_uuid_copy(gfid, res_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + while (shard_idx_iter <= resolve_count) { + i++; + if (shard_idx_iter == 0) { + local->inode_list[i] = inode_ref(res_inode); + shard_idx_iter++; + continue; + } + + shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); + + inode = NULL; + inode = inode_resolve(this->itable, path); + if (inode) { + gf_msg_debug(this->name, 0, + "Shard %d already " + "present. gfid=%s. Saving inode for future.", + shard_idx_iter, uuid_utoa(inode->gfid)); + local->inode_list[i] = inode; + /* Let the ref on the inodes that are already present + * in inode table still be held so that they don't get + * forgotten by the time the fop reaches the actual + * write stage. 
+ */ + LOCK(&priv->lock); + { + fsync_inode = __shard_update_shards_inode_list( + inode, this, res_inode, shard_idx_iter, gfid); + } + UNLOCK(&priv->lock); + shard_idx_iter++; + if (fsync_inode) + shard_initiate_evicted_inode_fsync(this, fsync_inode); + continue; + } else { + local->call_count++; + shard_idx_iter++; + } + } +out: + post_res_handler(frame, this); + return 0; +} + +int +shard_update_file_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + inode_t *inode = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if ((local->fd) && (local->fd->inode)) + inode = local->fd->inode; + else if (local->loc.inode) + inode = local->loc.inode; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_UPDATE_FILE_SIZE_FAILED, + "Update to file size" + " xattr failed on %s", + uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + if (shard_modify_size_and_block_count(&local->postbuf, dict)) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } +err: + local->post_update_size_handler(frame, this); + return 0; +} + +int +shard_set_size_attrs(int64_t size, int64_t block_count, int64_t **size_attr_p) +{ + int ret = -1; + int64_t *size_attr = NULL; + + if (!size_attr_p) + goto out; + + size_attr = GF_CALLOC(4, sizeof(int64_t), gf_shard_mt_int64_t); + if (!size_attr) + goto out; + + size_attr[0] = hton64(size); + /* As sharding evolves, it _may_ be necessary to embed more pieces of + * information within the same xattr. So allocating slots for them in + * advance. For now, only bytes 0-63 and 128-191 which would make up the + * current size and block count respectively of the file are valid. + */ + size_attr[2] = hton64(block_count); + + *size_attr_p = size_attr; + + ret = 0; +out: + return ret; +} + +int +shard_update_file_size(call_frame_t *frame, xlator_t *this, fd_t *fd, + loc_t *loc, shard_post_update_size_fop_handler_t handler) +{ + int ret = -1; + int64_t *size_attr = NULL; + int64_t delta_blocks = 0; + inode_t *inode = NULL; + shard_local_t *local = NULL; + dict_t *xattr_req = NULL; + + local = frame->local; + local->post_update_size_handler = handler; + + xattr_req = dict_new(); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + if (fd) + inode = fd->inode; + else + inode = loc->inode; + + /* If both size and block count have not changed, then skip the xattrop. + */ + delta_blocks = GF_ATOMIC_GET(local->delta_blocks); + if ((local->delta_size + local->hole_size == 0) && (delta_blocks == 0)) { + goto out; + } + + ret = shard_set_size_attrs(local->delta_size + local->hole_size, + delta_blocks, &size_attr); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SIZE_SET_FAILED, + "Failed to set size attrs for %s", uuid_utoa(inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, 8 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set key %s into dict. 
gfid=%s", + GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(inode->gfid)); + GF_FREE(size_attr); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + if (fd) + STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, fd, + GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); + else + STACK_WIND(frame, shard_update_file_size_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, loc, + GF_XATTROP_ADD_ARRAY64, xattr_req, NULL); + + dict_unref(xattr_req); + return 0; + +out: + if (xattr_req) + dict_unref(xattr_req); + handler(frame, this); + return 0; +} + +static inode_t * +shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode, + struct iatt *buf, shard_internal_dir_type_t type) +{ + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + char *bname = NULL; + inode_t **priv_inode = NULL; + inode_t *parent = NULL; + + priv = THIS->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + bname = GF_SHARD_DIR; + priv_inode = &priv->dot_shard_inode; + parent = inode->table->root; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + bname = GF_SHARD_REMOVE_ME_DIR; + priv_inode = &priv->dot_shard_rm_inode; + parent = priv->dot_shard_inode; + break; + default: + break; + } + + linked_inode = inode_link(inode, parent, bname, buf); + inode_lookup(linked_inode); + *priv_inode = linked_inode; + return linked_inode; +} + +int +shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + shard_local_t *local = NULL; + inode_t *linked_inode = NULL; + shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + + local = frame->local; + + if (op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto out; + } + + /* To-Do: Fix refcount increment per call to + * shard_link_internal_dir_inode(). 
+ */ + linked_inode = shard_link_internal_dir_inode(local, inode, buf, type); + shard_inode_ctx_mark_dir_refreshed(linked_inode, this); +out: + shard_common_resolve_shards(frame, this, local->post_res_handler); + return 0; +} + +int +shard_refresh_internal_dir(call_frame_t *frame, xlator_t *this, + shard_internal_dir_type_t type) +{ + loc_t loc = { + 0, + }; + inode_t *inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + uuid_t gfid = { + 0, + }; + + local = frame->local; + priv = this->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + gf_uuid_copy(gfid, priv->dot_shard_gfid); + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); + break; + default: + break; + } + + inode = inode_find(this->itable, gfid); + + if (!shard_inode_ctx_needs_lookup(inode, this)) { + local->op_ret = 0; + goto out; + } + + /* Plain assignment because the ref is already taken above through + * call to inode_find() + */ + loc.inode = inode; + gf_uuid_copy(loc.gfid, gfid); + + STACK_WIND_COOKIE(frame, shard_refresh_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, &loc, + NULL); + loc_wipe(&loc); + + return 0; + +out: + shard_common_resolve_shards(frame, this, local->post_res_handler); + return 0; +} + +int +shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + inode_t *link_inode = NULL; + shard_local_t *local = NULL; + shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + + local = frame->local; + + if (op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + if (!IA_ISDIR(buf->ia_type)) { + gf_msg(this->name, GF_LOG_CRITICAL, 0, SHARD_MSG_DOT_SHARD_NODIR, + "%s already exists and " + "is not a directory. 
Please remove it from all bricks " + "and try again", + shard_internal_dir_string(type)); + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + + link_inode = shard_link_internal_dir_inode(local, inode, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { + shard_inode_ctx_mark_dir_refreshed(link_inode, this); + shard_common_resolve_shards(frame, this, local->post_res_handler); + } + return 0; + +unwind: + local->post_res_handler(frame, this); + return 0; +} + +int +shard_lookup_internal_dir(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler, + shard_internal_dir_type_t type) +{ + int ret = -1; + dict_t *xattr_req = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + uuid_t *gfid = NULL; + loc_t *loc = NULL; + gf_boolean_t free_gfid = _gf_true; + + local = frame->local; + priv = this->private; + local->post_res_handler = post_res_handler; + + gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!gfid) + goto err; + + xattr_req = dict_new(); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + gf_uuid_copy(*gfid, priv->dot_shard_gfid); + loc = &local->dot_shard_loc; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); + loc = &local->dot_shard_rm_loc; + break; + default: + bzero(*gfid, sizeof(uuid_t)); + break; + } + + ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set gfid of %s into dict", + shard_internal_dir_string(type)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } else { + free_gfid = _gf_false; + } + + STACK_WIND_COOKIE(frame, shard_lookup_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup, loc, + xattr_req); + + dict_unref(xattr_req); + return 0; + +err: + if (xattr_req) + dict_unref(xattr_req); + if (free_gfid) + GF_FREE(gfid); + post_res_handler(frame, this); + return 0; +} + +static void +shard_inode_ctx_update(inode_t *inode, xlator_t *this, dict_t *xdata, + struct iatt *buf) +{ + int ret = 0; + uint64_t size = 0; + void *bsize = NULL; + + if (shard_inode_ctx_get_block_size(inode, this, &size)) { + /* Fresh lookup */ + ret = dict_get_ptr(xdata, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); + if (!ret) + size = ntoh64(*((uint64_t *)bsize)); + /* If the file is sharded, set its block size, otherwise just + * set 0. + */ + + shard_inode_ctx_set(inode, this, buf, size, SHARD_MASK_BLOCK_SIZE); + } + /* If the file is sharded, also set the remaining attributes, + * except for ia_size and ia_blocks. 
+ */ + if (size) { + shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); + (void)shard_inode_ctx_invalidate(inode, this, buf); + } +} + +int +shard_delete_shards(void *opaque); + +int +shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data); + +int +shard_start_background_deletion(xlator_t *this) +{ + int ret = 0; + gf_boolean_t i_cleanup = _gf_true; + shard_priv_t *priv = NULL; + call_frame_t *cleanup_frame = NULL; + + priv = this->private; + + LOCK(&priv->lock); + { + switch (priv->bg_del_state) { + case SHARD_BG_DELETION_NONE: + i_cleanup = _gf_true; + priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; + break; + case SHARD_BG_DELETION_LAUNCHING: + i_cleanup = _gf_false; + break; + case SHARD_BG_DELETION_IN_PROGRESS: + priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING; + i_cleanup = _gf_false; + break; + default: + break; + } + } + UNLOCK(&priv->lock); + if (!i_cleanup) + return 0; + + cleanup_frame = create_frame(this, this->ctx->pool); + if (!cleanup_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create " + "new frame to delete shards"); + ret = -ENOMEM; + goto err; + } + + set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root); + + ret = synctask_new(this->ctx->env, shard_delete_shards, + shard_delete_shards_cbk, cleanup_frame, cleanup_frame); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, errno, + SHARD_MSG_SHARDS_DELETION_FAILED, + "failed to create task to do background " + "cleanup of shards"); + STACK_DESTROY(cleanup_frame->root); + goto err; + } + return 0; + +err: + LOCK(&priv->lock); + { + priv->bg_del_state = SHARD_BG_DELETION_NONE; + } + UNLOCK(&priv->lock); + return ret; +} + +int +shard_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + int ret = -1; + shard_priv_t *priv = NULL; + gf_boolean_t i_start_cleanup = _gf_false; + + priv = this->private; + + if (op_ret < 0) + goto unwind; + + if (IA_ISDIR(buf->ia_type)) + goto unwind; + + /* Also, if the file is sharded, get the file size and block cnt xattr, + * and store them in the stbuf appropriately. + */ + + if (dict_get(xdata, GF_XATTR_SHARD_FILE_SIZE) && + frame->root->pid != GF_CLIENT_PID_GSYNCD) + shard_modify_size_and_block_count(buf, xdata); + + /* If this was a fresh lookup, there are two possibilities: + * 1) If the file is sharded (indicated by the presence of block size + * xattr), store this block size, along with rdev and mode in its + * inode ctx. + * 2) If the file is not sharded, store size along with rdev and mode + * (which are anyway don't cares) in inode ctx. Since @ctx_tmp is + * already initialised to all zeroes, nothing more needs to be done. 
+ */ + + (void)shard_inode_ctx_update(inode, this, xdata, buf); + + LOCK(&priv->lock); + { + if (priv->first_lookup_done == _gf_false) { + priv->first_lookup_done = _gf_true; + i_start_cleanup = _gf_true; + } + } + UNLOCK(&priv->lock); + + if (!i_start_cleanup) + goto unwind; + + ret = shard_start_background_deletion(this); + if (ret < 0) { + LOCK(&priv->lock); + { + priv->first_lookup_done = _gf_false; + } + UNLOCK(&priv->lock); + } + +unwind: + SHARD_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +} + +int +shard_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + int ret = -1; + int32_t op_errno = ENOMEM; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + this->itable = loc->inode->table; + if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && + (frame->root->pid != GF_CLIENT_PID_GLFS_HEAL)) { + SHARD_ENTRY_FOP_CHECK(loc, op_errno, err); + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy(&local->loc, loc); + + local->xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new(); + if (!local->xattr_req) + goto err; + + if (shard_inode_ctx_get_block_size(loc->inode, this, &block_size)) { + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict" + " value: key:%s for path %s", + GF_XATTR_SHARD_BLOCK_SIZE, loc->path); + goto err; + } + } + + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, + 8 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict value: key:%s for path %s.", + GF_XATTR_SHARD_FILE_SIZE, loc->path); + goto err; + } + } + + if ((xattr_req) && (dict_get(xattr_req, GF_CONTENT_KEY))) + dict_del(xattr_req, GF_CONTENT_KEY); + + STACK_WIND(frame, shard_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_LOOKUP, frame, -1, op_errno); + return 0; +} + +int +shard_set_iattr_invoke_post_handler(call_frame_t *frame, xlator_t *this, + inode_t *inode, int32_t op_ret, + int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + int ret = -1; + int32_t mask = SHARD_INODE_WRITE_MASK; + shard_local_t *local = frame->local; + shard_inode_ctx_t ctx = { + 0, + }; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_BASE_FILE_LOOKUP_FAILED, + "Lookup on base file" + " failed : %s", + uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + local->prebuf = *buf; + if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + if (shard_inode_ctx_get_all(inode, this, &ctx)) + mask = SHARD_ALL_MASK; + + ret = shard_inode_ctx_set(inode, this, &local->prebuf, 0, + (mask | SHARD_MASK_REFRESH_RESET)); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, SHARD_MSG_INODE_CTX_SET_FAILED, 0, + "Failed to set inode" + " write params into inode ctx for %s", + uuid_utoa(buf->ia_gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } + +unwind: + local->handler(frame, this); + return 0; +} + +int +shard_fstat_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + shard_local_t *local = frame->local; + + 
shard_set_iattr_invoke_post_handler(frame, this, local->fd->inode, op_ret, + op_errno, buf, xdata); + return 0; +} + +int +shard_lookup_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + /* In case of op_ret < 0, inode passed to this function will be NULL + ex: in case of op_errno = ENOENT. So refer prefilled inode data + which is part of local. + Note: Reassigning/overriding the inode passed to this cbk with inode + which is part of *struct shard_local_t* won't cause any issue as + both inodes have same reference/address as of the inode passed */ + inode = ((shard_local_t *)frame->local)->loc.inode; + + shard_set_iattr_invoke_post_handler(frame, this, inode, op_ret, op_errno, + buf, xdata); + return 0; +} + +/* This function decides whether to make file based lookup or + * fd based lookup (fstat) depending on the 3rd and 4th arg. + * If fd != NULL and loc == NULL then call is for fstat + * If fd == NULL and loc != NULL then call is for file based + * lookup. Please pass args based on the requirement. + */ +int +shard_refresh_base_file(call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, shard_post_fop_handler_t handler) +{ + int ret = -1; + inode_t *inode = NULL; + shard_local_t *local = NULL; + dict_t *xattr_req = NULL; + gf_boolean_t need_refresh = _gf_false; + + local = frame->local; + local->handler = handler; + inode = fd ? fd->inode : loc->inode; + + ret = shard_inode_ctx_fill_iatt_from_cache(inode, this, &local->prebuf, + &need_refresh); + /* By this time, inode ctx should have been created either in create, + * mknod, readdirp or lookup. If not it is a bug! + */ + if ((ret == 0) && (need_refresh == _gf_false)) { + gf_msg_debug(this->name, 0, + "Skipping lookup on base file: %s" + "Serving prebuf off the inode ctx cache", + uuid_utoa(inode->gfid)); + goto out; + } + + xattr_req = dict_new(); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, xattr_req, inode->gfid, local, out); + + if (fd) + STACK_WIND(frame, shard_fstat_base_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xattr_req); + else + STACK_WIND(frame, shard_lookup_base_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + + dict_unref(xattr_req); + return 0; + +out: + if (xattr_req) + dict_unref(xattr_req); + handler(frame, this); + return 0; +} + +int +shard_post_fstat_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret >= 0) + shard_inode_ctx_set(local->fd->inode, this, &local->prebuf, 0, + SHARD_LOOKUP_MASK); + + SHARD_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, + &local->prebuf, local->xattr_rsp); + return 0; +} + +int +shard_post_stat_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret >= 0) + shard_inode_ctx_set(local->loc.inode, this, &local->prebuf, 0, + SHARD_LOOKUP_MASK); + + SHARD_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, + &local->prebuf, local->xattr_rsp); + return 0; +} + +int +shard_common_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + dict_t *xdata) +{ + inode_t *inode = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, 
SHARD_MSG_STAT_FAILED, + "stat failed: %s", + local->fd ? uuid_utoa(local->fd->inode->gfid) + : uuid_utoa((local->loc.inode)->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + local->prebuf = *buf; + if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + local->xattr_rsp = dict_ref(xdata); + + if (local->loc.inode) + inode = local->loc.inode; + else + inode = local->fd->inode; + + shard_inode_ctx_invalidate(inode, this, &local->prebuf); + +unwind: + local->handler(frame, this); + return 0; +} + +int +shard_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if ((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_stat_handler; + loc_copy(&local->loc, loc); + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, + local, err); + + STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_STAT, frame, -1, ENOMEM); + return 0; +} + +int +shard_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_fstat_handler; + local->fd = fd_ref(fd); + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, + local, err); + + STACK_WIND(frame, shard_common_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FSTAT, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_update_size_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->fop == GF_FOP_TRUNCATE) + SHARD_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, NULL); + else + SHARD_STACK_UNWIND(ftruncate, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, NULL); + return 0; +} + +int +shard_truncate_last_shard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + inode_t *inode = NULL; + int64_t delta_blocks = 0; + shard_local_t *local = NULL; + + local = frame->local; + + SHARD_UNSET_ROOT_FS_ID(frame, local); + + inode = (local->fop == GF_FOP_TRUNCATE) ? local->loc.inode + : local->fd->inode; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_TRUNCATE_LAST_SHARD_FAILED, + "truncate on last" + " shard failed : %s", + uuid_utoa(inode->gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + local->postbuf.ia_size = local->offset; + /* Let the delta be negative. We want xattrop to do subtraction */ + local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; + delta_blocks = GF_ATOMIC_ADD(local->delta_blocks, + postbuf->ia_blocks - prebuf->ia_blocks); + GF_ASSERT(delta_blocks <= 0); + local->postbuf.ia_blocks += delta_blocks; + local->hole_size = 0; + + shard_inode_ctx_set(inode, this, &local->postbuf, 0, SHARD_MASK_TIMES); + shard_update_file_size(frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int +shard_truncate_last_shard(call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + size_t last_shard_size_after = 0; + loc_t loc = { + 0, + }; + shard_local_t *local = NULL; + + local = frame->local; + + /* A NULL inode could be due to the fact that the last shard which + * needs to be truncated does not exist due to it lying in a hole + * region. So the only thing left to do in that case would be an + * update to file size xattr. + */ + if (!inode) { + gf_msg_debug(this->name, 0, + "Last shard to be truncated absent in backend: %" PRIu64 + " of gfid %s. 
Directly proceeding to update file size", + local->first_block, uuid_utoa(local->loc.inode->gfid)); + shard_update_file_size(frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + return 0; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + last_shard_size_after = (local->offset % local->block_size); + + STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &loc, last_shard_size_after, + NULL); + loc_wipe(&loc); + return 0; +} + +void +shard_unlink_block_inode(shard_local_t *local, int shard_block_num); + +int +shard_truncate_htol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int ret = 0; + int call_count = 0; + int shard_block_num = (long)cookie; + uint64_t block_count = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto done; + } + ret = dict_get_uint64(xdata, GF_GET_FILE_BLOCK_COUNT, &block_count); + if (!ret) { + GF_ATOMIC_SUB(local->delta_blocks, block_count); + } else { + /* dict_get failed possibly due to a heterogeneous cluster? */ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to get key %s from dict during truncate of gfid %s", + GF_GET_FILE_BLOCK_COUNT, + uuid_utoa(local->resolver_base_inode->gfid)); + } + + shard_unlink_block_inode(local, shard_block_num); +done: + call_count = shard_call_count_return(frame); + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID(frame, local); + shard_truncate_last_shard(frame, this, local->inode_list[0]); + } + return 0; +} + +int +shard_truncate_htol(call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + int i = 1; + int ret = -1; + int call_count = 0; + uint32_t cur_block = 0; + uint32_t last_block = 0; + char path[PATH_MAX] = { + 0, + }; + char *bname = NULL; + loc_t loc = { + 0, + }; + gf_boolean_t wind_failed = _gf_false; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + dict_t *xdata_req = NULL; + + local = frame->local; + priv = this->private; + + cur_block = local->first_block + 1; + last_block = local->last_block; + + /* Determine call count */ + for (i = 1; i < local->num_blocks; i++) { + if (!local->inode_list[i]) + continue; + call_count++; + } + + if (!call_count) { + /* Call count = 0 implies that all of the shards that need to be + * unlinked do not exist. So shard xlator would now proceed to + * do the final truncate + size updates. + */ + gf_msg_debug(this->name, 0, + "Shards to be unlinked as part of " + "truncate absent in backend: %s. 
Directly " + "proceeding to update file size", + uuid_utoa(inode->gfid)); + local->postbuf.ia_size = local->offset; + local->postbuf.ia_blocks = local->prebuf.ia_blocks; + local->delta_size = local->postbuf.ia_size - local->prebuf.ia_size; + GF_ATOMIC_INIT(local->delta_blocks, 0); + local->hole_size = 0; + shard_update_file_size(frame, this, local->fd, &local->loc, + shard_post_update_size_truncate_handler); + return 0; + } + + local->call_count = call_count; + i = 1; + xdata_req = dict_new(); + if (!xdata_req) { + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; + } + ret = dict_set_uint64(xdata_req, GF_GET_FILE_BLOCK_COUNT, 8 * 8); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set key %s into dict during truncate of %s", + GF_GET_FILE_BLOCK_COUNT, + uuid_utoa(local->resolver_base_inode->gfid)); + dict_unref(xdata_req); + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + while (cur_block <= last_block) { + if (!local->inode_list[i]) { + cur_block++; + i++; + continue; + } + if (wind_failed) { + shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + shard_make_block_abspath(cur_block, inode->gfid, path, sizeof(path)); + bname = strrchr(path, '/') + 1; + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on %s. Base file gfid = %s", + bname, uuid_utoa(inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); + wind_failed = _gf_true; + shard_truncate_htol_cbk(frame, (void *)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + loc.inode = inode_ref(local->inode_list[i]); + + STACK_WIND_COOKIE(frame, shard_truncate_htol_cbk, + (void *)(long)cur_block, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &loc, 0, xdata_req); + loc_wipe(&loc); + next: + i++; + cur_block++; + if (!--call_count) + break; + } + dict_unref(xdata_req); + return 0; +} + +int +shard_truncate_do(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->num_blocks == 1) { + /* This means that there are no shards to be unlinked. + * The fop boils down to truncating the last shard, updating + * the size and unwinding. 
+ */ + shard_truncate_last_shard(frame, this, local->inode_list[0]); + return 0; + } else { + shard_truncate_htol(frame, this, local->loc.inode); + } + return 0; +} + +int +shard_post_lookup_shards_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + shard_truncate_do(frame, this); + return 0; +} + +void +shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode, + struct iatt *buf) +{ + int list_index = 0; + char block_bname[256] = { + 0, + }; + uuid_t gfid = { + 0, + }; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; + inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + inode_t *base_inode = NULL; + + this = THIS; + priv = this->private; + if (local->loc.inode) { + gf_uuid_copy(gfid, local->loc.inode->gfid); + base_inode = local->loc.inode; + } else if (local->resolver_base_inode) { + gf_uuid_copy(gfid, local->resolver_base_inode->gfid); + base_inode = local->resolver_base_inode; + } else { + gf_uuid_copy(gfid, local->base_gfid); + } + + shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname)); + + shard_inode_ctx_set(inode, this, buf, 0, SHARD_LOOKUP_MASK); + linked_inode = inode_link(inode, priv->dot_shard_inode, block_bname, buf); + inode_lookup(linked_inode); + list_index = block_num - local->first_block; + local->inode_list[list_index] = linked_inode; + + LOCK(&priv->lock); + { + fsync_inode = __shard_update_shards_inode_list( + linked_inode, this, base_inode, block_num, gfid); + } + UNLOCK(&priv->lock); + if (fsync_inode) + shard_initiate_evicted_inode_fsync(this, fsync_inode); +} + +int +shard_common_lookup_shards_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + int call_count = 0; + int shard_block_num = (long)cookie; + uuid_t gfid = { + 0, + }; + shard_local_t *local = NULL; + + local = frame->local; + if (local->resolver_base_inode) + gf_uuid_copy(gfid, local->resolver_base_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + if (op_ret < 0) { + /* Ignore absence of shards in the backend in truncate fop. */ + switch (local->fop) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + case GF_FOP_RENAME: + case GF_FOP_UNLINK: + if (op_errno == ENOENT) + goto done; + break; + case GF_FOP_WRITE: + case GF_FOP_READ: + case GF_FOP_ZEROFILL: + case GF_FOP_DISCARD: + case GF_FOP_FALLOCATE: + if ((!local->first_lookup_done) && (op_errno == ENOENT)) { + LOCK(&frame->lock); + { + local->create_count++; + } + UNLOCK(&frame->lock); + goto done; + } + break; + default: + break; + } + + /* else */ + gf_msg(this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_LOOKUP_SHARD_FAILED, + "Lookup on shard %d " + "failed. 
Base file gfid = %s", + shard_block_num, uuid_utoa(gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto done; + } + + shard_link_block_inode(local, shard_block_num, inode, buf); + +done: + if (local->lookup_shards_barriered) { + syncbarrier_wake(&local->barrier); + return 0; + } else { + call_count = shard_call_count_return(frame); + if (call_count == 0) { + if (!local->first_lookup_done) + local->first_lookup_done = _gf_true; + local->pls_fop_handler(frame, this); + } + } + return 0; +} + +dict_t * +shard_create_gfid_dict(dict_t *dict) +{ + int ret = 0; + dict_t *new = NULL; + unsigned char *gfid = NULL; + + new = dict_copy_with_ref(dict, NULL); + if (!new) + return NULL; + + gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char); + if (!gfid) { + ret = -1; + goto out; + } + + gf_uuid_generate(gfid); + + ret = dict_set_gfuuid(new, "gfid-req", gfid, false); + +out: + if (ret) { + dict_unref(new); + new = NULL; + GF_FREE(gfid); + } + + return new; +} + +int +shard_common_lookup_shards(call_frame_t *frame, xlator_t *this, inode_t *inode, + shard_post_lookup_shards_fop_handler_t handler) +{ + int i = 0; + int ret = 0; + int count = 0; + int call_count = 0; + int32_t shard_idx_iter = 0; + int lookup_count = 0; + char path[PATH_MAX] = { + 0, + }; + char *bname = NULL; + uuid_t gfid = { + 0, + }; + loc_t loc = { + 0, + }; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + gf_boolean_t wind_failed = _gf_false; + dict_t *xattr_req = NULL; + + priv = this->private; + local = frame->local; + count = call_count = local->call_count; + shard_idx_iter = local->first_block; + lookup_count = local->last_block - local->create_count; + local->pls_fop_handler = handler; + if (local->lookup_shards_barriered) + local->barrier.waitfor = local->call_count; + + if (inode) + gf_uuid_copy(gfid, inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + while (shard_idx_iter <= lookup_count) { + if (local->inode_list[i]) { + i++; + shard_idx_iter++; + continue; + } + + if (wind_failed) { + shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, NULL, + NULL); + goto next; + } + + shard_make_block_abspath(shard_idx_iter, gfid, path, sizeof(path)); + + bname = strrchr(path, '/') + 1; + loc.inode = inode_new(this->itable); + loc.parent = inode_ref(priv->dot_shard_inode); + gf_uuid_copy(loc.pargfid, priv->dot_shard_gfid); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0 || !(loc.inode)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on %s, base file gfid = %s", + bname, uuid_utoa(gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); + wind_failed = _gf_true; + shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, NULL, + NULL); + goto next; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + loc_wipe(&loc); + shard_common_lookup_shards_cbk(frame, (void *)(long)shard_idx_iter, + this, -1, ENOMEM, NULL, NULL, NULL, + NULL); + goto next; + } + + STACK_WIND_COOKIE(frame, shard_common_lookup_shards_cbk, + (void *)(long)shard_idx_iter, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &loc, xattr_req); + loc_wipe(&loc); + dict_unref(xattr_req); + next: + shard_idx_iter++; + i++; + + if (!--call_count) + break; + } + if 
(local->lookup_shards_barriered) { + syncbarrier_wait(&local->barrier, count); + local->pls_fop_handler(frame, this); + } + return 0; +} + +int +shard_post_resolve_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + if (local->op_errno == ENOENT) { + /* If lookup on /.shard fails with ENOENT, it means that + * the file was 0-byte in size but truncated sometime in + * the past to a higher size which is reflected in the + * size xattr, and now being truncated to a lower size. + * In this case, the only thing that needs to be done is + * to update the size xattr of the file and unwind. + */ + local->first_block = local->last_block = 0; + local->num_blocks = 1; + local->call_count = 0; + local->op_ret = 0; + local->postbuf.ia_size = local->offset; + shard_update_file_size(frame, this, local->fd, &local->loc, + shard_post_update_size_truncate_handler); + return 0; + } else { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + } + + if (!local->call_count) + shard_truncate_do(frame, this); + else + shard_common_lookup_shards(frame, this, local->loc.inode, + shard_post_lookup_shards_truncate_handler); + + return 0; +} + +int +shard_truncate_begin(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + /* First participant block here is the lowest numbered block that would + * hold the last byte of the file post successful truncation. + * Last participant block is the block that contains the last byte in + * the current state of the file. + * If (first block == last_block): + * then that means that the file only needs truncation of the + * first (or last since both are same) block. + * Else + * if (new_size % block_size == 0) + * then that means there is no truncate to be done with + * only shards from first_block + 1 through the last + * block needing to be unlinked. + * else + * both truncate of the first block and unlink of the + * remaining shards until end of file is required. + */ + local->first_block = (local->offset == 0) + ? 0 + : get_lowest_block(local->offset - 1, + local->block_size); + local->last_block = get_highest_block(0, local->prebuf.ia_size, + local->block_size); + + local->num_blocks = local->last_block - local->first_block + 1; + GF_ASSERT(local->num_blocks > 0); + local->resolver_base_inode = (local->fop == GF_FOP_TRUNCATE) + ? 
local->loc.inode + : local->fd->inode; + + if ((local->first_block == 0) && (local->num_blocks == 1)) { + if (local->fop == GF_FOP_TRUNCATE) + STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, &local->loc, + local->offset, local->xattr_req); + else + STACK_WIND(frame, shard_truncate_last_shard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, local->fd, + local->offset, local->xattr_req); + return 0; + } + + local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) + goto err; + + local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { + ret = shard_init_internal_dir_loc(this, local, + SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto err; + shard_lookup_internal_dir(frame, this, + shard_post_resolve_truncate_handler, + SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_truncate_handler; + shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; + +err: + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_lookup_truncate_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + struct iatt tmp_stbuf = { + 0, + }; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + local->postbuf = tmp_stbuf = local->prebuf; + + if (local->prebuf.ia_size == local->offset) { + /* If the file size is same as requested size, unwind the call + * immediately. + */ + if (local->fop == GF_FOP_TRUNCATE) + SHARD_STACK_UNWIND(truncate, frame, 0, 0, &local->prebuf, + &local->postbuf, NULL); + else + SHARD_STACK_UNWIND(ftruncate, frame, 0, 0, &local->prebuf, + &local->postbuf, NULL); + } else if (local->offset > local->prebuf.ia_size) { + /* If the truncate is from a lower to a higher size, set the + * new size xattr and unwind. + */ + local->hole_size = local->offset - local->prebuf.ia_size; + local->delta_size = 0; + GF_ATOMIC_INIT(local->delta_blocks, 0); + local->postbuf.ia_size = local->offset; + tmp_stbuf.ia_size = local->offset; + shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, + SHARD_INODE_WRITE_MASK); + shard_update_file_size(frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + } else { + /* ... else + * i. unlink all shards that need to be unlinked. + * ii. truncate the last of the shards. + * iii. update the new size using setxattr. + * and unwind the fop. + */ + local->hole_size = 0; + local->delta_size = (local->offset - local->prebuf.ia_size); + GF_ATOMIC_INIT(local->delta_blocks, 0); + tmp_stbuf.ia_size = local->offset; + shard_inode_ctx_set(local->loc.inode, this, &tmp_stbuf, 0, + SHARD_INODE_WRITE_MASK); + shard_truncate_begin(frame, this); + } + return 0; +} + +/* TO-DO: + * Fix updates to size and block count with racing write(s) and truncate(s). 
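+ * One interleaving that illustrates the problem (a sketch, not an
+ * exhaustive analysis): truncate samples prebuf.ia_size via lookup, a
+ * racing write then extends the file, and the truncate's xattrop applies
+ * a delta_size/delta_blocks computed from the now-stale prebuf, leaving
+ * the size xattr inconsistent with the shards on disk. Serializing the
+ * lookup + truncate + xattrop sequence against writes would close the
+ * window.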
+ */ + +int +shard_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; + } + + if (!this->itable) + this->itable = loc->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + ret = syncbarrier_init(&local->barrier); + if (ret) + goto err; + loc_copy(&local->loc, loc); + local->offset = offset; + local->block_size = block_size; + local->fop = GF_FOP_TRUNCATE; + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->resolver_base_inode = loc->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + + shard_refresh_base_file(frame, this, &local->loc, NULL, + shard_post_lookup_truncate_handler); + return 0; + +err: + shard_common_failure_unwind(GF_FOP_TRUNCATE, frame, -1, ENOMEM); + return 0; +} + +int +shard_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + ret = syncbarrier_init(&local->barrier); + if (ret) + goto err; + local->fd = fd_ref(fd); + local->offset = offset; + local->block_size = block_size; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->fop = GF_FOP_FTRUNCATE; + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + local->resolver_base_inode = fd->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_post_lookup_truncate_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FTRUNCATE, frame, -1, ENOMEM); + return 0; +} + +int +shard_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) + goto unwind; + + ret = shard_inode_ctx_set(inode, this, buf, local->block_size, + SHARD_ALL_MASK); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, + "Failed to set inode " + "ctx for %s", + uuid_utoa(inode->gfid)); + +unwind: + SHARD_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + + return 0; +} + +int +shard_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->block_size = priv->block_size; + if (!__is_gsyncd_on_shard_dir(frame, loc)) { + SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); + } + + STACK_WIND(frame, shard_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); + return 0; +err: + shard_common_failure_unwind(GF_FOP_MKNOD, frame, -1, ENOMEM); + return 0; +} + +int32_t +shard_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + shard_local_t *local = NULL; + + local = frame->local; + if (op_ret < 0) + goto err; + + shard_inode_ctx_set(inode, this, buf, 0, + SHARD_MASK_NLINK | SHARD_MASK_TIMES); + buf->ia_size = local->prebuf.ia_size; + buf->ia_blocks = local->prebuf.ia_blocks; + + SHARD_STACK_UNWIND(link, frame, op_ret, op_errno, inode, buf, preparent, + postparent, xdata); + return 0; +err: + shard_common_failure_unwind(GF_FOP_LINK, frame, op_ret, op_errno); + return 0; +} + +int +shard_post_lookup_link_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + SHARD_STACK_UNWIND(link, frame, local->op_ret, local->op_errno, NULL, + NULL, NULL, NULL, NULL); + return 0; + } + + STACK_WIND(frame, shard_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, &local->loc, &local->loc2, + local->xattr_req); + return 0; +} + +int32_t +shard_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(oldloc->inode->gfid)); + goto err; + } + + if (!block_size) { + STACK_WIND_TAIL(frame, FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + return 0; + } + + 
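+    /* For a sharded file no per-shard work is needed here, since the new
+     * hard link shares the base inode. The base file is refreshed first
+     * only so that the link cbk can report the aggregated
+     * ia_size/ia_blocks from the size xattr rather than the base file's
+     * on-disk values.
+     */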
if (!this->itable) + this->itable = oldloc->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + shard_refresh_base_file(frame, this, &local->loc, NULL, + shard_post_lookup_link_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_LINK, frame, -1, ENOMEM); + return 0; +} + +int +shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +shard_post_lookup_shards_unlink_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + uuid_t gfid = { + 0, + }; + + local = frame->local; + + if (local->resolver_base_inode) + gf_uuid_copy(gfid, local->resolver_base_inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { + gf_msg(this->name, GF_LOG_ERROR, local->op_errno, SHARD_MSG_FOP_FAILED, + "failed to delete shards of %s", uuid_utoa(gfid)); + return 0; + } + local->op_ret = 0; + local->op_errno = 0; + + shard_unlink_shards_do(frame, this, local->resolver_base_inode); + return 0; +} + +int +shard_post_resolve_unlink_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + local->lookup_shards_barriered = _gf_true; + + if (!local->call_count) + shard_unlink_shards_do(frame, this, local->resolver_base_inode); + else + shard_common_lookup_shards(frame, this, local->resolver_base_inode, + shard_post_lookup_shards_unlink_handler); + return 0; +} + +void +shard_unlink_block_inode(shard_local_t *local, int shard_block_num) +{ + char block_bname[256] = { + 0, + }; + uuid_t gfid = { + 0, + }; + inode_t *inode = NULL; + inode_t *base_inode = NULL; + xlator_t *this = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *base_ictx = NULL; + int unref_base_inode = 0; + int unref_shard_inode = 0; + + this = THIS; + priv = this->private; + + inode = local->inode_list[shard_block_num - local->first_block]; + shard_inode_ctx_get(inode, this, &ctx); + base_inode = ctx->base_inode; + if (base_inode) + gf_uuid_copy(gfid, base_inode->gfid); + else + gf_uuid_copy(gfid, ctx->base_gfid); + shard_make_block_bname(shard_block_num, gfid, block_bname, + sizeof(block_bname)); + + LOCK(&priv->lock); + if (base_inode) + LOCK(&base_inode->lock); + LOCK(&inode->lock); + { + __shard_inode_ctx_get(inode, this, &ctx); + if (!list_empty(&ctx->ilist)) { + list_del_init(&ctx->ilist); + priv->inode_count--; + unref_base_inode++; + unref_shard_inode++; + GF_ASSERT(priv->inode_count >= 0); + } + if (ctx->fsync_needed) { + unref_base_inode++; + unref_shard_inode++; + list_del_init(&ctx->to_fsync_list); + if (base_inode) { + __shard_inode_ctx_get(base_inode, this, &base_ictx); + base_ictx->fsync_count--; + } + } + } + UNLOCK(&inode->lock); + if (base_inode) + UNLOCK(&base_inode->lock); + + inode_unlink(inode, priv->dot_shard_inode, block_bname); + inode_ref_reduce_by_n(inode, unref_shard_inode); + inode_forget(inode, 0); + + if (base_inode && unref_base_inode) + inode_ref_reduce_by_n(base_inode, unref_base_inode); + UNLOCK(&priv->lock); +} + +int +shard_rename_cbk(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + SHARD_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->preoldparent, + &local->postoldparent, &local->prenewparent, 
+ &local->postnewparent, local->xattr_rsp); + return 0; +} + +int32_t +shard_unlink_cbk(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = frame->local; + + SHARD_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, + &local->preoldparent, &local->postoldparent, + local->xattr_rsp); + return 0; +} + +int +shard_unlink_shards_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int shard_block_num = (long)cookie; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto done; + } + + shard_unlink_block_inode(local, shard_block_num); +done: + syncbarrier_wake(&local->barrier); + return 0; +} + +int +shard_unlink_shards_do(call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + int i = 0; + int ret = -1; + int count = 0; + uint32_t cur_block = 0; + uint32_t cur_block_idx = 0; /*this is idx into inode_list[] array */ + char *bname = NULL; + char path[PATH_MAX] = { + 0, + }; + uuid_t gfid = { + 0, + }; + loc_t loc = { + 0, + }; + gf_boolean_t wind_failed = _gf_false; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + if (inode) + gf_uuid_copy(gfid, inode->gfid); + else + gf_uuid_copy(gfid, local->base_gfid); + + for (i = 0; i < local->num_blocks; i++) { + if (!local->inode_list[i]) + continue; + count++; + } + + if (!count) { + /* callcount = 0 implies that all of the shards that need to be + * unlinked are non-existent (in other words the file is full of + * holes). + */ + gf_msg_debug(this->name, 0, + "All shards that need to be " + "unlinked are non-existent: %s", + uuid_utoa(gfid)); + return 0; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + local->barrier.waitfor = count; + cur_block = cur_block_idx + local->first_block; + + while (cur_block_idx < local->num_blocks) { + if (!local->inode_list[cur_block_idx]) + goto next; + + if (wind_failed) { + shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + shard_make_block_abspath(cur_block, gfid, path, sizeof(path)); + bname = strrchr(path, '/') + 1; + loc.parent = inode_ref(priv->dot_shard_inode); + ret = inode_path(loc.parent, bname, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on %s, base file gfid = %s", + bname, uuid_utoa(gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe(&loc); + wind_failed = _gf_true; + shard_unlink_shards_do_cbk(frame, (void *)(long)cur_block, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + loc.inode = inode_ref(local->inode_list[cur_block_idx]); + + STACK_WIND_COOKIE(frame, shard_unlink_shards_do_cbk, + (void *)(long)cur_block, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &loc, local->xflag, + local->xattr_req); + loc_wipe(&loc); + next: + cur_block++; + cur_block_idx++; + } + syncbarrier_wait(&local->barrier, count); + SHARD_UNSET_ROOT_FS_ID(frame, local); + return 0; +} + +int +shard_regulated_shards_deletion(call_frame_t *cleanup_frame, xlator_t *this, + int now, int first_block, gf_dirent_t *entry) +{ + int i = 0; + int ret = 0; + shard_local_t *local = NULL; + uuid_t gfid = { + 0, + }; + + local = cleanup_frame->local; + + local->inode_list = GF_CALLOC(now, sizeof(inode_t *), + 
gf_shard_mt_inode_list); + if (!local->inode_list) + return -ENOMEM; + + local->first_block = first_block; + local->last_block = first_block + now - 1; + local->num_blocks = now; + gf_uuid_parse(entry->d_name, gfid); + gf_uuid_copy(local->base_gfid, gfid); + local->resolver_base_inode = inode_find(this->itable, gfid); + local->call_count = 0; + ret = syncbarrier_init(&local->barrier); + if (ret) { + GF_FREE(local->inode_list); + local->inode_list = NULL; + inode_unref(local->resolver_base_inode); + local->resolver_base_inode = NULL; + return -errno; + } + shard_common_resolve_shards(cleanup_frame, this, + shard_post_resolve_unlink_handler); + + for (i = 0; i < local->num_blocks; i++) { + if (local->inode_list[i]) + inode_unref(local->inode_list[i]); + } + GF_FREE(local->inode_list); + local->inode_list = NULL; + if (local->op_ret) + ret = -local->op_errno; + syncbarrier_destroy(&local->barrier); + inode_unref(local->resolver_base_inode); + local->resolver_base_inode = NULL; + STACK_RESET(cleanup_frame->root); + return ret; +} + +int +__shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, + gf_dirent_t *entry, inode_t *inode) +{ + int ret = 0; + int shard_count = 0; + int first_block = 0; + int now = 0; + uint64_t size = 0; + uint64_t block_size = 0; + uint64_t size_array[4] = { + 0, + }; + void *bsize = NULL; + void *size_attr = NULL; + dict_t *xattr_rsp = NULL; + loc_t loc = { + 0, + }; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = cleanup_frame->local; + ret = dict_reset(local->xattr_req); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to reset dict"); + ret = -ENOMEM; + goto err; + } + + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); + ret = -ENOMEM; + goto err; + } + + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); + ret = -ENOMEM; + goto err; + } + + loc.inode = inode_ref(inode); + loc.parent = inode_ref(priv->dot_shard_rm_inode); + ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", entry->d_name); + ret = -ENOMEM; + goto err; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, local->xattr_req, + &xattr_rsp); + if (ret) + goto err; + + ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to get dict value: key:%s", GF_XATTR_SHARD_BLOCK_SIZE); + goto err; + } + block_size = ntoh64(*((uint64_t *)bsize)); + + ret = dict_get_ptr(xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to get dict value: key:%s", GF_XATTR_SHARD_FILE_SIZE); + goto err; + } + + memcpy(size_array, size_attr, sizeof(size_array)); + size = ntoh64(size_array[0]); + + shard_count = (size / block_size) - 1; + if (shard_count < 0) { + gf_msg_debug(this->name, 0, + "Size of %s hasn't grown beyond " + "its shard-block-size. Nothing to delete. 
" + "Returning", + entry->d_name); + /* File size < shard-block-size, so nothing to delete */ + ret = 0; + goto delete_marker; + } + if ((size % block_size) > 0) + shard_count++; + + if (shard_count == 0) { + gf_msg_debug(this->name, 0, + "Size of %s is exactly equal to " + "its shard-block-size. Nothing to delete. " + "Returning", + entry->d_name); + ret = 0; + goto delete_marker; + } + gf_msg_debug(this->name, 0, + "base file = %s, " + "shard-block-size=%" PRIu64 ", file-size=%" PRIu64 + ", " + "shard_count=%d", + entry->d_name, block_size, size, shard_count); + + /* Perform a gfid-based lookup to see if gfid corresponding to marker + * file's base name exists. + */ + loc_wipe(&loc); + loc.inode = inode_new(this->itable); + if (!loc.inode) { + ret = -ENOMEM; + goto err; + } + gf_uuid_parse(entry->d_name, loc.gfid); + ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); + if (!ret) { + gf_msg_debug(this->name, 0, + "Base shard corresponding to gfid " + "%s is present. Skipping shard deletion. " + "Returning", + entry->d_name); + ret = 0; + goto delete_marker; + } + + first_block = 1; + + while (shard_count) { + if (shard_count < local->deletion_rate) { + now = shard_count; + shard_count = 0; + } else { + now = local->deletion_rate; + shard_count -= local->deletion_rate; + } + + gf_msg_debug(this->name, 0, + "deleting %d shards starting from " + "block %d of gfid %s", + now, first_block, entry->d_name); + ret = shard_regulated_shards_deletion(cleanup_frame, this, now, + first_block, entry); + if (ret) + goto err; + first_block += now; + } + +delete_marker: + loc_wipe(&loc); + loc.inode = inode_ref(inode); + loc.parent = inode_ref(priv->dot_shard_rm_inode); + ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", entry->d_name); + ret = -ENOMEM; + goto err; + } + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + ret = syncop_unlink(FIRST_CHILD(this), &loc, NULL, NULL); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_SHARDS_DELETION_FAILED, + "Failed to delete %s " + "from /%s", + entry->d_name, GF_SHARD_REMOVE_ME_DIR); +err: + if (xattr_rsp) + dict_unref(xattr_rsp); + loc_wipe(&loc); + return ret; +} + +int +shard_delete_shards_of_entry(call_frame_t *cleanup_frame, xlator_t *this, + gf_dirent_t *entry, inode_t *inode) +{ + int ret = -1; + loc_t loc = { + 0, + }; + shard_priv_t *priv = NULL; + + priv = this->private; + loc.inode = inode_ref(priv->dot_shard_rm_inode); + + ret = syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL, NULL); + if (ret < 0) { + if (ret == -EAGAIN) { + ret = 0; + } + goto out; + } + { + ret = __shard_delete_shards_of_entry(cleanup_frame, this, entry, inode); + } + syncop_entrylk(FIRST_CHILD(this), this->name, &loc, entry->d_name, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); +out: + loc_wipe(&loc); + return ret; +} + +int +shard_delete_shards_cbk(int ret, call_frame_t *frame, void *data) +{ + SHARD_STACK_DESTROY(frame); + return 0; +} + +int +shard_resolve_internal_dir(xlator_t *this, shard_local_t *local, + shard_internal_dir_type_t type) +{ + int ret = 0; + char *bname = NULL; + loc_t *loc = NULL; + shard_priv_t *priv = NULL; + uuid_t gfid = { + 0, + }; + struct iatt stbuf = { + 0, + }; + + priv = this->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + loc = &local->dot_shard_loc; + gf_uuid_copy(gfid, priv->dot_shard_gfid); + 
bname = GF_SHARD_DIR; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + loc = &local->dot_shard_rm_loc; + gf_uuid_copy(gfid, priv->dot_shard_rm_gfid); + bname = GF_SHARD_REMOVE_ME_DIR; + break; + default: + break; + } + + loc->inode = inode_find(this->itable, gfid); + if (!loc->inode) { + ret = shard_init_internal_dir_loc(this, local, type); + if (ret) + goto err; + ret = dict_reset(local->xattr_req); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to reset " + "dict"); + ret = -ENOMEM; + goto err; + } + ret = dict_set_gfuuid(local->xattr_req, "gfid-req", gfid, true); + ret = syncop_lookup(FIRST_CHILD(this), loc, &stbuf, NULL, + local->xattr_req, NULL); + if (ret < 0) { + if (ret != -ENOENT) + gf_msg(this->name, GF_LOG_ERROR, -ret, + SHARD_MSG_SHARDS_DELETION_FAILED, + "Lookup on %s failed, exiting", bname); + goto err; + } else { + shard_link_internal_dir_inode(local, loc->inode, &stbuf, type); + } + } + ret = 0; +err: + return ret; +} + +int +shard_lookup_marker_entry(xlator_t *this, shard_local_t *local, + gf_dirent_t *entry) +{ + int ret = 0; + loc_t loc = { + 0, + }; + + loc.inode = inode_new(this->itable); + if (!loc.inode) { + ret = -ENOMEM; + goto err; + } + loc.parent = inode_ref(local->fd->inode); + + ret = inode_path(loc.parent, entry->d_name, (char **)&(loc.path)); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on %s", entry->d_name); + ret = -ENOMEM; + goto err; + } + + loc.name = strrchr(loc.path, '/'); + if (loc.name) + loc.name++; + + ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); + if (ret < 0) { + goto err; + } + entry->inode = inode_ref(loc.inode); + ret = 0; +err: + loc_wipe(&loc); + return ret; +} + +int +shard_delete_shards(void *opaque) +{ + int ret = 0; + off_t offset = 0; + loc_t loc = { + 0, + }; + inode_t *link_inode = NULL; + xlator_t *this = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + call_frame_t *cleanup_frame = NULL; + gf_boolean_t done = _gf_false; + + this = THIS; + priv = this->private; + INIT_LIST_HEAD(&entries.list); + + cleanup_frame = opaque; + + local = mem_get0(this->local_pool); + if (!local) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create local to " + "delete shards"); + ret = -ENOMEM; + goto err; + } + cleanup_frame->local = local; + local->fop = GF_FOP_UNLINK; + + local->xattr_req = dict_new(); + if (!local->xattr_req) { + ret = -ENOMEM; + goto err; + } + local->deletion_rate = priv->deletion_rate; + + ret = shard_resolve_internal_dir(this, local, SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret == -ENOENT) { + gf_msg_debug(this->name, 0, + ".shard absent. Nothing to" + " delete. Exiting"); + ret = 0; + goto err; + } else if (ret < 0) { + goto err; + } + + ret = shard_resolve_internal_dir(this, local, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); + if (ret == -ENOENT) { + gf_msg_debug(this->name, 0, + ".remove_me absent. " + "Nothing to delete. 
Exiting"); + ret = 0; + goto err; + } else if (ret < 0) { + goto err; + } + + local->fd = fd_anonymous(local->dot_shard_rm_loc.inode); + if (!local->fd) { + ret = -ENOMEM; + goto err; + } + + for (;;) { + offset = 0; + LOCK(&priv->lock); + { + if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) { + priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS; + } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) { + priv->bg_del_state = SHARD_BG_DELETION_NONE; + done = _gf_true; + } + } + UNLOCK(&priv->lock); + if (done) + break; + while ( + (ret = syncop_readdirp(FIRST_CHILD(this), local->fd, 131072, offset, + &entries, local->xattr_req, NULL))) { + if (ret > 0) + ret = 0; + list_for_each_entry(entry, &entries.list, list) + { + offset = entry->d_off; + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + + if (!entry->inode) { + ret = shard_lookup_marker_entry(this, local, entry); + if (ret < 0) + continue; + } + link_inode = inode_link(entry->inode, local->fd->inode, + entry->d_name, &entry->d_stat); + + gf_msg_debug(this->name, 0, + "Initiating deletion of " + "shards of gfid %s", + entry->d_name); + ret = shard_delete_shards_of_entry(cleanup_frame, this, entry, + link_inode); + inode_unlink(link_inode, local->fd->inode, entry->d_name); + inode_unref(link_inode); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, + SHARD_MSG_SHARDS_DELETION_FAILED, + "Failed to clean up shards of gfid %s", + entry->d_name); + continue; + } + gf_msg(this->name, GF_LOG_INFO, 0, + SHARD_MSG_SHARD_DELETION_COMPLETED, + "Deleted " + "shards of gfid=%s from backend", + entry->d_name); + } + gf_dirent_free(&entries); + if (ret) + break; + } + } + ret = 0; + loc_wipe(&loc); + return ret; + +err: + LOCK(&priv->lock); + { + priv->bg_del_state = SHARD_BG_DELETION_NONE; + } + UNLOCK(&priv->lock); + loc_wipe(&loc); + return ret; +} + +int +shard_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + if (op_ret) + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Unlock failed. 
Please check brick logs for " + "more details"); + SHARD_STACK_DESTROY(frame); + return 0; +} + +int +shard_unlock_inodelk(call_frame_t *frame, xlator_t *this) +{ + loc_t *loc = NULL; + call_frame_t *lk_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *lk_local = NULL; + shard_inodelk_t *lock = NULL; + + local = frame->local; + lk_frame = local->inodelk_frame; + lk_local = lk_frame->local; + local->inodelk_frame = NULL; + loc = &local->int_inodelk.loc; + lock = &lk_local->int_inodelk; + lock->flock.l_type = F_UNLCK; + + STACK_WIND(lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, F_SETLK, + &lock->flock, NULL); + local->int_inodelk.acquired_lock = _gf_false; + return 0; +} + +int +shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata); +int +shard_rename_src_base_file(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + loc_t *dst_loc = NULL; + loc_t tmp_loc = { + 0, + }; + shard_local_t *local = frame->local; + + if (local->dst_block_size) { + tmp_loc.parent = inode_ref(local->loc2.parent); + ret = inode_path(tmp_loc.parent, local->loc2.name, + (char **)&tmp_loc.path); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed" + " on pargfid=%s bname=%s", + uuid_utoa(tmp_loc.parent->gfid), local->loc2.name); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + + tmp_loc.name = strrchr(tmp_loc.path, '/'); + if (tmp_loc.name) + tmp_loc.name++; + dst_loc = &tmp_loc; + } else { + dst_loc = &local->loc2; + } + + /* To-Do: Request open-fd count on dst base file */ + STACK_WIND(frame, shard_rename_src_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, + local->xattr_req); + loc_wipe(&tmp_loc); + return 0; +err: + loc_wipe(&tmp_loc); + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int +shard_unlink_base_file(call_frame_t *frame, xlator_t *this); + +int +shard_set_size_attrs_on_marker_file_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = frame->local; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Xattrop on marker file failed " + "while performing %s; entry gfid=%s", + gf_fop_string(local->fop), local->newloc.name); + goto err; + } + + inode_unlink(local->newloc.inode, priv->dot_shard_rm_inode, + local->newloc.name); + + if (local->fop == GF_FOP_UNLINK) + shard_unlink_base_file(frame, this); + else if (local->fop == GF_FOP_RENAME) + shard_rename_src_base_file(frame, this); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); + return 0; +} + +int +shard_set_size_attrs_on_marker_file(call_frame_t *frame, xlator_t *this) +{ + int op_errno = ENOMEM; + uint64_t bs = 0; + dict_t *xdata = NULL; + shard_local_t *local = NULL; + + local = frame->local; + xdata = dict_new(); + if (!xdata) + goto err; + + if (local->fop == GF_FOP_UNLINK) + bs = local->block_size; + else if (local->fop == GF_FOP_RENAME) + bs = local->dst_block_size; + SHARD_INODE_CREATE_INIT(this, bs, xdata, &local->newloc, + local->prebuf.ia_size, 0, err); + STACK_WIND(frame, 
shard_set_size_attrs_on_marker_file_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, + &local->newloc, GF_XATTROP_GET_AND_SET, xdata, NULL); + dict_unref(xdata); + return 0; +err: + if (xdata) + dict_unref(xdata); + shard_common_failure_unwind(local->fop, frame, -1, op_errno); + return 0; +} + +int +shard_lookup_marker_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + local = frame->local; + priv = this->private; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Lookup on marker file failed " + "while performing %s; entry gfid=%s", + gf_fop_string(local->fop), local->newloc.name); + goto err; + } + + linked_inode = inode_link(inode, priv->dot_shard_rm_inode, + local->newloc.name, buf); + inode_unref(local->newloc.inode); + local->newloc.inode = linked_inode; + shard_set_size_attrs_on_marker_file(frame, this); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, op_ret, op_errno); + return 0; +} + +int +shard_lookup_marker_file(call_frame_t *frame, xlator_t *this) +{ + int op_errno = ENOMEM; + dict_t *xattr_req = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) + goto err; + + STACK_WIND(frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); + dict_unref(xattr_req); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, op_errno); + return 0; +} + +int +shard_create_marker_file_under_remove_me_cbk( + call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + local = frame->local; + priv = this->private; + + SHARD_UNSET_ROOT_FS_ID(frame, local); + if (op_ret < 0) { + if ((op_errno != EEXIST) && (op_errno != ENODATA)) { + local->op_ret = op_ret; + local->op_errno = op_errno; + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Marker file creation " + "failed while performing %s; entry gfid=%s", + gf_fop_string(local->fop), local->newloc.name); + goto err; + } else { + shard_lookup_marker_file(frame, this); + return 0; + } + } + + linked_inode = inode_link(inode, priv->dot_shard_rm_inode, + local->newloc.name, buf); + inode_unref(local->newloc.inode); + local->newloc.inode = linked_inode; + + if (local->fop == GF_FOP_UNLINK) + shard_unlink_base_file(frame, this); + else if (local->fop == GF_FOP_RENAME) + shard_rename_src_base_file(frame, this); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; +} + +int +shard_create_marker_file_under_remove_me(call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + int ret = 0; + int op_errno = ENOMEM; + uint64_t bs = 0; + char g1[64] = { + 0, + }; + char g2[64] = { + 0, + }; + dict_t *xattr_req = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + SHARD_SET_ROOT_FS_ID(frame, local); + + xattr_req = shard_create_gfid_dict(local->xattr_req); + if (!xattr_req) + goto err; + + local->newloc.inode = inode_new(this->itable); + local->newloc.parent = 
inode_ref(priv->dot_shard_rm_inode); + ret = inode_path(local->newloc.parent, uuid_utoa(loc->inode->gfid), + (char **)&local->newloc.path); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, + "Inode path failed on " + "pargfid=%s bname=%s", + uuid_utoa_r(priv->dot_shard_rm_gfid, g1), + uuid_utoa_r(loc->inode->gfid, g2)); + goto err; + } + local->newloc.name = strrchr(local->newloc.path, '/'); + if (local->newloc.name) + local->newloc.name++; + + if (local->fop == GF_FOP_UNLINK) + bs = local->block_size; + else if (local->fop == GF_FOP_RENAME) + bs = local->dst_block_size; + + SHARD_INODE_CREATE_INIT(this, bs, xattr_req, &local->newloc, + local->prebuf.ia_size, 0, err); + + STACK_WIND(frame, shard_create_marker_file_under_remove_me_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + &local->newloc, 0, 0, 0644, xattr_req); + dict_unref(xattr_req); + return 0; + +err: + if (xattr_req) + dict_unref(xattr_req); + shard_create_marker_file_under_remove_me_cbk(frame, 0, this, -1, op_errno, + NULL, NULL, NULL, NULL, NULL); + return 0; +} + +int +shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); + +int +shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + int ret = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); + local->preoldparent = *preparent; + local->postoldparent = *postparent; + if (xdata) + local->xattr_rsp = dict_ref(xdata); + if (local->cleanup_required) + shard_start_background_deletion(this); + } + + if (local->entrylk_frame) { + ret = shard_unlock_entrylk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } + + ret = shard_unlock_inodelk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + + shard_unlink_cbk(frame, this); + return 0; +} + +int +shard_unlink_base_file(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = frame->local; + + /* To-Do: Request open-fd count on base file */ + STACK_WIND(frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, + local->xattr_req); + return 0; +} + +int +shard_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + if (op_ret) + gf_msg(this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, + "Unlock failed. 
Please check brick logs for " + "more details"); + SHARD_STACK_DESTROY(frame); + return 0; +} + +int +shard_unlock_entrylk(call_frame_t *frame, xlator_t *this) +{ + loc_t *loc = NULL; + call_frame_t *lk_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *lk_local = NULL; + shard_entrylk_t *lock = NULL; + + local = frame->local; + lk_frame = local->entrylk_frame; + lk_local = lk_frame->local; + local->entrylk_frame = NULL; + lock = &lk_local->int_entrylk; + loc = &lock->loc; + + STACK_WIND(lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, loc, + lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, + NULL); + local->int_entrylk.acquired_lock = _gf_false; + return 0; +} + +int +shard_post_entrylk_fop_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + switch (local->fop) { + case GF_FOP_UNLINK: + case GF_FOP_RENAME: + shard_create_marker_file_under_remove_me(frame, this, + &local->int_inodelk.loc); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "post-entrylk handler not defined. This case should not" + " be hit"); + break; + } + return 0; +} + +int +shard_acquire_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + call_frame_t *main_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *main_local = NULL; + + local = frame->local; + main_frame = local->main_frame; + main_local = main_frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(main_local->fop, main_frame, op_ret, + op_errno); + return 0; + } + main_local->int_entrylk.acquired_lock = _gf_true; + shard_post_entrylk_fop_handler(main_frame, this); + return 0; +} + +int +shard_acquire_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid) +{ + char gfid_str[GF_UUID_BUF_SIZE] = { + 0, + }; + shard_local_t *local = NULL; + shard_local_t *entrylk_local = NULL; + shard_entrylk_t *int_entrylk = NULL; + call_frame_t *entrylk_frame = NULL; + + local = frame->local; + entrylk_frame = create_frame(this, this->ctx->pool); + if (!entrylk_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create new frame " + "to lock marker file"); + goto err; + } + + entrylk_local = mem_get0(this->local_pool); + if (!entrylk_local) { + STACK_DESTROY(entrylk_frame->root); + goto err; + } + + entrylk_frame->local = entrylk_local; + entrylk_local->main_frame = frame; + int_entrylk = &entrylk_local->int_entrylk; + + int_entrylk->loc.inode = inode_ref(inode); + set_lk_owner_from_ptr(&entrylk_frame->root->lk_owner, entrylk_frame->root); + local->entrylk_frame = entrylk_frame; + gf_uuid_unparse(gfid, gfid_str); + int_entrylk->basename = gf_strdup(gfid_str); + + STACK_WIND(entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, this->name, &int_entrylk->loc, + int_entrylk->basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_lookup_base_shard_rm_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; + } + + if (local->prebuf.ia_nlink > 1) { + gf_msg_debug(this->name, 0, + "link count on %s > 1:%d, 
" + "performing rename()/unlink()", + local->int_inodelk.loc.path, local->prebuf.ia_nlink); + if (local->fop == GF_FOP_RENAME) + shard_rename_src_base_file(frame, this); + else if (local->fop == GF_FOP_UNLINK) + shard_unlink_base_file(frame, this); + } else { + gf_msg_debug(this->name, 0, + "link count on %s = 1, creating " + "file under .remove_me", + local->int_inodelk.loc.path); + local->cleanup_required = _gf_true; + shard_acquire_entrylk(frame, this, priv->dot_shard_rm_inode, + local->prebuf.ia_gfid); + } + return 0; +} + +int +shard_post_inodelk_fop_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + switch (local->fop) { + case GF_FOP_UNLINK: + case GF_FOP_RENAME: + shard_refresh_base_file(frame, this, &local->int_inodelk.loc, NULL, + shard_post_lookup_base_shard_rm_handler); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "post-inodelk handler not defined. This case should not" + " be hit"); + break; + } + return 0; +} + +int +shard_acquire_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + call_frame_t *main_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *main_local = NULL; + + local = frame->local; + main_frame = local->main_frame; + main_local = main_frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(main_local->fop, main_frame, op_ret, + op_errno); + return 0; + } + main_local->int_inodelk.acquired_lock = _gf_true; + shard_post_inodelk_fop_handler(main_frame, this); + return 0; +} + +int +shard_acquire_inodelk(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + call_frame_t *lk_frame = NULL; + shard_local_t *local = NULL; + shard_local_t *lk_local = NULL; + shard_inodelk_t *int_inodelk = NULL; + + local = frame->local; + lk_frame = create_frame(this, this->ctx->pool); + if (!lk_frame) { + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, SHARD_MSG_MEMALLOC_FAILED, + "Failed to create new frame " + "to lock base shard"); + goto err; + } + lk_local = mem_get0(this->local_pool); + if (!lk_local) { + STACK_DESTROY(lk_frame->root); + goto err; + } + + lk_frame->local = lk_local; + lk_local->main_frame = frame; + int_inodelk = &lk_local->int_inodelk; + + int_inodelk->flock.l_len = 0; + int_inodelk->flock.l_start = 0; + int_inodelk->domain = this->name; + int_inodelk->flock.l_type = F_WRLCK; + loc_copy(&local->int_inodelk.loc, loc); + set_lk_owner_from_ptr(&lk_frame->root->lk_owner, lk_frame->root); + local->inodelk_frame = lk_frame; + + STACK_WIND(lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, + &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); + return 0; +err: + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) +{ + loc_t *loc = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; + } + if (local->fop == GF_FOP_UNLINK) + loc = &local->loc; + else if (local->fop == GF_FOP_RENAME) + loc = &local->loc2; + shard_acquire_inodelk(frame, this, loc); + return 0; +} + +int +shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t handler, + shard_internal_dir_type_t type); +int +shard_pre_mkdir_rm_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = 
NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, -1, local->op_errno); + return 0; + } + shard_mkdir_internal_dir(frame, this, shard_post_mkdir_rm_handler, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); + return 0; +} + +void +shard_begin_rm_resolution(call_frame_t *frame, xlator_t *this) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + local->dot_shard_rm_loc.inode = inode_find(this->itable, + priv->dot_shard_rm_gfid); + if (!local->dot_shard_rm_loc.inode) { + local->dot_shard_loc.inode = inode_find(this->itable, + priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { + shard_mkdir_internal_dir(frame, this, shard_pre_mkdir_rm_handler, + SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_pre_mkdir_rm_handler; + shard_refresh_internal_dir(frame, this, + SHARD_INTERNAL_DIR_DOT_SHARD); + } + } else { + local->post_res_handler = shard_post_mkdir_rm_handler; + shard_refresh_internal_dir(frame, this, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); + } +} + +int +shard_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + loc_copy(&local->loc, loc); + local->xflag = xflag; + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + local->block_size = block_size; + local->resolver_base_inode = loc->inode; + local->fop = GF_FOP_UNLINK; + if (!this->itable) + this->itable = (local->loc.inode)->table; + + local->resolve_not = _gf_true; + shard_begin_rm_resolution(frame, this); + return 0; +err: + shard_common_failure_unwind(GF_FOP_UNLINK, frame, -1, ENOMEM); + return 0; +} + +int +shard_post_rename_lookup_handler(call_frame_t *frame, xlator_t *this) +{ + shard_rename_cbk(frame, this); + return 0; +} + +int +shard_rename_src_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) +{ + int ret = 0; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + /* Set ctx->refresh to TRUE to force a lookup on disk when + * shard_lookup_base_file() is called next to refresh the hard link + * count in ctx. Note that this is applicable only to the case where + * the rename dst is already existent and sharded. 
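+     * As an illustration: if the dst had nlink = 2 before this rename, a
+     * later unlink of its surviving link must observe nlink = 1 on the
+     * base file for shard cleanup to be triggered; the refresh flag makes
+     * that lookup go to disk instead of trusting the cached ctx stat.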
+ */ + if ((local->dst_block_size) && (!local->cleanup_required)) + shard_inode_ctx_set_refresh_flag(local->int_inodelk.loc.inode, this); + + local->prebuf = *buf; + local->preoldparent = *preoldparent; + local->postoldparent = *postoldparent; + local->prenewparent = *prenewparent; + local->postnewparent = *postnewparent; + if (xdata) + local->xattr_rsp = dict_ref(xdata); + + if (local->dst_block_size) { + if (local->entrylk_frame) { + ret = shard_unlock_entrylk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } + + ret = shard_unlock_inodelk(frame, this); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + goto err; + } + if (local->cleanup_required) + shard_start_background_deletion(this); + } + + /* Now the base file of src, if sharded, is looked up to gather ia_size + * and ia_blocks.*/ + if (local->block_size) { + local->tmp_loc.inode = inode_new(this->itable); + gf_uuid_copy(local->tmp_loc.gfid, (local->loc.inode)->gfid); + shard_refresh_base_file(frame, this, &local->tmp_loc, NULL, + shard_post_rename_lookup_handler); + } else { + shard_rename_cbk(frame, this); + } + return 0; +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int +shard_post_lookup_dst_base_file_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + /* Save dst base file attributes into postbuf so the information is not + * lost when it is overwritten after lookup on base file of src in + * shard_lookup_base_file_cbk(). + */ + local->postbuf = local->prebuf; + shard_rename_src_base_file(frame, this); + return 0; +} + +int +shard_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + uint64_t dst_block_size = 0; + shard_local_t *local = NULL; + + if (IA_ISDIR(oldloc->inode->ia_type)) { + STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(oldloc->inode, this, &block_size); + if ((ret) && (!IA_ISLNK(oldloc->inode->ia_type))) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size from inode ctx of %s", + uuid_utoa(oldloc->inode->gfid)); + goto err; + } + + if (newloc->inode) + ret = shard_inode_ctx_get_block_size(newloc->inode, this, + &dst_block_size); + + /* The following stack_wind covers the case where: + * a. the src file is not sharded and dst doesn't exist, OR + * b. the src and dst both exist but are not sharded. + */ + if (((!block_size) && (!dst_block_size)) || + frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_rename_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + loc_copy(&local->loc, oldloc); + loc_copy(&local->loc2, newloc); + local->resolver_base_inode = newloc->inode; + local->fop = GF_FOP_RENAME; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + local->block_size = block_size; + local->dst_block_size = dst_block_size; + if (!this->itable) + this->itable = (local->loc.inode)->table; + local->resolve_not = _gf_true; + + /* The following if-block covers the case where the dst file exists + * and is sharded. + */ + if (local->dst_block_size) { + shard_begin_rm_resolution(frame, this); + } else { + /* The following block covers the case where the dst either doesn't + * exist or is NOT sharded but the src is sharded. In this case, shard + * xlator would go ahead and rename src to dst. Once done, it would also + * lookup the base shard of src to get the ia_size and ia_blocks xattr + * values. + */ + shard_rename_src_base_file(frame, this); + } + return 0; + +err: + shard_common_failure_unwind(GF_FOP_RENAME, frame, -1, ENOMEM); + return 0; +} + +int +shard_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *stbuf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret == -1) + goto unwind; + + ret = shard_inode_ctx_set(inode, this, stbuf, local->block_size, + SHARD_ALL_MASK); + if (ret) + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INODE_CTX_SET_FAILED, + "Failed to set inode " + "ctx for %s", + uuid_utoa(inode->gfid)); + +unwind: + SHARD_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, + preparent, postparent, xdata); + return 0; +} + +int +shard_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + + priv = this->private; + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->block_size = priv->block_size; + + if (!__is_gsyncd_on_shard_dir(frame, loc)) { + SHARD_INODE_CREATE_INIT(this, local->block_size, xdata, loc, 0, 0, err); + } + + STACK_WIND(frame, shard_create_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, + xdata); + return 0; +err: + shard_common_failure_unwind(GF_FOP_CREATE, frame, -1, ENOMEM); + return 0; +} + +int +shard_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + /* To-Do: Handle open with O_TRUNC under locks */ + SHARD_STACK_UNWIND(open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +int +shard_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + STACK_WIND(frame, shard_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +} + +int +shard_readv_do_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + int i = 0; + int call_count = 0; + void *address = NULL; + uint64_t block_num = 0; + off_t off = 0; + struct iovec vec = { + 0, + }; + shard_local_t *local = NULL; + fd_t *anon_fd = cookie; + shard_inode_ctx_t *ctx = NULL; + + local = frame->local; + + /* If shard has already seen a failure here before, there is no point + * in aggregating subsequent reads, so just go to out. 
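+     * (local->op_ret accumulates the byte counts returned by each shard's
+     * read; once it turns negative, later replies merely drop their fd
+     * references and the last one to arrive unwinds the failure.)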
+ */
+    if (local->op_ret < 0)
+        goto out;
+
+    if (op_ret < 0) {
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+        goto out;
+    }
+
+    if (local->op_ret >= 0)
+        local->op_ret += op_ret;
+
+    shard_inode_ctx_get(anon_fd->inode, this, &ctx);
+    block_num = ctx->block_num;
+
+    if (block_num == local->first_block) {
+        address = local->iobuf->ptr;
+    } else {
+        /* else
+         * address to start writing to = beginning of buffer +
+         *                    number of bytes until end of first block +
+         *                    + block_size times number of blocks
+         *                    between the current block and the first
+         * e.g. with a 64MB block size and a read starting at offset 100MB,
+         * first_block = 1; the chunk from block 3 then begins
+         * (64 - 100 % 64) + (3 - 1 - 1) * 64 = 92MB into the buffer,
+         * which corresponds to file offset 192MB, the start of block 3.
+         */
+        address = (char *)local->iobuf->ptr +
+                  (local->block_size - (local->offset % local->block_size)) +
+                  ((block_num - local->first_block - 1) * local->block_size);
+    }
+
+    /* Copy the reply's iovecs sequentially into the aggregate buffer,
+     * advancing by the length of each iovec. */
+    for (i = 0; i < count; i++) {
+        memcpy((char *)address + off, vector[i].iov_base, vector[i].iov_len);
+        off += vector[i].iov_len;
+    }
+
+out:
+    if (anon_fd)
+        fd_unref(anon_fd);
+    call_count = shard_call_count_return(frame);
+    if (call_count == 0) {
+        SHARD_UNSET_ROOT_FS_ID(frame, local);
+        if (local->op_ret < 0) {
+            shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret,
+                                        local->op_errno);
+        } else {
+            if (xdata)
+                local->xattr_rsp = dict_ref(xdata);
+            vec.iov_base = local->iobuf->ptr;
+            if (local->offset + local->req_size > local->prebuf.ia_size)
+                local->total_size = local->prebuf.ia_size - local->offset;
+            vec.iov_len = local->total_size;
+            local->op_ret = local->total_size;
+            SHARD_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno,
+                               &vec, 1, &local->prebuf, local->iobref,
+                               local->xattr_rsp);
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+int
+shard_readv_do(call_frame_t *frame, xlator_t *this)
+{
+    int i = 0;
+    int call_count = 0;
+    int last_block = 0;
+    int cur_block = 0;
+    off_t orig_offset = 0;
+    off_t shard_offset = 0;
+    size_t read_size = 0;
+    size_t remaining_size = 0;
+    fd_t *fd = NULL;
+    fd_t *anon_fd = NULL;
+    shard_local_t *local = NULL;
+    gf_boolean_t wind_failed = _gf_false;
+
+    local = frame->local;
+    fd = local->fd;
+
+    orig_offset = local->offset;
+    cur_block = local->first_block;
+    last_block = local->last_block;
+    remaining_size = local->total_size;
+    local->call_count = call_count = local->num_blocks;
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    if (fd->flags & O_DIRECT)
+        local->flags = O_DIRECT;
+
+    while (cur_block <= last_block) {
+        if (wind_failed) {
+            shard_readv_do_cbk(frame, (void *)(long)0, this, -1, ENOMEM, NULL,
+                               0, NULL, NULL, NULL);
+            goto next;
+        }
+
+        shard_offset = orig_offset % local->block_size;
+        read_size = local->block_size - shard_offset;
+        if (read_size > remaining_size)
+            read_size = remaining_size;
+
+        remaining_size -= read_size;
+
+        if (cur_block == 0) {
+            anon_fd = fd_ref(fd);
+        } else {
+            anon_fd = fd_anonymous(local->inode_list[i]);
+            if (!anon_fd) {
+                local->op_ret = -1;
+                local->op_errno = ENOMEM;
+                wind_failed = _gf_true;
+                shard_readv_do_cbk(frame, (void *)(long)anon_fd, this, -1,
+                                   ENOMEM, NULL, 0, NULL, NULL, NULL);
+                goto next;
+            }
+        }
+
+        STACK_WIND_COOKIE(frame, shard_readv_do_cbk, anon_fd, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->readv, anon_fd, read_size,
+                          shard_offset, local->flags, local->xattr_req);
+
+        orig_offset += read_size;
+    next:
+        cur_block++;
+        i++;
+        call_count--;
+    }
+    return 0;
+}
+
+int
+shard_common_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, inode_t *inode,
+                       struct iatt *buf, struct iatt *preparent,
+                       struct iatt *postparent, dict_t *xdata)
+{
+    int shard_block_num = (long)cookie;
+    int call_count = 0;
+    shard_local_t *local = NULL;
+
+    local = frame->local;
+
+    if (op_ret < 0) {
+        if (op_errno == EEXIST) {
+            LOCK(&frame->lock);
+            {
+                local->eexist_count++;
+            }
+            UNLOCK(&frame->lock);
+        } else {
+            local->op_ret = op_ret;
+            local->op_errno = op_errno;
+        }
+        gf_msg_debug(this->name, 0,
+                     "mknod of shard %d "
+                     "failed: %s",
+                     shard_block_num, strerror(op_errno));
+        goto done;
+    }
+
+    shard_link_block_inode(local, shard_block_num, inode, buf);
+
+done:
+    call_count = shard_call_count_return(frame);
+    if (call_count == 0) {
+        SHARD_UNSET_ROOT_FS_ID(frame, local);
+        local->create_count = 0;
+        local->post_mknod_handler(frame, this);
+    }
+
+    return 0;
+}
+
+int
+shard_common_resume_mknod(call_frame_t *frame, xlator_t *this,
+                          shard_post_mknod_fop_handler_t post_mknod_handler)
+{
+    int i = 0;
+    int shard_idx_iter = 0;
+    int last_block = 0;
+    int ret = 0;
+    int call_count = 0;
+    char path[PATH_MAX] = {
+        0,
+    };
+    mode_t mode = 0;
+    char *bname = NULL;
+    shard_priv_t *priv = NULL;
+    shard_inode_ctx_t ctx_tmp = {
+        0,
+    };
+    shard_local_t *local = NULL;
+    gf_boolean_t wind_failed = _gf_false;
+    fd_t *fd = NULL;
+    loc_t loc = {
+        0,
+    };
+    dict_t *xattr_req = NULL;
+
+    local = frame->local;
+    priv = this->private;
+    fd = local->fd;
+    shard_idx_iter = local->first_block;
+    last_block = local->last_block;
+    call_count = local->call_count = local->create_count;
+    local->post_mknod_handler = post_mknod_handler;
+
+    SHARD_SET_ROOT_FS_ID(frame, local);
+
+    ret = shard_inode_ctx_get_all(fd->inode, this, &ctx_tmp);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED,
+               "Failed to get inode "
+               "ctx for %s",
+               uuid_utoa(fd->inode->gfid));
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        goto err;
+    }
+    mode = st_mode_from_ia(ctx_tmp.stat.ia_prot, ctx_tmp.stat.ia_type);
+
+    while (shard_idx_iter <= last_block) {
+        if (local->inode_list[i]) {
+            shard_idx_iter++;
+            i++;
+            continue;
+        }
+
+        if (wind_failed) {
+            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
+                                   -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+            goto next;
+        }
+
+        shard_make_block_abspath(shard_idx_iter, fd->inode->gfid, path,
+                                 sizeof(path));
+
+        xattr_req = shard_create_gfid_dict(local->xattr_req);
+        if (!xattr_req) {
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            wind_failed = _gf_true;
+            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
+                                   -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+            goto next;
+        }
+
+        bname = strrchr(path, '/') + 1;
+        loc.inode = inode_new(this->itable);
+        loc.parent = inode_ref(priv->dot_shard_inode);
+        ret = inode_path(loc.parent, bname, (char **)&(loc.path));
+        if (ret < 0 || !(loc.inode)) {
+            gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED,
+                   "Inode path failed "
+                   "on %s, base file gfid = %s",
+                   bname, uuid_utoa(fd->inode->gfid));
+            local->op_ret = -1;
+            local->op_errno = ENOMEM;
+            wind_failed = _gf_true;
+            loc_wipe(&loc);
+            dict_unref(xattr_req);
+            shard_common_mknod_cbk(frame, (void *)(long)shard_idx_iter, this,
+                                   -1, ENOMEM, NULL, NULL, NULL, NULL, NULL);
+            goto next;
+        }
+
+        loc.name = strrchr(loc.path, '/');
+        if (loc.name)
+            loc.name++;
+
+        STACK_WIND_COOKIE(frame, shard_common_mknod_cbk,
+                          (void *)(long)shard_idx_iter, FIRST_CHILD(this),
+                          FIRST_CHILD(this)->fops->mknod, &loc, mode,
+                          ctx_tmp.stat.ia_rdev, 0, xattr_req);
+        loc_wipe(&loc);
+        dict_unref(xattr_req);
+
+    next:
+        shard_idx_iter++;
+        i++;
+        if (!--call_count)
+            break;
+    }
+
+    return 0;
+err:
+    /*
+     * This block is for handling failure in shard_inode_ctx_get_all(). 
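+     * (local->op_ret/op_errno were set above, so post_mknod_handler
+     * observes the failure and unwinds the fop.)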
+ * Failures in the while-loop are handled within the loop. + */ + SHARD_UNSET_ROOT_FS_ID(frame, local); + post_mknod_handler(frame, this); + return 0; +} + +int +shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this); + +int +shard_post_lookup_shards_readv_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->create_count) { + shard_common_resume_mknod(frame, this, shard_post_mknod_readv_handler); + } else { + shard_readv_do(frame, this); + } + + return 0; +} + +int +shard_post_mknod_readv_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (!local->eexist_count) { + shard_readv_do(frame, this); + } else { + local->call_count = local->eexist_count; + shard_common_lookup_shards(frame, this, local->loc.inode, + shard_post_lookup_shards_readv_handler); + } + return 0; +} + +int +shard_post_resolve_readv_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + if (local->op_errno != ENOENT) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } else { + struct iovec vec = { + 0, + }; + + vec.iov_base = local->iobuf->ptr; + vec.iov_len = local->total_size; + local->op_ret = local->total_size; + SHARD_STACK_UNWIND(readv, frame, local->op_ret, 0, &vec, 1, + &local->prebuf, local->iobref, NULL); + return 0; + } + } + + if (local->call_count) { + shard_common_lookup_shards(frame, this, local->resolver_base_inode, + shard_post_lookup_shards_readv_handler); + } else { + shard_readv_do(frame, this); + } + + return 0; +} + +int +shard_post_lookup_readv_handler(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + struct iobuf *iobuf = NULL; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_READ, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->offset >= local->prebuf.ia_size) { + /* If the read is being performed past the end of the file, + * unwind the FOP with 0 bytes read as status. 
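+         * (e.g. for a file whose ia_size is 100 bytes, a readv at offset
+         * 4096 unwinds with op_ret = 0, matching read(2) at EOF.)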
+ */ + struct iovec vec = { + 0, + }; + + iobuf = iobuf_get2(this->ctx->iobuf_pool, local->req_size); + if (!iobuf) + goto err; + + vec.iov_base = iobuf->ptr; + vec.iov_len = 0; + local->iobref = iobref_new(); + iobref_add(local->iobref, iobuf); + iobuf_unref(iobuf); + + SHARD_STACK_UNWIND(readv, frame, 0, 0, &vec, 1, &local->prebuf, + local->iobref, NULL); + return 0; + } + + local->first_block = get_lowest_block(local->offset, local->block_size); + + local->total_size = local->req_size; + + local->last_block = get_highest_block(local->offset, local->total_size, + local->block_size); + + local->num_blocks = local->last_block - local->first_block + 1; + GF_ASSERT(local->num_blocks > 0); + local->resolver_base_inode = local->loc.inode; + + local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) + goto err; + + iobuf = iobuf_get2(this->ctx->iobuf_pool, local->total_size); + if (!iobuf) + goto err; + + local->iobref = iobref_new(); + if (!local->iobref) { + iobuf_unref(iobuf); + goto err; + } + + if (iobref_add(local->iobref, iobuf) != 0) { + iobuf_unref(iobuf); + goto err; + } + + memset(iobuf->ptr, 0, local->total_size); + iobuf_unref(iobuf); + local->iobuf = iobuf; + + local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { + ret = shard_init_internal_dir_loc(this, local, + SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto err; + shard_lookup_internal_dir(frame, this, shard_post_resolve_readv_handler, + SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_readv_handler; + shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; +err: + shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); + return 0; +} + +int +shard_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = 0; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size for %s from its inode ctx", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + /* block_size = 0 means that the file was created before + * sharding was enabled on the volume. + */ + STACK_WIND(frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, flags, + xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + ret = syncbarrier_init(&local->barrier); + if (ret) + goto err; + local->fd = fd_ref(fd); + local->block_size = block_size; + local->offset = offset; + local->req_size = size; + local->flags = flags; + local->fop = GF_FOP_READ; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_post_lookup_readv_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_READ, frame, -1, ENOMEM); + return 0; +} + +int +shard_common_inode_write_post_update_size_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + } else { + shard_common_inode_write_success_unwind(local->fop, frame, + local->written_size); + } + return 0; +} + +static gf_boolean_t +shard_is_appending_write(shard_local_t *local) +{ + if (local->fop != GF_FOP_WRITE) + return _gf_false; + if (local->flags & O_APPEND) + return _gf_true; + if (local->fd->flags & O_APPEND) + return _gf_true; + return _gf_false; +} + +int +__shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + if (shard_is_appending_write(local)) { + local->delta_size = local->total_size; + } else if (local->offset + local->total_size > ctx->stat.ia_size) { + local->delta_size = (local->offset + local->total_size) - + ctx->stat.ia_size; + } else { + local->delta_size = 0; + } + ctx->stat.ia_size += (local->delta_size); + local->postbuf = ctx->stat; + + return 0; +} + +int +shard_get_delta_size_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __shard_get_delta_size_from_inode_ctx(local, inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +shard_common_inode_write_do_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + int call_count = 0; + fd_t *anon_fd = cookie; + shard_local_t *local = NULL; + glusterfs_fop_t fop = 0; + + local = frame->local; + fop = local->fop; + + LOCK(&frame->lock); + { + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + local->written_size += op_ret; + GF_ATOMIC_ADD(local->delta_blocks, + post->ia_blocks - pre->ia_blocks); + local->delta_size += (post->ia_size - pre->ia_size); + shard_inode_ctx_set(local->fd->inode, this, post, 0, + SHARD_MASK_TIMES); + if (local->fd->inode != anon_fd->inode) + shard_inode_ctx_add_to_fsync_list(local->fd->inode, this, + anon_fd->inode); + } + } + UNLOCK(&frame->lock); + + if (anon_fd) + fd_unref(anon_fd); + + call_count = shard_call_count_return(frame); + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID(frame, local); + if (local->op_ret < 0) { + shard_common_failure_unwind(fop, frame, local->op_ret, + local->op_errno); + } else { + shard_get_delta_size_from_inode_ctx(local, local->fd->inode, this); + local->hole_size = 0; + if (xdata) + local->xattr_rsp = dict_ref(xdata); + shard_update_file_size( + frame, this, local->fd, NULL, + shard_common_inode_write_post_update_size_handler); + } + } + + return 0; +} + +int +shard_common_inode_write_wind(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vec, int count, off_t shard_offset, + size_t size) +{ + shard_local_t *local = NULL; + + local = frame->local; + + switch (local->fop) { 
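+        /* Dispatch one shard-sized slice of the write-class FOP. The
+         * caller, shard_common_inode_write_do(), has already clamped
+         * shard_offset and size to this shard's boundaries, so each case
+         * simply winds the corresponding FOP on the supplied shard fd. */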
+ case GF_FOP_WRITE: + STACK_WIND_COOKIE( + frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vec, count, shard_offset, + local->flags, local->iobref, local->xattr_req); + break; + case GF_FOP_FALLOCATE: + STACK_WIND_COOKIE( + frame, shard_common_inode_write_do_cbk, fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, local->flags, + shard_offset, size, local->xattr_req); + break; + case GF_FOP_ZEROFILL: + STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, + shard_offset, size, local->xattr_req); + break; + case GF_FOP_DISCARD: + STACK_WIND_COOKIE(frame, shard_common_inode_write_do_cbk, fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, + shard_offset, size, local->xattr_req); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", local->fop); + break; + } + return 0; +} + +int +shard_common_inode_write_do(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int count = 0; + int call_count = 0; + int last_block = 0; + uint32_t cur_block = 0; + fd_t *fd = NULL; + fd_t *anon_fd = NULL; + shard_local_t *local = NULL; + struct iovec *vec = NULL; + gf_boolean_t wind_failed = _gf_false; + gf_boolean_t odirect = _gf_false; + off_t orig_offset = 0; + off_t shard_offset = 0; + off_t vec_offset = 0; + size_t remaining_size = 0; + size_t shard_write_size = 0; + + local = frame->local; + fd = local->fd; + + orig_offset = local->offset; + remaining_size = local->total_size; + cur_block = local->first_block; + local->call_count = call_count = local->num_blocks; + last_block = local->last_block; + + SHARD_SET_ROOT_FS_ID(frame, local); + + if (dict_set_uint32(local->xattr_req, GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set " GLUSTERFS_WRITE_UPDATE_ATOMIC + " into " + "dict: %s", + uuid_utoa(fd->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + local->call_count = 1; + shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, + ENOMEM, NULL, NULL, NULL); + return 0; + } + + if ((fd->flags & O_DIRECT) && (local->fop == GF_FOP_WRITE)) + odirect = _gf_true; + + while (cur_block <= last_block) { + if (wind_failed) { + shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, -1, + ENOMEM, NULL, NULL, NULL); + goto next; + } + + shard_offset = orig_offset % local->block_size; + shard_write_size = local->block_size - shard_offset; + if (shard_write_size > remaining_size) + shard_write_size = remaining_size; + + remaining_size -= shard_write_size; + + if (local->fop == GF_FOP_WRITE) { + vec = NULL; + count = iov_subset(local->vector, local->count, vec_offset, + shard_write_size, &vec, 0); + if (count < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + shard_common_inode_write_do_cbk(frame, (void *)(long)0, this, + -1, ENOMEM, NULL, NULL, NULL); + goto next; + } + } + + if (cur_block == 0) { + anon_fd = fd_ref(fd); + } else { + anon_fd = fd_anonymous(local->inode_list[i]); + if (!anon_fd) { + local->op_ret = -1; + local->op_errno = ENOMEM; + wind_failed = _gf_true; + GF_FREE(vec); + shard_common_inode_write_do_cbk(frame, (void *)(long)anon_fd, + this, -1, ENOMEM, NULL, NULL, + NULL); + goto next; + } + + if (local->fop == GF_FOP_WRITE) { + if (odirect) + local->flags = O_DIRECT; + else + local->flags = GF_ANON_FD_FLAGS; + } + } + + shard_common_inode_write_wind(frame, this, 
anon_fd, vec, count, + shard_offset, shard_write_size); + if (vec) + vec_offset += shard_write_size; + orig_offset += shard_write_size; + GF_FREE(vec); + vec = NULL; + next: + cur_block++; + i++; + call_count--; + } + return 0; +} + +int +shard_common_inode_write_post_mknod_handler(call_frame_t *frame, + xlator_t *this); + +int +shard_common_inode_write_post_lookup_shards_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->create_count) { + shard_common_resume_mknod(frame, this, + shard_common_inode_write_post_mknod_handler); + } else { + shard_common_inode_write_do(frame, this); + } + + return 0; +} + +int +shard_common_inode_write_post_mknod_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (!local->eexist_count) { + shard_common_inode_write_do(frame, this); + } else { + local->call_count = local->eexist_count; + shard_common_lookup_shards( + frame, this, local->loc.inode, + shard_common_inode_write_post_lookup_shards_handler); + } + + return 0; +} + +int +shard_common_inode_write_post_resolve_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->call_count) { + shard_common_lookup_shards( + frame, this, local->resolver_base_inode, + shard_common_inode_write_post_lookup_shards_handler); + } else if (local->create_count) { + shard_common_inode_write_post_lookup_shards_handler(frame, this); + } else { + shard_common_inode_write_do(frame, this); + } + + return 0; +} + +int +shard_common_inode_write_post_lookup_handler(call_frame_t *frame, + xlator_t *this) +{ + shard_local_t *local = frame->local; + shard_priv_t *priv = this->private; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + local->postbuf = local->prebuf; + + /*Adjust offset to EOF so that correct shard is chosen for append*/ + if (shard_is_appending_write(local)) + local->offset = local->prebuf.ia_size; + + local->first_block = get_lowest_block(local->offset, local->block_size); + local->last_block = get_highest_block(local->offset, local->total_size, + local->block_size); + local->num_blocks = local->last_block - local->first_block + 1; + GF_ASSERT(local->num_blocks > 0); + local->inode_list = GF_CALLOC(local->num_blocks, sizeof(inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) { + shard_common_failure_unwind(local->fop, frame, -1, ENOMEM); + return 0; + } + + gf_msg_trace(this->name, 0, + "%s: gfid=%s first_block=%" PRIu64 + " " + "last_block=%" PRIu64 " num_blocks=%" PRIu64 " offset=%" PRId64 + " total_size=%zu flags=%" PRId32 "", + gf_fop_list[local->fop], + uuid_utoa(local->resolver_base_inode->gfid), + local->first_block, local->last_block, local->num_blocks, + local->offset, local->total_size, local->flags); + + local->dot_shard_loc.inode = inode_find(this->itable, priv->dot_shard_gfid); + + if (!local->dot_shard_loc.inode) { + /*change handler*/ + shard_mkdir_internal_dir(frame, this, + shard_common_inode_write_post_resolve_handler, + 
SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + /*change handler*/ + local->post_res_handler = shard_common_inode_write_post_resolve_handler; + shard_refresh_internal_dir(frame, this, SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; +} + +int +shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + inode_t *link_inode = NULL; + shard_local_t *local = NULL; + shard_internal_dir_type_t type = (shard_internal_dir_type_t)cookie; + + local = frame->local; + + SHARD_UNSET_ROOT_FS_ID(frame, local); + + if (op_ret == -1) { + if (op_errno != EEXIST) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } else { + gf_msg_debug(this->name, 0, + "mkdir on %s failed " + "with EEXIST. Attempting lookup now", + shard_internal_dir_string(type)); + shard_lookup_internal_dir(frame, this, local->post_res_handler, + type); + return 0; + } + } + + link_inode = shard_link_internal_dir_inode(local, inode, buf, type); + if (link_inode != inode) { + shard_refresh_internal_dir(frame, this, type); + } else { + shard_inode_ctx_mark_dir_refreshed(link_inode, this); + shard_common_resolve_shards(frame, this, local->post_res_handler); + } + return 0; +unwind: + shard_common_resolve_shards(frame, this, local->post_res_handler); + return 0; +} + +int +shard_mkdir_internal_dir(call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t handler, + shard_internal_dir_type_t type) +{ + int ret = -1; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + dict_t *xattr_req = NULL; + uuid_t *gfid = NULL; + loc_t *loc = NULL; + gf_boolean_t free_gfid = _gf_true; + + local = frame->local; + priv = this->private; + + local->post_res_handler = handler; + gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_uuid_t); + if (!gfid) + goto err; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: + gf_uuid_copy(*gfid, priv->dot_shard_gfid); + loc = &local->dot_shard_loc; + break; + case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: + gf_uuid_copy(*gfid, priv->dot_shard_rm_gfid); + loc = &local->dot_shard_rm_loc; + break; + default: + bzero(*gfid, sizeof(uuid_t)); + break; + } + + xattr_req = dict_new(); + if (!xattr_req) + goto err; + + ret = shard_init_internal_dir_loc(this, local, type); + if (ret) + goto err; + + ret = dict_set_gfuuid(xattr_req, "gfid-req", *gfid, false); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set gfid-req for %s", + shard_internal_dir_string(type)); + goto err; + } else { + free_gfid = _gf_false; + } + + SHARD_SET_ROOT_FS_ID(frame, local); + + STACK_WIND_COOKIE(frame, shard_mkdir_internal_dir_cbk, (void *)(long)type, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc, + 0755, 0, xattr_req); + dict_unref(xattr_req); + return 0; + +err: + if (xattr_req) + dict_unref(xattr_req); + local->op_ret = -1; + local->op_errno = ENOMEM; + if (free_gfid) + GF_FREE(gfid); + handler(frame, this); + return 0; +} + +int +shard_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + /* To-Do: Wind flush on all shards of the file */ + SHARD_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata); + return 0; +} + +int +shard_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + STACK_WIND(frame, shard_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; +} + +int 
+__shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = -1; + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get(inode, this, &ctx_uint); + if (ret < 0) + return ret; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + local->postbuf.ia_ctime = ctx->stat.ia_ctime; + local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; + local->postbuf.ia_atime = ctx->stat.ia_atime; + local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; + local->postbuf.ia_mtime = ctx->stat.ia_mtime; + local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; + + return 0; +} + +int +shard_get_timestamps_from_inode_ctx(shard_local_t *local, inode_t *inode, + xlator_t *this) +{ + int ret = 0; + + LOCK(&inode->lock); + { + ret = __shard_get_timestamps_from_inode_ctx(local, inode, this); + } + UNLOCK(&inode->lock); + + return ret; +} + +int +shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int call_count = 0; + uint64_t fsync_count = 0; + fd_t *anon_fd = cookie; + shard_local_t *local = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *base_ictx = NULL; + inode_t *base_inode = NULL; + gf_boolean_t unref_shard_inode = _gf_false; + + local = frame->local; + base_inode = local->fd->inode; + + if (local->op_ret < 0) + goto out; + + LOCK(&frame->lock); + { + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + UNLOCK(&frame->lock); + goto out; + } + shard_inode_ctx_set(local->fd->inode, this, postbuf, 0, + SHARD_MASK_TIMES); + } + UNLOCK(&frame->lock); + fd_ctx_get(anon_fd, this, &fsync_count); +out: + if (anon_fd && (base_inode != anon_fd->inode)) { + LOCK(&base_inode->lock); + LOCK(&anon_fd->inode->lock); + { + __shard_inode_ctx_get(anon_fd->inode, this, &ctx); + __shard_inode_ctx_get(base_inode, this, &base_ictx); + if (op_ret == 0) + ctx->fsync_needed -= fsync_count; + GF_ASSERT(ctx->fsync_needed >= 0); + if (ctx->fsync_needed != 0) { + list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list); + base_ictx->fsync_count++; + } else { + unref_shard_inode = _gf_true; + } + } + UNLOCK(&anon_fd->inode->lock); + UNLOCK(&base_inode->lock); + } + + if (unref_shard_inode) + inode_unref(anon_fd->inode); + if (anon_fd) + fd_unref(anon_fd); + + call_count = shard_call_count_return(frame); + if (call_count != 0) + return 0; + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, + local->op_errno); + } else { + shard_get_timestamps_from_inode_ctx(local, base_inode, this); + SHARD_STACK_UNWIND(fsync, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, local->xattr_rsp); + } + return 0; +} + +int +shard_post_lookup_fsync_handler(call_frame_t *frame, xlator_t *this) +{ + int ret = 0; + int call_count = 0; + int fsync_count = 0; + fd_t *anon_fd = NULL; + inode_t *base_inode = NULL; + shard_local_t *local = NULL; + shard_inode_ctx_t *ctx = NULL; + shard_inode_ctx_t *iter = NULL; + struct list_head copy = { + 0, + }; + shard_inode_ctx_t *tmp = NULL; + + local = frame->local; + base_inode = local->fd->inode; + local->postbuf = local->prebuf; + INIT_LIST_HEAD(©); + + if (local->op_ret < 0) { + shard_common_failure_unwind(GF_FOP_FSYNC, frame, local->op_ret, + local->op_errno); + return 0; + } + + LOCK(&base_inode->lock); + { + __shard_inode_ctx_get(base_inode, this, &ctx); + list_splice_init(&ctx->to_fsync_list, ©); + call_count = 
ctx->fsync_count; + ctx->fsync_count = 0; + } + UNLOCK(&base_inode->lock); + + local->call_count = ++call_count; + + /* Send fsync() on the base shard first */ + anon_fd = fd_ref(local->fd); + STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, anon_fd, local->datasync, + local->xattr_req); + call_count--; + anon_fd = NULL; + + list_for_each_entry_safe(iter, tmp, ©, to_fsync_list) + { + list_del_init(&iter->to_fsync_list); + fsync_count = 0; + shard_inode_ctx_get_fsync_count(iter->inode, this, &fsync_count); + GF_ASSERT(fsync_count > 0); + anon_fd = fd_anonymous(iter->inode); + if (!anon_fd) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_WARNING, ENOMEM, + SHARD_MSG_MEMALLOC_FAILED, + "Failed to create " + "anon fd to fsync shard"); + shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, + ENOMEM, NULL, NULL, NULL); + continue; + } + + ret = fd_ctx_set(anon_fd, this, fsync_count); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_FD_CTX_SET_FAILED, + "Failed to set fd " + "ctx for shard inode gfid=%s", + uuid_utoa(iter->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + shard_fsync_shards_cbk(frame, (void *)(long)anon_fd, this, -1, + ENOMEM, NULL, NULL, NULL); + continue; + } + STACK_WIND_COOKIE(frame, shard_fsync_shards_cbk, anon_fd, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + anon_fd, local->datasync, local->xattr_req); + call_count--; + } + + return 0; +} + +int +shard_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + int ret = 0; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size for %s from its inode ctx", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->fd = fd_ref(fd); + local->fop = GF_FOP_FSYNC; + local->datasync = datasync; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_post_lookup_fsync_handler); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FSYNC, frame, -1, ENOMEM); + return 0; +} + +int +shard_readdir_past_dot_shard_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) + { + list_del_init(&entry->list); + list_add_tail(&entry->list, &local->entries_head.list); + + if (!entry->dict) + continue; + + if (IA_ISDIR(entry->d_stat.ia_type)) + continue; + + if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE)) + shard_modify_size_and_block_count(&entry->d_stat, entry->dict); + if (!entry->inode) + continue; + + shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); + } + local->op_ret += op_ret; + +unwind: + if (local->fop == GF_FOP_READDIR) + SHARD_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, + &local->entries_head, xdata); + else + SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, + &local->entries_head, xdata); + return 0; +} + +int32_t +shard_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *orig_entries, + dict_t *xdata) +{ + fd_t *fd = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + shard_local_t *local = NULL; + gf_boolean_t last_entry = _gf_false; + + local = frame->local; + fd = local->fd; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry_safe(entry, tmp, (&orig_entries->list), list) + { + if (last_entry) + last_entry = _gf_false; + + if (__is_root_gfid(fd->inode->gfid) && + !(strcmp(entry->d_name, GF_SHARD_DIR))) { + local->offset = entry->d_off; + op_ret--; + last_entry = _gf_true; + continue; + } + + list_del_init(&entry->list); + list_add_tail(&entry->list, &local->entries_head.list); + + if (!entry->dict) + continue; + + if (IA_ISDIR(entry->d_stat.ia_type)) + continue; + + if (dict_get(entry->dict, GF_XATTR_SHARD_FILE_SIZE) && + frame->root->pid != GF_CLIENT_PID_GSYNCD) + shard_modify_size_and_block_count(&entry->d_stat, entry->dict); + + if (!entry->inode) + continue; + + shard_inode_ctx_update(entry->inode, this, entry->dict, &entry->d_stat); + } + + local->op_ret = op_ret; + + if (last_entry) { + if (local->fop == GF_FOP_READDIR) + STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, + local->fd, local->readdir_size, local->offset, + local->xattr_req); + else + STACK_WIND(frame, shard_readdir_past_dot_shard_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, + local->fd, local->readdir_size, local->offset, + local->xattr_req); + return 0; + } + +unwind: + if (local->fop == GF_FOP_READDIR) + SHARD_STACK_UNWIND(readdir, frame, op_ret, op_errno, + &local->entries_head, xdata); + else + SHARD_STACK_UNWIND(readdirp, frame, op_ret, op_errno, + &local->entries_head, xdata); + return 0; +} + +int +shard_readdir_do(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *xdata) +{ + int ret = 0; + shard_local_t *local = NULL; + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } + + 
frame->local = local; + + local->fd = fd_ref(fd); + local->fop = whichop; + local->readdir_size = size; + INIT_LIST_HEAD(&local->entries_head.list); + local->list_inited = _gf_true; + + if (whichop == GF_FOP_READDIR) { + STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, fd, size, offset, xdata); + } else { + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, + local, err); + ret = dict_set_uint64(local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, + "Failed to set " + "dict value: key:%s, directory gfid=%s", + GF_XATTR_SHARD_BLOCK_SIZE, uuid_utoa(fd->inode->gfid)); + goto err; + } + + STACK_WIND(frame, shard_readdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, offset, + local->xattr_req); + } + + return 0; + +err: + STACK_UNWIND_STRICT(readdir, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +int32_t +shard_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; +} + +int32_t +shard_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + shard_readdir_do(frame, this, fd, size, offset, GF_FOP_READDIRP, xdata); + return 0; +} + +int32_t +shard_modify_and_set_iatt_in_dict(dict_t *xdata, shard_local_t *local, + char *key) +{ + int ret = 0; + struct iatt *tmpbuf = NULL; + struct iatt *stbuf = NULL; + data_t *data = NULL; + + if (!xdata) + return 0; + + data = dict_get(xdata, key); + if (!data) + return 0; + + tmpbuf = data_to_iatt(data, key); + stbuf = GF_MALLOC(sizeof(struct iatt), gf_common_mt_char); + if (stbuf == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + *stbuf = *tmpbuf; + stbuf->ia_size = local->prebuf.ia_size; + stbuf->ia_blocks = local->prebuf.ia_blocks; + ret = dict_set_iatt(xdata, key, stbuf, false); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; + } + return 0; + +err: + GF_FREE(stbuf); + return -1; +} + +int32_t +shard_common_remove_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_PRESTAT); + if (ret < 0) + goto err; + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_POSTSTAT); + if (ret < 0) + goto err; + + if (local->fd) + SHARD_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno, + xdata); + else + SHARD_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno, + xdata); + return 0; + +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int32_t +shard_post_lookup_remove_xattr_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->fd) + STACK_WIND(frame, shard_common_remove_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, local->fd, + local->name, local->xattr_req); + else + STACK_WIND(frame, shard_common_remove_xattr_cbk, FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->removexattr, &local->loc, + local->name, local->xattr_req); + return 0; +} + +int32_t +shard_common_remove_xattr(call_frame_t *frame, xlator_t *this, + glusterfs_fop_t fop, loc_t *loc, fd_t *fd, + const char *name, dict_t *xdata) +{ + int ret = -1; + int op_errno = ENOMEM; + uint64_t block_size = 0; + shard_local_t *local = NULL; + inode_t *inode = loc ? loc->inode : fd->inode; + + if ((IA_ISDIR(inode->ia_type)) || (IA_ISLNK(inode->ia_type))) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, + xdata); + return 0; + } + + /* If shard's special xattrs are attempted to be removed, + * fail the fop with EPERM (except if the client is gsyncd). + */ + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + GF_IF_NATIVE_XATTR_GOTO(SHARD_XATTR_PREFIX "*", name, op_errno, err); + } + + /* Repeat the same check for bulk-removexattr */ + if (xdata && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { + dict_del(xdata, GF_XATTR_SHARD_BLOCK_SIZE); + dict_del(xdata, GF_XATTR_SHARD_FILE_SIZE); + } + + ret = shard_inode_ctx_get_block_size(inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, + xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->fop = fop; + if (loc) { + if (loc_copy(&local->loc, loc) != 0) + goto err; + } + + if (fd) { + local->fd = fd_ref(fd); + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + } + + if (name) { + local->name = gf_strdup(name); + if (!local->name) + goto err; + } + + if (xdata) + local->xattr_req = dict_ref(xdata); + + shard_refresh_base_file(frame, this, loc, fd, + shard_post_lookup_remove_xattr_handler); + return 0; +err: + shard_common_failure_unwind(fop, frame, -1, op_errno); + return 0; +} + +int32_t +shard_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + shard_common_remove_xattr(frame, this, GF_FOP_REMOVEXATTR, loc, NULL, name, + xdata); + return 0; +} + +int32_t +shard_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + shard_common_remove_xattr(frame, this, GF_FOP_FREMOVEXATTR, NULL, fd, name, + xdata); + return 0; +} + +int32_t +shard_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + if (op_ret < 0) + goto unwind; + + if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { + dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); + dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); + } + +unwind: + SHARD_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +shard_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) +{ + int op_errno = EINVAL; + + if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && + (!strncmp(name, SHARD_XATTR_PREFIX, SLEN(SHARD_XATTR_PREFIX)))) { + op_errno = ENODATA; + goto 
out; + } + + STACK_WIND(frame, shard_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; +out: + shard_common_failure_unwind(GF_FOP_FGETXATTR, frame, -1, op_errno); + return 0; +} + +int32_t +shard_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + if (op_ret < 0) + goto unwind; + + if (dict && (frame->root->pid != GF_CLIENT_PID_GSYNCD)) { + dict_del(dict, GF_XATTR_SHARD_BLOCK_SIZE); + dict_del(dict, GF_XATTR_SHARD_FILE_SIZE); + } + +unwind: + SHARD_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; +} + +int32_t +shard_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + int op_errno = EINVAL; + + if ((frame->root->pid != GF_CLIENT_PID_GSYNCD) && (name) && + (!strncmp(name, SHARD_XATTR_PREFIX, sizeof(SHARD_XATTR_PREFIX) - 1))) { + op_errno = ENODATA; + goto out; + } + + STACK_WIND(frame, shard_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; +out: + shard_common_failure_unwind(GF_FOP_GETXATTR, frame, -1, op_errno); + return 0; +} + +int32_t +shard_common_set_xattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + int ret = -1; + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto err; + } + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_PRESTAT); + if (ret < 0) + goto err; + + ret = shard_modify_and_set_iatt_in_dict(xdata, local, GF_POSTSTAT); + if (ret < 0) + goto err; + + if (local->fd) + SHARD_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno, + xdata); + else + SHARD_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno, + xdata); + return 0; + +err: + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; +} + +int32_t +shard_post_lookup_set_xattr_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { + shard_common_failure_unwind(local->fop, frame, local->op_ret, + local->op_errno); + return 0; + } + + if (local->fd) + STACK_WIND(frame, shard_common_set_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, local->fd, + local->xattr_req, local->flags, local->xattr_rsp); + else + STACK_WIND(frame, shard_common_set_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, &local->loc, + local->xattr_req, local->flags, local->xattr_rsp); + return 0; +} + +int32_t +shard_common_set_xattr(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + loc_t *loc, fd_t *fd, dict_t *dict, int32_t flags, + dict_t *xdata) +{ + int ret = -1; + int op_errno = ENOMEM; + uint64_t block_size = 0; + shard_local_t *local = NULL; + inode_t *inode = loc ? loc->inode : fd->inode; + + if ((IA_ISDIR(inode->ia_type)) || (IA_ISLNK(inode->ia_type))) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, + xdata); + return 0; + } + + /* Sharded or not, if shard's special xattrs are attempted to be set, + * fail the fop with EPERM (except if the client is gsyncd. 
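+     * (e.g. an application setxattr of trusted.glusterfs.shard.file-size
+     * would corrupt size/block accounting; gsyncd alone is exempt so
+     * geo-replication can carry these xattrs to the slave volume.)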
+ */ + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + GF_IF_INTERNAL_XATTR_GOTO(SHARD_XATTR_PREFIX "*", dict, op_errno, err); + } + + ret = shard_inode_ctx_get_block_size(inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + if (loc) + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + else + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, + xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + local->fop = fop; + if (loc) { + if (loc_copy(&local->loc, loc) != 0) + goto err; + } + + if (fd) { + local->fd = fd_ref(fd); + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + } + local->flags = flags; + /* Reusing local->xattr_req and local->xattr_rsp to store the setxattr dict + * and the xdata dict + */ + if (dict) + local->xattr_req = dict_ref(dict); + if (xdata) + local->xattr_rsp = dict_ref(xdata); + + shard_refresh_base_file(frame, this, loc, fd, + shard_post_lookup_set_xattr_handler); + return 0; +err: + shard_common_failure_unwind(fop, frame, -1, op_errno); + return 0; +} + +int32_t +shard_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + shard_common_set_xattr(frame, this, GF_FOP_FSETXATTR, NULL, fd, dict, flags, + xdata); + return 0; +} + +int32_t +shard_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + shard_common_set_xattr(frame, this, GF_FOP_SETXATTR, loc, NULL, dict, flags, + xdata); + return 0; +} + +int +shard_post_setattr_handler(call_frame_t *frame, xlator_t *this) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (local->fop == GF_FOP_SETATTR) { + if (local->op_ret >= 0) + shard_inode_ctx_set(local->loc.inode, this, &local->postbuf, 0, + SHARD_LOOKUP_MASK); + SHARD_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, local->xattr_rsp); + } else if (local->fop == GF_FOP_FSETATTR) { + if (local->op_ret >= 0) + shard_inode_ctx_set(local->fd->inode, this, &local->postbuf, 0, + SHARD_LOOKUP_MASK); + SHARD_STACK_UNWIND(fsetattr, frame, local->op_ret, local->op_errno, + &local->prebuf, &local->postbuf, local->xattr_rsp); + } + + return 0; +} + +int +shard_common_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + shard_local_t *local = NULL; + + local = frame->local; + + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + local->prebuf = *prebuf; + if (shard_modify_size_and_block_count(&local->prebuf, xdata)) { + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + if (xdata) + local->xattr_rsp = dict_ref(xdata); + local->postbuf = *postbuf; + local->postbuf.ia_size = local->prebuf.ia_size; + local->postbuf.ia_blocks = local->prebuf.ia_blocks; + +unwind: + local->handler(frame, this); + return 0; +} + +int +shard_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if 
((IA_ISDIR(loc->inode->ia_type)) || (IA_ISLNK(loc->inode->ia_type))) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(loc->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(loc->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); + return 0; + } + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_setattr_handler; + local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->fop = GF_FOP_SETATTR; + loc_copy(&local->loc, loc); + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, local->loc.gfid, + local, err); + + STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, + local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_SETATTR, frame, -1, ENOMEM); + return 0; +} + +int +shard_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + int ret = -1; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + if ((IA_ISDIR(fd->inode->ia_type)) || (IA_ISLNK(fd->inode->ia_type))) { + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; + } + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block size from inode ctx of %s", + uuid_utoa(fd->inode->gfid)); + goto err; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + STACK_WIND(frame, default_fsetattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto err; + + frame->local = local; + + local->handler = shard_post_setattr_handler; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto err; + local->fop = GF_FOP_FSETATTR; + local->fd = fd_ref(fd); + + SHARD_MD_READ_FOP_INIT_REQ_DICT(this, local->xattr_req, fd->inode->gfid, + local, err); + + STACK_WIND(frame, shard_common_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, + local->xattr_req); + return 0; +err: + shard_common_failure_unwind(GF_FOP_FSETATTR, frame, -1, ENOMEM); + return 0; +} + +int +shard_common_inode_write_begin(call_frame_t *frame, xlator_t *this, + glusterfs_fop_t fop, fd_t *fd, + struct iovec *vector, int32_t count, + off_t offset, uint32_t flags, size_t len, + struct iobref *iobref, dict_t *xdata) +{ + int ret = 0; + int i = 0; + uint64_t block_size = 0; + shard_local_t *local = NULL; + + ret = shard_inode_ctx_get_block_size(fd->inode, this, &block_size); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_CTX_GET_FAILED, + "Failed to get block " + "size for %s from its inode ctx", + uuid_utoa(fd->inode->gfid)); + goto out; + } + + if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { + /* block_size = 0 means that the file was created before + * sharding was enabled on the volume. + */ + switch (fop) { + case GF_FOP_WRITE: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, + count, offset, flags, iobref, xdata); + break; + case GF_FOP_FALLOCATE: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, flags, + offset, len, xdata); + break; + case GF_FOP_ZEROFILL: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, + len, xdata); + break; + case GF_FOP_DISCARD: + STACK_WIND_TAIL(frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, + len, xdata); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", fop); + break; + } + return 0; + } + + if (!this->itable) + this->itable = fd->inode->table; + + local = mem_get0(this->local_pool); + if (!local) + goto out; + + frame->local = local; + + ret = syncbarrier_init(&local->barrier); + if (ret) + goto out; + local->xattr_req = (xdata) ? 
dict_ref(xdata) : dict_new(); + if (!local->xattr_req) + goto out; + + if (vector) { + local->vector = iov_dup(vector, count); + if (!local->vector) + goto out; + for (i = 0; i < count; i++) + local->total_size += vector[i].iov_len; + local->count = count; + } else { + local->total_size = len; + } + + local->fop = fop; + local->offset = offset; + local->flags = flags; + if (iobref) + local->iobref = iobref_ref(iobref); + local->fd = fd_ref(fd); + local->block_size = block_size; + local->resolver_base_inode = local->fd->inode; + GF_ATOMIC_INIT(local->delta_blocks, 0); + + local->loc.inode = inode_ref(fd->inode); + gf_uuid_copy(local->loc.gfid, fd->inode->gfid); + + shard_refresh_base_file(frame, this, NULL, fd, + shard_common_inode_write_post_lookup_handler); + return 0; +out: + shard_common_failure_unwind(fop, frame, -1, ENOMEM); + return 0; +} + +int +shard_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + shard_common_inode_write_begin(frame, this, GF_FOP_WRITE, fd, vector, count, + offset, flags, 0, iobref, xdata); + return 0; +} + +int +shard_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t keep_size, off_t offset, size_t len, dict_t *xdata) +{ + if ((keep_size != 0) && (keep_size != FALLOC_FL_ZERO_RANGE) && + (keep_size != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))) + goto out; + + shard_common_inode_write_begin(frame, this, GF_FOP_FALLOCATE, fd, NULL, 0, + offset, keep_size, len, NULL, xdata); + return 0; +out: + shard_common_failure_unwind(GF_FOP_FALLOCATE, frame, -1, ENOTSUP); + return 0; +} + +int +shard_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + shard_common_inode_write_begin(frame, this, GF_FOP_ZEROFILL, fd, NULL, 0, + offset, 0, len, NULL, xdata); + return 0; +} + +int +shard_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + shard_common_inode_write_begin(frame, this, GF_FOP_DISCARD, fd, NULL, 0, + offset, 0, len, NULL, xdata); + return 0; +} + +int32_t +shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) +{ + /* TBD */ + gf_msg(this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, + "seek called on %s.", uuid_utoa(fd->inode->gfid)); + shard_common_failure_unwind(GF_FOP_SEEK, frame, -1, ENOTSUP); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init(this, gf_shard_mt_end + 1); + + if (ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_MEM_ACCT_INIT_FAILED, + "Memory accounting init failed"); + return ret; + } + + return ret; +} + +int +init(xlator_t *this) +{ + int ret = -1; + shard_priv_t *priv = NULL; + + if (!this) { + gf_msg("shard", GF_LOG_ERROR, 0, SHARD_MSG_NULL_THIS, + "this is NULL. init() failed"); + return -1; + } + + if (!this->parents) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, + "Dangling volume. Check volfile"); + goto out; + } + + if (!this->children || this->children->next) { + gf_msg(this->name, GF_LOG_ERROR, 0, SHARD_MSG_INVALID_VOLFILE, + "shard not configured with exactly one sub-volume. " + "Check volfile"); + goto out; + } + + priv = GF_CALLOC(1, sizeof(shard_priv_t), gf_shard_mt_priv_t); + if (!priv) + goto out; + + GF_OPTION_INIT("shard-block-size", priv->block_size, size_uint64, out); + + GF_OPTION_INIT("shard-deletion-rate", priv->deletion_rate, uint32, out); + + GF_OPTION_INIT("shard-lru-limit", priv->lru_limit, uint64, out); + + this->local_pool = mem_pool_new(shard_local_t, 128); + if (!this->local_pool) { + ret = -1; + goto out; + } + gf_uuid_parse(SHARD_ROOT_GFID, priv->dot_shard_gfid); + gf_uuid_parse(DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); + + this->private = priv; + LOCK_INIT(&priv->lock); + INIT_LIST_HEAD(&priv->ilist_head); + ret = 0; +out: + if (ret) { + GF_FREE(priv); + mem_pool_destroy(this->local_pool); + } + + return ret; +} + +void +fini(xlator_t *this) +{ + shard_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("shard", this, out); + + /* Itable was not created by shard, hence setting it to NULL. */ + this->itable = NULL; + + mem_pool_destroy(this->local_pool); + this->local_pool = NULL; + + priv = this->private; + if (!priv) + goto out; + + this->private = NULL; + LOCK_DESTROY(&priv->lock); + GF_FREE(priv); + +out: + return; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + int ret = -1; + shard_priv_t *priv = NULL; + + priv = this->private; + + GF_OPTION_RECONF("shard-block-size", priv->block_size, options, size, out); + + GF_OPTION_RECONF("shard-deletion-rate", priv->deletion_rate, options, + uint32, out); + ret = 0; + +out: + return ret; +} + +int +shard_forget(xlator_t *this, inode_t *inode) +{ + uint64_t ctx_uint = 0; + shard_inode_ctx_t *ctx = NULL; + shard_priv_t *priv = NULL; + + priv = this->private; + if (!priv) + return 0; + + inode_ctx_del(inode, this, &ctx_uint); + if (!ctx_uint) + return 0; + + ctx = (shard_inode_ctx_t *)(uintptr_t)ctx_uint; + + /* When the LRU limit is reached, the inode is forcefully removed from the + * inode table; it must be removed from shard's LRU list as well.
+ */ + if (!list_empty(&ctx->ilist)) { + LOCK(&priv->lock); + { + list_del_init(&ctx->ilist); + priv->inode_count--; + } + UNLOCK(&priv->lock); + } + GF_FREE(ctx); + + return 0; +} + +int +shard_release(xlator_t *this, fd_t *fd) +{ + /* TBD */ + return 0; +} + +int +shard_priv_dump(xlator_t *this) +{ + shard_priv_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; + char *str = NULL; + + priv = this->private; + + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + str = gf_uint64_2human_readable(priv->block_size); + gf_proc_dump_write("shard-block-size", "%s", str); + gf_proc_dump_write("inode-count", "%d", priv->inode_count); + gf_proc_dump_write("ilist_head", "%p", &priv->ilist_head); + gf_proc_dump_write("lru-max-limit", "%" PRIu64, priv->lru_limit); + + GF_FREE(str); + + return 0; +} + +int +shard_releasedir(xlator_t *this, fd_t *fd) +{ + return 0; +} + +struct xlator_fops fops = { + .lookup = shard_lookup, + .open = shard_open, + .flush = shard_flush, + .fsync = shard_fsync, + .stat = shard_stat, + .fstat = shard_fstat, + .getxattr = shard_getxattr, + .fgetxattr = shard_fgetxattr, + .readv = shard_readv, + .writev = shard_writev, + .truncate = shard_truncate, + .ftruncate = shard_ftruncate, + .setxattr = shard_setxattr, + .fsetxattr = shard_fsetxattr, + .setattr = shard_setattr, + .fsetattr = shard_fsetattr, + .removexattr = shard_removexattr, + .fremovexattr = shard_fremovexattr, + .fallocate = shard_fallocate, + .discard = shard_discard, + .zerofill = shard_zerofill, + .readdir = shard_readdir, + .readdirp = shard_readdirp, + .create = shard_create, + .mknod = shard_mknod, + .link = shard_link, + .unlink = shard_unlink, + .rename = shard_rename, + .seek = shard_seek, +}; + +struct xlator_cbks cbks = { + .forget = shard_forget, + .release = shard_release, + .releasedir = shard_releasedir, +}; + +struct xlator_dumpops dumpops = { + .priv = shard_priv_dump, +}; + +struct volume_options options[] = { + { + .key = {"shard"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable shard", + .op_version = {GD_OP_VERSION_6_0}, + .flags = OPT_FLAG_SETTABLE, + }, + { + .key = {"shard-block-size"}, + .type = GF_OPTION_TYPE_SIZET, + .op_version = {GD_OP_VERSION_3_7_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"shard"}, + .default_value = "64MB", + .min = SHARD_MIN_BLOCK_SIZE, + .max = SHARD_MAX_BLOCK_SIZE, + .description = "The size unit used to break a file into multiple " + "chunks", + }, + { + .key = {"shard-deletion-rate"}, + .type = GF_OPTION_TYPE_INT, + .op_version = {GD_OP_VERSION_5_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT | OPT_FLAG_DOC, + .tags = {"shard"}, + .default_value = "100", + .min = 100, + .max = INT_MAX, + .description = "The number of shards to send deletes on at a time", + }, + { + .key = {"shard-lru-limit"}, + .type = GF_OPTION_TYPE_INT, + .op_version = {GD_OP_VERSION_5_0}, + .flags = OPT_FLAG_SETTABLE | OPT_FLAG_CLIENT_OPT, + .tags = {"shard"}, + .default_value = "16384", + .min = 20, + .max = INT_MAX, + .description = "The number of resolved shard inodes to keep in " + "memory. A higher number means shards that are " + "resolved will remain in memory longer, avoiding " + "frequent lookups on them when they participate in " + "file operations. 
The option also has a bearing on " + "amount of memory consumed by these inodes and their " + "internal metadata", + }, + {.key = {NULL}}, +}; + +xlator_api_t xlator_api = { + .init = init, + .fini = fini, + .reconfigure = reconfigure, + .mem_acct_init = mem_acct_init, + .op_version = {1}, /* Present from the initial version */ + .dumpops = &dumpops, + .fops = &fops, + .cbks = &cbks, + .options = options, + .identifier = "shard", + .category = GF_MAINTAINED, +}; diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h new file mode 100644 index 00000000000..4fe181b64d5 --- /dev/null +++ b/xlators/features/shard/src/shard.h @@ -0,0 +1,348 @@ +/* + Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __SHARD_H__ +#define __SHARD_H__ + +#include <glusterfs/xlator.h> +#include <glusterfs/compat-errno.h> +#include "shard-messages.h" +#include <glusterfs/syncop.h> + +#define GF_SHARD_DIR ".shard" +#define GF_SHARD_REMOVE_ME_DIR ".remove_me" +#define SHARD_MIN_BLOCK_SIZE (4 * GF_UNIT_MB) +#define SHARD_MAX_BLOCK_SIZE (4 * GF_UNIT_TB) +#define SHARD_XATTR_PREFIX "trusted.glusterfs.shard." +#define GF_XATTR_SHARD_BLOCK_SIZE "trusted.glusterfs.shard.block-size" +/** + * Bit masks for the valid flag, which is used while updating ctx + **/ +#define SHARD_MASK_BLOCK_SIZE (1 << 0) +#define SHARD_MASK_PROT (1 << 1) +#define SHARD_MASK_NLINK (1 << 2) +#define SHARD_MASK_UID (1 << 3) +#define SHARD_MASK_GID (1 << 4) +#define SHARD_MASK_SIZE (1 << 6) +#define SHARD_MASK_BLOCKS (1 << 7) +#define SHARD_MASK_TIMES (1 << 8) +#define SHARD_MASK_OTHERS (1 << 9) +#define SHARD_MASK_REFRESH_RESET (1 << 10) + +#define SHARD_INODE_WRITE_MASK \ + (SHARD_MASK_SIZE | SHARD_MASK_BLOCKS | SHARD_MASK_TIMES) + +#define SHARD_LOOKUP_MASK \ + (SHARD_MASK_PROT | SHARD_MASK_NLINK | SHARD_MASK_UID | SHARD_MASK_GID | \ + SHARD_MASK_TIMES | SHARD_MASK_OTHERS) + +#define SHARD_ALL_MASK \ + (SHARD_MASK_BLOCK_SIZE | SHARD_MASK_PROT | SHARD_MASK_NLINK | \ + SHARD_MASK_UID | SHARD_MASK_GID | SHARD_MASK_SIZE | SHARD_MASK_BLOCKS | \ + SHARD_MASK_TIMES | SHARD_MASK_OTHERS) + +#define get_lowest_block(off, shard_size) ((off) / (shard_size)) +#define get_highest_block(off, len, shard_size) \ + (((((off) + (len)) == 0) ? 0 : ((off) + (len)-1)) / (shard_size)) + +int +shard_unlock_inodelk(call_frame_t *frame, xlator_t *this); + +int +shard_unlock_entrylk(call_frame_t *frame, xlator_t *this); + +#define SHARD_ENTRY_FOP_CHECK(loc, op_errno, label) \ + do { \ + if ((loc->name && !strcmp(GF_SHARD_DIR, loc->name)) && \ + (((loc->parent) && __is_root_gfid(loc->parent->gfid)) || \ + __is_root_gfid(loc->pargfid))) { \ + op_errno = EPERM; \ + goto label; \ + } \ + \ + if ((loc->parent && __is_shard_dir(loc->parent->gfid)) || \ + __is_shard_dir(loc->pargfid)) { \ + op_errno = EPERM; \ + goto label; \ + } \ + } while (0) + +#define SHARD_INODE_OP_CHECK(gfid, err, label) \ + do { \ + if (__is_shard_dir(gfid)) { \ + err = EPERM; \ + goto label; \ + } \ + } while (0) + +#define SHARD_STACK_UNWIND(fop, frame, params...) 
\ + do { \ + shard_local_t *__local = NULL; \ + if (frame) { \ + __local = frame->local; \ + if (__local && __local->int_inodelk.acquired_lock) \ + shard_unlock_inodelk(frame, frame->this); \ + if (__local && __local->int_entrylk.acquired_lock) \ + shard_unlock_entrylk(frame, frame->this); \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT(fop, frame, params); \ + if (__local) { \ + shard_local_wipe(__local); \ + mem_put(__local); \ + } \ + } while (0) + +#define SHARD_STACK_DESTROY(frame) \ + do { \ + shard_local_t *__local = NULL; \ + __local = frame->local; \ + frame->local = NULL; \ + STACK_DESTROY(frame->root); \ + if (__local) { \ + shard_local_wipe(__local); \ + mem_put(__local); \ + } \ + } while (0); + +#define SHARD_INODE_CREATE_INIT(this, block_size, xattr_req, loc, size, \ + block_count, label) \ + do { \ + int __ret = -1; \ + int64_t *__size_attr = NULL; \ + uint64_t *__bs = 0; \ + \ + __bs = GF_MALLOC(sizeof(uint64_t), gf_shard_mt_uint64_t); \ + if (!__bs) \ + goto label; \ + *__bs = hton64(block_size); \ + __ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, __bs, \ + sizeof(*__bs)); \ + if (__ret) { \ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, \ + "Failed to set key: %s " \ + "on path %s", \ + GF_XATTR_SHARD_BLOCK_SIZE, (loc)->path); \ + GF_FREE(__bs); \ + goto label; \ + } \ + \ + __ret = shard_set_size_attrs(size, block_count, &__size_attr); \ + if (__ret) \ + goto label; \ + \ + __ret = dict_set_bin(xattr_req, GF_XATTR_SHARD_FILE_SIZE, __size_attr, \ + 8 * 4); \ + if (__ret) { \ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, \ + "Failed to set key: %s " \ + "on path %s", \ + GF_XATTR_SHARD_FILE_SIZE, (loc)->path); \ + GF_FREE(__size_attr); \ + goto label; \ + } \ + } while (0) + +#define SHARD_MD_READ_FOP_INIT_REQ_DICT(this, dict, gfid, local, label) \ + do { \ + int __ret = -1; \ + \ + __ret = dict_set_uint64(dict, GF_XATTR_SHARD_FILE_SIZE, 8 * 4); \ + if (__ret) { \ + local->op_ret = -1; \ + local->op_errno = ENOMEM; \ + gf_msg(this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, \ + "Failed to set dict value:" \ + " key:%s for %s.", \ + GF_XATTR_SHARD_FILE_SIZE, uuid_utoa(gfid)); \ + goto label; \ + } \ + } while (0) + +#define SHARD_SET_ROOT_FS_ID(frame, local) \ + do { \ + if (!local->is_set_fsid) { \ + local->uid = frame->root->uid; \ + local->gid = frame->root->gid; \ + frame->root->uid = 0; \ + frame->root->gid = 0; \ + local->is_set_fsid = _gf_true; \ + } \ + } while (0) + +#define SHARD_UNSET_ROOT_FS_ID(frame, local) \ + do { \ + if (local->is_set_fsid) { \ + frame->root->uid = local->uid; \ + frame->root->gid = local->gid; \ + local->is_set_fsid = _gf_false; \ + } \ + } while (0) + +#define SHARD_TIME_UPDATE(ctx_sec, ctx_nsec, new_sec, new_nsec) \ + do { \ + if (ctx_sec == new_sec) \ + ctx_nsec = new_nsec = max(new_nsec, ctx_nsec); \ + else if (ctx_sec > new_sec) { \ + new_sec = ctx_sec; \ + new_nsec = ctx_nsec; \ + } else { \ + ctx_sec = new_sec; \ + ctx_nsec = new_nsec; \ + } \ + } while (0) + +typedef enum { + SHARD_BG_DELETION_NONE = 0, + SHARD_BG_DELETION_LAUNCHING, + SHARD_BG_DELETION_IN_PROGRESS, +} shard_bg_deletion_state_t; + +/* rm = "remove me" */ + +typedef struct shard_priv { + uint64_t block_size; + uuid_t dot_shard_gfid; + uuid_t dot_shard_rm_gfid; + inode_t *dot_shard_inode; + inode_t *dot_shard_rm_inode; + gf_lock_t lock; + int inode_count; + struct list_head ilist_head; + uint32_t deletion_rate; + shard_bg_deletion_state_t bg_del_state; + gf_boolean_t first_lookup_done; + uint64_t 
lru_limit; +} shard_priv_t; + +typedef struct { + loc_t loc; + char *domain; + struct gf_flock flock; + gf_boolean_t acquired_lock; +} shard_inodelk_t; + +typedef struct { + loc_t loc; + char *domain; + char *basename; + entrylk_cmd cmd; + entrylk_type type; + gf_boolean_t acquired_lock; +} shard_entrylk_t; + +typedef int32_t (*shard_post_fop_handler_t)(call_frame_t *frame, + xlator_t *this); +typedef int32_t (*shard_post_resolve_fop_handler_t)(call_frame_t *frame, + xlator_t *this); +typedef int32_t (*shard_post_lookup_shards_fop_handler_t)(call_frame_t *frame, + xlator_t *this); + +typedef int32_t (*shard_post_mknod_fop_handler_t)(call_frame_t *frame, + xlator_t *this); + +typedef int32_t (*shard_post_update_size_fop_handler_t)(call_frame_t *frame, + xlator_t *this); + +typedef struct shard_local { + int op_ret; + int op_errno; + uint64_t first_block; + uint64_t last_block; + uint64_t num_blocks; + int call_count; + int eexist_count; + int create_count; + int xflag; + int count; + uint32_t flags; + uint32_t uid; + uint32_t gid; + uint64_t block_size; + uint64_t dst_block_size; + int32_t datasync; + off_t offset; + size_t total_size; + size_t written_size; + size_t hole_size; + size_t req_size; + size_t readdir_size; + int64_t delta_size; + gf_atomic_t delta_blocks; + loc_t loc; + loc_t dot_shard_loc; + loc_t dot_shard_rm_loc; + loc_t loc2; + loc_t tmp_loc; + fd_t *fd; + dict_t *xattr_req; + dict_t *xattr_rsp; + inode_t **inode_list; + glusterfs_fop_t fop; + struct iatt prebuf; + struct iatt postbuf; + struct iatt preoldparent; + struct iatt postoldparent; + struct iatt prenewparent; + struct iatt postnewparent; + struct iovec *vector; + struct iobref *iobref; + struct iobuf *iobuf; + gf_dirent_t entries_head; + gf_boolean_t is_set_fsid; + gf_boolean_t list_inited; + shard_post_fop_handler_t handler; + shard_post_lookup_shards_fop_handler_t pls_fop_handler; + shard_post_resolve_fop_handler_t post_res_handler; + shard_post_mknod_fop_handler_t post_mknod_handler; + shard_post_update_size_fop_handler_t post_update_size_handler; + shard_inodelk_t int_inodelk; + shard_entrylk_t int_entrylk; + inode_t *resolver_base_inode; + gf_boolean_t first_lookup_done; + syncbarrier_t barrier; + gf_boolean_t lookup_shards_barriered; + gf_boolean_t unlink_shards_barriered; + gf_boolean_t resolve_not; + loc_t newloc; + call_frame_t *main_frame; + call_frame_t *inodelk_frame; + call_frame_t *entrylk_frame; + uint32_t deletion_rate; + gf_boolean_t cleanup_required; + uuid_t base_gfid; + char *name; +} shard_local_t; + +typedef struct shard_inode_ctx { + uint64_t block_size; /* The block size with which this inode is + sharded */ + struct iatt stat; + gf_boolean_t refresh; + /* The following members of inode ctx will be applicable only to the + * individual shards' ctx and never the base file ctx. + */ + struct list_head ilist; + uuid_t base_gfid; + int block_num; + gf_boolean_t refreshed; + struct list_head to_fsync_list; + int fsync_needed; + inode_t *inode; + int fsync_count; + inode_t *base_inode; +} shard_inode_ctx_t; + +typedef enum { + SHARD_INTERNAL_DIR_DOT_SHARD = 1, + SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME, +} shard_internal_dir_type_t; + +#endif /* __SHARD_H__ */ |
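
Two macros near the top of shard.h above carry all of the arithmetic that decides which shards a ranged fop touches: get_lowest_block() divides the request's starting offset by the shard size, and get_highest_block() does the same for the request's last byte, with a guard on the off + len == 0 case so that a zero-length request at offset zero cannot underflow. The following standalone sketch is illustrative only: the two macros are copied verbatim from the header, while main(), the sample requests and the printed labels are mine, and the 64MB figure simply matches the shard-block-size default in the option table above.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Copied from shard.h so the example compiles on its own. */
#define get_lowest_block(off, shard_size) ((off) / (shard_size))
#define get_highest_block(off, len, shard_size)                                \
    (((((off) + (len)) == 0) ? 0 : ((off) + (len)-1)) / (shard_size))

int
main(void)
{
    uint64_t block_size = 64ULL * 1024 * 1024; /* 64MB, the option default */
    struct {
        uint64_t off;
        uint64_t len;
    } req[] = {
        {209715200ULL, 1048576ULL}, /* 1 MiB write at 200 MiB */
        {62914560ULL, 8388608ULL},  /* 8 MiB write at 60 MiB */
    };
    int i = 0;

    for (i = 0; i < 2; i++) {
        uint64_t first = get_lowest_block(req[i].off, block_size);
        uint64_t last = get_highest_block(req[i].off, req[i].len, block_size);

        /* Prints "blocks 3..3 (1 shard(s))" for the first request and
         * "blocks 0..1 (2 shard(s))" for the second, which crosses a
         * shard boundary. */
        printf("off=%" PRIu64 " len=%" PRIu64 " -> blocks %" PRIu64 "..%" PRIu64
               " (%" PRIu64 " shard(s))\n",
               req[i].off, req[i].len, first, last, last - first + 1);
    }
    return 0;
}

The first_block, last_block and num_blocks fields of shard_local_t presumably carry exactly this result for an in-flight fop, with num_blocks following as last - first + 1.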
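
Earlier in this patch, the getxattr and setxattr paths of shard.c enforce a small visibility policy for the internal trusted.glusterfs.shard.* keys: they are deleted from getxattr responses, answered with ENODATA when a regular client asks for one by name, and refused with EPERM on set, while gsyncd (GF_CLIENT_PID_GSYNCD) passes through untouched. Below is a minimal sketch of just the name-based check from shard_getxattr(); the predicate name is mine and the pid constant is a stand-in (the real value comes from glusterfs' headers), so treat it as an illustration rather than the xlator's code.

#include <stdio.h>
#include <string.h>

#define SHARD_XATTR_PREFIX "trusted.glusterfs.shard."
#define GF_CLIENT_PID_GSYNCD (-1) /* stand-in; see glusterfs headers */

/* Mirrors the guard at the top of shard_getxattr(): gsyncd may read the
 * internal shard xattrs, everyone else is told there is no such key. */
static int
shard_xattr_is_hidden(int client_pid, const char *name)
{
    if (client_pid == GF_CLIENT_PID_GSYNCD)
        return 0;

    return (name != NULL) &&
           (strncmp(name, SHARD_XATTR_PREFIX,
                    sizeof(SHARD_XATTR_PREFIX) - 1) == 0);
}

int
main(void)
{
    /* 1: hidden from a regular client */
    printf("%d\n",
           shard_xattr_is_hidden(1234, "trusted.glusterfs.shard.block-size"));
    /* 0: user xattrs pass through */
    printf("%d\n", shard_xattr_is_hidden(1234, "user.comment"));
    /* 0: gsyncd sees everything */
    printf("%d\n",
           shard_xattr_is_hidden(GF_CLIENT_PID_GSYNCD,
                                 "trusted.glusterfs.shard.block-size"));
    return 0;
}

Hiding the keys from responses rather than failing the whole getxattr keeps ordinary listing and backup tools working, while the EPERM guard on the set path keeps clients from tampering with the sharding metadata itself.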
