summary refs log tree commit diff stats
path: root/xlators/cluster/ec/src/ec-heal.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/ec/src/ec-heal.c')
-rw-r--r-- xlators/cluster/ec/src/ec-heal.c 3914
1 files changed, 2898 insertions, 1016 deletions
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 54301d10df4..7d991f04aac 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -1,151 +1,171 @@
/*
- Copyright (c) 2012 DataLab, s.l. <http://www.datalab.es>
+ Copyright (c) 2012-2014 DataLab, s.l. <http://www.datalab.es>
+ This file is part of GlusterFS.
- This file is part of the cluster/ec translator for GlusterFS.
-
- The cluster/ec translator for GlusterFS is free software: you can
- redistribute it and/or modify it under the terms of the GNU General
- Public License as published by the Free Software Foundation, either
- version 3 of the License, or (at your option) any later version.
-
- The cluster/ec translator for GlusterFS is distributed in the hope
- that it will be useful, but WITHOUT ANY WARRANTY; without even the
- implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- PURPOSE. See the GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with the cluster/ec translator for GlusterFS. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#include "xlator.h"
-#include "defaults.h"
-#include "compat-errno.h"
+#include <glusterfs/defaults.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/syncop.h>
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/cluster-syncop.h>
+#include "ec.h"
+#include "ec-types.h"
+#include "ec-messages.h"
#include "ec-helpers.h"
#include "ec-common.h"
#include "ec-combine.h"
#include "ec-method.h"
#include "ec-fops.h"
+#include "ec-heald.h"
+
+/* Number of non-zero entries among the first 'max' elements of 'array'.
+ * Macro parameters are parenthesized so that expression arguments
+ * (e.g. 'base + 1') expand with the intended precedence. */
+#define EC_COUNT(array, max) \
+    ({ \
+        int __i; \
+        int __res = 0; \
+        for (__i = 0; __i < (max); __i++) \
+            if ((array)[__i]) \
+                __res++; \
+        __res; \
+    })
+/* dst[i] = (src1[i] && src2[i]) for the first 'max' elements. */
+#define EC_INTERSECT(dst, src1, src2, max) \
+    ({ \
+        int __i; \
+        for (__i = 0; __i < (max); __i++) \
+            (dst)[__i] = (src1)[__i] && (src2)[__i]; \
+    })
+/* If sources[source] is 0, re-point 'source' (an lvalue) at the LAST index
+ * below 'max' whose sources[] entry is set, or -1 when none is set. */
+#define EC_ADJUST_SOURCE(source, sources, max) \
+    ({ \
+        int __i; \
+        if ((sources)[source] == 0) { \
+            (source) = -1; \
+            for (__i = 0; __i < (max); __i++) \
+                if ((sources)[__i]) \
+                    (source) = __i; \
+        } \
+    })
+/* Byte-wise comparison of the ia_<field> member of two iatt objects. */
+#define IA_EQUAL(f, s, field) \
+    (memcmp(&((f).ia_##field), &((s).ia_##field), sizeof((s).ia_##field)) == 0)
+/* Stack-allocates (alloca0) 'numsubvols' reply slots into 'replies' and
+ * initializes the entries list head of each slot. */
+#define EC_REPLIES_ALLOC(replies, numsubvols) \
+    do { \
+        int __i = 0; \
+        (replies) = alloca0((numsubvols) * sizeof(*(replies))); \
+        for (__i = 0; __i < (numsubvols); __i++) \
+            INIT_LIST_HEAD(&(replies)[__i].entries.list); \
+    } while (0)
+
+/* State handed as the opaque 'data' argument to the name-heal dictionary
+ * callbacks (ec_delete_stale_name() casts its 'data' argument to this type
+ * and reads 'frame' and 'parent' from it).
+ * NOTE(review): the per-brick byte arrays are presumably indexed by subvolume
+ * like the other unsigned char arrays in this file - confirm against the
+ * code that fills them. */
+struct ec_name_data {
+ call_frame_t *frame;
+ unsigned char *participants;
+ unsigned char *failed_on;
+ unsigned char *gfidless;
+ unsigned char *enoent;
+ unsigned char *same;
+ char *name;
+ inode_t *parent;
+ default_args_cbk_t *replies;
+ uint32_t heal_pending;
+};
+
+/* NULL-terminated list of xattr keys that ec_ignorable_key_match() reports
+ * as ignorable, in addition to every key starting with EC_XATTR_PREFIX. */
+static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
+
+/* Dictionary match callback.
+ * Returns _gf_true when 'key' starts with EC_XATTR_PREFIX or is one of the
+ * keys listed in ec_ignore_xattrs; returns _gf_false otherwise, including
+ * for a NULL key. 'dict', 'val' and 'mdata' are not used. */
+static gf_boolean_t
+ec_ignorable_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
+{
+    char **ignored = NULL;
-#include "ec-mem-types.h"
-#include "ec-data.h"
+    if (key == NULL)
+        return _gf_false;
-/* FOP: heal */
+    if (strncmp(key, EC_XATTR_PREFIX, SLEN(EC_XATTR_PREFIX)) == 0)
+        return _gf_true;
-void ec_heal_exclude(ec_heal_t * heal, uintptr_t mask)
-{
- LOCK(&heal->lock);
+    /* Walk the NULL-terminated ignore list looking for an exact match. */
+    for (ignored = ec_ignore_xattrs; *ignored != NULL; ignored++) {
+        if (strcmp(key, *ignored) == 0)
+            return _gf_true;
+    }
- heal->bad &= ~mask;
+    return _gf_false;
+}
- UNLOCK(&heal->lock);
+/* Dictionary match callback: the exact complement of
+ * ec_ignorable_key_match(), i.e. _gf_true for keys that are not ignorable. */
+static gf_boolean_t
+ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
+{
+    if (ec_ignorable_key_match(dict, key, val, mdata))
+        return _gf_false;
+
+    return _gf_true;
}
+/* FOP: heal */
-void ec_heal_lookup_resume(ec_fop_data_t * fop)
+/* Increments heal_count in the EC inode context of fop->loc[0].inode while
+ * holding that inode's lock. Does nothing when 'fop' is NULL or when
+ * __ec_inode_get() returns no context. */
+void
+ec_set_entry_healing(ec_fop_data_t *fop)
{
- ec_heal_t * heal = fop->data;
- ec_cbk_data_t * cbk;
- uintptr_t good = 0, bad = 0;
+ ec_inode_t *ctx = NULL;
+ loc_t *loc = NULL;
- if (heal->lookup != NULL)
- {
- ec_fop_data_release(heal->lookup);
- }
- ec_fop_data_acquire(fop);
+ if (!fop)
+ return;
- list_for_each_entry(cbk, &fop->cbk_list, list)
+ loc = &fop->loc[0];
+ LOCK(&loc->inode->lock);
{
- if ((cbk->op_ret < 0) && (cbk->op_errno == ENOTCONN))
- {
- continue;
- }
-
- if (cbk == fop->answer)
- {
- if (cbk->op_ret >= 0)
- {
- heal->iatt = cbk->iatt[0];
- heal->version = cbk->version;
- heal->raw_size = cbk->size;
- heal->fop->pre_size = cbk->iatt[0].ia_size;
- heal->fop->post_size = cbk->iatt[0].ia_size;
-
- if (!ec_loc_prepare(heal->xl, &heal->loc, cbk->inode,
- &cbk->iatt[0]))
- {
- fop->answer = NULL;
- fop->error = EIO;
-
- bad |= cbk->mask;
-
- continue;
- }
- }
-
- good |= cbk->mask;
- }
- else
- {
- bad |= cbk->mask;
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ ctx->heal_count += 1;
}
}
-
- /* Heal lookups are not executed concurrently with anything else. So, when
- * a lookup finishes, it's safe to access heal->good and heal->bad without
- * acquiring any lock.
- */
- heal->good = good;
- heal->bad = bad;
-
- heal->lookup = fop;
-
- ec_resume_parent(fop, fop->answer != NULL ? 0 : fop->error);
-}
-
-int32_t ec_heal_entry_lookup_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret,
- int32_t op_errno, inode_t * inode,
- struct iatt * buf, dict_t * xdata,
- struct iatt * postparent)
-{
- ec_heal_lookup_resume(cookie);
-
- return 0;
+ UNLOCK(&loc->inode->lock);
}
-int32_t ec_heal_inode_lookup_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret,
- int32_t op_errno, inode_t * inode,
- struct iatt * buf, dict_t * xdata,
- struct iatt * postparent)
+/* Decrements heal_count in the EC inode context of fop->loc[0].inode under
+ * the inode lock, then asserts that the resulting count is not negative.
+ * Does nothing when 'fop' is NULL; the count is left untouched when the
+ * inode has no EC context. */
+void
+ec_reset_entry_healing(ec_fop_data_t *fop)
{
- ec_heal_lookup_resume(cookie);
+    inode_t *inode = NULL;
+    ec_inode_t *ictx = NULL;
+    int32_t heal_count = 0;
+
+    if (fop == NULL)
+        return;
- return 0;
+    inode = fop->loc[0].inode;
+    LOCK(&inode->lock);
+    {
+        ictx = __ec_inode_get(inode, fop->xl);
+        if (ictx != NULL) {
+            ictx->heal_count--;
+            heal_count = ictx->heal_count;
+        }
+    }
+    UNLOCK(&inode->lock);
+    GF_ASSERT(heal_count >= 0);
}
-uintptr_t ec_heal_check(ec_fop_data_t * fop, uintptr_t * pgood)
+/* Splits the answers in fop->cbk_list by outcome.
+ * Returns the OR of the masks of all answers with op_ret < 0. When 'pgood'
+ * is not NULL, *pgood receives the OR of the masks of all answers with
+ * op_ret >= 0. */
+uintptr_t
+ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
{
- ec_cbk_data_t * cbk;
- uintptr_t mask[2] = { 0, 0 };
+ ec_cbk_data_t *cbk;
+ /* mask[0] collects failed answers, mask[1] successful ones. */
+ uintptr_t mask[2] = {0, 0};
list_for_each_entry(cbk, &fop->cbk_list, list)
{
mask[cbk->op_ret >= 0] |= cbk->mask;
}
- if (pgood != NULL)
- {
+ if (pgood != NULL) {
*pgood = mask[1];
}
return mask[0];
}
-void ec_heal_update(ec_fop_data_t * fop, int32_t is_open)
+void
+ec_heal_update(ec_fop_data_t *fop, int32_t is_open)
{
- ec_heal_t * heal = fop->data;
+ ec_heal_t *heal = fop->data;
uintptr_t good, bad;
bad = ec_heal_check(fop, &good);
@@ -153,8 +173,7 @@ void ec_heal_update(ec_fop_data_t * fop, int32_t is_open)
LOCK(&heal->lock);
heal->bad &= ~bad;
- if (is_open)
- {
+ if (is_open) {
heal->open |= good;
}
@@ -163,9 +182,10 @@ void ec_heal_update(ec_fop_data_t * fop, int32_t is_open)
fop->error = 0;
}
-void ec_heal_avoid(ec_fop_data_t * fop)
+void
+ec_heal_avoid(ec_fop_data_t *fop)
{
- ec_heal_t * heal = fop->data;
+ ec_heal_t *heal = fop->data;
uintptr_t bad;
bad = ec_heal_check(fop, NULL);
@@ -177,1309 +197,3171 @@ void ec_heal_avoid(ec_fop_data_t * fop)
UNLOCK(&heal->lock);
}
-int32_t ec_heal_mkdir_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, inode_t * inode,
- struct iatt * buf, struct iatt * preparent,
- struct iatt * postparent, dict_t * xdata)
+/* inodelk/finodelk callback used by ec_heal_lock() for non-unlock requests.
+ * When the lock answer is successful (op_ret >= 0) it records
+ * heal->total_size as the size of heal->fd->inode via ec_set_inode_size().
+ * Always returns 0.
+ * NOTE(review): the ec_set_inode_size() call lives inside GF_ASSERT(); if
+ * GF_ASSERT can compile to nothing in some build the size update would be
+ * skipped - confirm against the GF_ASSERT definition. */
+int32_t
+ec_heal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- ec_heal_update(cookie, 0);
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+
+ if (op_ret >= 0) {
+ GF_ASSERT(
+ ec_set_inode_size(heal->fop, heal->fd->inode, heal->total_size));
+ }
return 0;
}
-int32_t ec_heal_mknod_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, inode_t * inode,
- struct iatt * buf, struct iatt * preparent,
- struct iatt * postparent, dict_t * xdata)
+/* Issues a blocking (F_SETLKW) inode lock request of the given 'type' for
+ * the byte range [offset, offset + size) in the heal->xl->name domain.
+ * With a non-NULL 'fd' the request goes through ec_finodelk(), otherwise
+ * through ec_inodelk() on 'loc'.
+ * For F_UNLCK the cached inode info is cleared first and ec_lock_unlocked
+ * is the callback; for every other type ec_heal_lock_cbk is used so that
+ * the size information gets updated. */
+void
+ec_heal_lock(ec_heal_t *heal, int32_t type, fd_t *fd, loc_t *loc, off_t offset,
+             size_t size)
{
- ec_heal_update(cookie, 0);
+    /* Default: use the callback that updates size information. */
+    fop_inodelk_cbk_t cbk = ec_heal_lock_cbk;
+    struct gf_flock flock;
- return 0;
+    flock.l_type = type;
+    flock.l_whence = SEEK_SET;
+    flock.l_start = offset;
+    flock.l_len = size;
+    flock.l_pid = 0;
+    flock.l_owner.len = 0;
+
+    if (type == F_UNLCK) {
+        /* Remove inode size information before unlocking it. */
+        inode_t *inode = (fd == NULL) ? heal->loc.inode : heal->fd->inode;
+
+        ec_clear_inode_info(heal->fop, inode);
+        cbk = ec_lock_unlocked;
+    }
+
+    if (fd == NULL) {
+        ec_inodelk(heal->fop->frame, heal->xl,
+                   &heal->fop->frame->root->lk_owner, heal->fop->mask,
+                   EC_MINIMUM_ALL, cbk, heal, heal->xl->name, loc, F_SETLKW,
+                   &flock, NULL);
+    } else {
+        ec_finodelk(heal->fop->frame, heal->xl,
+                    &heal->fop->frame->root->lk_owner, heal->fop->mask,
+                    EC_MINIMUM_ALL, cbk, heal, heal->xl->name, fd, F_SETLKW,
+                    &flock, NULL);
+    }
}
-int32_t ec_heal_symlink_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- inode_t * inode, struct iatt * buf,
- struct iatt * preparent, struct iatt * postparent,
- dict_t * xdata)
+/* Convenience wrapper around ec_heal_lock(): locks through heal->fd when
+ * 'use_fd' is non-zero, otherwise through heal->loc. */
+void
+ec_heal_inodelk(ec_heal_t *heal, int32_t type, int32_t use_fd, off_t offset,
+                size_t size)
{
- ec_heal_update(cookie, 0);
-
- return 0;
+    fd_t *lock_fd = NULL;
+
+    if (use_fd)
+        lock_fd = heal->fd;
+
+    ec_heal_lock(heal, type, lock_fd, &heal->loc, offset, size);
}
-int32_t ec_heal_create_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- fd_t * fd, inode_t * inode, struct iatt * buf,
- struct iatt * preparent, struct iatt * postparent,
- dict_t * xdata)
+/* dict_foreach() callback. 'arg' is a reference dictionary.
+ * Deletes 'key' from 'dict' when the key is ignorable (see
+ * ec_ignorable_key_match()) or when the reference dictionary also contains
+ * it, so that only the keys missing from the reference remain in 'dict'.
+ * Always returns 0. */
+int32_t
+ec_heal_xattr_clean(dict_t *dict, char *key, data_t *data, void *arg)
{
- ec_heal_update(cookie, 1);
+    dict_t *reference = arg;
+
+    /* The reference is consulted only for keys that are not ignorable. */
+    if (ec_ignorable_key_match(NULL, key, NULL, NULL) ||
+        (dict_get(reference, key) != NULL)) {
+        dict_del(dict, key);
+    }
return 0;
}
-int32_t ec_heal_setattr_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- struct iatt * preop_stbuf,
- struct iatt * postop_stbuf,
- dict_t * xdata)
+/*
+ * ec_wind_xattrop_parallel:
+ *   Winds a single xattrop to 'subvol' with cluster_xattrop_cbk as the
+ *   callback. The child index is passed as the cookie, and the dictionary
+ *   sent is dict[child_index], so every child can get its own xattr values.
+ */
+void
+ec_wind_xattrop_parallel(call_frame_t *frame, xlator_t *subvol, int child_index,
+                         loc_t *loc, gf_xattrop_flags_t flags, dict_t **dict,
+                         dict_t *xdata)
{
- ec_heal_update(cookie, 0);
-
- return 0;
+    void *child_cookie = (void *)(uintptr_t)child_index;
+    dict_t *child_xattr = dict[child_index];
+
+    gf_msg_debug("EC", 0, "WIND: on child %d ", child_index);
+    STACK_WIND_COOKIE(frame, cluster_xattrop_cbk, child_cookie, subvol,
+                      subvol->fops->xattrop, loc, flags, child_xattr, xdata);
}
-int32_t ec_heal_setxattr_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- dict_t * xdata)
+/* Callback of the write issued by ec_heal_readv_cbk(). Traces and logs the
+ * result of the write at heal->offset, then lets ec_heal_update() fold the
+ * answers into the heal state. Always returns 0. */
+int32_t
+ec_heal_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                   struct iatt *postbuf, dict_t *xdata)
{
+    ec_fop_data_t *write_fop = cookie;
+    ec_heal_t *state = write_fop->data;
+
+    ec_trace("WRITE_CBK", cookie, "ret=%d, errno=%d", op_ret, op_errno);
+
+    gf_msg_debug(write_fop->xl->name, 0,
+                 "%s: write op_ret %d, op_errno %s"
+                 " at %" PRIu64,
+                 uuid_utoa(state->fd->inode->gfid), op_ret, strerror(op_errno),
+                 state->offset);
+
ec_heal_update(cookie, 0);
return 0;
}
-int32_t ec_heal_removexattr_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret,
- int32_t op_errno, dict_t * xdata)
+/* Callback of the read issued by ec_heal_data_block().
+ * After ec_heal_avoid() has processed the answers:
+ *   op_ret  > 0: the data read is written to the bricks in heal->bad at
+ *                heal->offset (ec_heal_writev_cbk handles the result).
+ *   op_ret == 0: the heal is marked done.
+ *   op_ret  < 0: heal->bad is cleared and the heal is marked done.
+ * Always returns 0. */
+int32_t
+ec_heal_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, struct iovec *vector,
+                  int32_t count, struct iatt *stbuf, struct iobref *iobref,
+                  dict_t *xdata)
{
- ec_heal_update(cookie, 0);
+    ec_fop_data_t *read_fop = cookie;
+    ec_heal_t *heal = read_fop->data;
+
+    ec_trace("READ_CBK", read_fop, "ret=%d, errno=%d", op_ret, op_errno);
+
+    ec_heal_avoid(read_fop);
+
+    if (op_ret <= 0) {
+        if (op_ret < 0) {
+            gf_msg_debug(read_fop->xl->name, 0,
+                         "%s: read failed %s, failing "
+                         "to heal block at %" PRIu64,
+                         uuid_utoa(heal->fd->inode->gfid), strerror(op_errno),
+                         heal->offset);
+            heal->bad = 0;
+        }
+        heal->done = 1;
+    } else {
+        gf_msg_debug(read_fop->xl->name, 0,
+                     "%s: read succeeded, proceeding "
+                     "to write at %" PRIu64,
+                     uuid_utoa(heal->fd->inode->gfid), heal->offset);
+        ec_writev(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
+                  ec_heal_writev_cbk, heal, heal->fd, vector, count,
+                  heal->offset, 0, iobref, NULL);
+    }
return 0;
}
-int32_t ec_heal_link_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, inode_t * inode,
- struct iatt * buf, struct iatt * preparent,
- struct iatt * postparent, dict_t * xdata)
+/* Starts the heal of one data block: reads heal->size bytes at heal->offset
+ * from the bricks in heal->good; ec_heal_readv_cbk continues from there.
+ * Nothing is read unless both heal->good and heal->bad are non-zero and
+ * the file is a regular file. */
+void
+ec_heal_data_block(ec_heal_t *heal)
{
- ec_fop_data_t * fop = cookie;
- ec_heal_t * heal = fop->data;
- uintptr_t good, bad;
+    ec_trace("DATA", heal->fop, "good=%lX, bad=%lX", heal->good, heal->bad);
- bad = ec_heal_check(fop, &good);
- ec_heal_exclude(heal, good);
+    if ((heal->good == 0) || (heal->bad == 0) ||
+        (heal->iatt.ia_type != IA_IFREG)) {
+        return;
+    }
+
+    ec_readv(heal->fop->frame, heal->xl, heal->good, EC_MINIMUM_MIN,
+             ec_heal_readv_cbk, heal, heal->fd, heal->size, heal->offset, 0,
+             NULL);
+}
- if (bad != 0)
- {
- fop->error = 0;
+/* FOP: fheal */
- xdata = fop->xdata;
- fop = fop->parent;
+/* fd-based variant of ec_heal(): looks up the EC context of 'fd' and, when
+ * one exists, forwards the request to ec_heal() using the loc stored in
+ * that context (ctx->loc).
+ * NOTE(review): when ec_fd_get() returns NULL nothing is done at all and
+ * 'func' is never invoked - confirm callers do not wait for the callback
+ * in that case. */
+void
+ec_fheal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_fheal_cbk_t func, void *data, fd_t *fd,
+ int32_t partial, dict_t *xdata)
+{
+ ec_fd_t *ctx = ec_fd_get(fd, this);
- ec_create(fop->frame, fop->xl, bad, EC_MINIMUM_ONE,
- ec_heal_create_cbk, heal, &heal->loc, 0,
- st_mode_from_ia(heal->iatt.ia_prot, IA_INVAL),
- 0, heal->fd, xdata);
+ if (ctx != NULL) {
+ gf_msg_trace("ec", 0, "FHEAL ctx: flags=%X, open=%" PRIXPTR, ctx->flags,
+ ctx->open);
+ ec_heal(frame, this, target, fop_flags, func, data, &ctx->loc, partial,
+ xdata);
}
-
- return 0;
}
-int32_t ec_heal_target_open_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret,
- int32_t op_errno, fd_t * fd, dict_t * xdata)
+/* Common heal code */
+/* Expands the low 'numsubvols' bits of 'mask' into 'array': array[i] is set
+ * to 1 when bit i of 'mask' is set and to 0 otherwise. */
+void
+ec_mask_to_char_array(uintptr_t mask, unsigned char *array, int numsubvols)
{
- ec_heal_update(cookie, 1);
+    int idx;
- return 0;
+    for (idx = 0; idx < numsubvols; idx++) {
+        array[idx] = ((mask >> idx) & 1);
+    }
}
-int32_t ec_heal_source_open_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret,
- int32_t op_errno, fd_t * fd, dict_t * xdata)
+/* Packs the first 'numsubvols' entries of 'array' into a bitmask: bit i of
+ * the result is set when array[i] is non-zero. Returns 0 when 'array' is
+ * NULL. */
+uintptr_t
+ec_char_array_to_mask(unsigned char *array, int numsubvols)
{
- ec_heal_avoid(cookie);
+    uintptr_t bits = 0;
+    int idx = 0;
- return 0;
+    if (array == NULL)
+        return 0;
+
+    for (idx = 0; idx < numsubvols; idx++) {
+        if (array[idx] != 0)
+            bits |= (1ULL << idx);
+    }
+    return bits;
+}
-int32_t ec_heal_reopen_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- fd_t * fd, dict_t * xdata)
+/* Decides which bricks are sources and which are sinks for an entry heal.
+ * For every valid reply with op_ret != -1 the EC_DATA_TXN element of the
+ * EC_XATTR_VERSION and EC_XATTR_DIRTY arrays found in the reply xdata is
+ * stored in versions[i] / dirty[i] (left untouched when the xattr cannot
+ * be read). The source is the first good reply, replaced by any later
+ * reply with a strictly higher version. Good replies whose version equals
+ * the source's are flagged in sources[], all other good replies in
+ * healed_sinks[].
+ * Returns the source index, or -1 when there is no good reply. */
+int
+ec_heal_entry_find_direction(ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *versions, uint64_t *dirty,
+ unsigned char *sources,
+ unsigned char *healed_sinks)
{
- ec_fop_data_t * fop = cookie;
- ec_fd_t * ctx;
- uintptr_t good;
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
+ int source = -1;
+ uint64_t max_version = 0;
+ int ret = 0;
+ int i = 0;
+
+ /* Pass 1: collect versions/dirty and pick the highest-version source. */
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
- ec_heal_check(fop, &good);
+ if (replies[i].op_ret == -1)
+ continue;
- if (good != 0)
- {
- LOCK(&fd->lock);
+ if (source == -1)
+ source = i;
- ctx = __ec_fd_get(fd, fop->xl);
- if ((ctx != NULL) && (ctx->loc.inode != NULL))
- {
- ctx->bad &= ~good;
- ctx->open |= good;
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ versions[i] = xattr[EC_DATA_TXN];
+ if (max_version < versions[i]) {
+ max_version = versions[i];
+ source = i;
+ }
}
- UNLOCK(&fd->lock);
+ memset(xattr, 0, sizeof(xattr));
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ dirty[i] = xattr[EC_DATA_TXN];
+ }
}
- return 0;
-}
+ if (source < 0)
+ goto out;
-int32_t ec_heal_create(ec_heal_t * heal, uintptr_t mask, int32_t try_link)
-{
- loc_t loc;
- dict_t * xdata;
+ /* Pass 2: classify every good reply against the chosen source. */
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
- xdata = dict_new();
- if (xdata == NULL)
- {
- return ENOMEM;
+ if (replies[i].op_ret == -1)
+ continue;
+
+ if (versions[i] == versions[source])
+ sources[i] = 1;
+ else
+ healed_sinks[i] = 1;
}
- if (dict_set_static_bin(xdata, "gfid-req", heal->iatt.ia_gfid,
- sizeof(uuid_t)) != 0)
- {
- dict_unref(xdata);
+out:
+ return source;
+}
- return ENOMEM;
+/* Brings the version xattr of every participating brick up to the source's
+ * version and, when all ec->nodes bricks are either sources or healed
+ * sinks, erases their dirty xattr.
+ *
+ * For each brick flagged in sources[] or healed_sinks[] an xattrop
+ * dictionary is built: element 'type' of EC_XATTR_VERSION carries
+ * (versions[source] - versions[i]) and, when erasing, element 'type' of
+ * EC_XATTR_DIRTY carries -dirty[i]. Bricks whose deltas are all zero are
+ * skipped. The xattrops are sent in parallel with GF_XATTROP_ADD_ARRAY64.
+ *
+ * Returns 0 on success or a negative errno: -ENOMEM on allocation or
+ * dictionary failures, -ENOTCONN when not every brick took part in the
+ * heal or when fewer xattrops succeeded than were sent.
+ *
+ * The delta buffers are freed here when dict_set_bin() fails; on failure
+ * the dictionary does not own the buffer, so it would otherwise leak. */
+int
+ec_adjust_versions(call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode,
+                   int source, unsigned char *sources,
+                   unsigned char *healed_sinks, uint64_t *versions,
+                   uint64_t *dirty)
+{
+    int i = 0;
+    int ret = 0;
+    int call_count = 0;
+    dict_t **xattr = NULL;
+    int op_ret = 0;
+    loc_t loc = {0};
+    gf_boolean_t erase_dirty = _gf_false;
+    uint64_t *versions_xattr = NULL;
+    uint64_t *dirty_xattr = NULL;
+    uint64_t allzero[2] = {0};
+    unsigned char *on = NULL;
+    unsigned char *output = NULL;
+    default_args_cbk_t *replies = NULL;
+
+    /* Allocate the required memory */
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    on = alloca0(ec->nodes);
+    output = alloca0(ec->nodes);
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+    if (!xattr) {
+        op_ret = -ENOMEM;
+        goto out;
+    }
+    for (i = 0; i < ec->nodes; i++) {
+        xattr[i] = dict_new();
+        if (!xattr[i]) {
+            op_ret = -ENOMEM;
+            goto out;
+        }
}
- if ((heal->iatt.ia_type == IA_IFREG) && try_link)
- {
- memset(&loc, 0, sizeof(loc));
- loc.inode = heal->loc.inode;
- uuid_copy(loc.gfid, heal->iatt.ia_gfid);
+    /* dirty xattr represents if the file/dir needs heal. Unless all the
+     * copies are healed, don't erase it */
+    if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) ==
+        ec->nodes)
+        erase_dirty = _gf_true;
+    else
+        op_ret = -ENOTCONN;
- ec_link(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
- ec_heal_link_cbk, heal, &loc, &heal->loc, xdata);
+    /* Populate the xattr array */
+    for (i = 0; i < ec->nodes; i++) {
+        if (!sources[i] && !healed_sinks[i])
+            continue;
+        versions_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*versions_xattr),
+                                   gf_common_mt_pointer);
+        if (!versions_xattr) {
+            op_ret = -ENOMEM;
+            continue;
+        }
- dict_unref(xdata);
+        versions_xattr[type] = hton64(versions[source] - versions[i]);
+        ret = dict_set_bin(xattr[i], EC_XATTR_VERSION, versions_xattr,
+                           (sizeof(*versions_xattr) * EC_VERSION_SIZE));
+        if (ret < 0) {
+            /* The dict did not take ownership: release the buffer. */
+            GF_FREE(versions_xattr);
+            versions_xattr = NULL;
+            op_ret = -ENOMEM;
+            continue;
+        }
- return 0;
- }
+        if (erase_dirty) {
+            dirty_xattr = GF_CALLOC(EC_VERSION_SIZE, sizeof(*dirty_xattr),
+                                    gf_common_mt_pointer);
+            if (!dirty_xattr) {
+                op_ret = -ENOMEM;
+                continue;
+            }
- switch (heal->iatt.ia_type)
- {
- case IA_IFDIR:
- ec_mkdir(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
- ec_heal_mkdir_cbk, heal, &heal->loc,
- st_mode_from_ia(heal->iatt.ia_prot, IA_INVAL),
- 0, xdata);
+            dirty_xattr[type] = hton64(-dirty[i]);
+            ret = dict_set_bin(xattr[i], EC_XATTR_DIRTY, dirty_xattr,
+                               (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+            if (ret < 0) {
+                /* The dict did not take ownership: release the buffer. */
+                GF_FREE(dirty_xattr);
+                dirty_xattr = NULL;
+                op_ret = -ENOMEM;
+                continue;
+            }
+        }
- break;
+        /* Skip bricks for which there is nothing to add. */
+        if (memcmp(versions_xattr, allzero,
+                   (sizeof(*versions_xattr) * EC_VERSION_SIZE)) == 0) {
+            if (!erase_dirty) {
+                continue;
+            }
- case IA_IFLNK:
- ec_symlink(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
- ec_heal_symlink_cbk, heal, heal->symlink, &heal->loc,
- 0, xdata);
+            if (memcmp(dirty_xattr, allzero,
+                       (sizeof(*dirty_xattr) * EC_VERSION_SIZE)) == 0) {
+                continue;
+            }
+        }
- break;
+        on[i] = 1;
+        call_count++;
+    }
- case IA_IFREG:
- ec_create(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
- ec_heal_create_cbk, heal, &heal->loc, 0,
- st_mode_from_ia(heal->iatt.ia_prot, IA_INVAL),
- 0, heal->fd, xdata);
+    /* Update the bricks with xattr */
+    if (call_count) {
+        PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+                            ec_wind_xattrop_parallel, &loc,
+                            GF_XATTROP_ADD_ARRAY64, xattr, NULL);
+        ret = cluster_fop_success_fill(replies, ec->nodes, output);
+    }
- break;
+    if (ret < call_count) {
+        op_ret = -ENOTCONN;
+        goto out;
+    }
- default:
- ec_mknod(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
- ec_heal_mknod_cbk, heal, &heal->loc,
- st_mode_from_ia(heal->iatt.ia_prot, IA_INVAL),
- heal->iatt.ia_rdev, 0, xdata);
+out:
+    /* Cleanup */
+    if (xattr) {
+        for (i = 0; i < ec->nodes; i++) {
+            if (xattr[i])
+                dict_unref(xattr[i]);
+        }
+        GF_FREE(xattr);
+    }
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return op_ret;
+}
- break;
+/* Decides which bricks are sources and which are sinks for a metadata heal.
+ * Good replies (valid, op_ret >= 0) are grouped: two replies belong to the
+ * same group when their iatt gfid, type, prot, uid and gid are equal and
+ * their xdata dictionaries are equal for the keys accepted by
+ * ec_sh_key_match(). The largest group must contain at least ec->fragments
+ * bricks, otherwise -EIO is returned. Members of that group are flagged in
+ * sources[]; every other good reply is flagged in healed_sinks[].
+ * versions[i] / dirty[i] receive the EC_METADATA_TXN element of the
+ * version / dirty xattrs when those can be read.
+ * Returns the index of the source with the highest version, or the first
+ * member of the winning group when no source has a version above 0. */
+int
+ec_heal_metadata_find_direction(ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *versions, uint64_t *dirty,
+ unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
+ uint64_t max_version = 0;
+ int same_count = 0;
+ int max_same_count = 0;
+ int same_source = -1;
+ int ret = 0;
+ int i = 0;
+ int j = 0;
+ int *groups = NULL;
+ struct iatt source_ia = {0};
+ struct iatt child_ia = {0};
+
+ /* groups[i] is the index of the first member of i's group, -1 if none. */
+ groups = alloca0(ec->nodes * sizeof(*groups));
+ for (i = 0; i < ec->nodes; i++)
+ groups[i] = -1;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret < 0)
+ continue;
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_VERSION, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ versions[i] = xattr[EC_METADATA_TXN];
+ }
+
+ memset(xattr, 0, sizeof(xattr));
+ ret = ec_dict_get_array(replies[i].xdata, EC_XATTR_DIRTY, xattr,
+ EC_VERSION_SIZE);
+ if (ret == 0) {
+ dirty[i] = xattr[EC_METADATA_TXN];
+ }
+ if (groups[i] >= 0) /*Already part of group*/
+ continue;
+ /* Start a new group at i and collect later replies that match it. */
+ groups[i] = i;
+ same_count = 1;
+ source_ia = replies[i].stat;
+ for (j = i + 1; j < ec->nodes; j++) {
+ if (!replies[j].valid || replies[j].op_ret < 0)
+ continue;
+ child_ia = replies[j].stat;
+ if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+ !IA_EQUAL(source_ia, child_ia, type) ||
+ !IA_EQUAL(source_ia, child_ia, prot) ||
+ !IA_EQUAL(source_ia, child_ia, uid) ||
+ !IA_EQUAL(source_ia, child_ia, gid))
+ continue;
+ if (!are_dicts_equal(replies[i].xdata, replies[j].xdata,
+ ec_sh_key_match, NULL))
+ continue;
+ groups[j] = i;
+ same_count++;
+ }
+
+ if (max_same_count < same_count) {
+ max_same_count = same_count;
+ same_source = i;
+ }
}
- dict_unref(xdata);
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ }
- return 0;
+ for (i = 0; i < ec->nodes; i++) {
+ if (groups[i] == groups[same_source])
+ sources[i] = 1;
+ else if (replies[i].valid && replies[i].op_ret >= 0)
+ healed_sinks[i] = 1;
+ }
+ /* Among the sources prefer the one with the highest version. */
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i] && (versions[i] > max_version)) {
+ same_source = i;
+ max_version = versions[i];
+ }
+ }
+ ret = same_source;
+out:
+ return ret;
}
-void ec_heal_recreate(ec_fop_data_t * fop)
+/* Gathers the information needed to heal metadata and picks the direction.
+ * Looks the inode up on the bricks in locked_on (filling 'replies'); more
+ * than ec->fragments bricks must answer, otherwise -ENOTCONN is returned.
+ * A getxattr is then sent to the bricks that answered the lookup: a brick
+ * whose getxattr failed has its reply invalidated, and on the others a
+ * non-NULL lookup xdata is replaced by the getxattr dictionary (or left
+ * NULL when getxattr returned none). Finally
+ * ec_heal_metadata_find_direction() fills versions, dirty, sources and
+ * healed_sinks.
+ * Returns the source index (>= 0) or a negative errno (-ENOTCONN, -EIO). */
+int
+__ec_heal_metadata_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on,
+ default_args_cbk_t *replies, uint64_t *versions,
+ uint64_t *dirty, unsigned char *sources,
+ unsigned char *healed_sinks)
{
- ec_cbk_data_t * cbk;
- ec_heal_t * heal = fop->data;
- uintptr_t mask = 0;
-
- if (heal->iatt.ia_type == IA_INVAL)
- {
- return;
+ loc_t loc = {0};
+ unsigned char *output = NULL;
+ unsigned char *lookup_on = NULL;
+ int ret = 0;
+ int source = 0;
+ default_args_cbk_t *greplies = NULL;
+ int i = 0;
+ EC_REPLIES_ALLOC(greplies, ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ output = alloca0(ec->nodes);
+ lookup_on = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, &loc, NULL);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
}
- list_for_each_entry(cbk, &fop->cbk_list, list)
- {
- if ((cbk->op_ret >= 0) || (cbk->op_errno == ENOENT) ||
- (cbk->op_errno == ENOTDIR))
- {
- mask |= cbk->mask;
+ memcpy(lookup_on, output, ec->nodes);
+ /*Use getxattr to get the filtered xattrs which filter internal xattrs*/
+ ret = cluster_getxattr(ec->xl_list, lookup_on, ec->nodes, greplies, output,
+ frame, ec->xl, &loc, NULL, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ /* Lookup succeeded but getxattr did not: drop this brick. */
+ if (lookup_on[i] && !output[i]) {
+ replies[i].valid = 0;
+ continue;
+ }
+ if (replies[i].xdata) {
+ dict_unref(replies[i].xdata);
+ replies[i].xdata = NULL;
+ if (greplies[i].xattr)
+ replies[i].xdata = dict_ref(greplies[i].xattr);
}
}
- if (mask != 0)
- {
- ec_heal_create(heal, mask, 0);
+ source = ec_heal_metadata_find_direction(ec, replies, versions, dirty,
+ sources, healed_sinks);
+ if (source < 0) {
+ ret = -EIO;
+ goto out;
}
+ ret = source;
+out:
+ cluster_replies_wipe(greplies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
}
-int32_t ec_heal_rmdir_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno,
- struct iatt * preparent, struct iatt * postparent,
- dict_t * xdata)
+/* Metadata heal */
+/* Removes from the participating bricks the xattrs the source lacks.
+ * For every brick other than 'source' that is flagged in sources[] or
+ * healed_sinks[], replies[i].xdata is first reduced with
+ * ec_heal_xattr_clean() to the keys that are neither ignorable nor present
+ * in the source's xdata. If keys remain they are removed from the brick
+ * with syncop_removexattr(); a brick that was flagged as a source is
+ * demoted to a sink in that case. A brick is dropped from healed_sinks[]
+ * when the removal fails, and from both arrays when the reduction fails.
+ * Returns 0, or -ENOTCONN when no healed sink is left. */
+int
+__ec_removexattr_sinks(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks, default_args_cbk_t *replies)
{
- ec_heal_update(cookie, 0);
- ec_heal_recreate(cookie);
+ int i = 0;
+ int ret = 0;
+ loc_t loc = {0};
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (i == source)
+ continue;
+ if (!sources[i] && !healed_sinks[i])
+ continue;
+ ret = dict_foreach(replies[i].xdata, ec_heal_xattr_clean,
+ replies[source].xdata);
+ if (ret < 0) {
+ sources[i] = 0;
+ healed_sinks[i] = 0;
+ continue;
+ }
+
+ /* Nothing extra on this brick: no removexattr needed. */
+ if (replies[i].xdata->count == 0) {
+ continue;
+ } else if (sources[i]) {
+ /* This can happen if setxattr/removexattr succeeds on
+ * the bricks but fails to update the version. This
+ * will make sure that the xattrs are made equal after
+ * heal*/
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ ret = syncop_removexattr(ec->xl_list[i], &loc, "", replies[i].xdata,
+ NULL);
+ if (ret < 0)
+ healed_sinks[i] = 0;
+ }
+
+ loc_wipe(&loc);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0)
+ return -ENOTCONN;
return 0;
}
-int32_t ec_heal_unlink_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- struct iatt * preparent, struct iatt * postparent,
- dict_t * xdata)
+/* Heals the metadata of 'inode' on the bricks in locked_on.
+ * Steps: (1) __ec_heal_metadata_prepare() chooses the source and fills
+ * sources[] / healed_sinks[]; (2) if every brick is a source or there is no
+ * sink, only the version/dirty xattrs are adjusted; otherwise (3) mode, uid
+ * and gid of the source are set on the sinks, (4) xattrs the source lacks
+ * are removed from them, (5) the source xattrs, minus the ignorable keys,
+ * are set on them, and (6) ec_adjust_versions() updates version and dirty
+ * for EC_METADATA_TXN. After steps 3 and 5 healed_sinks[] keeps only the
+ * bricks on which the operation succeeded.
+ * Returns the result of ec_adjust_versions() or a negative errno
+ * (-EIO, -ENOTCONN, -ENOMEM) from an earlier step. */
+int
+__ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *healed_sinks)
{
- ec_heal_update(cookie, 0);
- ec_heal_recreate(cookie);
+ loc_t loc = {0};
+ int ret = 0;
+ int source = 0;
+ default_args_cbk_t *replies = NULL;
+ default_args_cbk_t *sreplies = NULL;
+ uint64_t *versions = NULL;
+ uint64_t *dirty = NULL;
+ unsigned char *output = NULL;
+ dict_t *source_dict = NULL;
+ struct iatt source_buf = {0};
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ EC_REPLIES_ALLOC(sreplies, ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ output = alloca0(ec->nodes);
+ versions = alloca0(ec->nodes * sizeof(*versions));
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ source = __ec_heal_metadata_prepare(frame, ec, inode, locked_on, replies,
+ versions, dirty, sources, healed_sinks);
+ if (source < 0) {
+ ret = -EIO;
+ goto out;
+ }
- return 0;
+ /* Nothing to copy: go straight to the version/dirty adjustment. */
+ if ((EC_COUNT(sources, ec->nodes) == ec->nodes) ||
+ (EC_COUNT(healed_sinks, ec->nodes) == 0)) {
+ ret = 0;
+ goto erase_dirty;
+ }
+
+ source_buf = replies[source].stat;
+ ret = cluster_setattr(ec->xl_list, healed_sinks, ec->nodes, sreplies,
+ output, frame, ec->xl, &loc, &source_buf,
+ GF_SET_ATTR_MODE | GF_SET_ATTR_UID | GF_SET_ATTR_GID,
+ NULL);
+ /*In case the operation fails on some of the subvols*/
+ memcpy(healed_sinks, output, ec->nodes);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ ret = __ec_removexattr_sinks(frame, ec, inode, source, sources,
+ healed_sinks, replies);
+ if (ret < 0)
+ goto out;
+
+ /* Drop the ignorable keys from the source dictionary before copying. */
+ source_dict = dict_ref(replies[source].xdata);
+ if (dict_foreach_match(source_dict, ec_ignorable_key_match, NULL,
+ dict_remove_foreach_fn, NULL) == -1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = cluster_setxattr(ec->xl_list, healed_sinks, ec->nodes, replies,
+ output, frame, ec->xl, &loc, source_dict, 0, NULL);
+
+ EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+erase_dirty:
+ ret = ec_adjust_versions(frame, ec, EC_METADATA_TXN, inode, source, sources,
+ healed_sinks, versions, dirty);
+out:
+ if (source_dict)
+ dict_unref(source_dict);
+
+ loc_wipe(&loc);
+ cluster_replies_wipe(replies, ec->nodes);
+ cluster_replies_wipe(sreplies, ec->nodes);
+ return ret;
}
-int32_t ec_heal_init(ec_fop_data_t * fop)
+/* Locked entry point of metadata heal.
+ * Takes an inodelk (offset 0, length 0, domain ec->xl->name) on the bricks
+ * set in ec->xl_up. If more than ec->fragments bricks could be locked,
+ * __ec_heal_metadata() is run on them; otherwise the heal is skipped with
+ * -ENOTCONN. The bricks in locked_on are unlocked in both cases.
+ * sources[] and healed_sinks[] are filled by __ec_heal_metadata().
+ * Returns the result of __ec_heal_metadata() or -ENOTCONN. */
+int
+ec_heal_metadata(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks)
{
- ec_t * ec = fop->xl->private;
- struct iobuf_pool * pool;
- inode_t * inode;
- ec_inode_t * ctx;
- ec_heal_t * heal = NULL;
- int32_t error = 0;
-
- inode = fop->loc[0].inode;
- if (inode == NULL)
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, up_subvols, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, inode, 0, 0);
{
- gf_log(fop->xl->name, GF_LOG_WARNING, "Unable to start inode healing "
- "because there is not enough "
- "information");
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_metadata(frame, ec, inode, locked_on, sources,
+ healed_sinks);
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+
+/*entry heal*/
+/* Gathers the information needed to heal a directory and picks the
+ * direction. Looks the inode up on the bricks in locked_on, asking for the
+ * EC_XATTR_VERSION and EC_XATTR_DIRTY xattrs through xdata; more than
+ * ec->fragments bricks must answer, otherwise -ENOTCONN is returned.
+ * ec_heal_entry_find_direction() then fills versions, dirty, sources and
+ * healed_sinks.
+ * Returns the source index (>= 0) or a negative errno
+ * (-ENOMEM, -ENOTCONN, -EIO). */
+int
+__ec_heal_entry_prepare(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, uint64_t *versions,
+ uint64_t *dirty, unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ loc_t loc = {0};
+ int source = 0;
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ dict_t *xdata = NULL;
- return ENODATA;
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ xdata = dict_new();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
}
- heal = GF_MALLOC(sizeof(ec_heal_t), ec_mt_ec_heal_t);
- if (heal == NULL)
- {
- return ENOMEM;
+ /* Request the version and dirty xattrs together with the lookup. */
+ if (dict_set_uint64(xdata, EC_XATTR_VERSION, 0) ||
+ dict_set_uint64(xdata, EC_XATTR_DIRTY, 0)) {
+ ret = -ENOMEM;
+ goto out;
}
- memset(heal, 0, sizeof(ec_heal_t));
+ output = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, &loc, xdata);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
- if (!ec_loc_from_loc(fop->xl, &heal->loc, &fop->loc[0]))
- {
- error = ENOMEM;
+ source = ec_heal_entry_find_direction(ec, replies, versions, dirty, sources,
+ healed_sinks);
+ if (source < 0) {
+ ret = -EIO;
+ goto out;
+ }
+ ret = source;
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
+int32_t
+ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
+ call_frame_t *frame, xlator_t *this, unsigned char *on)
+{
+ dict_t *xattr = NULL;
+ int32_t ret = -1;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ uint64_t dirty[EC_VERSION_SIZE] = {1, 1};
+ loc_t newloc = {0};
+
+ /*Symlinks don't have any data to be healed*/
+ if (ia->ia_type == IA_IFLNK)
+ dirty[EC_DATA_TXN] = 0;
+
+ newloc.inode = inode_ref(loc->inode);
+ gf_uuid_copy(newloc.gfid, ia->ia_gfid);
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+ xattr = dict_new();
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = ec_dict_set_array(xattr, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
+ if (ret)
+ goto out;
+
+ ret = cluster_xattrop(ec->xl_list, on, ec->nodes, replies, output, frame,
+ ec->xl, &newloc, GF_XATTROP_ADD_ARRAY64, xattr, NULL);
+
+ if (ret < ec->fragments) {
+ ret = -ENOTCONN;
goto out;
}
- LOCK_INIT(&heal->lock);
+out:
+ if (xattr)
+ dict_unref(xattr);
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&newloc);
+ return ret;
+}
- heal->xl = fop->xl;
- heal->fop = fop;
- pool = fop->xl->ctx->iobuf_pool;
- heal->size = iobpool_default_pagesize(pool) * ec->fragments;
+/*Name heal*/
+int
+ec_delete_stale_name(dict_t *gfid_db, char *key, data_t *d, void *data)
+{
+ struct ec_name_data *name_data = data;
+ struct iatt *ia = NULL;
+ ec_t *ec = NULL;
+ loc_t loc = {0};
+ unsigned char *same = data_to_bin(d);
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ int ret = 0;
+ int estale_count = 0;
+ int i = 0;
+ call_frame_t *frame = name_data->frame;
+ uuid_t gfid;
+
+ ec = name_data->frame->this->private;
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ if (EC_COUNT(same, ec->nodes) >= ec->fragments) {
+ ret = 0;
+ goto out;
+ }
- LOCK(&inode->lock);
+ loc.parent = inode_ref(name_data->parent);
+ loc.inode = inode_new(name_data->parent->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ctx = __ec_inode_get(inode, fop->xl);
- if (ctx == NULL)
- {
- error = EIO;
+ gf_uuid_parse(key, gfid);
+ gf_uuid_copy(loc.pargfid, name_data->parent->gfid);
+ loc.name = name_data->name;
+ output = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, name_data->participants, ec->nodes,
+ replies, output, name_data->frame, ec->xl, &loc, NULL);
- goto unlock;
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1) {
+ if (replies[i].op_errno == ESTALE || replies[i].op_errno == ENOENT)
+ estale_count++;
+ else
+ name_data->participants[i] = 0;
+ } else if (gf_uuid_compare(gfid, replies[i].stat.ia_gfid)) {
+ estale_count++;
+ gf_msg_debug(ec->xl->name, 0, "%s/%s: different gfid as %s",
+ uuid_utoa(name_data->parent->gfid), name_data->name,
+ key);
+ }
}
- if (ctx->heal != NULL)
- {
- error = EEXIST;
+ if (estale_count <= ec->redundancy) {
+ /* We have at least ec->fragments number of fragments, so the
+ * file is recoverable, so don't delete it*/
- goto unlock;
+ /* Please note that the lookup call above could fail with
+ * ENOTCONN on all subvolumes and still this branch will be
+ * true, but in those cases we conservatively decide not to
+ * delete the file until we are sure */
+ ret = 0;
+ goto out;
}
- fop->data = heal;
+ /* No way to recover, delete the name */
+ loc_wipe(&loc);
+ loc.parent = inode_ref(name_data->parent);
+ gf_uuid_copy(loc.pargfid, loc.parent->gfid);
+ loc.name = name_data->name;
+ for (i = 0; i < ec->nodes; i++) {
+ if (same[i] && replies[i].valid && (replies[i].op_ret == 0)) {
+ ia = &replies[i].stat;
+ break;
+ }
+ }
- ctx->heal = heal;
- heal = NULL;
+ if (!ia) {
+ ret = -ENOTCONN;
+ goto out;
+ }
-unlock:
- UNLOCK(&inode->lock);
+ if (IA_ISDIR(ia->ia_type)) {
+ ret = cluster_rmdir(ec->xl_list, same, ec->nodes, replies, output,
+ frame, ec->xl, &loc, 1, NULL);
+ gf_msg_debug(ec->xl->name, 0,
+ "cluster rmdir succeeded on %d "
+ "nodes",
+ ret);
+ } else {
+ ret = cluster_unlink(ec->xl_list, same, ec->nodes, replies, output,
+ frame, ec->xl, &loc, 0, NULL);
+ gf_msg_debug(ec->xl->name, 0,
+ "cluster unlink succeeded on %d "
+ "nodes",
+ ret);
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (output[i]) {
+ same[i] = 0;
+ name_data->enoent[i] = 1;
+ } else {
+ /*op failed*/
+ if (same[i])
+ name_data->participants[i] = 0;
+ }
+ }
+ ret = 0;
+ /*This will help in making decisions about creating names*/
+ dict_del(gfid_db, key);
out:
- GF_FREE(heal);
+ if (ret < 0) {
+ gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s",
+ uuid_utoa(name_data->parent->gfid), name_data->name,
+ strerror(-ret));
+ }
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
+}
- return error;
+int
+ec_delete_stale_names(call_frame_t *frame, ec_t *ec, inode_t *parent,
+ char *name, default_args_cbk_t *replies, dict_t *gfid_db,
+ unsigned char *enoent, unsigned char *gfidless,
+ unsigned char *participants)
+{
+ struct ec_name_data name_data = {0};
+
+ name_data.enoent = enoent;
+ name_data.gfidless = gfidless;
+ name_data.participants = participants;
+ name_data.name = name;
+ name_data.parent = parent;
+ name_data.frame = frame;
+ name_data.replies = replies;
+ return dict_foreach(gfid_db, ec_delete_stale_name, &name_data);
}
-void ec_heal_entrylk(ec_heal_t * heal, entrylk_cmd cmd)
+int
+_assign_same(dict_t *dict, char *key, data_t *value, void *data)
{
- loc_t loc;
- char * name;
- int32_t error;
+ struct ec_name_data *name_data = data;
- error = ec_loc_parent(heal->xl, &heal->loc, &loc, &name);
- if (error != 0)
- {
- ec_fop_set_error(heal->fop, error);
+ name_data->same = data_to_bin(value);
+ return 0;
+}
- return;
+int
+ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ default_args_cbk_t *lookup_replies, dict_t *gfid_db,
+ unsigned char *enoent, unsigned char *participants)
+{
+ int ret = 0;
+ int i = 0;
+ struct ec_name_data name_data = {0};
+ struct iatt *ia = NULL;
+ unsigned char *output = 0;
+ unsigned char *output1 = 0;
+ unsigned char *on = NULL;
+ default_args_cbk_t *replies = NULL;
+ loc_t loc = {0};
+ loc_t srcloc = {0};
+ unsigned char *link = NULL;
+ unsigned char *create = NULL;
+ dict_t *xdata = NULL;
+ char *linkname = NULL;
+ ec_config_t config;
+
+ /* There should be just one gfid key */
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ if (gfid_db->count != 1) {
+ ret = -EINVAL;
+ goto out;
}
- ec_entrylk(heal->fop->frame, heal->xl, -1, EC_MINIMUM_ALL, NULL, NULL,
- heal->xl->name, &loc, name, cmd, ENTRYLK_WRLCK, NULL);
+ ret = dict_foreach(gfid_db, _assign_same, &name_data);
+ if (ret < 0)
+ goto out;
+ /*There should at least be one valid success reply with gfid*/
+ for (i = 0; i < ec->nodes; i++)
+ if (name_data.same[i])
+ break;
- loc_wipe(&loc);
- GF_FREE(name);
-}
+ if (i == ec->nodes) {
+ ret = -EINVAL;
+ goto out;
+ }
-void ec_heal_inodelk(ec_heal_t * heal, int32_t type, int32_t use_fd,
- off_t offset, size_t size)
-{
- struct gf_flock flock;
+ ia = &lookup_replies[i].stat;
+ xdata = dict_new();
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.inode = inode_new(parent->table);
+ if (loc.inode)
+ srcloc.inode = inode_ref(loc.inode);
+ gf_uuid_copy(srcloc.gfid, ia->ia_gfid);
+ if (!loc.inode || !xdata ||
+ dict_set_static_bin(xdata, "gfid-req", ia->ia_gfid,
+ sizeof(ia->ia_gfid))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ link = alloca0(ec->nodes);
+ create = alloca0(ec->nodes);
+ on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ output1 = alloca0(ec->nodes);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!lookup_replies[i].valid)
+ continue;
+ if (lookup_replies[i].op_ret)
+ continue;
+ on[i] = 1;
+ }
+ switch (ia->ia_type) {
+ case IA_IFDIR:
+ ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+ (void)cluster_mkdir(
+ ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl,
+ &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type), 0, xdata);
+ break;
- flock.l_type = type;
- flock.l_whence = SEEK_SET;
- flock.l_start = offset;
- flock.l_len = size;
- flock.l_pid = 0;
- flock.l_owner.len = 0;
+ case IA_IFLNK:
+ /*Check for hard links and create/link*/
+ ret = cluster_lookup(ec->xl_list, enoent, ec->nodes, replies,
+ output, frame, ec->xl, &srcloc, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (output[i]) {
+ link[i] = 1;
+ } else {
+ if (replies[i].op_errno == ENOENT ||
+ replies[i].op_errno == ESTALE) {
+ create[i] = 1;
+ }
+ }
+ }
- if (use_fd)
- {
- ec_finodelk(heal->fop->frame, heal->xl, heal->fop->mask,
- EC_MINIMUM_ALL, NULL, NULL, heal->xl->name, heal->fd,
- F_SETLKW, &flock, NULL);
+ if (EC_COUNT(link, ec->nodes)) {
+ cluster_link(ec->xl_list, link, ec->nodes, replies, output1,
+ frame, ec->xl, &srcloc, &loc, NULL);
+ }
+
+ if (EC_COUNT(create, ec->nodes)) {
+ cluster_readlink(ec->xl_list, name_data.same, ec->nodes,
+ replies, output, frame, ec->xl, &srcloc, 4096,
+ NULL);
+ if (EC_COUNT(output, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (output[i])
+ break;
+ }
+ linkname = alloca0(strlen(replies[i].buf) + 1);
+ strcpy(linkname, replies[i].buf);
+ ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+ cluster_symlink(ec->xl_list, create, ec->nodes, replies, output,
+ frame, ec->xl, linkname, &loc, 0, xdata);
+ }
+ for (i = 0; i < ec->nodes; i++)
+ if (output1[i])
+ output[i] = 1;
+ break;
+ case IA_IFREG:
+ ec_set_new_entry_dirty(ec, &loc, ia, frame, ec->xl, on);
+ config.version = EC_CONFIG_VERSION;
+ config.algorithm = EC_CONFIG_ALGORITHM;
+ config.gf_word_size = EC_GF_BITS;
+ config.bricks = ec->nodes;
+ config.redundancy = ec->redundancy;
+ config.chunk_size = EC_METHOD_CHUNK_SIZE;
+
+ ret = ec_dict_set_config(xdata, EC_XATTR_CONFIG, &config);
+ if (ret != 0) {
+ goto out;
+ }
+
+ /* Fall through */
+
+ default:
+ ret = dict_set_int32(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
+ goto out;
+ ret = cluster_mknod(
+ ec->xl_list, enoent, ec->nodes, replies, output, frame, ec->xl,
+ &loc, st_mode_from_ia(ia->ia_prot, ia->ia_type),
+ makedev(ia_major(ia->ia_rdev), ia_minor(ia->ia_rdev)), 0,
+ xdata);
+ break;
}
- else
- {
- ec_inodelk(heal->fop->frame, heal->xl, heal->fop->mask, EC_MINIMUM_ALL,
- NULL, NULL, heal->xl->name, &heal->loc, F_SETLKW, &flock,
- NULL);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (enoent[i] && !output[i])
+ participants[i] = 0;
}
+
+ ret = 0;
+out:
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s/%s: heal failed %s",
+ uuid_utoa(parent->gfid), name, strerror(-ret));
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ loc_wipe(&srcloc);
+ if (xdata)
+ dict_unref(xdata);
+ return ret;
}
-void ec_heal_lookup(ec_heal_t * heal)
+int
+__ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ unsigned char *participants)
{
- dict_t * xdata;
- int32_t error = ENOMEM;
-
+ unsigned char *output = NULL;
+ unsigned char *enoent = NULL;
+ default_args_cbk_t *replies = NULL;
+ dict_t *xdata = NULL;
+ dict_t *gfid_db = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int i = 0;
+ struct iatt *ia = NULL;
+ char gfid[64] = {0};
+ unsigned char *same = NULL;
+ unsigned char *gfidless = NULL;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ loc.parent = inode_ref(parent);
+ loc.inode = inode_new(parent->table);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
xdata = dict_new();
- if (xdata == NULL)
- {
+ gfid_db = dict_new();
+ if (!xdata || !gfid_db || !loc.inode) {
+ ret = -ENOMEM;
goto out;
}
- if (dict_set_uint64(xdata, "list-xattr", 0) != 0)
- {
+
+ ret = dict_set_int32(xdata, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ ret = -ENOMEM;
goto out;
}
- ec_lookup(heal->fop->frame, heal->xl, heal->fop->mask, EC_MINIMUM_MIN,
- ec_heal_inode_lookup_cbk, heal, &heal->loc, xdata);
+ output = alloca0(ec->nodes);
+ gfidless = alloca0(ec->nodes);
+ enoent = alloca0(ec->nodes);
+ ret = cluster_lookup(ec->xl_list, participants, ec->nodes, replies, output,
+ frame, ec->xl, &loc, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
- error = 0;
+ if (replies[i].op_ret == -1) {
+ /*If ESTALE comes here, that means parent dir is not
+ * present, nothing to do there, so reset participants
+ * for that brick*/
+ if (replies[i].op_errno == ENOENT)
+ enoent[i] = 1;
+ else
+ participants[i] = 0;
+ continue;
+ }
+ ia = &replies[i].stat;
+ if (gf_uuid_is_null(ia->ia_gfid)) {
+ if (IA_ISDIR(ia->ia_type) || ia->ia_size == 0)
+ gfidless[i] = 1;
+ else
+ participants[i] = 0;
+ } else {
+ uuid_utoa_r(ia->ia_gfid, gfid);
+ ret = dict_get_bin(gfid_db, gfid, (void **)&same);
+ if (ret < 0) {
+ same = alloca0(ec->nodes);
+ }
+ same[i] = 1;
+ if (ret < 0) {
+ ret = dict_set_static_bin(gfid_db, gfid, same, ec->nodes);
+ }
+ if (ret < 0)
+ goto out;
+ }
+ }
-out:
- if (xdata != NULL)
- {
- dict_unref(xdata);
+ ret = ec_delete_stale_names(frame, ec, parent, name, replies, gfid_db,
+ enoent, gfidless, participants);
+
+ if (gfid_db->count == 0) {
+ /* All entries seem to be stale entries and deleted,
+ * nothing more to do.*/
+ goto out;
}
- ec_fop_set_error(heal->fop, error);
+ if (gfid_db->count > 1) {
+ gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+ "%s/%s: Not able to heal", uuid_utoa(parent->gfid), name);
+ memset(participants, 0, ec->nodes);
+ goto out;
+ }
+
+ EC_INTERSECT(enoent, enoent, participants, ec->nodes);
+ if (EC_COUNT(enoent, ec->nodes) == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
+ participants);
+ if (ret >= 0) {
+ /* If ec_create_name() succeeded we return 1 to indicate that a new
+ * file has been created and it will need to be healed. */
+ ret = 1;
+ }
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ if (xdata)
+ dict_unref(xdata);
+ if (gfid_db)
+ dict_unref(gfid_db);
+ return ret;
}
-void ec_heal_remove(ec_heal_t * heal, ec_cbk_data_t * cbk)
+int
+ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ unsigned char *participants)
{
- if (cbk->iatt[0].ia_type == IA_IFDIR)
- {
- // TODO: Remove directory recursively ?
- ec_rmdir(heal->fop->frame, heal->xl, cbk->mask, EC_MINIMUM_ONE,
- ec_heal_rmdir_cbk, heal, &heal->loc, 0, NULL);
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ unsigned char *locked_on = NULL;
+ loc_t loc = {0};
+
+ loc.parent = inode_ref(parent);
+ loc.name = name;
+ loc.inode = inode_new(parent->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
}
- else
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, parent, 0, 0);
{
- ec_unlink(heal->fop->frame, heal->xl, cbk->mask, EC_MINIMUM_ONE,
- ec_heal_unlink_cbk, heal, &heal->loc, 0, NULL);
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s/%s: Skipping "
+ "heal as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(parent->gfid), name, ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ EC_INTERSECT(participants, participants, locked_on, ec->nodes);
+ ret = __ec_heal_name(frame, ec, parent, name, participants);
}
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, parent, 0, 0);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
}
-void ec_heal_remove_others(ec_heal_t * heal)
+int
+ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
{
- struct list_head * item;
- ec_cbk_data_t * cbk;
+ struct ec_name_data *name_data = data;
+ xlator_t *this = THIS;
+ ec_t *ec = this->private;
+ unsigned char *name_on = alloca0(ec->nodes);
+ int i = 0;
+ int ret = 0;
+
+ if (ec->shutdown) {
+ gf_msg_debug(this->name, 0,
+ "Cancelling directory heal "
+ "because EC is stopping.");
+ return -ENOTCONN;
+ }
- item = heal->lookup->cbk_list.next;
- do
- {
- item = item->next;
- cbk = list_entry(item, ec_cbk_data_t, list);
+ memcpy(name_on, name_data->participants, ec->nodes);
+ ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
+ name_on);
- if (cbk->op_ret < 0)
- {
- if ((cbk->op_errno != ENOENT) && (cbk->op_errno != ENOTDIR))
- {
- gf_log(heal->xl->name, GF_LOG_WARNING, "Don't know how to "
- "remove inode with "
- "error %d",
- cbk->op_errno);
- }
+ if (ret < 0) {
+ memset(name_on, 0, ec->nodes);
+ } else {
+ name_data->heal_pending += ret;
+ }
+
+ for (i = 0; i < ec->nodes; i++)
+ if (name_data->participants[i] && !name_on[i])
+ name_data->failed_on[i] = 1;
- ec_heal_exclude(heal, cbk->mask);
+ return 0;
+}
+int
+ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *participants, uint32_t *pending)
+{
+ int i = 0;
+ int j = 0;
+ loc_t loc = {0};
+ struct ec_name_data name_data = {0};
+ int ret = 0;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ name_data.frame = frame;
+ name_data.participants = participants;
+ name_data.failed_on = alloca0(ec->nodes);
+ name_data.heal_pending = 0;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!participants[i])
continue;
+ ret = syncop_dir_scan(ec->xl_list[i], &loc, GF_CLIENT_PID_SELF_HEALD,
+ &name_data, ec_name_heal_handler);
+ if (ret < 0) {
+ break;
}
+ for (j = 0; j < ec->nodes; j++)
+ if (name_data.failed_on[j])
+ participants[j] = 0;
+
+ if (EC_COUNT(participants, ec->nodes) <= ec->fragments) {
+ ret = -ENOTCONN;
+ break;
+ }
+ }
+ *pending += name_data.heal_pending;
- ec_heal_remove(heal, cbk);
- } while (item->next != &heal->lookup->cbk_list);
+ loc_wipe(&loc);
+ return ret;
}
-void ec_heal_prepare_others(ec_heal_t * heal)
+int
+__ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *heal_on, unsigned char *sources,
+ unsigned char *healed_sinks, uint32_t *pending)
{
- struct list_head * item;
- ec_cbk_data_t * cbk;
-
- item = heal->lookup->cbk_list.next;
- while (item->next != &heal->lookup->cbk_list)
+ unsigned char *locked_on = NULL;
+ unsigned char *output = NULL;
+ uint64_t *versions = NULL;
+ uint64_t *dirty = NULL;
+ unsigned char *participants = NULL;
+ default_args_cbk_t *replies = NULL;
+ int ret = 0;
+ int source = 0;
+ int i = 0;
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ versions = alloca0(ec->nodes * sizeof(*versions));
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on,
+ frame, ec->xl, ec->xl->name, inode, 0, 0);
{
- item = item->next;
- cbk = list_entry(item, ec_cbk_data_t, list);
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_entry_prepare(frame, ec, inode, locked_on, versions,
+ dirty, sources, healed_sinks);
+ source = ret;
+ }
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, inode, 0, 0);
+ if (ret < 0)
+ goto out;
- if (cbk->op_ret < 0)
- {
- if (cbk->op_errno == ENOENT)
- {
- ec_heal_create(heal, cbk->mask, 1);
- }
- else
- {
- gf_log(heal->xl->name, GF_LOG_ERROR, "Don't know how to "
- "heal error %d",
- cbk->op_errno);
+ participants = alloca0(ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i] || healed_sinks[i])
+ participants[i] = 1;
+ }
+ ret = ec_heal_names(frame, ec, inode, participants, pending);
- ec_heal_exclude(heal, cbk->mask);
- }
- }
- else
- {
- if ((heal->iatt.ia_type != cbk->iatt[0].ia_type) ||
- (uuid_compare(heal->iatt.ia_gfid, cbk->iatt[0].ia_gfid) != 0))
- {
- ec_heal_remove(heal, cbk);
- }
+ if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
+ goto out;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!participants[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 0;
}
}
+
+ ec_adjust_versions(frame, ec, EC_DATA_TXN, inode, source, sources,
+ healed_sinks, versions, dirty);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
}
-int32_t ec_heal_readlink_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- const char * path, struct iatt * buf,
- dict_t * xdata)
+/*
+ * Top-level entry heal for 'inode'.
+ *
+ * Takes the tie-breaker inodelk in the "<xlator name>:self-heal" domain on
+ * all subvolumes that are currently up. If more than ec->fragments of them
+ * could be locked, __ec_heal_entry() is run on the locked subvolumes.
+ * The lock is released and the replies are wiped on every path.
+ *
+ * Returns the result of __ec_heal_entry(), or -ENOTCONN when too few
+ * subvolumes could be locked.
+ */
+int
+ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks,
+ uint32_t *pending)
{
- ec_fop_data_t * fop = cookie;
- ec_heal_t * heal = fop->data;
-
- if (op_ret >= 0)
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ char selfheal_domain[1024] = {0};
+ int ret = 0;
+ default_args_cbk_t *replies = NULL;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+
+ /* Bounded formatting: never write past selfheal_domain. The same buffer
+ * is used for both lock and unlock, so the domain stays consistent. */
+ snprintf(selfheal_domain, sizeof(selfheal_domain), "%s:self-heal",
+ ec->xl->name);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+ /* If other processes are already doing the heal, don't block */
+ ret = cluster_tiebreaker_inodelk(ec->xl_list, up_subvols, ec->nodes,
+ replies, locked_on, frame, ec->xl,
+ selfheal_domain, inode, 0, 0);
{
- heal->symlink = gf_strdup(path);
- if (heal->symlink != NULL)
- {
- ec_heal_prepare_others(heal);
- }
- else
- {
- ec_fop_set_error(fop, EIO);
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
}
+ ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
+ healed_sinks, pending);
}
-
- return 0;
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, selfheal_domain, inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
}
-ec_cbk_data_t * ec_heal_lookup_check(ec_heal_t * heal, uintptr_t * pgood,
- uintptr_t * pbad)
+/*Find direction for data heal and heal info*/
+int
+ec_heal_data_find_direction(ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *data_versions, uint64_t *dirty,
+ uint64_t *size, unsigned char *sources,
+ unsigned char *healed_sinks,
+ gf_boolean_t check_ondisksize, int which)
{
- ec_fop_data_t * fop = heal->lookup;
- ec_cbk_data_t * cbk = NULL, * ans = NULL;
- uintptr_t good = 0, bad = 0;
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
+ char version_size[128] = {0};
+ dict_t *version_size_db = NULL;
+ unsigned char *same = NULL;
+ int max_same_count = 0;
+ int source = 0;
+ int i = 0;
+ int ret = 0;
+ dict_t *dict = NULL;
+ uint64_t source_size = 0;
+
+ version_size_db = dict_new();
+ if (!version_size_db) {
+ ret = -ENOMEM;
+ goto out;
+ }
- list_for_each_entry(ans, &fop->cbk_list, list)
- {
- if ((ans->op_ret < 0) && (ans->op_errno == ENOTCONN))
- {
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
continue;
+ if (replies[i].op_ret < 0)
+ continue;
+ dict = (which == EC_COMBINE_XDATA) ? replies[i].xdata
+ : replies[i].xattr;
+
+ ret = ec_dict_get_array(dict, EC_XATTR_VERSION, xattr, EC_VERSION_SIZE);
+ if (ret == 0) {
+ data_versions[i] = xattr[EC_DATA_TXN];
}
- if (ans == fop->answer)
- {
- good |= ans->mask;
- cbk = ans;
+ memset(xattr, 0, sizeof(xattr));
+ ret = ec_dict_get_array(dict, EC_XATTR_DIRTY, xattr, EC_VERSION_SIZE);
+ if (ret == 0) {
+ dirty[i] = xattr[EC_DATA_TXN];
}
- else
- {
- bad |= ans->mask;
+ ret = ec_dict_del_number(dict, EC_XATTR_SIZE, &size[i]);
+ /* Build a db keyed by "<data version>-<size>" to group bricks that agree on both */
+ snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64,
+ data_versions[i], size[i]);
+
+ ret = dict_get_bin(version_size_db, version_size, (void **)&same);
+ if (ret < 0) {
+ same = alloca0(ec->nodes);
+ }
+
+ same[i] = 1;
+ if (max_same_count < EC_COUNT(same, ec->nodes)) {
+ max_same_count = EC_COUNT(same, ec->nodes);
+ source = i;
+ }
+
+ if (ret < 0) {
+ ret = dict_set_static_bin(version_size_db, version_size, same,
+ ec->nodes);
+ }
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ /* If fewer than ec->fragments bricks agree on the same version and size,
+ * the data is not recoverable */
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ } else {
+ snprintf(version_size, sizeof(version_size), "%" PRIu64 "-%" PRIu64,
+ data_versions[source], size[source]);
+
+ ret = dict_get_bin(version_size_db, version_size, (void **)&same);
+ if (ret < 0)
+ goto out;
+ memcpy(sources, same, ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (replies[i].valid && (replies[i].op_ret == 0) && !sources[i])
+ healed_sinks[i] = 1;
}
}
- *pgood = good;
- *pbad = bad;
+ /* Some bricks may report the same version and size xattrs while their
+ * on-disk ia_size differs (e.g. after a disk crash); mark those bricks
+ * as sinks as well */
+
+ if (check_ondisksize) {
+ source_size = size[source];
+ ec_adjust_size_up(ec, &source_size, _gf_true);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ if (replies[i].stat.ia_size != source_size) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ max_same_count--;
+ } else {
+ source = i;
+ }
+ }
+ }
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ }
+ }
- return cbk;
+ ret = source;
+out:
+ if (version_size_db)
+ dict_unref(version_size_db);
+ return ret;
}
-void ec_heal_prepare(ec_heal_t * heal)
+int
+__ec_heal_data_prepare(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *locked_on, uint64_t *versions,
+ uint64_t *dirty, uint64_t *size, unsigned char *sources,
+ unsigned char *healed_sinks, unsigned char *trim,
+ struct iatt *stbuf)
{
- ec_cbk_data_t * cbk;
- ec_fd_t * ctx;
- int32_t error = ENOMEM;
+ default_args_cbk_t *replies = NULL;
+ default_args_cbk_t *fstat_replies = NULL;
+ unsigned char *output = NULL;
+ unsigned char *fstat_output = NULL;
+ dict_t *xattrs = NULL;
+ uint64_t zero_array[2] = {0};
+ int source = 0;
+ int ret = 0;
+ uint64_t zero_value = 0;
+ int i = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ EC_REPLIES_ALLOC(fstat_replies, ec->nodes);
+ output = alloca0(ec->nodes);
+ fstat_output = alloca0(ec->nodes);
+ xattrs = dict_new();
+ if (!xattrs ||
+ dict_set_static_bin(xattrs, EC_XATTR_VERSION, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xattrs, EC_XATTR_DIRTY, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xattrs, EC_XATTR_SIZE, &zero_value,
+ sizeof(zero_value))) {
+ ret = -ENOMEM;
+ goto out;
+ }
- heal->available = heal->good;
+ ret = cluster_fxattrop(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs,
+ NULL);
- cbk = heal->lookup->answer;
- if (cbk->op_ret < 0)
- {
- if ((cbk->op_errno == ENOENT) || (cbk->op_errno == ENOTDIR))
- {
- ec_heal_remove_others(heal);
- }
- else
- {
- gf_log(heal->xl->name, GF_LOG_ERROR, "Don't know how to heal "
- "error %d",
- cbk->op_errno);
- }
+ ret = cluster_fstat(ec->xl_list, locked_on, ec->nodes, fstat_replies,
+ fstat_output, frame, ec->xl, fd, NULL);
+
+ for (i = 0; i < ec->nodes; i++) {
+ output[i] = output[i] && fstat_output[i];
+ replies[i].valid = output[i];
+ if (output[i])
+ replies[i].stat = fstat_replies[i].stat;
}
- else
- {
- if (heal->iatt.ia_type == IA_IFREG)
- {
- heal->fd = fd_create(heal->loc.inode, heal->fop->frame->root->pid);
- if (heal->fd == NULL)
- {
- gf_log(heal->xl->name, GF_LOG_ERROR, "Unable to create a new "
- "file descriptor");
- goto out;
- }
- ctx = ec_fd_get(heal->fd, heal->xl);
- if ((ctx == NULL) || (loc_copy(&ctx->loc, &heal->loc) != 0))
- {
- goto out;
- }
+ if (EC_COUNT(output, ec->nodes) <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
- ctx->flags = O_RDWR;
- }
+ source = ec_heal_data_find_direction(ec, replies, versions, dirty, size,
+ sources, healed_sinks, _gf_true,
+ EC_COMBINE_DICT);
+ ret = source;
+ if (ret < 0)
+ goto out;
- if (heal->iatt.ia_type == IA_IFLNK)
- {
- ec_readlink(heal->fop->frame, heal->xl, cbk->mask, EC_MINIMUM_ONE,
- ec_heal_readlink_cbk, heal, &heal->loc,
- heal->iatt.ia_size, NULL);
- }
- else
- {
- ec_heal_prepare_others(heal);
+ if (stbuf)
+ *stbuf = replies[source].stat;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i]) {
+ if (replies[i].stat.ia_size)
+ trim[i] = 1;
}
}
- error = 0;
+ if (EC_COUNT(sources, ec->nodes) < ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ ret = source;
out:
- ec_fop_set_error(heal->fop, error);
+ if (xattrs)
+ dict_unref(xattrs);
+ cluster_replies_wipe(replies, ec->nodes);
+ cluster_replies_wipe(fstat_replies, ec->nodes);
+ if (ret < 0) {
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ } else {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: sources: %d, sinks: "
+ "%d",
+ uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+ EC_COUNT(healed_sinks, ec->nodes));
+ }
+ return ret;
}
-int32_t ec_heal_open_others(ec_heal_t * heal)
+/*
+ * Set the self-heal marker bit (1ULL << EC_SELFHEAL_BIT) in the data version
+ * xattr of every brick in healed_sinks whose entry in versions[] does not
+ * carry the bit yet.
+ *
+ * Bricks selected for marking on which the fxattrop does not succeed are
+ * removed from healed_sinks; for every brick reported in the fxattrop output
+ * the bit is OR-ed into versions[].
+ *
+ * Returns 0 on success (including when no brick needs marking), -ENOMEM on
+ * allocation failure and -ENOTCONN when no healed sink is left afterwards.
+ */
+int
+__ec_heal_mark_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ uint64_t *versions, unsigned char *healed_sinks)
{
- struct list_head * item;
- ec_cbk_data_t * cbk;
- uintptr_t mask = 0, open = heal->open;
+ int i = 0;
+ int ret = 0;
+ unsigned char *mark = NULL;
+ dict_t *xattrs = NULL;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ uint64_t versions_xattr[2] = {0};
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ xattrs = dict_new();
+ if (!xattrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
- item = heal->lookup->cbk_list.next;
- while (item->next != &heal->lookup->cbk_list)
- {
- item = item->next;
- cbk = list_entry(item, ec_cbk_data_t, list);
+ /* Select the healed sinks that are not marked yet. */
+ mark = alloca0(ec->nodes);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if ((versions[i] >> EC_SELFHEAL_BIT) & 1)
+ continue;
+ mark[i] = 1;
+ }
- if ((cbk->op_ret < 0) || (cbk->iatt[0].ia_type != IA_IFREG) ||
- (uuid_compare(heal->iatt.ia_gfid, cbk->iatt[0].ia_gfid) != 0))
- {
- ec_heal_exclude(heal, cbk->mask);
- }
- else
- {
- mask |= cbk->mask & ~heal->open;
- }
+ if (EC_COUNT(mark, ec->nodes) == 0) {
+ /* Nothing to mark. Leave through 'out' so that xattrs is released
+ * instead of being leaked by an early return. */
+ ret = 0;
+ goto out;
+ }
+
+ versions_xattr[EC_DATA_TXN] = hton64(1ULL << EC_SELFHEAL_BIT);
+ if (dict_set_static_bin(xattrs, EC_XATTR_VERSION, versions_xattr,
+ sizeof(versions_xattr))) {
+ ret = -ENOMEM;
+ goto out;
}
- if (mask != 0)
- {
- ec_open(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
- ec_heal_target_open_cbk, heal, &heal->loc, O_RDWR | O_TRUNC,
- heal->fd, NULL);
+ output = alloca0(ec->nodes);
+ ret = cluster_fxattrop(ec->xl_list, mark, ec->nodes, replies, output, frame,
+ ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i]) {
+ /* Marking failed on this brick: it can no longer be a sink. */
+ if (mark[i])
+ healed_sinks[i] = 0;
+ continue;
+ }
+ versions[i] |= (1ULL << EC_SELFHEAL_BIT);
+ }
- open |= mask;
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
}
+ ret = 0;
- return (open != 0);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ if (xattrs)
+ dict_unref(xattrs);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ return ret;
}
-void ec_heal_setxattr_others(ec_heal_t * heal)
+int32_t /*
+ * Per-state handler of the fop created by ec_heal_block(); fop->data is the
+ * ec_heal_t being processed. Handles `state` and returns the next state:
+ *   EC_STATE_INIT             - sets the lock owner and requests F_WRLCK
+ *                               through ec_heal_inodelk()
+ *   EC_STATE_HEAL_DATA_COPY   - calls ec_heal_data_block()
+ *   EC_STATE_HEAL_DATA_UNLOCK - requests F_UNLCK through ec_heal_inodelk();
+ *                               also entered for the negated COPY/UNLOCK
+ *                               states
+ *   EC_STATE_REPORT           - heal callback with op_ret 0 and the masks
+ *   -EC_STATE_REPORT          - heal callback with op_ret -1 and fop->error
+ * Any other state is logged as unhandled and ends the fop.
+ */
+ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
{
- ec_cbk_data_t * cbk;
- dict_t * xdata;
- int32_t error = ENOMEM;
+ ec_heal_t *heal = fop->data;
+ heal->fop = fop;
- if ((heal->good != 0) && (heal->bad != 0))
- {
- cbk = heal->lookup->answer;
- xdata = cbk->xdata;
+ switch (state) {
+ case EC_STATE_INIT:
+ ec_owner_set(fop->frame, fop->frame->root);
- if ((cbk->iatt[0].ia_type == IA_IFREG) ||
- (cbk->iatt[0].ia_type == IA_IFDIR))
- {
- if (ec_dict_set_number(xdata, EC_XATTR_VERSION, cbk->version) != 0)
- {
- goto out;
+ ec_heal_inodelk(heal, F_WRLCK, 1, 0, 0);
+
+ return EC_STATE_HEAL_DATA_COPY;
+
+ case EC_STATE_HEAL_DATA_COPY:
+ gf_msg_debug(fop->xl->name, 0, "%s: read/write starting",
+ uuid_utoa(heal->fd->inode->gfid));
+ ec_heal_data_block(heal);
+
+ return EC_STATE_HEAL_DATA_UNLOCK;
+
+ case -EC_STATE_HEAL_DATA_COPY:
+ case -EC_STATE_HEAL_DATA_UNLOCK:
+ case EC_STATE_HEAL_DATA_UNLOCK:
+ ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0);
+
+ return EC_STATE_REPORT;
+
+ case EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0,
+ (heal->good | heal->bad), heal->good, heal->bad,
+ 0, NULL);
}
- if (cbk->iatt[0].ia_type == IA_IFREG)
- {
- if (ec_dict_set_number(xdata, EC_XATTR_SIZE,
- cbk->iatt[0].ia_size) != 0)
- {
- goto out;
- }
+
+ return EC_STATE_END;
+ case -EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
+ fop->error, 0, 0, 0, 0, NULL);
}
- }
- ec_setxattr(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
- ec_heal_setxattr_cbk, heal, &heal->loc, xdata, 0, NULL);
+ return EC_STATE_END;
+ default:
+ gf_msg(fop->xl->name, GF_LOG_ERROR, 0, EC_MSG_UNHANDLED_STATE,
+ "Unhandled state %d for %s", state, ec_fop_name(fop->id));
+
+ return EC_STATE_END;
}
+}
+
+/* Takes lock: starts an EC_FOP_HEAL fop whose manager is
+ * ec_manager_heal_block(); the INIT state of that manager requests the
+ * inodelk. heal is stored as the fop data and func as its heal callback.
+ * When no fop could be allocated (fop is still NULL at out:), func is called
+ * directly with op_ret -1 and op_errno ENOMEM. The validation macros
+ * presumably jump to out: on failure and end up on the same path. */
+void
+ec_heal_block(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_heal_cbk_t func, ec_heal_t *heal)
+{
+ ec_cbk_t callback = {.heal = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t error = ENOMEM;
+
+ gf_msg_trace("ec", 0, "EC(HEAL) %p", frame);
+
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags,
+ NULL, ec_manager_heal_block, callback, heal);
+ if (fop == NULL)
+ goto out;
error = 0;
out:
- ec_fop_set_error(heal->fop, error);
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+ func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
+ }
}
-int32_t ec_heal_xattr_clean(dict_t * dict, char * key, data_t * data,
- void * arg)
+int32_t /*
+ * Heal callback passed to ec_heal_block() by ec_sync_heal_block(); cookie is
+ * the ec_heal_t. Clears the heal<->fop links, stores op_errno in heal->error
+ * when op_ret is negative (0 otherwise) and wakes the barrier kept in
+ * heal->data. The mask/good/bad/pending/xdata arguments are not used.
+ * Always returns 0.
+ */
+ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+ uintptr_t good, uintptr_t bad, uint32_t pending,
+ dict_t *xdata)
{
- dict_t * base = arg;
+ ec_heal_t *heal = cookie;
- if (dict_get(base, key) == NULL)
- {
- if (dict_set_static_bin(dict, key, dict, 0) != 0)
- {
- return -1;
- }
- }
- else
- {
- dict_del(dict, key);
+ if (heal->fop) {
+ heal->fop->heal = NULL;
}
+ heal->fop = NULL;
+ heal->error = op_ret < 0 ? op_errno : 0;
+ syncbarrier_wake(heal->data);
+ return 0;
+}
+int /*
+ * Synchronous wrapper around ec_heal_block(): starts the heal on the union of
+ * heal->bad and heal->good and waits on the barrier in heal->data, which
+ * ec_heal_block_done() wakes.
+ * Returns -heal->error when the callback recorded an error, -ENOTCONN when
+ * heal->bad is 0 after the heal, and 0 otherwise.
+ */
+ec_sync_heal_block(call_frame_t *frame, xlator_t *this, ec_heal_t *heal)
+{
+ ec_heal_block(frame, this, heal->bad | heal->good, EC_MINIMUM_ONE,
+ ec_heal_block_done, heal);
+ syncbarrier_wait(heal->data, 1);
+ if (heal->error != 0) {
+ return -heal->error;
+ }
+ if (heal->bad == 0)
+ return -ENOTCONN;
return 0;
}
-void ec_heal_removexattr_others(ec_heal_t * heal)
+int /*
+ * Heals the data of fd window by window, calling ec_sync_heal_block() once
+ * per window from offset 0 until the offset reaches size (after
+ * ec_adjust_size_up()) or heal->done becomes set (it is not set in this
+ * function). sources/healed_sinks are converted to the heal->good/heal->bad
+ * masks; on exit healed_sinks is rewritten from heal->bad.
+ * Returns 0 or a negative errno: -ENOMEM when the barrier cannot be
+ * initialised, -ENOTCONN when ec->shutdown is seen, or the failure returned
+ * by ec_sync_heal_block().
+ */
+ec_rebuild_data(call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
+ unsigned char *sources, unsigned char *healed_sinks)
{
- struct list_head * item;
- ec_cbk_data_t * cbk;
- dict_t * xdata;
+ ec_heal_t *heal = NULL;
+ int ret = 0;
+ syncbarrier_t barrier;
+
+ if (syncbarrier_init(&barrier))
+ return -ENOMEM;
+
+ heal = alloca0(sizeof(*heal));
+ heal->fd = fd_ref(fd);
+ heal->xl = ec->xl;
+ heal->data = &barrier; /* woken by ec_heal_block_done() */
+ ec_adjust_size_up(ec, &size, _gf_false);
+ heal->total_size = size;
+ heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size));
+ /* We need to adjust the size to a multiple of the stripe size of the
+ * volume. Otherwise writes would need to fill gaps (head and/or tail)
+ * with existent data from the bad bricks. This could be garbage on a
+ * damaged file or it could fail if there aren't enough bricks. */
+ heal->size -= heal->size % ec->stripe_size;
+ heal->bad = ec_char_array_to_mask(healed_sinks, ec->nodes);
+ heal->good = ec_char_array_to_mask(sources, ec->nodes);
+ heal->iatt.ia_type = IA_IFREG;
+ LOCK_INIT(&heal->lock);
- if ((heal->good == 0) || (heal->bad == 0))
- {
- return;
+ for (heal->offset = 0; (heal->offset < size) && !heal->done;
+ heal->offset += heal->size) {
+ /* We immediately abort any heal if a shutdown request has been
+ * received to avoid delays. The healing of this file will be
+ * restarted by another SHD or other client that accesses the
+ * file. */
+ if (ec->shutdown) {
+ gf_msg_debug(ec->xl->name, 0,
+ "Cancelling heal because "
+ "EC is stopping.");
+ ret = -ENOTCONN;
+ break;
+ }
+
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: sources: %d, sinks: "
+ "%d, offset: %" PRIu64 " bsize: %" PRIu64,
+ uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+ EC_COUNT(healed_sinks, ec->nodes), heal->offset,
+ heal->size);
+ ret = ec_sync_heal_block(frame, ec->xl, heal);
+ if (ret < 0)
+ break;
}
+ memset(healed_sinks, 0, ec->nodes);
+ ec_mask_to_char_array(heal->bad, healed_sinks, ec->nodes);
+ fd_unref(heal->fd);
+ LOCK_DESTROY(&heal->lock);
+ syncbarrier_destroy(heal->data);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ return ret;
+}
- xdata = heal->lookup->answer->xdata;
- item = heal->lookup->cbk_list.next;
- while (item->next != &heal->lookup->cbk_list)
- {
- item = item->next;
- cbk = list_entry(item, ec_cbk_data_t, list);
+int /*
+ * Truncates the bricks flagged in trim with cluster_ftruncate(), at size
+ * adjusted by ec_adjust_offset_up(). A trim brick whose output[] entry is
+ * unset after the call is removed from healed_sinks.
+ * Returns 0 without doing anything when trim is empty, -ENOTCONN when no
+ * healed sink is left, and otherwise the value cluster_ftruncate() returned.
+ * The leading underscores presumably mean the caller already holds the
+ * inodelk - confirm against the callers.
+ */
+__ec_heal_trim_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *healed_sinks, unsigned char *trim,
+ uint64_t size)
+{
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ int ret = 0;
+ int i = 0;
+ off_t trim_offset = 0;
- if (cbk->op_ret >= 0)
- {
- if (dict_foreach(cbk->xdata, ec_heal_xattr_clean, xdata) == 0)
- {
- ec_removexattr(heal->fop->frame, heal->xl, cbk->mask,
- EC_MINIMUM_ONE, ec_heal_removexattr_cbk, heal,
- &heal->loc, "", cbk->xdata);
- }
- }
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+
+ if (EC_COUNT(trim, ec->nodes) == 0) {
+ ret = 0;
+ goto out;
}
+ trim_offset = size;
+ ec_adjust_offset_up(ec, &trim_offset, _gf_true);
+ ret = cluster_ftruncate(ec->xl_list, trim, ec->nodes, replies, output,
+ frame, ec->xl, fd, trim_offset, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i] && trim[i])
+ healed_sinks[i] = 0;
+ }
+
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "%s: heal failed %s",
+ uuid_utoa(fd->inode->gfid), strerror(-ret));
+ return ret;
}
-void ec_heal_attr(ec_heal_t * heal)
+int /*
+ * Sends one GF_XATTROP_ADD_ARRAY64 fxattrop to brick idx carrying:
+ *   - EC_XATTR_VERSION: versions[source] - versions[idx] in the EC_DATA_TXN
+ *     slot,
+ *   - EC_XATTR_SIZE: size[source] - size[idx],
+ *   - EC_XATTR_DIRTY: -dirty[idx] in the EC_DATA_TXN slot, only when
+ *     erase_dirty is set.
+ * All values are stored through hton64(). The fxattrop is skipped when every
+ * delta is zero. Returns 0 in that case, a negative value when a dict set
+ * fails, otherwise the result of syncop_fxattrop().
+ * NOTE(review): the values handed to dict_set_static_bin() are locals of this
+ * function, so xattr presumably must not be read after it returns - confirm.
+ */
+ec_data_undo_pending(call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
+ uint64_t *versions, uint64_t *dirty, uint64_t *size,
+ int source, gf_boolean_t erase_dirty, int idx)
{
- if ((heal->good != 0) && (heal->bad != 0))
- {
- ec_setattr(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
- ec_heal_setattr_cbk, heal, &heal->loc, &heal->iatt,
- GF_SET_ATTR_MODE | GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME, NULL);
+ uint64_t versions_xattr[2] = {0};
+ uint64_t dirty_xattr[2] = {0};
+ uint64_t allzero[2] = {0};
+ uint64_t size_xattr = 0;
+ int ret = 0;
+
+ versions_xattr[EC_DATA_TXN] = hton64(versions[source] - versions[idx]);
+ ret = dict_set_static_bin(xattr, EC_XATTR_VERSION, versions_xattr,
+ sizeof(versions_xattr));
+ if (ret < 0)
+ goto out;
+
+ size_xattr = hton64(size[source] - size[idx]);
+ ret = dict_set_static_bin(xattr, EC_XATTR_SIZE, &size_xattr,
+ sizeof(size_xattr));
+ if (ret < 0)
+ goto out;
+
+ if (erase_dirty) {
+ dirty_xattr[EC_DATA_TXN] = hton64(-dirty[idx]);
+ ret = dict_set_static_bin(xattr, EC_XATTR_DIRTY, dirty_xattr,
+ sizeof(dirty_xattr));
+ if (ret < 0)
+ goto out;
}
+
+ if ((memcmp(versions_xattr, allzero, sizeof(allzero)) == 0) &&
+ (memcmp(dirty_xattr, allzero, sizeof(allzero)) == 0) &&
+ (size_xattr == 0)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = syncop_fxattrop(ec->xl_list[idx], fd, GF_XATTROP_ADD_ARRAY64, xattr,
+ NULL, NULL, NULL);
+out:
+ return ret;
}
-int32_t ec_heal_needs_data_rebuild(ec_heal_t * heal)
+int /*
+ * Brings the version/size (and optionally dirty) xattrs of the healed bricks
+ * in line with the first brick flagged in sources, by calling
+ * ec_data_undo_pending() for every healed sink. When sources plus
+ * healed_sinks cover all ec->nodes, erase_dirty is set and the same call is
+ * also made for every source.
+ * Returns op_ret: -ENOMEM when the dict cannot be created, -ENOTCONN when no
+ * source exists, 0 otherwise.
+ * NOTE(review): a failure of ec_data_undo_pending() on a sink stops the sink
+ * loop but is kept in ret and never copied to op_ret, so the caller still
+ * gets 0 - confirm this is intended.
+ */
+__ec_fd_data_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *sources,
+ unsigned char *healed_sinks, uint64_t *versions,
+ uint64_t *dirty, uint64_t *size)
{
- ec_fop_data_t * fop = heal->lookup;
- ec_cbk_data_t * cbk = NULL;
- uintptr_t bad = 0;
+ dict_t *xattr = NULL;
+ int i = 0;
+ int ret = 0;
+ int op_ret = 0;
+ int source = -1;
+ gf_boolean_t erase_dirty = _gf_false;
+
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -ENOMEM;
+ goto out;
+ }
- if ((heal->fop->error != 0) || (heal->good == 0) ||
- (heal->iatt.ia_type != IA_IFREG))
- {
- return 0;
+ /* dirty xattr represents if the file needs heal. Unless all the
+ * copies are healed, don't erase it */
+ if (EC_COUNT(sources, ec->nodes) + EC_COUNT(healed_sinks, ec->nodes) ==
+ ec->nodes)
+ erase_dirty = _gf_true;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
}
- list_for_each_entry(cbk, &fop->cbk_list, list)
- {
- if ((cbk->op_ret >= 0) &&
- ((cbk->size != heal->raw_size) || (cbk->version != heal->version)))
- {
- bad |= cbk->mask;
+ if (source == -1) {
+ op_ret = -ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i]) {
+ ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty,
+ size, source, erase_dirty, i);
+ if (ret < 0)
+ goto out;
}
}
- /* This function can only be called concurrently with entrylk, which do
- * not modify heal structure, so it's safe to access heal->bad without
- * acquiring any lock.
- */
- heal->bad = bad;
+ if (!erase_dirty)
+ goto out;
- return (bad != 0);
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ ret = ec_data_undo_pending(frame, ec, fd, xattr, versions, dirty,
+ size, source, erase_dirty, i);
+ if (ret < 0)
+ continue;
+ }
+ }
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return op_ret;
}
-void ec_heal_open(ec_heal_t * heal)
+int /*
+ * Final step of a data heal. Under an inodelk (domain ec->xl->name) taken on
+ * the bricks flagged in sources or healed_sinks:
+ *   1. fails with -ENOTCONN unless more than ec->fragments bricks got locked,
+ *   2. runs __ec_heal_data_prepare() into the postsh_* scratch arrays; only
+ *      source_buf is used afterwards in this function,
+ *   3. applies atime/mtime/ctime from source_buf to healed_sinks with
+ *      cluster_setattr() and keeps only the sinks flagged in output,
+ *   4. calls __ec_fd_data_adjust_versions() with the versions/dirty/size
+ *      arrays supplied by the caller.
+ * The lock is released on every path. Returns a negative errno on failure,
+ * otherwise the result of step 4.
+ */
+ec_restore_time_and_adjust_versions(call_frame_t *frame, ec_t *ec, fd_t *fd,
+ unsigned char *sources,
+ unsigned char *healed_sinks,
+ uint64_t *versions, uint64_t *dirty,
+ uint64_t *size)
{
- if (!ec_heal_needs_data_rebuild(heal))
- {
- return;
+ unsigned char *locked_on = NULL;
+ unsigned char *participants = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *postsh_sources = NULL;
+ unsigned char *postsh_healed_sinks = NULL;
+ unsigned char *postsh_trim = NULL;
+ uint64_t *postsh_versions = NULL;
+ uint64_t *postsh_dirty = NULL;
+ uint64_t *postsh_size = NULL;
+ int ret = 0;
+ int i = 0;
+ struct iatt source_buf = {0};
+ loc_t loc = {0};
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ participants = alloca0(ec->nodes);
+ postsh_sources = alloca0(ec->nodes);
+ postsh_healed_sinks = alloca0(ec->nodes);
+ postsh_trim = alloca0(ec->nodes);
+ postsh_versions = alloca0(ec->nodes * sizeof(*postsh_versions));
+ postsh_dirty = alloca0(ec->nodes * sizeof(*postsh_dirty));
+ postsh_size = alloca0(ec->nodes * sizeof(*postsh_size));
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (healed_sinks[i] || sources[i])
+ participants[i] = 1;
}
- if (ec_heal_open_others(heal))
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, participants, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, fd->inode, 0,
+ 0);
{
- ec_open(heal->fop->frame, heal->xl, heal->good, EC_MINIMUM_MIN,
- ec_heal_source_open_cbk, heal, &heal->loc, O_RDONLY, heal->fd,
- NULL);
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, postsh_versions,
+ postsh_dirty, postsh_size, postsh_sources,
+ postsh_healed_sinks, postsh_trim,
+ &source_buf);
+ if (ret < 0)
+ goto unlock;
+
+ loc.inode = inode_ref(fd->inode);
+ gf_uuid_copy(loc.gfid, fd->inode->gfid);
+ ret = cluster_setattr(
+ ec->xl_list, healed_sinks, ec->nodes, replies, output, frame,
+ ec->xl, &loc, &source_buf,
+ GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME, NULL);
+ EC_INTERSECT(healed_sinks, healed_sinks, output, ec->nodes);
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_fd_data_adjust_versions(frame, ec, fd, sources, healed_sinks,
+ versions, dirty, size);
}
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, fd->inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
}
-void ec_heal_reopen_fd(ec_heal_t * heal)
+int /*
+ * Data heal of one file through fd, in three phases:
+ *   1. under an inodelk (domain ec->xl->name) on heal_on:
+ *      __ec_heal_data_prepare() fills versions/dirty/size, sources,
+ *      healed_sinks and trim. With no healed sinks only
+ *      __ec_fd_data_adjust_versions() is run. Otherwise the sinks are marked
+ *      with __ec_heal_mark_sinks() and trimmed with __ec_heal_trim_sinks().
+ *      The non-negative return of __ec_heal_data_prepare() is used as the
+ *      index of the source brick.
+ *   2. after that lock is released: ec_rebuild_data() copies the data.
+ *   3. ec_restore_time_and_adjust_versions() takes its own lock and finishes.
+ * Fewer than ec->fragments + 1 locked bricks gives -ENOTCONN. Returns a
+ * negative errno on failure, otherwise the result of the last step executed.
+ */
+__ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
+ unsigned char *sources, unsigned char *healed_sinks)
{
- inode_t * inode;
- fd_t * fd;
- ec_fd_t * ctx;
- uintptr_t mask;
- int32_t flags;
-
- inode = heal->loc.inode;
-
- LOCK(&inode->lock);
-
- list_for_each_entry(fd, &inode->fd_list, inode_list)
+ unsigned char *locked_on = NULL;
+ unsigned char *output = NULL;
+ uint64_t *versions = NULL;
+ uint64_t *dirty = NULL;
+ uint64_t *size = NULL;
+ unsigned char *trim = NULL;
+ default_args_cbk_t *replies = NULL;
+ int ret = 0;
+ int source = 0;
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ trim = alloca0(ec->nodes);
+ versions = alloca0(ec->nodes * sizeof(*versions));
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ size = alloca0(ec->nodes * sizeof(*size));
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ ret = cluster_inodelk(ec->xl_list, heal_on, ec->nodes, replies, locked_on,
+ frame, ec->xl, ec->xl->name, fd->inode, 0, 0);
{
- ctx = ec_fd_get(fd, heal->xl);
- if ((ctx != NULL) && (ctx->loc.inode != NULL))
- {
- mask = heal->bad & ~ctx->open;
- if (mask != 0)
- {
- UNLOCK(&inode->lock);
-
- if (heal->iatt.ia_type == IA_IFDIR)
- {
- ec_opendir(heal->fop->frame, heal->xl, mask,
- EC_MINIMUM_ONE, ec_heal_reopen_cbk, NULL,
- &heal->loc, fd, NULL);
- }
- else
- {
- flags = ctx->flags & ~O_TRUNC;
- if ((flags & O_ACCMODE) == O_WRONLY)
- {
- flags &= ~O_ACCMODE;
- flags |= O_RDWR;
- }
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- ec_open(heal->fop->frame, heal->xl, mask, EC_MINIMUM_ONE,
- ec_heal_reopen_cbk, NULL, &heal->loc, flags, fd,
- NULL);
- }
+ ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, versions, dirty,
+ size, sources, healed_sinks, trim, NULL);
+ if (ret < 0)
+ goto unlock;
- LOCK(&inode->lock);
- }
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = __ec_fd_data_adjust_versions(
+ frame, ec, fd, sources, healed_sinks, versions, dirty, size);
+ goto unlock;
}
+
+ source = ret; /* index into size[] used below */
+ ret = __ec_heal_mark_sinks(frame, ec, fd, versions, healed_sinks);
+ if (ret < 0)
+ goto unlock;
+
+ ret = __ec_heal_trim_sinks(frame, ec, fd, healed_sinks, trim,
+ size[source]);
}
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, fd->inode, 0, 0);
+ if (ret < 0)
+ goto out;
- UNLOCK(&inode->lock);
-}
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0)
+ goto out;
-int32_t ec_heal_writev_cbk(call_frame_t * frame, void * cookie,
- xlator_t * this, int32_t op_ret, int32_t op_errno,
- struct iatt * prebuf, struct iatt * postbuf,
- dict_t * xdata)
-{
- ec_trace("WRITE_CBK", cookie, "ret=%d, errno=%d", op_ret, op_errno);
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: sources: %d, sinks: "
+ "%d",
+ uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+ EC_COUNT(healed_sinks, ec->nodes));
- ec_heal_update(cookie, 0);
+ ret = ec_rebuild_data(frame, ec, fd, size[source], sources, healed_sinks);
+ if (ret < 0)
+ goto out;
- return 0;
+ ret = ec_restore_time_and_adjust_versions(
+ frame, ec, fd, sources, healed_sinks, versions, dirty, size);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
}
-int32_t ec_heal_readv_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno,
- struct iovec * vector, int32_t count,
- struct iatt * stbuf, struct iobref * iobref,
- dict_t * xdata)
+/*
+ * Entry point of the data heal of a regular file.
+ *
+ * Opens an anonymous fd (O_RDWR | O_LARGEFILE) on the bricks that are up,
+ * takes the "<xlator-name>:self-heal" inodelk on the bricks where the open
+ * succeeded and, with that lock held, runs __ec_heal_data(). When block is
+ * set the lock is requested with cluster_inodelk(), otherwise with
+ * cluster_tiebreaker_inodelk() so that a heal already running elsewhere is
+ * not waited for.
+ *
+ * sources and healed_sinks are filled by __ec_heal_data().
+ *
+ * Returns -ENOMEM if the fd cannot be created, -ENOTCONN if the open or the
+ * lock succeeded on ec->fragments bricks or fewer, otherwise the result of
+ * __ec_heal_data().
+ */
+int
+ec_heal_data(call_frame_t *frame, ec_t *ec, gf_boolean_t block, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks)
{
- ec_fop_data_t * fop = cookie;
- ec_heal_t * heal = fop->data;
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ fd_t *fd = NULL;
+ loc_t loc = {0};
+ char selfheal_domain[1024] = {0};
+ int ret = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ fd = fd_create(inode, 0);
+ if (!fd) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ec_trace("READ_CBK", fop, "ret=%d, errno=%d", op_ret, op_errno);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
- ec_heal_avoid(fop);
+ ret = cluster_open(ec->xl_list, up_subvols, ec->nodes, replies, output,
+ frame, ec->xl, &loc, O_RDWR | O_LARGEFILE, fd, NULL);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
- if (op_ret > 0)
- {
- ec_writev(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
- ec_heal_writev_cbk, heal, heal->fd, vector, count,
- heal->offset, 0, iobref, NULL);
+ fd_bind(fd);
+ /* Bounded formatting: the domain buffer has a fixed size, so never let
+ * the xlator name write past it. */
+ snprintf(selfheal_domain, sizeof(selfheal_domain), "%s:self-heal",
+ ec->xl->name);
+ /*If other processes are already doing the heal, don't block*/
+ if (block) {
+ ret = cluster_inodelk(ec->xl_list, output, ec->nodes, replies,
+ locked_on, frame, ec->xl, selfheal_domain, inode,
+ 0, 0);
+ } else {
+ ret = cluster_tiebreaker_inodelk(ec->xl_list, output, ec->nodes,
+ replies, locked_on, frame, ec->xl,
+ selfheal_domain, inode, 0, 0);
}
- else
{
- heal->done = 1;
+ if (ret <= ec->fragments) {
+ gf_msg_debug(ec->xl->name, 0,
+ "%s: Skipping heal "
+ "as only %d number of subvolumes could "
+ "be locked",
+ uuid_utoa(inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+ ret = __ec_heal_data(frame, ec, fd, locked_on, sources, healed_sinks);
}
-
- return 0;
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, selfheal_domain, inode, 0, 0);
+out:
+ if (fd)
+ fd_unref(fd);
+ loc_wipe(&loc);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
}
-void ec_heal_data(ec_heal_t * heal)
+int /*
+ * Sends a GF_XATTROP_ADD_ARRAY64 xattrop with an all-zero EC_XATTR_DIRTY
+ * value for inode to every one of the ec->nodes bricks (on[] is set for all
+ * of them), using ec_wind_xattrop_parallel. The caller in this file does it
+ * to get a stale index entry removed. The replies are not inspected here.
+ * Returns -ENOMEM when an allocation or the dict set fails, otherwise the
+ * value left in ret by dict_set_static_bin().
+ */
+ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode)
{
- ec_trace("DATA", heal->fop, "good=%lX, bad=%lX", heal->good, heal->bad);
-
- if ((heal->good != 0) && (heal->bad != 0) &&
- (heal->iatt.ia_type == IA_IFREG))
- {
- ec_readv(heal->fop->frame, heal->xl, heal->good, EC_MINIMUM_MIN,
- ec_heal_readv_cbk, heal, heal->fd, heal->size, heal->offset,
- 0, NULL);
+ int i = 0;
+ int ret = 0;
+ dict_t **xattr = NULL;
+ loc_t loc = {0};
+ uint64_t dirty_xattr[EC_VERSION_SIZE] = {0};
+ unsigned char *on = NULL;
+ default_args_cbk_t *replies = NULL;
+ dict_t *dict = NULL;
+
+ /* Allocate the required memory */
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ on = alloca0(ec->nodes);
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
}
+ for (i = 0; i < ec->nodes; i++) {
+ xattr[i] = dict; /* every brick shares the same dict */
+ on[i] = 1;
+ }
+ ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr,
+ (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+ ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64,
+ xattr, NULL);
+out:
+ if (dict) {
+ dict_unref(dict);
+ }
+ if (xattr) {
+ GF_FREE(xattr);
+ }
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ return ret;
}
-void ec_heal_dispatch(ec_heal_t * heal)
+/*
+ * Runs a complete heal of loc and reports the result through the heal
+ * callback of the fop passed in data.
+ *
+ * Steps: optional name heal (when loc->name is set), an inspection pass that
+ * only the self-heal daemon performs (it may purge a stale index entry or
+ * stop when no heal is needed), then data heal for regular files or entry
+ * heal for directories (skipped for directories when partial is set), and
+ * finally metadata heal. The callback receives the mask of bricks that were
+ * up and the intersection of the data/entry and metadata good and bad masks.
+ *
+ * The heal waits for locks only when the fop has a request frame and this
+ * process is not the self-heal daemon.
+ *
+ * If the internal frame cannot be created the callback is invoked with
+ * op_ret -1 and ENOMEM, so that a heal which never ran is not reported as
+ * successful.
+ */
+void
+ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
{
- ec_fop_data_t * fop = heal->fop;
- ec_cbk_data_t * cbk;
- inode_t * inode;
- ec_inode_t * ctx;
- int32_t error;
-
- inode = heal->loc.inode;
-
- LOCK(&inode->lock);
+ call_frame_t *frame = NULL;
+ unsigned char *participants = NULL;
+ unsigned char *msources = NULL;
+ unsigned char *mhealed_sinks = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ ec_t *ec = NULL;
+ int ret = 0;
+ int op_ret = 0;
+ int op_errno = 0;
+ intptr_t mgood = 0;
+ intptr_t mbad = 0;
+ intptr_t good = 0;
+ intptr_t bad = 0;
+ uint32_t pending = 0;
+ ec_fop_data_t *fop = data;
+ gf_boolean_t blocking = _gf_false;
+ ec_heal_need_t need_heal = EC_HEAL_NONEED;
+ unsigned char *up_subvols = NULL;
+ char up_bricks[32];
+
+ ec = this->private;
+
+ /* If it is heal request from getxattr, complete the heal and then
+ * unwind, if it is ec_heal with NULL as frame then no need to block
+ * the heal as the caller doesn't care about its completion. In case
+ * of heald whichever gets tiebreaking inodelk will take care of the
+ * heal, so no need to block*/
+ if (fop->req_frame && !ec->shd.iamshd)
+ blocking = _gf_true;
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame) {
+ /* Nothing was healed: report the failure instead of success. */
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
- ctx = __ec_inode_get(inode, heal->xl);
- if (ctx != NULL)
- {
- ctx->bad &= ~heal->good;
- ctx->heal = NULL;
+ ec_owner_set(frame, frame->root);
+ /*Do heal as root*/
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+ /*Mark the fops as internal*/
+ frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+ participants = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, participants, ec->nodes);
+
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+ if (loc->name && strlen(loc->name)) {
+ ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
+ participants);
+ if (ret >= 0) {
+ gf_msg_debug(this->name, 0,
+ "%s: name heal "
+ "successful on %" PRIXPTR,
+ loc->path,
+ ec_char_array_to_mask(participants, ec->nodes));
+ } else {
+ gf_msg_debug(
+ this->name, 0,
+ "%s: name heal "
+ "failed. ret = %d, subvolumes up = %s",
+ loc->path, ret,
+ ec_bin(up_bricks, sizeof(up_bricks), ec->xl_up, ec->nodes));
+ }
}
- fop->data = NULL;
-
- UNLOCK(&inode->lock);
-
- error = fop->error;
-
- cbk = ec_cbk_data_allocate(fop->frame, heal->xl, fop, fop->id, 0,
- error == 0 ? 0 : -1, error);
- if (cbk != NULL)
- {
- cbk->uintptr[0] = heal->available;
- cbk->uintptr[1] = heal->good;
- cbk->uintptr[2] = heal->bad;
+ /* Mount triggers heal only when it detects that it must need heal, shd
+ * triggers heals periodically which need not be thorough*/
+ if (ec->shd.iamshd && (ret <= 0)) {
+ ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
+ &need_heal);
+
+ if (need_heal == EC_HEAL_PURGE_INDEX) {
+ gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+ "Index entry needs to be purged for: %s ",
+ uuid_utoa(loc->gfid));
+ /* We need to send zero-xattrop so that stale index entry could be
+ * removed. We need not take lock on this entry to do so as
+ * xattrop on a brick is atomic. */
+ ec_heal_purge_stale_index(frame, ec, loc->inode);
+ goto out;
+ } else if (need_heal == EC_HEAL_NONEED) {
+ gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
+ "Heal is not required for : %s ", uuid_utoa(loc->gfid));
+ goto out;
+ }
+ }
- ec_combine(cbk, NULL);
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ if (IA_ISREG(loc->inode->ia_type)) {
+ ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
+ healed_sinks);
+ } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
+ ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+ &pending);
+ } else {
+ ret = 0;
+ memcpy(sources, participants, ec->nodes);
+ memcpy(healed_sinks, participants, ec->nodes);
+ }
- fop->answer = cbk;
+ if (ret == 0) {
+ good = ec_char_array_to_mask(sources, ec->nodes);
+ bad = ec_char_array_to_mask(healed_sinks, ec->nodes);
+ } else {
+ op_ret = -1;
+ op_errno = -ret;
}
- else if (error == 0)
- {
- error = ENOMEM;
+ msources = alloca0(ec->nodes);
+ mhealed_sinks = alloca0(ec->nodes);
+ ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
+ if (ret == 0) {
+ mgood = ec_char_array_to_mask(msources, ec->nodes);
+ mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
+ } else {
+ op_ret = -1;
+ op_errno = -ret;
}
- if (heal->lookup != NULL)
- {
- ec_fop_data_release(heal->lookup);
- }
- if (heal->fd != NULL)
- {
- fd_unref(heal->fd);
+out:
+ ec_reset_entry_healing(fop);
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno,
+ ec_char_array_to_mask(participants, ec->nodes),
+ mgood & good, mbad & bad, pending, NULL);
}
- GF_FREE(heal->symlink);
- loc_wipe(&heal->loc);
+ if (frame)
+ STACK_DESTROY(frame->root);
- LOCK_DESTROY(&heal->lock);
+ return;
+}
- GF_FREE(heal);
+int /*
+ * Synctask body used by ec_launch_heal(): opaque is the heal fop. Calls
+ * ec_heal_do() with the fop's xlator, its first loc and fop->int32 as the
+ * partial flag. Always returns 0.
+ */
+ec_synctask_heal_wrap(void *opaque)
+{
+ ec_fop_data_t *fop = opaque;
+ ec_heal_do(fop->xl, fop, &fop->loc[0], fop->int32);
+ return 0;
+}
- ec_fop_set_error(fop, error);
+int /*
+ * Synctask completion callback used by ec_launch_heal(): releases the fop
+ * passed in opaque when it is not NULL. ret and heal are not used.
+ * Always returns 0.
+ */
+ec_heal_done(int ret, call_frame_t *heal, void *opaque)
+{
+ if (opaque)
+ ec_fop_data_release(opaque);
+ return 0;
}
-void ec_wind_heal(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
+ec_fop_data_t * /*
+ * Moves the first fop of ec->heal_waiting to ec->healing, updating the
+ * heal_waiters and healers counters, and returns it. Returns NULL when the
+ * waiting list is empty or when background_heals is positive and healers has
+ * already reached it. The callers in this file hold ec->lock around the
+ * call.
+ */
+__ec_dequeue_heals(ec_t *ec)
{
- ec_cbk_data_t * cbk;
- ec_heal_t * heal = fop->data;
+ ec_fop_data_t *fop = NULL;
+
+ if (list_empty(&ec->heal_waiting))
+ goto none;
+
+ if ((ec->background_heals > 0) && (ec->healers >= ec->background_heals))
+ goto none;
+
+ fop = list_entry(ec->heal_waiting.next, ec_fop_data_t, healer);
+ ec->heal_waiters--;
+ list_del_init(&fop->healer);
+ list_add(&fop->healer, &ec->healing);
+ ec->healers++;
+ return fop;
+none:
+ gf_msg_debug(ec->xl->name, 0, "Num healers: %d, Num Waiters: %d",
+ ec->healers, ec->heal_waiters);
+ return NULL;
+}
- ec_trace("WIND", fop, "idx=%d", idx);
+void /*
+ * Fails a heal fop that will not be run: invokes its heal callback (when one
+ * is set) with op_ret -1, op_errno fop->error and zeroed masks, then releases
+ * the fop. The error must already be stored in fop->error by the caller.
+ */
+ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
+{
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
+ 0, 0, NULL);
+ }
+ ec_fop_data_release(fop);
+}
- cbk = ec_cbk_data_allocate(fop->req_frame, fop->xl, fop, EC_FOP_HEAL, idx,
- fop->error == 0 ? 0 : -1, fop->error);
- if (cbk != NULL)
- {
- cbk->uintptr[0] = heal->available;
- cbk->uintptr[1] = heal->good;
- cbk->uintptr[2] = heal->bad;
+void /*
+ * Starts the heal of fop in a synctask: ec_synctask_heal_wrap() is the task
+ * body and ec_heal_done() its completion callback. A frame owned by root and
+ * tagged with GF_CLIENT_PID_SELF_HEALD is created for the task. When the
+ * frame or the task cannot be created the fop is failed with ENOMEM through
+ * ec_heal_fail().
+ * NOTE(review): the frame is destroyed as soon as synctask_new() returns;
+ * presumably the task does not depend on it afterwards - confirm.
+ */
+ec_launch_heal(ec_t *ec, ec_fop_data_t *fop)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
- ec_combine(cbk, NULL);
+ frame = create_frame(ec->xl, ec->xl->ctx->pool);
+ if (!frame) {
+ ret = -1;
+ goto out;
}
- ec_complete(fop);
+ ec_owner_set(frame, frame->root);
+ /*Do heal as root*/
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+ /*Mark the fops as internal*/
+ frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+
+ ret = synctask_new(ec->xl->ctx->env, ec_synctask_heal_wrap, ec_heal_done,
+ frame, fop);
+out:
+ if (ret < 0) {
+ ec_fop_set_error(fop, ENOMEM);
+ ec_heal_fail(ec, fop);
+ }
+
+ if (frame)
+ STACK_DESTROY(frame->root);
}
-int32_t ec_manager_heal(ec_fop_data_t * fop, int32_t state)
+/*
+ * Called when the heal fop has finished: removes it from the healers list
+ * and hands its slot to the next waiting heal, if any.
+ *
+ * Nothing is done when fop is not linked in a healer list. While ec->shutdown
+ * is set, every heal that gets dequeued is failed with ENOTCONN instead of
+ * being launched; the loop keeps draining the queue that way. Otherwise the
+ * dequeued heal is started with ec_launch_heal() after ec->lock is released.
+ */
+void
+ec_handle_healers_done(ec_fop_data_t *fop)
{
- ec_cbk_data_t * cbk;
- ec_heal_t * heal = fop->data;
+ ec_t *ec = fop->xl->private;
+ ec_fop_data_t *heal_fop = NULL;
- switch (state)
- {
- case EC_STATE_INIT:
- ec_owner_set(fop->frame, fop->frame->root);
+ if (list_empty(&fop->healer))
+ return;
- fop->error = ec_heal_init(fop);
- if (fop->error != 0)
- {
- return EC_STATE_REPORT;
- }
+ LOCK(&ec->lock);
- /* Fall through */
+ list_del_init(&fop->healer);
- case EC_STATE_DISPATCH:
- ec_heal_entrylk(fop->data, ENTRYLK_LOCK);
+ do {
+ ec->healers--;
+ heal_fop = __ec_dequeue_heals(ec);
- return EC_STATE_HEAL_ENTRY_LOOKUP;
+ if ((heal_fop != NULL) && ec->shutdown) {
+ /* This will prevent ec_handle_healers_done() to be
+ * called recursively. That would be problematic if
+ * the queue is too big. */
+ list_del_init(&heal_fop->healer);
- case EC_STATE_HEAL_ENTRY_LOOKUP:
- ec_lookup(fop->frame, heal->xl, fop->mask, EC_MINIMUM_MIN,
- ec_heal_entry_lookup_cbk, heal, &heal->loc, NULL);
+ UNLOCK(&ec->lock);
- return EC_STATE_HEAL_ENTRY_PREPARE;
+ /* The error must be stored on the heal being failed, since
+ * ec_heal_fail() reports heal_fop->error to its callback. */
+ ec_fop_set_error(heal_fop, ENOTCONN);
+ ec_heal_fail(ec, heal_fop);
- case EC_STATE_HEAL_ENTRY_PREPARE:
- ec_heal_prepare(heal);
+ LOCK(&ec->lock);
+ }
+ } while ((heal_fop != NULL) && ec->shutdown);
- return EC_STATE_HEAL_PRE_INODELK_LOCK;
+ UNLOCK(&ec->lock);
- case EC_STATE_HEAL_PRE_INODELK_LOCK:
- // Only heal data/metadata if enough information is supplied.
- if (uuid_is_null(heal->loc.gfid))
- {
- ec_heal_entrylk(heal, ENTRYLK_UNLOCK);
+ if (heal_fop)
+ ec_launch_heal(ec, heal_fop);
+}
- return EC_STATE_HEAL_DISPATCH;
- }
+gf_boolean_t /*
+ * Reads heal_count from the ec inode context of the fop's first loc, under
+ * the inode lock, and returns it; a non-zero count therefore reads as true.
+ * Returns 0 when the inode has no context.
+ */
+ec_is_entry_healing(ec_fop_data_t *fop)
+{
+ ec_inode_t *ctx = NULL;
+ int32_t heal_count = 0;
+ loc_t *loc = NULL;
- ec_heal_inodelk(heal, F_WRLCK, 0, 0, 0);
+ loc = &fop->loc[0];
- return EC_STATE_HEAL_PRE_INODE_LOOKUP;
+ LOCK(&loc->inode->lock);
+ {
+ ctx = __ec_inode_get(loc->inode, fop->xl);
+ if (ctx) {
+ heal_count = ctx->heal_count;
+ }
+ }
+ UNLOCK(&loc->inode->lock);
+ GF_ASSERT(heal_count >= 0);
+ return heal_count;
+}
- case EC_STATE_HEAL_PRE_INODE_LOOKUP:
- ec_heal_lookup(heal);
+void /*
+ * Admission control for a heal fop.
+ * A fop with a request frame is marked as healing and launched right away.
+ * A fop without one (background heal) is handled under ec->lock:
+ *   - if background_heals is positive and heal_wait_qlen + background_heals
+ *     exceeds heal_waiters + healers, the fop is appended to the waiting list
+ *     unless its inode is already being healed (then it is only released at
+ *     the end), and whatever __ec_dequeue_heals() returns is launched;
+ *   - otherwise the fop is failed with EBUSY.
+ */
+ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
+{
+ gf_boolean_t can_heal = _gf_true;
+ ec_t *ec = this->private;
+ ec_fop_data_t *fop_rel = NULL;
- return EC_STATE_HEAL_XATTRIBUTES_REMOVE;
+ if (fop->req_frame == NULL) {
+ LOCK(&ec->lock);
+ {
+ if ((ec->background_heals > 0) &&
+ (ec->heal_wait_qlen + ec->background_heals) >
+ (ec->heal_waiters + ec->healers)) {
+ if (!ec_is_entry_healing(fop)) {
+ list_add_tail(&fop->healer, &ec->heal_waiting);
+ ec->heal_waiters++;
+ ec_set_entry_healing(fop);
+ } else {
+ fop_rel = fop;
+ }
+ fop = __ec_dequeue_heals(ec); /* may be NULL or a different fop */
+ } else {
+ can_heal = _gf_false;
+ }
+ }
+ UNLOCK(&ec->lock);
+ }
- case EC_STATE_HEAL_XATTRIBUTES_REMOVE:
- ec_heal_removexattr_others(heal);
+ if (can_heal) {
+ if (fop) {
+ if (fop->req_frame != NULL) {
+ ec_set_entry_healing(fop);
+ }
+ ec_launch_heal(ec, fop);
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Max number of heals are "
+ "pending, background self-heal rejected");
+ ec_fop_set_error(fop, EBUSY);
+ ec_heal_fail(ec, fop);
+ }
+ if (fop_rel) {
+ ec_heal_done(0, NULL, fop_rel);
+ }
+}
- return EC_STATE_HEAL_XATTRIBUTES_SET;
+void /*
+ * Public entry point of the heal fop. Validates the arguments, allocates an
+ * EC_FOP_HEAL fop (without a manager function), stores partial in
+ * fop->int32, copies loc, takes a reference on xdata and passes the fop to
+ * ec_heal_throttle().
+ * On failure the fop (if any) is released and func, when not NULL, is called
+ * with op_ret -1 and err: EINVAL for a rejected argument (missing loc/inode,
+ * null gfid, or a frame that already has frame->local set), ENOMEM once the
+ * allocation has been attempted.
+ */
+ec_heal(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ uint32_t fop_flags, fop_heal_cbk_t func, void *data, loc_t *loc,
+ int32_t partial, dict_t *xdata)
+{
+ ec_cbk_t callback = {.heal = func};
+ ec_fop_data_t *fop = NULL;
+ int32_t err = EINVAL;
- case EC_STATE_HEAL_XATTRIBUTES_SET:
- ec_heal_setxattr_others(heal);
+ gf_msg_trace("ec", 0, "EC(HEAL) %p", frame);
- return EC_STATE_HEAL_ATTRIBUTES;
+ VALIDATE_OR_GOTO(this, fail);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, fail);
- case EC_STATE_HEAL_ATTRIBUTES:
- ec_heal_attr(heal);
+ if (!loc || !loc->inode || gf_uuid_is_null(loc->inode->gfid))
+ goto fail;
- return EC_STATE_HEAL_OPEN;
+ if (frame && frame->local)
+ goto fail;
+ fop = ec_fop_data_allocate(frame, this, EC_FOP_HEAL, 0, target, fop_flags,
+ NULL, NULL, callback, data);
- case EC_STATE_HEAL_OPEN:
- ec_heal_open(heal);
+ err = ENOMEM;
- return EC_STATE_HEAL_REOPEN_FD;
+ if (fop == NULL)
+ goto fail;
- case EC_STATE_HEAL_REOPEN_FD:
- ec_heal_reopen_fd(heal);
+ fop->int32 = partial;
- return EC_STATE_HEAL_UNLOCK;
+ if (loc) { /* always true here: a NULL loc was rejected above */
+ if (loc_copy(&fop->loc[0], loc) != 0)
+ goto fail;
+ }
- case -EC_STATE_HEAL_XATTRIBUTES_REMOVE:
- case -EC_STATE_HEAL_XATTRIBUTES_SET:
- case -EC_STATE_HEAL_ATTRIBUTES:
- case -EC_STATE_HEAL_OPEN:
- case -EC_STATE_HEAL_REOPEN_FD:
- case -EC_STATE_HEAL_UNLOCK:
- case EC_STATE_HEAL_UNLOCK:
- ec_heal_inodelk(heal, F_UNLCK, 0, 0, 0);
+ if (xdata)
+ fop->xdata = dict_ref(xdata);
- /* Fall through */
+ ec_heal_throttle(this, fop);
- case -EC_STATE_HEAL_ENTRY_PREPARE:
- case -EC_STATE_HEAL_PRE_INODELK_LOCK:
- case -EC_STATE_HEAL_PRE_INODE_LOOKUP:
- ec_heal_entrylk(heal, ENTRYLK_UNLOCK);
+ return;
- if (ec_heal_needs_data_rebuild(heal))
- {
- return EC_STATE_HEAL_DATA_LOCK;
- }
+fail:
+ if (fop)
+ ec_fop_data_release(fop);
+ if (func)
+ func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
+}
- return EC_STATE_HEAL_DISPATCH;
+int /*
+ * Completion callback whose opaque is the ec_t. Decrements
+ * ec->async_fop_count; when it reaches 0, checks __ec_is_last_fop() under
+ * ec->lock and, if that is true, calls ec_pending_fops_completed().
+ * ret is only logged. Always returns 0.
+ */
+ec_replace_heal_done(int ret, call_frame_t *heal, void *opaque)
+{
+ ec_t *ec = opaque;
+ gf_boolean_t last_fop = _gf_false;
- case EC_STATE_HEAL_DATA_LOCK:
- if (heal->done)
- {
- return EC_STATE_HEAL_POST_INODELK_LOCK;
- }
+ if (GF_ATOMIC_DEC(ec->async_fop_count) == 0) {
+ LOCK(&ec->lock);
+ {
+ last_fop = __ec_is_last_fop(ec);
+ }
+ UNLOCK(&ec->lock);
+ }
+ gf_msg_debug(ec->xl->name, 0, "getxattr on bricks is done ret %d", ret);
- ec_heal_inodelk(heal, F_WRLCK, 1, heal->offset, heal->size);
+ if (last_fop)
+ ec_pending_fops_completed(ec);
- return EC_STATE_HEAL_DATA_COPY;
+ return 0;
+}
- case EC_STATE_HEAL_DATA_COPY:
- ec_heal_data(heal);
+/*
+ * Issue a heal request for @inode and then wake the index healers.
+ *
+ * A temporary loc_t is built from a new reference to @inode and its gfid,
+ * and a synchronous getxattr of EC_XATTR_HEAL is sent through ec->xl.
+ * A negative result is only logged at debug level; the index healers are
+ * woken in every case.
+ *
+ * Returns the value returned by syncop_getxattr().
+ */
+int32_t
+ec_replace_heal(ec_t *ec, inode_t *inode)
+{
+ loc_t loc = {0};
+ int ret = 0;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ ret = syncop_getxattr(ec->xl, &loc, NULL, EC_XATTR_HEAL, NULL, NULL);
+ if (ret < 0)
+ gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
+ ret);
+
+ /* Once the root inode has been checked, it might have triggered a
+ * self-heal on it after a replace brick command or for some other
+ * reason. It can also happen that the volume already had damaged
+ * files in the index, even if the heal on the root directory failed.
+ * In both cases we need to wake all index healers to continue
+ * healing remaining entries that are marked as dirty. */
+ ec_shd_index_healer_wake(ec);
- return EC_STATE_HEAL_DATA_UNLOCK;
+ /* Releases the inode reference taken above. */
+ loc_wipe(&loc);
+ return ret;
+}
- case -EC_STATE_HEAL_DATA_COPY:
- case -EC_STATE_HEAL_DATA_UNLOCK:
- case EC_STATE_HEAL_DATA_UNLOCK:
- ec_heal_inodelk(heal, F_UNLCK, 1, heal->offset, heal->size);
+/*
+ * Synctask body used by ec_launch_replace_heal().
+ *
+ * @opaque: the ec_t instance to heal.
+ *
+ * Runs ec_replace_heal() on the root inode of the xlator's inode table.
+ * Nothing is done, and -1 is returned, when the xlator has no inode table
+ * or when xlator_is_cleanup_starting() reports true for it.
+ *
+ * Otherwise returns the result of ec_replace_heal().
+ */
+int32_t
+ec_replace_brick_heal_wrap(void *opaque)
+{
+ ec_t *ec = opaque;
+ inode_table_t *itable = NULL;
+ int32_t ret = -1;
- heal->offset += heal->size;
+ if (ec->xl->itable)
+ itable = ec->xl->itable;
+ else
+ goto out;
- return EC_STATE_HEAL_DATA_LOCK;
+ if (xlator_is_cleanup_starting(ec->xl))
+ goto out;
- case EC_STATE_HEAL_POST_INODELK_LOCK:
- ec_heal_inodelk(heal, F_WRLCK, 1, 0, 0);
+ ret = ec_replace_heal(ec, itable->root);
+out:
+ return ret;
+}
- return EC_STATE_HEAL_POST_INODE_LOOKUP;
+/*
+ * Launch the replace-brick heal as a synctask.
+ *
+ * The task runs ec_replace_brick_heal_wrap() with @ec as its argument and
+ * completes through ec_replace_heal_done().
+ *
+ * Returns the value returned by synctask_new(); a negative value means the
+ * task was not created.
+ */
+int32_t
+ec_launch_replace_heal(ec_t *ec)
+{
+ int ret = -1;
- case EC_STATE_HEAL_POST_INODE_LOOKUP:
- ec_heal_lookup(heal);
+ ret = synctask_new(ec->xl->ctx->env, ec_replace_brick_heal_wrap,
+ ec_replace_heal_done, NULL, ec);
- return EC_STATE_HEAL_SETATTR;
+ if (ret < 0) {
+ gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
+ ret);
+ /* The task never started, so its completion callback will not be
+ * invoked by the synctask; call it here so that the accounting
+ * done in ec_replace_heal_done() still happens. */
+ ec_replace_heal_done(-1, NULL, ec);
+ }
- case EC_STATE_HEAL_SETATTR:
- ec_setattr(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
- ec_heal_setattr_cbk, heal, &heal->loc, &heal->iatt,
- GF_SET_ATTR_MODE | GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME, NULL);
+ return ret;
+}
- return EC_STATE_HEAL_POST_INODELK_UNLOCK;
+/*
+ * Build the response dictionary for a heal-info query.
+ *
+ * @dict_rsp: output; receives a new dictionary holding the "heal-info" key.
+ * @status:   string stored as the value of "heal-info".
+ *
+ * Returns 0 on success. If the dictionary cannot be allocated, -ENOMEM is
+ * returned and *dict_rsp is left untouched. If setting the key fails, the
+ * error from dict_set_str() is returned, a warning is logged and *dict_rsp
+ * is set to NULL.
+ */
+int32_t
+ec_set_heal_info(dict_t **dict_rsp, char *status)
+{
+ dict_t *dict = NULL;
+ int ret = 0;
- case -EC_STATE_HEAL_SETATTR:
- case -EC_STATE_HEAL_POST_INODELK_UNLOCK:
- case EC_STATE_HEAL_POST_INODELK_UNLOCK:
- ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0);
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = dict_set_str(dict, "heal-info", status);
+ if (ret) {
+ gf_msg(THIS->name, GF_LOG_WARNING, -ret, EC_MSG_HEAL_FAIL,
+ "Failed to set heal-info key to "
+ "%s",
+ status);
+ dict_unref(dict);
+ dict = NULL;
+ }
+ /* dict is NULL here when dict_set_str() failed. */
+ *dict_rsp = dict;
+out:
+ return ret;
+}
- return EC_STATE_HEAL_DISPATCH;
-
- case -EC_STATE_HEAL_POST_INODELK_LOCK:
- case -EC_STATE_HEAL_POST_INODE_LOOKUP:
- case -EC_STATE_HEAL_ENTRY_LOOKUP:
- case -EC_STATE_HEAL_DATA_LOCK:
- case -EC_STATE_HEAL_DISPATCH:
- case EC_STATE_HEAL_DISPATCH:
- ec_heal_dispatch(heal);
-
- return EC_STATE_PREPARE_ANSWER;
-
- case EC_STATE_PREPARE_ANSWER:
- cbk = fop->answer;
- if (cbk != NULL)
- {
- if (!ec_dict_combine(cbk, EC_COMBINE_XDATA))
- {
- if (cbk->op_ret >= 0)
- {
- cbk->op_ret = -1;
- cbk->op_errno = EIO;
- }
- }
- if (cbk->op_ret < 0)
- {
- ec_fop_set_error(fop, cbk->op_errno);
+/*
+ * Decide whether an inode needs heal from per-brick dirty/version data.
+ *
+ * @ec:          translator instance; ec->nodes is the number of bricks.
+ * @dirty:       per-brick dirty counters.
+ * @sources:     per-brick flags; a non-zero entry marks a source brick.
+ * @self_locked: true when the caller itself holds the inode lock.
+ * @lock_count:  number of inode locks reported by the bricks.
+ * @need_heal:   output; receives the decision.
+ * @versions:    per-brick version numbers.
+ *
+ * Decision taken:
+ *  - not every brick is a source: EC_HEAL_MUST.
+ *  - every brick is a source and (self_locked or lock_count == 0):
+ *    EC_HEAL_MUST if any brick is dirty or its version differs from
+ *    brick 0, otherwise EC_HEAL_PURGE_INDEX.
+ *  - every brick is a source and other locks are held: EC_HEAL_MUST if any
+ *    dirty counter is greater than 1; otherwise EC_HEAL_MAYBE if any dirty
+ *    counter or version differs from brick 0; otherwise EC_HEAL_NONEED.
+ *
+ * Returns the number of source bricks.
+ */
+static int32_t
+_need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
+ gf_boolean_t self_locked, int32_t lock_count,
+ ec_heal_need_t *need_heal, uint64_t *versions)
+{
+ int i = 0;
+ int source_count = 0;
+
+ source_count = EC_COUNT(sources, ec->nodes);
+ if (source_count == ec->nodes) {
+ *need_heal = EC_HEAL_NONEED;
+ if (self_locked || lock_count == 0) {
+ for (i = 0; i < ec->nodes; i++) {
+ if (dirty[i] || (versions[i] != versions[0])) {
+ *need_heal = EC_HEAL_MUST;
+ goto out;
}
}
- else
- {
- ec_fop_set_error(fop, EIO);
- }
-
- return EC_STATE_REPORT;
-
- case EC_STATE_REPORT:
- cbk = fop->answer;
-
- GF_ASSERT(cbk != NULL);
-
- if (fop->id == EC_FOP_HEAL)
- {
- if (fop->cbks.heal != NULL)
- {
- fop->cbks.heal(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, cbk->uintptr[0],
- cbk->uintptr[1], cbk->uintptr[2],
- cbk->xdata);
+ /* If lock count is 0, all dirty flags are 0 and all the
+ * versions are matching then why are we here. It looks
+ * like something went wrong while removing the index entries
+ * after completing a successful heal or fop. In this case
+ * we need to remove this index entry to avoid triggering heal
+ * in a loop and causing lookups again and again*/
+ *need_heal = EC_HEAL_PURGE_INDEX;
+ } else {
+ for (i = 0; i < ec->nodes; i++) {
+ /* Since each lock can only increment the dirty
+ * count once, if dirty is > 1 it means that
+ * another operation has left the dirty count
+ * set and this indicates a problem in the
+ * inode.*/
+ if (dirty[i] > 1) {
+ *need_heal = EC_HEAL_MUST;
+ goto out;
}
- }
- else
- {
- if (fop->cbks.fheal != NULL)
- {
- fop->cbks.fheal(fop->req_frame, fop, fop->xl, cbk->op_ret,
- cbk->op_errno, cbk->uintptr[0],
- cbk->uintptr[1], cbk->uintptr[2],
- cbk->xdata);
+ /* No early exit here: a later brick may still have dirty > 1
+ * and upgrade the decision to EC_HEAL_MUST. */
+ if (dirty[i] != dirty[0] || (versions[i] != versions[0])) {
+ *need_heal = EC_HEAL_MAYBE;
}
}
+ }
+ } else {
+ *need_heal = EC_HEAL_MUST;
+ }
- return EC_STATE_END;
-
- case -EC_STATE_DISPATCH:
- case -EC_STATE_PREPARE_ANSWER:
- case -EC_STATE_REPORT:
- GF_ASSERT(fop->error != 0);
-
- if (fop->id == EC_FOP_HEAL)
- {
- if (fop->cbks.heal != NULL)
- {
- fop->cbks.heal(fop->req_frame, fop, fop->xl, -1,
- fop->error, 0, 0, 0, NULL);
- }
- }
- else
- {
- if (fop->cbks.fheal != NULL)
- {
- fop->cbks.fheal(fop->req_frame, fop, fop->xl, -1,
- fop->error, 0, 0, 0, NULL);
- }
- }
+out:
+ return source_count;
+}
- return EC_STATE_END;
+/*
+ * Check whether the metadata of an inode needs heal.
+ *
+ * Per-brick scratch arrays are taken from the stack with alloca0() and
+ * filled by ec_heal_metadata_find_direction() from @replies. The result is
+ * then classified by _need_heal_calculate(), which stores the decision in
+ * @need_heal.
+ *
+ * @inode and @thorough are not used by this function.
+ *
+ * Returns a negative error from ec_heal_metadata_find_direction() (any
+ * error other than -EIO ends the check early and leaves @need_heal
+ * untouched), otherwise the number of source bricks as returned by
+ * _need_heal_calculate().
+ */
+static int32_t
+ec_need_metadata_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *meta_versions = NULL;
+ int ret = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ meta_versions = alloca0(ec->nodes * sizeof(*meta_versions));
+ ret = ec_heal_metadata_find_direction(ec, replies, meta_versions, dirty,
+ sources, healed_sinks);
+ /* -EIO is not treated as fatal: the arrays are still classified below. */
+ if (ret < 0 && ret != -EIO) {
+ goto out;
+ }
- default:
- gf_log(fop->xl->name, GF_LOG_ERROR, "Unhandled state %d for %s",
- state, ec_fop_name(fop->id));
+ ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+ need_heal, meta_versions);
+out:
+ return ret;
+}
- return EC_STATE_END;
+/*
+ * Check whether the data of a regular file needs heal.
+ *
+ * Per-brick scratch arrays are taken from the stack with alloca0() and
+ * filled by ec_heal_data_find_direction() from @replies. The on-disk size
+ * check of that function is enabled only when @self_locked or @thorough is
+ * true. The result is then classified by _need_heal_calculate(), which
+ * stores the decision in @need_heal.
+ *
+ * @inode is not used by this function.
+ *
+ * Returns a negative error from ec_heal_data_find_direction() (any error
+ * other than -EIO ends the check early and leaves @need_heal untouched),
+ * otherwise the number of source bricks as returned by
+ * _need_heal_calculate().
+ */
+static int32_t
+ec_need_data_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *data_versions = NULL;
+ uint64_t *size = NULL;
+ int ret = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ data_versions = alloca0(ec->nodes * sizeof(*data_versions));
+ size = alloca0(ec->nodes * sizeof(*size));
+
+ /* When dd is going on and heal info is called there is a very good
+ * chance for on disk sizes to mismatch even though nothing is wrong
+ * we don't need ondisk size check there. But if the file is either
+ * self-locked or the caller wants a thorough check then make sure to
+ * perform on disk check also. */
+ ret = ec_heal_data_find_direction(
+ ec, replies, data_versions, dirty, size, sources, healed_sinks,
+ self_locked || thorough, EC_COMBINE_XDATA);
+ /* -EIO is not treated as fatal: the arrays are still classified below. */
+ if (ret < 0 && ret != -EIO) {
+ goto out;
}
+
+ ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+ need_heal, data_versions);
+out:
+ return ret;
}
-void ec_heal(call_frame_t * frame, xlator_t * this, uintptr_t target,
- int32_t minimum, fop_heal_cbk_t func, void * data, loc_t * loc,
- dict_t * xdata)
+/*
+ * Check whether the entries of a directory need heal.
+ *
+ * Per-brick scratch arrays are taken from the stack with alloca0() and
+ * filled by ec_heal_entry_find_direction() from @replies. The result is
+ * then classified by _need_heal_calculate(), which stores the decision in
+ * @need_heal.
+ *
+ * @inode and @thorough are not used by this function.
+ *
+ * Returns a negative error from ec_heal_entry_find_direction() (any error
+ * other than -EIO ends the check early and leaves @need_heal untouched),
+ * otherwise the number of source bricks as returned by
+ * _need_heal_calculate().
+ */
+static int32_t
+ec_need_entry_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
{
- ec_cbk_t callback = { .heal = func };
- ec_fop_data_t * fop = NULL;
- int32_t error = EIO;
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *data_versions = NULL;
+ int ret = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0(ec->nodes * sizeof(*dirty));
+ data_versions = alloca0(ec->nodes * sizeof(*data_versions));
+
+ ret = ec_heal_entry_find_direction(ec, replies, data_versions, dirty,
+ sources, healed_sinks);
+ /* -EIO is not treated as fatal: the arrays are still classified below. */
+ if (ret < 0 && ret != -EIO) {
+ goto out;
+ }
- gf_log("ec", GF_LOG_TRACE, "EC(HEAL) %p", frame);
+ ret = _need_heal_calculate(ec, dirty, sources, self_locked, lock_count,
+ need_heal, data_versions);
+out:
+ return ret;
+}
- VALIDATE_OR_GOTO(this, out);
- GF_VALIDATE_OR_GOTO(this->name, frame, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+/*
+ * Decide whether an inode needs heal, based on lookup replies.
+ *
+ * The metadata check always runs first. If it fails, or if it already
+ * concludes EC_HEAL_MUST, its result is final. Otherwise a second check
+ * selected by inode->ia_type runs and overwrites @need_heal: the data check
+ * for regular files (IA_IFREG) or the entry check for directories
+ * (IA_IFDIR). For any other type the metadata result is kept.
+ *
+ * Returns the value returned by the last check that ran.
+ */
+static int32_t
+ec_need_heal(ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ int ret = 0;
- fop = ec_fop_data_allocate(NULL, this, EC_FOP_HEAL,
- EC_FLAG_UPDATE_LOC_INODE, target, minimum,
- ec_wind_heal, ec_manager_heal, callback, data);
- if (fop == NULL)
- {
+ ret = ec_need_metadata_heal(ec, inode, replies, lock_count, self_locked,
+ thorough, need_heal);
+ if (ret < 0)
+ goto out;
+
+ /* Nothing can downgrade EC_HEAL_MUST, so skip the second check. */
+ if (*need_heal == EC_HEAL_MUST)
goto out;
+
+ if (inode->ia_type == IA_IFREG) {
+ ret = ec_need_data_heal(ec, inode, replies, lock_count, self_locked,
+ thorough, need_heal);
+ } else if (inode->ia_type == IA_IFDIR) {
+ ret = ec_need_entry_heal(ec, inode, replies, lock_count, self_locked,
+ thorough, need_heal);
}
- if (loc != NULL)
- {
- if (loc_copy(&fop->loc[0], loc) != 0)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to copy a location.");
+out:
+ return ret;
+}
- goto out;
- }
+/*
+ * Look up an inode on a set of bricks and decide whether it needs heal.
+ *
+ * @frame:       frame used to issue the lookup.
+ * @ec:          translator instance.
+ * @inode:       inode to inspect.
+ * @locked_on:   per-brick flags passed to cluster_lookup() to select the
+ *               bricks to query.
+ * @self_locked: true when the caller already holds the inode lock.
+ * @thorough:    forwarded to ec_need_heal().
+ * @need_heal:   output; receives the decision.
+ *
+ * The lookup carries EC_XATTR_VERSION, EC_XATTR_DIRTY and EC_XATTR_SIZE
+ * keys with zeroed values (presumably so that the bricks return these
+ * xattrs in their replies - confirm against the brick side). When the
+ * caller does not hold the lock, GLUSTERFS_INODELK_DOM_COUNT is also set
+ * to the xlator name and the replies are scanned for a positive
+ * GLUSTERFS_INODELK_COUNT.
+ *
+ * Returns -ENOMEM if the request dictionary cannot be built. If the lookup
+ * does not succeed on all ec->nodes bricks, @need_heal is set to
+ * EC_HEAL_MUST and ec->nodes is returned. Otherwise the result of
+ * ec_need_heal() is returned.
+ */
+int32_t
+ec_heal_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, gf_boolean_t self_locked,
+ gf_boolean_t thorough, ec_heal_need_t *need_heal)
+{
+ loc_t loc = {0};
+ int i = 0;
+ int ret = 0;
+ dict_t *xdata = NULL;
+ uint64_t zero_array[2] = {0};
+ uint64_t zero_value = 0;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ int32_t lock_count = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ output = alloca0(ec->nodes);
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ xdata = dict_new();
+ if (!xdata ||
+ dict_set_static_bin(xdata, EC_XATTR_VERSION, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xdata, EC_XATTR_DIRTY, zero_array,
+ sizeof(zero_array)) ||
+ dict_set_static_bin(xdata, EC_XATTR_SIZE, &zero_value,
+ sizeof(zero_value))) {
+ ret = -ENOMEM;
+ goto out;
}
- if (xdata != NULL)
- {
- fop->xdata = dict_ref(xdata);
- if (fop->xdata == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
+ if (!self_locked) {
+ ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, ec->xl->name);
+ if (ret) {
+ ret = -ENOMEM;
goto out;
}
}
- error = 0;
+ ret = cluster_lookup(ec->xl_list, locked_on, ec->nodes, replies, output,
+ frame, ec->xl, &loc, xdata);
-out:
- if (fop != NULL)
- {
- ec_manager(fop, error);
+ /* A lookup that did not succeed on every brick is reported as
+ * EC_HEAL_MUST, with ret forced to ec->nodes. */
+ if (ret != ec->nodes) {
+ ret = ec->nodes;
+ *need_heal = EC_HEAL_MUST;
+ goto out;
}
- else
- {
- func(frame, NULL, this, -1, EIO, 0, 0, 0, NULL);
+
+ /* When the caller holds the lock, lock_count stays 0 and the scan of
+ * the replies is skipped. */
+ if (self_locked)
+ goto need_heal;
+
+ /* Stop at the first reply that reports a positive lock count. */
+ for (i = 0; i < ec->nodes; i++) {
+ if (!output[i] || !replies[i].xdata) {
+ continue;
+ }
+ if ((dict_get_int32(replies[i].xdata, GLUSTERFS_INODELK_COUNT,
+ &lock_count) == 0) &&
+ lock_count > 0) {
+ break;
+ }
+ }
+need_heal:
+ ret = ec_need_heal(ec, inode, replies, lock_count, self_locked, thorough,
+ need_heal);
+out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+ if (xdata) {
+ dict_unref(xdata);
}
+ return ret;
}
-/* FOP: fheal */
+/*
+ * Inspect an inode for heal while holding an inode lock on it.
+ *
+ * An inodelk is requested on every brick marked up in ec->xl_up, in the
+ * domain named after the xlator, with 0/0 as the range arguments. If the
+ * lock is not obtained on all ec->nodes bricks, @need_heal is set to
+ * EC_HEAL_MUST and the value returned by cluster_inodelk() is returned.
+ * Otherwise ec_heal_inspect() runs with self_locked and thorough both
+ * true and its result is returned.
+ *
+ * The bricks recorded in locked_on are unlocked on both paths.
+ */
+int32_t
+ec_heal_locked_inspect(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ ec_heal_need_t *need_heal)
+{
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+ unsigned char *output = NULL;
+ default_args_cbk_t *replies = NULL;
+ int ret = 0;
+
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ locked_on = alloca0(ec->nodes);
+ output = alloca0(ec->nodes);
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
+
+ ret = cluster_inodelk(ec->xl_list, up_subvols, ec->nodes, replies,
+ locked_on, frame, ec->xl, ec->xl->name, inode, 0, 0);
+ if (ret != ec->nodes) {
+ *need_heal = EC_HEAL_MUST;
+ goto unlock;
+ }
+ ret = ec_heal_inspect(frame, ec, inode, locked_on, _gf_true, _gf_true,
+ need_heal);
+unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+ ec->xl, ec->xl->name, inode, 0, 0);
+ cluster_replies_wipe(replies, ec->nodes);
+ return ret;
+}
-void ec_wind_fheal(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
+/*
+ * Report whether the entry described by @entry_loc needs heal.
+ *
+ * @this:      the ec xlator.
+ * @entry_loc: location to inspect; if it carries no inode, one is looked
+ *             up from its gfid with syncop_inode_find().
+ * @dict_rsp:  output; receives a dictionary whose "heal-info" key is
+ *             "heal" or "no-heal" (built by ec_set_heal_info()).
+ *
+ * Steps:
+ *  1. If not every brick is up, the answer is "heal" with no inspection.
+ *  2. Otherwise an unlocked ec_heal_inspect() runs. Its verdict is used
+ *     when it returns ec->nodes with anything other than EC_HEAL_MAYBE.
+ *  3. In any other case ec_heal_locked_inspect() repeats the inspection
+ *     under an inode lock.
+ *
+ * Only EC_HEAL_MUST is reported as "heal"; every other verdict is reported
+ * as "no-heal".
+ *
+ * Returns the result of ec_set_heal_info() when a verdict was reached.
+ * Returns -ENOMEM when a validation check fails, the frame cannot be
+ * created or the loc cannot be copied. Returns the negative error from
+ * syncop_inode_find() or ec_heal_locked_inspect() when they fail.
+ */
+int32_t
+ec_get_heal_info(xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp)
{
- ec_cbk_data_t * cbk;
- ec_heal_t * heal = fop->data;
+ int ret = -ENOMEM;
+ ec_heal_need_t need_heal = EC_HEAL_NONEED;
+ call_frame_t *frame = NULL;
+ ec_t *ec = NULL;
+ unsigned char *up_subvols = NULL;
+ loc_t loc = {
+ 0,
+ };
- ec_trace("WIND", fop, "idx=%d", idx);
+ VALIDATE_OR_GOTO(this, out);
+ GF_VALIDATE_OR_GOTO(this->name, entry_loc, out);
- cbk = ec_cbk_data_allocate(fop->req_frame, fop->xl, fop, EC_FOP_FHEAL, idx,
- fop->error == 0 ? 0 : -1, fop->error);
- if (cbk != NULL)
- {
- cbk->uintptr[0] = heal->available;
- cbk->uintptr[1] = heal->good;
- cbk->uintptr[2] = heal->bad;
+ ec = this->private;
+ up_subvols = alloca0(ec->nodes);
+ ec_mask_to_char_array(ec->xl_up, up_subvols, ec->nodes);
- ec_combine(cbk, NULL);
+ /* With a brick down the entry is reported as needing heal outright. */
+ if (EC_COUNT(up_subvols, ec->nodes) != ec->nodes) {
+ need_heal = EC_HEAL_MUST;
+ goto set_heal;
+ }
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame) {
+ goto out;
+ }
+ ec_owner_set(frame, frame->root);
+ frame->root->uid = 0;
+ frame->root->gid = 0;
+ frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+
+ if (loc_copy(&loc, entry_loc) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, EC_MSG_LOC_COPY_FAIL,
+ "Failed to copy a location.");
+ goto out;
+ }
+ if (!loc.inode) {
+ ret = syncop_inode_find(this, this, loc.gfid, &loc.inode, NULL, NULL);
+ if (ret < 0)
+ goto out;
}
- ec_complete(fop);
-}
-
-void ec_fheal(call_frame_t * frame, xlator_t * this, uintptr_t target,
- int32_t minimum, fop_fheal_cbk_t func, void * data, fd_t * fd,
- dict_t * xdata)
-{
- ec_fd_t * ctx = ec_fd_get(fd, this);
-
- if ((ctx != NULL) && (ctx->loc.inode != NULL))
- {
- gf_log("ec", GF_LOG_DEBUG, "FHEAL ctx: flags=%X, open=%lX, bad=%lX",
- ctx->flags, ctx->open, ctx->bad);
- ec_heal(frame, this, target, minimum, func, data, &ctx->loc, xdata);
+ /* First pass without taking the lock. */
+ ret = ec_heal_inspect(frame, ec, loc.inode, up_subvols, _gf_false,
+ _gf_false, &need_heal);
+ if (ret == ec->nodes && need_heal != EC_HEAL_MAYBE) {
+ goto set_heal;
+ }
+ /* Inconclusive or failed: start over and inspect under the lock. */
+ need_heal = EC_HEAL_NONEED;
+ ret = ec_heal_locked_inspect(frame, ec, loc.inode, &need_heal);
+ if (ret < 0)
+ goto out;
+set_heal:
+ if (need_heal == EC_HEAL_MUST) {
+ ret = ec_set_heal_info(dict_rsp, "heal");
+ } else {
+ ret = ec_set_heal_info(dict_rsp, "no-heal");
}
+out:
+ if (frame) {
+ STACK_DESTROY(frame->root);
+ }
+ loc_wipe(&loc);
+ return ret;
}