From fae1e70ff3309d2b64febaafc70abcaa2771ecf0 Mon Sep 17 00:00:00 2001
From: Pranith Kumar K <pkarampu@redhat.com>
Date: Thu, 16 Apr 2015 09:25:31 +0530
Subject: cluster/ec: metadata/name/entry heal implementation for ec

Metadata self-heal:
1) Take inode lock in domain 'this->name' on 0-0 range (full file)
2) perform lookup and get the xattrs on all the bricks
3) Choose the brick with highest version as source
4) Setattr uid/gid/permissions
5) removexattr stale xattrs
6) Setxattr existing/new xattrs
7) xattrop with -ve values of 'dirty' and difference of highest and its own
   version values for version xattr
8) unlock lock acquired in 1)

Entry self-heal:
1) take directory lock in domain 'this->name:self-heal' on 'NULL' to prevent
   more than one self-heal
2) we take directory lock in domain 'this->name' on 'NULL'
3) Perform lookup on version, dirty and remember the values
4) unlock lock acquired in 2)
5) readdir on all the bricks and trigger name heals
6) xattrop with -ve values of 'dirty' and difference of highest and its own
   version values for version xattr
7) unlock lock acquired in 1)

Name heal:
1) Take 'name' lock in 'this->name' on 'NULL'
2) Perform lookup on 'name' and get stat and xattr structures
3) Build gfid_db where for each gfid we know what subvolumes/bricks have
   a file with 'name'
4) Delete all the stale files i.e. the file does not exist on more than
   ec->redundancy number of bricks
5) On all the subvolumes/bricks with missing entry create 'name' with same
   type,gfid,permissions etc.
6) Unlock lock acquired in 1)
Known limitation: At the moment with present design, it conservatively
preserves the 'name' in case it can not decide whether to delete it.  this can
happen in the following scenario:
1) we have 3=2+1 (bricks: A, B, C) ec volume and 1 brick is down (Lets say A)
2) rename d1/f1 -> d2/f2 is performed but the rename is successful only on one
   of the bricks (Lets say B)
3) Now name self-heal on d1 and d2 would re-create the file on both d1 and d2
   resulting in d1/f1 and d2/f2.

Because we wanted to prevent data loss in the case above, the following
scenario is not healable, i.e. it needs manual intervention:
1) we have 3=2+1 (bricks: A, B, C) ec volume and 1 brick is down (Lets say A)
2) We have two hard links: d1/a, d2/b and another file d3/c even before the
   brick went down
3) rename d3/c -> d2/b is performed
4) Now name self-heal on d2/b doesn't heal because d2/b with older gfid will
   not be deleted.  One could think why not delete the link if there is
   more than 1 hardlink, but that leads to similar data loss issue I described
   earlier:
Scenario:
1) we have 3=2+1 (bricks: A, B, C) ec volume and 1 brick is down (Lets say A)
2) We have two hard links: d1/a, d2/b
3) rename d1/a -> d3/c, d2/b -> d4/d is performed and both the operations are
   successful only on one of the bricks (Lets say B)
4) Now name self-heal on the 'names' above which can happen in parallel can
   decide to delete the file thinking it has 2 links but after all the
   self-heals do unlinks we are left with data loss.

Change-Id: I3a68218a47bb726bd684604efea63cf11cfd11be
BUG: 1216303
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/10298
Reviewed-on: http://review.gluster.org/10691
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Tested-by: NetBSD Build System
---
 xlators/cluster/ec/src/ec-common.h |    5 +
 xlators/cluster/ec/src/ec-heal.c   | 1053 ++++++++++++++++++++++++++++++++++++
 2 files changed, 1058 insertions(+)

diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index aaae16e71c3..ba009040b71 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -15,6 +15,11 @@
 
 #include "ec-data.h"
 
+typedef enum {
+        EC_DATA_TXN,
+        EC_METADATA_TXN
+} ec_txn_t;
+
 #define EC_CONFIG_VERSION 0
 
 #define EC_CONFIG_ALGORITHM 0
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 6148df904a4..1e19cf57e1b 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -20,6 +20,35 @@
 
 #include "ec-mem-types.h"
 #include "ec-data.h"
+#include "byte-order.h"
+#include "syncop.h"
+#include "syncop-utils.h"
+#include "cluster-syncop.h"
+
+#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
+#define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
+#define EC_INTERSECT(dst, src1, src2, max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i]; })
+#define EC_ADJUST_SOURCE(source, sources, max) ({int __i; if (sources[source] == 0) {source = -1; for (__i = 0; __i < max; __i++) if (sources[__i]) source = __i; } })
+#define IA_EQUAL(f, s, field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
+#define EC_REPLIES_ALLOC(replies, numsubvols) do {              \
+        int __i = 0;                                            \
+        replies = alloca0(numsubvols * sizeof (*replies));      \
+        for (__i = 0; __i < numsubvols; __i++)                  \
+                INIT_LIST_HEAD (&replies[__i].entries.list);    \
+        } while (0)
+
+
+struct ec_name_data {
+        call_frame_t *frame;
+        unsigned char *participants;
+        unsigned char *failed_on;
+        unsigned char *gfidless;
+        unsigned char *enoent;
+        unsigned char *same;
+        char *name;
+        inode_t *parent;
+        default_args_cbk_t *replies;
+};
 
 static char *ec_ignore_xattrs[] = {
         GF_SELINUX_XATTR_KEY,
@@ -35,6 +64,9 @@ ec_ignorable_key_match (dict_t *dict, char *key, data_t *val, void *mdata)
         if (!key)
                 goto out;
 
+        if (strncmp (key, EC_XATTR_PREFIX, strlen (EC_XATTR_PREFIX)) == 0)
+                        return _gf_true;
+
         for (i = 0; ec_ignore_xattrs[i]; i++) {
                 if (!strcmp (key, ec_ignore_xattrs[i]))
                        return _gf_true;
@@ -1590,3 +1622,1024 @@ void ec_fheal(call_frame_t * frame, xlator_t * this, uintptr_t target,
                 xdata);
     }
 }
+
+/* Common heal code */
+void
+ec_mask_to_char_array (uintptr_t mask, unsigned char *array, int numsubvols)
+{
+        int     i = 0;
+
+        for (i = 0; i < numsubvols; i++)
+                array[i] = ((mask >> i) & 1);
+}
+
+int
+ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
+                        uint64_t *versions, uint64_t *dirty,
+                        unsigned char *sources, unsigned char *healed_sinks)
+{
+        void        *ptr        = NULL;
+        uint64_t    *value      = NULL;
+        uint64_t    max_version = 0;
+        int         source      = -1;
+        int32_t     len         = 0;
+        int         ret         = 0;
+        int         i           = 0;
+        struct iatt source_ia   = {0};
+        struct iatt child_ia    = {0};
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+
+                if (replies[i].op_ret == -1)
+                        continue;
+
+                if (source == -1)
+                        source = i;
+
+                ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_VERSION,
+                                            &ptr, &len);
+                if (ret == 0) {
+                        value = ptr;
+                        versions[i] = ntoh64(value[type]);
+                        if (max_version < versions[i]) {
+                                max_version = versions[i];
+                                source = i;
+                        }
+                }
+
+                ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_DIRTY,
+                                            &ptr, &len);
+                if (ret == 0) {
+                        value = ptr;
+                        dirty[i] = ntoh64(value[type]);
+                }
+        }
+
+        if (source < 0)
+                goto out;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+
+                if (replies[i].op_ret == -1)
+                        continue;
+
+                if (versions[i] == versions[source])
+                        sources[i] = 1;
+                else
+                        healed_sinks[i] = 1;
+        }
+
+        if (type == EC_METADATA_TXN) {
+                source_ia = replies[source].stat;
+                for (i = 0; i < ec->nodes; i++) {
+                        if (!sources[i])
+                                continue;
+                        child_ia = replies[i].stat;
+                        if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+                            !IA_EQUAL(source_ia, child_ia, type) ||
+                            !IA_EQUAL(source_ia, child_ia, prot) ||
+                            !IA_EQUAL(source_ia, child_ia, uid) ||
+                            !IA_EQUAL(source_ia, child_ia, gid)) {
+                                sources[i] = 0;
+                                healed_sinks[i] = 1;
+                        }
+                }
+        }
+out:
+        return source;
+}
+
+int
+ec_adjust_versions (call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode, int source,
+                    unsigned char *sources,
+                    unsigned char *healed_sinks, uint64_t *versions,
+                    uint64_t *dirty)
+{
+        int                        i                 = 0;
+        int                        ret               = 0;
+        dict_t                     *xattr            = NULL;
+        int                        op_ret            = 0;
+        loc_t                      loc               = {0};
+        gf_boolean_t               erase_dirty       = _gf_false;
+        uint64_t                   versions_xattr[2] = {0};
+        uint64_t                   dirty_xattr[2]    = {0};
+        uint64_t                   allzero[2]        = {0};
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        xattr = dict_new ();
+        if (!xattr)
+                goto out;
+
+        /* dirty xattr represents if the file/dir needs heal. Unless all the
+         * copies are healed, don't erase it */
+        if (EC_COUNT (sources, ec->nodes) +
+            EC_COUNT (healed_sinks, ec->nodes) == ec->nodes)
+                erase_dirty = _gf_true;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!sources[i] && !healed_sinks[i])
+                        continue;
+                versions_xattr[type] = hton64(versions[source] - versions[i]);
+                ret = dict_set_static_bin (xattr, EC_XATTR_VERSION,
+                                           versions_xattr,
+                                           sizeof (versions_xattr));
+                if (ret < 0) {
+                        op_ret = -ENOTCONN;
+                        continue;
+                }
+
+                if (erase_dirty) {
+                        dirty_xattr[type] = hton64(-dirty[i]);
+                        ret = dict_set_static_bin (xattr, EC_XATTR_DIRTY,
+                                                   dirty_xattr,
+                                                   sizeof (dirty_xattr));
+                        if (ret < 0) {
+                                op_ret = -ENOTCONN;
+                                continue;
+                        }
+                }
+
+                if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
+                    (memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0))
+                        continue;
+
+                ret = syncop_xattrop (ec->xl_list[i], &loc,
+                                      GF_XATTROP_ADD_ARRAY64, xattr, NULL,
+                                      NULL);
+                if (ret < 0) {
+                        op_ret = -ret;
+                        continue;
+                }
+        }
+
+out:
+        if (xattr)
+                dict_unref (xattr);
+        loc_wipe (&loc);
+        return op_ret;
+}
+
+int
+__ec_heal_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                   unsigned char *locked_on, default_args_cbk_t *replies,
+                   uint64_t *versions, uint64_t *dirty, unsigned char *sources,
+                   unsigned char *healed_sinks, ec_txn_t type)
+{
+        loc_t         loc     = {0};
+        unsigned char *output = NULL;
+        dict_t        *xdata  = NULL;
+        int           ret     = 0;
+        int           source  = 0;
+
+        xdata = dict_new ();
+        if (!xdata) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        if (dict_set_uint64(xdata, "list-xattr", 0)) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        output = alloca0 (ec->nodes);
+        ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies,
+                              output, frame, ec->xl, &loc, xdata);
+        if (ret <= ec->fragments) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        source = ec_heal_find_direction (ec, type, replies, versions,
+                                         dirty, sources, healed_sinks);
+        if (source < 0) {
+                ret = -EIO;
+                goto out;
+        }
+        ret = source;
+out:
+        if (xdata)
+                dict_unref (xdata);
+
+        loc_wipe (&loc);
+        return ret;
+}
+
+/* Metadata heal */
+int
+__ec_removexattr_sinks (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                        int source, unsigned char *sources,
+                        unsigned char *healed_sinks,
+                        default_args_cbk_t *replies)
+{
+        int   i   = 0;
+        int   ret = 0;
+        loc_t loc = {0};
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (i == source)
+                        continue;
+                if (!sources[i] && !healed_sinks[i])
+                        continue;
+                ret = dict_foreach (replies[i].xattr, ec_heal_xattr_clean,
+                                    replies[source].xattr);
+                if (ret < 0) {
+                        sources[i] = 0;
+                        healed_sinks[i] = 0;
+                }
+
+                if (replies[i].xattr->count == 0) {
+                        continue;
+                } else if (sources[i]) {
+                        /* This can happen if setxattr/removexattr succeeds on
+                         * the bricks but fails to update the version. This
+                         * will make sure that the xattrs are made equal after
+                         * heal*/
+                        sources[i] = 0;
+                        healed_sinks[i] = 1;
+                }
+
+                ret = syncop_removexattr (ec->xl_list[i], &loc, "",
+                                          replies[i].xattr, NULL);
+                if (ret < 0)
+                        healed_sinks[i] = 0;
+        }
+
+        loc_wipe (&loc);
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0)
+                return -ENOTCONN;
+        return 0;
+}
+
+int
+__ec_heal_metadata (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                    unsigned char *locked_on)
+{
+        loc_t              loc           = {0};
+        int                ret           = 0;
+        int                source        = 0;
+        default_args_cbk_t *replies      = NULL;
+        uint64_t           *versions     = NULL;
+        uint64_t           *dirty        = NULL;
+        unsigned char      *sources      = NULL;
+        unsigned char      *healed_sinks = NULL;
+        unsigned char      *output       = NULL;
+        dict_t             *source_dict  = NULL;
+        struct iatt        source_buf    = {0};
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        output = alloca0 (ec->nodes);
+        versions = alloca0 (ec->nodes * sizeof (*versions));
+        dirty = alloca0 (ec->nodes * sizeof (*dirty));
+        sources = alloca0 (ec->nodes);
+        healed_sinks = alloca0 (ec->nodes);
+        source = __ec_heal_prepare (frame, ec, inode, locked_on, replies,
+                                    versions, dirty, sources, healed_sinks,
+                                    EC_METADATA_TXN);
+        if (source < 0) {
+                ret = -EIO;
+                goto out;
+        }
+
+        source_buf = replies[source].stat;
+        ret = cluster_setattr (ec->xl_list, healed_sinks, ec->nodes, replies,
+                               output, frame, ec->xl, &loc,
+                               &source_buf, GF_SET_ATTR_MODE |
+                               GF_SET_ATTR_UID | GF_SET_ATTR_GID, NULL);
+        /*In case the operation fails on some of the subvols*/
+        memcpy (healed_sinks, output, ec->nodes);
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        ret = cluster_getxattr (ec->xl_list, locked_on, ec->nodes, replies,
+                                output, frame, ec->xl, &loc, NULL, NULL);
+        EC_INTERSECT (sources, sources, output, ec->nodes);
+        EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+        EC_ADJUST_SOURCE (source, sources, ec->nodes);
+        if ((EC_COUNT (healed_sinks, ec->nodes) == 0) || (source < 0)) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        ret = __ec_removexattr_sinks (frame, ec, inode, source, sources,
+                                      healed_sinks, replies);
+        if (ret < 0)
+                goto out;
+
+        source_dict = dict_ref (replies[source].xattr);
+        if (dict_foreach_match (source_dict, ec_ignorable_key_match, NULL,
+                                dict_remove_foreach_fn, NULL) == -1) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = cluster_setxattr (ec->xl_list, healed_sinks, ec->nodes,
+                                replies, output, frame, ec->xl, &loc,
+                                source_dict, 0, NULL);
+
+        EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        ret = ec_adjust_versions (frame, ec, EC_METADATA_TXN, inode, source,
+                                  sources, healed_sinks, versions, dirty);
+out:
+        if (source_dict)
+                dict_unref (source_dict);
+
+        loc_wipe (&loc);
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+int
+ec_heal_metadata (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+{
+        unsigned char      *locked_on  = NULL;
+        unsigned char      *up_subvols = NULL;
+        unsigned char      *output     = NULL;
+        int                ret         = 0;
+        default_args_cbk_t *replies    = NULL;
+        call_frame_t       *frame      = NULL;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        frame = copy_frame (req_frame);
+        if (!frame) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        /*Do heal as root*/
+        frame->root->uid = 0;
+        frame->root->gid = 0;
+        locked_on = alloca0(ec->nodes);
+        output = alloca0(ec->nodes);
+        up_subvols = alloca0(ec->nodes);
+        ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
+        ret = cluster_inodelk (ec->xl_list, up_subvols, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name, inode, 0,
+                               0);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_heal_metadata (frame, ec, inode, locked_on);
+        }
+unlock:
+        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, inode, 0, 0);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        if (frame)
+                STACK_DESTROY (frame->root);
+        return ret;
+}
+
+/*entry heal*/
+int
+__ec_heal_entry_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                         unsigned char *locked_on, uint64_t *versions,
+                         uint64_t *dirty, unsigned char *sources,
+                         unsigned char *healed_sinks)
+{
+        int                source   = 0;
+        default_args_cbk_t *replies = NULL;
+        loc_t              loc      = {0};
+        int                ret      = 0;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        source = __ec_heal_prepare (frame, ec, inode, locked_on, replies,
+                                    versions, dirty, sources, healed_sinks,
+                                    EC_DATA_TXN);
+        if (source < 0) {
+                ret = -EIO;
+                goto out;
+        }
+        ret = source;
+out:
+        loc_wipe (&loc);
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+/*Name heal*/
+int
+ec_delete_stale_name (dict_t *gfid_db, char *key, data_t *d, void *data)
+{
+        struct ec_name_data *name_data   = data;
+        struct iatt         *ia          = NULL;
+        ec_t                *ec          = NULL;
+        loc_t               loc          = {0};
+        unsigned char       *same        = data_to_bin (d);
+        default_args_cbk_t  *replies     = NULL;
+        unsigned char       *output      = NULL;
+        int                 ret          = 0;
+        int                 estale_count = 0;
+        int                 i            = 0;
+        call_frame_t        *frame       = name_data->frame;
+
+        ec = name_data->frame->this->private;
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        if (EC_COUNT (same, ec->nodes) >= ec->fragments) {
+                ret = 0;
+                goto out;
+        }
+
+        loc.inode = inode_new (name_data->parent->table);
+        if (!loc.inode) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        gf_uuid_parse (key, loc.gfid);
+        output = alloca0(ec->nodes);
+        ret = cluster_lookup (ec->xl_list, name_data->participants, ec->nodes,
+                              replies, output, name_data->frame, ec->xl, &loc,
+                              NULL);
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+                if (replies[i].op_ret == -1) {
+                        if (replies[i].op_errno == ESTALE ||
+                            replies[i].op_errno == ENOENT)
+                                estale_count++;
+                        else
+                                name_data->participants[i] = 0;
+                }
+        }
+
+        if (estale_count <= ec->redundancy) {
+                /* We have at least ec->fragments number of fragments, so the
+                 * file is recoverable, so don't delete it*/
+
+                /* Please note that the lookup call above could fail with
+                 * ENOTCONN on all subvoumes and still this branch will be
+                 * true, but in those cases conservatively we decide to not
+                 * delete the file until we are sure*/
+                ret = 0;
+                goto out;
+        }
+
+        /*Noway to recover, delete the name*/
+        loc_wipe (&loc);
+        loc.parent = inode_ref (name_data->parent);
+        gf_uuid_copy (loc.pargfid, loc.parent->gfid);
+        loc.name = name_data->name;
+        for (i = 0; i < ec->nodes; i++) {
+                if (same[i] && replies[i].valid && (replies[i].op_ret == 0)) {
+                        ia = &replies[i].stat;
+                        break;
+                }
+        }
+
+        if (!ia) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        if (IA_ISDIR (ia->ia_type)) {
+                ret = cluster_rmdir (ec->xl_list, same, ec->nodes, replies,
+                                     output, frame, ec->xl, &loc, 1, NULL);
+        } else {
+                ret = cluster_unlink (ec->xl_list, same, ec->nodes, replies,
+                                      output, frame, ec->xl, &loc, 0, NULL);
+        }
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (output[i]) {
+                        same[i] = 0;
+                        name_data->enoent[i] = 1;
+                } else {
+                        /*op failed*/
+                        if (same[i])
+                                name_data->participants[i] = 0;
+                }
+        }
+        ret = 0;
+        /*This will help in making decisions about creating names*/
+        dict_del (gfid_db, key);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        loc_wipe (&loc);
+        return ret;
+}
+
+int
+ec_delete_stale_names (call_frame_t *frame, ec_t *ec, inode_t *parent,
+                       char *name, default_args_cbk_t *replies, dict_t *gfid_db,
+                       unsigned char *enoent, unsigned char *gfidless,
+                       unsigned char *participants)
+{
+        struct ec_name_data name_data = {0};
+
+        name_data.enoent = enoent;
+        name_data.gfidless = gfidless;
+        name_data.participants = participants;
+        name_data.name = name;
+        name_data.parent = parent;
+        name_data.frame = frame;
+        name_data.replies = replies;
+        return dict_foreach (gfid_db, ec_delete_stale_name, &name_data);
+}
+
+int
+_assign_same (dict_t *dict, char *key, data_t *value, void *data)
+{
+        struct ec_name_data *name_data = data;
+
+        name_data->same = data_to_bin (value);
+        return 0;
+}
+
+int
+ec_create_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+                default_args_cbk_t *lookup_replies, dict_t *gfid_db,
+                unsigned char *enoent, unsigned char *participants)
+{
+        int                 ret       = 0;
+        int                 i         = 0;
+        struct ec_name_data name_data = {0};
+        struct iatt         *ia       = NULL;
+        unsigned char       *output   = 0;
+        unsigned char       *output1  = 0;
+        default_args_cbk_t  *replies  = NULL;
+        loc_t               loc       = {0};
+        loc_t               srcloc    = {0};
+        unsigned char       *link     = NULL;
+        unsigned char       *create   = NULL;
+        dict_t              *xdata    = NULL;
+        char                *linkname = NULL;
+
+        /* There should be just one gfid key */
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        if (gfid_db->count != 1) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        ret = dict_foreach (gfid_db, _assign_same, &name_data);
+        if (ret < 0)
+                goto out;
+        /*There should at least be one valid success reply with gfid*/
+        for (i = 0; i < ec->nodes; i++)
+                if (name_data.same[i])
+                        break;
+
+        if (i == ec->nodes) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        ia = &lookup_replies[i].stat;
+        xdata = dict_new ();
+        loc.parent = inode_ref (parent);
+        gf_uuid_copy (loc.pargfid, parent->gfid);
+        loc.inode = inode_new (parent->table);
+        if (loc.inode)
+                srcloc.inode = inode_ref (loc.inode);
+        gf_uuid_copy (srcloc.gfid, ia->ia_gfid);
+        if (!loc.inode || !xdata || dict_set_static_bin (xdata, "gfid-req",
+                                                         ia->ia_gfid,
+                                                        sizeof (ia->ia_gfid))) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        loc.name = name;
+        link = alloca0 (ec->nodes);
+        create = alloca0 (ec->nodes);
+        output = alloca0 (ec->nodes);
+        output1 = alloca0 (ec->nodes);
+        switch (ia->ia_type) {
+        case IA_IFDIR:
+                ret = cluster_mkdir (ec->xl_list, enoent, ec->nodes,
+                                   replies, output, frame, ec->xl, &loc,
+                                   st_mode_from_ia (ia->ia_prot,
+                                                ia->ia_type), 0, xdata);
+                break;
+
+        case IA_IFLNK:
+                /*Check for hard links and create/link*/
+                ret = cluster_lookup (ec->xl_list, enoent, ec->nodes,
+                                      replies, output, frame, ec->xl,
+                                      &srcloc, NULL);
+                for (i = 0; i < ec->nodes; i++) {
+                        if (output[i]) {
+                                link[i] = 1;
+                        } else {
+                                if (replies[i].op_errno == ENOENT ||
+                                    replies[i].op_errno == ESTALE) {
+                                        create[i] = 1;
+                                }
+                        }
+                }
+
+                if (EC_COUNT (link, ec->nodes)) {
+                        cluster_link (ec->xl_list, link, ec->nodes,
+                                      replies, output1, frame, ec->xl,
+                                      &srcloc, &loc, NULL);
+                }
+
+                if (EC_COUNT (create, ec->nodes)) {
+                        cluster_readlink (ec->xl_list, name_data.same,
+                                          ec->nodes, replies, output,
+                                          frame, ec->xl, &srcloc, 4096,
+                                          NULL);
+                        if (EC_COUNT (output, ec->nodes) == 0) {
+                                ret = -ENOTCONN;
+                                goto out;
+                        }
+
+                        for (i = 0; i < ec->nodes; i++) {
+                                if (output[i])
+                                        break;
+                        }
+                        linkname = alloca0 (strlen(replies[i].buf) + 1);
+                        strcpy (linkname, replies[i].buf);
+                        cluster_symlink (ec->xl_list, create, ec->nodes,
+                                         replies, output, frame, ec->xl,
+                                         linkname, &loc, 0, xdata);
+                }
+                for (i = 0; i < ec->nodes; i++)
+                        if (output1[i])
+                                output[i] = 1;
+                break;
+        default:
+                ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY,
+                                      1);
+                if (ret)
+                        goto out;
+                ret = cluster_mknod (ec->xl_list, enoent, ec->nodes,
+                                     replies, output, frame, ec->xl,
+                                     &loc, st_mode_from_ia (ia->ia_prot,
+                                                           ia->ia_type),
+                                     ia->ia_rdev, 0, xdata);
+                break;
+        }
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (enoent[i] && !output[i])
+                        participants[i] = 0;
+        }
+
+        ret = 0;
+out:
+        loc_wipe (&loc);
+        loc_wipe (&srcloc);
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        if (xdata)
+                dict_unref (xdata);
+        return ret;
+}
+
+int
+__ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+                unsigned char *participants)
+{
+        unsigned char      *output   = NULL;
+        unsigned char      *enoent   = NULL;
+        default_args_cbk_t *replies  = NULL;
+        dict_t             *xdata    = NULL;
+        dict_t             *gfid_db  = NULL;
+        int                ret       = 0;
+        loc_t              loc       = {0};
+        int                i         = 0;
+        struct iatt        *ia       = NULL;
+        char               gfid[64]  = {0};
+        unsigned char      *same     = NULL;
+        unsigned char      *gfidless = NULL;
+
+        loc.parent = inode_ref (parent);
+        loc.inode = inode_new (parent->table);
+        gf_uuid_copy (loc.pargfid, parent->gfid);
+        loc.name = name;
+        xdata = dict_new ();
+        gfid_db = dict_new ();
+        if (!xdata || !gfid_db || !loc.inode) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = dict_set_int32 (xdata, GF_GFIDLESS_LOOKUP, 1);
+        if (ret) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        output = alloca0 (ec->nodes);
+        gfidless = alloca0 (ec->nodes);
+        enoent = alloca0 (ec->nodes);
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        ret = cluster_lookup (ec->xl_list, participants, ec->nodes, replies,
+                              output, frame, ec->xl, &loc, NULL);
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+
+                if (replies[i].op_ret == -1) {
+                        /*If ESTALE comes here, that means parent dir is not
+                         * present, nothing to do there, so reset participants
+                         * for that brick*/
+                        if (replies[i].op_errno == ENOENT)
+                                enoent[i] = 1;
+                        else
+                                participants[i] = 0;
+                        continue;
+                }
+                ia = &replies[i].stat;
+                if (gf_uuid_is_null (ia->ia_gfid)) {
+                        if (IA_ISDIR (ia->ia_type) || ia->ia_size == 0)
+                                gfidless[i] = 1;
+                        else
+                                participants[i] = 0;
+                } else {
+                        uuid_utoa_r (ia->ia_gfid, gfid);
+                        ret = dict_get_bin (gfid_db, gfid, (void **)&same);
+                        if (ret < 0) {
+                                same = alloca0(ec->nodes);
+                        }
+                        same[i] = 1;
+                        if (ret < 0) {
+                                ret = dict_set_static_bin (gfid_db, gfid, same,
+                                                           ec->nodes);
+                        }
+                        if (ret < 0)
+                                goto out;
+                }
+        }
+
+        ret = ec_delete_stale_names (frame, ec, parent, name, replies, gfid_db,
+                                     enoent, gfidless, participants);
+
+        if (gfid_db->count == 0) {
+                /* All entries seem to be stale entries and deleted,
+                 * nothing more to do.*/
+                goto out;
+        }
+
+        if (gfid_db->count > 1) {
+                gf_log (ec->xl->name, GF_LOG_INFO, "%s/%s: Not able to heal",
+                        uuid_utoa (parent->gfid), name);
+                memset (participants, 0, ec->nodes);
+                goto out;
+        }
+
+        EC_INTERSECT (enoent, enoent, participants, ec->nodes);
+        if (EC_COUNT (enoent, ec->nodes) == 0) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = ec_create_name (frame, ec, parent, name, replies, gfid_db, enoent,
+                              participants);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        loc_wipe (&loc);
+        if (xdata)
+                dict_unref (xdata);
+        if (gfid_db)
+                dict_unref (gfid_db);
+        return ret;
+}
+
+int
+ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+              unsigned char *participants)
+{
+        int                ret        = 0;
+        default_args_cbk_t *replies   = NULL;
+        unsigned char      *output    = NULL;
+        unsigned char      *locked_on = NULL;
+        loc_t              loc        = {0};
+
+        loc.parent = inode_ref (parent);
+        loc.name = name;
+        loc.inode = inode_new (parent->table);
+        if (!loc.inode) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        output = alloca0 (ec->nodes);
+        locked_on = alloca0 (ec->nodes);
+        ret = cluster_entrylk (ec->xl_list, participants, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name, parent,
+                               NULL);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (parent->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                EC_INTERSECT (participants, participants, locked_on, ec->nodes);
+                ret = __ec_heal_name (frame, ec, parent, name, participants);
+        }
+unlock:
+        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, parent, NULL);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        loc_wipe (&loc);
+        return ret;
+}
+
+int
+ec_name_heal_handler (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                      void *data)
+{
+        struct ec_name_data *name_data = data;
+        xlator_t            *this      = THIS;
+        ec_t                *ec        = this->private;
+        unsigned char       *name_on   = alloca0 (ec->nodes);
+        int                 i          = 0;
+        int                 ret        = 0;
+
+        memcpy (name_on, name_data->participants, ec->nodes);
+        ret = ec_heal_name (name_data->frame, ec, parent->inode,
+                            entry->d_name, name_on);
+
+        if (ret < 0)
+                memset (name_on, 0, ec->nodes);
+
+        for (i = 0; i < ec->nodes; i++)
+                if (name_data->participants[i] && !name_on[i])
+                        name_data->failed_on[i] = 1;
+        return 0;
+}
+
+int
+ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
+               unsigned char *participants)
+{
+        int i = 0;
+        int j = 0;
+        loc_t loc = {0};
+        struct ec_name_data name_data = {0};
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        name_data.frame = frame;
+        name_data.participants = participants;
+        name_data.failed_on = alloca0(ec->nodes);;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!participants[i])
+                        continue;
+                syncop_dir_scan (ec->xl_list[i], &loc,
+                                GF_CLIENT_PID_AFR_SELF_HEALD, &name_data,
+                                ec_name_heal_handler);
+                for (j = 0; j < ec->nodes; j++)
+                        if (name_data.failed_on[j])
+                                participants[j] = 0;
+
+                if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
+                        return -ENOTCONN;
+        }
+        return 0;
+}
+
+int
+__ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                 unsigned char *heal_on)
+{
+        unsigned char      *locked_on    = NULL;
+        unsigned char      *output       = NULL;
+        uint64_t           *versions     = NULL;
+        uint64_t           *dirty        = NULL;
+        unsigned char      *sources      = NULL;
+        unsigned char      *healed_sinks = NULL;
+        unsigned char      *participants = NULL;
+        default_args_cbk_t *replies      = NULL;
+        int                ret           = 0;
+        int                source        = 0;
+        int                i             = 0;
+
+        locked_on = alloca0(ec->nodes);
+        output = alloca0(ec->nodes);
+        versions = alloca0 (ec->nodes * sizeof (*versions));
+        dirty = alloca0 (ec->nodes * sizeof (*dirty));
+        sources = alloca0 (ec->nodes);
+        healed_sinks = alloca0 (ec->nodes);
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        ret = cluster_entrylk (ec->xl_list, heal_on, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name, inode,
+                               NULL);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_heal_entry_prepare (frame, ec, inode, locked_on,
+                                               versions, dirty, sources,
+                                               healed_sinks);
+                source = ret;
+        }
+unlock:
+        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, inode, NULL);
+        if (ret < 0)
+                goto out;
+
+        participants = alloca0 (ec->nodes);
+        for (i = 0; i < ec->nodes; i++) {
+                if (sources[i] || healed_sinks[i])
+                        participants[i] = 1;
+        }
+        ret = ec_heal_names (frame, ec, inode, participants);
+
+        if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
+                goto out;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!participants[i]) {
+                        sources[i] = 0;
+                        healed_sinks[i] = 0;
+                }
+        }
+
+        ec_adjust_versions (frame, ec, EC_DATA_TXN, inode, source,
+                            sources, healed_sinks, versions, dirty);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+int
+ec_heal_entry (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+{
+        unsigned char      *locked_on            = NULL;
+        unsigned char      *up_subvols           = NULL;
+        unsigned char      *output               = NULL;
+        char               selfheal_domain[1024] = {0};
+        int                ret                   = 0;
+        default_args_cbk_t *replies              = NULL;
+        call_frame_t       *frame                = NULL;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        locked_on = alloca0(ec->nodes);
+        output = alloca0(ec->nodes);
+        up_subvols = alloca0(ec->nodes);
+
+        frame = copy_frame (req_frame);
+        if (!frame) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        /*Do heal as root*/
+        frame->root->uid = 0;
+        frame->root->gid = 0;
+        sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
+        ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
+        /*If other processes are already doing the heal, don't block*/
+        ret = cluster_entrylk (ec->xl_list, up_subvols, ec->nodes, replies,
+                               locked_on, frame, ec->xl, selfheal_domain, inode,
+                               NULL);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_heal_entry (frame, ec, inode, locked_on);
+        }
+unlock:
+        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, selfheal_domain, inode, NULL);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        if (frame)
+                STACK_DESTROY (frame->root);
+        return ret;
+}
-- 
cgit