| author | Pranith Kumar K <pkarampu@redhat.com> | 2015-04-16 09:25:31 +0530 |
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-05-08 15:03:59 -0700 |
| commit | fae1e70ff3309d2b64febaafc70abcaa2771ecf0 (patch) | |
| tree | 53498b10f860b0d8d0a9500e5afae535965a7f22 /xlators | |
| parent | e0401209cf58a638a32e7d867ab4c6199aa0e92f (diff) | |
cluster/ec: metadata/name/entry heal implementation for ec
Metadata self-heal:
1) Take an inode lock in domain 'this->name' on the 0-0 range (full file)
2) Perform lookup and get the xattrs on all the bricks
3) Choose the brick with the highest version as the source
4) Setattr uid/gid/permissions
5) Removexattr the stale xattrs
6) Setxattr existing/new xattrs
7) Xattrop with negative values of 'dirty' and, for the version xattr, the
   difference between the highest version and the brick's own version
   (see the sketch after this list)
8) Unlock the lock acquired in 1)
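
A minimal sketch of the arithmetic behind step 7), with names following
ec_adjust_versions() in the patch below. GF_XATTROP_ADD_ARRAY64 adds the
deltas atomically on each brick, so every healed brick ends up at the
source's version and, once all copies are healed, its 'dirty' counter
returns to zero:

        /* delta that brings brick i's version up to the source's version */
        versions_xattr[type] = hton64(versions[source] - versions[i]);
        /* erase 'dirty' only when every copy has been healed */
        if (erase_dirty)
                dirty_xattr[type] = hton64(-dirty[i]);
        /* xattrop adds the (network-byte-order) deltas on brick i */
        syncop_xattrop (ec->xl_list[i], &loc, GF_XATTROP_ADD_ARRAY64,
                        xattr, NULL, NULL);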
Entry self-heal:
1) Take a directory lock in domain 'this->name:self-heal' on 'NULL' to prevent
   more than one self-heal (see the locking sketch after this list)
2) Take a directory lock in domain 'this->name' on 'NULL'
3) Perform lookup, and remember the version and dirty xattr values
4) Unlock the lock acquired in 2)
5) Readdir on all the bricks and trigger name heals
6) Xattrop with negative values of 'dirty' and, for the version xattr, the
   difference between the highest version and the brick's own version
7) Unlock the lock acquired in 1)
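
A minimal sketch of the two-level locking in steps 1) and 2), following
ec_heal_entry()/__ec_heal_entry() in the patch below. The outer lock in the
'<xl-name>:self-heal' domain serializes self-heal daemons; the inner lock in
the regular '<xl-name>' domain briefly blocks modifications while the
version/dirty xattrs are read:

        char selfheal_domain[1024] = {0};

        sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
        /* outer lock: at most one self-heal on this directory */
        ret = cluster_entrylk (ec->xl_list, up_subvols, ec->nodes, replies,
                               locked_on, frame, ec->xl, selfheal_domain,
                               inode, NULL);
        if (ret <= ec->fragments)
                goto unlock;    /* too few bricks locked; skip the heal */
        /* the inner lock (domain ec->xl->name) is taken the same way inside
         * __ec_heal_entry() before reading the version/dirty xattrs */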
Name heal:
1) Take 'name' lock in 'this->name' on 'NULL'
2) Perform lookup on 'name' and get stat and xattr structures
3) Build a gfid_db that records, for each gfid, which subvolumes/bricks have
   a file with 'name' (see the sketch after this list)
4) Delete all stale files, i.e. files missing on more than ec->redundancy
   bricks (present on fewer than ec->fragments bricks), which cannot be
   recovered
5) On all the subvolumes/bricks where the entry is missing, create 'name'
   with the same type, gfid, permissions, etc.
6) Unlock the lock acquired in 1)
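
A minimal sketch of the gfid_db built in step 3), following __ec_heal_name()
in the patch below: the dict is keyed by the gfid in string form, and each
value is a per-brick array marking which subvolumes hold 'name' with that
gfid:

        char          gfid[64] = {0};
        unsigned char *same    = NULL;

        uuid_utoa_r (ia->ia_gfid, gfid);           /* gfid from brick i's reply */
        if (dict_get_bin (gfid_db, gfid, (void **)&same) < 0) {
                /* first time this gfid is seen: add a fresh presence array */
                same = alloca0 (ec->nodes);
                dict_set_static_bin (gfid_db, gfid, same, ec->nodes);
        }
        same[i] = 1;                               /* brick i has this gfid */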
Known limitation: with the present design, the heal conservatively preserves
the 'name' when it cannot decide whether to delete it. This can happen in the
following scenario:
1) We have a 3 = 2 + 1 (bricks: A, B, C) ec volume and one brick is down
   (let's say A)
2) rename d1/f1 -> d2/f2 is performed, but the rename succeeds on only one
   of the bricks (let's say B)
3) Now name self-heal on d1 and d2 would re-create the file in both
   directories, resulting in both d1/f1 and d2/f2.
Because we wanted to prevent data loss in the case above, the following
scenario is not healable, i.e. it needs manual intervention:
1) We have a 3 = 2 + 1 (bricks: A, B, C) ec volume and one brick is down
   (let's say A)
2) We have two hard links, d1/a and d2/b, and another file d3/c, all created
   before the brick went down
3) rename d3/c -> d2/b is performed
4) Now name self-heal on d2/b cannot heal, because d2/b with the older gfid
   will not be deleted. One could ask why not delete the link when the file
   has more than one hard link, but that leads to a similar data-loss issue:
Scenario:
1) We have a 3 = 2 + 1 (bricks: A, B, C) ec volume and one brick is down
   (let's say A)
2) We have two hard links: d1/a and d2/b
3) rename d1/a -> d3/c and d2/b -> d4/d are performed, and both operations
   succeed on only one of the bricks (let's say B)
4) Now the name self-heals on the names above, which can run in parallel, can
   each decide to delete the file thinking it has two links, but after all
   the self-heals perform their unlinks we are left with data loss.
Change-Id: I3a68218a47bb726bd684604efea63cf11cfd11be
BUG: 1216303
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/10298
Reviewed-on: http://review.gluster.org/10691
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Tested-by: NetBSD Build System
Diffstat (limited to 'xlators')
-rw-r--r--  xlators/cluster/ec/src/ec-common.h |    5
-rw-r--r--  xlators/cluster/ec/src/ec-heal.c   | 1053
2 files changed, 1058 insertions, 0 deletions
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index aaae16e71c3..ba009040b71 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -15,6 +15,11 @@
 
 #include "ec-data.h"
 
+typedef enum {
+        EC_DATA_TXN,
+        EC_METADATA_TXN
+} ec_txn_t;
+
 #define EC_CONFIG_VERSION 0
 
 #define EC_CONFIG_ALGORITHM 0
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 6148df904a4..1e19cf57e1b 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -20,6 +20,35 @@
 #include "ec-mem-types.h"
 #include "ec-data.h"
 
+#include "byte-order.h"
+#include "syncop.h"
+#include "syncop-utils.h"
+#include "cluster-syncop.h"
+
+#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
+#define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
+#define EC_INTERSECT(dst, src1, src2, max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i]; })
+#define EC_ADJUST_SOURCE(source, sources, max) ({int __i; if (sources[source] == 0) {source = -1; for (__i = 0; __i < max; __i++) if (sources[__i]) source = __i; } })
+#define IA_EQUAL(f, s, field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
+#define EC_REPLIES_ALLOC(replies, numsubvols) do {              \
+        int __i = 0;                                            \
+        replies = alloca0(numsubvols * sizeof (*replies));      \
+        for (__i = 0; __i < numsubvols; __i++)                  \
+                INIT_LIST_HEAD (&replies[__i].entries.list);    \
+        } while (0)
+
+
+struct ec_name_data {
+        call_frame_t *frame;
+        unsigned char *participants;
+        unsigned char *failed_on;
+        unsigned char *gfidless;
+        unsigned char *enoent;
+        unsigned char *same;
+        char *name;
+        inode_t *parent;
+        default_args_cbk_t *replies;
+};
 
 static char *ec_ignore_xattrs[] = {
         GF_SELINUX_XATTR_KEY,
@@ -35,6 +64,9 @@ ec_ignorable_key_match (dict_t *dict, char *key, data_t *val, void *mdata)
         if (!key)
                 goto out;
 
+        if (strncmp (key, EC_XATTR_PREFIX, strlen (EC_XATTR_PREFIX)) == 0)
+                        return _gf_true;
+
         for (i = 0; ec_ignore_xattrs[i]; i++) {
                 if (!strcmp (key, ec_ignore_xattrs[i]))
                        return _gf_true;
@@ -1590,3 +1622,1024 @@ void ec_fheal(call_frame_t * frame, xlator_t * this, uintptr_t target,
                 xdata);
     }
 }
+
+/* Common heal code */
+void
+ec_mask_to_char_array (uintptr_t mask, unsigned char *array, int numsubvols)
+{
+        int     i = 0;
+
+        for (i = 0; i < numsubvols; i++)
+                array[i] = ((mask >> i) & 1);
+}
+
+int
+ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
+                        uint64_t *versions, uint64_t *dirty,
+                        unsigned char *sources, unsigned char *healed_sinks)
+{
+        void        *ptr        = NULL;
+        uint64_t    *value      = NULL;
+        uint64_t    max_version = 0;
+        int         source      = -1;
+        int32_t     len         = 0;
+        int         ret         = 0;
+        int         i           = 0;
+        struct iatt source_ia   = {0};
+        struct iatt child_ia    = {0};
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+
+                if (replies[i].op_ret == -1)
+                        continue;
+
+                if (source == -1)
+                        source = i;
+
+                ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_VERSION,
+                                            &ptr, &len);
+                if (ret == 0) {
+                        value = ptr;
+                        versions[i] = ntoh64(value[type]);
+                        if (max_version < versions[i]) {
+                                max_version = versions[i];
+                                source = i;
+                        }
+                }
+
+                ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_DIRTY,
+                                            &ptr, &len);
+                if (ret == 0) {
+                        value = ptr;
+                        dirty[i] = ntoh64(value[type]);
+                }
+        }
+
+        if (source < 0)
+                goto out;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+
+                if (replies[i].op_ret == -1)
+                        continue;
+
+                if (versions[i] == versions[source])
+                        sources[i] = 1;
+                else
+                        healed_sinks[i] = 1;
+        }
+
+        if (type == EC_METADATA_TXN) {
+                source_ia = replies[source].stat;
+                for (i = 0; i < ec->nodes; i++) {
+                        if (!sources[i])
+                                continue;
+                        child_ia = replies[i].stat;
+                        if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+                            !IA_EQUAL(source_ia, child_ia, type) ||
+                            !IA_EQUAL(source_ia, child_ia, prot) ||
+                            !IA_EQUAL(source_ia, child_ia, uid) ||
+                            !IA_EQUAL(source_ia, child_ia, gid)) {
+                                sources[i] = 0;
+                                healed_sinks[i] = 1;
+                        }
+                }
+        }
+out:
+        return source;
+}
+
+int
+ec_adjust_versions (call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode, int source,
+                    unsigned char *sources,
+                    unsigned char *healed_sinks, uint64_t *versions,
+                    uint64_t *dirty)
+{
+        int                        i                 = 0;
+        int                        ret               = 0;
+        dict_t                     *xattr            = NULL;
+        int                        op_ret            = 0;
+        loc_t                      loc               = {0};
+        gf_boolean_t               erase_dirty       = _gf_false;
+        uint64_t                   versions_xattr[2] = {0};
+        uint64_t                   dirty_xattr[2]    = {0};
+        uint64_t                   allzero[2]        = {0};
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        xattr = dict_new ();
+        if (!xattr)
+                goto out;
+
+        /* dirty xattr represents if the file/dir needs heal. Unless all the
+         * copies are healed, don't erase it */
+        if (EC_COUNT (sources, ec->nodes) +
+            EC_COUNT (healed_sinks, ec->nodes) == ec->nodes)
+                erase_dirty = _gf_true;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!sources[i] && !healed_sinks[i])
+                        continue;
+                versions_xattr[type] = hton64(versions[source] - versions[i]);
+                ret = dict_set_static_bin (xattr, EC_XATTR_VERSION,
+                                           versions_xattr,
+                                           sizeof (versions_xattr));
+                if (ret < 0) {
+                        op_ret = -ENOTCONN;
+                        continue;
+                }
+
+                if (erase_dirty) {
+                        dirty_xattr[type] = hton64(-dirty[i]);
+                        ret = dict_set_static_bin (xattr, EC_XATTR_DIRTY,
+                                                   dirty_xattr,
+                                                   sizeof (dirty_xattr));
+                        if (ret < 0) {
+                                op_ret = -ENOTCONN;
+                                continue;
+                        }
+                }
+
+                if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
+                    (memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0))
+                        continue;
+
+                ret = syncop_xattrop (ec->xl_list[i], &loc,
+                                      GF_XATTROP_ADD_ARRAY64, xattr, NULL,
+                                      NULL);
+                if (ret < 0) {
+                        op_ret = -ret;
+                        continue;
+                }
+        }
+
+out:
+        if (xattr)
+                dict_unref (xattr);
+        loc_wipe (&loc);
+        return op_ret;
+}
+
+int
+__ec_heal_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                   unsigned char *locked_on, default_args_cbk_t *replies,
+                   uint64_t *versions, uint64_t *dirty, unsigned char *sources,
+                   unsigned char *healed_sinks, ec_txn_t type)
+{
+        loc_t         loc     = {0};
+        unsigned char *output = NULL;
+        dict_t        *xdata  = NULL;
+        int           ret     = 0;
+        int           source  = 0;
+
+        xdata = dict_new ();
+        if (!xdata) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        if (dict_set_uint64(xdata, "list-xattr", 0)) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        output = alloca0 (ec->nodes);
+        ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies,
+                              output, frame, ec->xl, &loc, xdata);
+        if (ret <= ec->fragments) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        source = ec_heal_find_direction (ec, type, replies, versions,
+                                         dirty, sources, healed_sinks);
+        if (source < 0) {
+                ret = -EIO;
+                goto out;
+        }
+        ret = source;
+out:
+        if (xdata)
+                dict_unref (xdata);
+
+        loc_wipe (&loc);
+        return ret;
+}
+
+/* Metadata heal */
+int
+__ec_removexattr_sinks (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                        int source, unsigned char *sources,
+                        unsigned char *healed_sinks,
+                        default_args_cbk_t *replies)
+{
+        int   i   = 0;
+        int   ret = 0;
+        loc_t loc = {0};
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (i == source)
+                        continue;
+                if (!sources[i] && !healed_sinks[i])
+                        continue;
+                ret = dict_foreach (replies[i].xattr, ec_heal_xattr_clean,
+                                    replies[source].xattr);
+                if (ret < 0) {
+                        sources[i] = 0;
+                        healed_sinks[i] = 0;
+                }
+
+                if (replies[i].xattr->count == 0) {
+                        continue;
+                } else if (sources[i]) {
+                        /* This can happen if setxattr/removexattr succeeds on
+                         * the bricks but fails to update the version. This
+                         * will make sure that the xattrs are made equal after
+                         * heal*/
+                        sources[i] = 0;
+                        healed_sinks[i] = 1;
+                }
+
+                ret = syncop_removexattr (ec->xl_list[i], &loc, "",
+                                          replies[i].xattr, NULL);
+                if (ret < 0)
+                        healed_sinks[i] = 0;
+        }
+
+        loc_wipe (&loc);
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0)
+                return -ENOTCONN;
+        return 0;
+}
+
+int
+__ec_heal_metadata (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                    unsigned char *locked_on)
+{
+        loc_t              loc           = {0};
+        int                ret           = 0;
+        int                source        = 0;
+        default_args_cbk_t *replies      = NULL;
+        uint64_t           *versions     = NULL;
+        uint64_t           *dirty        = NULL;
+        unsigned char      *sources      = NULL;
+        unsigned char      *healed_sinks = NULL;
+        unsigned char      *output       = NULL;
+        dict_t             *source_dict  = NULL;
+        struct iatt        source_buf    = {0};
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        output = alloca0 (ec->nodes);
+        versions = alloca0 (ec->nodes * sizeof (*versions));
+        dirty = alloca0 (ec->nodes * sizeof (*dirty));
+        sources = alloca0 (ec->nodes);
+        healed_sinks = alloca0 (ec->nodes);
+        source = __ec_heal_prepare (frame, ec, inode, locked_on, replies,
+                                    versions, dirty, sources, healed_sinks,
+                                    EC_METADATA_TXN);
+        if (source < 0) {
+                ret = -EIO;
+                goto out;
+        }
+
+        source_buf = replies[source].stat;
+        ret = cluster_setattr (ec->xl_list, healed_sinks, ec->nodes, replies,
+                               output, frame, ec->xl, &loc,
+                               &source_buf, GF_SET_ATTR_MODE |
+                               GF_SET_ATTR_UID | GF_SET_ATTR_GID, NULL);
+        /*In case the operation fails on some of the subvols*/
+        memcpy (healed_sinks, output, ec->nodes);
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        ret = cluster_getxattr (ec->xl_list, locked_on, ec->nodes, replies,
+                                output, frame, ec->xl, &loc, NULL, NULL);
+        EC_INTERSECT (sources, sources, output, ec->nodes);
+        EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+        EC_ADJUST_SOURCE (source, sources, ec->nodes);
+        if ((EC_COUNT (healed_sinks, ec->nodes) == 0) || (source < 0)) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        ret = __ec_removexattr_sinks (frame, ec, inode, source, sources,
+                                      healed_sinks, replies);
+        if (ret < 0)
+                goto out;
+
+        source_dict = dict_ref (replies[source].xattr);
+        if (dict_foreach_match (source_dict, ec_ignorable_key_match, NULL,
+                                dict_remove_foreach_fn, NULL) == -1) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = cluster_setxattr (ec->xl_list, healed_sinks, ec->nodes,
+                                replies, output, frame, ec->xl, &loc,
+                                source_dict, 0, NULL);
+
+        EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        ret = ec_adjust_versions (frame, ec, EC_METADATA_TXN, inode, source,
+                                  sources, healed_sinks, versions, dirty);
+out:
+        if (source_dict)
+                dict_unref (source_dict);
+
+        loc_wipe (&loc);
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+int
+ec_heal_metadata (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+{
+        unsigned char      *locked_on  = NULL;
+        unsigned char      *up_subvols = NULL;
+        unsigned char      *output     = NULL;
+        int                ret         = 0;
+        default_args_cbk_t *replies    = NULL;
+        call_frame_t       *frame      = NULL;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        frame = copy_frame (req_frame);
+        if (!frame) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        /*Do heal as root*/
+        frame->root->uid = 0;
+        frame->root->gid = 0;
+        locked_on = alloca0(ec->nodes);
+        output = alloca0(ec->nodes);
+        up_subvols = alloca0(ec->nodes);
+        ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
+        ret = cluster_inodelk (ec->xl_list, up_subvols, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name, inode, 0,
+                               0);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_heal_metadata (frame, ec, inode, locked_on);
+        }
+unlock:
+        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, inode, 0, 0);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        if (frame)
+                STACK_DESTROY (frame->root);
+        return ret;
+}
+
+/*entry heal*/
+int
+__ec_heal_entry_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                         unsigned char *locked_on, uint64_t *versions,
+                         uint64_t *dirty, unsigned char *sources,
+                         unsigned char *healed_sinks)
+{
+        int                source   = 0;
+        default_args_cbk_t *replies = NULL;
+        loc_t              loc      = {0};
+        int                ret      = 0;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        source = __ec_heal_prepare (frame, ec, inode, locked_on, replies,
+                                    versions, dirty, sources, healed_sinks,
+                                    EC_DATA_TXN);
+        if (source < 0) {
+                ret = -EIO;
+                goto out;
+        }
+        ret = source;
+out:
+        loc_wipe (&loc);
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+/*Name heal*/
+int
+ec_delete_stale_name (dict_t *gfid_db, char *key, data_t *d, void *data)
+{
+        struct ec_name_data *name_data   = data;
+        struct iatt         *ia          = NULL;
+        ec_t                *ec          = NULL;
+        loc_t               loc          = {0};
+        unsigned char       *same        = data_to_bin (d);
+        default_args_cbk_t  *replies     = NULL;
+        unsigned char       *output      = NULL;
+        int                 ret          = 0;
+        int                 estale_count = 0;
+        int                 i            = 0;
+        call_frame_t        *frame       = name_data->frame;
+
+        ec = name_data->frame->this->private;
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        if (EC_COUNT (same, ec->nodes) >= ec->fragments) {
+                ret = 0;
+                goto out;
+        }
+
+        loc.inode = inode_new (name_data->parent->table);
+        if (!loc.inode) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        gf_uuid_parse (key, loc.gfid);
+        output = alloca0(ec->nodes);
+        ret = cluster_lookup (ec->xl_list, name_data->participants, ec->nodes,
+                              replies, output, name_data->frame, ec->xl, &loc,
+                              NULL);
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+                if (replies[i].op_ret == -1) {
+                        if (replies[i].op_errno == ESTALE ||
+                            replies[i].op_errno == ENOENT)
+                                estale_count++;
+                        else
+                                name_data->participants[i] = 0;
+                }
+        }
+
+        if (estale_count <= ec->redundancy) {
+                /* We have at least ec->fragments number of fragments, so the
+                 * file is recoverable, so don't delete it*/
+
+                /* Please note that the lookup call above could fail with
+                 * ENOTCONN on all subvoumes and still this branch will be
+                 * true, but in those cases conservatively we decide to not
+                 * delete the file until we are sure*/
+                ret = 0;
+                goto out;
+        }
+
+        /*Noway to recover, delete the name*/
+        loc_wipe (&loc);
+        loc.parent = inode_ref (name_data->parent);
+        gf_uuid_copy (loc.pargfid, loc.parent->gfid);
+        loc.name = name_data->name;
+        for (i = 0; i < ec->nodes; i++) {
+                if (same[i] && replies[i].valid && (replies[i].op_ret == 0)) {
+                        ia = &replies[i].stat;
+                        break;
+                }
+        }
+
+        if (!ia) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        if (IA_ISDIR (ia->ia_type)) {
+                ret = cluster_rmdir (ec->xl_list, same, ec->nodes, replies,
+                                     output, frame, ec->xl, &loc, 1, NULL);
+        } else {
+                ret = cluster_unlink (ec->xl_list, same, ec->nodes, replies,
+                                      output, frame, ec->xl, &loc, 0, NULL);
+        }
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (output[i]) {
+                        same[i] = 0;
+                        name_data->enoent[i] = 1;
+                } else {
+                        /*op failed*/
+                        if (same[i])
+                                name_data->participants[i] = 0;
+                }
+        }
+        ret = 0;
+        /*This will help in making decisions about creating names*/
+        dict_del (gfid_db, key);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        loc_wipe (&loc);
+        return ret;
+}
+
+int
+ec_delete_stale_names (call_frame_t *frame, ec_t *ec, inode_t *parent,
+                       char *name, default_args_cbk_t *replies, dict_t *gfid_db,
+                       unsigned char *enoent, unsigned char *gfidless,
+                       unsigned char *participants)
+{
+        struct ec_name_data name_data = {0};
+
+        name_data.enoent = enoent;
+        name_data.gfidless = gfidless;
+        name_data.participants = participants;
+        name_data.name = name;
+        name_data.parent = parent;
+        name_data.frame = frame;
+        name_data.replies = replies;
+        return dict_foreach (gfid_db, ec_delete_stale_name, &name_data);
+}
+
+int
+_assign_same (dict_t *dict, char *key, data_t *value, void *data)
+{
+        struct ec_name_data *name_data = data;
+
+        name_data->same = data_to_bin (value);
+        return 0;
+}
+
+int
+ec_create_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+                default_args_cbk_t *lookup_replies, dict_t *gfid_db,
+                unsigned char *enoent, unsigned char *participants)
+{
+        int                 ret       = 0;
+        int                 i         = 0;
+        struct ec_name_data name_data = {0};
+        struct iatt         *ia       = NULL;
+        unsigned char       *output   = 0;
+        unsigned char       *output1  = 0;
+        default_args_cbk_t  *replies  = NULL;
+        loc_t               loc       = {0};
+        loc_t               srcloc    = {0};
+        unsigned char       *link     = NULL;
+        unsigned char       *create   = NULL;
+        dict_t              *xdata    = NULL;
+        char                *linkname = NULL;
+
+        /* There should be just one gfid key */
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        if (gfid_db->count != 1) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        ret = dict_foreach (gfid_db, _assign_same, &name_data);
+        if (ret < 0)
+                goto out;
+        /*There should at least be one valid success reply with gfid*/
+        for (i = 0; i < ec->nodes; i++)
+                if (name_data.same[i])
+                        break;
+
+        if (i == ec->nodes) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        ia = &lookup_replies[i].stat;
+        xdata = dict_new ();
+        loc.parent = inode_ref (parent);
+        gf_uuid_copy (loc.pargfid, parent->gfid);
+        loc.inode = inode_new (parent->table);
+        if (loc.inode)
+                srcloc.inode = inode_ref (loc.inode);
+        gf_uuid_copy (srcloc.gfid, ia->ia_gfid);
+        if (!loc.inode || !xdata || dict_set_static_bin (xdata, "gfid-req",
+                                                         ia->ia_gfid,
+                                                        sizeof (ia->ia_gfid))) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        loc.name = name;
+        link = alloca0 (ec->nodes);
+        create = alloca0 (ec->nodes);
+        output = alloca0 (ec->nodes);
+        output1 = alloca0 (ec->nodes);
+        switch (ia->ia_type) {
+        case IA_IFDIR:
+                ret = cluster_mkdir (ec->xl_list, enoent, ec->nodes,
+                                   replies, output, frame, ec->xl, &loc,
+                                   st_mode_from_ia (ia->ia_prot,
+                                                ia->ia_type), 0, xdata);
+                break;
+
+        case IA_IFLNK:
+                /*Check for hard links and create/link*/
+                ret = cluster_lookup (ec->xl_list, enoent, ec->nodes,
+                                      replies, output, frame, ec->xl,
+                                      &srcloc, NULL);
+                for (i = 0; i < ec->nodes; i++) {
+                        if (output[i]) {
+                                link[i] = 1;
+                        } else {
+                                if (replies[i].op_errno == ENOENT ||
+                                    replies[i].op_errno == ESTALE) {
+                                        create[i] = 1;
+                                }
+                        }
+                }
+
+                if (EC_COUNT (link, ec->nodes)) {
+                        cluster_link (ec->xl_list, link, ec->nodes,
+                                      replies, output1, frame, ec->xl,
+                                      &srcloc, &loc, NULL);
+                }
+
+                if (EC_COUNT (create, ec->nodes)) {
+                        cluster_readlink (ec->xl_list, name_data.same,
+                                          ec->nodes, replies, output,
+                                          frame, ec->xl, &srcloc, 4096,
+                                          NULL);
+                        if (EC_COUNT (output, ec->nodes) == 0) {
+                                ret = -ENOTCONN;
+                                goto out;
+                        }
+
+                        for (i = 0; i < ec->nodes; i++) {
+                                if (output[i])
+                                        break;
+                        }
+                        linkname = alloca0 (strlen(replies[i].buf) + 1);
+                        strcpy (linkname, replies[i].buf);
+                        cluster_symlink (ec->xl_list, create, ec->nodes,
+                                         replies, output, frame, ec->xl,
+                                         linkname, &loc, 0, xdata);
+                }
+                for (i = 0; i < ec->nodes; i++)
+                        if (output1[i])
+                                output[i] = 1;
+                break;
+        default:
+                ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY,
+                                      1);
+                if (ret)
+                        goto out;
+                ret = cluster_mknod (ec->xl_list, enoent, ec->nodes,
+                                     replies, output, frame, ec->xl,
+                                     &loc, st_mode_from_ia (ia->ia_prot,
+                                                           ia->ia_type),
+                                     ia->ia_rdev, 0, xdata);
+                break;
+        }
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (enoent[i] && !output[i])
+                        participants[i] = 0;
+        }
+
+        ret = 0;
+out:
+        loc_wipe (&loc);
+        loc_wipe (&srcloc);
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        if (xdata)
+                dict_unref (xdata);
+        return ret;
+}
+
+int
+__ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+                unsigned char *participants)
+{
+        unsigned char      *output   = NULL;
+        unsigned char      *enoent   = NULL;
+        default_args_cbk_t *replies  = NULL;
+        dict_t             *xdata    = NULL;
+        dict_t             *gfid_db  = NULL;
+        int                ret       = 0;
+        loc_t              loc       = {0};
+        int                i         = 0;
+        struct iatt        *ia       = NULL;
+        char               gfid[64]  = {0};
+        unsigned char      *same     = NULL;
+        unsigned char      *gfidless = NULL;
+
+        loc.parent = inode_ref (parent);
+        loc.inode = inode_new (parent->table);
+        gf_uuid_copy (loc.pargfid, parent->gfid);
+        loc.name = name;
+        xdata = dict_new ();
+        gfid_db = dict_new ();
+        if (!xdata || !gfid_db || !loc.inode) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = dict_set_int32 (xdata, GF_GFIDLESS_LOOKUP, 1);
+        if (ret) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        output = alloca0 (ec->nodes);
+        gfidless = alloca0 (ec->nodes);
+        enoent = alloca0 (ec->nodes);
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        ret = cluster_lookup (ec->xl_list, participants, ec->nodes, replies,
+                              output, frame, ec->xl, &loc, NULL);
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+
+                if (replies[i].op_ret == -1) {
+                        /*If ESTALE comes here, that means parent dir is not
+                         * present, nothing to do there, so reset participants
+                         * for that brick*/
+                        if (replies[i].op_errno == ENOENT)
+                                enoent[i] = 1;
+                        else
+                                participants[i] = 0;
+                        continue;
+                }
+                ia = &replies[i].stat;
+                if (gf_uuid_is_null (ia->ia_gfid)) {
+                        if (IA_ISDIR (ia->ia_type) || ia->ia_size == 0)
+                                gfidless[i] = 1;
+                        else
+                                participants[i] = 0;
+                } else {
+                        uuid_utoa_r (ia->ia_gfid, gfid);
+                        ret = dict_get_bin (gfid_db, gfid, (void **)&same);
+                        if (ret < 0) {
+                                same = alloca0(ec->nodes);
+                        }
+                        same[i] = 1;
+                        if (ret < 0) {
+                                ret = dict_set_static_bin (gfid_db, gfid, same,
+                                                           ec->nodes);
+                        }
+                        if (ret < 0)
+                                goto out;
+                }
+        }
+
+        ret = ec_delete_stale_names (frame, ec, parent, name, replies, gfid_db,
+                                     enoent, gfidless, participants);
+
+        if (gfid_db->count == 0) {
+                /* All entries seem to be stale entries and deleted,
+                 * nothing more to do.*/
+                goto out;
+        }
+
+        if (gfid_db->count > 1) {
+                gf_log (ec->xl->name, GF_LOG_INFO, "%s/%s: Not able to heal",
+                        uuid_utoa (parent->gfid), name);
+                memset (participants, 0, ec->nodes);
+                goto out;
+        }
+
+        EC_INTERSECT (enoent, enoent, participants, ec->nodes);
+        if (EC_COUNT (enoent, ec->nodes) == 0) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = ec_create_name (frame, ec, parent, name, replies, gfid_db, enoent,
+                              participants);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        loc_wipe (&loc);
+        if (xdata)
+                dict_unref (xdata);
+        if (gfid_db)
+                dict_unref (gfid_db);
+        return ret;
+}
+
+int
+ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+              unsigned char *participants)
+{
+        int                ret        = 0;
+        default_args_cbk_t *replies   = NULL;
+        unsigned char      *output    = NULL;
+        unsigned char      *locked_on = NULL;
+        loc_t              loc        = {0};
+
+        loc.parent = inode_ref (parent);
+        loc.name = name;
+        loc.inode = inode_new (parent->table);
+        if (!loc.inode) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        output = alloca0 (ec->nodes);
+        locked_on = alloca0 (ec->nodes);
+        ret = cluster_entrylk (ec->xl_list, participants, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name, parent,
+                               NULL);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (parent->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                EC_INTERSECT (participants, participants, locked_on, ec->nodes);
+                ret = __ec_heal_name (frame, ec, parent, name, participants);
+        }
+unlock:
+        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, parent, NULL);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        loc_wipe (&loc);
+        return ret;
+}
+
+int
+ec_name_heal_handler (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+                      void *data)
+{
+        struct ec_name_data *name_data = data;
+        xlator_t            *this      = THIS;
+        ec_t                *ec        = this->private;
+        unsigned char       *name_on   = alloca0 (ec->nodes);
+        int                 i          = 0;
+        int                 ret        = 0;
+
+        memcpy (name_on, name_data->participants, ec->nodes);
+        ret = ec_heal_name (name_data->frame, ec, parent->inode,
+                            entry->d_name, name_on);
+
+        if (ret < 0)
+                memset (name_on, 0, ec->nodes);
+
+        for (i = 0; i < ec->nodes; i++)
+                if (name_data->participants[i] && !name_on[i])
+                        name_data->failed_on[i] = 1;
+        return 0;
+}
+
+int
+ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
+               unsigned char *participants)
+{
+        int i = 0;
+        int j = 0;
+        loc_t loc = {0};
+        struct ec_name_data name_data = {0};
+
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+        name_data.frame = frame;
+        name_data.participants = participants;
+        name_data.failed_on = alloca0(ec->nodes);;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!participants[i])
+                        continue;
+                syncop_dir_scan (ec->xl_list[i], &loc,
+                                GF_CLIENT_PID_AFR_SELF_HEALD, &name_data,
+                                ec_name_heal_handler);
+                for (j = 0; j < ec->nodes; j++)
+                        if (name_data.failed_on[j])
+                                participants[j] = 0;
+
+                if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
+                        return -ENOTCONN;
+        }
+        return 0;
+}
+
+int
+__ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
+                 unsigned char *heal_on)
+{
+        unsigned char      *locked_on    = NULL;
+        unsigned char      *output       = NULL;
+        uint64_t           *versions     = NULL;
+        uint64_t           *dirty        = NULL;
+        unsigned char      *sources      = NULL;
+        unsigned char      *healed_sinks = NULL;
+        unsigned char      *participants = NULL;
+        default_args_cbk_t *replies      = NULL;
+        int                ret           = 0;
+        int                source        = 0;
+        int                i             = 0;
+
+        locked_on = alloca0(ec->nodes);
+        output = alloca0(ec->nodes);
+        versions = alloca0 (ec->nodes * sizeof (*versions));
+        dirty = alloca0 (ec->nodes * sizeof (*dirty));
+        sources = alloca0 (ec->nodes);
+        healed_sinks = alloca0 (ec->nodes);
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        ret = cluster_entrylk (ec->xl_list, heal_on, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name, inode,
+                               NULL);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_heal_entry_prepare (frame, ec, inode, locked_on,
+                                               versions, dirty, sources,
+                                               healed_sinks);
+                source = ret;
+        }
+unlock:
+        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, inode, NULL);
+        if (ret < 0)
+                goto out;
+
+        participants = alloca0 (ec->nodes);
+        for (i = 0; i < ec->nodes; i++) {
+                if (sources[i] || healed_sinks[i])
+                        participants[i] = 1;
+        }
+        ret = ec_heal_names (frame, ec, inode, participants);
+
+        if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
+                goto out;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!participants[i]) {
+                        sources[i] = 0;
+                        healed_sinks[i] = 0;
+                }
+        }
+
+        ec_adjust_versions (frame, ec, EC_DATA_TXN, inode, source,
+                            sources, healed_sinks, versions, dirty);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+int
+ec_heal_entry (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+{
+        unsigned char      *locked_on            = NULL;
+        unsigned char      *up_subvols           = NULL;
+        unsigned char      *output               = NULL;
+        char               selfheal_domain[1024] = {0};
+        int                ret                   = 0;
+        default_args_cbk_t *replies              = NULL;
+        call_frame_t       *frame                = NULL;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        locked_on = alloca0(ec->nodes);
+        output = alloca0(ec->nodes);
+        up_subvols = alloca0(ec->nodes);
+
+        frame = copy_frame (req_frame);
+        if (!frame) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        /*Do heal as root*/
+        frame->root->uid = 0;
+        frame->root->gid = 0;
+        sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
+        ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
+        /*If other processes are already doing the heal, don't block*/
+        ret = cluster_entrylk (ec->xl_list, up_subvols, ec->nodes, replies,
+                               locked_on, frame, ec->xl, selfheal_domain, inode,
+                               NULL);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_heal_entry (frame, ec, inode, locked_on);
+        }
+unlock:
+        cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, selfheal_domain, inode, NULL);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        if (frame)
+                STACK_DESTROY (frame->root);
+        return ret;
+}
