1 files changed, 106 insertions, 18 deletions
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 81f6add5bb0..7d991f04aac 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -70,6 +70,7 @@ struct ec_name_data {
     char *name;
     inode_t *parent;
     default_args_cbk_t *replies;
+    uint32_t heal_pending;
 };
 
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
@@ -994,6 +995,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
         ret = -ENOTCONN;
         goto out;
     }
+
 out:
     if (xattr)
         dict_unref(xattr);
@@ -1172,6 +1174,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
     dict_t *xdata = NULL;
     char *linkname = NULL;
     ec_config_t config;
+
     /* There should be just one gfid key */
     EC_REPLIES_ALLOC(replies, ec->nodes);
     if (gfid_db->count != 1) {
@@ -1416,6 +1419,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
 
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
                          participants);
+    if (ret >= 0) {
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
+         * file has been created and it will need to be healed. */
+        ret = 1;
+    }
 out:
     cluster_replies_wipe(replies, ec->nodes);
     loc_wipe(&loc);
@@ -1493,18 +1501,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
                        name_on);
 
-    if (ret < 0)
+    if (ret < 0) {
         memset(name_on, 0, ec->nodes);
+    } else {
+        name_data->heal_pending += ret;
+    }
 
     for (i = 0; i < ec->nodes; i++)
         if (name_data->participants[i] && !name_on[i])
             name_data->failed_on[i] = 1;
+
     return 0;
 }
 
 int
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *participants)
+              unsigned char *participants, uint32_t *pending)
 {
     int i = 0;
     int j = 0;
@@ -1517,7 +1529,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
     name_data.frame = frame;
     name_data.participants = participants;
     name_data.failed_on = alloca0(ec->nodes);
-    ;
+    name_data.heal_pending = 0;
 
     for (i = 0; i < ec->nodes; i++) {
         if (!participants[i])
@@ -1536,6 +1548,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
             break;
         }
     }
+    *pending += name_data.heal_pending;
+
     loc_wipe(&loc);
     return ret;
 }
@@ -1543,7 +1557,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
 int
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
                 unsigned char *heal_on, unsigned char *sources,
-                unsigned char *healed_sinks)
+                unsigned char *healed_sinks, uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *output = NULL;
@@ -1588,7 +1602,7 @@ unlock:
         if (sources[i] || healed_sinks[i])
             participants[i] = 1;
     }
-    ret = ec_heal_names(frame, ec, inode, participants);
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
 
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
         goto out;
@@ -1609,7 +1623,8 @@ out:
 
 int
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *sources, unsigned char *healed_sinks)
+              unsigned char *sources, unsigned char *healed_sinks,
+              uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *up_subvols = NULL;
@@ -1640,7 +1655,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
             goto unlock;
         }
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
-                              healed_sinks);
+                              healed_sinks, pending);
     }
 unlock:
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
@@ -1961,14 +1976,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
             if (fop->cbks.heal) {
                 fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0,
                                (heal->good | heal->bad), heal->good, heal->bad,
-                               NULL);
+                               0, NULL);
             }
 
             return EC_STATE_END;
         case -EC_STATE_REPORT:
             if (fop->cbks.heal) {
                 fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
-                               fop->error, 0, 0, 0, NULL);
+                               fop->error, 0, 0, 0, 0, NULL);
             }
 
             return EC_STATE_END;
@@ -2005,14 +2020,15 @@ out:
     if (fop != NULL) {
         ec_manager(fop, error);
     } else {
-        func(frame, heal, this, -1, error, 0, 0, 0, NULL);
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
     }
 }
 
 int32_t
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
+                   dict_t *xdata)
 {
     ec_heal_t *heal = cookie;
 
@@ -2481,6 +2497,58 @@ out:
     return ret;
 }
 
+int
+ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode)
+{
+    int i = 0;
+    int ret = 0;
+    dict_t **xattr = NULL;
+    loc_t loc = {0};
+    uint64_t dirty_xattr[EC_VERSION_SIZE] = {0};
+    unsigned char *on = NULL;
+    default_args_cbk_t *replies = NULL;
+    dict_t *dict = NULL;
+
+    /* Allocate the required memory */
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    on = alloca0(ec->nodes);
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+    if (!xattr) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    dict = dict_new();
+    if (!dict) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    for (i = 0; i < ec->nodes; i++) {
+        xattr[i] = dict;
+        on[i] = 1;
+    }
+    ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr,
+                              (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+    if (ret < 0) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+                        ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64,
+                        xattr, NULL);
+out:
+    if (dict) {
+        dict_unref(dict);
+    }
+    if (xattr) {
+        GF_FREE(xattr);
+    }
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return ret;
+}
+
 void
 ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
 {
@@ -2498,6 +2566,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     intptr_t mbad = 0;
     intptr_t good = 0;
     intptr_t bad = 0;
+    uint32_t pending = 0;
     ec_fop_data_t *fop = data;
     gf_boolean_t blocking = _gf_false;
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
@@ -2533,7 +2602,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     if (loc->name && strlen(loc->name)) {
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
                            participants);
-        if (ret == 0) {
+        if (ret >= 0) {
             gf_msg_debug(this->name, 0,
                          "%s: name heal "
                          "successful on %" PRIXPTR,
@@ -2551,23 +2620,34 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
 
     /* Mount triggers heal only when it detects that it must need heal, shd
      * triggers heals periodically which need not be thorough*/
-    if (ec->shd.iamshd) {
+    if (ec->shd.iamshd && (ret <= 0)) {
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
                         &need_heal);
 
-        if (need_heal == EC_HEAL_NONEED) {
+        if (need_heal == EC_HEAL_PURGE_INDEX) {
+            gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+                   "Index entry needs to be purged for: %s ",
+                   uuid_utoa(loc->gfid));
+            /* We need to send zero-xattrop so that stale index entry could be
+             * removed. We need not take lock on this entry to do so as
+             * xattrop on a brick is atomic. */
+            ec_heal_purge_stale_index(frame, ec, loc->inode);
+            goto out;
+        } else if (need_heal == EC_HEAL_NONEED) {
             gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
                    "Heal is not required for : %s ", uuid_utoa(loc->gfid));
             goto out;
         }
     }
+
     sources = alloca0(ec->nodes);
     healed_sinks = alloca0(ec->nodes);
     if (IA_ISREG(loc->inode->ia_type)) {
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
                            healed_sinks);
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+                            &pending);
     } else {
         ret = 0;
         memcpy(sources, participants, ec->nodes);
@@ -2597,10 +2677,11 @@ out:
     if (fop->cbks.heal) {
         fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno,
                        ec_char_array_to_mask(participants, ec->nodes),
-                       mgood & good, mbad & bad, NULL);
+                       mgood & good, mbad & bad, pending, NULL);
     }
     if (frame)
         STACK_DESTROY(frame->root);
+
     return;
 }
 
@@ -2648,7 +2729,7 @@ ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
 {
     if (fop->cbks.heal) {
         fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
-                       0, NULL);
+                       0, 0, NULL);
     }
     ec_fop_data_release(fop);
 }
@@ -2835,7 +2916,7 @@ fail:
     if (fop)
         ec_fop_data_release(fop);
     if (func)
-        func(frame, data, this, -1, err, 0, 0, 0, NULL);
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
 }
 
 int
@@ -2964,6 +3045,13 @@ _need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
                     goto out;
                 }
             }
+            /* If lock count is 0, all dirty flags are 0 and all the
+             * versions are macthing then why are we here. It looks
+             * like something went wrong while removing the index entries
+             * after completing a successful heal or fop. In this case
+             * we need to remove this index entry to avoid triggering heal
+             * in a loop and causing lookups again and again*/
+            *need_heal = EC_HEAL_PURGE_INDEX;
         } else {
             for (i = 0; i < ec->nodes; i++) {
                 /* Since each lock can only increment the dirty