10 files changed, 345 insertions, 93 deletions
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
index 9d712b359a0..703a30e2485 100644
--- a/xlators/cluster/ec/src/ec-combine.c
+++ b/xlators/cluster/ec/src/ec-combine.c
@@ -343,9 +343,8 @@ out:
 }
 
 static int32_t
-ec_dict_data_concat(const char *fmt, ec_cbk_data_t *cbk, int32_t which,
-                    char *key, char *new_key, const char *def,
-                    gf_boolean_t global, ...)
+ec_dict_data_concat(ec_cbk_data_t *cbk, int32_t which, char *key, char *new_key,
+                    const char *def, gf_boolean_t global, const char *fmt, ...)
 {
     ec_t *ec = cbk->fop->xl->private;
     data_t *data[ec->nodes];
@@ -357,7 +356,7 @@ ec_dict_data_concat(const char *fmt, ec_cbk_data_t *cbk, int32_t which,
 
     ec_dict_list(data, cbk, which, key, global);
 
-    va_start(args, global);
+    va_start(args, fmt);
     err = ec_concat_prepare(cbk->fop->xl, &pre, &sep, &post, fmt, args);
     va_end(args);
 
@@ -730,14 +729,14 @@ ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg)
 
     if ((strcmp(key, GF_XATTR_PATHINFO_KEY) == 0) ||
         (strcmp(key, GF_XATTR_USER_PATHINFO_KEY) == 0)) {
-        return ec_dict_data_concat("(<EC:%s> { })", data->cbk, data->which, key,
-                                   NULL, NULL, _gf_false,
+        return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+                                   _gf_false, "(<EC:%s> { })",
                                    data->cbk->fop->xl->name);
     }
 
     if (strncmp(key, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD)) == 0) {
-        return ec_dict_data_concat("{\n}", data->cbk, data->which, key, NULL,
-                                   NULL, _gf_false);
+        return ec_dict_data_concat(data->cbk, data->which, key, NULL, NULL,
+                                   _gf_false, "{\n}");
     }
 
     if (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) == 0) {
@@ -767,9 +766,9 @@ ec_dict_data_combine(dict_t *dict, char *key, data_t *value, void *arg)
     if (XATTR_IS_NODE_UUID(key)) {
         if (data->cbk->fop->int32) {
             /* List of node uuid is requested */
-            return ec_dict_data_concat("{ }", data->cbk, data->which, key,
+            return ec_dict_data_concat(data->cbk, data->which, key,
                                        GF_XATTR_LIST_NODE_UUIDS_KEY, UUID0_STR,
-                                       _gf_true);
+                                       _gf_true, "{ }");
         } else {
             return ec_dict_data_uuid(data->cbk, data->which, key);
         }
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index ded34b81aa2..b955efd8c2d 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
 int32_t
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
-               uintptr_t bad, dict_t *xdata)
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
 {
     if (op_ret < 0) {
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
@@ -316,17 +316,19 @@ ec_check_status(ec_fop_data_t *fop)
         }
     }
 
-    gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
-           "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
-           "remaining=%s, good=%s, bad=%s, %s)",
-           gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
-           ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
-           ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
-           ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
-           ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
-           ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
-                  ec->nodes),
-           ec_msg_str(fop));
+    gf_msg(
+        fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
+        "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+        "remaining=%s, good=%s, bad=%s,"
+        "(Least significant bit represents first client/brick of subvol), %s)",
+        gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+        ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+        ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+        ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+        ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+        ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
+               ec->nodes),
+        ec_msg_str(fop));
     if (fop->use_fd) {
         if (fop->fd != NULL) {
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
@@ -614,10 +616,10 @@ ec_msg_str(ec_fop_data_t *fop)
     loc_t *loc2 = NULL;
     char gfid1[64] = {0};
     char gfid2[64] = {0};
+    ec_fop_data_t *parent = fop->parent;
 
     if (fop->errstr)
         return fop->errstr;
-
     if (!fop->use_fd) {
         loc1 = &fop->loc[0];
         loc2 = &fop->loc[1];
@@ -625,23 +627,45 @@ ec_msg_str(ec_fop_data_t *fop)
         if (fop->id == GF_FOP_RENAME) {
             gf_asprintf(&fop->errstr,
                         "FOP : '%s' failed on '%s' and '%s' with gfids "
-                        "%s and %s respectively",
+                        "%s and %s respectively. Parent FOP: %s",
                         ec_fop_name(fop->id), loc1->path, loc2->path,
                         uuid_utoa_r(loc1->gfid, gfid1),
-                        uuid_utoa_r(loc2->gfid, gfid2));
+                        uuid_utoa_r(loc2->gfid, gfid2),
+                        parent ? ec_fop_name(parent->id) : "No Parent");
         } else {
-            gf_asprintf(&fop->errstr, "FOP : '%s' failed on '%s' with gfid %s",
-                        ec_fop_name(fop->id), loc1->path,
-                        uuid_utoa_r(loc1->gfid, gfid1));
+            gf_asprintf(
+                &fop->errstr,
+                "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s",
+                ec_fop_name(fop->id), loc1->path,
+                uuid_utoa_r(loc1->gfid, gfid1),
+                parent ? ec_fop_name(parent->id) : "No Parent");
         }
     } else {
-        gf_asprintf(&fop->errstr, "FOP : '%s' failed on gfid %s",
-                    ec_fop_name(fop->id),
-                    uuid_utoa_r(fop->fd->inode->gfid, gfid1));
+        gf_asprintf(
+            &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s",
+            ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1),
+            parent ? ec_fop_name(parent->id) : "No Parent");
     }
     return fop->errstr;
 }
 
+static void
+ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need,
+                        int32_t loglevel)
+{
+    ec_t *ec = fop->xl->private;
+    char str1[32], str2[32], str3[32];
+
+    gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT,
+           "Insufficient available children for this request: "
+           "Have : %d, Need : %u : Child UP : %s "
+           "Mask: %s, Healing : %s : %s ",
+           have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+           ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+           ec_bin(str3, sizeof(str3), fop->healing, ec->nodes),
+           ec_msg_str(fop));
+}
+
 static int32_t
 ec_child_select(ec_fop_data_t *fop)
 {
@@ -699,11 +723,7 @@ ec_child_select(ec_fop_data_t *fop)
     ec_trace("SELECT", fop, "");
 
     if ((num < fop->minimum) && (num < ec->fragments)) {
-        gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT,
-               "Insufficient available children "
-               "for this request (have %d, need "
-               "%d). %s",
-               num, fop->minimum, ec_msg_str(fop));
+        ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR);
         return 0;
     }
 
@@ -711,11 +731,7 @@ ec_child_select(ec_fop_data_t *fop)
         (fop->locks[0].update[EC_DATA_TXN] ||
          fop->locks[0].update[EC_METADATA_TXN])) {
         if (ec->quorum_count && (num < ec->quorum_count)) {
-            gf_msg(ec->xl->name, GF_LOG_ERROR, 0, EC_MSG_CHILDS_INSUFFICIENT,
-                   "Insufficient available children "
-                   "for this request (have %d, need "
-                   "%d). %s",
-                   num, ec->quorum_count, ec_msg_str(fop));
+            ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR);
             return 0;
         }
     }
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
index ef6b06fa4dd..f71dcfac293 100644
--- a/xlators/cluster/ec/src/ec-dir-read.c
+++ b/xlators/cluster/ec/src/ec-dir-read.c
@@ -386,9 +386,16 @@ ec_manager_readdir(ec_fop_data_t *fop, int32_t state)
             /* Return error if opendir has not been successfully called on
              * any subvolume. */
             ctx = ec_fd_get(fop->fd, fop->xl);
-            if ((ctx == NULL) || (ctx->open == 0)) {
-                fop->error = EINVAL;
+            if (ctx == NULL) {
+                fop->error = ENOMEM;
+            } else if (ctx->open == 0) {
+                fop->error = EBADFD;
+            }
 
+            if (fop->error) {
+                gf_msg(fop->xl->name, GF_LOG_ERROR, fop->error,
+                       EC_MSG_INVALID_REQUEST, "EC is not winding readdir: %s",
+                       ec_msg_str(fop));
                 return EC_STATE_REPORT;
             }
 
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 81f6add5bb0..7d991f04aac 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -70,6 +70,7 @@ struct ec_name_data {
     char *name;
     inode_t *parent;
     default_args_cbk_t *replies;
+    uint32_t heal_pending;
 };
 
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
@@ -994,6 +995,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
         ret = -ENOTCONN;
         goto out;
     }
+
 out:
     if (xattr)
         dict_unref(xattr);
@@ -1172,6 +1174,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
     dict_t *xdata = NULL;
     char *linkname = NULL;
     ec_config_t config;
+
     /* There should be just one gfid key */
     EC_REPLIES_ALLOC(replies, ec->nodes);
     if (gfid_db->count != 1) {
@@ -1416,6 +1419,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
 
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
                          participants);
+    if (ret >= 0) {
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
+         * file has been created and it will need to be healed. */
+        ret = 1;
+    }
 out:
     cluster_replies_wipe(replies, ec->nodes);
     loc_wipe(&loc);
@@ -1493,18 +1501,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
                        name_on);
 
-    if (ret < 0)
+    if (ret < 0) {
         memset(name_on, 0, ec->nodes);
+    } else {
+        name_data->heal_pending += ret;
+    }
 
     for (i = 0; i < ec->nodes; i++)
         if (name_data->participants[i] && !name_on[i])
             name_data->failed_on[i] = 1;
+
     return 0;
 }
 
 int
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *participants)
+              unsigned char *participants, uint32_t *pending)
 {
     int i = 0;
     int j = 0;
@@ -1517,7 +1529,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
     name_data.frame = frame;
     name_data.participants = participants;
     name_data.failed_on = alloca0(ec->nodes);
-    ;
+    name_data.heal_pending = 0;
 
     for (i = 0; i < ec->nodes; i++) {
         if (!participants[i])
@@ -1536,6 +1548,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
             break;
         }
     }
+    *pending += name_data.heal_pending;
+
     loc_wipe(&loc);
     return ret;
 }
@@ -1543,7 +1557,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
 int
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
                 unsigned char *heal_on, unsigned char *sources,
-                unsigned char *healed_sinks)
+                unsigned char *healed_sinks, uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *output = NULL;
@@ -1588,7 +1602,7 @@ unlock:
         if (sources[i] || healed_sinks[i])
             participants[i] = 1;
     }
-    ret = ec_heal_names(frame, ec, inode, participants);
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
 
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
         goto out;
@@ -1609,7 +1623,8 @@ out:
 
 int
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *sources, unsigned char *healed_sinks)
+              unsigned char *sources, unsigned char *healed_sinks,
+              uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *up_subvols = NULL;
@@ -1640,7 +1655,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
             goto unlock;
         }
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
-                              healed_sinks);
+                              healed_sinks, pending);
     }
 unlock:
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
@@ -1961,14 +1976,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
             if (fop->cbks.heal) {
                 fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0,
                                (heal->good | heal->bad), heal->good, heal->bad,
-                               NULL);
+                               0, NULL);
             }
 
             return EC_STATE_END;
         case -EC_STATE_REPORT:
             if (fop->cbks.heal) {
                 fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
-                               fop->error, 0, 0, 0, NULL);
+                               fop->error, 0, 0, 0, 0, NULL);
             }
 
             return EC_STATE_END;
@@ -2005,14 +2020,15 @@ out:
     if (fop != NULL) {
         ec_manager(fop, error);
     } else {
-        func(frame, heal, this, -1, error, 0, 0, 0, NULL);
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
     }
 }
 
 int32_t
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
+                   dict_t *xdata)
 {
     ec_heal_t *heal = cookie;
 
@@ -2481,6 +2497,58 @@ out:
     return ret;
 }
 
+int
+ec_heal_purge_stale_index(call_frame_t *frame, ec_t *ec, inode_t *inode)
+{
+    int i = 0;
+    int ret = 0;
+    dict_t **xattr = NULL;
+    loc_t loc = {0};
+    uint64_t dirty_xattr[EC_VERSION_SIZE] = {0};
+    unsigned char *on = NULL;
+    default_args_cbk_t *replies = NULL;
+    dict_t *dict = NULL;
+
+    /* Allocate the required memory */
+    loc.inode = inode_ref(inode);
+    gf_uuid_copy(loc.gfid, inode->gfid);
+    on = alloca0(ec->nodes);
+    EC_REPLIES_ALLOC(replies, ec->nodes);
+    xattr = GF_CALLOC(ec->nodes, sizeof(*xattr), gf_common_mt_pointer);
+    if (!xattr) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    dict = dict_new();
+    if (!dict) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    for (i = 0; i < ec->nodes; i++) {
+        xattr[i] = dict;
+        on[i] = 1;
+    }
+    ret = dict_set_static_bin(dict, EC_XATTR_DIRTY, dirty_xattr,
+                              (sizeof(*dirty_xattr) * EC_VERSION_SIZE));
+    if (ret < 0) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    PARALLEL_FOP_ONLIST(ec->xl_list, on, ec->nodes, replies, frame,
+                        ec_wind_xattrop_parallel, &loc, GF_XATTROP_ADD_ARRAY64,
+                        xattr, NULL);
+out:
+    if (dict) {
+        dict_unref(dict);
+    }
+    if (xattr) {
+        GF_FREE(xattr);
+    }
+    cluster_replies_wipe(replies, ec->nodes);
+    loc_wipe(&loc);
+    return ret;
+}
+
 void
 ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
 {
@@ -2498,6 +2566,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     intptr_t mbad = 0;
     intptr_t good = 0;
     intptr_t bad = 0;
+    uint32_t pending = 0;
     ec_fop_data_t *fop = data;
     gf_boolean_t blocking = _gf_false;
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
@@ -2533,7 +2602,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     if (loc->name && strlen(loc->name)) {
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
                            participants);
-        if (ret == 0) {
+        if (ret >= 0) {
             gf_msg_debug(this->name, 0,
                          "%s: name heal "
                          "successful on %" PRIXPTR,
@@ -2551,23 +2620,34 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
 
     /* Mount triggers heal only when it detects that it must need heal, shd
      * triggers heals periodically which need not be thorough*/
-    if (ec->shd.iamshd) {
+    if (ec->shd.iamshd && (ret <= 0)) {
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
                         &need_heal);
 
-        if (need_heal == EC_HEAL_NONEED) {
+        if (need_heal == EC_HEAL_PURGE_INDEX) {
+            gf_msg(ec->xl->name, GF_LOG_INFO, 0, EC_MSG_HEAL_FAIL,
+                   "Index entry needs to be purged for: %s ",
+                   uuid_utoa(loc->gfid));
+            /* We need to send zero-xattrop so that stale index entry could be
+             * removed. We need not take lock on this entry to do so as
+             * xattrop on a brick is atomic. */
+            ec_heal_purge_stale_index(frame, ec, loc->inode);
+            goto out;
+        } else if (need_heal == EC_HEAL_NONEED) {
             gf_msg(ec->xl->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_FAIL,
                    "Heal is not required for : %s ", uuid_utoa(loc->gfid));
             goto out;
         }
     }
+
     sources = alloca0(ec->nodes);
     healed_sinks = alloca0(ec->nodes);
     if (IA_ISREG(loc->inode->ia_type)) {
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
                            healed_sinks);
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+                            &pending);
     } else {
         ret = 0;
         memcpy(sources, participants, ec->nodes);
@@ -2597,10 +2677,11 @@ out:
     if (fop->cbks.heal) {
         fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno,
                        ec_char_array_to_mask(participants, ec->nodes),
-                       mgood & good, mbad & bad, NULL);
+                       mgood & good, mbad & bad, pending, NULL);
     }
     if (frame)
         STACK_DESTROY(frame->root);
+
     return;
 }
 
@@ -2648,7 +2729,7 @@ ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
 {
     if (fop->cbks.heal) {
         fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
-                       0, NULL);
+                       0, 0, NULL);
     }
     ec_fop_data_release(fop);
 }
@@ -2835,7 +2916,7 @@ fail:
     if (fop)
         ec_fop_data_release(fop);
     if (func)
-        func(frame, data, this, -1, err, 0, 0, 0, NULL);
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
 }
 
 int
@@ -2964,6 +3045,13 @@ _need_heal_calculate(ec_t *ec, uint64_t *dirty, unsigned char *sources,
                     goto out;
                 }
             }
+            /* If lock count is 0, all dirty flags are 0 and all the
+             * versions are matching, then why are we here? It looks
+             * like something went wrong while removing the index entries
+             * after completing a successful heal or fop. In this case
+             * we need to remove this index entry to avoid triggering heal
+             * in a loop and causing lookups again and again. */
+            *need_heal = EC_HEAL_PURGE_INDEX;
         } else {
             for (i = 0; i < ec->nodes; i++) {
                 /* Since each lock can only increment the dirty
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index 956e73c2088..5c1586bc9c5 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -62,7 +62,7 @@ __ec_shd_healer_wait(struct subvol_healer *healer)
     ec = healer->this->private;
 
 disabled_loop:
-    wait_till.tv_sec = time(NULL) + ec->shd.timeout;
+    wait_till.tv_sec = gf_time() + ec->shd.timeout;
 
     while (!healer->rerun) {
         ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
@@ -156,19 +156,78 @@ ec_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name)
     return ret;
 }
 
+static gf_boolean_t
+ec_is_heal_completed(char *status)
+{
+    char *bad_pos = NULL;
+    char *zero_pos = NULL;
+
+    if (!status) {
+        return _gf_false;
+    }
+
+    /* Logic:
+     * Status is expected to look like "Good: <binary>, Bad: <binary>".
+     * Heal is treated as completed when the first '0' in the string (strchr)
+     * lies after the last ':' (strrchr), i.e. inside the "Bad:" field, so
+     * no '0' precedes it. A status with no '0' or no ':' is not completed. */
+
+    zero_pos = strchr(status, '0');
+    bad_pos = strrchr(status, ':');
+    if (!zero_pos || !bad_pos) {
+        /*malformed status*/
+        return _gf_false;
+    }
+
+    if (zero_pos > bad_pos) {
+        return _gf_true;
+    }
+
+    return _gf_false;
+}
+
 int
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
                 gf_boolean_t full)
 {
+    dict_t *xdata = NULL;
+    dict_t *dict = NULL;
+    uint32_t count;
     int32_t ret;
+    char *heal_status = NULL;
+    ec_t *ec = healer->this->private;
+
+    GF_ATOMIC_INC(ec->stats.shd.attempted);
+    ret = syncop_getxattr(healer->this, loc, &dict, EC_XATTR_HEAL, NULL,
+                          &xdata);
+    if (ret == 0) {
+        if (dict && (dict_get_str(dict, EC_XATTR_HEAL, &heal_status) == 0)) {
+            if (ec_is_heal_completed(heal_status)) {
+                GF_ATOMIC_INC(ec->stats.shd.completed);
+            }
+        }
+    }
 
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
         /* If we have just healed a directory, it's possible that
-         * other index entries have appeared to be healed. We put a
-         * mark so that we can check it later and restart a scan
-         * without delay. */
-        healer->rerun = _gf_true;
+         * other index entries have appeared to be healed. */
+        if ((xdata != NULL) &&
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
+            (count > 0)) {
+            /* Force a rerun of the index healer. */
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
+                         count);
+
+            healer->rerun = _gf_true;
+        }
+    }
+
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
+
+    if (dict) {
+        dict_unref(dict);
     }
 
     return ret;
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index a891ccd0952..dad5f4d7018 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -390,7 +390,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
 int32_t
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
+                     dict_t *xdata)
 {
     fop_getxattr_cbk_t func = cookie;
     ec_t *ec = xl->private;
@@ -398,6 +399,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
     char *str;
     char bin1[65], bin2[65];
 
+    /* We try to return the 'pending' information in xdata, but if this cannot
+     * be set, we will ignore it silently. We prefer to report the success or
+     * failure of the heal itself. */
+    if (xdata == NULL) {
+        xdata = dict_new();
+    } else {
+        dict_ref(xdata);
+    }
+    if (xdata != NULL) {
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
+             * enforces to check the result in this case. However we don't
+             * really care if it succeeded or not. We'll just do the same.
+             *
+             * This empty 'if' avoids the warning, and it will be removed by
+             * the optimizer. */
+        }
+    }
+
     if (op_ret >= 0) {
         dict = dict_new();
         if (dict == NULL) {
@@ -431,11 +451,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
     }
 
 out:
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
 
     if (dict != NULL) {
         dict_unref(dict);
     }
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
 
     return 0;
 }
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
index 8e84977d2b3..601960d6154 100644
--- a/xlators/cluster/ec/src/ec-locks.c
+++ b/xlators/cluster/ec/src/ec-locks.c
@@ -24,9 +24,36 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
     ec_t *ec = fop->xl->private;
     ec_cbk_data_t *ans = NULL;
     ec_cbk_data_t *cbk = NULL;
-    uintptr_t locked = 0, notlocked = 0;
+    uintptr_t locked = 0;
+    int32_t good = 0;
+    int32_t eagain = 0;
+    int32_t estale = 0;
     int32_t error = -1;
 
+    /* There are some errors that we'll handle in a special way while trying
+     * to acquire a lock.
+     *
+     *   EAGAIN:  If it's found during a parallel non-blocking lock request, we
+     *            consider that there's contention on the inode, so we consider
+     *            the acquisition a failure and try again with a sequential
+     *            blocking lock request. This will ensure that we get a lock on
+     *            as many bricks as possible (ignoring EAGAIN here would cause
+     *            unnecessary triggers of self-healing).
+     *
+     *            If it's found during a sequential blocking lock request, it's
+     *            considered an error. Lock will only succeed if there are
+     *            enough other bricks locked.
+     *
+     *   ESTALE:  This can appear during parallel or sequential lock request if
+     *            the inode has just been unlinked. We consider this error is
+     *            not recoverable, but we also don't consider it as fatal. So,
+     *            if it happens during parallel lock, we won't attempt a
+     *            sequential one unless there are EAGAIN errors on other
+     *            bricks (and are enough to form a quorum), but if we reach
+     *            quorum counting the ESTALE bricks, we consider the whole
+     *            result of the operation is ESTALE instead of EIO.
+     */
+
     list_for_each_entry(ans, &fop->cbk_list, list)
     {
         if (ans->op_ret >= 0) {
@@ -34,24 +61,23 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
                 error = EIO;
             }
             locked |= ans->mask;
+            good = ans->count;
             cbk = ans;
-        } else {
-            if (ans->op_errno == EAGAIN) {
-                switch (fop->uint32) {
-                    case EC_LOCK_MODE_NONE:
-                    case EC_LOCK_MODE_ALL:
-                        /* Goal is to treat non-blocking lock as failure
-                         * even if there is a single EAGAIN*/
-                        notlocked |= ans->mask;
-                        break;
-                }
-            }
+        } else if (ans->op_errno == ESTALE) {
+            estale += ans->count;
+        } else if ((ans->op_errno == EAGAIN) &&
+                   (fop->uint32 != EC_LOCK_MODE_INC)) {
+            eagain += ans->count;
         }
     }
 
     if (error == -1) {
-        if (gf_bits_count(locked | notlocked) >= ec->fragments) {
-            if (notlocked == 0) {
+        /* If we have enough quorum with succeeded and EAGAIN answers, we
+         * ignore for now any ESTALE answer. If there are EAGAIN answers,
+         * we retry with a sequential blocking lock request if needed.
+         * Otherwise we succeed. */
+        if ((good + eagain) >= ec->fragments) {
+            if (eagain == 0) {
                 if (fop->answer == NULL) {
                     fop->answer = cbk;
                 }
@@ -64,21 +90,28 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
                     case EC_LOCK_MODE_NONE:
                         error = EAGAIN;
                         break;
-
                     case EC_LOCK_MODE_ALL:
                         fop->uint32 = EC_LOCK_MODE_INC;
                         break;
-
                     default:
+                        /* This shouldn't happen because eagain cannot be > 0
+                         * when fop->uint32 is EC_LOCK_MODE_INC. */
                         error = EIO;
                         break;
                 }
             }
         } else {
-            if (fop->answer && fop->answer->op_ret < 0)
+            /* We have been unable to find enough candidates that will be able
+             * to take the lock. If we have quorum on some answer, we return
+             * it. Otherwise we check if ESTALE answers allow us to reach
+             * quorum. If so, we return ESTALE. */
+            if (fop->answer && fop->answer->op_ret < 0) {
                 error = fop->answer->op_errno;
-            else
+            } else if ((good + eagain + estale) >= ec->fragments) {
+                error = ESTALE;
+            } else {
                 error = EIO;
+            }
         }
     }
 
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 7829b8c27b3..de9b89bb2c9 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -130,7 +130,12 @@ typedef void (*ec_resume_f)(ec_fop_data_t *, int32_t);
 
 enum _ec_read_policy { EC_ROUND_ROBIN, EC_GFID_HASH, EC_READ_POLICY_MAX };
 
-enum _ec_heal_need { EC_HEAL_NONEED, EC_HEAL_MAYBE, EC_HEAL_MUST };
+enum _ec_heal_need {
+    EC_HEAL_NONEED,
+    EC_HEAL_MAYBE,
+    EC_HEAL_MUST,
+    EC_HEAL_PURGE_INDEX /* nothing to heal; stale index entry must be purged */
+};
 
 enum _ec_stripe_part { EC_STRIPE_HEAD, EC_STRIPE_TAIL };
 
@@ -186,10 +191,10 @@ struct _ec_inode {
 
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
-                                  dict_t *);
+                                  uint32_t, dict_t *);
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
-                                   dict_t *);
+                                   uint32_t, dict_t *);
 
 union _ec_cbk {
     fop_access_cbk_t access;
@@ -621,6 +626,11 @@ struct _ec_statistics {
                                 requests. (Basically memory allocation
                                 errors). */
     } stripe_cache;
+    struct {
+        gf_atomic_t attempted; /*Number of heals attempted on
+                                files/directories*/
+        gf_atomic_t completed; /*Number of heals completed on files/dirs*/
+    } shd;
 };
 
 struct _ec {
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 66b4e634911..7344be4968d 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -325,13 +325,18 @@ ec_get_event_from_state(ec_t *ec)
 void
 ec_up(xlator_t *this, ec_t *ec)
 {
+    char str1[32], str2[32];
+
     if (ec->timer != NULL) {
         gf_timer_call_cancel(this->ctx, ec->timer);
         ec->timer = NULL;
     }
 
     ec->up = 1;
-    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP,
+           "Going UP : Child UP = %s Child Notify = %s",
+           ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+           ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
 
     gf_event(EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
 }
@@ -339,13 +344,18 @@ ec_up(xlator_t *this, ec_t *ec)
 void
 ec_down(xlator_t *this, ec_t *ec)
 {
+    char str1[32], str2[32];
+
     if (ec->timer != NULL) {
         gf_timer_call_cancel(this->ctx, ec->timer);
         ec->timer = NULL;
     }
 
     ec->up = 0;
-    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+    gf_msg(this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN,
+           "Going DOWN : Child UP = %s Child Notify = %s",
+           ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+           ec_bin(str2, sizeof(str2), ec->xl_notify, ec->nodes));
 
     gf_event(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
 }
@@ -700,6 +710,8 @@ ec_statistics_init(ec_t *ec)
     GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0);
     GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0);
     GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0);
+    GF_ATOMIC_INIT(ec->stats.shd.attempted, 0);
+    GF_ATOMIC_INIT(ec->stats.shd.completed, 0);
 }
 
 static int
@@ -1569,6 +1581,10 @@ ec_dump_private(xlator_t *this)
                        GF_ATOMIC_GET(ec->stats.stripe_cache.allocs));
     gf_proc_dump_write("errors", "%" GF_PRI_ATOMIC,
                        GF_ATOMIC_GET(ec->stats.stripe_cache.errors));
+    gf_proc_dump_write("heals-attempted", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.shd.attempted));
+    gf_proc_dump_write("heals-completed", "%" GF_PRI_ATOMIC,
+                       GF_ATOMIC_GET(ec->stats.shd.completed));
 
     return 0;
 }
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 1b210d9adc1..6f6de6d5981 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -18,6 +18,7 @@
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
 #define EC_STRIPE_CACHE_MAX_SIZE 10
 #define EC_VERSION_SIZE 2