Diffstat (limited to 'xlators/cluster/ec/src/ec-common.c')
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 2148 |
1 file changed, 1544 insertions, 604 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index d0c9f97ab28..b955efd8c2d 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -8,10 +8,11 @@ cases as published by the Free Software Foundation. */ -#include "byte-order.h" +#include <glusterfs/byte-order.h> +#include <glusterfs/hashfn.h> #include "ec-mem-types.h" -#include "ec-data.h" +#include "ec-types.h" #include "ec-helpers.h" #include "ec-combine.h" #include "ec-common.h" @@ -20,102 +21,332 @@ #include "ec.h" #include "ec-messages.h" -int32_t ec_child_valid(ec_t * ec, ec_fop_data_t * fop, int32_t idx) +#define EC_INVALID_INDEX UINT32_MAX + +void +ec_update_fd_status(fd_t *fd, xlator_t *xl, int idx, int32_t ret_status) { - return (idx < ec->nodes) && (((fop->remaining >> idx) & 1) == 1); + ec_fd_t *fd_ctx; + + if (fd == NULL) + return; + + LOCK(&fd->lock); + { + fd_ctx = __ec_fd_get(fd, xl); + if (fd_ctx) { + if (ret_status >= 0) + fd_ctx->fd_status[idx] = EC_FD_OPENED; + else + fd_ctx->fd_status[idx] = EC_FD_NOT_OPENED; + } + } + UNLOCK(&fd->lock); } -int32_t ec_child_next(ec_t * ec, ec_fop_data_t * fop, int32_t idx) +static uintptr_t +ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t mask) { - while (!ec_child_valid(ec, fop, idx)) + int i = 0; + int count = 0; + ec_t *ec = NULL; + ec_fd_t *fd_ctx = NULL; + uintptr_t need_open = 0; + + ec = this->private; + + fd_ctx = ec_fd_get(fd, this); + if (!fd_ctx) + return count; + + LOCK(&fd->lock); { - if (++idx >= ec->nodes) - { + for (i = 0; i < ec->nodes; i++) { + if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) && + ((ec->xl_up & (1 << i)) != 0) && ((mask & (1 << i)) != 0)) { + fd_ctx->fd_status[i] = EC_FD_OPENING; + need_open |= (1 << i); + count++; + } + } + } + UNLOCK(&fd->lock); + + /* If fd needs to open on minimum number of nodes + * then ignore fixing the fd as it has been + * requested from heal operation. 
+ */ + if (count >= ec->fragments) { + need_open = 0; + } + + return need_open; +} + +static gf_boolean_t +ec_is_fd_fixable(fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous(fd)) + return _gf_false; + else if (gf_uuid_is_null(fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +static void +ec_fix_open(ec_fop_data_t *fop, uintptr_t mask) +{ + uintptr_t need_open = 0; + int ret = 0; + int32_t flags = 0; + loc_t loc = { + 0, + }; + + if (!ec_is_fd_fixable(fop->fd)) + goto out; + + /* Evaluate how many remote fd's to be opened */ + need_open = ec_fd_ctx_need_open(fop->fd, fop->xl, mask); + if (need_open == 0) { + goto out; + } + + loc.inode = inode_ref(fop->fd->inode); + gf_uuid_copy(loc.gfid, fop->fd->inode->gfid); + ret = loc_path(&loc, NULL); + if (ret < 0) { + goto out; + } + + flags = fop->fd->flags & (~(O_TRUNC | O_APPEND | O_CREAT | O_EXCL)); + if (IA_IFDIR == fop->fd->inode->ia_type) { + ec_opendir(fop->frame, fop->xl, need_open, + EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, + &fop->loc[0], fop->fd, NULL); + } else { + ec_open(fop->frame, fop->xl, need_open, + EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &loc, + flags, fop->fd, NULL); + } + +out: + loc_wipe(&loc); +} + +static off_t +ec_range_end_get(off_t fl_start, uint64_t fl_size) +{ + if (fl_size > 0) { + if (fl_size >= EC_RANGE_FULL) { + /* Infinity */ + fl_start = LLONG_MAX; + } else { + fl_start += fl_size - 1; + if (fl_start < 0) { + /* Overflow */ + fl_start = LLONG_MAX; + } + } + } + + return fl_start; +} + +static gf_boolean_t +ec_is_range_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2) +{ + return ((l1->fl_end >= l2->fl_start) && (l2->fl_end >= l1->fl_start)); +} + +static gf_boolean_t +ec_lock_conflict(ec_lock_link_t *l1, ec_lock_link_t *l2) +{ + ec_t *ec = l1->fop->xl->private; + + /* Fops like access/stat won't have to worry what the other fops are + * modifying as the fop is wound only to one brick. 
So it can be + * executed in parallel*/ + if (l1->fop->minimum == EC_MINIMUM_ONE || + l2->fop->minimum == EC_MINIMUM_ONE) + return _gf_false; + + if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) && + (l2->fop->flags & EC_FLAG_LOCK_SHARED)) + return _gf_false; + + if (!ec->parallel_writes) { + return _gf_true; + } + + return ec_is_range_conflict(l1, l2); +} + +uint32_t +ec_select_first_by_read_policy(ec_t *ec, ec_fop_data_t *fop) +{ + if (ec->read_policy == EC_ROUND_ROBIN) { + return ec->idx; + } else if (ec->read_policy == EC_GFID_HASH) { + if (fop->use_fd) { + return SuperFastHash((char *)fop->fd->inode->gfid, + sizeof(fop->fd->inode->gfid)) % + ec->nodes; + } else { + if (gf_uuid_is_null(fop->loc[0].gfid)) + loc_gfid(&fop->loc[0], fop->loc[0].gfid); + return SuperFastHash((char *)fop->loc[0].gfid, + sizeof(fop->loc[0].gfid)) % + ec->nodes; + } + } + return 0; +} + +static gf_boolean_t +ec_child_valid(ec_t *ec, ec_fop_data_t *fop, uint32_t idx) +{ + return (idx < ec->nodes) && (((fop->remaining >> idx) & 1) == 1); +} + +static uint32_t +ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx) +{ + while (!ec_child_valid(ec, fop, idx)) { + if (++idx >= ec->nodes) { idx = 0; } - if (idx == fop->first) - { - return -1; + if (idx == fop->first) { + return EC_INVALID_INDEX; } } return idx; } -int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this, - int32_t op_ret, int32_t op_errno, uintptr_t mask, - uintptr_t good, uintptr_t bad, dict_t * xdata) +int32_t +ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good, + uintptr_t bad, uint32_t pending, dict_t *xdata) { if (op_ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, op_errno, - EC_MSG_HEAL_FAIL, "Heal failed"); + gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL, + "Heal failed"); } else { if ((mask & ~good) != 0) { - gf_msg (this->name, GF_LOG_INFO, 0, - EC_MSG_HEAL_SUCCESS, "Heal succeeded on %d/%d " - "subvolumes", - ec_bits_count(mask & ~(good | bad)), - ec_bits_count(mask & ~good)); + gf_msg(this->name, GF_LOG_DEBUG, 0, EC_MSG_HEAL_SUCCESS, + "Heal succeeded on %d/%d " + "subvolumes", + gf_bits_count(mask & ~(good | bad)), + gf_bits_count(mask & ~good)); } } return 0; } -int32_t ec_fop_needs_heal(ec_fop_data_t *fop) +static uintptr_t +ec_fop_needs_name_heal(ec_fop_data_t *fop) +{ + ec_t *ec = NULL; + ec_cbk_data_t *cbk = NULL; + ec_cbk_data_t *enoent_cbk = NULL; + + ec = fop->xl->private; + if (fop->id != GF_FOP_LOOKUP) + return 0; + + if (!fop->loc[0].name || strlen(fop->loc[0].name) == 0) + return 0; + + list_for_each_entry(cbk, &fop->cbk_list, list) + { + if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) { + enoent_cbk = cbk; + break; + } + } + + if (!enoent_cbk) + return 0; + + return ec->xl_up & ~enoent_cbk->mask; +} + +int32_t +ec_fop_needs_heal(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; + if (fop->lock_count == 0) { + /* + * if fop->lock_count is zero that means it saw version mismatch + * without any locks so it can't be trusted. If we launch a heal + * based on this it will lead to INODELKs which will affect I/O + * performance. Considering self-heal-daemon and operations on + * the inode from client which take locks can still trigger the + * heal we can choose to not attempt a heal when fop->lock_count + * is zero. 
+ */ + return 0; + } return (ec->xl_up & ~(fop->remaining | fop->good)) != 0; } -void ec_check_status(ec_fop_data_t * fop) +void +ec_check_status(ec_fop_data_t *fop) { - ec_t * ec = fop->xl->private; + ec_t *ec = fop->xl->private; int32_t partial = 0; + char str1[32], str2[32], str3[32], str4[32], str5[32]; - if (!ec_fop_needs_heal(fop)) { + if (!ec_fop_needs_name_heal(fop) && !ec_fop_needs_heal(fop)) { return; } - if (fop->answer->op_ret >= 0) { - if ((fop->id == GF_FOP_LOOKUP) || - (fop->id == GF_FOP_STAT) || (fop->id == GF_FOP_FSTAT)) { + if (fop->answer && fop->answer->op_ret >= 0) { + if ((fop->id == GF_FOP_LOOKUP) || (fop->id == GF_FOP_STAT) || + (fop->id == GF_FOP_FSTAT)) { partial = fop->answer->iatt[0].ia_type == IA_IFDIR; } else if (fop->id == GF_FOP_OPENDIR) { partial = 1; } } - gf_msg (fop->xl->name, GF_LOG_WARNING, 0, - EC_MSG_OP_FAIL_ON_SUBVOLS, - "Operation failed on some " - "subvolumes (up=%lX, mask=%lX, " - "remaining=%lX, good=%lX, bad=%lX)", - ec->xl_up, fop->mask, fop->remaining, fop->good, - ec->xl_up & ~(fop->remaining | fop->good)); - - if (fop->use_fd) - { + gf_msg( + fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, + "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " + "remaining=%s, good=%s, bad=%s," + "(Least significant bit represents first client/brick of subvol), %s)", + gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), + ec_bin(str4, sizeof(str4), fop->good, ec->nodes), + ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), + ec->nodes), + ec_msg_str(fop)); + if (fop->use_fd) { if (fop->fd != NULL) { ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, fop->fd, partial, NULL); } - } - else - { + } else { ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, &fop->loc[0], partial, NULL); - if (fop->loc[1].inode != NULL) - { + if (fop->loc[1].inode != NULL) { ec_heal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, &fop->loc[1], partial, NULL); } } } -void ec_update_good(ec_fop_data_t *fop, uintptr_t good) +void +ec_update_good(ec_fop_data_t *fop, uintptr_t good) { fop->good = good; @@ -126,7 +357,8 @@ void ec_update_good(ec_fop_data_t *fop, uintptr_t good) } } -void ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop) +void +ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop) { /* Fops that are executed only on one brick do not have enough information * to update the global mask of good bricks. */ @@ -134,22 +366,24 @@ void ec_lock_update_good(ec_lock_t *lock, ec_fop_data_t *fop) return; } - /* When updating the good mask of the lock, we only take into - * consideration those bits corresponding to the bricks where - * the fop has been executed. */ - lock->good_mask &= ~fop->mask | fop->remaining; - lock->good_mask |= fop->good; + /* When updating the good mask of the lock, we only take into consideration + * those bits corresponding to the bricks where the fop has been executed. 
+ * Bad bricks are removed from good_mask, but once marked as bad it's never + * set to good until the lock is released and reacquired */ + + lock->good_mask &= fop->good | fop->remaining; } -void __ec_fop_set_error(ec_fop_data_t * fop, int32_t error) +void +__ec_fop_set_error(ec_fop_data_t *fop, int32_t error) { - if ((error != 0) && (fop->error == 0)) - { + if ((error != 0) && (fop->error == 0)) { fop->error = error; } } -void ec_fop_set_error(ec_fop_data_t * fop, int32_t error) +void +ec_fop_set_error(ec_fop_data_t *fop, int32_t error) { LOCK(&fop->lock); @@ -201,18 +435,20 @@ ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro) return cbk; } -void ec_sleep(ec_fop_data_t *fop) +void +ec_sleep(ec_fop_data_t *fop) { LOCK(&fop->lock); - GF_ASSERT (fop->refs > 0); + GF_ASSERT(fop->refs > 0); fop->refs++; fop->jobs++; UNLOCK(&fop->lock); } -int32_t ec_check_complete(ec_fop_data_t * fop, ec_resume_f resume) +int32_t +ec_check_complete(ec_fop_data_t *fop, ec_resume_f resume) { int32_t error = -1; @@ -220,14 +456,11 @@ int32_t ec_check_complete(ec_fop_data_t * fop, ec_resume_f resume) GF_ASSERT(fop->resume == NULL); - if (--fop->jobs != 0) - { + if (--fop->jobs != 0) { ec_trace("WAIT", fop, "resume=%p", resume); fop->resume = resume; - } - else - { + } else { error = fop->error; fop->error = 0; } @@ -237,7 +470,8 @@ int32_t ec_check_complete(ec_fop_data_t * fop, ec_resume_f resume) return error; } -void ec_resume(ec_fop_data_t * fop, int32_t error) +void +ec_resume(ec_fop_data_t *fop, int32_t error) { ec_resume_f resume = NULL; @@ -245,16 +479,13 @@ void ec_resume(ec_fop_data_t * fop, int32_t error) __ec_fop_set_error(fop, error); - if (--fop->jobs == 0) - { + if (--fop->jobs == 0) { resume = fop->resume; fop->resume = NULL; - if (resume != NULL) - { + if (resume != NULL) { ec_trace("RESUME", fop, "error=%d", error); - if (fop->error != 0) - { + if (fop->error != 0) { error = fop->error; } fop->error = 0; @@ -263,21 +494,24 @@ void ec_resume(ec_fop_data_t * fop, int32_t error) UNLOCK(&fop->lock); - if (resume != NULL) - { + if (resume != NULL) { resume(fop, error); } ec_fop_data_release(fop); } -void ec_resume_parent(ec_fop_data_t * fop, int32_t error) +void +ec_resume_parent(ec_fop_data_t *fop) { - ec_fop_data_t * parent; + ec_fop_data_t *parent; + int32_t error = 0; parent = fop->parent; - if (parent != NULL) - { + if (parent != NULL) { + if ((fop->fop_flags & EC_FOP_NO_PROPAGATE_ERROR) == 0) { + error = fop->error; + } ec_trace("RESUME_PARENT", fop, "error=%u", error); fop->parent = NULL; ec_resume(parent, error); @@ -285,22 +519,23 @@ void ec_resume_parent(ec_fop_data_t * fop, int32_t error) } gf_boolean_t -ec_is_recoverable_error (int32_t op_errno) +ec_is_recoverable_error(int32_t op_errno) { - switch (op_errno) { + switch (op_errno) { case ENOTCONN: case ESTALE: case ENOENT: - case EBADFD:/*Opened fd but brick is disconnected*/ - case EIO:/*Backend-fs crash like XFS/ext4 etc*/ - return _gf_true; - } - return _gf_false; + case EBADFD: /*Opened fd but brick is disconnected*/ + case EIO: /*Backend-fs crash like XFS/ext4 etc*/ + return _gf_true; + } + return _gf_false; } -void ec_complete(ec_fop_data_t * fop) +void +ec_complete(ec_fop_data_t *fop) { - ec_cbk_data_t * cbk = NULL; + ec_cbk_data_t *cbk = NULL; int32_t resume = 0, update = 0; int healing_count = 0; @@ -312,9 +547,9 @@ void ec_complete(ec_fop_data_t * fop) if (fop->answer == NULL) { if (!list_empty(&fop->cbk_list)) { cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); - healing_count = ec_bits_count (cbk->mask & 
fop->healing); - /* fop shouldn't be treated as success if it is not - * successful on at least fop->minimum good copies*/ + healing_count = gf_bits_count(cbk->mask & fop->healing); + /* fop shouldn't be treated as success if it is not + * successful on at least fop->minimum good copies*/ if ((cbk->count - healing_count) >= fop->minimum) { fop->answer = cbk; @@ -336,8 +571,7 @@ void ec_complete(ec_fop_data_t * fop) ec_update_good(fop, cbk->mask); } - if (resume) - { + if (resume) { ec_resume(fop, 0); } @@ -347,36 +581,95 @@ void ec_complete(ec_fop_data_t * fop) /* There could be already granted locks sitting on the bricks, unlock for which * must be wound at all costs*/ static gf_boolean_t -ec_must_wind (ec_fop_data_t *fop) +ec_must_wind(ec_fop_data_t *fop) { - if ((fop->id == GF_FOP_INODELK) || (fop->id == GF_FOP_FINODELK) || - (fop->id == GF_FOP_LK)) { - if (fop->flock.l_type == F_UNLCK) - return _gf_true; - } else if ((fop->id == GF_FOP_ENTRYLK) || - (fop->id == GF_FOP_FENTRYLK)) { - if (fop->entrylk_cmd == ENTRYLK_UNLOCK) - return _gf_true; - } + if ((fop->id == GF_FOP_INODELK) || (fop->id == GF_FOP_FINODELK) || + (fop->id == GF_FOP_LK)) { + if (fop->flock.l_type == F_UNLCK) + return _gf_true; + } else if ((fop->id == GF_FOP_ENTRYLK) || (fop->id == GF_FOP_FENTRYLK)) { + if (fop->entrylk_cmd == ENTRYLK_UNLOCK) + return _gf_true; + } - return _gf_false; + return _gf_false; } static gf_boolean_t -ec_internal_op (ec_fop_data_t *fop) -{ - if (ec_must_wind (fop)) - return _gf_true; - if (fop->id == GF_FOP_XATTROP) - return _gf_true; - if (fop->id == GF_FOP_FXATTROP) - return _gf_true; - return _gf_false; +ec_internal_op(ec_fop_data_t *fop) +{ + if (ec_must_wind(fop)) + return _gf_true; + if (fop->id == GF_FOP_XATTROP) + return _gf_true; + if (fop->id == GF_FOP_FXATTROP) + return _gf_true; + if (fop->id == GF_FOP_OPEN) + return _gf_true; + return _gf_false; +} + +char * +ec_msg_str(ec_fop_data_t *fop) +{ + loc_t *loc1 = NULL; + loc_t *loc2 = NULL; + char gfid1[64] = {0}; + char gfid2[64] = {0}; + ec_fop_data_t *parent = fop->parent; + + if (fop->errstr) + return fop->errstr; + if (!fop->use_fd) { + loc1 = &fop->loc[0]; + loc2 = &fop->loc[1]; + + if (fop->id == GF_FOP_RENAME) { + gf_asprintf(&fop->errstr, + "FOP : '%s' failed on '%s' and '%s' with gfids " + "%s and %s respectively. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, loc2->path, + uuid_utoa_r(loc1->gfid, gfid1), + uuid_utoa_r(loc2->gfid, gfid2), + parent ? ec_fop_name(parent->id) : "No Parent"); + } else { + gf_asprintf( + &fop->errstr, + "FOP : '%s' failed on '%s' with gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), loc1->path, + uuid_utoa_r(loc1->gfid, gfid1), + parent ? ec_fop_name(parent->id) : "No Parent"); + } + } else { + gf_asprintf( + &fop->errstr, "FOP : '%s' failed on gfid %s. Parent FOP: %s", + ec_fop_name(fop->id), uuid_utoa_r(fop->fd->inode->gfid, gfid1), + parent ? 
ec_fop_name(parent->id) : "No Parent"); + } + return fop->errstr; } -int32_t ec_child_select(ec_fop_data_t * fop) +static void +ec_log_insufficient_vol(ec_fop_data_t *fop, int32_t have, uint32_t need, + int32_t loglevel) { - ec_t * ec = fop->xl->private; + ec_t *ec = fop->xl->private; + char str1[32], str2[32], str3[32]; + + gf_msg(ec->xl->name, loglevel, 0, EC_MSG_CHILDS_INSUFFICIENT, + "Insufficient available children for this request: " + "Have : %d, Need : %u : Child UP : %s " + "Mask: %s, Healing : %s : %s ", + have, need, ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->healing, ec->nodes), + ec_msg_str(fop)); +} + +static int32_t +ec_child_select(ec_fop_data_t *fop) +{ + ec_t *ec = fop->xl->private; int32_t first = 0, num = 0; ec_fop_cleanup(fop); @@ -385,27 +678,25 @@ int32_t ec_child_select(ec_fop_data_t * fop) /* Wind the fop on same subvols as parent for any internal extra fops like * head/tail read in case of writev fop. Unlocks shouldn't do this because * unlock should go on all subvols where lock is performed*/ - if (fop->parent && !ec_internal_op (fop)) { - fop->mask &= (fop->parent->mask & ~fop->parent->healing); + if (fop->parent && !ec_internal_op(fop)) { + fop->mask &= (fop->parent->mask & ~fop->parent->healing); + if (ec_is_data_fop(fop->id)) { + fop->healing |= fop->parent->healing; + } } - if ((fop->mask & ~ec->xl_up) != 0) - { - gf_msg (fop->xl->name, GF_LOG_WARNING, 0, - EC_MSG_OP_EXEC_UNAVAIL, - "Executing operation with " - "some subvolumes unavailable " - "(%lX)", fop->mask & ~ec->xl_up); - + if ((fop->mask & ~ec->xl_up) != 0) { + gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_EXEC_UNAVAIL, + "Executing operation with " + "some subvolumes unavailable. (%" PRIXPTR "). 
%s ", + fop->mask & ~ec->xl_up, ec_msg_str(fop)); fop->mask &= ec->xl_up; } - switch (fop->minimum) - { + switch (fop->minimum) { case EC_MINIMUM_ALL: - fop->minimum = ec_bits_count(fop->mask); - if (fop->minimum >= ec->fragments) - { + fop->minimum = gf_bits_count(fop->mask); + if (fop->minimum >= ec->fragments) { break; } case EC_MINIMUM_MIN: @@ -415,13 +706,15 @@ int32_t ec_child_select(ec_fop_data_t * fop) fop->minimum = 1; } - first = ec->idx; - if (++first >= ec->nodes) - { - first = 0; + if (ec->read_policy == EC_ROUND_ROBIN) { + first = ec->idx; + if (++first >= ec->nodes) { + first = 0; + } + ec->idx = first; } - ec->idx = first; + num = gf_bits_count(fop->mask); /*Unconditionally wind on healing subvolumes*/ fop->mask |= fop->healing; fop->remaining = fop->mask; @@ -429,32 +722,35 @@ int32_t ec_child_select(ec_fop_data_t * fop) ec_trace("SELECT", fop, ""); - num = ec_bits_count(fop->mask); - if ((num < fop->minimum) && (num < ec->fragments)) - { - gf_msg (ec->xl->name, GF_LOG_ERROR, 0, - EC_MSG_CHILDS_INSUFFICIENT, - "Insufficient available childs " - "for this request (have %d, need " - "%d)", num, fop->minimum); - + if ((num < fop->minimum) && (num < ec->fragments)) { + ec_log_insufficient_vol(fop, num, fop->minimum, GF_LOG_ERROR); return 0; } - ec_sleep(fop); + if (!fop->parent && fop->lock_count && + (fop->locks[0].update[EC_DATA_TXN] || + fop->locks[0].update[EC_METADATA_TXN])) { + if (ec->quorum_count && (num < ec->quorum_count)) { + ec_log_insufficient_vol(fop, num, ec->quorum_count, GF_LOG_ERROR); + return 0; + } + } return 1; } -int32_t ec_dispatch_next(ec_fop_data_t * fop, int32_t idx) +void +ec_dispatch_next(ec_fop_data_t *fop, uint32_t idx) { - ec_t * ec = fop->xl->private; + uint32_t i = EC_INVALID_INDEX; + ec_t *ec = fop->xl->private; LOCK(&fop->lock); - idx = ec_child_next(ec, fop, idx); - if (idx >= 0) - { + i = ec_child_next(ec, fop, idx); + if (i < EC_MAX_NODES) { + idx = i; + fop->remaining ^= 1ULL << idx; ec_trace("EXECUTE", fop, "idx=%d", idx); @@ -465,20 +761,18 @@ int32_t ec_dispatch_next(ec_fop_data_t * fop, int32_t idx) UNLOCK(&fop->lock); - if (idx >= 0) - { + if (i < EC_MAX_NODES) { fop->wind(ec, fop, idx); } - - return idx; } -void ec_dispatch_mask(ec_fop_data_t * fop, uintptr_t mask) +void +ec_dispatch_mask(ec_fop_data_t *fop, uintptr_t mask) { - ec_t * ec = fop->xl->private; + ec_t *ec = fop->xl->private; int32_t count, idx; - count = ec_bits_count(mask); + count = gf_bits_count(mask); LOCK(&fop->lock); @@ -492,10 +786,8 @@ void ec_dispatch_mask(ec_fop_data_t * fop, uintptr_t mask) UNLOCK(&fop->lock); idx = 0; - while (mask != 0) - { - if ((mask & 1) != 0) - { + while (mask != 0) { + if ((mask & 1) != 0) { fop->wind(ec, fop, idx); } idx++; @@ -503,29 +795,29 @@ void ec_dispatch_mask(ec_fop_data_t * fop, uintptr_t mask) } } -void ec_dispatch_start(ec_fop_data_t * fop) +void +ec_dispatch_start(ec_fop_data_t *fop) { fop->answer = NULL; fop->good = 0; INIT_LIST_HEAD(&fop->cbk_list); - if (fop->lock_count > 0) - { + if (fop->lock_count > 0) { ec_owner_copy(fop->frame, &fop->req_frame->root->lk_owner); } } -void ec_dispatch_one(ec_fop_data_t * fop) +void +ec_dispatch_one(ec_fop_data_t *fop) { - ec_t * ec = fop->xl->private; - ec_dispatch_start(fop); - if (ec_child_select(fop)) - { + if (ec_child_select(fop)) { + ec_sleep(fop); + fop->expected = 1; - fop->first = ec->idx; + fop->first = ec_select_first_by_read_policy(fop->xl->private, fop); ec_dispatch_next(fop, fop->first); } @@ -541,8 +833,8 @@ ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t 
**cbk) *cbk = tmp; } if ((tmp != NULL) && (tmp->op_ret < 0) && - ec_is_recoverable_error (tmp->op_errno)) { - GF_ASSERT (fop->mask & (1ULL << tmp->idx)); + ec_is_recoverable_error(tmp->op_errno)) { + GF_ASSERT(fop->mask & (1ULL << tmp->idx)); fop->mask ^= (1ULL << tmp->idx); if (fop->mask) { return _gf_true; @@ -552,13 +844,15 @@ ec_dispatch_one_retry(ec_fop_data_t *fop, ec_cbk_data_t **cbk) return _gf_false; } -void ec_dispatch_inc(ec_fop_data_t * fop) +void +ec_dispatch_inc(ec_fop_data_t *fop) { ec_dispatch_start(fop); - if (ec_child_select(fop)) - { - fop->expected = ec_bits_count(fop->remaining); + if (ec_child_select(fop)) { + ec_sleep(fop); + + fop->expected = gf_bits_count(fop->remaining); fop->first = 0; ec_dispatch_next(fop, 0); @@ -566,55 +860,76 @@ void ec_dispatch_inc(ec_fop_data_t * fop) } void -ec_dispatch_all (ec_fop_data_t *fop) +ec_dispatch_all(ec_fop_data_t *fop) { - ec_dispatch_start(fop); + ec_dispatch_start(fop); - if (ec_child_select(fop)) { - fop->expected = ec_bits_count(fop->remaining); - fop->first = 0; + if (ec_child_select(fop)) { + ec_sleep(fop); - ec_dispatch_mask(fop, fop->remaining); - } + fop->expected = gf_bits_count(fop->remaining); + fop->first = 0; + + ec_dispatch_mask(fop, fop->remaining); + } } -void ec_dispatch_min(ec_fop_data_t * fop) +void +ec_dispatch_min(ec_fop_data_t *fop) { - ec_t * ec = fop->xl->private; + ec_t *ec = fop->xl->private; uintptr_t mask; - int32_t idx, count; + uint32_t idx; + int32_t count; ec_dispatch_start(fop); - if (ec_child_select(fop)) - { + if (ec_child_select(fop)) { + ec_sleep(fop); + fop->expected = count = ec->fragments; - fop->first = ec->idx; + fop->first = ec_select_first_by_read_policy(fop->xl->private, fop); idx = fop->first - 1; mask = 0; - while (count-- > 0) - { + while (count-- > 0) { idx = ec_child_next(ec, fop, idx + 1); - mask |= 1ULL << idx; + if (idx < EC_MAX_NODES) + mask |= 1ULL << idx; } ec_dispatch_mask(fop, mask); } } -ec_lock_t *ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc) +void +ec_succeed_all(ec_fop_data_t *fop) +{ + ec_dispatch_start(fop); + + if (ec_child_select(fop)) { + fop->expected = gf_bits_count(fop->remaining); + fop->first = 0; + + /* Simulate a successful execution on all bricks */ + ec_trace("SUCCEED", fop, ""); + + fop->good = fop->remaining; + fop->remaining = 0; + } +} + +ec_lock_t * +ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc) { ec_t *ec = fop->xl->private; - ec_lock_t * lock; + ec_lock_t *lock; int32_t err; if ((loc->inode == NULL) || - (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid))) - { - gf_msg (fop->xl->name, GF_LOG_ERROR, EINVAL, - EC_MSG_INVALID_INODE, - "Trying to lock based on an invalid " - "inode"); + (gf_uuid_is_null(loc->gfid) && gf_uuid_is_null(loc->inode->gfid))) { + gf_msg(fop->xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_INODE, + "Trying to lock based on an invalid " + "inode"); __ec_fop_set_error(fop, EINVAL); @@ -622,9 +937,8 @@ ec_lock_t *ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc) } lock = mem_get0(ec->lock_pool); - if (lock != NULL) - { - lock->good_mask = -1ULL; + if (lock != NULL) { + lock->good_mask = UINTPTR_MAX; INIT_LIST_HEAD(&lock->owners); INIT_LIST_HEAD(&lock->waiting); INIT_LIST_HEAD(&lock->frozen); @@ -640,7 +954,8 @@ ec_lock_t *ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc) return lock; } -void ec_lock_destroy(ec_lock_t * lock) +void +ec_lock_destroy(ec_lock_t *lock) { loc_wipe(&lock->loc); if (lock->fd != NULL) { @@ -650,13 +965,15 @@ void ec_lock_destroy(ec_lock_t * lock) mem_put(lock); } -int32_t 
ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2) +int32_t +ec_lock_compare(ec_lock_t *lock1, ec_lock_t *lock2) { return gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid); } -void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, - loc_t *base) +static void +ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, loc_t *base, + off_t fl_start, uint64_t fl_size) { ec_lock_link_t *link; @@ -690,13 +1007,15 @@ void ec_lock_insert(ec_fop_data_t *fop, ec_lock_t *lock, uint32_t flags, link->update[EC_DATA_TXN] = (flags & EC_UPDATE_DATA) != 0; link->update[EC_METADATA_TXN] = (flags & EC_UPDATE_META) != 0; link->base = base; + link->fl_start = fl_start; + link->fl_end = ec_range_end_get(fl_start, fl_size); - lock->refs++; - lock->inserted++; + lock->refs_pending++; } -void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, - uint32_t flags, loc_t *base) +static void +ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, uint32_t flags, + loc_t *base, off_t fl_start, uint64_t fl_size) { ec_lock_t *lock = NULL; ec_inode_t *ctx; @@ -725,8 +1044,8 @@ void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, if ((fop->lock_count > 0) && (fop->locks[0].lock == lock)) { /* Combine data/meta updates */ fop->locks[0].update[EC_DATA_TXN] |= (flags & EC_UPDATE_DATA) != 0; - fop->locks[0].update[EC_METADATA_TXN] |= - (flags & EC_UPDATE_META) != 0; + fop->locks[0].update[EC_METADATA_TXN] |= (flags & EC_UPDATE_META) != + 0; /* Only one base inode is allowed per fop, so there shouldn't be * overwrites here. */ @@ -737,8 +1056,10 @@ void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, goto update_query; } - ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already " - "acquired", lock, loc->inode); + ec_trace("LOCK_INODELK", fop, + "lock=%p, inode=%p. 
Lock already " + "acquired", + lock, loc->inode); goto insert; } @@ -757,22 +1078,25 @@ void ec_lock_prepare_inode_internal(ec_fop_data_t *fop, loc_t *loc, ctx->inode_lock = lock; insert: - ec_lock_insert(fop, lock, flags, base); + ec_lock_insert(fop, lock, flags, base, fl_start, fl_size); update_query: lock->query |= (flags & EC_QUERY_INFO) != 0; unlock: UNLOCK(&loc->inode->lock); } -void ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags) +void +ec_lock_prepare_inode(ec_fop_data_t *fop, loc_t *loc, uint32_t flags, + off_t fl_start, uint64_t fl_size) { - ec_lock_prepare_inode_internal(fop, loc, flags, NULL); + ec_lock_prepare_inode_internal(fop, loc, flags, NULL, fl_start, fl_size); } -void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, - uint32_t flags) +void +ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, loc_t *base, + uint32_t flags) { - loc_t tmp, *base = NULL; + loc_t tmp; int32_t err; if (fop->error != 0) { @@ -787,16 +1111,19 @@ void ec_lock_prepare_parent_inode(ec_fop_data_t *fop, loc_t *loc, } if ((flags & EC_INODE_SIZE) != 0) { - base = loc; flags ^= EC_INODE_SIZE; + } else { + base = NULL; } - ec_lock_prepare_inode_internal(fop, &tmp, flags, base); + ec_lock_prepare_inode_internal(fop, &tmp, flags, base, 0, EC_RANGE_FULL); loc_wipe(&tmp); } -void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags) +void +ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags, off_t fl_start, + uint64_t fl_size) { loc_t loc; int32_t err; @@ -812,21 +1139,20 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, uint32_t flags) return; } - ec_lock_prepare_inode_internal(fop, &loc, flags, NULL); + ec_lock_prepare_inode_internal(fop, &loc, flags, NULL, fl_start, fl_size); loc_wipe(&loc); } gf_boolean_t -ec_config_check (ec_fop_data_t *fop, ec_config_t *config) +ec_config_check(xlator_t *xl, ec_config_t *config) { ec_t *ec; - ec = fop->xl->private; + ec = xl->private; if ((config->version != EC_CONFIG_VERSION) || (config->algorithm != EC_CONFIG_ALGORITHM) || - (config->gf_word_size != EC_GF_BITS) || - (config->bricks != ec->nodes) || + (config->gf_word_size != EC_GF_BITS) || (config->bricks != ec->nodes) || (config->redundancy != ec->redundancy) || (config->chunk_size != EC_METHOD_CHUNK_SIZE)) { uint32_t data_bricks; @@ -845,20 +1171,17 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config) if ((config->redundancy < 1) || (config->redundancy * 2 >= config->bricks) || !ec_is_power_of_2(config->gf_word_size) || - ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) - != 0)) { - gf_msg (fop->xl->name, GF_LOG_ERROR, EINVAL, - EC_MSG_INVALID_CONFIG, - "Invalid or corrupted config"); + ((config->chunk_size * 8) % (config->gf_word_size * data_bricks) != + 0)) { + gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG, + "Invalid or corrupted config"); } else { - gf_msg (fop->xl->name, GF_LOG_ERROR, EINVAL, - EC_MSG_INVALID_CONFIG, - "Unsupported config " - "(V=%u, A=%u, W=%u, " - "N=%u, R=%u, S=%u)", - config->version, config->algorithm, - config->gf_word_size, config->bricks, - config->redundancy, config->chunk_size); + gf_msg(xl->name, GF_LOG_ERROR, EINVAL, EC_MSG_INVALID_CONFIG, + "Unsupported config " + "(V=%u, A=%u, W=%u, " + "N=%u, R=%u, S=%u)", + config->version, config->algorithm, config->gf_word_size, + config->bricks, config->redundancy, config->chunk_size); } return _gf_false; @@ -867,111 +1190,164 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config) return _gf_true; } +gf_boolean_t 
+ec_set_dirty_flag(ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty) +{ + gf_boolean_t set_dirty = _gf_false; + + if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) { + if (!link->optimistic_changelog) + dirty[EC_DATA_TXN] = 1; + } + + if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) { + if (!link->optimistic_changelog) + dirty[EC_METADATA_TXN] = 1; + } + + if (dirty[EC_METADATA_TXN] || dirty[EC_DATA_TXN]) { + set_dirty = _gf_true; + } + + return set_dirty; +} + int32_t -ec_prepare_update_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +ec_prepare_update_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { struct list_head list; ec_fop_data_t *fop = cookie, *parent, *tmp; - ec_lock_link_t *link = fop->data; + ec_lock_link_t *parent_link = fop->data; + ec_lock_link_t *link = NULL; ec_lock_t *lock = NULL; ec_inode_t *ctx; - - lock = link->lock; - parent = link->fop; + gf_boolean_t release = _gf_false; + uint64_t provided_flags = 0; + uint64_t dirty[EC_VERSION_SIZE] = {0, 0}; + lock = parent_link->lock; + parent = parent_link->fop; ctx = lock->ctx; INIT_LIST_HEAD(&list); + provided_flags = EC_PROVIDED_FLAGS(parent_link->waiting_flags); LOCK(&lock->loc.inode->lock); - list_for_each_entry(tmp, &lock->owners, owner_list) { - if ((tmp->flags & EC_FLAG_WAITING_SIZE) != 0) { - tmp->flags ^= EC_FLAG_WAITING_SIZE; - - list_add_tail(&tmp->cbk_list, &list); + list_for_each_entry(link, &lock->owners, owner_list) + { + if ((link->waiting_flags & provided_flags) != 0) { + link->waiting_flags ^= (link->waiting_flags & provided_flags); + if (EC_NEEDED_FLAGS(link->waiting_flags) == 0) + list_add_tail(&link->fop->cbk_list, &list); } } - if (op_ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, op_errno, - EC_MSG_SIZE_VERS_GET_FAIL, - "Failed to get size and version"); + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_SIZE_VERS_GET_FAIL, + "Failed to get size and version : %s", ec_msg_str(fop)); goto unlock; } - op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version, - EC_VERSION_SIZE); - if (op_errno != 0) { - gf_msg (this->name, GF_LOG_ERROR, op_errno, - EC_MSG_VER_XATTR_GET_FAIL, - "Unable to get version xattr"); - - goto unlock; - } - ctx->post_version[0] += ctx->pre_version[0]; - ctx->post_version[1] += ctx->pre_version[1]; - - ctx->have_version = _gf_true; - - if (lock->loc.inode->ia_type == IA_IFREG) { - op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size); + if (EC_FLAGS_HAVE(provided_flags, EC_FLAG_XATTROP)) { + op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version, + EC_VERSION_SIZE); if (op_errno != 0) { - gf_msg (this->name, GF_LOG_ERROR, op_errno, - EC_MSG_SIZE_XATTR_GET_FAIL, "Unable to get size xattr"); - + gf_msg(this->name, GF_LOG_ERROR, op_errno, + EC_MSG_VER_XATTR_GET_FAIL, "Unable to get version xattr. %s", + ec_msg_str(fop)); goto unlock; } - ctx->post_size = ctx->pre_size; + ctx->post_version[0] += ctx->pre_version[0]; + ctx->post_version[1] += ctx->pre_version[1]; + + ctx->have_version = _gf_true; + + if (lock->loc.inode->ia_type == IA_IFREG || + lock->loc.inode->ia_type == IA_INVAL) { + op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size); + if (op_errno != 0) { + if (lock->loc.inode->ia_type == IA_IFREG) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + EC_MSG_SIZE_XATTR_GET_FAIL, + "Unable to get size xattr. 
%s", ec_msg_str(fop)); + goto unlock; + } + } else { + ctx->post_size = ctx->pre_size; - ctx->have_size = _gf_true; + ctx->have_size = _gf_true; + } - op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config); - if (op_errno != 0) { - gf_msg (this->name, GF_LOG_ERROR, op_errno, - EC_MSG_CONFIG_XATTR_GET_FAIL, - "Unable to get config xattr"); + op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config); + if (op_errno != 0) { + if ((lock->loc.inode->ia_type == IA_IFREG) || + (op_errno != ENODATA)) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, + EC_MSG_CONFIG_XATTR_GET_FAIL, + "Unable to get config xattr. %s", ec_msg_str(fop)); - goto unlock; - } - if (!ec_config_check(parent, &ctx->config)) { - gf_msg (this->name, GF_LOG_ERROR, EINVAL, - EC_MSG_CONFIG_XATTR_INVALID, - "Invalid config xattr"); + goto unlock; + } + } else { + if (!ec_config_check(parent->xl, &ctx->config)) { + gf_msg(this->name, GF_LOG_ERROR, EINVAL, + EC_MSG_CONFIG_XATTR_INVALID, "Invalid config xattr"); - op_errno = EINVAL; + op_errno = EINVAL; - goto unlock; + goto unlock; + } + ctx->have_config = _gf_true; + } } - - ctx->have_config = _gf_true; + ctx->have_info = _gf_true; } - ctx->have_info = _gf_true; + ec_set_dirty_flag(fop->data, ctx, dirty); + if (dirty[EC_METADATA_TXN] && + (EC_FLAGS_HAVE(provided_flags, EC_FLAG_METADATA_DIRTY))) { + GF_ASSERT(!ctx->dirty[EC_METADATA_TXN]); + ctx->dirty[EC_METADATA_TXN] = 1; + } + if (dirty[EC_DATA_TXN] && + (EC_FLAGS_HAVE(provided_flags, EC_FLAG_DATA_DIRTY))) { + GF_ASSERT(!ctx->dirty[EC_DATA_TXN]); + ctx->dirty[EC_DATA_TXN] = 1; + } op_errno = 0; - unlock: - lock->getting_size = _gf_false; - UNLOCK(&lock->loc.inode->lock); + lock->waiting_flags ^= provided_flags; if (op_errno == 0) { + /* If the fop fails on any of the good bricks, it is important to mark + * it dirty and update versions right away if dirty was not set before. + */ + if (lock->good_mask & ~(fop->good | fop->remaining)) { + release = _gf_true; + } + + if (parent_link->update[0] && !parent_link->dirty[0]) { + lock->release |= release; + } + + if (parent_link->update[1] && !parent_link->dirty[1]) { + lock->release |= release; + } + /* We don't allow the main fop to be executed on bricks that have not * succeeded the initial xattrop. 
*/ - parent->mask &= fop->good; + ec_lock_update_good(lock, fop); /*As of now only data healing marks bricks as healing*/ lock->healing |= fop->healing; - if (ec_is_data_fop (parent->id)) { - parent->healing |= fop->healing; - } - } else { - ec_fop_set_error(parent, op_errno); } + UNLOCK(&lock->loc.inode->lock); + while (!list_empty(&list)) { tmp = list_entry(list.next, ec_fop_data_t, cbk_list); list_del_init(&tmp->cbk_list); @@ -980,95 +1356,163 @@ unlock: tmp->mask &= fop->good; /*As of now only data healing marks bricks as healing*/ - if (ec_is_data_fop (tmp->id)) { + if (ec_is_data_fop(tmp->id)) { tmp->healing |= fop->healing; } - } else { - ec_fop_set_error(tmp, op_errno); } - ec_resume(tmp, 0); + ec_resume(tmp, op_errno); } return 0; } -void ec_get_size_version(ec_lock_link_t *link) +static gf_boolean_t +ec_set_needed_flag(ec_lock_t *lock, ec_lock_link_t *link, uint64_t flag) +{ + uint64_t current; + + link->waiting_flags |= EC_FLAG_NEEDS(flag); + + current = EC_NEEDED_FLAGS(lock->waiting_flags); + if (!EC_FLAGS_HAVE(current, flag)) { + lock->waiting_flags |= EC_FLAG_NEEDS(flag); + link->waiting_flags |= EC_FLAG_PROVIDES(flag); + + return _gf_true; + } + + return _gf_false; +} + +static uint64_t +ec_set_xattrop_flags_and_params(ec_lock_t *lock, ec_lock_link_t *link, + uint64_t *dirty) +{ + uint64_t oldflags = 0; + uint64_t newflags = 0; + ec_inode_t *ctx = lock->ctx; + + oldflags = EC_NEEDED_FLAGS(lock->waiting_flags); + + if (lock->query && !ctx->have_info) { + ec_set_needed_flag(lock, link, EC_FLAG_XATTROP); + } + + if (dirty[EC_DATA_TXN]) { + if (!ec_set_needed_flag(lock, link, EC_FLAG_DATA_DIRTY)) { + dirty[EC_DATA_TXN] = 0; + } + } + + if (dirty[EC_METADATA_TXN]) { + if (!ec_set_needed_flag(lock, link, EC_FLAG_METADATA_DIRTY)) { + dirty[EC_METADATA_TXN] = 0; + } + } + newflags = EC_NEEDED_FLAGS(lock->waiting_flags); + + return oldflags ^ newflags; +} + +void +ec_get_size_version(ec_lock_link_t *link) { loc_t loc; ec_lock_t *lock; ec_inode_t *ctx; ec_fop_data_t *fop; dict_t *dict = NULL; - uid_t uid; - gid_t gid; - int32_t error = -ENOMEM; - gf_boolean_t getting_size; + dict_t *xdata = NULL; + ec_t *ec = NULL; + int32_t error = 0; + gf_boolean_t set_dirty = _gf_false; uint64_t allzero[EC_VERSION_SIZE] = {0, 0}; - + uint64_t dirty[EC_VERSION_SIZE] = {0, 0}; lock = link->lock; ctx = lock->ctx; fop = link->fop; + ec = fop->xl->private; + uint64_t changed_flags = 0; + + if (ec->optimistic_changelog && !(ec->node_mask & ~link->lock->good_mask) && + !ec_is_data_fop(fop->id)) + link->optimistic_changelog = _gf_true; + + memset(&loc, 0, sizeof(loc)); + + LOCK(&lock->loc.inode->lock); + + set_dirty = ec_set_dirty_flag(link, ctx, dirty); /* If ec metadata has already been retrieved, do not try again. */ if (ctx->have_info) { - if (ec_is_data_fop (fop->id)) { + if (ec_is_data_fop(fop->id)) { fop->healing |= lock->healing; } - return; + if (!set_dirty) + goto unlock; } /* Determine if there's something we need to retrieve for the current * operation. 
*/ - if (!lock->query && (lock->loc.inode->ia_type != IA_IFREG)) { - return; + if (!set_dirty && !lock->query && (lock->loc.inode->ia_type != IA_IFREG) && + (lock->loc.inode->ia_type != IA_INVAL)) { + goto unlock; } - uid = fop->frame->root->uid; - gid = fop->frame->root->gid; - - memset(&loc, 0, sizeof(loc)); - - LOCK(&lock->loc.inode->lock); - - getting_size = lock->getting_size; - lock->getting_size = _gf_true; - if (getting_size) { - fop->flags |= EC_FLAG_WAITING_SIZE; - + changed_flags = ec_set_xattrop_flags_and_params(lock, link, dirty); + if (link->waiting_flags) { + /* This fop needs to wait until all its flags are cleared which + * potentially can be cleared by other xattrops that are already + * wound*/ ec_sleep(fop); + } else { + GF_ASSERT(!changed_flags); } +unlock: UNLOCK(&lock->loc.inode->lock); - if (getting_size) { - error = 0; - + if (!changed_flags) goto out; - } dict = dict_new(); if (dict == NULL) { + error = -ENOMEM; goto out; } - /* Once we know that an xattrop will be needed, we try to get all available - * information in a single call. */ - error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero, - EC_VERSION_SIZE); - if (error == 0) { - error = ec_dict_set_array(dict, EC_XATTR_DIRTY, allzero, + if (EC_FLAGS_HAVE(changed_flags, EC_FLAG_XATTROP)) { + /* Once we know that an xattrop will be needed, + * we try to get all available information in a + * single call. */ + error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero, EC_VERSION_SIZE); - } - if (error != 0) { - goto out; - } + if (error != 0) { + goto out; + } - if (lock->loc.inode->ia_type == IA_IFREG) { - error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0); - if (error == 0) { - error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); + if (lock->loc.inode->ia_type == IA_IFREG || + lock->loc.inode->ia_type == IA_INVAL) { + error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0); + if (error == 0) { + error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); + } + if (error != 0) { + goto out; + } + + xdata = dict_new(); + if (xdata == NULL || dict_set_int32(xdata, GF_GET_SIZE, 1)) { + error = -ENOMEM; + goto out; + } } + } + + if (memcmp(allzero, dirty, sizeof(allzero))) { + error = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); if (error != 0) { goto out; } @@ -1099,20 +1543,20 @@ void ec_get_size_version(ec_lock_link_t *link) loc.name = NULL; } - ec_xattrop (fop->frame, fop->xl, fop->mask, fop->minimum, - ec_prepare_update_cbk, link, &loc, - GF_XATTROP_ADD_ARRAY64, dict, NULL); + ec_xattrop(fop->frame, fop->xl, fop->mask, fop->minimum, + ec_prepare_update_cbk, link, &loc, GF_XATTROP_ADD_ARRAY64, + dict, xdata); } else { ec_fxattrop(fop->frame, fop->xl, fop->mask, fop->minimum, - ec_prepare_update_cbk, link, lock->fd, - GF_XATTROP_ADD_ARRAY64, dict, NULL); + ec_prepare_update_cbk, link, lock->fd, + GF_XATTROP_ADD_ARRAY64, dict, xdata); } error = 0; out: - fop->frame->root->uid = uid; - fop->frame->root->gid = gid; + fop->frame->root->uid = fop->uid; + fop->frame->root->gid = fop->gid; loc_wipe(&loc); @@ -1120,22 +1564,24 @@ out: dict_unref(dict); } + if (xdata != NULL) { + dict_unref(xdata); + } + if (error != 0) { ec_fop_set_error(fop, -error); } } -gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, - uint64_t *size) +gf_boolean_t +__ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size) { ec_inode_t *ctx; gf_boolean_t found = _gf_false; - LOCK(&inode->lock); - ctx = __ec_inode_get(inode, fop->xl); if (ctx == NULL) { - goto unlock; + goto out; } if (ctx->have_size) { 
@@ -1143,23 +1589,33 @@ gf_boolean_t ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, found = _gf_true; } -unlock: +out: + return found; +} + +gf_boolean_t +ec_get_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t *size) +{ + gf_boolean_t found = _gf_false; + + LOCK(&inode->lock); + { + found = __ec_get_inode_size(fop, inode, size); + } UNLOCK(&inode->lock); return found; } -gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, - uint64_t size) +gf_boolean_t +__ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size) { ec_inode_t *ctx; gf_boolean_t found = _gf_false; - LOCK(&inode->lock); - ctx = __ec_inode_get(inode, fop->xl); if (ctx == NULL) { - goto unlock; + goto out; } /* Normal fops always have ctx->have_size set. However self-heal calls this @@ -1174,13 +1630,42 @@ gf_boolean_t ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, found = _gf_true; -unlock: +out: + return found; +} + +gf_boolean_t +ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, uint64_t size) +{ + gf_boolean_t found = _gf_false; + + LOCK(&inode->lock); + { + found = __ec_set_inode_size(fop, inode, size); + } UNLOCK(&inode->lock); return found; } -void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) +static void +ec_release_stripe_cache(ec_inode_t *ctx) +{ + ec_stripe_list_t *stripe_cache = NULL; + ec_stripe_t *stripe = NULL; + + stripe_cache = &ctx->stripe_cache; + while (!list_empty(&stripe_cache->lru)) { + stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru); + list_del(&stripe->lru); + GF_FREE(stripe); + } + stripe_cache->count = 0; + stripe_cache->max = 0; +} + +void +ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) { ec_inode_t *ctx; @@ -1191,6 +1676,7 @@ void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) goto unlock; } + ec_release_stripe_cache(ctx); ctx->have_info = _gf_false; ctx->have_config = _gf_false; ctx->have_version = _gf_false; @@ -1206,10 +1692,10 @@ unlock: UNLOCK(&inode->lock); } -int32_t ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, - struct iatt *postparent) +int32_t +ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) { ec_fop_data_t *fop = cookie; ec_lock_link_t *link; @@ -1226,14 +1712,15 @@ int32_t ec_get_real_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } /* This function is used to get the trusted.ec.size xattr from a file when - * no lock is needed on the inode. This is only required to maintan iatt + * no lock is needed on the inode. This is only required to maintain iatt * structs on fops that manipulate directory entries but do not operate * directly on the inode, like link, rename, ... * * Any error processing this request is ignored. In the worst case, an invalid * or not up to date value in the iatt could cause some cache invalidation. */ -void ec_get_real_size(ec_lock_link_t *link) +void +ec_get_real_size(ec_lock_link_t *link) { ec_fop_data_t *fop; dict_t *xdata; @@ -1277,48 +1764,66 @@ ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop) { /* If the fop has an fd available, attach it to the lock structure to be * able to do fxattrop calls instead of xattrop. 
*/ - if (fop->use_fd) { - if (lock->fd != NULL) { - __fd_unref(lock->fd); - } + if (fop->use_fd && (lock->fd == NULL)) { lock->fd = __fd_ref(fop->fd); } } +static gf_boolean_t +ec_link_has_lock_conflict(ec_lock_link_t *link, gf_boolean_t waitlist_check) +{ + ec_lock_link_t *trav_link = NULL; + + list_for_each_entry(trav_link, &link->lock->owners, owner_list) + { + if (ec_lock_conflict(trav_link, link)) + return _gf_true; + } + + if (!waitlist_check) + return _gf_false; + + list_for_each_entry(trav_link, &link->lock->waiting, wait_list) + { + if (ec_lock_conflict(trav_link, link)) + return _gf_true; + } + + return _gf_false; +} + static void ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list) { ec_fop_data_t *fop; ec_lock_link_t *link; - gf_boolean_t exclusive = _gf_false; + gf_boolean_t conflict = _gf_false; - while (!exclusive && !list_empty(&lock->waiting)) { + while (!conflict && !list_empty(&lock->waiting)) { link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list); fop = link->fop; /* If lock is not acquired, at most one fop can be assigned as owner. * The following fops will need to wait in the lock->waiting queue * until the lock has been fully acquired. */ - exclusive = !lock->acquired; + conflict = !lock->acquired; /* If the fop is not shareable, only this fop can be assigned as owner. * Other fops will need to wait until this one finishes. */ - if ((fop->flags & EC_FLAG_LOCK_SHARED) == 0) { - exclusive = _gf_true; - - /* Avoid other requests to be assigned as owners. */ - lock->exclusive = 1; + if (ec_link_has_lock_conflict(link, _gf_false)) { + conflict = _gf_true; } /* If only one fop is allowed, it can be assigned as the owner of the * lock only if there weren't any other owner. */ - if (exclusive && !list_empty(&lock->owners)) { + if (conflict && !list_empty(&lock->owners)) { break; } list_move_tail(&link->wait_list, list); - list_add_tail(&fop->owner_list, &lock->owners); + list_add_tail(&link->owner_list, &lock->owners); + lock->refs_owners++; ec_lock_update_fd(lock, fop); } @@ -1336,7 +1841,8 @@ ec_lock_apply(ec_lock_link_t *link) ec_get_real_size(link); } -gf_boolean_t ec_lock_acquire(ec_lock_link_t *link); +gf_boolean_t +ec_lock_acquire(ec_lock_link_t *link); static void ec_lock_resume_shared(struct list_head *list) @@ -1360,7 +1866,8 @@ ec_lock_resume_shared(struct list_head *list) } } -void ec_lock_acquired(ec_lock_link_t *link) +void +ec_lock_acquired(ec_lock_link_t *link) { struct list_head list; ec_lock_t *lock; @@ -1376,61 +1883,76 @@ void ec_lock_acquired(ec_lock_link_t *link) LOCK(&lock->loc.inode->lock); lock->acquired = _gf_true; + if (lock->contention) { + lock->release = _gf_true; + lock->contention = _gf_false; + } ec_lock_update_fd(lock, fop); - if ((fop->flags & EC_FLAG_LOCK_SHARED) != 0) { - ec_lock_wake_shared(lock, &list); - } + ec_lock_wake_shared(lock, &list); UNLOCK(&lock->loc.inode->lock); ec_lock_apply(link); + if (fop->use_fd && + (link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) { + /* Try to reopen closed fd's only if lock has succeeded. 
*/ + ec_fix_open(fop, lock->mask); + } + ec_lock_resume_shared(&list); } -int32_t ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int32_t +ec_locked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { ec_fop_data_t *fop = cookie; ec_lock_link_t *link = NULL; ec_lock_t *lock = NULL; + link = fop->data; + lock = link->lock; if (op_ret >= 0) { - link = fop->data; - lock = link->lock; lock->mask = lock->good_mask = fop->good; lock->healing = 0; ec_lock_acquired(link); ec_lock(fop->parent); } else { - gf_msg (this->name, GF_LOG_WARNING, op_errno, - EC_MSG_PREOP_LOCK_FAILED, - "Failed to complete preop lock"); + LOCK(&lock->loc.inode->lock); + { + lock->contention = _gf_false; + } + UNLOCK(&lock->loc.inode->lock); + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_PREOP_LOCK_FAILED, + "Failed to complete preop lock"); } return 0; } -gf_boolean_t ec_lock_acquire(ec_lock_link_t *link) +gf_boolean_t +ec_lock_acquire(ec_lock_link_t *link) { ec_lock_t *lock; ec_fop_data_t *fop; + gf_lkowner_t lk_owner; lock = link->lock; fop = link->fop; if (!lock->acquired) { - ec_owner_set(fop->frame, lock); + set_lk_owner_from_ptr(&lk_owner, lock); ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, lock->loc.inode); lock->flock.l_type = F_WRLCK; - ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, - link, fop->xl->name, &lock->loc, F_SETLKW, &lock->flock, - NULL); + ec_inodelk(fop->frame, fop->xl, &lk_owner, -1, EC_MINIMUM_ALL, + ec_locked, link, fop->xl->name, &lock->loc, F_SETLKW, + &lock->flock, NULL); return _gf_false; } @@ -1442,6 +1964,67 @@ gf_boolean_t ec_lock_acquire(ec_lock_link_t *link) return _gf_true; } +static ec_lock_link_t * +ec_lock_timer_cancel(xlator_t *xl, ec_lock_t *lock) +{ + ec_lock_link_t *timer_link; + + /* If we don't have any timer, there's nothing to cancel. */ + if (lock->timer == NULL) { + return NULL; + } + + /* We are trying to access a lock that has an unlock timer active. + * This means that the lock must be idle, i.e. no fop can be in the + * owner, waiting or frozen lists. It also means that the lock cannot + * have been marked as being released (this is done without timers). + * There should only be one owner reference, but it's possible that + * some fops are being prepared to use this lock. */ + GF_ASSERT((lock->refs_owners == 1) && list_empty(&lock->owners) && + list_empty(&lock->waiting)); + + /* We take the timer_link before cancelling the timer, since a + * successful cancellation will destroy it. It must not be NULL + * because it references the fop responsible for the delayed unlock + * that we are currently trying to cancel. */ + timer_link = lock->timer->data; + GF_ASSERT(timer_link != NULL); + + if (gf_timer_call_cancel(xl->ctx, lock->timer) < 0) { + /* It's too late to avoid the execution of the timer callback. + * Since we need to be sure that the callback has access to all + * needed resources, we cannot resume the execution of the + * timer fop now. This will be done in the callback. */ + timer_link = NULL; + } else { + /* The timer has been cancelled. The fop referenced by + * timer_link holds the last reference. The caller is + * responsible to release it when not needed anymore. */ + ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); + } + + /* We have two options here: + * + * 1. The timer has been successfully cancelled. + * + * This is the easiest case and we can continue with the currently + * acquired lock. 
+ * + * 2. The timer callback has already been fired. + * + * In this case we have not been able to cancel the timer before + * the timer callback has been fired, but we also know that + * lock->timer != NULL. This means that the timer callback is still + * trying to acquire the inode mutex that we currently own. We are + * safe until we release it. In this case we can safely clear + * lock->timer. This will cause that the timer callback does nothing + * once it acquires the mutex. + */ + lock->timer = NULL; + + return timer_link; +} + static gf_boolean_t ec_lock_assign_owner(ec_lock_link_t *link) { @@ -1450,6 +2033,8 @@ ec_lock_assign_owner(ec_lock_link_t *link) ec_lock_link_t *timer_link = NULL; gf_boolean_t assigned = _gf_false; + /* The link cannot be in any list because we have just finished preparing + * it. */ GF_ASSERT(list_empty(&link->wait_list)); fop = link->fop; @@ -1457,64 +2042,72 @@ ec_lock_assign_owner(ec_lock_link_t *link) LOCK(&lock->loc.inode->lock); - GF_ASSERT (lock->inserted > 0); - lock->inserted--; + /* Since the link has just been prepared but it's not active yet, the + * refs_pending must be one at least (the ref owned by this link). */ + GF_ASSERT(lock->refs_pending > 0); + /* The link is not pending any more. It will be assigned to the owner, + * waiting or frozen list. */ + lock->refs_pending--; if (lock->release) { ec_trace("LOCK_QUEUE_FREEZE", fop, "lock=%p", lock); - list_add_tail(&link->wait_list, &lock->frozen); + /* When lock->release is set, we'll unlock the lock as soon as + * possible, meaning that we won't use a timer. */ + GF_ASSERT(lock->timer == NULL); - /* The lock is frozen, so we move the current reference to refs_frozen. - * After that, there should remain at least one ref belonging to the - * lock that is processing the release. */ - lock->refs--; - GF_ASSERT(lock->refs > 0); - lock->refs_frozen++; + /* The lock is marked to be released. We can still have owners and fops + * in the waiting ilist f they have been added before the lock has been + * marked to be released. However new fops are put into the frozen list + * to wait for the next unlock/lock cycle. */ + list_add_tail(&link->wait_list, &lock->frozen); goto unlock; } - lock->exclusive |= (fop->flags & EC_FLAG_LOCK_SHARED) == 0; + /* The lock is not marked to be released, so the frozen list should be + * empty. */ + GF_ASSERT(list_empty(&lock->frozen)); + + timer_link = ec_lock_timer_cancel(fop->xl, lock); if (!list_empty(&lock->owners)) { - if (!lock->acquired || (lock->exclusive != 0)) { + /* There are other owners of this lock. We can only take ownership if + * the lock is already acquired and doesn't have conflict with existing + * owners, or waiters(to prevent starvation). + * Otherwise we need to wait. + */ + if (!lock->acquired || ec_link_has_lock_conflict(link, _gf_true)) { ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock); list_add_tail(&link->wait_list, &lock->waiting); goto unlock; } - } else if (lock->timer != NULL) { - GF_ASSERT (lock->release == _gf_false); - - timer_link = lock->timer->data; - if (gf_timer_call_cancel(fop->xl->ctx, lock->timer) == 0) { - ec_trace("UNLOCK_CANCELLED", timer_link->fop, "lock=%p", lock); - lock->timer = NULL; - lock->refs--; - /* There should remain at least 1 ref, the current one. */ - GF_ASSERT(lock->refs > 0); - } else { - /* Timer expired and on the way to unlock. 
- * Set lock->release to _gf_true, so that this - * lock will be put in frozen list*/ - timer_link = NULL; - lock->release = _gf_true; - } } - list_add_tail(&fop->owner_list, &lock->owners); + list_add_tail(&link->owner_list, &lock->owners); + + /* If timer_link is not NULL, it means that we have inherited the owner + * reference assigned to the timer fop. In this case we simply reuse it. + * Otherwise we need to increase the number of owners. */ + if (timer_link == NULL) { + lock->refs_owners++; + } assigned = _gf_true; unlock: if (!assigned) { + /* We have not been able to take ownership of this lock. The fop must + * be put to sleep. */ ec_sleep(fop); } UNLOCK(&lock->loc.inode->lock); + /* If we have cancelled the timer, we need to resume the fop that was + * waiting for it. */ if (timer_link != NULL) { ec_resume(timer_link->fop, 0); } @@ -1530,7 +2123,6 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk, ec_lock_t *lock = link->lock; ec_fop_data_t *fop = link->fop; ec_inode_t *ctx = lock->ctx; - ec_t *ec = fop->xl->private; INIT_LIST_HEAD(&list); @@ -1538,38 +2130,46 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk, ec_trace("LOCK_DONE", fop, "lock=%p", lock); - GF_ASSERT(!list_empty(&fop->owner_list)); - list_del_init(&fop->owner_list); + /* Current link must belong to the owner list of the lock. We don't + * decrement lock->refs_owners here because the inode mutex is released + * before ec_unlock() is called and we need to know when the last owner + * unlocks the lock to do proper cleanup. lock->refs_owners is used for + * this task. */ + GF_ASSERT((lock->refs_owners > 0) && !list_empty(&link->owner_list)); + list_del_init(&link->owner_list); + lock->release |= release; if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) { if (link->update[0]) { ctx->post_version[0]++; - if (ec->node_mask & ~fop->good) { - ctx->dirty[0]++; - } } if (link->update[1]) { ctx->post_version[1]++; - if (ec->node_mask & ~fop->good) { - ctx->dirty[1]++; + } + /* If the fop fails on any of the good bricks, it is important to mark + * it dirty and update versions right away. */ + if (link->update[0] || link->update[1]) { + if (lock->good_mask & ~(fop->good | fop->remaining)) { + lock->release = _gf_true; } } } + if (fop->healing) { + lock->healing = fop->healing & (fop->good | fop->remaining); + } ec_lock_update_good(lock, fop); - lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0; - if (list_empty(&lock->owners)) { - ec_lock_wake_shared(lock, &list); - } + ec_lock_wake_shared(lock, &list); UNLOCK(&lock->loc.inode->lock); ec_lock_resume_shared(&list); } -void ec_lock(ec_fop_data_t *fop) +void +ec_lock(ec_fop_data_t *fop) { ec_lock_link_t *link; @@ -1577,7 +2177,7 @@ void ec_lock(ec_fop_data_t *fop) * Which can result in refs == 0 for fop leading to use after free in this * function when it calls ec_sleep so do ec_sleep at start and ec_resume at * the end of this function.*/ - ec_sleep (fop); + ec_sleep(fop); while (fop->locked < fop->lock_count) { /* Since there are only up to 2 locks per fop, this xor will change @@ -1597,6 +2197,7 @@ ec_lock_unfreeze(ec_lock_link_t *link) { struct list_head list; ec_lock_t *lock; + gf_boolean_t destroy = _gf_false; lock = link->lock; @@ -1604,18 +2205,30 @@ ec_lock_unfreeze(ec_lock_link_t *link) LOCK(&lock->loc.inode->lock); - lock->acquired = _gf_false; + /* The lock must be marked to be released here, since we have just released + * it and any attempt to assign it to more fops must have added them to the + * frozen list. 
We can only have one active reference here: the one that + * is processing this unfreeze. */ + GF_ASSERT(lock->release && (lock->refs_owners == 1)); lock->release = _gf_false; - lock->refs--; + lock->refs_owners = 0; - GF_ASSERT (lock->refs == lock->inserted); - GF_ASSERT(lock->exclusive == 0); - GF_ASSERT(list_empty(&lock->waiting) && list_empty(&lock->owners)); + lock->acquired = _gf_false; + + /* We are unfreezing a lock. This means that the lock has already been + * released. In this state it shouldn't have a pending timer nor have any + * owner, and the waiting list should be empty. Only the frozen list can + * contain some fop. */ + GF_ASSERT((lock->timer == NULL) && list_empty(&lock->waiting) && + list_empty(&lock->owners)); + /* We move all frozen fops to the waiting list. */ list_splice_init(&lock->frozen, &lock->waiting); - lock->refs += lock->refs_frozen; - lock->refs_frozen = 0; - if (lock->refs == 0) { + + /* If we don't have any fop waiting nor there are any prepared fops using + * this lock, we can finally dispose it. */ + destroy = list_empty(&lock->waiting) && (lock->refs_pending == 0); + if (destroy) { ec_trace("LOCK_DESTROY", link->fop, "lock=%p", lock); lock->ctx->inode_lock = NULL; @@ -1629,22 +2242,21 @@ ec_lock_unfreeze(ec_lock_link_t *link) ec_lock_resume_shared(&list); - if (lock->refs == 0) { + if (destroy) { ec_lock_destroy(lock); } } -int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int32_t +ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { ec_fop_data_t *fop = cookie; ec_lock_link_t *link = fop->data; if (op_ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, op_errno, - EC_MSG_UNLOCK_FAILED, - "entry/inode unlocking failed (%s)", - ec_fop_name(link->fop->id)); + gf_msg(this->name, GF_LOG_WARNING, op_errno, EC_MSG_UNLOCK_FAILED, + "entry/inode unlocking failed :(%s)", ec_msg_str(link->fop)); } else { ec_trace("UNLOCKED", link->fop, "lock=%p", link->lock); } @@ -1654,24 +2266,26 @@ int32_t ec_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, return 0; } -void ec_unlock_lock(ec_lock_link_t *link) +void +ec_unlock_lock(ec_lock_link_t *link) { ec_lock_t *lock; ec_fop_data_t *fop; + gf_lkowner_t lk_owner; lock = link->lock; fop = link->fop; + lock->unlock_now = _gf_false; ec_clear_inode_info(fop, lock->loc.inode); if ((lock->mask != 0) && lock->acquired) { - ec_owner_set(fop->frame, lock); - + set_lk_owner_from_ptr(&lk_owner, lock); lock->flock.l_type = F_UNLCK; ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, lock->loc.inode); - ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ONE, + ec_inodelk(fop->frame, fop->xl, &lk_owner, lock->mask, EC_MINIMUM_ONE, ec_unlocked, link, fop->xl->name, &lock->loc, F_SETLK, &lock->flock, NULL); } else { @@ -1679,25 +2293,49 @@ void ec_unlock_lock(ec_lock_link_t *link) } } -int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, - xlator_t * this, int32_t op_ret, - int32_t op_errno, dict_t * xattr, - dict_t * xdata) +void +ec_inode_bad_inc(inode_t *inode, xlator_t *xl) +{ + ec_inode_t *ctx = NULL; + + LOCK(&inode->lock); + { + ctx = __ec_inode_get(inode, xl); + if (ctx == NULL) { + goto unlock; + } + ctx->bad_version++; + } +unlock: + UNLOCK(&inode->lock); +} + +int32_t +ec_update_size_version_done(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { ec_fop_data_t *fop = cookie; 
ec_lock_link_t *link; ec_lock_t *lock; ec_inode_t *ctx; + link = fop->data; + lock = link->lock; + ctx = lock->ctx; + if (op_ret < 0) { - gf_msg(fop->xl->name, fop_log_level (fop->id, op_errno), op_errno, + if (link->lock->fd == NULL) { + ec_inode_bad_inc(link->lock->loc.inode, this); + } else { + ec_inode_bad_inc(link->lock->fd->inode, this); + } + + gf_msg(fop->xl->name, fop_log_level(fop->id, op_errno), op_errno, EC_MSG_SIZE_VERS_UPDATE_FAIL, - "Failed to update version and size"); + "Failed to update version and size. %s", ec_msg_str(fop)); } else { fop->parent->good &= fop->good; - link = fop->data; - lock = link->lock; - ctx = lock->ctx; ec_lock_update_good(lock, fop); @@ -1714,16 +2352,17 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, ctx->have_size = _gf_true; } if ((ec_dict_del_config(xdata, EC_XATTR_CONFIG, &ctx->config) == 0) && - ec_config_check(fop->parent, &ctx->config)) { + ec_config_check(fop->xl, &ctx->config)) { ctx->have_config = _gf_true; } ctx->have_info = _gf_true; } - - if ((fop->parent->id != GF_FOP_FLUSH) && - (fop->parent->id != GF_FOP_FSYNC) && - (fop->parent->id != GF_FOP_FSYNCDIR)) { + /* If we are here because of fop's and other than unlock request, + * that means we are still holding a lock. That make sure + * lock->unlock_now can not be modified. + */ + if (lock->unlock_now) { ec_unlock_lock(fop->data); } @@ -1731,21 +2370,19 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie, } void -ec_update_size_version(ec_lock_link_t *link, uint64_t *version, - uint64_t size, uint64_t *dirty) +ec_update_size_version(ec_lock_link_t *link, uint64_t *version, uint64_t size, + uint64_t *dirty) { ec_fop_data_t *fop; ec_lock_t *lock; ec_inode_t *ctx; - dict_t * dict; - uid_t uid; - gid_t gid; + dict_t *dict = NULL; + uintptr_t update_on = 0; int32_t err = -ENOMEM; fop = link->fop; - - GF_ASSERT(version[0] < 0x100000000); - GF_ASSERT(version[1] < 0x100000000); + lock = link->lock; + ctx = lock->ctx; ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld", version[0], version[1], size, dirty[0], dirty[1]); @@ -1755,9 +2392,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version, goto out; } - lock = link->lock; - ctx = lock->ctx; - /* If we don't have version information or it has been modified, we * update it. */ if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) { @@ -1769,8 +2403,8 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version, } if (size != 0) { - /* If size has been changed, we should already know the previous size - * of the file. */ + /* If size has been changed, we should already + * know the previous size of the file. */ GF_ASSERT(ctx->have_size); err = ec_dict_set_number(dict, EC_XATTR_SIZE, size); @@ -1779,40 +2413,37 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version, } } - /* If we don't have dirty information or it has been modified, we update - * it. */ - if ((dirty[0] != 0) || (dirty[1] != 0)) { + if (dirty[0] || dirty[1]) { err = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); if (err != 0) { goto out; } } - /* If config information is not know, we request it now. */ + /* If config information is not known, we request it now. */ if ((lock->loc.inode->ia_type == IA_IFREG) && !ctx->have_config) { /* A failure requesting this xattr is ignored because it's not * absolutely required right now. 
*/ - ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); + (void)ec_dict_set_number(dict, EC_XATTR_CONFIG, 0); } - uid = fop->frame->root->uid; - gid = fop->frame->root->gid; - fop->frame->root->uid = 0; fop->frame->root->gid = 0; + update_on = lock->good_mask | lock->healing; + if (link->lock->fd == NULL) { - ec_xattrop(fop->frame, fop->xl, fop->good, EC_MINIMUM_MIN, - ec_update_size_version_done, link, &link->lock->loc, - GF_XATTROP_ADD_ARRAY64, dict, NULL); + ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN, + ec_update_size_version_done, link, &link->lock->loc, + GF_XATTROP_ADD_ARRAY64, dict, NULL); } else { - ec_fxattrop(fop->frame, fop->xl, fop->good, EC_MINIMUM_MIN, - ec_update_size_version_done, link, link->lock->fd, - GF_XATTROP_ADD_ARRAY64, dict, NULL); + ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN, + ec_update_size_version_done, link, link->lock->fd, + GF_XATTROP_ADD_ARRAY64, dict, NULL); } - fop->frame->root->uid = uid; - fop->frame->root->gid = gid; + fop->frame->root->uid = fop->uid; + fop->frame->root->gid = fop->gid; dict_unref(dict); @@ -1825,8 +2456,12 @@ out: ec_fop_set_error(fop, -err); - gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL, - "Unable to update version and size"); + gf_msg(fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL, + "Unable to update version and size. %s", ec_msg_str(fop)); + + if (lock->unlock_now) { + ec_unlock_lock(fop->data); + } } gf_boolean_t @@ -1834,28 +2469,62 @@ ec_update_info(ec_lock_link_t *link) { ec_lock_t *lock; ec_inode_t *ctx; - uint64_t version[2]; - uint64_t dirty[2]; + uint64_t version[2] = {0, 0}; + uint64_t dirty[2] = {0, 0}; uint64_t size; + ec_t *ec = NULL; + uintptr_t mask; lock = link->lock; ctx = lock->ctx; + ec = link->fop->xl->private; /* pre_version[*] will be 0 if have_version is false */ - version[0] = ctx->post_version[0] - ctx->pre_version[0]; - version[1] = ctx->post_version[1] - ctx->pre_version[1]; + version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] - + ctx->pre_version[EC_DATA_TXN]; + version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] - + ctx->pre_version[EC_METADATA_TXN]; size = ctx->post_size - ctx->pre_size; + /* If we set the dirty flag for update fop, we have to unset it. + * If fop has failed on some bricks, leave the dirty as marked. 
*/ + + if (lock->unlock_now) { + if (version[EC_DATA_TXN]) { + /*A data fop will have difference in post and pre version + *and for data fop we send writes on healing bricks also */ + mask = lock->good_mask | lock->healing; + } else { + mask = lock->good_mask; + } + /* Ensure that nodes are up while doing final + * metadata update.*/ + if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) { + if (ctx->dirty[EC_DATA_TXN] != 0) { + dirty[EC_DATA_TXN] = -1; + } + if (ctx->dirty[EC_METADATA_TXN] != 0) { + dirty[EC_METADATA_TXN] = -1; + } + /*If everything is fine and we already + *have version xattr set on entry, there + *is no need to update version again*/ + if (ctx->pre_version[EC_DATA_TXN]) { + version[EC_DATA_TXN] = 0; + } + if (ctx->pre_version[EC_METADATA_TXN]) { + version[EC_METADATA_TXN] = 0; + } + } else { + link->optimistic_changelog = _gf_false; + ec_set_dirty_flag(link, ctx, dirty); + } + memset(ctx->dirty, 0, sizeof(ctx->dirty)); + } - dirty[0] = ctx->dirty[0]; - dirty[1] = ctx->dirty[1]; - /*Dirty is not combined so just reset it right here*/ - memset(ctx->dirty, 0, sizeof(ctx->dirty)); - - if ((version[0] != 0) || (version[1] != 0) || - (dirty[0] != 0) || (dirty[1] != 0)) { + if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) || + (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) { ec_update_size_version(link, version, size, dirty); - return _gf_true; } @@ -1865,7 +2534,15 @@ ec_update_info(ec_lock_link_t *link) void ec_unlock_now(ec_lock_link_t *link) { + ec_lock_t *lock; + lock = link->lock; + ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock); + /*At this point, lock is not being used by any fop and + *can not be reused by any fop as it is going to be released. + *lock->unlock_now can not be modified at any other place. + */ + lock->unlock_now = _gf_true; if (!ec_update_info(link)) { ec_unlock_lock(link); @@ -1875,102 +2552,254 @@ ec_unlock_now(ec_lock_link_t *link) } void +ec_lock_release(ec_t *ec, inode_t *inode) +{ + ec_lock_t *lock; + ec_inode_t *ctx; + ec_lock_link_t *timer_link = NULL; + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, ec->xl); + if (ctx == NULL) { + goto done; + } + lock = ctx->inode_lock; + if ((lock == NULL) || lock->release) { + goto done; + } + + gf_msg_debug(ec->xl->name, 0, "Releasing inode %p due to lock contention", + inode); + + if (!lock->acquired) { + /* This happens if some bricks already got the lock while inodelk is in + * progress. Set release to true after lock is acquired*/ + lock->contention = _gf_true; + goto done; + } + + /* The lock is not marked to be released, so the frozen list should be + * empty. */ + GF_ASSERT(list_empty(&lock->frozen)); + + timer_link = ec_lock_timer_cancel(ec->xl, lock); + + /* We mark the lock to be released as soon as possible. */ + lock->release = _gf_true; + +done: + UNLOCK(&inode->lock); + + /* If we have cancelled the timer, we need to start the unlock of the + * inode. If there was a timer but we have been unable to cancel it + * because it was just triggered, the timer callback will take care + * of releasing the inode. 
*/ + if (timer_link != NULL) { + ec_unlock_now(timer_link); + } +} + +void +ec_unlock_timer_add(ec_lock_link_t *link); + +void ec_unlock_timer_del(ec_lock_link_t *link) { - int32_t before = 0; - ec_lock_t *lock; - inode_t *inode; - gf_boolean_t now = _gf_false; + ec_lock_t *lock; + inode_t *inode; + gf_boolean_t now = _gf_false; - lock = link->lock; + /* If we are here, it means that the timer has expired before having + * been cancelled. This guarantees that 'link' is still valid because + * the fop that contains it must be pending (if timer cancellation in + * ec_lock_assign_owner() fails, the fop is left sleeping). + * + * At the same time, the fop still has a reference to the lock, so + * it must also be valid. + */ + lock = link->lock; - /* A race condition can happen if timer expires, calls this function - * and the lock is released (lock->loc is wiped) but the fop is not - * fully completed yet (it's still on the list of pending fops). In - * this case, this function can also be called if ec_unlock_force() is - * called. */ - inode = lock->loc.inode; - if (inode == NULL) { - return; - } + /* 'lock' must have a valid inode since it can only be destroyed + * when the lock itself is destroyed, but we have a reference to the + * lock to avoid this. + */ + inode = lock->loc.inode; - LOCK(&inode->lock); + LOCK(&inode->lock); - if (lock->timer != NULL) { - ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock); + if (lock->timer != NULL) { + ec_trace("UNLOCK_DELAYED", link->fop, "lock=%p", lock); - gf_timer_call_cancel(link->fop->xl->ctx, lock->timer); - lock->timer = NULL; + /* The unlock timer has expired without anyone cancelling it. + * This means that it shouldn't have any owner, and the waiting + * and frozen lists should be empty. It must have only one + * owner reference, but there can be fops being prepared + * though. + * */ + GF_ASSERT(!lock->release && (lock->refs_owners == 1) && + list_empty(&lock->owners) && list_empty(&lock->waiting) && + list_empty(&lock->frozen)); - lock->release = now = _gf_true; + gf_timer_call_cancel(link->fop->xl->ctx, lock->timer); + lock->timer = NULL; - /* TODO: If the assertion is really true, following code is - * not needed. */ - GF_ASSERT(list_empty(&lock->waiting)); + /* Any fop being processed from now on, will need to wait + * until the next unlock/lock cycle. */ + lock->release = now = _gf_true; + } - before = lock->refs + lock->refs_frozen; - list_splice_init(&lock->waiting, &lock->frozen); - lock->refs_frozen += lock->refs - lock->inserted - 1; - lock->refs = 1 + lock->inserted; - /* We moved around the locks, so total number of locks shouldn't - * change by this operation*/ - GF_ASSERT (before == (lock->refs + lock->refs_frozen)); - } + UNLOCK(&inode->lock); - UNLOCK(&inode->lock); + if (now) { + ec_unlock_now(link); + } else { + /* The timer has been cancelled just after firing it but before + * getting here. This means that another fop has used the lock + * and everything should be handled as if this callback were + * have not been executed. However we still have an owner + * reference. + * + * We need to release our reference. If this is not the last + * reference (the most common case because another fop has + * taken another ref) we only need to decrement the counter. + * Otherwise we have been delayed enough so that the other fop + * has had time to acquire the reference, do its operation and + * release it. 
At the time of releasing it, the fop did found + * that the ref counter was > 1 (our reference), so the delayed + * unlock timer wasn't started. We need to start it again if we + * are the last reference. + * + * ec_unlock_timer_add() handles both cases. + */ + ec_unlock_timer_add(link); - if (now) { - ec_unlock_now(link); - } + /* We need to resume the fop that was waiting for the delayed + * unlock. + */ + ec_resume(link->fop, 0); + } +} + +void +ec_unlock_timer_cbk(void *data) +{ + ec_unlock_timer_del(data); } -void ec_unlock_timer_cbk(void *data) +static gf_boolean_t +ec_eager_lock_used(ec_t *ec, ec_fop_data_t *fop) { - ec_unlock_timer_del(data); + /* Fops with no locks at this point mean that they are sent as sub-fops + * of other higher level fops. In this case we simply assume that the + * parent fop will take correct care of the eager lock. */ + if (fop->lock_count == 0) { + return _gf_true; + } + + /* We may have more than one lock, but this only happens in the rename + * fop, and both locks will reference an inode of the same type (a + * directory in this case), so we only need to check the first lock. */ + if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock; + } + + return ec->other_eager_lock; } -void ec_unlock_timer_add(ec_lock_link_t *link) +static uint32_t +ec_eager_lock_timeout(ec_t *ec, ec_lock_t *lock) +{ + if (lock->loc.inode->ia_type == IA_IFREG) { + return ec->eager_lock_timeout; + } + + return ec->other_eager_lock_timeout; +} + +static gf_boolean_t +ec_lock_delay_create(ec_lock_link_t *link) { struct timespec delay; ec_fop_data_t *fop = link->fop; ec_lock_t *lock = link->lock; + + delay.tv_sec = ec_eager_lock_timeout(fop->xl->private, lock); + delay.tv_nsec = 0; + lock->timer = gf_timer_call_after(fop->xl->ctx, delay, ec_unlock_timer_cbk, + link); + if (lock->timer == NULL) { + gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, + EC_MSG_UNLOCK_DELAY_FAILED, "Unable to delay an unlock"); + + return _gf_false; + } + + return _gf_true; +} + +void +ec_unlock_timer_add(ec_lock_link_t *link) +{ + ec_fop_data_t *fop = link->fop; + ec_lock_t *lock = link->lock; gf_boolean_t now = _gf_false; LOCK(&lock->loc.inode->lock); - GF_ASSERT(lock->timer == NULL); + /* We are trying to unlock the lock. We can have multiple scenarios here, + * but all of them need to have lock->timer == NULL: + * + * 1. There are other owners currently running that can call ec_unlock(). + * + * None of them can have started the timer until the last one. But this + * call should be the consequence of this lastest one. + * + * 2. There are fops in the waiting or frozen lists. + * + * These fops cannot call ec_unlock(). So we should be here. + * + * We must reach here with at least one owner reference. + */ + GF_ASSERT((lock->timer == NULL) && (lock->refs_owners > 0)); - if ((lock->refs - lock->inserted) > 1) { + /* If the fop detects that a heal is needed, we mark the lock to be + * released as soon as possible. */ + lock->release |= ec_fop_needs_heal(fop); + + if (lock->refs_owners > 1) { ec_trace("UNLOCK_SKIP", fop, "lock=%p", lock); - lock->refs--; + /* If there are other owners we cannot do anything else with the lock. + * Note that the current fop has already been removed from the owners + * list in ec_lock_reuse(). */ + lock->refs_owners--; UNLOCK(&lock->loc.inode->lock); } else if (lock->acquired) { - ec_t *ec = fop->xl->private; + /* There are no other owners and the lock is acquired. 
If there were + * fops waiting, at least one of them should have been promoted to an + * owner, so the waiting list should be empty. */ + GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); - GF_ASSERT(list_empty(&lock->owners)); + ec_t *ec = fop->xl->private; + /* If everything goes as expected this fop will be put to sleep until + * the timer callback is executed. */ ec_sleep(fop); - /* If healing is needed, the lock needs to be released due to - * contention, or ec is shutting down, do not delay lock release. */ - if (!lock->release && !ec_fop_needs_heal(fop) && !ec->shutdown) { + /* If the lock needs to be released, or ec is shutting down, do not + * delay lock release. */ + if (!lock->release && !ec->shutdown) { ec_trace("UNLOCK_DELAY", fop, "lock=%p, release=%d", lock, lock->release); - delay.tv_sec = 1; - delay.tv_nsec = 0; - lock->timer = gf_timer_call_after(fop->xl->ctx, delay, - ec_unlock_timer_cbk, link); - if (lock->timer == NULL) { - gf_msg(fop->xl->name, GF_LOG_WARNING, ENOMEM, - EC_MSG_UNLOCK_DELAY_FAILED, - "Unable to delay an " - "unlock"); - + if (!ec_lock_delay_create(link)) { + /* We are unable to create a new timer. We immediately release + * the lock. */ lock->release = now = _gf_true; } + } else { ec_trace("UNLOCK_FORCE", fop, "lock=%p, release=%d", lock, lock->release); @@ -1983,17 +2812,25 @@ void ec_unlock_timer_add(ec_lock_link_t *link) ec_unlock_now(link); } } else { + /* There are no owners and the lock is not acquired. This can only + * happen if a lock attempt has failed and we get to the unlock step + * of the fop. As in the previous case, the waiting list must be + * empty. */ + GF_ASSERT(list_empty(&lock->owners) && list_empty(&lock->waiting)); + + /* We need to mark the lock to be released to correctly handle fops + * that may get in after we release the inode mutex but before + * ec_lock_unfreeze() is processed. */ lock->release = _gf_true; - GF_ASSERT(list_empty(&lock->owners)); - UNLOCK(&lock->loc.inode->lock); ec_lock_unfreeze(link); } } -void ec_unlock(ec_fop_data_t *fop) +void +ec_unlock(ec_fop_data_t *fop) { int32_t i; @@ -2002,60 +2839,153 @@ void ec_unlock(ec_fop_data_t *fop) } } -void ec_flush_size_version(ec_fop_data_t * fop) +void +ec_flush_size_version(ec_fop_data_t *fop) { GF_ASSERT(fop->lock_count == 1); + ec_update_info(&fop->locks[0]); +} - /* In normal circumstances, ec_update_info() is called after having - * executed a normal fop, and it uses fop->good to update only those bricks - * that succeeded. In this case we haven't executed any fop, so fop->good - * is 0. We use the current good mask of the lock itself to send the - * updates.*/ - fop->good = fop->locks[0].lock->good_mask; +static void +ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe, + ec_fop_data_t *fop) +{ + off_t base; + + /* On write fops, we only update existing fragments if the write has + * succeeded. Otherwise, we remove them from the cache. */ + if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) && + (fop->answer->op_ret >= 0)) { + base = stripe->frag_offset - fop->frag_range.first; + base *= ec->fragments; + + /* We check if the stripe offset falls inside the real region + * modified by the write fop (a write request is allowed, + * though uncommon, to write less bytes than requested). The + * current write fop implementation doesn't allow partial + * writes of fragments, so if there's no error, we are sure + * that a full stripe has been completely modified or not + * touched at all. 
The value of op_ret may not be a multiple + * of the stripe size because it depends on the requested + * size by the user, so we update the stripe if the write has + * modified at least one byte (meaning ec has written the full + * stripe). */ + if (base < fop->answer->op_ret + fop->head) { + memcpy(stripe->data, fop->vector[0].iov_base + base, + ec->stripe_size); + list_move_tail(&stripe->lru, &stripe_cache->lru); + + GF_ATOMIC_INC(ec->stats.stripe_cache.updates); + } + } else { + stripe->frag_offset = -1; + list_move(&stripe->lru, &stripe_cache->lru); - ec_update_info(&fop->locks[0]); + GF_ATOMIC_INC(ec->stats.stripe_cache.invals); + } +} + +static void +ec_update_cached_stripes(ec_fop_data_t *fop) +{ + uint64_t first; + uint64_t last; + ec_stripe_t *stripe = NULL; + ec_inode_t *ctx = NULL; + ec_stripe_list_t *stripe_cache = NULL; + inode_t *inode = NULL; + struct list_head *temp; + struct list_head sentinel; + + first = fop->frag_range.first; + /* 'last' represents the first stripe not touched by the operation */ + last = fop->frag_range.last; + + /* If there are no modified stripes, we don't need to do anything + * else. */ + if (last <= first) { + return; + } + + if (!fop->use_fd) { + inode = fop->loc[0].inode; + } else { + inode = fop->fd->inode; + } + + LOCK(&inode->lock); + + ctx = __ec_inode_get(inode, fop->xl); + if (ctx == NULL) { + goto out; + } + stripe_cache = &ctx->stripe_cache; + + /* Since we'll be moving elements of the list to the tail, we might + * end in an infinite loop. To avoid it, we insert a sentinel element + * into the list, so that it will be used to detect when we have + * traversed all existing elements once. */ + list_add_tail(&sentinel, &stripe_cache->lru); + temp = stripe_cache->lru.next; + while (temp != &sentinel) { + stripe = list_entry(temp, ec_stripe_t, lru); + temp = temp->next; + if ((first <= stripe->frag_offset) && (stripe->frag_offset < last)) { + ec_update_stripe(fop->xl->private, stripe_cache, stripe, fop); + } + } + list_del(&sentinel); + +out: + UNLOCK(&inode->lock); } -void ec_lock_reuse(ec_fop_data_t *fop) +void +ec_lock_reuse(ec_fop_data_t *fop) { ec_cbk_data_t *cbk; + ec_t *ec = NULL; int32_t i, count; gf_boolean_t release = _gf_false; - + ec = fop->xl->private; cbk = fop->answer; - if (cbk != NULL) { + + if (ec_eager_lock_used(ec, fop) && cbk != NULL) { if (cbk->xdata != NULL) { - if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, - &count) == 0) && (count > 1)) { + if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, &count) == + 0) && + (count > 1)) { release = _gf_true; } if (release) { - gf_msg_debug (fop->xl->name, 0, - "Lock contention detected"); + gf_msg_debug(fop->xl->name, 0, "Lock contention detected"); } } } else { - /* If we haven't get an answer with enough quorum, we always release + /* If eager lock is disabled or if we haven't get + * an answer with enough quorum, we always release * the lock. 
*/ release = _gf_true; } + ec_update_cached_stripes(fop); for (i = 0; i < fop->lock_count; i++) { ec_lock_next_owner(&fop->locks[i], cbk, release); } } -void __ec_manager(ec_fop_data_t * fop, int32_t error) +void +__ec_manager(ec_fop_data_t *fop, int32_t error) { ec_t *ec = fop->xl->private; do { ec_trace("MANAGER", fop, "error=%d", error); - if (!ec_must_wind (fop)) { - if (ec->xl_up_count < ec->fragments) { - error = ENOTCONN; - } + if (!ec_must_wind(fop)) { + if (ec->xl_up_count < ec->fragments) { + error = ENOTCONN; + } } if (error != 0) { @@ -2081,22 +3011,32 @@ void __ec_manager(ec_fop_data_t * fop, int32_t error) fop->jobs = 1; fop->state = fop->handler(fop, fop->state); - GF_ASSERT (fop->state >= 0); + GF_ASSERT(fop->state >= 0); error = ec_check_complete(fop, __ec_manager); } while (error >= 0); } -void ec_manager(ec_fop_data_t * fop, int32_t error) +void +ec_manager(ec_fop_data_t *fop, int32_t error) { GF_ASSERT(fop->jobs == 0); GF_ASSERT(fop->winds == 0); GF_ASSERT(fop->error == 0); - if (fop->state == EC_STATE_START) - { + if (fop->state == EC_STATE_START) { fop->state = EC_STATE_INIT; } __ec_manager(fop, error); } + +gf_boolean_t +__ec_is_last_fop(ec_t *ec) +{ + if ((list_empty(&ec->pending_fops)) && + (GF_ATOMIC_GET(ec->async_fop_count) == 0)) { + return _gf_true; + } + return _gf_false; +} |
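The delayed-unlock path above relies on one race being resolved under the inode mutex: whichever side (a new fop reusing the lock, or the expired timer callback) takes the mutex first decides whether the deferred unlock still runs, and ec_lock_timer_cancel() / ec_unlock_timer_del() implement exactly that hand-off. The standalone sketch below is not part of the patch; it keeps only that idea, replacing gf_timer and the inode lock with a plain pthread mutex and a boolean, and every name in it (fake_lock_t, timer_cbk, reuse_lock) is illustrative.

/* Minimal sketch (not GlusterFS code): a detached thread plays the timer,
 * and 'timer' is a flag protected by a mutex, standing in for lock->timer
 * under inode->lock. Clearing the flag under the mutex makes the callback
 * a no-op even if it has already started running. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

typedef struct {
    pthread_mutex_t mutex; /* stands in for inode->lock */
    bool timer;            /* stands in for lock->timer != NULL */
} fake_lock_t;

static void *
timer_cbk(void *data)
{
    fake_lock_t *lk = data;

    sleep(1); /* delay before the "unlock" would run */

    pthread_mutex_lock(&lk->mutex);
    if (!lk->timer) {
        /* Someone cancelled us while we were waiting for the mutex:
         * behave as a no-op, like ec_unlock_timer_del() does when
         * lock->timer has been cleared. */
        pthread_mutex_unlock(&lk->mutex);
        return NULL;
    }
    lk->timer = false;
    pthread_mutex_unlock(&lk->mutex);

    printf("timer fired: performing the delayed unlock\n");
    return NULL;
}

static void
reuse_lock(fake_lock_t *lk)
{
    pthread_mutex_lock(&lk->mutex);
    if (lk->timer) {
        /* Clearing the flag under the mutex guarantees the callback does
         * nothing, even if it is already running and blocked on the mutex. */
        lk->timer = false;
        printf("unlock cancelled: lock reused by a new fop\n");
    }
    pthread_mutex_unlock(&lk->mutex);
}

int
main(void)
{
    fake_lock_t lk;
    pthread_t t;

    pthread_mutex_init(&lk.mutex, NULL);
    lk.timer = true;

    pthread_create(&t, NULL, timer_cbk, &lk);
    reuse_lock(&lk); /* races with the callback; only one side "wins" */
    pthread_join(&t, NULL);
    pthread_mutex_destroy(&lk.mutex);

    return 0;
}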
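ec_update_size_version() never writes absolute xattr values: with GF_XATTROP_ADD_ARRAY64 each brick adds the supplied 64-bit deltas to what it already stores, so the post-op sends version[] as post_version - pre_version and, when every brick succeeded, dirty[] as -1 to cancel the +1 applied by the pre-op. The arithmetic below is a toy illustration with made-up numbers, not GlusterFS code.

/* Illustrative only: how the deltas land on one brick. */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    uint64_t stored_version = 42; /* e.g. trusted.ec.version on the brick */
    uint64_t stored_dirty = 0;    /* e.g. trusted.ec.dirty on the brick */

    /* Pre-op of a write: mark the inode dirty before touching data. */
    stored_dirty += 1;

    /* Post-op when all bricks succeeded: bump the version by the delta
     * accumulated under this lock and clear the dirty flag again. */
    uint64_t version_delta = 3;          /* post_version - pre_version */
    uint64_t dirty_delta = (uint64_t)-1; /* the -1 sent in dirty[] */

    stored_version += version_delta;
    stored_dirty += dirty_delta; /* unsigned wrap: 1 + (2^64 - 1) == 0 */

    printf("version=%" PRIu64 " dirty=%" PRIu64 "\n",
           stored_version, stored_dirty); /* prints: version=45 dirty=0 */
    return 0;
}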
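ec_update_cached_stripes() has to move stripes to the tail of the LRU while it is still walking that same list, which is why it inserts a sentinel element: the pass ends when the sentinel is reached, so entries moved behind it are never revisited and the loop cannot spin forever. The program below only demonstrates that traversal pattern; its tiny circular doubly-linked list is a stand-in for GlusterFS's list.h and all names are illustrative.

/* Standalone illustration (not GlusterFS code) of the sentinel trick. */
#include <stdio.h>

struct node {
    struct node *prev, *next;
    int key; /* stands in for a cached stripe */
};

static void
list_init(struct node *h)
{
    h->prev = h->next = h;
}

static void
list_add_tail(struct node *n, struct node *h)
{
    n->prev = h->prev;
    n->next = h;
    h->prev->next = n;
    h->prev = n;
}

static void
list_del(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

static void
list_move_tail(struct node *n, struct node *h)
{
    list_del(n);
    list_add_tail(n, h);
}

int
main(void)
{
    struct node head, sentinel, items[5];
    struct node *cur, *next;
    int i;

    list_init(&head);
    for (i = 0; i < 5; i++) {
        items[i].key = i;
        list_add_tail(&items[i], &head);
    }

    /* One bounded pass: everything before the sentinel is "old"; anything
     * we move lands after it and is not visited again. */
    list_add_tail(&sentinel, &head);
    for (cur = head.next; cur != &sentinel; cur = next) {
        next = cur->next;
        if (cur->key % 2 == 0)
            list_move_tail(cur, &head); /* e.g. a freshly updated stripe */
    }
    list_del(&sentinel);

    for (cur = head.next; cur != &head; cur = cur->next)
        printf("%d ", cur->key); /* prints: 1 3 0 2 4 */
    printf("\n");

    return 0;
}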