cluster/ec: Introduce optimistic changelog in EC

Problem: The fix for https://bugzilla.redhat.com/show_bug.cgi?id=1316873 changed EC to set the dirty flag before every update fop, data or metadata, and to unset it after a successful operation. That makes some fops, such as entry operations and metadata operations, very slow. Solution: File data operations are the only operations which take some time, so setting the dirty flag before the fop and unsetting it afterwards serves the purpose there, as the probability of a fop failing is higher the longer it takes. For all other operations, set the dirty flag at the end of the fop, if any brick is down and needs heal. Providing the following option to choose between high performance and better heal marking for metadata and entry fops: Set/Unset dirty flag for every update fop at the start of the fop. If ON, this option impacts the performance of entry operations and metadata operations, as it will set the dirty flag at the start and unset it at the end of ALL update fops. If OFF and all the bricks are good, the dirty flag will be set at the start only for file fops. For metadata and entry fops the dirty flag will not be set at the start, if all the bricks are good. This does not impact performance for metadata operations and entry operations, but has a very small window in which it can miss marking an entry as dirty in case it is required to be healed. Thanks to Xavi and Ashish for the design. Picked the .t file from Ashish's patch https://review.gluster.org/16298 BUG: 1408809 Change-Id: I3ce860063f0e2901e50754dcfc3e4ed22daf819f Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: https://review.gluster.org/16821 Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Xavier Hernandez <xhernandez@datalab.es> Tested-by: Xavier Hernandez <xhernandez@datalab.es> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
author: Pranith Kumar K <pkarampu@redhat.com> 2017-03-02 07:14:14 +0530
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2017-03-04 07:37:56 -0500
commit: 78c5c5637104cf79578d0fb9173647c9c3421177 (patch)
tree: 4d45320f5487f900dcc1a21b7087313162da9652 /xlators/cluster/ec/src/ec-common.c
parent: aaa5b2ec2f0ef1a62047c9ab91d957c7b0a1552a (diff)
1 files changed, 48 insertions, 1 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 823922542a0..3ae7f110d99 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -932,16 +932,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
 }
 
 gf_boolean_t
-ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx,
+                   uint64_t *dirty)
 {
 
     gf_boolean_t set_dirty = _gf_false;
 
     if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+            if (!link->optimistic_changelog)
                 dirty[EC_DATA_TXN] = 1;
     }
 
     if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+            if (!link->optimistic_changelog)
                 dirty[EC_METADATA_TXN] = 1;
     }
 
@@ -962,6 +965,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
     ec_lock_link_t *link = fop->data;
     ec_lock_t *lock = NULL;
     ec_inode_t *ctx;
+    gf_boolean_t release = _gf_false;
 
     lock = link->lock;
     parent = link->fop;
@@ -1055,6 +1059,26 @@ unlock:
     UNLOCK(&lock->loc.inode->lock);
 
     if (op_errno == 0) {
+        /* If the fop fails on any of the good bricks, it is important to mark
+         * it dirty and update versions right away if dirty was not set before.
+         */
+        if (lock->good_mask & ~(fop->good | fop->remaining)) {
+                release = _gf_true;
+        }
+
+        /* lock->release is a critical field that is checked and modified most
+         * of the time inside a locked region. This use here is safe because we
+         * are in a modifying fop and we currently don't allow two modifying
+         * fops to be processed concurrently, so no one else could be checking
+         * or modifying it.*/
+        if (link->update[0] && !link->dirty[0]) {
+                lock->release |= release;
+        }
+
+        if (link->update[1] && !link->dirty[1]) {
+                lock->release |= release;
+        }
+
         /* We don't allow the main fop to be executed on bricks that have not
          * succeeded the initial xattrop. */
         parent->mask &= fop->good;
@@ -1097,6 +1121,7 @@ void ec_get_size_version(ec_lock_link_t *link)
     ec_inode_t *ctx;
     ec_fop_data_t *fop;
     dict_t *dict = NULL;
+    ec_t   *ec = NULL;
     int32_t error = 0;
     gf_boolean_t getting_xattr;
     gf_boolean_t set_dirty = _gf_false;
@@ -1105,6 +1130,17 @@ void ec_get_size_version(ec_lock_link_t *link)
     lock = link->lock;
     ctx = lock->ctx;
     fop = link->fop;
+    ec  = fop->xl->private;
+
+    if (ec->optimistic_changelog &&
+        !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id))
+            link->optimistic_changelog = _gf_true;
+
+    /* If ctx->have_info is false and lock->query is true, it means that we'll
+     * send the xattrop anyway, so we can use it to update dirty counts, even
+     * if it's not necessary to do it right now. */
+    if (!ctx->have_info && lock->query)
+            link->optimistic_changelog = _gf_false;
 
     set_dirty = ec_set_dirty_flag (link, ctx, dirty);
 
@@ -1714,6 +1750,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
         if (link->update[1]) {
             ctx->post_version[1]++;
         }
+        /* If the fop fails on any of the good bricks, it is important to mark
+         * it dirty and update versions right away. */
+        if (link->update[0] || link->update[1]) {
+                if (lock->good_mask & ~(fop->good | fop->remaining)) {
+                        lock->release = _gf_true;
+                }
+        }
     }
 
     ec_lock_update_good(lock, fop);
@@ -2028,9 +2071,13 @@ ec_update_info(ec_lock_link_t *link)
                     if (ctx->dirty[1] != 0) {
                         dirty[1] = -1;
                     }
+            } else {
+                    link->optimistic_changelog = _gf_false;
+                    ec_set_dirty_flag (link, ctx, dirty);
             }
             memset(ctx->dirty, 0, sizeof(ctx->dirty));
     }
+
     if ((version[0] != 0) || (version[1] != 0) ||
         (dirty[0] != 0) || (dirty[1] != 0)) {
         ec_update_size_version(link, version, size, dirty);
author	Pranith Kumar K <pkarampu@redhat.com>	2017-03-02 07:14:14 +0530
committer	Pranith Kumar Karampuri <pkarampu@redhat.com>	2017-03-04 07:37:56 -0500
commit	78c5c5637104cf79578d0fb9173647c9c3421177 (patch)
tree	4d45320f5487f900dcc1a21b7087313162da9652 /xlators/cluster/ec/src/ec-common.c
parent	aaa5b2ec2f0ef1a62047c9ab91d957c7b0a1552a (diff)