cluster/afr: Let two data-self-heals compete in new domain

Problem: At the moment data-self-heal acquires locks in following pattern. It takes full file lock then gets xattrs on files on both replicas. Decides sources/sinks based on the xattrs. Now it acquires lock from 0-128k then unlocks the full file lock. Syncs 0-128k range from source to sink now acquires lock 128k+1 till 256k then unlocks 0-128k, syncs 128k+1 till 256k block... so on finally it takes full file lock again then unlocks the final small range block. It decrements pending counts and then unlocks the full file lock. This pattern of locks is chosen to avoid more than 1 self-heal to be in progress. BUT if another self-heal tries to take a full file lock while a self-heal is already in progress it will be put in blocked queue, further inodelks from writes by the application will also be put in blocked queue because of the way locks xlator grants inodelks. So until the self-heal is complete writes are blocked. Here is the code: xlators/features/locks/src/inodelk.c - line 225 if (__blocked_lock_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) { ret = -EAGAIN; if (can_block == 0) goto out; gettimeofday (&lock->blkd_time, NULL); list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks); } This leads to hangs in applications. Fix: Since we want to prevent two parallel self-heals. We let them compete in a separate "domain". Lets call the domain on which the locks have been taken on in previous approach as "data-domain". In the new approach When a self-heal is triggered, it acquires a full lock in the new domain "self-heal-domain". After this it performs data-self-heal using the locks in "data-domain" as before. unlock the full file lock in "self-heal-domain" With this approach, application's writevs don't have to wait in pending queue when more than 1 self-heal is triggered. Change-Id: Id79aef3dfa888945977fb9758374ac41c320d0d5 BUG: 967717 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/5100 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
author: Pranith Kumar K <pkarampu@redhat.com> 2013-07-03 21:28:25 +0530
committer: Vijay Bellur <vbellur@redhat.com> 2013-07-03 21:35:30 -0700
commit: 0aa4248cf967b158c0cf6ce33c300ad4b3ee3249 (patch)
tree: 56a8e408e81bc35ef335fcc6428b8cd7b14a5210
parent: c2abf3a6e39c5a5832a165757483bc0ae23cdb63 (diff)
4 files changed, 73 insertions, 23 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 3c2726b8d..b71c75995 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -156,6 +156,25 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)
 }
 
 int
+afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh    = NULL;
+        afr_private_t   *priv  = NULL;
+
+        local = frame->local;
+        sh    = &local->self_heal;
+        priv  = this->private;
+
+        if (sh->sh_dom_lock_held)
+                afr_sh_data_unlock (frame, this, priv->sh_domain,
+                                    afr_sh_data_close);
+        else
+                afr_sh_data_close (frame, this);
+        return 0;
+}
+
+int
 afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                          int32_t op_ret, int32_t op_errno, struct iatt *statpre,
                          struct iatt *statpost, dict_t *xdata)
@@ -289,14 +308,18 @@ afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom,
         afr_local_t         *local    = NULL;
         afr_internal_lock_t *int_lock = NULL;
         afr_self_heal_t     *sh       = NULL;
+        afr_private_t       *priv     = NULL;
         int                 ret       = 0;
 
         local    = frame->local;
         int_lock = &local->internal_lock;
         sh       = &local->self_heal;
+        priv     = this->private;
 
         if (strcmp (dom, this->name) == 0) {
                 sh->data_lock_held = _gf_false;
+        } else if (strcmp (dom, priv->sh_domain) == 0) {
+                sh->sh_dom_lock_held = _gf_false;
         } else {
                 ret = -1;
                 goto out;
@@ -316,8 +339,8 @@ out:
 int
 afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
 {
-        afr_local_t   *local = NULL;
-        afr_self_heal_t *sh = NULL;
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh    = NULL;
 
         local = frame->local;
         sh = &local->self_heal;
@@ -326,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
                 "finishing data selfheal of %s", local->loc.path);
 
         if (sh->data_lock_held)
-                afr_sh_data_unlock (frame, this, this->name, afr_sh_data_close);
+                afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock);
         else
-                afr_sh_data_close (frame, this);
+                afr_sh_dom_unlock (frame, this);
 
         return 0;
 }
@@ -346,10 +369,7 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this)
                 "finishing failed data selfheal of %s", local->loc.path);
 
         afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
-        if (sh->data_lock_held)
-                afr_sh_data_unlock (frame, this, this->name, afr_sh_data_close);
-        else
-                afr_sh_data_close (frame, this);
+        afr_sh_data_finish (frame, this);
         return 0;
 }
 
@@ -1184,6 +1204,22 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this)
 }
 
 int
+afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local = NULL;
+        afr_self_heal_t *sh = NULL;
+
+        local = frame->local;
+        sh = &local->self_heal;
+
+        sh->sh_dom_lock_held = _gf_true;
+        afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name,
+                          afr_sh_data_big_lock_success,
+                          afr_sh_data_fail);
+        return 0;
+}
+
+int
 afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
 {
         afr_internal_lock_t *int_lock = NULL;
@@ -1337,7 +1373,6 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         afr_private_t   *priv = NULL;
         int              call_count = 0;
         int              child_index = 0;
-	gf_boolean_t	 block = _gf_true;
 
         local = frame->local;
         sh = &local->self_heal;
@@ -1379,16 +1414,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                         "fd for %s opened, commencing sync",
                         local->loc.path);
 
-                /*
-                 * The read and write self-heal trigger codepaths do not provide
-                 * an unwind callback. We run a trylock in these codepaths
-                 * because we are sensitive to locking latency.
-                 */
-
-		block = sh->unwind ? _gf_true : _gf_false;
-                afr_sh_data_lock (frame, this, 0, 0, block, this->name,
-                                  afr_sh_data_big_lock_success,
-                                  afr_sh_data_fail);
+                afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain,
+                                  afr_sh_dom_lock_success, afr_sh_data_fail);
         }
 
         return 0;
@@ -1493,9 +1520,10 @@ afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
 int
 afr_self_heal_data (call_frame_t *frame, xlator_t *this)
 {
-        afr_local_t   *local = NULL;
-        afr_self_heal_t *sh = NULL;
-        afr_private_t *priv = this->private;
+        afr_local_t     *local = NULL;
+        afr_self_heal_t *sh    = NULL;
+        afr_private_t   *priv  = this->private;
+        int             ret    = -1;
 
         local = frame->local;
         sh = &local->self_heal;
@@ -1504,6 +1532,14 @@ afr_self_heal_data (call_frame_t *frame, xlator_t *this)
 
         if (afr_can_start_data_self_heal (sh, priv)) {
                 afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
+                ret = afr_inodelk_init (&local->internal_lock.inodelk[1],
+                                        priv->sh_domain, priv->child_count);
+                if (ret < 0) {
+                        afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
+                        afr_sh_data_done (frame, this);
+                        return 0;
+                }
+
                 if (IA_ISREG (sh->type)) {
                         afr_sh_data_open (frame, this);
                 } else {
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 4dd1b1be3..4338b799a 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -388,6 +388,12 @@ init (xlator_t *this)
                 i++;
         }
 
+        ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
+        if (-1 == ret) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
         priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
                                       gf_afr_mt_int32_t);
         if (!priv->last_event) {
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 940f84189..bca5c0f79 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -30,7 +30,7 @@
 #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
 
 #define AFR_LOCKEE_COUNT_MAX    3
-#define AFR_DOM_COUNT_MAX    2
+#define AFR_DOM_COUNT_MAX    3
 
 struct _pump_private;
 
@@ -172,6 +172,7 @@ typedef struct _afr_private {
         gf_boolean_t           readdir_failover;
         uint64_t               sh_readdir_size;
         gf_boolean_t           ensure_durability;
+        char                   *sh_domain;
 } afr_private_t;
 
 typedef enum {
@@ -280,6 +281,7 @@ struct afr_self_heal_ {
         gf_boolean_t actual_sh_started;
         gf_boolean_t sync_done;
         gf_boolean_t data_lock_held;
+        gf_boolean_t sh_dom_lock_held;
         gf_boolean_t eof_reached;
         fd_t  *healing_fd;
         int   file_has_holes;
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
index db83410c7..c7b858691 100644
--- a/xlators/cluster/afr/src/pump.c
+++ b/xlators/cluster/afr/src/pump.c
@@ -2501,6 +2501,12 @@ init (xlator_t *this)
 		i++;
 	}
 
+        ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
+        if (-1 == ret) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
         priv->first_lookup = 1;
         priv->root_inode = NULL;
author	Pranith Kumar K <pkarampu@redhat.com>	2013-07-03 21:28:25 +0530
committer	Vijay Bellur <vbellur@redhat.com>	2013-07-03 21:35:30 -0700
commit	0aa4248cf967b158c0cf6ce33c300ad4b3ee3249 (patch)
tree	56a8e408e81bc35ef335fcc6428b8cd7b14a5210
parent	c2abf3a6e39c5a5832a165757483bc0ae23cdb63 (diff)