glusterd: Mark all brick statuses as stopped when a process goes down in brick multiplexing

In a brick multiplexing environment, if a brick process goes down (e.g., it is killed with SIGKILL), only the status of the brick for which the process was first started changes to stopped; the statuses of all other bricks remain started. This happens because the process was killed abruptly with SIGKILL, so the signal handler was not invoked and no further cleanup was triggered. When we then try to start a volume using force, it fails with "Request timed out": since every brickinfo->status is still in the started state, we wait for one of the brick processes to come up, which is never going to happen because the brick process was killed. To resolve this, in the disconnect event we check all the processes to find the one to which the disconnected brick belongs. Once we have that process, we call glusterd_mark_bricks_stopped_by_proc() and pass the glusterd_brick_proc_t object as its argument. From the glusterd_brick_proc_t we can get all the bricks attached to that process, but these are duplicate copies. To get the original brickinfo we read the volinfo from the brick; the volinfo holds the original brickinfo objects. We then change brickinfo->status to stopped for all of those bricks. >Change-Id: Ifb9054b3ee081ef56b39b2903ae686984fe827e7 >BUG: 1499509 >Signed-off-by: Sanju Rakonde <srakonde@redhat.com> >Reviewed-on: https://review.gluster.org/#/c/18444/ >Smoke: Gluster Build System <jenkins@build.gluster.org> >CentOS-regression: Gluster Build System <jenkins@build.gluster.org> >Reviewed-by: Atin Mukherjee <amukherj@redhat.com> >(cherry picked from commit 9422446d72bc054962d72ace9912ecb885946d49) Change-Id: Ifb9054b3ee081ef56b39b2903ae686984fe827e7 BUG: 1501154 Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
author: Sanju Rakonde <srakonde@redhat.com> 2017-10-07 03:33:40 +0530
committer: jiffin tony Thottan <jthottan@redhat.com> 2017-10-12 18:49:37 +0000
commit: 8aa0c34c5301a15a87c0cb168a89cb291e85d741 (patch)
tree: 378d9e6bfb0c8698f14b9bb3ed0ab64891a7ac9a
parent: d93c94d460698b9dd3a73c2ba399ff577349180c (diff)
2 files changed, 85 insertions, 1 deletions
diff --git a/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
new file mode 100644
index 00000000000..3c5bebee0c7
--- /dev/null
+++ b/tests/bugs/glusterd/bug-1499509-disconnect-in-brick-mux.t
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../cluster.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+
+## Enable brick multiplexing
+TEST $CLI volume set all cluster.brick-multiplex on
+
+## creating 1x3 replicated volumes
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}_{1..3}
+TEST $CLI volume create $V1 replica 3 $H0:$B1/${V1}_{1..3}
+
+## Start the volume
+TEST $CLI volume start $V0
+TEST $CLI volume start $V1
+
+kill -9 $(pgrep glusterfsd)
+
+EXPECT 0 online_brick_count
+
+cleanup
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 9d496e56c07..185186a8ad6 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -5938,6 +5938,31 @@ out:
 
 static int gd_stale_rpc_disconnect_log;
 
+/* Set GF_BRICK_STOPPED on the volinfo entry of every brick attached to
+ * @brick_proc; returns 0, or the first volinfo lookup failure (stops there). */
+static int
+glusterd_mark_bricks_stopped_by_proc (glusterd_brick_proc_t *brick_proc) {
+        glusterd_brickinfo_t     *proc_brick  =  NULL;
+        glusterd_brickinfo_t     *vol_brick   =  NULL;
+        glusterd_volinfo_t       *volinfo     =  NULL;
+        int                       ret         =  0;
+
+        cds_list_for_each_entry (proc_brick, &brick_proc->bricks, brick_list) {
+                ret = glusterd_get_volinfo_from_brick (proc_brick->path, &volinfo);
+                if (ret != 0) {
+                        gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_VOLINFO_GET_FAIL,
+                                "Failed to get volinfo from brick(%s)", proc_brick->path);
+                        return ret;
+                }
+                cds_list_for_each_entry (vol_brick, &volinfo->bricks, brick_list) {
+                        if (strcmp (proc_brick->path, vol_brick->path) != 0)
+                                continue;
+                        glusterd_set_brick_status (vol_brick, GF_BRICK_STOPPED);
+                }
+        }
+        return 0;
+}
+
 int
 __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
                              rpc_clnt_event_t event, void *data)
@@ -5948,6 +5973,9 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
         glusterd_brickinfo_t    *brickinfo         = NULL;
         glusterd_volinfo_t      *volinfo           = NULL;
         xlator_t                *this              = NULL;
+        int                      temp              = 0;
+        glusterd_brickinfo_t    *brickinfo_tmp     = NULL;
+        glusterd_brick_proc_t   *brick_proc        = NULL;
 
         brickid = mydata;
         if (!brickid)
@@ -6048,7 +6076,36 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
                                   brickinfo->path);
                 }
 
-                glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
+                if (is_brick_mx_enabled()) {
+                        cds_list_for_each_entry (brick_proc, &conf->brick_procs,
+                                                 brick_proc_list) {
+                                cds_list_for_each_entry (brickinfo_tmp,
+                                                         &brick_proc->bricks,
+                                                         brick_list) {
+                                        if (strcmp (brickinfo_tmp->path,
+                                                    brickinfo->path) == 0) {
+                                                ret  = glusterd_mark_bricks_stopped_by_proc
+                                                       (brick_proc);
+                                                if (ret) {
+                                                        gf_msg(THIS->name,
+                                                               GF_LOG_ERROR, 0,
+                                                               GD_MSG_BRICK_STOP_FAIL,
+                                                               "Unable to stop "
+                                                               "bricks of process"
+                                                               " to which brick(%s)"
+                                                               " belongs",
+                                                               brickinfo->path);
+                                                        goto out;
+                                                }
+                                                temp = 1;
+                                                break;
+                                        }
+                                }
+                                if (temp == 1)
+                                        break;
+                        }
+                } else
+                        glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
                 break;
 
         case RPC_CLNT_DESTROY:
author	Sanju Rakonde <srakonde@redhat.com>	2017-10-07 03:33:40 +0530
committer	jiffin tony Thottan <jthottan@redhat.com>	2017-10-12 18:49:37 +0000
commit	8aa0c34c5301a15a87c0cb168a89cb291e85d741 (patch)
tree	378d9e6bfb0c8698f14b9bb3ed0ab64891a7ac9a
parent	d93c94d460698b9dd3a73c2ba399ff577349180c (diff)