author     Sanju Rakonde <srakonde@redhat.com>    2018-02-21 12:46:25 +0530
committer  Atin Mukherjee <amukherj@redhat.com>   2018-03-28 04:27:18 +0000
commit     a60fc2ddc03134fb23c5ed5c0bcb195e1649416b (patch)
tree       c4f42085b4c6c6761bf4f3d23a24b8821cf292a8
parent     95601229b8318f015a19d7eff89f73853b684a49 (diff)
glusterd: handling brick termination in brick-mux
Problem: There is a race between the last glusterfs_handle_terminate()
response sent to glusterd and the kill that happens immediately afterwards
when the terminated brick is the last brick of the process.

Solution: When the brick is the last one in the brick process, glusterfsd no
longer kills itself; with brick multiplexing enabled, glusterd kills the
process instead. The gf_attach utility is changed accordingly.

Change-Id: I386c19ca592536daa71294a13d9fc89a26d7e8c0
fixes: bz#1545048
BUG: 1545048
Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
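In outline, the new flow is: when glusterfsd finds that the brick being detached is the last one in the process, it only sends the terminate response, carrying a last_brick_terminated hint with its pid; the caller then delivers SIGTERM and reaps the process, with gf_attach taking the pid from that hint and glusterd reading it from the brick's pidfile. Below is a minimal standalone sketch of that kill-and-reap step; it is not GlusterFS code, and the pid comes from argv purely for illustration.

```c
/* Editorial sketch: terminate a process by pid with SIGTERM and reap it,
 * the way glusterd now handles the last brick of a multiplexed process. */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>

int
main (int argc, char *argv[])
{
        pid_t pid;
        int   status = 0;

        if (argc != 2) {
                fprintf (stderr, "usage: %s <pid>\n", argv[0]);
                return EXIT_FAILURE;
        }
        pid = (pid_t) atoi (argv[1]);

        /* Ask the process to shut down cleanly. */
        if (kill (pid, SIGTERM) != 0) {
                perror ("kill");
                return EXIT_FAILURE;
        }

        /* Reap it so no zombie is left behind.  waitpid() only succeeds
         * for children of the caller, which holds for glusterd because it
         * spawned the brick process. */
        if (waitpid (pid, &status, 0) < 0)
                perror ("waitpid");

        return EXIT_SUCCESS;
}
```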
-rw-r--r--  glusterfsd/src/gf_attach.c                        | 41
-rw-r--r--  glusterfsd/src/glusterfsd-mgmt.c                  | 33
-rw-r--r--  tests/bugs/glusterd/stale-brick-proc-brick-mux.t  | 32
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-pmap.c         |  4
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-syncop.c       | 17
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-utils.c        | 56
-rw-r--r--  xlators/mgmt/glusterd/src/glusterd-utils.h        |  3
7 files changed, 147 insertions, 39 deletions
diff --git a/glusterfsd/src/gf_attach.c b/glusterfsd/src/gf_attach.c
index 3f248292ddf..0eb4868263b 100644
--- a/glusterfsd/src/gf_attach.c
+++ b/glusterfsd/src/gf_attach.c
@@ -11,6 +11,9 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
//#include "config.h"
#include "glusterfs.h"
@@ -23,6 +26,7 @@
int done = 0;
int rpc_status;
+glfs_t *fs;
struct rpc_clnt_procedure gf_attach_actors[GLUSTERD_BRICK_MAXVALUE] = {
[GLUSTERD_BRICK_NULL] = {"NULL", NULL },
@@ -71,11 +75,43 @@ my_notify (struct rpc_clnt *rpc, void *mydata,
}
int32_t
-my_callback (struct rpc_req *req, struct iovec *iov, int count, void *frame)
+my_callback (struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
{
+ gd1_mgmt_brick_op_rsp rsp;
+ dict_t *dict = NULL;
+ pid_t pid = -1;
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = fs->ctx->master;
+ memset (&rsp, 0, sizeof (rsp));
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_brick_op_rsp);
+
+ if (ret < 0) {
+ fprintf (stderr, "xdr decoding failed\n");
+ goto out;
+ }
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, dict,
+ (rsp.output.output_val),
+ (rsp.output.output_len),
+ ret, rsp.op_errno, out);
+ if (dict) {
+ if (dict_get_int32 (dict, "last_brick_terminated", &pid) == 0) {
+ int status = 0;
+
+ gf_log ("gf_attach", GF_LOG_INFO, "Killing %d", pid);
+ kill (pid, SIGTERM);
+ waitpid (pid, &status, 0);
+ }
+ dict_unref (dict);
+ }
+
rpc_status = req->rpc_status;
done = 1;
- return 0;
+ ret = 0;
+out:
+ return ret;
}
/* copied from gd_syncop_submit_request */
@@ -170,7 +206,6 @@ usage (char *prog)
int
main (int argc, char *argv[])
{
- glfs_t *fs;
struct rpc_clnt *rpc;
dict_t *options;
int ret;
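The callback above pulls the hint out of the brick's RPC response: the dict is serialized with dict_allocate_and_serialize on the brick side, unpacked with GF_PROTOCOL_DICT_UNSERIALIZE here, and an optional last_brick_terminated key carries the pid to kill. The sketch below shows just that dict round-trip, assuming libglusterfs's dict.h is on the include path; the RPC serialization step is left out.

```c
/* Editorial sketch, assuming libglusterfs's dict.h is available: how the
 * optional "last_brick_terminated" key carries a pid between the brick
 * process and the caller. */
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include "dict.h"

int
main (void)
{
        dict_t  *dict = dict_new ();
        int32_t  pid  = -1;

        if (!dict)
                return 1;

        /* Brick side: record the pid of the process to be reaped. */
        if (dict_set_int32 (dict, "last_brick_terminated", getpid ()))
                goto out;

        /* Caller side: the key is optional, so a missing key simply means
         * "nothing to kill"; a present key yields the pid to terminate. */
        if (dict_get_int32 (dict, "last_brick_terminated", &pid) == 0)
                printf ("would SIGTERM pid %d here\n", pid);

out:
        dict_unref (dict);
        return 0;
}
```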
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
index d2b39494e51..c4df275077f 100644
--- a/glusterfsd/src/glusterfsd-mgmt.c
+++ b/glusterfsd/src/glusterfsd-mgmt.c
@@ -159,21 +159,31 @@ out:
}
int
-glusterfs_terminate_response_send (rpcsvc_request_t *req, int op_ret)
+glusterfs_terminate_response_send (rpcsvc_request_t *req, int op_ret,
+ gf_boolean_t last_brick)
{
gd1_mgmt_brick_op_rsp rsp = {0,};
dict_t *dict = NULL;
- int ret = 0;
+ int ret = -1;
rsp.op_ret = op_ret;
rsp.op_errno = 0;
rsp.op_errstr = "";
dict = dict_new ();
- if (dict)
+ if (dict) {
+ /* Setting the last_brick_terminated key in the dictionary is
+ * required for the standalone gf_attach utility to work.
+ * gf_attach will receive this dictionary and kill
+ * the process.
+ */
+ if (last_brick) {
+ ret = dict_set_int32 (dict, "last_brick_terminated",
+ getpid());
+ }
ret = dict_allocate_and_serialize (dict, &rsp.output.output_val,
&rsp.output.output_len);
-
+ }
if (ret == 0)
ret = glusterfs_submit_reply (req, &rsp, NULL, 0, NULL,
@@ -262,6 +272,7 @@ glusterfs_handle_terminate (rpcsvc_request_t *req)
xlator_t *victim = NULL;
xlator_list_t **trav_p = NULL;
gf_boolean_t lockflag = _gf_false;
+ gf_boolean_t last_brick = _gf_false;
ret = xdr_to_generic (req->msg[0], &xlator_req,
(xdrproc_t)xdr_gd1_mgmt_brick_op_req);
@@ -294,17 +305,16 @@ glusterfs_handle_terminate (rpcsvc_request_t *req)
* make sure it's down and if it's already down that's
* good enough.
*/
- glusterfs_terminate_response_send (req, 0);
+ glusterfs_terminate_response_send (req, 0, last_brick);
goto err;
}
- glusterfs_terminate_response_send (req, 0);
if ((trav_p == &top->children) && !(*trav_p)->next) {
- gf_log (THIS->name, GF_LOG_INFO,
- "terminating after loss of last child %s",
- xlator_req.name);
- rpc_clnt_mgmt_pmap_signout (glusterfsd_ctx, xlator_req.name);
- kill (getpid(), SIGTERM);
+ last_brick = _gf_true;
+ glusterfs_terminate_response_send (req, 0, last_brick);
+ gf_log (THIS->name, GF_LOG_INFO, "This is the last brick of the process; "
+ "glusterd will kill the process and take care of "
+ "removing its entries from the port map registry");
} else {
/*
* This is terribly unsafe without quiescing or shutting
@@ -313,6 +323,7 @@ glusterfs_handle_terminate (rpcsvc_request_t *req)
*
* TBD: finish implementing this "detach" code properly
*/
+ glusterfs_terminate_response_send (req, 0, last_brick);
UNLOCK (&ctx->volfile_lock);
lockflag = _gf_true;
gf_log (THIS->name, GF_LOG_INFO, "detaching not-only"
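The condition (trav_p == &top->children) && !(*trav_p)->next, which now merely sets last_brick instead of triggering a self-kill, is a pointer-to-pointer idiom: the matched xlator occupies the head slot of the child list and has no successor, so it is the only brick left in the process. A generic sketch of that test with plain structs (not the xlator types), for illustration:

```c
/* Editorial sketch with generic types: "found slot is the list head and has
 * no successor" means the list holds exactly one element, i.e. the brick
 * being detached is the last one in the process. */
#include <stdio.h>
#include <string.h>

struct node {
        const char  *name;
        struct node *next;
};

static struct node **
find_slot (struct node **head, const char *name)
{
        struct node **trav_p;

        for (trav_p = head; *trav_p; trav_p = &(*trav_p)->next)
                if (strcmp ((*trav_p)->name, name) == 0)
                        return trav_p;
        return NULL;
}

int
main (void)
{
        struct node b = { "brick-b", NULL };
        struct node a = { "brick-a", &b };
        struct node *children = &a;
        struct node **trav_p;

        trav_p = find_slot (&children, "brick-a");
        /* Two bricks attached: not the last one. */
        printf ("%d\n", trav_p == &children && !(*trav_p)->next);   /* 0 */

        a.next = NULL;          /* brick-b already detached earlier */
        trav_p = find_slot (&children, "brick-a");
        /* Sole remaining brick: the whole process can be torn down. */
        printf ("%d\n", trav_p == &children && !(*trav_p)->next);   /* 1 */

        return 0;
}
```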
diff --git a/tests/bugs/glusterd/stale-brick-proc-brick-mux.t b/tests/bugs/glusterd/stale-brick-proc-brick-mux.t
new file mode 100644
index 00000000000..a3efe273898
--- /dev/null
+++ b/tests/bugs/glusterd/stale-brick-proc-brick-mux.t
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../cluster.rc
+
+function count_brick_processes {
+ pgrep glusterfsd | wc -l
+}
+
+cleanup;
+
+TEST launch_cluster 2
+TEST $CLI_1 peer probe $H2;
+EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count
+
+#bug-1549996 - stale brick processes on the nodes after volume deletion
+
+TEST $CLI_1 volume set all cluster.brick-multiplex on
+TEST $CLI_1 volume create $V0 replica 3 $H1:$B1/${V0}{1..3} $H2:$B2/${V0}{1..3}
+TEST $CLI_1 volume start $V0
+
+TEST $CLI_1 volume create $V1 replica 3 $H1:$B1/${V1}{1..3} $H2:$B2/${V1}{1..3}
+TEST $CLI_1 volume start $V1
+
+EXPECT 2 count_brick_processes
+
+TEST $CLI_1 volume stop $V0
+TEST $CLI_1 volume stop $V1
+
+EXPECT 0 count_brick_processes
+
+cleanup
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
index 98b1aaa63af..8ef285bf48d 100644
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
@@ -557,6 +557,7 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
glusterd_brickinfo_t *brickinfo = NULL;
char pidfile[PATH_MAX] = {0};
char brick_path[PATH_MAX] = {0,};
+ int kill_pid = -1;
this = THIS;
GF_VALIDATE_OR_GOTO ("glusterd", this, fail);
@@ -606,7 +607,8 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
* removed in the brick op phase. This situation would
* arise when the brick is killed explicitly from the
* backend */
- ret = glusterd_brick_process_remove_brick (brickinfo);
+ ret = glusterd_brick_process_remove_brick (brickinfo,
+ &kill_pid);
if (ret) {
gf_msg_debug (this->name, 0, "Couldn't remove "
"brick %s:%s from brick process",
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
index 31b08d76adc..e5d4421deb4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
@@ -958,7 +958,6 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode,
gd1_mgmt_brick_op_req *req = NULL;
int ret = 0;
xlator_t *this = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
this = THIS;
args.op_ret = -1;
@@ -987,22 +986,6 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode,
else
GF_FREE (args.errstr);
}
- if (op == GD_OP_STOP_VOLUME || op == GD_OP_REMOVE_BRICK) {
- if (args.op_ret == 0) {
- brickinfo = pnode->node;
- ret = glusterd_brick_process_remove_brick (brickinfo);
- if (ret) {
- gf_msg ("glusterd", GF_LOG_ERROR, 0,
- GD_MSG_BRICKPROC_REM_BRICK_FAILED,
- "Removing brick %s:%s from brick"
- " process failed",
- brickinfo->hostname,
- brickinfo->path);
- args.op_ret = ret;
- goto out;
- }
- }
- }
if (GD_OP_STATUS_VOLUME == op) {
ret = dict_set_int32 (args.dict, "index", pnode->index);
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index af30756c947..8e71756b927 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -69,6 +69,7 @@
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
+#include <sys/wait.h>
#include <rpc/pmap_clnt.h>
#include <unistd.h>
#include <fnmatch.h>
@@ -2313,7 +2314,8 @@ glusterd_brickprocess_delete (glusterd_brick_proc_t *brick_proc)
}
int
-glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo)
+glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo,
+ int *kill_pid)
{
int ret = -1;
xlator_t *this = NULL;
@@ -2352,6 +2354,7 @@ glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo)
/* If all bricks have been removed, delete the brick process */
if (brick_proc->brick_count == 0) {
+ *kill_pid = 1;
ret = glusterd_brickprocess_delete (brick_proc);
if (ret)
goto out;
@@ -2454,7 +2457,11 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
glusterd_conf_t *conf = NULL;
int ret = -1;
char *op_errstr = NULL;
- char pidfile[PATH_MAX] = {0,};
+ char pidfile_path[PATH_MAX] = {0,};
+ int kill_pid = -1;
+ FILE *pidfile = NULL;
+ pid_t pid = -1;
+ int status = -1;
GF_ASSERT (volinfo);
GF_ASSERT (brickinfo);
@@ -2467,7 +2474,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
ret = 0;
- ret = glusterd_brick_process_remove_brick (brickinfo);
+ ret = glusterd_brick_process_remove_brick (brickinfo, &kill_pid);
if (ret) {
gf_msg_debug (this->name, 0, "Couldn't remove brick from"
" brick process");
@@ -2510,10 +2517,47 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
ret = 0;
}
- GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile_path, volinfo, brickinfo, conf);
+ if (kill_pid == 1 && is_brick_mx_enabled ()) {
+ pidfile = fopen (pidfile_path, "r");
+ if (!pidfile) {
+ gf_msg (this->name, GF_LOG_ERROR, errno,
+ GD_MSG_FILE_OP_FAILED,
+ "Unable to open pidfile: %s", pidfile_path);
+ ret = -1;
+ goto out;
+ }
+
+ ret = fscanf (pidfile, "%d", &pid);
+ if (ret <= 0) {
+ gf_msg (this->name, GF_LOG_ERROR, errno,
+ GD_MSG_FILE_OP_FAILED,
+ "Unable to get pid of brick process");
+ ret = -1;
+ goto out;
+ }
+
+ if (conf->op_version >= GD_OP_VERSION_4_1_0) {
+ while (conf->blockers) {
+ synclock_unlock (&conf->big_lock);
+ sleep (1);
+ synclock_lock (&conf->big_lock);
+ }
+ }
+ gf_log (this->name, GF_LOG_INFO,
+ "terminating the brick process "
+ "%d after loss of last brick %s of the volume %s",
+ pid, brickinfo->path, volinfo->volname);
+ kill (pid, SIGTERM);
+ waitpid (pid, &status, 0);
+ pmap_registry_remove (this, brickinfo->port, brickinfo->path,
+ GF_PMAP_PORT_BRICKSERVER, NULL,
+ _gf_true);
+ ret = 0;
+ }
- gf_msg_debug (this->name, 0, "Unlinking pidfile %s", pidfile);
- (void) sys_unlink (pidfile);
+ gf_msg_debug (this->name, 0, "Unlinking pidfile %s", pidfile_path);
+ (void) sys_unlink (pidfile_path);
brickinfo->status = GF_BRICK_STOPPED;
brickinfo->start_triggered = _gf_false;
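With the syncop-path bookkeeping removed, glusterd_volume_stop_glusterfs() now looks up the brick pid itself: it opens the pidfile (hence renaming the local buffer to pidfile_path), parses the pid, waits for conf->blockers to drain on op-version >= 4.1.0, and only then sends SIGTERM, reaps the process, and drops the pmap entry. The sketch below covers only the pidfile parsing and validation; the path comes from argv for illustration, and the kill/waitpid step is the one shown in the diff above.

```c
/* Editorial sketch: parse and validate a brick pidfile the way the new
 * last-brick path does before terminating the process.  Only the parsing
 * is shown; the SIGTERM/waitpid step appears in the diff above. */
#include <stdio.h>

int
main (int argc, char *argv[])
{
        FILE *pidfile = NULL;
        int   pid     = -1;

        if (argc != 2) {
                fprintf (stderr, "usage: %s <pidfile>\n", argv[0]);
                return 1;
        }

        pidfile = fopen (argv[1], "r");
        if (!pidfile) {
                perror ("fopen");
                return 1;
        }

        /* fscanf returns the number of items converted; anything other
         * than 1 means the pidfile held no usable pid. */
        if (fscanf (pidfile, "%d", &pid) != 1 || pid <= 0) {
                fprintf (stderr, "no usable pid in %s\n", argv[1]);
                fclose (pidfile);
                return 1;
        }
        fclose (pidfile);

        printf ("brick process pid: %d\n", pid);
        return 0;
}
```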
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
index 0e9e54a0687..8118c994974 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -182,7 +182,8 @@ glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo,
glusterd_volinfo_t *volinfo);
int
-glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo);
+glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo,
+ int *kill_pid);
int
glusterd_brick_proc_for_port (int port, glusterd_brick_proc_t **brickprocess);