From 3f9851db49ca6ac7a969817964a6ad216b10fd6f Mon Sep 17 00:00:00 2001 From: Sanju Rakonde Date: Thu, 29 Mar 2018 10:48:32 +0000 Subject: Revert "glusterd: handling brick termination in brick-mux" This reverts commit a60fc2ddc03134fb23c5ed5c0bcb195e1649416b. This commit was causing multiple tests to time out when brick multiplexing is enabled. With further debugging, it's found that even though the volume stop transaction is converted into mgmt_v3 to allow the remote nodes to follow the synctask framework to process the command, there are other callers of glusterd_brick_stop () which are not synctask based. Change-Id: I7aee687abc6bfeaa70c7447031f55ed4ccd64693 updates: bz#1545048 --- glusterfsd/src/gf_attach.c | 41 ++--------------- glusterfsd/src/glusterfsd-mgmt.c | 33 +++++--------- tests/bugs/glusterd/stale-brick-proc-brick-mux.t | 32 -------------- xlators/mgmt/glusterd/src/glusterd-pmap.c | 4 +- xlators/mgmt/glusterd/src/glusterd-syncop.c | 17 +++++++ xlators/mgmt/glusterd/src/glusterd-utils.c | 56 +++--------------------- xlators/mgmt/glusterd/src/glusterd-utils.h | 3 +- 7 files changed, 39 insertions(+), 147 deletions(-) delete mode 100644 tests/bugs/glusterd/stale-brick-proc-brick-mux.t diff --git a/glusterfsd/src/gf_attach.c b/glusterfsd/src/gf_attach.c index 0eb4868263b..3f248292ddf 100644 --- a/glusterfsd/src/gf_attach.c +++ b/glusterfsd/src/gf_attach.c @@ -11,9 +11,6 @@ #include #include #include -#include -#include -#include //#include "config.h" #include "glusterfs.h" @@ -26,7 +23,6 @@ int done = 0; int rpc_status; -glfs_t *fs; struct rpc_clnt_procedure gf_attach_actors[GLUSTERD_BRICK_MAXVALUE] = { [GLUSTERD_BRICK_NULL] = {"NULL", NULL }, @@ -75,43 +71,11 @@ my_notify (struct rpc_clnt *rpc, void *mydata, } int32_t -my_callback (struct rpc_req *req, struct iovec *iov, int count, void *v_frame) +my_callback (struct rpc_req *req, struct iovec *iov, int count, void *frame) { - gd1_mgmt_brick_op_rsp rsp; - dict_t *dict = NULL; - pid_t pid = -1; - int ret = -1; - xlator_t *this = NULL; - - this = fs->ctx->master; - memset (&rsp, 0, sizeof (rsp)); - - ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_brick_op_rsp); - - if (ret < 0) { - fprintf (stderr, "xdr decoding failed\n"); - goto out; - } - GF_PROTOCOL_DICT_UNSERIALIZE (this, dict, - (rsp.output.output_val), - (rsp.output.output_len), - ret, rsp.op_errno, out); - if (dict) { - if (dict_get_int32 (dict, "last_brick_terminated", &pid) == 0) { - int status = 0; - - gf_log ("gf_attach", GF_LOG_INFO, "Killing %d", pid); - kill (pid, SIGTERM); - waitpid (pid, &status, 0); - } - dict_unref (dict); - } - rpc_status = req->rpc_status; done = 1; - ret = 0; -out: - return ret; + return 0; } /* copied from gd_syncop_submit_request */ @@ -206,6 +170,7 @@ usage (char *prog) int main (int argc, char *argv[]) { + glfs_t *fs; struct rpc_clnt *rpc; dict_t *options; int ret; diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c index c4df275077f..d2b39494e51 100644 --- a/glusterfsd/src/glusterfsd-mgmt.c +++ b/glusterfsd/src/glusterfsd-mgmt.c @@ -159,31 +159,21 @@ out: } int -glusterfs_terminate_response_send (rpcsvc_request_t *req, int op_ret, - gf_boolean_t last_brick) +glusterfs_terminate_response_send (rpcsvc_request_t *req, int op_ret) { gd1_mgmt_brick_op_rsp rsp = {0,}; dict_t *dict = NULL; - int ret = -1; + int ret = 0; rsp.op_ret = op_ret; rsp.op_errno = 0; rsp.op_errstr = ""; dict = dict_new (); - if (dict) { - /* Setting the last_brick_terminated key in dictionary is - * required to for standalone 
gf_attach utility to work. - * gf_attach utility will receive this dictionary and kill - * the process. - */ - if (last_brick) { - ret = dict_set_int32 (dict, "last_brick_terminated", - getpid()); - } + if (dict) ret = dict_allocate_and_serialize (dict, &rsp.output.output_val, &rsp.output.output_len); - } + if (ret == 0) ret = glusterfs_submit_reply (req, &rsp, NULL, 0, NULL, @@ -272,7 +262,6 @@ glusterfs_handle_terminate (rpcsvc_request_t *req) xlator_t *victim = NULL; xlator_list_t **trav_p = NULL; gf_boolean_t lockflag = _gf_false; - gf_boolean_t last_brick = _gf_false; ret = xdr_to_generic (req->msg[0], &xlator_req, (xdrproc_t)xdr_gd1_mgmt_brick_op_req); @@ -305,16 +294,17 @@ glusterfs_handle_terminate (rpcsvc_request_t *req) * make sure it's down and if it's already down that's * good enough. */ - glusterfs_terminate_response_send (req, 0, last_brick); + glusterfs_terminate_response_send (req, 0); goto err; } + glusterfs_terminate_response_send (req, 0); if ((trav_p == &top->children) && !(*trav_p)->next) { - last_brick = _gf_true; - glusterfs_terminate_response_send (req, 0, last_brick); - gf_log (THIS->name, GF_LOG_INFO, "This is last brick of process." - "glusterD will kill the process and takes care of " - "removal of entries from port map register"); + gf_log (THIS->name, GF_LOG_INFO, + "terminating after loss of last child %s", + xlator_req.name); + rpc_clnt_mgmt_pmap_signout (glusterfsd_ctx, xlator_req.name); + kill (getpid(), SIGTERM); } else { /* * This is terribly unsafe without quiescing or shutting @@ -323,7 +313,6 @@ glusterfs_handle_terminate (rpcsvc_request_t *req) * * TBD: finish implementing this "detach" code properly */ - glusterfs_terminate_response_send (req, 0, last_brick); UNLOCK (&ctx->volfile_lock); lockflag = _gf_true; gf_log (THIS->name, GF_LOG_INFO, "detaching not-only" diff --git a/tests/bugs/glusterd/stale-brick-proc-brick-mux.t b/tests/bugs/glusterd/stale-brick-proc-brick-mux.t deleted file mode 100644 index a3efe273898..00000000000 --- a/tests/bugs/glusterd/stale-brick-proc-brick-mux.t +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../cluster.rc - -function count_brick_processes { - pgrep glusterfsd | wc -l -} - -cleanup; - -TEST launch_cluster 2 -TEST $CLI_1 peer probe $H2; -EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count - -#bug-1549996 - stale brick processes on the nodes after volume deletion - -TEST $CLI_1 volume set all cluster.brick-multiplex on -TEST $CLI_1 volume create $V0 replica 3 $H1:$B1/${V0}{1..3} $H2:$B2/${V0}{1..3} -TEST $CLI_1 volume start $V0 - -TEST $CLI_1 volume create $V1 replica 3 $H1:$B1/${V1}{1..3} $H2:$B2/${V1}{1..3} -TEST $CLI_1 volume start $V1 - -EXPECT 2 count_brick_processes - -TEST $CLI_1 volume stop $V0 -TEST $CLI_1 volume stop $V1 - -EXPECT 0 count_brick_processes - -cleanup diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c index 8ef285bf48d..98b1aaa63af 100644 --- a/xlators/mgmt/glusterd/src/glusterd-pmap.c +++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c @@ -557,7 +557,6 @@ __gluster_pmap_signout (rpcsvc_request_t *req) glusterd_brickinfo_t *brickinfo = NULL; char pidfile[PATH_MAX] = {0}; char brick_path[PATH_MAX] = {0,}; - int kill_pid = -1; this = THIS; GF_VALIDATE_OR_GOTO ("glusterd", this, fail); @@ -607,8 +606,7 @@ __gluster_pmap_signout (rpcsvc_request_t *req) * removed in the brick op phase. 
This situation would * arise when the brick is killed explicitly from the * backend */ - ret = glusterd_brick_process_remove_brick (brickinfo, - &kill_pid); + ret = glusterd_brick_process_remove_brick (brickinfo); if (ret) { gf_msg_debug (this->name, 0, "Couldn't remove " "brick %s:%s from brick process", diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c index e5d4421deb4..31b08d76adc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-syncop.c +++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c @@ -958,6 +958,7 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode, gd1_mgmt_brick_op_req *req = NULL; int ret = 0; xlator_t *this = NULL; + glusterd_brickinfo_t *brickinfo = NULL; this = THIS; args.op_ret = -1; @@ -986,6 +987,22 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode, else GF_FREE (args.errstr); } + if (op == GD_OP_STOP_VOLUME || op == GD_OP_REMOVE_BRICK) { + if (args.op_ret == 0) { + brickinfo = pnode->node; + ret = glusterd_brick_process_remove_brick (brickinfo); + if (ret) { + gf_msg ("glusterd", GF_LOG_ERROR, 0, + GD_MSG_BRICKPROC_REM_BRICK_FAILED, + "Removing brick %s:%s from brick" + " process failed", + brickinfo->hostname, + brickinfo->path); + args.op_ret = ret; + goto out; + } + } + } if (GD_OP_STATUS_VOLUME == op) { ret = dict_set_int32 (args.dict, "index", pnode->index); diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 8e71756b927..af30756c947 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -69,7 +69,6 @@ #include #include #include -#include #include #include #include @@ -2314,8 +2313,7 @@ glusterd_brickprocess_delete (glusterd_brick_proc_t *brick_proc) } int -glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo, - int *kill_pid) +glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo) { int ret = -1; xlator_t *this = NULL; @@ -2354,7 +2352,6 @@ glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo, /* If all bricks have been removed, delete the brick process */ if (brick_proc->brick_count == 0) { - *kill_pid = 1; ret = glusterd_brickprocess_delete (brick_proc); if (ret) goto out; @@ -2457,11 +2454,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, glusterd_conf_t *conf = NULL; int ret = -1; char *op_errstr = NULL; - char pidfile_path[PATH_MAX] = {0,}; - int kill_pid = -1; - FILE *pidfile = NULL; - pid_t pid = -1; - int status = -1; + char pidfile[PATH_MAX] = {0,}; GF_ASSERT (volinfo); GF_ASSERT (brickinfo); @@ -2474,7 +2467,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, ret = 0; - ret = glusterd_brick_process_remove_brick (brickinfo, &kill_pid); + ret = glusterd_brick_process_remove_brick (brickinfo); if (ret) { gf_msg_debug (this->name, 0, "Couldn't remove brick from" " brick process"); @@ -2517,47 +2510,10 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, ret = 0; } - GLUSTERD_GET_BRICK_PIDFILE (pidfile_path, volinfo, brickinfo, conf); - if (kill_pid == 1 && is_brick_mx_enabled ()) { - pidfile = fopen (pidfile_path, "r"); - if (!pidfile) { - gf_msg (this->name, GF_LOG_ERROR, errno, - GD_MSG_FILE_OP_FAILED, - "Unable to open pidfile: %s", pidfile_path); - ret = -1; - goto out; - } - - ret = fscanf (pidfile, "%d", &pid); - if (ret <= 0) { - gf_msg (this->name, GF_LOG_ERROR, errno, - GD_MSG_FILE_OP_FAILED, - "Unable to get pid of brick process"); - ret = -1; - 
goto out; - } - - if (conf->op_version >= GD_OP_VERSION_4_1_0) { - while (conf->blockers) { - synclock_unlock (&conf->big_lock); - sleep (1); - synclock_lock (&conf->big_lock); - } - } - gf_log (this->name, GF_LOG_INFO, - "terminating the brick process " - "%d after loss of last brick %s of the volume %s", - pid, brickinfo->path, volinfo->volname); - kill (pid, SIGTERM); - waitpid (pid, &status, 0); - pmap_registry_remove (this, brickinfo->port, brickinfo->path, - GF_PMAP_PORT_BRICKSERVER, NULL, - _gf_true); - ret = 0; - } + GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf); - gf_msg_debug (this->name, 0, "Unlinking pidfile %s", pidfile_path); - (void) sys_unlink (pidfile_path); + gf_msg_debug (this->name, 0, "Unlinking pidfile %s", pidfile); + (void) sys_unlink (pidfile); brickinfo->status = GF_BRICK_STOPPED; brickinfo->start_triggered = _gf_false; diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index 8118c994974..0e9e54a0687 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -182,8 +182,7 @@ glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo, glusterd_volinfo_t *volinfo); int -glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo, - int *kill_pid); +glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo); int glusterd_brick_proc_for_port (int port, glusterd_brick_proc_t **brickprocess); -- cgit
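
For reference, a minimal standalone sketch (plain C, not GlusterFS code; every name and number below is illustrative) of the behaviour this revert restores: the multiplexed brick process signs each detached brick out of the port mapper itself and, once its last child xlator is gone, terminates itself with SIGTERM, instead of glusterd or gf_attach reading the pidfile and killing it from the outside as the reverted commit did.

/*
 * Simplified model of the post-revert flow in glusterfs_handle_terminate():
 * track how many bricks are attached to this process; on detaching the last
 * one, sign out and end the process. Names here are stand-ins, not real APIs.
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static int attached_bricks = 3;   /* pretend three bricks are multiplexed here */

/* stand-in for rpc_clnt_mgmt_pmap_signout(): just log the sign-out */
static void
pmap_signout (const char *brick)
{
        printf ("signing out brick %s from the port mapper\n", brick);
}

static void
detach_brick (const char *brick)
{
        pmap_signout (brick);
        if (--attached_bricks == 0) {
                /* last child gone: the brick process terminates itself,
                 * as glusterfs_handle_terminate() does after this revert */
                printf ("last brick detached, terminating pid %d\n", getpid ());
                fflush (stdout);
                kill (getpid (), SIGTERM);
        } else {
                printf ("%d brick(s) still attached, process stays up\n",
                        attached_bricks);
        }
}

int
main (void)
{
        detach_brick ("/bricks/v0-1");
        detach_brick ("/bricks/v0-2");
        detach_brick ("/bricks/v0-3");   /* triggers self-termination */
        return 0;
}

On the glusterd side, per the restored hunks above, stopping a brick then reduces to glusterd_brick_process_remove_brick() plus unlinking the pidfile, which keeps the non-synctask callers of glusterd_brick_stop () mentioned in the commit message safe.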