glusterd: Block brick attach request till the brick's ctx is set

Problem: In multiplexing setup in a container environment we hit a race where before the first brick finishes its handshake with glusterd, the subsequent attach requests went through and they actually failed and glusterd has no mechanism to realize it. This resulted into all the such bricks not to be active resulting into clients not able to connect. Solution: Introduce a new flag port_registered in glusterd_brickinfo to make sure about pmap_signin finish before the subsequent attach bricks can be processed. Test: To reproduce the issue followed below steps 1) Create 100 volumes on 3 nodes(1x3) in CNS environment 2) Enable brick multiplexing 3) Reboot one container 4) Run below command for v in ‛gluster v list‛ do glfsheal $v | grep -i "transport" done After apply the patch command should not fail. Note: A big thanks to Atin for suggest the fix. >Reviewed-on: https://review.gluster.org/17984 >Reviewed-by: Atin Mukherjee <amukherj@redhat.com> >Smoke: Gluster Build System <jenkins@build.gluster.org> >CentOS-regression: Gluster Build System <jenkins@build.gluster.org> >Reviewed-by: Jeff Darcy <jeff@pl.atyp.us> >(cherry picked from commit c13d69babc228a2932994962d6ea8afe2cdd620a) BUG: 1479662 Change-Id: I8e1bd6132122b3a5b0dd49606cea564122f2609b Signed-off-by: Mohit Agrawal <moagrawa@redhat.com> Reviewed-on: https://review.gluster.org/18004 Tested-by: Atin Mukherjee <amukherj@redhat.com> Smoke: Gluster Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
author: Mohit Agrawal <moagrawa@redhat.com> 2017-08-08 14:36:17 +0530
committer: Shyamsundar Ranganathan <srangana@redhat.com> 2017-08-12 13:35:26 +0000
commit: d66af9ac76f84faa33ecb2eb390656f5637e6fee (patch)
tree: 863ba719f58b58284e73b28cf46ca4be73bd2e58
parent: fe8fe3effb70b8c9bb0c675b867114a9b0139d0a (diff)
3 files changed, 57 insertions, 24 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
index 1fc7a250748..2a754769c95 100644
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
@@ -502,6 +502,9 @@ __gluster_pmap_signin (rpcsvc_request_t *req)
                                          GF_PMAP_PORT_BRICKSERVER, req->trans);
 
         ret = glusterd_get_brickinfo (THIS, args.brick, args.port, &brickinfo);
+        /* Update portmap status in brickinfo */
+        if (brickinfo)
+                brickinfo->port_registered = _gf_true;
 
 fail:
         glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
@@ -554,6 +557,10 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
                                 brick_path, GF_PMAP_PORT_BRICKSERVER,
                                 req->trans);
         }
+        /* Update portmap status on brickinfo */
+        if (brickinfo)
+                brickinfo->port_registered = _gf_false;
+
         /* Clean up the pidfile for this brick given glusterfsd doesn't clean it
          * any more. This is required to ensure we don't end up with having
          * stale pid files in case a brick is killed from the backend
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 9dcb4f78089..e0f166f1d4a 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -2102,6 +2102,7 @@ retry:
         brickinfo->port = port;
         brickinfo->rdma_port = rdma_port;
         brickinfo->status = GF_BRICK_STARTING;
+        brickinfo->port_registered = _gf_false;
 
         if (wait) {
                 synclock_unlock (&priv->big_lock);
@@ -5385,13 +5386,6 @@ attach_brick (xlator_t *this,
 
         GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, unslashed);
 
-        ret = pmap_registry_extend (this, other_brick->port,
-                                    brickinfo->path);
-        if (ret != 0) {
-                gf_log (this->name, GF_LOG_ERROR,
-                        "adding brick to process failed");
-                return -1;
-        }
         GLUSTERD_GET_BRICK_PIDFILE (pidfile1, other_vol, other_brick, conf);
         GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, brickinfo, conf);
 
@@ -5414,6 +5408,14 @@ attach_brick (xlator_t *this,
                                                GLUSTERD_BRICK_ATTACH);
                         rpc_clnt_unref (rpc);
                         if (!ret) {
+                                ret = pmap_registry_extend (this, other_brick->port,
+                                                            brickinfo->path);
+                                if (ret != 0) {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "adding brick to process failed");
+                                        return ret;
+                                }
+
                                 /* PID file is copied once brick has attached
                                   successfully
                                 */
@@ -5422,6 +5424,7 @@ attach_brick (xlator_t *this,
                                 brickinfo->status = GF_BRICK_STARTED;
                                 brickinfo->rpc =
                                         rpc_clnt_ref (other_brick->rpc);
+                                brickinfo->port_registered = _gf_true;
                                 ret = glusterd_brick_process_add_brick (brickinfo,
                                                                         volinfo);
                                 if (ret) {
@@ -5475,6 +5478,7 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
         int16_t                 retries                 = 15;
         int                     mux_limit               = -1;
         int                     ret                     = -1;
+        gf_boolean_t            brick_status            = _gf_false;
 
         /*
          * If comp_vol is provided, we have to check *volume* compatibility
@@ -5549,31 +5553,45 @@ find_compat_brick_in_vol (glusterd_conf_t *conf,
                                 "set. Continuing with no limit set for "
                                 "brick multiplexing.");
                 }
-
-                GLUSTERD_GET_BRICK_PIDFILE (pidfile2, srch_vol, other_brick,
-                                            conf);
-
-                /* It is possible that the pidfile hasn't yet been populated,
-                 * when bricks are started in "no-wait" mode; for example
-                 * when bricks are started by glusterd_restart_bricks(). So
-                 * wait for the pidfile to be populated with a value before
-                 * checking if the service is running */
+                /* The first brick process might take some time to finish its
+                 * handshake with glusterd and prepare the graph. We can't
+                 * afford to send attach_req for other bricks till that time.
+                 * brick process sends PMAP_SIGNIN event after processing the
+                 * volfile and hence it's safe to assume that if glusterd has
+                 * received a pmap signin request for the same brick, we are
+                 * good for subsequent attach requests.
+                 */
+                retries = 15;
                 while (retries > 0) {
-                        if (sys_access (pidfile2, F_OK) == 0 &&
-                            gf_is_service_running (pidfile2, &pid2)) {
-                                break;
+                        if (other_brick->port_registered) {
+                                GLUSTERD_GET_BRICK_PIDFILE (pidfile2, srch_vol,
+                                                            other_brick, conf);
+                                if (sys_access (pidfile2, F_OK) == 0 &&
+                                    gf_is_service_running (pidfile2, &pid2)) {
+                                        gf_msg_debug (this->name, 0,
+                                                      "brick %s is running as a pid %d ",
+                                                      other_brick->path, pid2);
+                                        brick_status = _gf_true;
+                                        break;
+                                }
                         }
 
-                        sleep (1);
+                        synclock_unlock (&conf->big_lock);
+                        gf_msg_debug (this->name, 0, "brick %s is still"
+                                      " starting, waiting for 2 seconds ",
+                                      other_brick->path);
+                        sleep(2);
+                        synclock_lock (&conf->big_lock);
                         retries--;
                 }
 
-                if (retries == 0) {
+                if (!brick_status) {
                         gf_log (this->name, GF_LOG_INFO,
-                                "cleaning up dead brick %s:%s",
+                                "brick has not come up so cleaning up dead brick %s:%s",
                                 other_brick->hostname, other_brick->path);
                         other_brick->status = GF_BRICK_STOPPED;
-                        sys_unlink (pidfile2);
+                        if (pidfile2[0])
+                                sys_unlink (pidfile2);
                         continue;
                 }
                 return other_brick;
@@ -5849,6 +5867,12 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                                         brickinfo->path);
                                 goto out;
                         }
+                        /* We need to set the status back to STARTING so that
+                         * while the other (re)start brick requests come in for
+                         * other bricks, this brick can be considered as
+                         * compatible.
+                         */
+                        brickinfo->status = GF_BRICK_STARTING;
                 }
                 return 0;
         }
@@ -5892,7 +5916,8 @@ run:
          *
          * TBD: pray for GlusterD 2 to be ready soon.
          */
-
+        gf_log (this->name, GF_LOG_INFO, "starting a fresh brick process for "
+                "brick %s", brickinfo->path);
         ret = glusterd_volume_start_glusterfs (volinfo, brickinfo, wait);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index 12a0c392d95..dd1ff7a66be 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -238,6 +238,7 @@ struct glusterd_brickinfo {
            the backend filesystem */
         uint64_t           statfs_fsid;
         uint32_t           fs_share_count;
+        gf_boolean_t       port_registered;
 };
 
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
author	Mohit Agrawal <moagrawa@redhat.com>	2017-08-08 14:36:17 +0530
committer	Shyamsundar Ranganathan <srangana@redhat.com>	2017-08-12 13:35:26 +0000
commit	d66af9ac76f84faa33ecb2eb390656f5637e6fee (patch)
tree	863ba719f58b58284e73b28cf46ca4be73bd2e58
parent	fe8fe3effb70b8c9bb0c675b867114a9b0139d0a (diff)