From 1a95fc3036db51b82b6a80952f0908bc2019d24a Mon Sep 17 00:00:00 2001
From: Jeff Darcy
Date: Thu, 8 Dec 2016 16:24:15 -0500
Subject: core: run many bricks within one glusterfsd process

This patch adds support for multiple brick translator stacks running in a
single brick server process. This reduces our per-brick memory usage by
approximately 3x, and our appetite for TCP ports even more. It also creates
potential to avoid process/thread thrashing, and to improve QoS by scheduling
more carefully across the bricks, but realizing that potential will require
further work.

Multiplexing is controlled by the "cluster.brick-multiplex" global option.
By default it's off, and bricks are started in separate processes as before.
If multiplexing is enabled, then *compatible* bricks (mostly those with the
same transport options) will be started in the same process.

Change-Id: I45059454e51d6f4cbb29a4953359c09a408695cb
BUG: 1385758
Signed-off-by: Jeff Darcy
Reviewed-on: https://review.gluster.org/14763
Smoke: Gluster Build System
NetBSD-regression: NetBSD Build System
CentOS-regression: Gluster Build System
Reviewed-by: Vijay Bellur
---
 xlators/mgmt/glusterd/src/glusterd-handler.c | 42 ++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 8 deletions(-)

(limited to 'xlators/mgmt/glusterd/src/glusterd-handler.c')

diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 364623317ef..b6f0197aa19 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -3365,7 +3365,8 @@ int
 glusterd_rpc_create (struct rpc_clnt **rpc,
                      dict_t *options,
                      rpc_clnt_notify_t notify_fn,
-                     void *notify_data)
+                     void *notify_data,
+                     gf_boolean_t force)
 {
         struct rpc_clnt *new_rpc = NULL;
         int ret = -1;
@@ -3376,6 +3377,11 @@ glusterd_rpc_create (struct rpc_clnt **rpc,
 
         GF_ASSERT (options);
 
+        if (force && rpc && *rpc) {
+                (void) rpc_clnt_unref (*rpc);
+                *rpc = NULL;
+        }
+
         /* TODO: is 32 enough? or more ? */
         new_rpc = rpc_clnt_new (options, this, this->name, 16);
         if (!new_rpc)
@@ -3531,7 +3537,8 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
         }
 
         ret = glusterd_rpc_create (&peerinfo->rpc, options,
                                    glusterd_peer_rpc_notify, peerctx);
+                                   glusterd_peer_rpc_notify, peerctx,
+                                   _gf_false);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
                         GD_MSG_RPC_CREATE_FAIL,
@@ -4638,6 +4645,7 @@ gd_is_global_option (char *opt_key)
         return (strcmp (opt_key, GLUSTERD_SHARED_STORAGE_KEY) == 0 ||
                 strcmp (opt_key, GLUSTERD_QUORUM_RATIO_KEY) == 0 ||
                 strcmp (opt_key, GLUSTERD_GLOBAL_OP_VERSION_KEY) == 0 ||
+                strcmp (opt_key, GLUSTERD_BRICK_MULTIPLEX_KEY) == 0 ||
                 strcmp (opt_key, GLUSTERD_MAX_OP_VERSION_KEY) == 0);
 
 out:
@@ -5308,8 +5316,6 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict)
                                  count, brickinfo->rdma_port);
                         fprintf (fp, "Volume%d.Brick%d.status: %s\n", count_bkp,
                                  count, brickinfo->status ? "Started" : "Stopped");
-                        fprintf (fp, "Volume%d.Brick%d.signedin: %s\n", count_bkp,
-                                 count, brickinfo->signed_in ? "True" : "False");
 
                         /*FIXME: This is a hacky way of figuring out whether a
                          * brick belongs to the hot or cold tier */
@@ -5495,6 +5501,9 @@ __glusterd_handle_get_state (rpcsvc_request_t *req)
         GF_VALIDATE_OR_GOTO (THIS->name, this, out);
         GF_VALIDATE_OR_GOTO (this->name, req, out);
 
+        gf_msg (this->name, GF_LOG_INFO, 0, GD_MSG_DAEMON_STATE_REQ_RCVD,
+                "Received request to get state for glusterd");
+
         ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
         if (ret < 0) {
                 snprintf (err_str, sizeof (err_str), "Failed to decode "
@@ -5525,14 +5534,17 @@ __glusterd_handle_get_state (rpcsvc_request_t *req)
                 }
         }
 
-        gf_msg (this->name, GF_LOG_INFO, 0, GD_MSG_DAEMON_STATE_REQ_RCVD,
-                "Received request to get state for glusterd");
-
         ret = glusterd_get_state (req, dict);
 
 out:
-        if (dict)
+        if (dict && ret) {
+                /*
+                 * When glusterd_to_cli (called from glusterd_get_state)
+                 * succeeds, it frees the dict for us, so this would be a
+                 * double free, but in other cases it's our responsibility.
+                 */
                 dict_unref (dict);
+        }
 
         return ret;
 }
@@ -5658,6 +5670,20 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
 
         case RPC_CLNT_DISCONNECT:
                 rpc_clnt_unset_connected (&rpc->conn);
+                if (rpc != brickinfo->rpc) {
+                        /*
+                         * There used to be a bunch of races in the volume
+                         * start/stop code that could result in us getting here
+                         * and setting the brick status incorrectly. Many of
+                         * those have been fixed or avoided, but just in case
+                         * any are still left it doesn't hurt to keep the extra
+                         * check and avoid further damage.
+                         */
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "got disconnect from stale rpc on %s",
+                                brickinfo->path);
+                        break;
+                }
                 if (glusterd_is_brick_started (brickinfo)) {
                         gf_msg (this->name, GF_LOG_INFO, 0,
                                 GD_MSG_BRICK_DISCONNECTED,
--
cgit