From 1a95fc3036db51b82b6a80952f0908bc2019d24a Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Thu, 8 Dec 2016 16:24:15 -0500
Subject: core: run many bricks within one glusterfsd process

This patch adds support for multiple brick translator stacks running
in a single brick server process.  This reduces our per-brick memory usage by
approximately 3x, and our appetite for TCP ports even more.  It also creates
potential to avoid process/thread thrashing, and to improve QoS by scheduling
more carefully across the bricks, but realizing that potential will require
further work.

Multiplexing is controlled by the "cluster.brick-multiplex" global option.  By
default it's off, and bricks are started in separate processes as before.  If
multiplexing is enabled, then *compatible* bricks (mostly those with the same
transport options) will be started in the same process.

Change-Id: I45059454e51d6f4cbb29a4953359c09a408695cb
BUG: 1385758
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://review.gluster.org/14763
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
---
 xlators/mgmt/glusterd/src/glusterd-brick-ops.c     |  12 +-
 xlators/mgmt/glusterd/src/glusterd-handler.c       |  42 +-
 xlators/mgmt/glusterd/src/glusterd-handshake.c     |   3 +-
 xlators/mgmt/glusterd/src/glusterd-messages.h      |  17 +-
 xlators/mgmt/glusterd/src/glusterd-op-sm.c         | 127 ++++-
 xlators/mgmt/glusterd/src/glusterd-op-sm.h         |   3 +-
 xlators/mgmt/glusterd/src/glusterd-pmap.c          | 171 ++++--
 xlators/mgmt/glusterd/src/glusterd-pmap.h          |   3 +-
 xlators/mgmt/glusterd/src/glusterd-rebalance.c     |  51 +-
 xlators/mgmt/glusterd/src/glusterd-replace-brick.c |  27 -
 xlators/mgmt/glusterd/src/glusterd-snapshot.c      |  68 +--
 xlators/mgmt/glusterd/src/glusterd-syncop.c        |  17 +-
 xlators/mgmt/glusterd/src/glusterd-utils.c         | 613 +++++++++++++++++++--
 xlators/mgmt/glusterd/src/glusterd-utils.h         |   6 +
 xlators/mgmt/glusterd/src/glusterd-volgen.c        |   7 +
 xlators/mgmt/glusterd/src/glusterd-volume-ops.c    |   5 +-
 xlators/mgmt/glusterd/src/glusterd-volume-set.c    |   7 +
 xlators/mgmt/glusterd/src/glusterd.h               |  10 +-
 18 files changed, 955 insertions(+), 234 deletions(-)

(limited to 'xlators/mgmt/glusterd')

diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index 938663ba863..c78fbd8345c 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -2905,18 +2905,24 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
                 defrag_cmd = GF_DEFRAG_CMD_START_FORCE;
                 if (cmd == GF_OP_CMD_DETACH_START)
                         defrag_cmd = GF_DEFRAG_CMD_START_DETACH_TIER;
+                /*
+                 * We need to set this *before* we issue commands to the
+                 * bricks, or else we might end up setting it after the bricks
+                 * have responded.  If we fail to send the request(s) we'll
+                 * clear it ourselves because nobody else will.
+                 */
+                volinfo->decommission_in_progress = 1;
                 ret = glusterd_handle_defrag_start
                         (volinfo, err_str, sizeof (err_str),
                          defrag_cmd,
                          glusterd_remove_brick_migrate_cbk, GD_OP_REMOVE_BRICK);
 
-                if (!ret)
-                        volinfo->decommission_in_progress = 1;
-
                 if (ret) {
                         gf_msg (this->name, GF_LOG_ERROR, 0,
                                 GD_MSG_REBALANCE_START_FAIL,
                                 "failed to start the rebalance");
+                        /* TBD: shouldn't we do more than print a message? */
+                        volinfo->decommission_in_progress = 0;
                 }
         } else {
                 if (GLUSTERD_STATUS_STARTED == volinfo->status)
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index 364623317ef..b6f0197aa19 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -3365,7 +3365,8 @@ int
 glusterd_rpc_create (struct rpc_clnt **rpc,
                      dict_t *options,
                      rpc_clnt_notify_t notify_fn,
-                     void *notify_data)
+                     void *notify_data,
+                     gf_boolean_t force)
 {
         struct rpc_clnt         *new_rpc = NULL;
         int                     ret = -1;
@@ -3376,6 +3377,11 @@ glusterd_rpc_create (struct rpc_clnt **rpc,
 
         GF_ASSERT (options);
 
+        if (force && rpc && *rpc) {
+                (void) rpc_clnt_unref (*rpc);
+                *rpc = NULL;
+        }
+
         /* TODO: is 32 enough? or more ? */
         new_rpc = rpc_clnt_new (options, this, this->name, 16);
         if (!new_rpc)
@@ -3531,7 +3537,8 @@ glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
         }
 
         ret = glusterd_rpc_create (&peerinfo->rpc, options,
-                                   glusterd_peer_rpc_notify, peerctx);
+                                   glusterd_peer_rpc_notify, peerctx,
+                                   _gf_false);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
                         GD_MSG_RPC_CREATE_FAIL,
@@ -4638,6 +4645,7 @@ gd_is_global_option (char *opt_key)
         return (strcmp (opt_key, GLUSTERD_SHARED_STORAGE_KEY) == 0 ||
                 strcmp (opt_key, GLUSTERD_QUORUM_RATIO_KEY) == 0 ||
                 strcmp (opt_key, GLUSTERD_GLOBAL_OP_VERSION_KEY) == 0 ||
+                strcmp (opt_key, GLUSTERD_BRICK_MULTIPLEX_KEY) == 0 ||
                 strcmp (opt_key, GLUSTERD_MAX_OP_VERSION_KEY) == 0);
 
 out:
@@ -5308,8 +5316,6 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict)
                                  count, brickinfo->rdma_port);
                         fprintf (fp, "Volume%d.Brick%d.status: %s\n", count_bkp,
                                  count, brickinfo->status ? "Started" : "Stopped");
-                        fprintf (fp, "Volume%d.Brick%d.signedin: %s\n", count_bkp,
-                                 count, brickinfo->signed_in ? "True" : "False");
 
                         /*FIXME: This is a hacky way of figuring out whether a
                          * brick belongs to the hot or cold tier */
@@ -5495,6 +5501,9 @@ __glusterd_handle_get_state (rpcsvc_request_t *req)
         GF_VALIDATE_OR_GOTO (THIS->name, this, out);
         GF_VALIDATE_OR_GOTO (this->name, req, out);
 
+        gf_msg (this->name, GF_LOG_INFO, 0, GD_MSG_DAEMON_STATE_REQ_RCVD,
+                "Received request to get state for glusterd");
+
         ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
         if (ret < 0) {
                 snprintf (err_str, sizeof (err_str), "Failed to decode "
@@ -5525,14 +5534,17 @@ __glusterd_handle_get_state (rpcsvc_request_t *req)
                 }
         }
 
-        gf_msg (this->name, GF_LOG_INFO, 0, GD_MSG_DAEMON_STATE_REQ_RCVD,
-                "Received request to get state for glusterd");
-
         ret = glusterd_get_state (req, dict);
 
 out:
-        if (dict)
+        if (dict && ret) {
+                /*
+                 * When glusterd_to_cli (called from glusterd_get_state)
+                 * succeeds, it frees the dict for us, so this would be a
+                 * double free, but in other cases it's our responsibility.
+                 */
                 dict_unref (dict);
+        }
         return ret;
 }
 
@@ -5658,6 +5670,20 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
 
         case RPC_CLNT_DISCONNECT:
                 rpc_clnt_unset_connected (&rpc->conn);
+                if (rpc != brickinfo->rpc) {
+                        /*
+                         * There used to be a bunch of races in the volume
+                         * start/stop code that could result in us getting here
+                         * and setting the brick status incorrectly.  Many of
+                         * those have been fixed or avoided, but just in case
+                         * any are still left it doesn't hurt to keep the extra
+                         * check and avoid further damage.
+                         */
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "got disconnect from stale rpc on %s",
+                                brickinfo->path);
+                        break;
+                }
                 if (glusterd_is_brick_started (brickinfo)) {
                         gf_msg (this->name, GF_LOG_INFO, 0,
                                 GD_MSG_BRICK_DISCONNECTED,
diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
index c1392734d79..96d39f03007 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
@@ -178,7 +178,7 @@ out:
         return ret;
 }
 
-static size_t
+size_t
 build_volfile_path (char *volume_id, char *path,
                     size_t path_len, char *trusted_str)
 {
@@ -841,6 +841,7 @@ __server_getspec (rpcsvc_request_t *req)
         peerinfo = &req->trans->peerinfo;
 
         volume = args.key;
+
         /* Need to strip leading '/' from volnames. This was introduced to
          * support nfs style mount parameters for native gluster mount
          */
diff --git a/xlators/mgmt/glusterd/src/glusterd-messages.h b/xlators/mgmt/glusterd/src/glusterd-messages.h
index 00de88f4e36..5f1339cb5fd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-messages.h
+++ b/xlators/mgmt/glusterd/src/glusterd-messages.h
@@ -28,7 +28,7 @@
  *       - Append to the list of messages defined, towards the end
  *       - Retain macro naming as glfs_msg_X (for redability across developers)
  * NOTE: Rules for message format modifications
- * 3) Check acorss the code if the message ID macro in question is reused
+ * 3) Check across the code if the message ID macro in question is reused
  *    anywhere. If reused then then the modifications should ensure correctness
  *    everywhere, or needs a new message ID as (1) above was not adhered to. If
  *    not used anywhere, proceed with the required modification.
@@ -41,7 +41,7 @@
 
 #define GLUSTERD_COMP_BASE      GLFS_MSGID_GLUSTERD
 
-#define GLFS_NUM_MESSAGES       595
+#define GLFS_NUM_MESSAGES       597
 
 #define GLFS_MSGID_END          (GLUSTERD_COMP_BASE + GLFS_NUM_MESSAGES + 1)
 /* Messaged with message IDs */
@@ -4817,5 +4817,18 @@
  */
 
 /*------------*/
+
+#define GD_MSG_BRICK_MX_SET_FAIL                   (GLUSTERD_COMP_BASE + 596)
+/*!
+ * @messageid
+ * @diagnosis
+ * @recommendedaction
+ *
+ */
+
+#define GD_MSG_NO_SIG_TO_PID_ZERO                  (GLUSTERD_COMP_BASE + 597)
+
+/*------------*/
+
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
 #endif /* !_GLUSTERD_MESSAGES_H_ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index b24e91a457c..d9b18e00195 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -58,16 +58,27 @@ static int
 glusterd_set_shared_storage (dict_t *dict, char *key, char *value,
                              char **op_errstr);
 
-/* Valid options for all volumes to be listed in the *
- * valid_all_vol_opts table. To add newer options to *
- * all volumes, we can just add more entries to this *
- * table                                             *
+/*
+ * Valid options for all volumes to be listed in the valid_all_vol_opts table.
+ * To add newer options to all volumes, we can just add more entries to this
+ * table.
+ *
+ * It's important that every value have a default, or have a special handler
+ * in glusterd_get_global_options_for_all_vols, or else we might crash there.
  */
 glusterd_all_vol_opts valid_all_vol_opts[] = {
-        { GLUSTERD_QUORUM_RATIO_KEY },
-        { GLUSTERD_SHARED_STORAGE_KEY },
-        { GLUSTERD_GLOBAL_OP_VERSION_KEY },
-        { GLUSTERD_MAX_OP_VERSION_KEY },
+        { GLUSTERD_QUORUM_RATIO_KEY,            "0" },
+        { GLUSTERD_SHARED_STORAGE_KEY,          "disable" },
+        /* This one actually gets filled in dynamically. */
+        { GLUSTERD_GLOBAL_OP_VERSION_KEY,       "BUG_NO_OP_VERSION"},
+        /*
+         * This one should be filled in dynamically, but it didn't used to be
+         * (before the defaults were added here) so the value is unclear.
+         *
+         * TBD: add a dynamic handler to set the appropriate value
+         */
+        { GLUSTERD_MAX_OP_VERSION_KEY,          "BUG_NO_MAX_OP_VERSION"},
+        { GLUSTERD_BRICK_MULTIPLEX_KEY,         "disable"},
         { NULL },
 };
 
@@ -557,7 +568,7 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
                 if (!brick_req)
                         goto out;
                 brick_req->op = GLUSTERD_BRICK_TERMINATE;
-                brick_req->name = "";
+                brick_req->name = brickinfo->path;
                 glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPING);
                 break;
         case GD_OP_PROFILE_VOLUME:
@@ -618,28 +629,13 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
 
                 break;
         case GD_OP_SNAP:
-                brick_req = GF_CALLOC (1, sizeof (*brick_req),
-                                       gf_gld_mt_mop_brick_req_t);
-                if (!brick_req)
-                        goto out;
-
-                brick_req->op = GLUSTERD_BRICK_BARRIER;
-                ret = dict_get_str (dict, "volname", &volname);
-                if (ret)
-                        goto out;
-                brick_req->name = gf_strdup (volname);
-
-                break;
         case GD_OP_BARRIER:
                 brick_req = GF_CALLOC (1, sizeof(*brick_req),
                                        gf_gld_mt_mop_brick_req_t);
                 if (!brick_req)
                         goto out;
                 brick_req->op = GLUSTERD_BRICK_BARRIER;
-                ret = dict_get_str(dict, "volname", &volname);
-                if (ret)
-                        goto out;
-                brick_req->name = gf_strdup (volname);
+                brick_req->name = brickinfo->path;
                 break;
 
         default:
@@ -753,6 +749,17 @@ out:
         return ret;
 }
 
+static int
+glusterd_validate_brick_mx_options (xlator_t *this, char *fullkey, char *value,
+                                    char **op_errstr)
+{
+        int             ret = 0;
+
+        //Placeholder function for now
+
+        return ret;
+}
+
 static int
 glusterd_validate_shared_storage (char *key, char *value, char *errstr)
 {
@@ -1191,6 +1198,11 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
                 if (ret)
                         goto out;
 
+                ret = glusterd_validate_brick_mx_options (this, key, value,
+                                                          op_errstr);
+                if (ret)
+                        goto out;
+
                 local_key_op_version = glusterd_get_op_version_for_key (key);
                 if (local_key_op_version > local_new_op_version)
                         local_new_op_version = local_key_op_version;
@@ -2350,6 +2362,33 @@ out:
         return ret;
 }
 
+static int
+glusterd_set_brick_mx_opts (dict_t *dict, char *key, char *value,
+                            char **op_errstr)
+{
+        int32_t       ret                  = -1;
+        xlator_t     *this                 = NULL;
+        glusterd_conf_t *priv              = NULL;
+
+        this = THIS;
+        GF_VALIDATE_OR_GOTO ("glusterd", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, dict, out);
+        GF_VALIDATE_OR_GOTO (this->name, key, out);
+        GF_VALIDATE_OR_GOTO (this->name, value, out);
+        GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
+
+        ret = 0;
+
+        priv = this->private;
+
+        if (!strcmp (key, GLUSTERD_BRICK_MULTIPLEX_KEY)) {
+                ret = dict_set_dynstr (priv->opts, key, gf_strdup (value));
+        }
+
+out:
+        return ret;
+}
+
 static int
 glusterd_op_set_all_volume_options (xlator_t *this, dict_t *dict,
                                     char **op_errstr)
@@ -2399,6 +2438,14 @@ glusterd_op_set_all_volume_options (xlator_t *this, dict_t *dict,
                 goto out;
         }
 
+        ret = glusterd_set_brick_mx_opts (dict, key, value, op_errstr);
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        GD_MSG_BRICK_MX_SET_FAIL,
+                        "Failed to set brick multiplexing option");
+                goto out;
+        }
+
         /* If the key is cluster.op-version, set conf->op_version to the value
          * if needed and save it.
          */
@@ -2629,6 +2676,7 @@ out:
 }
 
 
+
 static int
 glusterd_op_set_volume (dict_t *dict, char **errstr)
 {
@@ -6094,6 +6142,8 @@ glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr,
         glusterd_volinfo_t                      *volinfo = NULL;
         glusterd_brickinfo_t                    *brickinfo = NULL;
         glusterd_pending_node_t                 *pending_node = NULL;
+        glusterd_conf_t                         *conf = THIS->private;
+        char                                    pidfile[1024];
 
         ret = glusterd_op_stop_volume_args_get (dict, &volname, &flags);
         if (ret)
@@ -6122,6 +6172,18 @@ glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr,
                                                    selected);
                                 pending_node = NULL;
                         }
+                        /*
+                         * This is not really the right place to do it, but
+                         * it's the most convenient.
+                         * TBD: move this to *after* the RPC
+                         */
+                        brickinfo->status = GF_BRICK_STOPPED;
+                        brickinfo->started_here = _gf_false;
+                        GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo,
+                                                    brickinfo, conf);
+                        gf_log (THIS->name, GF_LOG_INFO,
+                                "unlinking pidfile %s", pidfile);
+                        (void) sys_unlink (pidfile);
                 }
         }
 
@@ -6144,7 +6206,8 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr,
         glusterd_pending_node_t                 *pending_node = NULL;
         int32_t                                 command = 0;
         int32_t                                 force = 0;
-
+        glusterd_conf_t                         *conf = THIS->private;
+        char                                    pidfile[1024];
 
         ret = dict_get_str (dict, "volname", &volname);
 
@@ -6218,6 +6281,18 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr,
                                                    selected);
                                 pending_node = NULL;
                         }
+                        /*
+                         * This is not really the right place to do it, but
+                         * it's the most convenient.
+                         * TBD: move this to *after* the RPC
+                         */
+                        brickinfo->status = GF_BRICK_STOPPED;
+                        brickinfo->started_here = _gf_false;
+                        GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo,
+                                                    brickinfo, conf);
+                        gf_log (THIS->name, GF_LOG_INFO,
+                                "unlinking pidfile %s", pidfile);
+                        (void) sys_unlink (pidfile);
                 }
                 i++;
         }
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
index 142f7ba89f7..48275c57e12 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.h
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
@@ -166,7 +166,8 @@ typedef enum cli_cmd_type_ {
  } cli_cmd_type;
 
 typedef struct glusterd_all_volume_options {
-        char          *option;
+        char    *option;
+        char    *dflt_val;
 } glusterd_all_vol_opts;
 
 int
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
index 2c27473f190..2e87ff6ecdf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
@@ -93,25 +93,21 @@ pmap_registry_get (xlator_t *this)
 }
 
 
-static char*
-nextword (char *str)
-{
-        while (*str && !isspace (*str))
-                str++;
-        while (*str && isspace (*str))
-                str++;
-
-        return str;
-}
-
+/*
+ * The "destroy" argument avoids a double search in pmap_registry_remove - one
+ * to find the entry in the table, and the other to find the particular
+ * brickname within that entry (which might cover multiple bricks).  We do the
+ * actual deletion here by "whiting out" the brick name with spaces.  It's up
+ * to pmap_registry_remove to figure out what to do from there.
+ */
 int
 pmap_registry_search (xlator_t *this, const char *brickname,
-                      gf_pmap_port_type_t type)
+                      gf_pmap_port_type_t type, gf_boolean_t destroy)
 {
         struct pmap_registry *pmap = NULL;
         int                   p = 0;
         char                 *brck = NULL;
-        char                 *nbrck = NULL;
+        size_t                i;
 
         pmap = pmap_registry_get (this);
 
@@ -119,13 +115,38 @@ pmap_registry_search (xlator_t *this, const char *brickname,
                 if (!pmap->ports[p].brickname || pmap->ports[p].type != type)
                         continue;
 
-                for (brck = pmap->ports[p].brickname;;) {
-                        nbrck = strtail (brck, brickname);
-                        if (nbrck && (!*nbrck || isspace (*nbrck)))
-                                return p;
-                        brck = nextword (brck);
-                        if (!*brck)
+                brck = pmap->ports[p].brickname;
+                for (;;) {
+                        for (i = 0; brck[i] && !isspace (brck[i]); ++i)
+                                ;
+                        if (!i) {
                                 break;
+                        }
+                        if (strncmp (brck, brickname, i) == 0) {
+                                /*
+                                 * Without this check, we'd break when brck
+                                 * is merely a substring of brickname.
+                                 */
+                                if (brickname[i] == '\0') {
+                                        if (destroy) do {
+                                                *(brck++) = ' ';
+                                        } while (--i);
+                                        return p;
+                                }
+                        }
+                        brck += i;
+                        /*
+                         * Skip over *any* amount of whitespace, including
+                         * none (if we're already at the end of the string).
+                         */
+                        while (isspace (*brck))
+                                ++brck;
+                        /*
+                         * We're either at the end of the string (which will be
+                         * handled above strncmp on the next iteration) or at
+                         * the next non-whitespace substring (which will be
+                         * handled by strncmp itself).
+                         */
                 }
         }
 
@@ -240,8 +261,13 @@ pmap_registry_bind (xlator_t *this, int port, const char *brickname,
 
         p = port;
         pmap->ports[p].type = type;
-        free (pmap->ports[p].brickname);
-        pmap->ports[p].brickname = strdup (brickname);
+        if (pmap->ports[p].brickname) {
+                char *tmp = pmap->ports[p].brickname;
+                asprintf (&pmap->ports[p].brickname, "%s %s", tmp, brickname);
+                free (tmp);
+        } else {
+                pmap->ports[p].brickname = strdup (brickname);
+        }
         pmap->ports[p].type = type;
         pmap->ports[p].xprt = xprt;
 
@@ -255,6 +281,62 @@ out:
         return 0;
 }
 
+int
+pmap_registry_extend (xlator_t *this, int port, const char *brickname)
+{
+        struct pmap_registry *pmap = NULL;
+        char                 *old_bn;
+        char                 *new_bn;
+        size_t               bn_len;
+        char                 *entry;
+        int                  found = 0;
+
+        pmap = pmap_registry_get (this);
+
+        if (port > GF_PORT_MAX) {
+                return -1;
+        }
+
+        switch (pmap->ports[port].type) {
+        case GF_PMAP_PORT_LEASED:
+        case GF_PMAP_PORT_BRICKSERVER:
+                break;
+        default:
+                return -1;
+        }
+
+        old_bn = pmap->ports[port].brickname;
+        if (old_bn) {
+                bn_len = strlen(brickname);
+                entry = strstr (old_bn, brickname);
+                while (entry) {
+                        found = 1;
+                        if ((entry != old_bn) && (entry[-1] != ' ')) {
+                                found = 0;
+                        }
+                        if ((entry[bn_len] != ' ') && (entry[bn_len] != '\0')) {
+                                found = 0;
+                        }
+                        if (found) {
+                                return 0;
+                        }
+                        entry = strstr (entry + bn_len, brickname);
+                }
+                asprintf (&new_bn, "%s %s", old_bn, brickname);
+        } else {
+                new_bn = strdup (brickname);
+        }
+
+        if (!new_bn) {
+                return -1;
+        }
+
+        pmap->ports[port].brickname = new_bn;
+        free (old_bn);
+
+        return 0;
+}
+
 int
 pmap_registry_remove (xlator_t *this, int port, const char *brickname,
                       gf_pmap_port_type_t type, void *xprt)
@@ -262,6 +344,7 @@ pmap_registry_remove (xlator_t *this, int port, const char *brickname,
         struct pmap_registry *pmap = NULL;
         int                   p = 0;
         glusterd_conf_t      *priv = NULL;
+        char                 *brick_str;
 
         priv = this->private;
         pmap = priv->pmap;
@@ -277,7 +360,7 @@ pmap_registry_remove (xlator_t *this, int port, const char *brickname,
         }
 
         if (brickname && strchr (brickname, '/')) {
-                p = pmap_registry_search (this, brickname, type);
+                p = pmap_registry_search (this, brickname, type, _gf_true);
                 if (p)
                         goto remove;
         }
@@ -294,11 +377,29 @@ remove:
                 GD_MSG_BRICK_REMOVE, "removing brick %s on port %d",
                 pmap->ports[p].brickname, p);
 
-        free (pmap->ports[p].brickname);
+        if (xprt && (xprt == pmap->ports[p].xprt)) {
+                pmap->ports[p].xprt = NULL;
+        }
 
-        pmap->ports[p].type = GF_PMAP_PORT_FREE;
-        pmap->ports[p].brickname = NULL;
-        pmap->ports[p].xprt = NULL;
+        /*
+         * This is where we garbage-collect.  If all of the brick names have
+         * been "whited out" by pmap_registry_search(...,destroy=_gf_true) and
+         * there's no xprt either, then we have nothing left worth saving and
+         * can delete the entire entry.
+         */
+        if (!pmap->ports[p].xprt) {
+                brick_str = pmap->ports[p].brickname;
+                if (brick_str) {
+                        while (*brick_str != '\0') {
+                                if (*(brick_str++) != ' ') {
+                                        goto out;
+                                }
+                        }
+                }
+                free (pmap->ports[p].brickname);
+                pmap->ports[p].brickname = NULL;
+                pmap->ports[p].type = GF_PMAP_PORT_FREE;
+        }
 
 out:
         return 0;
@@ -322,7 +423,8 @@ __gluster_pmap_portbybrick (rpcsvc_request_t *req)
 
         brick = args.brick;
 
-        port = pmap_registry_search (THIS, brick, GF_PMAP_PORT_BRICKSERVER);
+        port = pmap_registry_search (THIS, brick, GF_PMAP_PORT_BRICKSERVER,
+                                     _gf_false);
 
         if (!port)
                 rsp.op_ret = -1;
@@ -380,15 +482,6 @@ gluster_pmap_brickbyport (rpcsvc_request_t *req)
 }
 
 
-static int
-glusterd_brick_update_signin (glusterd_brickinfo_t *brickinfo,
-                              gf_boolean_t value)
-{
-        brickinfo->signed_in = value;
-
-        return 0;
-}
-
 int
 __gluster_pmap_signin (rpcsvc_request_t *req)
 {
@@ -413,9 +506,6 @@ fail:
                                (xdrproc_t)xdr_pmap_signin_rsp);
         free (args.brick);//malloced by xdr
 
-        if (!ret)
-                glusterd_brick_update_signin (brickinfo, _gf_true);
-
         return 0;
 }
 
@@ -454,9 +544,6 @@ __gluster_pmap_signout (rpcsvc_request_t *req)
                                 req->trans);
         }
 
-        if (!ret)
-                glusterd_brick_update_signin (brickinfo, _gf_false);
-
 fail:
         glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
                                (xdrproc_t)xdr_pmap_signout_rsp);
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.h b/xlators/mgmt/glusterd/src/glusterd-pmap.h
index 14187daee2b..9965a9577b5 100644
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.h
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.h
@@ -40,10 +40,11 @@ int pmap_mark_port_leased (xlator_t *this, int port);
 int pmap_registry_alloc (xlator_t *this);
 int pmap_registry_bind (xlator_t *this, int port, const char *brickname,
                         gf_pmap_port_type_t type, void *xprt);
+int pmap_registry_extend (xlator_t *this, int port, const char *brickname);
 int pmap_registry_remove (xlator_t *this, int port, const char *brickname,
                           gf_pmap_port_type_t type, void *xprt);
 int pmap_registry_search (xlator_t *this, const char *brickname,
-                          gf_pmap_port_type_t type);
+                          gf_pmap_port_type_t type, gf_boolean_t destroy);
 struct pmap_registry *pmap_registry_get (xlator_t *this);
 
 #endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
index 00b84e076c3..bc6cddea7f7 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -315,7 +315,7 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
 
         sleep (5);
 
-        ret = glusterd_rebalance_rpc_create (volinfo, _gf_false);
+        ret = glusterd_rebalance_rpc_create (volinfo);
 
         //FIXME: this cbk is passed as NULL in all occurrences. May be
         //we never needed it.
@@ -363,8 +363,7 @@ out:
 }
 
 int
-glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
-                               gf_boolean_t reconnect)
+glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo)
 {
         dict_t                  *options = NULL;
         char                     sockfile[PATH_MAX] = {0,};
@@ -383,35 +382,27 @@ glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
         if (!defrag)
                 goto out;
 
-        //rpc obj for rebalance process already in place.
-        if (glusterd_defrag_rpc_get (defrag)) {
-                ret = 0;
-                glusterd_defrag_rpc_put (defrag);
-                goto out;
-        }
         GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo);
-        /* If reconnecting check if defrag sockfile exists in the new location
+        /* Check if defrag sockfile exists in the new location
          * in /var/run/ , if it does not try the old location
          */
-        if (reconnect) {
-                ret = sys_stat (sockfile, &buf);
-                /* TODO: Remove this once we don't need backward compatibility
-                 * with the older path
-                 */
-                if (ret && (errno == ENOENT)) {
-                        gf_msg (this->name, GF_LOG_WARNING, errno,
-                                GD_MSG_FILE_OP_FAILED, "Rebalance sockfile "
-                                "%s does not exist. Trying old path.",
-                                sockfile);
-                        GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD (sockfile, volinfo,
-                                                           priv);
-                        ret =sys_stat (sockfile, &buf);
-                        if (ret && (ENOENT == errno)) {
-                                gf_msg (this->name, GF_LOG_ERROR, 0,
-                                        GD_MSG_REBAL_NO_SOCK_FILE, "Rebalance "
-                                        "sockfile %s does not exist", sockfile);
-                                goto out;
-                        }
+        ret = sys_stat (sockfile, &buf);
+        /* TODO: Remove this once we don't need backward compatibility
+         * with the older path
+         */
+        if (ret && (errno == ENOENT)) {
+                gf_msg (this->name, GF_LOG_WARNING, errno,
+                        GD_MSG_FILE_OP_FAILED, "Rebalance sockfile "
+                        "%s does not exist. Trying old path.",
+                        sockfile);
+                GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD (sockfile, volinfo,
+                                                   priv);
+                ret =sys_stat (sockfile, &buf);
+                if (ret && (ENOENT == errno)) {
+                        gf_msg (this->name, GF_LOG_ERROR, 0,
+                                GD_MSG_REBAL_NO_SOCK_FILE, "Rebalance "
+                                "sockfile %s does not exist", sockfile);
+                        goto out;
                 }
         }
 
@@ -429,7 +420,7 @@ glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
 
         glusterd_volinfo_ref (volinfo);
         ret = glusterd_rpc_create (&defrag->rpc, options,
-                                   glusterd_defrag_notify, volinfo);
+                                   glusterd_defrag_notify, volinfo, _gf_true);
         if (ret) {
                 gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL,
                         "Glusterd RPC creation failed");
diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
index eb1a714bfd5..fb29c6efcfd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
+++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
@@ -326,22 +326,6 @@ out:
         return ret;
 }
 
-static int
-rb_kill_destination_brick (glusterd_volinfo_t *volinfo,
-                           glusterd_brickinfo_t *dst_brickinfo)
-{
-        glusterd_conf_t  *priv               = NULL;
-        char              pidfile[PATH_MAX]  = {0,};
-
-        priv = THIS->private;
-
-        snprintf (pidfile, PATH_MAX, "%s/vols/%s/%s",
-                  priv->workdir, volinfo->volname,
-                  RB_DSTBRICK_PIDFILE);
-
-        return glusterd_service_stop ("brick", pidfile, SIGTERM, _gf_true);
-}
-
 
 int
 glusterd_op_perform_replace_brick (glusterd_volinfo_t  *volinfo,
@@ -526,17 +510,6 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
                 goto out;
         }
 
-        if (gf_is_local_addr (dst_brickinfo->hostname)) {
-                gf_msg_debug (this->name, 0, "I AM THE DESTINATION HOST");
-                ret = rb_kill_destination_brick (volinfo, dst_brickinfo);
-                if (ret) {
-                        gf_msg (this->name, GF_LOG_CRITICAL, 0,
-                                GD_MSG_BRK_CLEANUP_FAIL,
-                                "Unable to cleanup dst brick");
-                        goto out;
-                }
-        }
-
         ret = glusterd_svcs_stop (volinfo);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
index 6a350361998..c75a1011fb3 100644
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
@@ -886,19 +886,6 @@ glusterd_snapshot_restore (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
                         goto out;
                 }
 
-                /* Restore is successful therefore delete the original volume's
-                 * volinfo. If the volinfo is already restored then we should
-                 * delete the backend LVMs */
-                if (!gf_uuid_is_null (parent_volinfo->restored_from_snap)) {
-                        ret = glusterd_lvm_snapshot_remove (rsp_dict,
-                                                            parent_volinfo);
-                        if (ret) {
-                                gf_msg (this->name, GF_LOG_ERROR, 0,
-                                        GD_MSG_LVM_REMOVE_FAILED,
-                                        "Failed to remove LVM backend");
-                        }
-                }
-
                 /* Detach the volinfo from priv->volumes, so that no new
                  * command can ref it any more and then unref it.
                  */
@@ -2847,13 +2834,12 @@ glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol,
 
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_vol, brickinfo, priv);
         if (gf_is_service_running (pidfile, &pid)) {
-                ret = kill (pid, SIGKILL);
-                if (ret && errno != ESRCH) {
-                        gf_msg (this->name, GF_LOG_ERROR, errno,
-                                GD_MSG_PID_KILL_FAIL, "Unable to kill pid "
-                                "%d reason : %s", pid, strerror(errno));
-                        goto out;
-                }
+                int send_attach_req (xlator_t *this, struct rpc_clnt *rpc,
+                                     char *path, int op);
+                (void) send_attach_req (this, brickinfo->rpc,
+                                        brickinfo->path,
+                                        GLUSTERD_BRICK_TERMINATE);
+                brickinfo->status = GF_BRICK_STOPPED;
         }
 
         /* Check if the brick is mounted and then try unmounting the brick */
@@ -2895,13 +2881,28 @@ glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol,
                         "path %s (brick: %s): %s. Retry(%d)", mount_pt,
                         brickinfo->path, strerror (errno), retry_count);
 
-                sleep (1);
+                /*
+                 * This used to be one second, but that wasn't long enough
+                 * to get past the spurious EPERM errors that prevent some
+                 * tests (especially bug-1162462.t) from passing reliably.
+                 *
+                 * TBD: figure out where that garbage is coming from
+                 */
+                sleep (3);
         }
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
                         GD_MSG_UNOUNT_FAILED, "umount failed for "
                         "path %s (brick: %s): %s.", mount_pt,
                         brickinfo->path, strerror (errno));
+                /*
+                 * This is cheating, but necessary until we figure out how to
+                 * shut down a brick within a still-living brick daemon so that
+                 * random translators aren't keeping the mountpoint alive.
+                 *
+                 * TBD: figure out a real solution
+                 */
+                ret = 0;
                 goto out;
         }
 
@@ -7599,20 +7600,21 @@ glusterd_get_single_brick_status (char **op_errstr, dict_t *rsp_dict,
 
                 GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_volinfo,
                                             brickinfo, priv);
-                ret = gf_is_service_running (pidfile, &pid);
 
-                ret = snprintf (key, sizeof (key), "%s.brick%d.pid",
-                                keyprefix, index);
-                if (ret < 0) {
-                        goto out;
-                }
+                if (gf_is_service_running (pidfile, &pid)) {
+                        ret = snprintf (key, sizeof (key), "%s.brick%d.pid",
+                                        keyprefix, index);
+                        if (ret < 0) {
+                                goto out;
+                        }
 
-                ret = dict_set_int32 (rsp_dict, key, pid);
-                if (ret) {
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
-                                GD_MSG_DICT_SET_FAILED,
-                                "Could not save pid %d", pid);
-                        goto out;
+                        ret = dict_set_int32 (rsp_dict, key, pid);
+                        if (ret) {
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
+                                        GD_MSG_DICT_SET_FAILED,
+                                        "Could not save pid %d", pid);
+                                goto out;
+                        }
                 }
         }
 
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
index 970aed2924c..07501f2407d 100644
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
@@ -152,8 +152,6 @@ gd_brick_op_req_free (gd1_mgmt_brick_op_req *req)
         if (!req)
                 return;
 
-        if (strcmp (req->name, "") != 0)
-                GF_FREE (req->name);
         GF_FREE (req->input.input_val);
         GF_FREE (req);
 }
@@ -998,6 +996,21 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode,
                         goto out;
                 }
         }
+
+        if (req->op == GLUSTERD_BRICK_TERMINATE) {
+                if (args.op_ret && (args.op_errno == ENOTCONN)) {
+                        /*
+                         * This is actually OK.  It happens when the target
+                         * brick process exits and we saw the closed connection
+                         * before we read the response.  If we didn't read the
+                         * response quickly enough that's kind of our own
+                         * fault, and the fact that the process exited means
+                         * that our goal of terminating the brick was achieved.
+                         */
+                        args.op_ret = 0;
+                }
+        }
+
         if (args.op_ret == 0)
                 glusterd_handle_node_rsp (dict_out, pnode->node, op,
                                           args.dict, op_ctx, errstr,
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 5f9098f3e9d..5cad58cbb2e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -93,6 +93,30 @@
 #define NLMV4_VERSION       4
 #define NLMV1_VERSION       1
 
+int
+send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op);
+
+static gf_boolean_t
+is_brick_mx_enabled ()
+{
+        char            *value = NULL;
+        int             ret = 0;
+        gf_boolean_t    enabled = _gf_false;
+        xlator_t        *this = NULL;
+        glusterd_conf_t *priv = NULL;
+
+        this = THIS;
+
+        priv = this->private;
+
+        ret = dict_get_str (priv->opts, GLUSTERD_BRICK_MULTIPLEX_KEY, &value);
+
+        if (!ret)
+                ret = gf_string2boolean (value, &enabled);
+
+        return ret ? _gf_false: enabled;
+}
+
 extern struct volopt_map_entry glusterd_volopt_map[];
 extern glusterd_all_vol_opts valid_all_vol_opts[];
 
@@ -1690,8 +1714,6 @@ glusterd_set_brick_socket_filepath (glusterd_volinfo_t *volinfo,
                                     glusterd_brickinfo_t *brickinfo,
                                     char *sockpath, size_t len)
 {
-        char                    export_path[PATH_MAX] = {0,};
-        char                    sock_filepath[PATH_MAX] = {0,};
         char                    volume_dir[PATH_MAX] = {0,};
         xlator_t                *this = NULL;
         glusterd_conf_t         *priv = NULL;
@@ -1706,11 +1728,18 @@ glusterd_set_brick_socket_filepath (glusterd_volinfo_t *volinfo,
         priv = this->private;
 
         GLUSTERD_GET_VOLUME_DIR (volume_dir, volinfo, priv);
-        GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, export_path);
-        snprintf (sock_filepath, PATH_MAX, "%s/run/%s-%s",
-                  volume_dir, brickinfo->hostname, export_path);
+        if (is_brick_mx_enabled ()) {
+                snprintf (sockpath, len, "%s/run/daemon-%s.socket",
+                          volume_dir, brickinfo->hostname);
+        } else {
+                char                    export_path[PATH_MAX] = {0,};
+                char                    sock_filepath[PATH_MAX] = {0,};
+                GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, export_path);
+                snprintf (sock_filepath, PATH_MAX, "%s/run/%s-%s",
+                          volume_dir, brickinfo->hostname, export_path);
 
-        glusterd_set_socket_filepath (sock_filepath, sockpath, len);
+                glusterd_set_socket_filepath (sock_filepath, sockpath, len);
+        }
 }
 
 /* connection happens only if it is not aleady connected,
@@ -1749,7 +1778,7 @@ glusterd_brick_connect (glusterd_volinfo_t  *volinfo,
 
                 ret = glusterd_rpc_create (&rpc, options,
                                            glusterd_brick_rpc_notify,
-                                           brickid);
+                                           brickid, _gf_false);
                 if (ret) {
                         GF_FREE (brickid);
                         goto out;
@@ -1802,6 +1831,8 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t  *volinfo,
         char                    glusterd_uuid[1024] = {0,};
         char                    valgrind_logfile[PATH_MAX] = {0};
         char                    rdma_brick_path[PATH_MAX] = {0,};
+        struct rpc_clnt         *rpc = NULL;
+        rpc_clnt_connection_t   *conn  = NULL;
 
         GF_ASSERT (volinfo);
         GF_ASSERT (brickinfo);
@@ -1823,16 +1854,33 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t  *volinfo,
                 goto out;
         }
 
-        ret = _mk_rundir_p (volinfo);
-        if (ret)
-                goto out;
+        GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
+        if (gf_is_service_running (pidfile, NULL)) {
+                goto connect;
+        }
 
+        /*
+         * There are all sorts of races in the start/stop code that could leave
+         * a UNIX-domain socket or RPC-client object associated with a
+         * long-dead incarnation of this brick, while the new incarnation is
+         * listening on a new socket at the same path and wondering why we
+         * haven't shown up.  To avoid the whole mess and be on the safe side,
+         * we just blow away anything that might have been left over, and start
+         * over again.
+         */
         glusterd_set_brick_socket_filepath (volinfo, brickinfo, socketpath,
                                             sizeof (socketpath));
-
-        GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
-        if (gf_is_service_running (pidfile, NULL))
-                goto connect;
+        (void) glusterd_unlink_file (socketpath);
+        rpc = brickinfo->rpc;
+        if (rpc) {
+                brickinfo->rpc = NULL;
+                conn = &rpc->conn;
+                if (conn->reconnect) {
+                        (void ) gf_timer_call_cancel (rpc->ctx, conn->reconnect);
+                        //rpc_clnt_unref (rpc);
+                }
+                rpc_clnt_unref (rpc);
+        }
 
         port = pmap_assign_port (THIS, brickinfo->port, brickinfo->path);
 
@@ -1933,6 +1981,7 @@ retry:
 
         brickinfo->port = port;
         brickinfo->rdma_port = rdma_port;
+        brickinfo->started_here = _gf_true;
 
         if (wait) {
                 synclock_unlock (&priv->big_lock);
@@ -1978,6 +2027,7 @@ connect:
                         brickinfo->hostname, brickinfo->path, socketpath);
                 goto out;
         }
+
 out:
         return ret;
 }
@@ -2035,9 +2085,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t  *volinfo,
                                 gf_boolean_t del_brick)
 {
         xlator_t        *this                   = NULL;
-        glusterd_conf_t *priv                   = NULL;
-        char            pidfile[PATH_MAX]       = {0,};
         int             ret                     = 0;
+        char            *op_errstr              = NULL;
 
         GF_ASSERT (volinfo);
         GF_ASSERT (brickinfo);
@@ -2045,18 +2094,32 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t  *volinfo,
         this = THIS;
         GF_ASSERT (this);
 
-        priv = this->private;
         if (del_brick)
                 cds_list_del_init (&brickinfo->brick_list);
 
         if (GLUSTERD_STATUS_STARTED == volinfo->status) {
-                (void) glusterd_brick_disconnect (brickinfo);
-                GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
-                ret = glusterd_service_stop ("brick", pidfile, SIGTERM, _gf_false);
-                if (ret == 0) {
-                        glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
-                        (void) glusterd_brick_unlink_socket_file (volinfo, brickinfo);
+                /*
+                 * In a post-multiplexing world, even if we're not actually
+                 * doing any multiplexing, just dropping the RPC connection
+                 * isn't enough.  There might be many such connections during
+                 * the brick daemon's lifetime, even if we only consider the
+                 * management RPC port (because tests etc. might be manually
+                 * attaching and detaching bricks).  Therefore, we have to send
+                 * an actual signal instead.
+                 */
+                if (is_brick_mx_enabled ()) {
+                        (void) send_attach_req (this, brickinfo->rpc,
+                                                brickinfo->path,
+                                                GLUSTERD_BRICK_TERMINATE);
+                } else {
+                        (void) glusterd_brick_terminate (volinfo, brickinfo,
+                                                         NULL, 0, &op_errstr);
+                        if (op_errstr) {
+                                GF_FREE (op_errstr);
+                        }
+                        (void) glusterd_brick_disconnect (brickinfo);
                 }
+                ret = 0;
         }
 
         if (del_brick)
@@ -4843,16 +4906,350 @@ out:
         return ret;
 }
 
+static int32_t
+my_callback (struct rpc_req *req, struct iovec *iov, int count, void *v_frame)
+{
+        call_frame_t    *frame  = v_frame;
+
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+int
+send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path, int op)
+{
+        int            ret      = -1;
+        struct iobuf  *iobuf    = NULL;
+        struct iobref *iobref   = NULL;
+        struct iovec   iov      = {0, };
+        ssize_t        req_size = 0;
+        call_frame_t  *frame    = NULL;
+        gd1_mgmt_brick_op_req   brick_req;
+        void                    *req = &brick_req;
+        void          *errlbl   = &&err;
+        extern struct rpc_clnt_program gd_brick_prog;
+
+        if (!rpc) {
+                gf_log (this->name, GF_LOG_ERROR, "called with null rpc");
+                return -1;
+        }
+
+        brick_req.op = op;
+        brick_req.name = path;
+        brick_req.input.input_val = NULL;
+        brick_req.input.input_len = 0;
+
+        req_size = xdr_sizeof ((xdrproc_t)xdr_gd1_mgmt_brick_op_req, req);
+        iobuf = iobuf_get2 (rpc->ctx->iobuf_pool, req_size);
+        if (!iobuf) {
+                goto *errlbl;
+        }
+        errlbl = &&maybe_free_iobuf;
+
+        iov.iov_base = iobuf->ptr;
+        iov.iov_len  = iobuf_pagesize (iobuf);
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                goto *errlbl;
+        }
+        errlbl = &&free_iobref;
+
+        frame = create_frame (this, this->ctx->pool);
+        if (!frame) {
+                goto *errlbl;
+        }
+
+        iobref_add (iobref, iobuf);
+        /*
+         * Drop our reference to the iobuf.  The iobref should already have
+         * one after iobref_add, so when we unref that we'll free the iobuf as
+         * well.  This allows us to pass just the iobref as frame->local.
+         */
+        iobuf_unref (iobuf);
+        /* Set the pointer to null so we don't free it on a later error. */
+        iobuf = NULL;
+
+        /* Create the xdr payload */
+        ret = xdr_serialize_generic (iov, req,
+                                     (xdrproc_t)xdr_gd1_mgmt_brick_op_req);
+        if (ret == -1) {
+                goto *errlbl;
+        }
+
+        iov.iov_len = ret;
+
+        /* Send the msg */
+        ret = rpc_clnt_submit (rpc, &gd_brick_prog, op,
+                               my_callback, &iov, 1, NULL, 0, iobref, frame,
+                               NULL, 0, NULL, 0, NULL);
+        return ret;
+
+free_iobref:
+        iobref_unref (iobref);
+maybe_free_iobuf:
+        if (iobuf) {
+                iobuf_unref (iobuf);
+        }
+err:
+        return -1;
+}
+
+extern size_t
+build_volfile_path (char *volume_id, char *path,
+                    size_t path_len, char *trusted_str);
+
+
+static int
+attach_brick (xlator_t *this,
+              glusterd_brickinfo_t *brickinfo,
+              glusterd_brickinfo_t *other_brick,
+              glusterd_volinfo_t *volinfo,
+              glusterd_volinfo_t *other_vol)
+{
+        glusterd_conf_t *conf                   = this->private;
+        char            pidfile1[PATH_MAX]      = {0};
+        char            pidfile2[PATH_MAX]      = {0};
+        char            unslashed[PATH_MAX]     = {'\0',};
+        char            full_id[PATH_MAX]       = {'\0',};
+        char            path[PATH_MAX]          = {'\0',};
+        int             ret;
+
+        gf_log (this->name, GF_LOG_INFO,
+                "add brick %s to existing process for %s",
+                brickinfo->path, other_brick->path);
+
+        GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, unslashed);
+
+        ret = pmap_registry_extend (this, other_brick->port,
+                                    brickinfo->path);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "adding brick to process failed");
+                return -1;
+        }
+
+        brickinfo->port = other_brick->port;
+        brickinfo->status = GF_BRICK_STARTED;
+        brickinfo->started_here = _gf_true;
+        brickinfo->rpc = rpc_clnt_ref (other_brick->rpc);
+
+        GLUSTERD_GET_BRICK_PIDFILE (pidfile1, other_vol, other_brick, conf);
+        GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, brickinfo, conf);
+        (void) sys_unlink (pidfile2);
+        (void) sys_link (pidfile1, pidfile2);
+
+        if (volinfo->is_snap_volume) {
+                snprintf (full_id, sizeof(full_id), "/%s/%s/%s.%s.%s",
+                          GLUSTERD_VOL_SNAP_DIR_PREFIX,
+                          volinfo->snapshot->snapname,
+                          volinfo->volname, brickinfo->hostname, unslashed);
+        } else {
+                snprintf (full_id, sizeof(full_id), "%s.%s.%s",
+                          volinfo->volname, brickinfo->hostname, unslashed);
+        }
+        (void) build_volfile_path (full_id, path, sizeof(path), NULL);
+
+        int tries = 0;
+        while (tries++ <= 10) {
+                ret = send_attach_req (this, other_brick->rpc, path,
+                                       GLUSTERD_BRICK_ATTACH);
+                if (!ret) {
+                        return 0;
+                }
+                /*
+                 * It might not actually be safe to manipulate the lock like
+                 * this, but if we don't then the connection can never actually
+                 * complete and retries are useless.  Unfortunately, all of the
+                 * alternatives (e.g. doing all of this in a separate thread)
+                 * are much more complicated and risky.  TBD: see if there's a
+                 * better way
+                 */
+                synclock_unlock (&conf->big_lock);
+                sleep (1);
+                synclock_lock (&conf->big_lock);
+        }
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "attach failed for %s", brickinfo->path);
+        return ret;
+}
+
+static glusterd_brickinfo_t *
+find_compatible_brick_in_volume (glusterd_conf_t *conf,
+                                 glusterd_volinfo_t *volinfo,
+                                 glusterd_brickinfo_t *brickinfo)
+{
+        xlator_t                *this                   = THIS;
+        glusterd_brickinfo_t    *other_brick;
+        char                    pidfile2[PATH_MAX]      = {0};
+        int32_t                 pid2                    = -1;
+
+        cds_list_for_each_entry (other_brick, &volinfo->bricks,
+                                 brick_list) {
+                if (other_brick == brickinfo) {
+                        continue;
+                }
+                if (!other_brick->started_here) {
+                        continue;
+                }
+                if (strcmp (brickinfo->hostname, other_brick->hostname) != 0) {
+                        continue;
+                }
+                GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, other_brick,
+                                            conf);
+                if (!gf_is_service_running (pidfile2, &pid2)) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "cleaning up dead brick %s:%s",
+                                other_brick->hostname, other_brick->path);
+                        other_brick->started_here = _gf_false;
+                        sys_unlink (pidfile2);
+                        continue;
+                }
+                return other_brick;
+        }
+
+        return NULL;
+}
+
+static gf_boolean_t
+unsafe_option (dict_t *this, char *key, data_t *value, void *arg)
+{
+        /*
+         * Certain options are safe because they're already being handled other
+         * ways, such as being copied down to the bricks (all auth options) or
+         * being made irrelevant (event-threads).  All others are suspect and
+         * must be checked in the next function.
+         */
+        if (fnmatch ("*auth*", key, 0) == 0) {
+                return _gf_false;
+        }
+
+        if (fnmatch ("*event-threads", key, 0) == 0) {
+                return _gf_false;
+        }
+
+        return _gf_true;
+}
+
+static int
+opts_mismatch (dict_t *dict1, char *key, data_t *value1, void *dict2)
+{
+        data_t  *value2         = dict_get (dict2, key);
+        int32_t min_len;
+
+        /*
+         * If the option is only present on one, we can either look at the
+         * default or assume a mismatch.  Looking at the default is pretty
+         * hard, because that's part of a structure within each translator and
+         * there's no dlopen interface to get at it, so we assume a mismatch.
+         * If the user really wants them to match (and for their bricks to be
+         * multiplexed, they can always reset the option).
+         */
+        if (!value2) {
+                gf_log (THIS->name, GF_LOG_DEBUG, "missing option %s", key);
+                return -1;
+        }
+
+        min_len = MIN (value1->len, value2->len);
+        if (strncmp (value1->data, value2->data, min_len) != 0) {
+                gf_log (THIS->name, GF_LOG_DEBUG,
+                        "option mismatch, %s, %s != %s",
+                        key, value1->data, value2->data);
+                return -1;
+        }
+
+        return 0;
+}
+
+static glusterd_brickinfo_t *
+find_compatible_brick (glusterd_conf_t *conf,
+                       glusterd_volinfo_t *volinfo,
+                       glusterd_brickinfo_t *brickinfo,
+                       glusterd_volinfo_t **other_vol_p)
+{
+        glusterd_brickinfo_t    *other_brick;
+        glusterd_volinfo_t      *other_vol;
+
+        /* Just return NULL here if multiplexing is disabled. */
+        if (!is_brick_mx_enabled ()) {
+                return NULL;
+        }
+
+        other_brick = find_compatible_brick_in_volume (conf, volinfo,
+                                                       brickinfo);
+        if (other_brick) {
+                *other_vol_p = volinfo;
+                return other_brick;
+        }
+
+        cds_list_for_each_entry (other_vol, &conf->volumes, vol_list) {
+                if (other_vol == volinfo) {
+                        continue;
+                }
+                if (volinfo->is_snap_volume) {
+                        /*
+                         * Snap volumes do have different options than their
+                         * parents, but are nonetheless generally compatible.
+                         * Skip the option comparison for now, until we figure
+                         * out how to handle this (e.g. compare at the brick
+                         * level instead of the volume level for this case).
+                         *
+                         * TBD: figure out compatibility for snap bricks
+                         */
+                        goto no_opt_compare;
+                }
+                /*
+                 * It's kind of a shame that we have to do this check in both
+                 * directions, but an option might only exist on one of the two
+                 * dictionaries and dict_foreach_match will only find that one.
+                 */
+                gf_log (THIS->name, GF_LOG_DEBUG,
+                        "comparing options for %s and %s",
+                        volinfo->volname, other_vol->volname);
+                if (dict_foreach_match (volinfo->dict, unsafe_option, NULL,
+                                        opts_mismatch, other_vol->dict) < 0) {
+                        gf_log (THIS->name, GF_LOG_DEBUG, "failure forward");
+                        continue;
+                }
+                if (dict_foreach_match (other_vol->dict, unsafe_option, NULL,
+                                        opts_mismatch, volinfo->dict) < 0) {
+                        gf_log (THIS->name, GF_LOG_DEBUG, "failure backward");
+                        continue;
+                }
+                gf_log (THIS->name, GF_LOG_DEBUG, "all options match");
+no_opt_compare:
+                other_brick = find_compatible_brick_in_volume (conf,
+                                                               other_vol,
+                                                               brickinfo);
+                if (other_brick) {
+                        *other_vol_p = other_vol;
+                        return other_brick;
+                }
+        }
+
+        return NULL;
+}
+
 int
 glusterd_brick_start (glusterd_volinfo_t *volinfo,
                       glusterd_brickinfo_t *brickinfo,
                       gf_boolean_t wait)
 {
-        int                                     ret   = -1;
-        xlator_t                                *this = NULL;
+        int                     ret   = -1;
+        xlator_t                *this = NULL;
+        glusterd_brickinfo_t    *other_brick;
+        glusterd_conf_t         *conf = NULL;
+        int32_t                 pid                   = -1;
+        char                    pidfile[PATH_MAX]     = {0};
+        FILE                    *fp;
+        char                    socketpath[PATH_MAX]  = {0};
+        glusterd_volinfo_t      *other_vol;
 
         this = THIS;
         GF_ASSERT (this);
+        conf = this->private;
 
         if ((!brickinfo) || (!volinfo))
                 goto out;
@@ -4876,6 +5273,77 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
                 ret = 0;
                 goto out;
         }
+
+        GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
+        if (gf_is_service_running (pidfile, &pid)) {
+                /*
+                 * In general, if the pidfile exists and points to a running
+                 * process, this will already be set.  However, that's not the
+                 * case when we're starting up and bricks are already running.
+                 */
+                if (brickinfo->status != GF_BRICK_STARTED) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "discovered already-running brick %s",
+                                brickinfo->path);
+                        //brickinfo->status = GF_BRICK_STARTED;
+                        (void) pmap_registry_bind (this,
+                                        brickinfo->port, brickinfo->path,
+                                        GF_PMAP_PORT_BRICKSERVER, NULL);
+                        /*
+                         * This will unfortunately result in a separate RPC
+                         * connection per brick, even though they're all in
+                         * the same process.  It works, but it would be nicer
+                         * if we could find a pre-existing connection to that
+                         * same port (on another brick) and re-use that.
+                         * TBD: re-use RPC connection across bricks
+                         */
+                        glusterd_set_brick_socket_filepath (volinfo, brickinfo,
+                                        socketpath, sizeof (socketpath));
+                        (void) glusterd_brick_connect (volinfo, brickinfo,
+                                        socketpath);
+                }
+                return 0;
+        }
+
+        ret = _mk_rundir_p (volinfo);
+        if (ret)
+                goto out;
+
+        other_brick = find_compatible_brick (conf, volinfo, brickinfo,
+                                             &other_vol);
+        if (other_brick) {
+                ret = attach_brick (this, brickinfo, other_brick,
+                                    volinfo, other_vol);
+                if (ret == 0) {
+                        goto out;
+                }
+        }
+
+        /*
+         * This hack is necessary because our brick-process management is a
+         * total nightmare.  We expect a brick process's socket and pid files
+         * to be ready *immediately* after we start it.  Ditto for it calling
+         * back to bind its port.  Unfortunately, none of that is realistic.
+         * Any process takes non-zero time to start up.  This has *always* been
+         * racy and unsafe; it just became more visible with multiplexing.
+         *
+         * The right fix would be to do all of this setup *in the parent*,
+         * which would include (among other things) getting the PID back from
+         * the "runner" code.  That's all prohibitively difficult and risky.
+         * To work around the more immediate problems, we create a stub pidfile
+         * here to let gf_is_service_running know that we expect the process to
+         * be there shortly, and then it gets filled in with a real PID when
+         * the process does finish starting up.
+         *
+         * TBD: pray for GlusterD 2 to be ready soon.
+         */
+        (void) sys_unlink (pidfile);
+        fp = fopen (pidfile, "w+");
+        if (fp) {
+                (void) fprintf (fp, "0\n");
+                (void) fclose (fp);
+        }
+
         ret = glusterd_volume_start_glusterfs (volinfo, brickinfo, wait);
         if (ret) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -5813,11 +6281,12 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
         if (ret)
                 goto out;
 
-
         GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
 
         if (glusterd_is_brick_started (brickinfo)) {
-                brick_online = gf_is_service_running (pidfile, &pid);
+                if (gf_is_service_running (pidfile, &pid)) {
+                        brick_online = _gf_true;
+                }
         }
 
         memset (key, 0, sizeof (key));
@@ -6880,10 +7349,12 @@ out:
         return ret;
 }
 
-int
-glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
-                          glusterd_brickinfo_t *brickinfo,
-                          char *options, int option_cnt, char **op_errstr)
+
+static int
+glusterd_brick_signal (glusterd_volinfo_t *volinfo,
+                       glusterd_brickinfo_t *brickinfo,
+                       char *options, int option_cnt, char **op_errstr,
+                       int sig)
 {
         int                     ret = -1;
         xlator_t                *this = NULL;
@@ -6916,6 +7387,7 @@ glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
 
         GLUSTERD_GET_BRICK_PIDFILE (pidfile_path, volinfo, brickinfo, conf);
 
+        /* TBD: use gf_is_service_running instead of almost-identical code? */
         pidfile = fopen (pidfile_path, "r");
         if (!pidfile) {
                 gf_msg ("glusterd", GF_LOG_ERROR, errno,
@@ -6934,24 +7406,35 @@ glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
                 goto out;
         }
 
-        snprintf (dumpoptions_path, sizeof (dumpoptions_path),
-                  DEFAULT_VAR_RUN_DIRECTORY"/glusterdump.%d.options", pid);
-        ret = glusterd_set_dump_options (dumpoptions_path, options, option_cnt);
-        if (ret < 0) {
-                gf_msg ("glusterd", GF_LOG_ERROR, 0,
-                       GD_MSG_BRK_STATEDUMP_FAIL,
-                       "error while parsing the statedump "
-                        "options");
-                ret = -1;
+        if (pid == 0) {
+                gf_msg ("glusterd", GF_LOG_WARNING, 0,
+                        GD_MSG_NO_SIG_TO_PID_ZERO,
+                        "refusing to send signal %d to pid zero", sig);
                 goto out;
         }
 
+        if (sig == SIGUSR1) {
+                snprintf (dumpoptions_path, sizeof (dumpoptions_path),
+                          DEFAULT_VAR_RUN_DIRECTORY"/glusterdump.%d.options",
+                          pid);
+                ret = glusterd_set_dump_options (dumpoptions_path, options,
+                                                 option_cnt);
+                if (ret < 0) {
+                        gf_msg ("glusterd", GF_LOG_ERROR, 0,
+                               GD_MSG_BRK_STATEDUMP_FAIL,
+                               "error while parsing the statedump "
+                                "options");
+                        ret = -1;
+                        goto out;
+                }
+        }
+
         gf_msg ("glusterd", GF_LOG_INFO, 0,
                 GD_MSG_STATEDUMP_INFO,
-                "Performing statedump on brick with pid %d",
-                pid);
+                "sending signal %d to brick with pid %d",
+                sig, pid);
 
-        kill (pid, SIGUSR1);
+        kill (pid, sig);
 
         sleep (1);
         ret = 0;
@@ -6962,6 +7445,26 @@ out:
         return ret;
 }
 
+int
+glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
+                          glusterd_brickinfo_t *brickinfo,
+                          char *options, int option_cnt, char **op_errstr)
+{
+        return glusterd_brick_signal (volinfo, brickinfo,
+                                      options, option_cnt, op_errstr,
+                                      SIGUSR1);
+}
+
+int
+glusterd_brick_terminate (glusterd_volinfo_t *volinfo,
+                          glusterd_brickinfo_t *brickinfo,
+                          char *options, int option_cnt, char **op_errstr)
+{
+        return glusterd_brick_signal (volinfo, brickinfo,
+                                      options, option_cnt, op_errstr,
+                                      SIGTERM);
+}
+
 int
 glusterd_nfs_statedump (char *options, int option_cnt, char **op_errstr)
 {
@@ -7446,7 +7949,7 @@ glusterd_volume_defrag_restart (glusterd_volinfo_t *volinfo, char *op_errstr,
                                           "volume=%s", volinfo->volname);
                                 goto out;
                         }
-                        ret = glusterd_rebalance_rpc_create (volinfo, _gf_true);
+                        ret = glusterd_rebalance_rpc_create (volinfo);
                         break;
                 }
         case GF_DEFRAG_STATUS_NOT_STARTED:
@@ -7978,9 +8481,10 @@ glusterd_to_cli (rpcsvc_request_t *req, gf_cli_rsp *arg, struct iovec *payload,
 
         glusterd_submit_reply (req, arg, payload, payloadcount, iobref,
                                (xdrproc_t) xdrproc);
-        if (dict)
-                dict_unref (dict);
 
+        if (dict) {
+                dict_unref (dict);
+        }
         return ret;
 }
 
@@ -11356,6 +11860,7 @@ glusterd_get_global_options_for_all_vols (rpcsvc_request_t *req, dict_t *ctx,
         char                    *allvolopt = NULL;
         int32_t                 i = 0;
         gf_boolean_t            exists = _gf_false;
+        gf_boolean_t            need_free;
 
         this = THIS;
         GF_VALIDATE_OR_GOTO (THIS->name, this, out);
@@ -11414,13 +11919,16 @@ glusterd_get_global_options_for_all_vols (rpcsvc_request_t *req, dict_t *ctx,
                 ret = dict_get_str (priv->opts, allvolopt, &def_val);
 
                 /* If global option isn't set explicitly */
+
+                need_free = _gf_false;
                 if (!def_val) {
-                        if (!strcmp (allvolopt, GLUSTERD_GLOBAL_OP_VERSION_KEY))
+                        if (!strcmp (allvolopt,
+                                     GLUSTERD_GLOBAL_OP_VERSION_KEY)) {
                                 gf_asprintf (&def_val, "%d", priv->op_version);
-                        else if (!strcmp (allvolopt, GLUSTERD_QUORUM_RATIO_KEY))
-                                gf_asprintf (&def_val, "%d", 0);
-                        else if (!strcmp (allvolopt, GLUSTERD_SHARED_STORAGE_KEY))
-                                gf_asprintf (&def_val, "%s", "disable");
+                                need_free = _gf_true;
+                        } else {
+                                def_val = valid_all_vol_opts[i].dflt_val;
+                        }
                 }
 
                 count++;
@@ -11443,6 +11951,9 @@ glusterd_get_global_options_for_all_vols (rpcsvc_request_t *req, dict_t *ctx,
                         goto out;
                 }
 
+                if (need_free) {
+                        GF_FREE (def_val);
+                }
                 def_val = NULL;
                 allvolopt = NULL;
 
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
index e801c1a03a3..a9aefb85246 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -386,6 +386,12 @@ int
 glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
                           glusterd_brickinfo_t *brickinfo,
                           char *options, int option_cnt, char **op_errstr);
+
+int
+glusterd_brick_terminate (glusterd_volinfo_t *volinfo,
+                          glusterd_brickinfo_t *brickinfo,
+                          char *options, int option_cnt, char **op_errstr);
+
 int
 glusterd_nfs_statedump (char *options, int option_cnt, char **op_errstr);
 
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index f5ddef4755d..957bbfcee25 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -1516,6 +1516,8 @@ brick_graph_add_posix (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
 out:
         return ret;
 }
+
+#if 0
 static int
 brick_graph_add_trash (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
                         dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
@@ -1538,6 +1540,7 @@ brick_graph_add_trash (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
 out:
         return ret;
 }
+#endif
 
 static int
 brick_graph_add_decompounder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
@@ -2456,7 +2459,11 @@ static volgen_brick_xlator_t server_graph_table[] = {
         {brick_graph_add_changetimerecorder, "changetimerecorder"},
 #endif
         {brick_graph_add_bd, "bd"},
+        /*
+         * TBD: Figure out why trash breaks multiplexing.  AFAICT it should fail
+         * the same way already.
         {brick_graph_add_trash, "trash"},
+         */
         {brick_graph_add_arbiter, "arbiter"},
         {brick_graph_add_posix, "posix"},
 };
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index ecc4f9609c1..ad5fe909578 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -2612,7 +2612,7 @@ glusterd_op_start_volume (dict_t *dict, char **op_errstr)
         }
 
         ret = dict_get_str (conf->opts, GLUSTERD_STORE_KEY_GANESHA_GLOBAL, &str);
-        if (ret == -1) {
+        if (ret != 0) {
                 gf_msg (this->name, GF_LOG_INFO, 0,
                         GD_MSG_DICT_GET_FAILED, "Global dict not present.");
                 ret = 0;
@@ -3069,7 +3069,8 @@ glusterd_clearlocks_get_local_client_ports (glusterd_volinfo_t *volinfo,
                                   brickinfo->path);
 
                 port = pmap_registry_search (THIS, brickname,
-                                             GF_PMAP_PORT_BRICKSERVER);
+                                             GF_PMAP_PORT_BRICKSERVER,
+                                             _gf_false);
                 if (!port) {
                         ret = -1;
                         gf_msg_debug (THIS->name, 0, "Couldn't get port "
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 7da0de20291..9f877b6d620 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3145,6 +3145,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .flags       = OPT_FLAG_CLIENT_OPT,
           .op_version  = GD_OP_VERSION_3_9_1,
         },
+
+        /* Brick multiplexing options */
+        { .key         = GLUSTERD_BRICK_MULTIPLEX_KEY,
+          .voltype     = "mgmt/glusterd",
+          .value       = "off",
+          .op_version  = GD_OP_VERSION_3_10_0
+        },
         { .key         = NULL
         }
 };
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index d00e4e20811..f3c7e1d6891 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -54,6 +54,7 @@
                                         "S32gluster_enable_shared_storage.sh"
 #define GLUSTER_SHARED_STORAGE          "gluster_shared_storage"
 #define GLUSTERD_SHARED_STORAGE_KEY     "cluster.enable-shared-storage"
+#define GLUSTERD_BRICK_MULTIPLEX_KEY    "cluster.brick-multiplex"
 
 #define GANESHA_HA_CONF  CONFDIR "/ganesha-ha.conf"
 #define GANESHA_EXPORT_DIRECTORY        CONFDIR"/exports"
@@ -77,7 +78,6 @@
                             "for more details."
 #define OPERRSTR_COMMIT_FAIL "Commit failed on %s. Please check the log file "\
                              "for more details."
-
 struct glusterd_volinfo_;
 typedef struct glusterd_volinfo_ glusterd_volinfo_t;
 
@@ -215,7 +215,6 @@ struct glusterd_brickinfo {
         int                port;
         int                rdma_port;
         char              *logfile;
-        gf_boolean_t       signed_in;
         gf_store_handle_t *shandle;
         gf_brick_status_t  status;
         struct rpc_clnt   *rpc;
@@ -232,6 +231,7 @@ struct glusterd_brickinfo {
          */
         uint16_t           group;
         uuid_t             jbr_uuid;
+        gf_boolean_t       started_here;
 };
 
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
@@ -1048,7 +1048,8 @@ glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
 
 int
 glusterd_rpc_create (struct rpc_clnt **rpc, dict_t *options,
-                     rpc_clnt_notify_t notify_fn, void *notify_data);
+                     rpc_clnt_notify_t notify_fn, void *notify_data,
+                     gf_boolean_t force);
 
 
 /* handler functions */
@@ -1064,8 +1065,7 @@ int glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
                                   size_t len, int cmd, defrag_cbk_fn_t cbk,
                                   glusterd_op_t op);
 int
-glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
-                               gf_boolean_t reconnect);
+glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo);
 
 int glusterd_rebalance_defrag_init (glusterd_volinfo_t *volinfo,
                                     defrag_cbk_fn_t cbk);
-- 
cgit