summaryrefslogtreecommitdiffstats
path: root/xlators/mgmt/glusterd/src/glusterd-rebalance.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/mgmt/glusterd/src/glusterd-rebalance.c')
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c1134
1 files changed, 1134 insertions, 0 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
new file mode 100644
index 00000000000..35fa4627d04
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -0,0 +1,1134 @@
+/*
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <inttypes.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/statvfs.h>
+
+#include "globals.h"
+#include "compat.h"
+#include "protocol-common.h"
+#include "xlator.h"
+#include "logging.h"
+#include "timer.h"
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-messages.h"
+#include "glusterd-store.h"
+#include "run.h"
+#include "glusterd-volgen.h"
+#include "glusterd-messages.h"
+
+#include "syscall.h"
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+
+int32_t
+glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe);
+int
+glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
+ size_t len, glusterd_op_t op)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ /* Check only if operation is not remove-brick */
+ if ((GD_OP_REMOVE_BRICK != op) &&
+ !gd_is_remove_brick_committed (volinfo)) {
+ gf_msg_debug (this->name, 0, "A remove-brick task on "
+ "volume %s is not yet committed", volinfo->volname);
+ snprintf (op_errstr, len, "A remove-brick task on volume %s is"
+ " not yet committed. Either commit or stop the "
+ "remove-brick task.", volinfo->volname);
+ goto out;
+ }
+
+ if (glusterd_is_defrag_on (volinfo)) {
+ gf_msg_debug (this->name, 0,
+ "rebalance on volume %s already started",
+ volinfo->volname);
+ snprintf (op_errstr, len, "Rebalance on %s is already started",
+ volinfo->volname);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ gf_msg_debug (this->name, 0, "Returning %d", ret);
+ return ret;
+}
+
+
+int32_t
+__glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_defrag_info_t *defrag = NULL;
+ int ret = 0;
+ char pidfile[PATH_MAX];
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ if (!this)
+ return 0;
+
+ priv = this->private;
+ if (!priv)
+ return 0;
+
+ volinfo = mydata;
+ if (!volinfo)
+ return 0;
+
+ defrag = volinfo->rebal.defrag;
+ if (!defrag)
+ return 0;
+
+ if ((event == RPC_CLNT_DISCONNECT) && defrag->connected)
+ volinfo->rebal.defrag = NULL;
+
+ GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+
+ switch (event) {
+ case RPC_CLNT_CONNECT:
+ {
+ if (defrag->connected)
+ return 0;
+
+ LOCK (&defrag->lock);
+ {
+ defrag->connected = 1;
+ }
+ UNLOCK (&defrag->lock);
+
+ gf_msg_debug (this->name, 0, "%s got RPC_CLNT_CONNECT",
+ rpc->conn.name);
+ break;
+ }
+
+ case RPC_CLNT_DISCONNECT:
+ {
+ if (!defrag->connected)
+ return 0;
+
+ LOCK (&defrag->lock);
+ {
+ defrag->connected = 0;
+ }
+ UNLOCK (&defrag->lock);
+
+ if (!gf_is_service_running (pidfile, NULL)) {
+ if (volinfo->rebal.defrag_status ==
+ GF_DEFRAG_STATUS_STARTED) {
+ volinfo->rebal.defrag_status =
+ GF_DEFRAG_STATUS_FAILED;
+ }
+ }
+
+ glusterd_store_perform_node_state_store (volinfo);
+
+ rpc_clnt_reconnect_cleanup (&defrag->rpc->conn);
+ glusterd_defrag_rpc_put (defrag);
+ if (defrag->cbk_fn)
+ defrag->cbk_fn (volinfo,
+ volinfo->rebal.defrag_status);
+
+ GF_FREE (defrag);
+ gf_msg (this->name, GF_LOG_INFO, 0,
+ GD_MSG_REBALANCE_DISCONNECTED,
+ "Rebalance process for volume %s has disconnected.",
+ volinfo->volname);
+ break;
+ }
+ case RPC_CLNT_DESTROY:
+ glusterd_volinfo_unref (volinfo);
+ break;
+ default:
+ gf_msg_trace (this->name, 0,
+ "got some other RPC event %d", event);
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+int32_t
+glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ return glusterd_big_locked_notify (rpc, mydata, event,
+ data, __glusterd_defrag_notify);
+}
+
+int
+glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
+ size_t len, int cmd, defrag_cbk_fn_t cbk,
+ glusterd_op_t op)
+{
+ int ret = -1;
+ glusterd_defrag_info_t *defrag = NULL;
+ runner_t runner = {0,};
+ glusterd_conf_t *priv = NULL;
+ char defrag_path[PATH_MAX];
+ char sockfile[PATH_MAX] = {0,};
+ char pidfile[PATH_MAX] = {0,};
+ char logfile[PATH_MAX] = {0,};
+ char volname[PATH_MAX] = {0,};
+ char valgrind_logfile[PATH_MAX] = {0,};
+ char *volfileserver = NULL;
+
+ priv = THIS->private;
+
+ GF_ASSERT (volinfo);
+ GF_ASSERT (op_errstr);
+
+
+ ret = glusterd_defrag_start_validate (volinfo, op_errstr, len, op);
+ if (ret)
+ goto out;
+ if (!volinfo->rebal.defrag)
+ volinfo->rebal.defrag =
+ GF_CALLOC (1, sizeof (*volinfo->rebal.defrag),
+ gf_gld_mt_defrag_info);
+ if (!volinfo->rebal.defrag)
+ goto out;
+
+ defrag = volinfo->rebal.defrag;
+
+ defrag->cmd = cmd;
+
+ volinfo->rebal.defrag_cmd = cmd;
+ volinfo->rebal.op = op;
+
+ LOCK_INIT (&defrag->lock);
+
+ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+
+ glusterd_volinfo_reset_defrag_stats (volinfo);
+ glusterd_store_perform_node_state_store (volinfo);
+
+ GLUSTERD_GET_DEFRAG_DIR (defrag_path, volinfo, priv);
+ ret = mkdir_p (defrag_path, 0777, _gf_true);
+ if (ret) {
+ gf_msg (THIS->name, GF_LOG_ERROR, errno,
+ GD_MSG_CREATE_DIR_FAILED, "Failed to create "
+ "directory %s", defrag_path);
+ goto out;
+ }
+
+ GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo);
+ GLUSTERD_GET_DEFRAG_PID_FILE (pidfile, volinfo, priv);
+ snprintf (logfile, PATH_MAX, "%s/%s-%s.log",
+ DEFAULT_LOG_FILE_DIRECTORY, volinfo->volname,
+ (cmd == GF_DEFRAG_CMD_START_TIER ? "tier":"rebalance"));
+ runinit (&runner);
+
+ if (priv->valgrind) {
+ snprintf (valgrind_logfile, PATH_MAX,
+ "%s/valgrind-%s-rebalance.log",
+ DEFAULT_LOG_FILE_DIRECTORY,
+ volinfo->volname);
+
+ runner_add_args (&runner, "valgrind", "--leak-check=full",
+ "--trace-children=yes", "--track-origins=yes",
+ NULL);
+ runner_argprintf (&runner, "--log-file=%s", valgrind_logfile);
+ }
+
+ snprintf (volname, sizeof(volname), "rebalance/%s", volinfo->volname);
+
+ if (dict_get_str (THIS->options, "transport.socket.bind-address",
+ &volfileserver) == 0) {
+ /*In the case of running multiple glusterds on a single machine,
+ *we should ensure that log file and unix socket file shouls be
+ *unique in given cluster */
+
+ GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD (sockfile, volinfo,
+ priv);
+ snprintf (logfile, PATH_MAX, "%s/%s-%s-%s.log",
+ DEFAULT_LOG_FILE_DIRECTORY, volinfo->volname,
+ (cmd == GF_DEFRAG_CMD_START_TIER ?
+ "tier":"rebalance"),
+ uuid_utoa(MY_UUID));
+
+ } else {
+ volfileserver = "localhost";
+ }
+
+ runner_add_args (&runner, SBIN_DIR"/glusterfs",
+ "-s", volfileserver, "--volfile-id", volname,
+ "--xlator-option", "*dht.use-readdirp=yes",
+ "--xlator-option", "*dht.lookup-unhashed=yes",
+ "--xlator-option", "*dht.assert-no-child-down=yes",
+ "--xlator-option", "*replicate*.data-self-heal=off",
+ "--xlator-option",
+ "*replicate*.metadata-self-heal=off",
+ "--xlator-option", "*replicate*.entry-self-heal=off",
+ "--xlator-option", "*dht.readdir-optimize=on",
+ NULL);
+
+ if (volinfo->type == GF_CLUSTER_TYPE_TIER) {
+ runner_add_arg (&runner, "--xlator-option");
+ runner_argprintf (&runner,
+ "*tier-dht.xattr-name=trusted.tier.tier-dht");
+ }
+
+ runner_add_arg (&runner, "--xlator-option");
+ runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd);
+ runner_add_arg (&runner, "--xlator-option");
+ runner_argprintf (&runner, "*dht.node-uuid=%s", uuid_utoa(MY_UUID));
+ runner_add_arg (&runner, "--xlator-option");
+ runner_argprintf (&runner, "*dht.commit-hash=%u",
+ volinfo->rebal.commit_hash);
+ runner_add_arg (&runner, "--socket-file");
+ runner_argprintf (&runner, "%s",sockfile);
+ runner_add_arg (&runner, "--pid-file");
+ runner_argprintf (&runner, "%s",pidfile);
+ runner_add_arg (&runner, "-l");
+ runner_argprintf (&runner, logfile);
+ if (volinfo->memory_accounting)
+ runner_add_arg (&runner, "--mem-accounting");
+
+ ret = runner_run_nowait (&runner);
+ if (ret) {
+ gf_msg_debug ("glusterd", 0, "rebalance command failed");
+ goto out;
+ }
+
+ sleep (5);
+
+ ret = glusterd_rebalance_rpc_create (volinfo, _gf_false);
+
+ //FIXME: this cbk is passed as NULL in all occurrences. May be
+ //we never needed it.
+ if (cbk)
+ defrag->cbk_fn = cbk;
+
+out:
+ gf_msg_debug ("glusterd", 0, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_rebalance_defrag_init (glusterd_volinfo_t *volinfo,
+ defrag_cbk_fn_t cbk)
+
+{
+ glusterd_defrag_info_t *defrag = NULL;
+ int ret = -1;
+
+ if (!volinfo->rebal.defrag) {
+ volinfo->rebal.defrag =
+ GF_CALLOC (1, sizeof (*volinfo->rebal.defrag),
+ gf_gld_mt_defrag_info);
+ } else {
+ /*
+ * if defrag variable is already initialized,
+ * we skip the initialization.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ if (!volinfo->rebal.defrag)
+ goto out;
+ defrag = volinfo->rebal.defrag;
+
+ defrag->cmd = volinfo->rebal.defrag_cmd;
+ LOCK_INIT (&defrag->lock);
+ if (cbk)
+ defrag->cbk_fn = cbk;
+ ret = 0;
+out:
+ return ret;
+
+}
+
+int
+glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
+ gf_boolean_t reconnect)
+{
+ dict_t *options = NULL;
+ char sockfile[PATH_MAX] = {0,};
+ int ret = -1;
+ glusterd_defrag_info_t *defrag = volinfo->rebal.defrag;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ struct stat buf = {0,};
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ //rebalance process is not started
+ if (!defrag)
+ goto out;
+
+ //rpc obj for rebalance process already in place.
+ if (glusterd_defrag_rpc_get (defrag)) {
+ ret = 0;
+ glusterd_defrag_rpc_put (defrag);
+ goto out;
+ }
+ GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo);
+ /* If reconnecting check if defrag sockfile exists in the new location
+ * in /var/run/ , if it does not try the old location
+ */
+ if (reconnect) {
+ ret = sys_stat (sockfile, &buf);
+ /* TODO: Remove this once we don't need backward compatibility
+ * with the older path
+ */
+ if (ret && (errno == ENOENT)) {
+ gf_msg (this->name, GF_LOG_WARNING, errno,
+ GD_MSG_FILE_OP_FAILED, "Rebalance sockfile "
+ "%s does not exist. Trying old path.",
+ sockfile);
+ GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD (sockfile, volinfo,
+ priv);
+ ret =sys_stat (sockfile, &buf);
+ if (ret && (ENOENT == errno)) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_REBAL_NO_SOCK_FILE, "Rebalance "
+ "sockfile %s does not exist", sockfile);
+ goto out;
+ }
+ }
+ }
+
+ /* Setting frame-timeout to 10mins (600seconds).
+ * Unix domain sockets ensures that the connection is reliable. The
+ * default timeout of 30mins used for unreliable network connections is
+ * too long for unix domain socket connections.
+ */
+ ret = rpc_transport_unix_options_build (&options, sockfile, 600);
+ if (ret) {
+ gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_UNIX_OP_BUILD_FAIL,
+ "Unix options build failed");
+ goto out;
+ }
+
+ glusterd_volinfo_ref (volinfo);
+ ret = glusterd_rpc_create (&defrag->rpc, options,
+ glusterd_defrag_notify, volinfo);
+ if (ret) {
+ gf_msg (THIS->name, GF_LOG_ERROR, 0, GD_MSG_RPC_CREATE_FAIL,
+ "Glusterd RPC creation failed");
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_rebalance_cmd_validate (int cmd, char *volname,
+ glusterd_volinfo_t **volinfo,
+ char *op_errstr, size_t len)
+{
+ int ret = -1;
+
+ if (glusterd_volinfo_find(volname, volinfo)) {
+ gf_msg ("glusterd", GF_LOG_ERROR, EINVAL,
+ GD_MSG_VOL_NOT_FOUND, "Received rebalance on invalid"
+ " volname %s", volname);
+ snprintf (op_errstr, len, "Volume %s does not exist",
+ volname);
+ goto out;
+ }
+ if ((*volinfo)->brick_count <= (*volinfo)->dist_leaf_count) {
+ gf_msg ("glusterd", GF_LOG_ERROR, 0,
+ GD_MSG_VOL_NOT_DISTRIBUTE, "Volume %s is not a "
+ "distribute type or contains only 1 brick", volname);
+ snprintf (op_errstr, len, "Volume %s is not a distribute "
+ "volume or contains only 1 brick.\n"
+ "Not performing rebalance", volname);
+ goto out;
+ }
+
+ if ((*volinfo)->status != GLUSTERD_STATUS_STARTED) {
+ gf_msg ("glusterd", GF_LOG_ERROR, 0,
+ GD_MSG_VOL_STOPPED, "Received rebalance on stopped"
+ " volname %s", volname);
+ snprintf (op_errstr, len, "Volume %s needs to "
+ "be started to perform rebalance", volname);
+ goto out;
+ }
+
+ ret = glusterd_disallow_op_for_tier (*volinfo, GD_OP_REBALANCE, cmd);
+ if (ret) {
+ gf_msg ("glusterd", GF_LOG_ERROR, 0,
+ GD_MSG_REBALANCE_CMD_IN_TIER_VOL,
+ "Received rebalance command "
+ "on Tier volume %s", volname);
+ snprintf (op_errstr, len, "Rebalance operations are not "
+ "supported on a tiered volume");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ gf_msg_debug ("glusterd", 0, "Returning %d", ret);
+ return ret;
+}
+
+int
+__glusterd_handle_defrag_volume (rpcsvc_request_t *req)
+{
+ int32_t ret = -1;
+ gf_cli_req cli_req = {{0,}};
+ glusterd_conf_t *priv = NULL;
+ dict_t *dict = NULL;
+ char *volname = NULL;
+ gf_cli_defrag_type cmd = 0;
+ char msg[2048] = {0,};
+ xlator_t *this = NULL;
+
+ GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (cli_req.dict.dict_len) {
+ /* Unserialize the dictionary */
+ dict = dict_new ();
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_DICT_UNSERIALIZE_FAIL, "failed to "
+ "unserialize req-buffer to dictionary");
+ snprintf (msg, sizeof (msg), "Unable to decode the "
+ "command");
+ goto out;
+ }
+ }
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Failed to get volume name");
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_DICT_GET_FAILED, "%s", msg);
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "rebalance-command", (int32_t*)&cmd);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Failed to get command");
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_DICT_GET_FAILED, "%s", msg);
+ goto out;
+ }
+
+ ret = dict_set_static_bin (dict, "node-uuid", MY_UUID, 16);
+ if (ret)
+ goto out;
+
+ if ((cmd == GF_DEFRAG_CMD_STATUS) ||
+ (cmd == GF_DEFRAG_CMD_STATUS_TIER) ||
+ (cmd == GF_DEFRAG_CMD_STOP_DETACH_TIER) ||
+ (cmd == GF_DEFRAG_CMD_STOP) ||
+ (cmd == GF_DEFRAG_CMD_DETACH_STATUS)) {
+ ret = glusterd_op_begin (req, GD_OP_DEFRAG_BRICK_VOLUME,
+ dict, msg, sizeof (msg));
+ } else
+ ret = glusterd_op_begin (req, GD_OP_REBALANCE, dict,
+ msg, sizeof (msg));
+
+out:
+
+ glusterd_friend_sm ();
+ glusterd_op_sm ();
+
+ if (ret) {
+ if (msg[0] == '\0')
+ snprintf (msg, sizeof (msg), "Operation failed");
+ ret = glusterd_op_send_cli_response (GD_OP_REBALANCE, ret, 0,
+ req, dict, msg);
+
+ }
+
+ free (cli_req.dict.dict_val);//malloced by xdr
+
+ return 0;
+}
+
+int
+glusterd_handle_defrag_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_defrag_volume);
+}
+
+static int
+glusterd_brick_validation (dict_t *dict, char *key, data_t *value,
+ void *data)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *volinfo = data;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = glusterd_volume_brickinfo_get_by_brick (value->data, volinfo,
+ &brickinfo,
+ _gf_false);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, EINVAL,
+ GD_MSG_BRICK_NOT_FOUND,
+ "Incorrect brick %s for "
+ "volume %s", value->data, volinfo->volname);
+ return ret;
+ }
+
+ if (!brickinfo->decommissioned) {
+ gf_msg (this->name, GF_LOG_ERROR, EINVAL,
+ GD_MSG_BRICK_NOT_FOUND, "Incorrect brick %s for "
+ "volume %s", value->data, volinfo->volname);
+ ret = -1;
+ return ret;
+ }
+
+ return ret;
+}
+
+int
+glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr)
+{
+ char *volname = NULL;
+ char *cmd_str = NULL;
+ int ret = 0;
+ int32_t cmd = 0;
+ char msg[2048] = {0};
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ char *task_id_str = NULL;
+ dict_t *op_ctx = NULL;
+ xlator_t *this = 0;
+ int32_t is_force = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_msg_debug (this->name, 0, "volname not found");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "rebalance-command", &cmd);
+ if (ret) {
+ gf_msg_debug (this->name, 0, "cmd not found");
+ goto out;
+ }
+
+ ret = glusterd_rebalance_cmd_validate (cmd, volname, &volinfo,
+ msg, sizeof (msg));
+ if (ret) {
+ gf_msg_debug (this->name, 0, "failed to validate");
+ goto out;
+ }
+ switch (cmd) {
+ case GF_DEFRAG_CMD_START_TIER:
+ ret = dict_get_int32 (dict, "force", &is_force);
+ if (ret)
+ is_force = 0;
+
+ if (volinfo->type != GF_CLUSTER_TYPE_TIER) {
+ gf_asprintf (op_errstr, "volume %s is not a tier "
+ "volume.", volinfo->volname);
+ ret = -1;
+ goto out;
+ }
+ if ((!is_force) && glusterd_is_tier_daemon_running (volinfo)) {
+ ret = gf_asprintf (op_errstr, "A Tier daemon is "
+ "already running on volume %s",
+ volname);
+ ret = -1;
+ goto out;
+ }
+ case GF_DEFRAG_CMD_START:
+ case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+ /* Check if the connected clients are all of version
+ * glusterfs-3.6 and higher. This is needed to prevent some data
+ * loss issues that could occur when older clients are connected
+ * when rebalance is run. This check can be bypassed by using
+ * 'force'
+ */
+ ret = glusterd_check_client_op_version_support
+ (volname, GD_OP_VERSION_3_6_0, NULL);
+ if (ret) {
+ ret = gf_asprintf (op_errstr, "Volume %s has one or "
+ "more connected clients of a version"
+ " lower than GlusterFS-v3.6.0. "
+ "Starting rebalance in this state "
+ "could lead to data loss.\nPlease "
+ "disconnect those clients before "
+ "attempting this command again.",
+ volname);
+ goto out;
+ }
+
+ cds_list_for_each_entry (brickinfo, &volinfo->bricks,
+ brick_list) {
+ if (glusterd_is_local_brick (THIS, volinfo, brickinfo)) {
+ if (brickinfo->status != GF_BRICK_STARTED) {
+ gf_asprintf (op_errstr, "Received"
+ " rebalance on volume with "
+ " stopped brick %s",
+ brickinfo->path);
+ ret = -1;
+ goto out;
+ }
+ } else {
+ rcu_read_lock ();
+ peerinfo = glusterd_peerinfo_find_by_uuid
+ (brickinfo->uuid);
+ if (!peerinfo) {
+ gf_asprintf (op_errstr, "Host node %s "
+ "of brick %s doesn't "
+ "belong to cluster",
+ brickinfo->hostname,
+ brickinfo->path);
+ ret = -1;
+ rcu_read_unlock ();
+ goto out;
+ } else if (!peerinfo->connected) {
+ gf_asprintf (op_errstr, "Host node %s "
+ "of brick %s is down",
+ brickinfo->hostname,
+ brickinfo->path);
+ ret = -1;
+ rcu_read_unlock ();
+ goto out;
+ }
+ rcu_read_unlock ();
+ }
+ }
+
+ case GF_DEFRAG_CMD_START_FORCE:
+ if (is_origin_glusterd (dict)) {
+ op_ctx = glusterd_op_get_ctx ();
+ if (!op_ctx) {
+ ret = -1;
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_OPCTX_GET_FAIL,
+ "Failed to get op_ctx");
+ goto out;
+ }
+
+ ret = glusterd_generate_and_set_task_id
+ (op_ctx, GF_REBALANCE_TID_KEY);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_TASKID_GEN_FAIL,
+ "Failed to generate task-id");
+ goto out;
+ }
+ } else {
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ snprintf (msg, sizeof (msg),
+ "Missing rebalance-id");
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ GD_MSG_REBALANCE_ID_MISSING, "%s", msg);
+ ret = 0;
+ }
+ }
+ ret = glusterd_defrag_start_validate (volinfo, msg,
+ sizeof (msg),
+ GD_OP_REBALANCE);
+ if (ret) {
+ gf_msg_debug (this->name, 0,
+ "start validate failed");
+ goto out;
+ }
+ break;
+ case GF_DEFRAG_CMD_STATUS_TIER:
+ case GF_DEFRAG_CMD_STATUS:
+ case GF_DEFRAG_CMD_STOP:
+
+ ret = dict_get_str (dict, "cmd-str", &cmd_str);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_DICT_GET_FAILED, "Failed to get "
+ "command string");
+ ret = -1;
+ goto out;
+ }
+ if ((strstr(cmd_str, "rebalance") != NULL) &&
+ (volinfo->rebal.op != GD_OP_REBALANCE)) {
+ snprintf (msg, sizeof(msg), "Rebalance not started.");
+ ret = -1;
+ goto out;
+ }
+
+ if (strstr(cmd_str, "remove-brick") != NULL) {
+ if (volinfo->rebal.op != GD_OP_REMOVE_BRICK) {
+ snprintf (msg, sizeof(msg), "remove-brick not "
+ "started.");
+ ret = -1;
+ goto out;
+ }
+
+ /* For remove-brick status/stop command check whether
+ * given input brick is part of volume or not.*/
+
+ ret = dict_foreach_fnmatch (dict, "brick*",
+ glusterd_brick_validation,
+ volinfo);
+ if (ret == -1) {
+ snprintf (msg, sizeof (msg), "Incorrect brick"
+ " for volume %s", volinfo->volname);
+ goto out;
+ }
+ }
+ if (cmd == GF_DEFRAG_CMD_STATUS_TIER) {
+ if (volinfo->type != GF_CLUSTER_TYPE_TIER) {
+ snprintf (msg, sizeof(msg), "volume %s is not "
+ "a tier volume.", volinfo->volname);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ break;
+
+ case GF_DEFRAG_CMD_STOP_DETACH_TIER:
+ case GF_DEFRAG_CMD_DETACH_STATUS:
+ if (volinfo->type != GF_CLUSTER_TYPE_TIER) {
+ snprintf (msg, sizeof(msg), "volume %s is not "
+ "a tier volume.", volinfo->volname);
+ ret = -1;
+ goto out;
+ }
+
+ if (volinfo->rebal.op != GD_OP_REMOVE_BRICK) {
+ snprintf (msg, sizeof(msg), "Detach-tier "
+ "not started");
+ ret = -1;
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+ ret = 0;
+out:
+ if (ret && op_errstr && msg[0])
+ *op_errstr = gf_strdup (msg);
+
+ return ret;
+}
+
+int
+glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+ char *volname = NULL;
+ int ret = 0;
+ int32_t cmd = 0;
+ char msg[2048] = {0};
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_brickinfo_t *tmp = NULL;
+ gf_boolean_t volfile_update = _gf_false;
+ char *task_id_str = NULL;
+ dict_t *ctx = NULL;
+ xlator_t *this = NULL;
+ uint32_t commit_hash;
+ int32_t is_force = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_msg_debug (this->name, 0, "volname not given");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "rebalance-command", &cmd);
+ if (ret) {
+ gf_msg_debug (this->name, 0, "command not given");
+ goto out;
+ }
+
+
+ ret = glusterd_rebalance_cmd_validate (cmd, volname, &volinfo,
+ msg, sizeof (msg));
+ if (ret) {
+ gf_msg_debug (this->name, 0, "cmd validate failed");
+ goto out;
+ }
+
+ /* Set task-id, if available, in op_ctx dict for operations other than
+ * start
+ */
+ if (cmd == GF_DEFRAG_CMD_STATUS ||
+ cmd == GF_DEFRAG_CMD_STOP ||
+ cmd == GF_DEFRAG_CMD_STATUS_TIER) {
+ if (!gf_uuid_is_null (volinfo->rebal.rebalance_id)) {
+ ctx = glusterd_op_get_ctx ();
+ if (!ctx) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_OPCTX_GET_FAIL,
+ "Failed to get op_ctx");
+ ret = -1;
+ goto out;
+ }
+
+ if (GD_OP_REMOVE_BRICK == volinfo->rebal.op)
+ ret = glusterd_copy_uuid_to_dict
+ (volinfo->rebal.rebalance_id, ctx,
+ GF_REMOVE_BRICK_TID_KEY);
+ else
+ ret = glusterd_copy_uuid_to_dict
+ (volinfo->rebal.rebalance_id, ctx,
+ GF_REBALANCE_TID_KEY);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_TASKID_GEN_FAIL,
+ "Failed to set task-id");
+ goto out;
+ }
+ }
+ }
+
+ switch (cmd) {
+ case GF_DEFRAG_CMD_START:
+ case GF_DEFRAG_CMD_START_LAYOUT_FIX:
+ case GF_DEFRAG_CMD_START_FORCE:
+ case GF_DEFRAG_CMD_START_TIER:
+
+
+ ret = dict_get_int32 (dict, "force", &is_force);
+ if (ret)
+ is_force = 0;
+ if (!is_force) {
+ /* Reset defrag status to 'NOT STARTED' whenever a
+ * remove-brick/rebalance command is issued to remove
+ * stale information from previous run.
+ */
+ volinfo->rebal.defrag_status =
+ GF_DEFRAG_STATUS_NOT_STARTED;
+
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ gf_msg_debug (this->name, 0, "Missing rebalance"
+ " id");
+ ret = 0;
+ } else {
+ gf_uuid_parse (task_id_str,
+ volinfo->rebal.rebalance_id);
+ volinfo->rebal.op = GD_OP_REBALANCE;
+ }
+ if (!gd_should_i_start_rebalance (volinfo)) {
+ /* Store the rebalance-id and rebalance command
+ * even if the peer isn't starting a rebalance
+ * process. On peers where a rebalance process
+ * is started, glusterd_handle_defrag_start
+ * performs the storing.
+ * Storing this is needed for having
+ * 'volume status' work correctly.
+ */
+ glusterd_store_perform_node_state_store
+ (volinfo);
+ break;
+ }
+ if (dict_get_uint32 (dict, "commit-hash", &commit_hash)
+ == 0) {
+ volinfo->rebal.commit_hash = commit_hash;
+ }
+ ret = glusterd_handle_defrag_start (volinfo, msg,
+ sizeof (msg),
+ cmd, NULL, GD_OP_REBALANCE);
+ break;
+ } else {
+ /* Reset defrag status to 'STARTED' so that the
+ * pid is checked and restarted accordingly.
+ * If the pid is not running it executes the
+ * "NOT_STARTED" case and restarts the process
+ */
+ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
+ volinfo->rebal.defrag_cmd = cmd;
+ volinfo->rebal.op = GD_OP_REBALANCE;
+
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ gf_msg_debug (this->name, 0, "Missing rebalance"
+ " id");
+ ret = 0;
+ } else {
+ gf_uuid_parse (task_id_str,
+ volinfo->rebal.rebalance_id);
+ volinfo->rebal.op = GD_OP_REBALANCE;
+ }
+ if (dict_get_uint32 (dict, "commit-hash", &commit_hash)
+ == 0) {
+ volinfo->rebal.commit_hash = commit_hash;
+ }
+ ret = glusterd_restart_rebalance_for_volume (volinfo);
+ break;
+ }
+ case GF_DEFRAG_CMD_STOP:
+ case GF_DEFRAG_CMD_STOP_DETACH_TIER:
+ /* Clear task-id only on explicitly stopping rebalance.
+ * Also clear the stored operation, so it doesn't cause trouble
+ * with future rebalance/remove-brick starts
+ */
+ gf_uuid_clear (volinfo->rebal.rebalance_id);
+ volinfo->rebal.op = GD_OP_NONE;
+
+ /* Fall back to the old volume file in case of decommission*/
+ cds_list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks,
+ brick_list) {
+ if (!brickinfo->decommissioned)
+ continue;
+ brickinfo->decommissioned = 0;
+ volfile_update = _gf_true;
+ }
+
+ if (volfile_update == _gf_false) {
+ ret = 0;
+ break;
+ }
+
+ ret = glusterd_create_volfiles_and_notify_services (volinfo);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ GD_MSG_VOLFILE_CREATE_FAIL,
+ "failed to create volfiles");
+ goto out;
+ }
+
+ ret = glusterd_store_volinfo (volinfo,
+ GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ GD_MSG_VOLINFO_SET_FAIL,
+ "failed to store volinfo");
+ goto out;
+ }
+
+ if (volinfo->type == GF_CLUSTER_TYPE_TIER &&
+ cmd == GF_OP_CMD_STOP_DETACH_TIER) {
+ glusterd_defrag_info_set (volinfo, dict,
+ GF_DEFRAG_CMD_START_TIER,
+ GF_DEFRAG_CMD_START,
+ GD_OP_REBALANCE);
+ glusterd_restart_rebalance_for_volume (volinfo);
+ }
+
+ ret = 0;
+ break;
+
+ case GF_DEFRAG_CMD_START_DETACH_TIER:
+ case GF_DEFRAG_CMD_STATUS:
+ case GF_DEFRAG_CMD_STATUS_TIER:
+ break;
+ default:
+ break;
+ }
+
+out:
+ if (ret && op_errstr && msg[0])
+ *op_errstr = gf_strdup (msg);
+
+ return ret;
+}
+
+int32_t
+glusterd_defrag_event_notify_handle (dict_t *dict)
+{
+ glusterd_volinfo_t *volinfo = NULL;
+ char *volname = NULL;
+ char *volname_ptr = NULL;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_DICT_GET_FAILED, "Failed to get volname");
+ return ret;
+ }
+
+ volname_ptr = strstr (volname, "rebalance/");
+ if (volname_ptr) {
+ volname_ptr = strchr (volname_ptr, '/');
+ if (!volname_ptr) {
+ ret = -1;
+ goto out;
+ }
+ volname = volname_ptr + 1;
+ } else {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_NO_REBALANCE_PFX_IN_VOLNAME,
+ "volname received (%s) is not prefixed with rebalance.",
+ volname);
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_VOLINFO_GET_FAIL,
+ "Failed to get volinfo for %s"
+ , volname);
+ return ret;
+ }
+
+ ret = glusterd_defrag_volume_status_update (volinfo, dict);
+
+ if (ret)
+ gf_msg (this->name, GF_LOG_ERROR, 0,
+ GD_MSG_DEFRAG_STATUS_UPDATE_FAIL,
+ "Failed to update status");
+
+out:
+ return ret;
+}