From 33e9f9da8546dc57ecf6b3705f6b6474150ec78c Mon Sep 17 00:00:00 2001 From: shishirng Date: Tue, 6 Mar 2012 18:55:37 +0530 Subject: glusterd/rebalance: Bring in support for parallel rebalance This patch enables rebalance processes to be started on all nodes across which the volume is spread (1 process per node). The node-uuid xattr identifies which node takes ownership of the task to migrate the file. The model employed is push (src pushes to dst). Change-Id: Ieacd46a6216cf6ded841bbaebd10cfaea51c16d6 BUG: 763844 Signed-off-by: shishirng Reviewed-on: http://review.gluster.com/2873 Tested-by: Gluster Build System Reviewed-by: Amar Tumballi Reviewed-by: Vijay Bellur --- cli/src/cli-rpc-ops.c | 10 +++- xlators/cluster/dht/src/dht-common.h | 1 + xlators/cluster/dht/src/dht-rebalance.c | 69 +++++++++++++++++++--- xlators/cluster/dht/src/dht.c | 17 ++++++ xlators/mgmt/glusterd/src/glusterd-op-sm.c | 82 +++++++++++++++++++++++++- xlators/mgmt/glusterd/src/glusterd-rebalance.c | 82 ++++---------------------- xlators/mgmt/glusterd/src/glusterd-rpc-ops.c | 28 ++++++++- xlators/storage/posix/src/posix.c | 2 +- 8 files changed, 204 insertions(+), 87 deletions(-) diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index e436024a98a..977ab0b2a13 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -976,6 +976,7 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, uint64_t size = 0; uint64_t lookup = 0; char msg[1024] = {0,}; + gf_defrag_status_t status_rcd = GF_DEFRAG_STATUS_NOT_STARTED; if (-1 == req->rpc_status) { goto out; @@ -1041,6 +1042,11 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, gf_log (THIS->name, GF_LOG_TRACE, "failed to get lookedup file count"); + ret = dict_get_int32 (dict, "status", (int32_t *)&status_rcd); + if (ret) + gf_log (THIS->name, GF_LOG_TRACE, + "failed to get status"); + if (cmd == GF_DEFRAG_CMD_STOP) { if (rsp.op_ret == -1) { if (strcmp (rsp.op_errstr, "")) @@ -1070,7 +1076,7 @@ 
gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, goto done; } - switch (rsp.op_errno) { + switch (status_rcd) { case GF_DEFRAG_STATUS_NOT_STARTED: status = "not started"; break; @@ -1113,7 +1119,7 @@ done: #if (HAVE_LIB_XML) if (global_state->mode & GLUSTER_MODE_XML) { ret = cli_xml_output_str ("volRebalance", msg, rsp.op_ret, - rsp.op_errno, rsp.op_errstr); + status_rcd, rsp.op_errstr); if (ret) gf_log ("cli", GF_LOG_ERROR, "Error outputting to xml"); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 3d215ab2546..d7689cc7f35 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -210,6 +210,7 @@ struct gf_defrag_info_ { uint32_t is_exiting; pid_t pid; inode_t *root_inode; + uuid_t node_uuid; }; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 64249d0e06a..4c5dd6e99c5 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -320,7 +320,7 @@ out: static inline int __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, - struct iatt *stbuf) + struct iatt *stbuf, int flag) { struct statvfs src_statfs = {0,}; struct statvfs dst_statfs = {0,}; @@ -344,6 +344,12 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, loc->path, to->name, strerror (errno)); goto out; } + + /* if force option is given, do not check for space @ dst. 
+ * Check only if space is avail for the file */ + if (flag != GF_DHT_MIGRATE_DATA) + goto check_avail_space; + if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < (((src_statfs.f_bavail * src_statfs.f_bsize) / @@ -360,6 +366,17 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, goto out; } +check_avail_space: + if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { + gf_log (this->name, GF_LOG_ERROR, + "data movement attempted from node (%s) with " + "to node (%s) which does not have required free space" + " for %s", from->name, to->name, loc->path); + ret = 1; + goto out; + } + ret = 0; out: return ret; @@ -672,12 +689,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, if (ret) goto out; - /* Should happen on all files when 'force' option is not given */ - if (flag == GF_DHT_MIGRATE_DATA) { - ret = __dht_check_free_space (to, from, loc, &stbuf); - if (ret) { - goto out; - } + ret = __dht_check_free_space (to, from, loc, &stbuf, flag); + if (ret) { + goto out; } /* Open the source, and also update mode/xattr */ @@ -1040,6 +1054,8 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, dict_t *dict = NULL; struct iatt iatt = {0,}; int32_t op_errno = 0; + char *uuid_str = NULL; + uuid_t node_uuid = {0,}; gf_log (this->name, GF_LOG_INFO, "migate data called on %s", loc->path); @@ -1122,6 +1138,43 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, continue; } + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_NODE_UUID_KEY); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid for %s", entry_loc.path); + continue; + } + + ret = dict_get_str (dict, GF_XATTR_NODE_UUID_KEY, + &uuid_str); + if(ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get node-uuid from dict for %s", + entry_loc.path); + continue; + } + + if (uuid_parse (uuid_str, node_uuid)) { + 
gf_log (this->name, GF_LOG_ERROR, "uuid_parse " + "failed for %s", entry_loc.path); + continue; + } + + /* if file belongs to different node, skip migration + * the other node will take responsibility of migration + */ + if (uuid_compare (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_TRACE, "%s does not" + "belong to this node", entry_loc.path); + continue; + } + + uuid_str = NULL; + + dict_del (dict, GF_XATTR_NODE_UUID_KEY); + + /* if distribute is present, it will honor this key. * -1 is returned if distribute is not present or file * doesn't have a link-file. If file has link-file, the @@ -1131,6 +1184,8 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, ret = syncop_getxattr (this, &entry_loc, &dict, GF_XATTR_LINKINFO_KEY); if (ret < 0) { + gf_log (this->name, GF_LOG_TRACE, "getxattr " + "failed for %s", entry_loc.path); continue; } diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 4502a751b49..96382bf16e0 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -359,6 +359,7 @@ init (xlator_t *this) int i = 0; gf_defrag_info_t *defrag = NULL; int cmd = 0; + char *node_uuid = NULL; GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -391,6 +392,19 @@ init (xlator_t *this) defrag->is_exiting = 0; + ret = dict_get_str (this->options, "node-uuid", &node_uuid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "node-uuid not " + "specified"); + goto err; + } + + if (uuid_parse (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse " + "glusterd node uuid"); + goto err; + } + defrag->cmd = cmd; conf->defrag = defrag; @@ -591,6 +605,9 @@ struct volume_options options[] = { { .key = {"rebalance-cmd"}, .type = GF_OPTION_TYPE_INT, }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, { .key = {NULL} }, }; diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 3c1baa7e62e..bda5e61e4fb 100644 
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -2709,6 +2709,83 @@ out: return ret; } + +int +glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict, + dict_t *op_ctx) +{ + int ret = 0; + char *volname = NULL; + glusterd_volinfo_t *volinfo = NULL; + uint64_t files = 0; + uint64_t size = 0; + uint64_t lookup = 0; + gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED; + + GF_ASSERT (req_dict); + + ret = dict_get_str (req_dict, "volname", &volname); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get volume name"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + + if (ret) + goto out; + + ret = dict_get_uint64 (rsp_dict, "files", &files); + if (ret) + gf_log (THIS->name, GF_LOG_TRACE, + "failed to get file count"); + + ret = dict_get_uint64 (rsp_dict, "size", &size); + if (ret) + gf_log (THIS->name, GF_LOG_TRACE, + "failed to get size of xfer"); + + ret = dict_get_uint64 (rsp_dict, "lookups", &lookup); + if (ret) + gf_log (THIS->name, GF_LOG_TRACE, + "failed to get lookedup file count"); + ret = dict_get_int32 (rsp_dict, "status", (int32_t *)&status); + if (ret) + gf_log (THIS->name, GF_LOG_TRACE, + "failed to get status"); + + volinfo->rebalance_files += files; + volinfo->rebalance_data += size; + volinfo->lookedup_files += lookup; + + if (!op_ctx) { + dict_copy (rsp_dict, op_ctx); + goto out; + } + + ret = dict_set_uint64 (op_ctx, "files", volinfo->rebalance_files); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "failed to set file count"); + + ret = dict_set_uint64 (op_ctx, "size", volinfo->rebalance_data); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "failed to set size of xfer"); + + ret = dict_set_uint64 (op_ctx, "lookups", volinfo->lookedup_files); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "failed to set lookedup file count"); + ret = dict_set_int32 (op_ctx, "status", status); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "failed to set status"); + 
+out: + return ret; +} + int32_t glusterd_handle_node_rsp (glusterd_req_ctx_t *req_ctx, void *pending_entry, glusterd_op_t op, dict_t *rsp_dict, dict_t *op_ctx, @@ -2732,8 +2809,9 @@ glusterd_handle_node_rsp (glusterd_req_ctx_t *req_ctx, void *pending_entry, break; case GD_OP_DEFRAG_BRICK_VOLUME: - dict_copy (rsp_dict, op_ctx); - break; + glusterd_defrag_volume_node_rsp (req_ctx->dict, + rsp_dict, op_ctx); + break; case GD_OP_HEAL_VOLUME: ret = glusterd_heal_volume_brick_rsp (req_ctx->dict, rsp_dict, diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index f7304b9c074..936a3b26e6c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -82,60 +82,6 @@ out: return ret; } -int -glusterd_defrag_status_get (glusterd_volinfo_t *volinfo, - dict_t *dict) -{ - int ret = 0; - uint64_t files = 0; - uint64_t size = 0; - uint64_t lookup = 0; - - if (!volinfo || !dict) - goto out; - - ret = 0; - if (volinfo->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) - goto out; - - if (volinfo->defrag) { - LOCK (&volinfo->defrag->lock); - { - files = volinfo->defrag->total_files; - size = volinfo->defrag->total_data; - lookup = volinfo->defrag->num_files_lookedup; - } - UNLOCK (&volinfo->defrag->lock); - } else { - files = volinfo->rebalance_files; - size = volinfo->rebalance_data; - lookup = volinfo->lookedup_files; - } - - ret = dict_set_uint64 (dict, "files", files); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set file count"); - - ret = dict_set_uint64 (dict, "size", size); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set size of xfer"); - - ret = dict_set_uint64 (dict, "lookups", lookup); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set lookedup file count"); - - ret = dict_set_int32 (dict, "status", volinfo->defrag_status); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set status"); - -out: - return 
0; -} - void glusterd_rebalance_cmd_attempted_log (int cmd, char *volname) { @@ -338,6 +284,10 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, volinfo->defrag_status = GF_DEFRAG_STATUS_STARTED; + volinfo->rebalance_files = 0; + volinfo->rebalance_data = 0; + volinfo->lookedup_files = 0; + volinfo->defrag_cmd = cmd; glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); @@ -369,6 +319,8 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, NULL); runner_add_arg (&runner, "--xlator-option"); runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd); + runner_add_arg (&runner, "--xlator-option"); + runner_argprintf (&runner, "*dht.node-uuid=%s", uuid_utoa(priv->uuid)); runner_add_arg (&runner, "--socket-file"); runner_argprintf (&runner, "%s",sockfile); runner_add_arg (&runner, "--pid-file"); @@ -383,6 +335,7 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, goto out; } + sleep (5); ret = rpc_clnt_transport_unix_options_build (&options, sockfile); if (ret) { gf_log (THIS->name, GF_LOG_ERROR, "Unix options build failed"); @@ -609,7 +562,6 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) int32_t cmd = 0; char msg[2048] = {0}; glusterd_volinfo_t *volinfo = NULL; - void *node_uuid = NULL; glusterd_conf_t *priv = NULL; priv = THIS->private; @@ -633,23 +585,6 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) goto out; } - if ((cmd != GF_DEFRAG_CMD_STATUS) && - (cmd != GF_DEFRAG_CMD_STOP)) { - ret = dict_get_ptr (dict, "node-uuid", &node_uuid); - if (ret) { - gf_log (THIS->name, GF_LOG_DEBUG, "node-uuid not found"); - goto out; - } - - /* perform this on only the node which has - issued the command */ - if (uuid_compare (node_uuid, priv->uuid)) { - gf_log (THIS->name, GF_LOG_DEBUG, - "not the source node %s", uuid_utoa (priv->uuid)); - goto out; - } - } - switch (cmd) { case GF_DEFRAG_CMD_START: case 
GF_DEFRAG_CMD_START_LAYOUT_FIX: @@ -659,6 +594,9 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) break; case GF_DEFRAG_CMD_STOP: case GF_DEFRAG_CMD_STATUS: + volinfo->rebalance_files = 0; + volinfo->rebalance_data = 0; + volinfo->lookedup_files = 0; break; default: break; diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c index 39a9c6161a9..4e55c383c47 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c @@ -1059,6 +1059,8 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *rsp_dict) glusterd_op_t op = GD_OP_NONE; uint64_t value = 0; int32_t value32 = 0; + char *volname = NULL; + glusterd_volinfo_t *volinfo = NULL; GF_ASSERT (rsp_dict); @@ -1071,9 +1073,22 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *rsp_dict) if (!ctx_dict) goto out; + ret = dict_get_str (ctx_dict, "volname", &volname); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get volume name"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + + if (ret) + goto out; + ret = dict_get_uint64 (rsp_dict, "files", &value); if (!ret) { - ret = dict_set_uint64 (ctx_dict, "files", value); + volinfo->rebalance_files += value; + ret = dict_set_uint64 (ctx_dict, "files", + volinfo->rebalance_files); if (ret) { gf_log (THIS->name, GF_LOG_DEBUG, "failed to set the file count"); @@ -1082,7 +1097,9 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *rsp_dict) ret = dict_get_uint64 (rsp_dict, "size", &value); if (!ret) { - ret = dict_set_uint64 (ctx_dict, "size", value); + volinfo->rebalance_data += value; + ret = dict_set_uint64 (ctx_dict, "size", + volinfo->rebalance_data); if (ret) { gf_log (THIS->name, GF_LOG_DEBUG, "failed to set the size of migration"); @@ -1091,7 +1108,9 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *rsp_dict) ret = dict_get_uint64 (rsp_dict, "lookups", &value); if (!ret) { - ret = dict_set_uint64 (ctx_dict, "lookups", value); + 
volinfo->lookedup_files += value; + ret = dict_set_uint64 (ctx_dict, "lookups", + volinfo->lookedup_files); if (ret) { gf_log (THIS->name, GF_LOG_DEBUG, "failed to set lookuped file count"); @@ -1273,6 +1292,9 @@ glusterd3_1_commit_op_cbk (struct rpc_req *req, struct iovec *iov, case GD_OP_REBALANCE: case GD_OP_DEFRAG_BRICK_VOLUME: + ret = glusterd_volume_rebalance_use_rsp_dict (dict); + if (ret) + goto out; break; case GD_OP_HEAL_VOLUME: diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 84d1d11a5db..22a9574dcbf 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -2480,7 +2480,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, if (loc->inode && name && (strcmp (name, GF_XATTR_NODE_UUID_KEY) == 0) && !uuid_is_null (priv->glusterd_uuid)) { - (void) snprintf (host_buf, 1024, "<%s>", + (void) snprintf (host_buf, 1024, "%s", uuid_utoa (priv->glusterd_uuid)); dyn_rpath = gf_strdup (host_buf); -- cgit