From 243d61575c093c03b9beb014bf9d097646836e95 Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Wed, 7 May 2014 19:31:30 +0000 Subject: dht: make lookup-unhashed=auto do something actually useful The key concept here is to determine whether a directory is "clean" by comparing its last-known-good topology to the current one for the volume. These are stored as "commit hashes" on the directory and the volume root respectively. The volume's commit hash changes whenever a brick is added or removed, and a fix-layout is done. A directory's commit hash changes only when a full rebalance (not just fix-layout) is done on it. If all bricks are present and have a directory commit hash that matches the volume commit hash, then we can assume that every file is in its "proper" place. Therefore, if we look for a file in that proper place and don't find it, we can assume it's not on any other subvolume and *safely* skip the global (broadcast to all) lookup. Change-Id: Id6ce4593ba1f7daffa74cfab591cb45960629ae3 BUG: 1220064 Reviewed-on-master: http://review.gluster.org/#/c/7702/ Signed-off-by: Jeff Darcy Signed-off-by: Shyam Reviewed-on: http://review.gluster.org/10729 Tested-by: Gluster Build System Reviewed-by: Krishnan Parthasarathi Reviewed-by: Vijay Bellur --- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 5 ++ xlators/mgmt/glusterd/src/glusterd-op-sm.c | 105 +++++++++++++++++-------- xlators/mgmt/glusterd/src/glusterd-rebalance.c | 7 ++ xlators/mgmt/glusterd/src/glusterd.h | 1 + 4 files changed, 87 insertions(+), 31 deletions(-) (limited to 'xlators/mgmt/glusterd') diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 92d15c615be..019766c5d83 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -1997,6 +1997,8 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) dict_t *bricks_dict = NULL; char *brick_tmpstr = NULL; int start_remove = 0; + uint32_t commit_hash = 0; + this = THIS; GF_ASSERT (this); @@ -2262,6 +2264,9 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) break; } if (!force && need_rebalance) { + if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) { + volinfo->rebal.commit_hash = commit_hash; + } /* perform the rebalance operations */ ret = glusterd_handle_defrag_start (volinfo, err_str, sizeof (err_str), diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 858f0771ca6..bc0763483fd 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -3403,6 +3403,36 @@ out: return ret; } +int +gd_set_commit_hash (dict_t *dict) +{ + struct timeval tv; + uint32_t hash; + + /* + * We need a commit hash that won't conflict with others we might have + * set, or zero which is the implicit value if we never have. Using + * seconds<<3 like this ensures that we'll only get a collision if two + * consecutive rebalances are separated by exactly 2^29 seconds - about + * 17 years - and even then there's only a 1/8 chance of a collision in + * the low order bits. It's far more likely that this code will have + * changed completely by then. If not, call me in 2031. + * + * P.S. Time zone changes? Yeah, right. + */ + gettimeofday (&tv, NULL); + hash = tv.tv_sec << 3; + + /* + * Make sure at least one of those low-order bits is set. The extra + * shifting is because not all machines have sub-millisecond time + * resolution. + */ + hash |= 1 << ((tv.tv_usec >> 10) % 3); + + return dict_set_uint32 (dict, "commit-hash", hash); +} + int glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) { @@ -3415,6 +3445,7 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) uint32_t status_cmd = GF_CLI_STATUS_NONE; char *errstr = NULL; xlator_t *this = NULL; + gf_boolean_t do_common = _gf_false; GF_ASSERT (req); @@ -3503,12 +3534,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) } break; - case GD_OP_SYNC_VOLUME: - { - dict_copy (dict, req_dict); - break; - } - case GD_OP_REMOVE_BRICK: { dict_t *dict = ctx; @@ -3525,6 +3550,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) if (ret) goto out; + if (gd_set_commit_hash(dict) != 0) { + goto out; + } + dict_destroy (req_dict); req_dict = dict_ref (dict); } @@ -3544,8 +3573,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) dict_copy (dict, req_dict); break; } + do_common = _gf_true; } - /*fall-through*/ + break; + case GD_OP_DELETE_VOLUME: case GD_OP_START_VOLUME: case GD_OP_STOP_VOLUME: @@ -3555,7 +3586,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) case GD_OP_LOG_ROTATE: case GD_OP_QUOTA: case GD_OP_PROFILE_VOLUME: - case GD_OP_REBALANCE: case GD_OP_HEAL_VOLUME: case GD_OP_STATEDUMP_VOLUME: case GD_OP_CLEARLOCKS_VOLUME: @@ -3563,49 +3593,62 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) case GD_OP_BARRIER: case GD_OP_BITROT: { - ret = dict_get_str (dict, "volname", &volname); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "volname is not present in " - "operation ctx"); - goto out; - } - - if (strcasecmp (volname, "all")) { - ret = glusterd_dict_set_volid (dict, - volname, - op_errstr); - if (ret) - goto out; - } - dict_copy (dict, req_dict); + do_common = _gf_true; } break; - case GD_OP_COPY_FILE: + case GD_OP_REBALANCE: { - dict_copy (dict, req_dict); - break; + if (gd_set_commit_hash(dict) != 0) { + goto out; + } + do_common = _gf_true; } + break; + case GD_OP_SYNC_VOLUME: + case GD_OP_COPY_FILE: case GD_OP_SYS_EXEC: { dict_copy (dict, req_dict); - break; } + break; case GD_OP_GANESHA: { dict_copy (dict, req_dict); - break; } + break; default: break; } - *req = req_dict; - ret = 0; + /* + * This has been moved out of the switch so that multiple ops with + * other special needs can all "fall through" to it. + */ + if (do_common) { + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "volname is not present in " + "operation ctx"); + goto out; + } + + if (strcasecmp (volname, "all")) { + ret = glusterd_dict_set_volid (dict, + volname, + op_errstr); + if (ret) + goto out; + } + dict_copy (dict, req_dict); + } + + *req = req_dict; + ret = 0; out: return ret; diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index 48d9a706042..cf8ee3a79f7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -284,6 +284,9 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd); runner_add_arg (&runner, "--xlator-option"); runner_argprintf (&runner, "*dht.node-uuid=%s", uuid_utoa(MY_UUID)); + runner_add_arg (&runner, "--xlator-option"); + runner_argprintf (&runner, "*dht.commit-hash=%u", + volinfo->rebal.commit_hash); runner_add_arg (&runner, "--socket-file"); runner_argprintf (&runner, "%s",sockfile); runner_add_arg (&runner, "--pid-file"); @@ -716,6 +719,7 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) char *task_id_str = NULL; dict_t *ctx = NULL; xlator_t *this = NULL; + uint32_t commit_hash; this = THIS; GF_ASSERT (this); @@ -804,6 +808,9 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) glusterd_store_perform_node_state_store (volinfo); break; } + if (dict_get_uint32 (dict, "commit-hash", &commit_hash) == 0) { + volinfo->rebal.commit_hash = commit_hash; + } ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg), cmd, NULL, GD_OP_REBALANCE); break; diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 3f2ff45f1a1..5341192e84a 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -286,6 +286,7 @@ struct glusterd_rebalance_ { glusterd_op_t op; dict_t *dict; /* Dict to store misc information * like list of bricks being removed */ + uint32_t commit_hash; }; typedef struct glusterd_rebalance_ glusterd_rebalance_t; -- cgit