From 5784a00f997212d34bd52b2303e20c097240d91c Mon Sep 17 00:00:00 2001
From: karthik-us
Date: Wed, 30 May 2018 15:27:52 +0530
Subject: cluster/afr: Use 2 domain locking in SHD for thin-arbiter

With this change, when SHD starts the index crawl it requests all the
clients to release the AFR_TA_DOM_NOTIFY lock, so that clients know
their in-memory state is no longer valid and any new operations need
to query the thin-arbiter if required.

When SHD completes healing all the files without any failure, it takes
the AFR_TA_DOM_NOTIFY lock again and reads the xattrs on the TA to see
whether any new failures happened in the meantime. If new failures are
marked on the TA, SHD starts the crawl immediately to heal them as
well. If there are no new failures, SHD takes the AFR_TA_DOM_MODIFY
lock and unsets the xattrs on the TA, so that both data bricks are
considered good thereafter.

Change-Id: I037b89a0823648f314580ba0716d877bd5ddb1f1
fixes: bz#1579788
Signed-off-by: karthik-us
---
 tests/basic/afr/ta-shd.t                 |  49 ++++++
 tests/thin-arbiter.rc                    | 181 +++++++++++++++++++++++
 xlators/cluster/afr/src/afr-common.c     |   7 +-
 xlators/cluster/afr/src/afr-self-heald.c | 245 ++++++++++++++++++++-----------
 xlators/cluster/afr/src/afr.c            |   1 +
 5 files changed, 392 insertions(+), 91 deletions(-)
 create mode 100644 tests/basic/afr/ta-shd.t
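The two-domain ordering described above can be modeled with plain POSIX
byte-range locks. The sketch below is illustrative only, not GlusterFS
code: the two byte offsets stand in for the AFR_TA_DOM_NOTIFY and
AFR_TA_DOM_MODIFY inodelk domains, and a scratch file stands in for the
thin-arbiter id file.

/* Toy model of the SHD-side locking sequence (illustrative names only). */
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define NOTIFY_OFF 0 /* stands in for AFR_TA_DOM_NOTIFY */
#define MODIFY_OFF 1 /* stands in for AFR_TA_DOM_MODIFY */

static int
range_lock(int fd, short type, off_t off)
{
    struct flock fl = {
        .l_type = type, .l_whence = SEEK_SET, .l_start = off, .l_len = 1,
    };
    return fcntl(fd, F_SETLKW, &fl); /* blocking, like F_SETLKW in the patch */
}

int
main(void)
{
    int fd = open("/tmp/ta-lock-model", O_RDWR | O_CREAT, 0600);
    if (fd < 0)
        return 1;

    /* 1. Take the NOTIFY lock: clients contending on this domain learn
     *    that their cached good/bad-brick state is stale. */
    if (range_lock(fd, F_WRLCK, NOTIFY_OFF))
        return 1;

    /* 2. Re-read the pending xattrs here; if new failures were marked
     *    during the crawl, rerun it instead of unsetting anything. */

    /* 3. Take the MODIFY lock and unset the pending xattrs, so both
     *    data bricks are treated as good again. */
    if (range_lock(fd, F_WRLCK, MODIFY_OFF))
        return 1;
    range_lock(fd, F_UNLCK, MODIFY_OFF);
    range_lock(fd, F_UNLCK, NOTIFY_OFF);
    close(fd);
    printf("lock/unlock order completed\n");
    return 0;
}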
diff --git a/tests/basic/afr/ta-shd.t b/tests/basic/afr/ta-shd.t
new file mode 100644
index 00000000000..bb2e58b3f77
--- /dev/null
+++ b/tests/basic/afr/ta-shd.t
@@ -0,0 +1,49 @@
+#!/bin/bash
+#Self-heal tests
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../thin-arbiter.rc
+cleanup;
+TEST ta_create_brick_and_volfile brick0
+TEST ta_create_brick_and_volfile brick1
+TEST ta_create_ta_and_volfile ta
+TEST ta_start_brick_process brick0
+TEST ta_start_brick_process brick1
+TEST ta_start_ta_process ta
+
+TEST ta_create_mount_volfile brick0 brick1 ta
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta
+
+TEST ta_create_shd_volfile brick0 brick1 ta
+TEST ta_start_shd_process glustershd
+
+TEST touch $M0/a.txt
+TEST ta_kill_brick brick0
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0
+echo "Hello" >> $M0/a.txt
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2
+
+#TODO: After the write txn changes are merged, take statedump of TA process and
+#check whether AFR_TA_DOM_NOTIFY lock is held by the client here. Take the
+#statedump again after line #38 to check AFR_TA_DOM_NOTIFY lock is released by
+#the SHD process.
+
+TEST ta_start_brick_process brick0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
+EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt
+EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2
+
+#Kill the previously up brick and try reading from other brick. Since the heal
+#has happened file content should be same.
+TEST ta_kill_brick brick1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1
+#Umount and mount to remove cached data.
+TEST umount $M0
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT "Hello" cat $M0/a.txt
+cleanup;
diff --git a/tests/thin-arbiter.rc b/tests/thin-arbiter.rc
index 36d11cea61d..c5ac00baaaf 100644
--- a/tests/thin-arbiter.rc
+++ b/tests/thin-arbiter.rc
@@ -431,3 +431,184 @@ function ta_up_status()
         local replica_id=$3
         grep -E "^up = " $m/.meta/graphs/active/${v}-replicate-${replica_id}/private | cut -f2 -d'='
 }
+
+function ta_create_shd_volfile()
+{
+        local b0=$B0/$1
+        local b1=$B0/$2
+        local ta=$B0/$3
+        local b0_port=${PORTMAP[$1]}
+        local b1_port=${PORTMAP[$2]}
+        local ta_port=${PORTMAP[$3]}
+cat > $B0/glustershd.vol <<EOF
[... glustershd volfile body, the closing EOF and "}", not recoverable from
this copy ...]

diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
[... index line and first hunk header not recoverable from this copy; the
hunk is in afr_ta_post_op_lock() ...]
-    GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+    if (!priv->shd.iamshd)
+        GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
 
     flock1.l_type = F_WRLCK;
     while (!locked) {
@@ -6725,7 +6726,6 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
         cmd = F_SETLKW;
         flock1.l_start = 0;
         flock1.l_len = 0;
-
     } else {
         cmd = F_SETLK;
         if (priv->ta_notify_dom_lock_offset) {
@@ -6780,7 +6780,8 @@ afr_ta_post_op_unlock(xlator_t *this, loc_t *loc)
     };
     int ret = 0;
 
-    GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+    if (!priv->shd.iamshd)
+        GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
     flock.l_type = F_UNLCK;
     flock.l_start = 0;
     flock.l_len = 0;
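The afr-common.c hunks above relax the synctask assertion because the
self-heal daemon now calls afr_ta_post_op_lock()/afr_ta_post_op_unlock()
from its healer threads, which do not run inside a synctask. A minimal
sketch of that guard pattern, with made-up names (caller_ctx and
in_synctask are stand-ins, not GlusterFS symbols):

#include <assert.h>
#include <stdbool.h>

struct caller_ctx {
    bool iamshd;      /* true when running inside the self-heal daemon */
    bool in_synctask; /* stands in for afr_ta_is_fop_called_from_synctask() */
};

static void
ta_post_op_lock(struct caller_ctx *c)
{
    /* Clients must take this lock from a synctask; the SHD is exempt. */
    if (!c->iamshd)
        assert(c->in_synctask);
    /* ... take the AFR_TA_DOM_NOTIFY / AFR_TA_DOM_MODIFY locks ... */
}

int
main(void)
{
    struct caller_ctx shd = { .iamshd = true, .in_synctask = false };
    ta_post_op_lock(&shd); /* would have tripped the old unconditional assert */
    return 0;
}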
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 0cf01a041b4..53d7ef8bb8e 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -546,14 +546,128 @@ afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
                       GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal);
 }
 
-void
-afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+int
+afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc)
 {
     afr_private_t *priv = NULL;
-    dict_t *xattr = NULL;
-    struct gf_flock flock = {
+    struct iatt stbuf = {
         0,
     };
+    int ret = -1;
+
+    priv = this->private;
+    loc->parent = inode_ref(this->itable->root);
+    gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+    loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+    loc->inode = inode_new(loc->parent->table);
+    GF_CHECK_ALLOC(loc->inode, ret, out);
+
+    if (!gf_uuid_is_null(priv->ta_gfid))
+        goto assign_gfid;
+
+    ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
+                        0, 0, 0);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed lookup on file %s.", loc->name);
+        goto out;
+    }
+
+    gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+
+assign_gfid:
+    gf_uuid_copy(loc->gfid, priv->ta_gfid);
+    ret = 0;
+
+out:
+    if (ret)
+        loc_wipe(loc);
+
+    return ret;
+}
+
+int
+_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+{
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+    int *raw = NULL;
+    int ret = -1;
+    int i = 0;
+
+    priv = this->private;
+
+    xattr = dict_new();
+    if (!xattr) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED,
+               "Failed to create dict.");
+        goto out;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
+        if (!raw)
+            goto out;
+
+        ret = dict_set_bin(xattr, priv->pending_key[i], raw,
+                           AFR_NUM_CHANGE_LOGS * sizeof(int));
+        if (ret) {
+            GF_FREE(raw);
+            goto out;
+        }
+    }
+
+    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+                         GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
+    if (ret || !(*xdata)) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Xattrop failed on %s.", loc->name);
+    }
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    return ret;
+}
+
+void
+afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
+                      dict_t **xdata)
+{
+    int ret = 0;
+
+    loc_wipe(loc);
+    if (afr_shd_fill_ta_loc(this, loc)) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate thin-arbiter loc for: %s.", loc->name);
+        goto out;
+    }
+
+    ret = afr_ta_post_op_lock(this, loc);
+    if (ret)
+        goto out;
+
+    ret = _afr_shd_ta_get_xattrs(this, loc, xdata);
+    if (ret) {
+        if (*xdata) {
+            dict_unref(*xdata);
+            *xdata = NULL;
+        }
+    }
+
+    afr_ta_post_op_unlock(this, loc);
+
+out:
+    if (ret)
+        healer->rerun = 1;
+}
+
+int
+afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+{
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
     gf_boolean_t need_xattrop = _gf_false;
     void *pending_raw = NULL;
     int *raw = NULL;
@@ -563,7 +677,7 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
     int i = 0;
     int j = 0;
     int val = 0;
-    int ret = 0;
+    int ret = -1;
 
     priv = this->private;
 
@@ -598,6 +712,7 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
                    "not the good shd. Skipping. "
                    "SHD = %d.",
                    healer);
+            ret = 0;
             GF_FREE(raw);
             goto out;
         }
@@ -607,113 +722,69 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
         }
 
         ret = dict_set_bin(xattr, priv->pending_key[i], raw,
-                           AFR_NUM_CHANGE_LOGS * sizeof(int));
+                           AFR_NUM_CHANGE_LOGS * sizeof (int));
         if (ret) {
-            GF_FREE(raw);
+            GF_FREE (raw);
             goto out;
         }
-        memset(pending, 0, sizeof(pending));
+
+        if (need_xattrop)
+            break;
     }
 
     if (!need_xattrop) {
+        ret = 0;
         goto out;
     }
 
-    flock.l_type = F_WRLCK;
-    flock.l_start = 0;
-    flock.l_len = 0;
-
-    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
-                         AFR_TA_DOM_NOTIFY, loc, F_SETLKW, &flock, NULL, NULL);
-    if (ret)
-        goto out;
-
     ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
                          GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
     if (ret)
         gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
                "Xattrop failed.");
 
-    flock.l_type = F_UNLCK;
-    syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY,
-                   loc, F_SETLKW, &flock, NULL, NULL);
-
 out:
     if (xattr)
         dict_unref(xattr);
-    return;
+
+    return ret;
 }
 
 void
-afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc,
+                                  struct subvol_healer *healer,
+                                  dict_t *pre_crawl_xdata)
 {
-    afr_private_t *priv = NULL;
-    dict_t *xattr = NULL;
-    struct iatt stbuf = {
-        0,
-    };
-    int *raw = NULL;
+    int ret_lock = 0;
     int ret = 0;
-    int i = 0;
+    dict_t *post_crawl_xdata = NULL;
 
-    priv = this->private;
-
-    loc->parent = inode_ref(this->itable->root);
-    gf_uuid_copy(loc->pargfid, loc->parent->gfid);
-    loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
-    loc->inode = inode_new(loc->parent->table);
-    if (!loc->inode) {
-        goto out;
-    }
+    ret_lock = afr_ta_post_op_lock(this, loc);
+    if (ret_lock)
+        goto unref;
 
-    ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
-                        0, 0, 0);
-    if (ret) {
-        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
-               "Failed lookup on file %s.", loc->name);
-        goto out;
-    }
-
-    gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
-    gf_uuid_copy(loc->gfid, priv->ta_gfid);
+    ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata);
+    if (ret)
+        goto unref;
 
-    xattr = dict_new();
-    if (!xattr) {
-        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED,
-               "Failed to create dict.");
-        goto out;
+    if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) {
+        ret = -1;
+        goto unref;
     }
 
-    for (i = 0; i < priv->child_count; i++) {
-        raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
-        if (!raw) {
-            goto out;
-        }
+    ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol);
 
-        ret = dict_set_bin(xattr, priv->pending_key[i], raw,
-                           AFR_NUM_CHANGE_LOGS * sizeof(int));
-        if (ret) {
-            GF_FREE(raw);
-            goto out;
-        }
+unref:
+    if (post_crawl_xdata) {
+        dict_unref(post_crawl_xdata);
+        post_crawl_xdata = NULL;
     }
 
-    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
-                         GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
-    if (ret) {
-        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
-               "Xattrop failed.");
-        goto out;
-    }
-    if (!(*xdata))
-        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED,
-               "Xdata response is empty.");
+    if (ret || ret_lock)
+        healer->rerun = 1;
 
-out:
-    if (xattr)
-        dict_unref(xattr);
-    return;
+    if (!ret_lock)
+        afr_ta_post_op_unlock(this, loc);
 }
 
 void *
@@ -723,7 +794,7 @@ afr_shd_index_healer(void *data)
     xlator_t *this = NULL;
     int ret = 0;
     afr_private_t *priv = NULL;
-    dict_t *xdata = NULL;
+    dict_t *pre_crawl_xdata = NULL;
     loc_t loc = {
         0,
     };
@@ -739,8 +810,7 @@ afr_shd_index_healer(void *data)
     priv->local[healer->subvol] = healer->local;
 
     if (priv->thin_arbiter_count) {
-        loc_wipe(&loc);
-        afr_shd_ta_get_xattrs(this, &loc, &xdata);
+        afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
     }
 
     do {
@@ -770,15 +840,14 @@ afr_shd_index_healer(void *data)
             sleep(1);
         } while (ret > 0);
 
-        if (xdata && !healer->crawl_event.heal_failed_count) {
-            afr_shd_ta_set_xattrs(this, &loc, &xdata, healer->subvol);
-            dict_unref(xdata);
-            xdata = NULL;
+        if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) {
+            afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
+                                              pre_crawl_xdata);
+            dict_unref(pre_crawl_xdata);
+            pre_crawl_xdata = NULL;
         }
     }
 
-    loc_wipe(&loc);
-
     return NULL;
 }
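In afr_shd_index_healer() above, the healer snapshots the thin-arbiter
xattrs before the crawl and, after a clean crawl, unsets them only if a
re-read still matches the snapshot; any lock failure, read failure, or
mismatch schedules another crawl. The stand-alone sketch below models
just that control flow; ta_read_pending() and ta_unset_pending() are
made-up stand-ins for the xattrop calls, not GlusterFS APIs.

#include <stdio.h>
#include <string.h>

struct healer {
    int rerun;
};

/* Pretend read of the pending-xattr state from the TA id file. */
static int
ta_read_pending(char *out, size_t len)
{
    snprintf(out, len, "pending:1"); /* a new failure was marked mid-crawl */
    return 0;
}

/* Pretend unset of the pending xattrs (would run under the MODIFY lock). */
static int
ta_unset_pending(void)
{
    return 0;
}

static void
check_and_unset(struct healer *h, const char *pre_crawl)
{
    char post_crawl[32];

    /* Re-read under the NOTIFY lock, compare with the pre-crawl snapshot,
     * and only then clear the pending marks under the MODIFY lock. */
    if (ta_read_pending(post_crawl, sizeof(post_crawl)) != 0 ||
        strcmp(pre_crawl, post_crawl) != 0 || ta_unset_pending() != 0)
        h->rerun = 1; /* new failures (or an error): crawl again */
}

int
main(void)
{
    struct healer h = { 0 };

    check_and_unset(&h, "pending:0"); /* snapshot taken before the crawl */
    printf("rerun = %d\n", h.rerun);  /* prints 1: state changed mid-crawl */
    return 0;
}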
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 568293cdf2c..26950fd7927 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -384,6 +384,7 @@ init(xlator_t *this)
         priv->child_count--;
         priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
         priv->ta_notify_dom_lock_offset = 0;
+        *priv->ta_gfid = 0;
     }
     INIT_LIST_HEAD(&priv->healing);
     INIT_LIST_HEAD(&priv->heal_waiting);
-- 
cgit