summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r--  tests/basic/afr/ta-shd.t                  |  49
-rw-r--r--  tests/thin-arbiter.rc                     | 181
-rw-r--r--  xlators/cluster/afr/src/afr-common.c      |   7
-rw-r--r--  xlators/cluster/afr/src/afr-self-heald.c  | 245
-rw-r--r--  xlators/cluster/afr/src/afr.c             |   1
5 files changed, 392 insertions(+), 91 deletions(-)
diff --git a/tests/basic/afr/ta-shd.t b/tests/basic/afr/ta-shd.t
new file mode 100644
index 00000000000..bb2e58b3f77
--- /dev/null
+++ b/tests/basic/afr/ta-shd.t
@@ -0,0 +1,49 @@
+#!/bin/bash
+#Self-heal tests
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../thin-arbiter.rc
+cleanup;
+TEST ta_create_brick_and_volfile brick0
+TEST ta_create_brick_and_volfile brick1
+TEST ta_create_ta_and_volfile ta
+TEST ta_start_brick_process brick0
+TEST ta_start_brick_process brick1
+TEST ta_start_ta_process ta
+
+TEST ta_create_mount_volfile brick0 brick1 ta
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta
+
+TEST ta_create_shd_volfile brick0 brick1 ta
+TEST ta_start_shd_process glustershd
+
+TEST touch $M0/a.txt
+TEST ta_kill_brick brick0
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0
+echo "Hello" >> $M0/a.txt
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2
+
+#TODO: Once the write-txn changes are merged, take a statedump of the TA
+#process here and check that the AFR_TA_DOM_NOTIFY lock is held by the client.
+#Take the statedump again after brick0 is restarted below to check that the
+#AFR_TA_DOM_NOTIFY lock has been released by the SHD process.
+
+TEST ta_start_brick_process brick0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
+EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt
+EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2
+
+#Kill the previously up brick and try reading from the other brick. Since the
+#heal has already happened, the file content should be the same.
+TEST ta_kill_brick brick1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1
+#Unmount and remount to drop cached data.
+TEST umount $M0
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT "Hello" cat $M0/a.txt
+cleanup;
diff --git a/tests/thin-arbiter.rc b/tests/thin-arbiter.rc
index 36d11cea61d..c5ac00baaaf 100644
--- a/tests/thin-arbiter.rc
+++ b/tests/thin-arbiter.rc
@@ -431,3 +431,184 @@ function ta_up_status()
local replica_id=$3
grep -E "^up = " $m/.meta/graphs/active/${v}-replicate-${replica_id}/private | cut -f2 -d'='
}
+
+function ta_create_shd_volfile()
+{
+ local b0=$B0/$1
+ local b1=$B0/$2
+ local ta=$B0/$3
+ local b0_port=${PORTMAP[$1]}
+ local b1_port=${PORTMAP[$2]}
+ local ta_port=${PORTMAP[$3]}
+cat > $B0/glustershd.vol <<EOF
+volume ${V0}-replicate-0-client-0
+ type protocol/client
+ option send-gids on
+ option transport.socket.lowlat off
+ option transport.socket.keepalive-interval 2
+ option remote-host $H0
+ option remote-subvolume $b0
+ option ping-timeout 42
+ option client-bind-insecure off
+ option transport.socket.own-thread off
+ option frame-timeout 1800
+ option non-blocking-io off
+ option transport.socket.keepalive 1
+ option transport.socket.keepalive-count 9
+ option transport.tcp-user-timeout 0
+ option transport.socket.nodelay 1
+ option transport.socket.keepalive-time 20
+ option transport.socket.read-fail-log off
+ option transport-type tcp
+ option filter-O_DIRECT disable
+ option event-threads 2
+ option transport.listen-backlog 1024
+ option transport.socket.ssl-enabled off
+ option password a0ad63dd-8314-4f97-9160-1b93e3cb1f0b
+ option username 459d48e8-2a92-4f11-89f2-077b29f6f86d
+ option remote-port $b0_port
+end-volume
+
+volume ${V0}-replicate-0-client-1
+ type protocol/client
+ option remote-host $H0
+ option transport.socket.keepalive-time 20
+ option transport.socket.keepalive-count 9
+ option transport.socket.own-thread off
+ option transport.socket.ssl-enabled off
+ option transport-type tcp
+ option remote-subvolume $b1
+ option event-threads 2
+ option transport.tcp-user-timeout 0
+ option transport.socket.keepalive 1
+ option transport.socket.nodelay 1
+ option transport.socket.read-fail-log off
+ option frame-timeout 1800
+ option ping-timeout 42
+ option client-bind-insecure off
+ option filter-O_DIRECT disable
+ option send-gids on
+ option non-blocking-io off
+ option transport.listen-backlog 1024
+ option transport.socket.lowlat off
+ option transport.socket.keepalive-interval 2
+ option password a0ad63dd-8314-4f97-9160-1b93e3cb1f0b
+ option username 459d48e8-2a92-4f11-89f2-077b29f6f86d
+ option remote-port $b1_port
+end-volume
+
+volume ${V0}-replicate-0-thin-arbiter-client
+ type protocol/client
+ option frame-timeout 1800
+ option event-threads 2
+ option transport.listen-backlog 1024
+ option transport.socket.nodelay 1
+ option transport.socket.keepalive-count 9
+ option transport.socket.ssl-enabled off
+ option transport-type tcp
+ option remote-subvolume $ta
+ option filter-O_DIRECT disable
+ option non-blocking-io off
+ option transport.socket.lowlat off
+ option transport.socket.keepalive-interval 2
+ option transport.socket.read-fail-log off
+ option remote-host $H0
+ option send-gids on
+ option transport.tcp-user-timeout 0
+ option transport.socket.keepalive-time 20
+ option ping-timeout 42
+ option client-bind-insecure off
+ option transport.socket.keepalive 1
+ option transport.socket.own-thread off
+ option remote-port $ta_port
+end-volume
+
+volume ${V0}-replicate-0
+ type cluster/replicate
+ option background-self-heal-count 8
+ option metadata-self-heal on
+ option data-change-log on
+ option entrylk-trace off
+ option iam-self-heal-daemon yes
+ option afr-dirty-xattr trusted.afr.dirty
+ option heal-timeout 10
+ option read-hash-mode 1
+ option metadata-splitbrain-forced-heal off
+ option thin-arbiter $H0:$ta
+ option shd-max-threads 1
+ option afr-pending-xattr ${V0}-client-0,${V0}-client-1,${V0}-ta-2
+ option halo-max-latency 5
+ option halo-max-replicas 99999
+ option entry-change-log on
+ option halo-nfsd-max-latency 5
+ option inodelk-trace off
+ option pre-op-compat on
+ option eager-lock on
+ option self-heal-readdir-size 1KB
+ option ensure-durability on
+ option locking-scheme full
+ option halo-enabled False
+ option heal-wait-queue-length 128
+ option entry-self-heal on
+ option self-heal-daemon on
+ option quorum-reads no
+ option shd-wait-qlength 1024
+ option choose-local true
+ option halo-min-replicas 2
+ option data-self-heal on
+ option metadata-change-log on
+ option consistent-metadata no
+ option full-lock yes
+ option use-compound-fops no
+ option halo-shd-max-latency 99999
+ option quorum-type none
+ option favorite-child-policy none
+ option read-subvolume-index -1
+ option optimistic-change-log on
+ option iam-nfs-daemon off
+ option post-op-delay-secs 1
+ option granular-entry-heal no
+ option consistent-io no
+ option data-self-heal-window-size 1
+ subvolumes ${V0}-replicate-0-client-0 ${V0}-replicate-0-client-1 ${V0}-replicate-0-thin-arbiter-client
+end-volume
+
+volume glustershd
+ type debug/io-stats
+ option log-buf-size 5
+ option ios-dump-format json
+ option latency-measurement off
+ option sys-log-level CRITICAL
+ option brick-log-level INFO
+ option client-logger gluster-log
+ option client-log-format with-msg-id
+ option brick-log-format with-msg-id
+ option client-log-buf-size 5
+ option log-flush-timeout 120
+ option ios-dump-interval 0
+ option ios-sample-interval 0
+ option ios-dnscache-ttl-sec 86400
+ option count-fop-hits off
+ option client-log-level INFO
+ option brick-logger gluster-log
+ option brick-log-buf-size 5
+ option ios-sample-buf-size 65535
+ option client-log-flush-timeout 120
+ option brick-log-flush-timeout 120
+ option unique-id /no/such/path
+ option dump-fd-stats off
+ subvolumes ${V0}-replicate-0
+end-volume
+EOF
+}
+
+function ta_start_shd_process()
+{
+ if glusterfs -p $B0/${1}.pid --volfile=$B0/${1}.vol -l $(gluster --print-logdir)/${1}.log --process-name=glustershd
+ then
+ cat $B0/${1}.pid
+ else
+ echo ""
+ return 1
+ fi
+}
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index eb0e7330a91..73f1d728809 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -6717,7 +6717,8 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
};
int32_t cmd = 0;
- GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
flock1.l_type = F_WRLCK;
while (!locked) {
@@ -6725,7 +6726,6 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
cmd = F_SETLKW;
flock1.l_start = 0;
flock1.l_len = 0;
-
} else {
cmd = F_SETLK;
if (priv->ta_notify_dom_lock_offset) {
@@ -6780,7 +6780,8 @@ afr_ta_post_op_unlock(xlator_t *this, loc_t *loc)
};
int ret = 0;
- GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
flock.l_type = F_UNLCK;
flock.l_start = 0;
flock.l_len = 0;
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 0cf01a041b4..53d7ef8bb8e 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -546,14 +546,128 @@ afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal);
}
-void
-afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+int
+afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc)
{
afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- struct gf_flock flock = {
+ struct iatt stbuf = {
0,
};
+ int ret = -1;
+
+ priv = this->private;
+ loc->parent = inode_ref(this->itable->root);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ loc->inode = inode_new(loc->parent->table);
+ GF_CHECK_ALLOC(loc->inode, ret, out);
+
+ if (!gf_uuid_is_null(priv->ta_gfid))
+ goto assign_gfid;
+
+ ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
+ 0, 0, 0);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed lookup on file %s.", loc->name);
+ goto out;
+ }
+
+ gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+
+assign_gfid:
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ ret = 0;
+
+out:
+ if (ret)
+ loc_wipe(loc);
+
+ return ret;
+}
+
+int
+_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+{
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int *raw = NULL;
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED,
+ "Failed to create dict.");
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
+ if (!raw)
+ goto out;
+
+ ret = dict_set_bin(xattr, priv->pending_key[i], raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ GF_FREE(raw);
+ goto out;
+ }
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
+ if (ret || !(*xdata)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Xattrop failed on %s.", loc->name);
+ }
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
+}
+
+void
+afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
+ dict_t **xdata)
+{
+ int ret = 0;
+
+ loc_wipe(loc);
+ if (afr_shd_fill_ta_loc(this, loc)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc->name);
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, loc);
+ if (ret)
+ goto out;
+
+ ret = _afr_shd_ta_get_xattrs(this, loc, xdata);
+ if (ret) {
+ if (*xdata) {
+ dict_unref(*xdata);
+ *xdata = NULL;
+ }
+ }
+
+ afr_ta_post_op_unlock(this, loc);
+
+out:
+ if (ret)
+ healer->rerun = 1;
+}
+
+int
+afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+{
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
gf_boolean_t need_xattrop = _gf_false;
void *pending_raw = NULL;
int *raw = NULL;
@@ -563,7 +677,7 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
int i = 0;
int j = 0;
int val = 0;
- int ret = 0;
+ int ret = -1;
priv = this->private;
@@ -598,6 +712,7 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
"not the good shd. Skipping. "
"SHD = %d.",
healer);
+ ret = 0;
GF_FREE(raw);
goto out;
}
@@ -607,113 +722,69 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
}
ret = dict_set_bin(xattr, priv->pending_key[i], raw,
- AFR_NUM_CHANGE_LOGS * sizeof(int));
+ AFR_NUM_CHANGE_LOGS * sizeof (int));
if (ret) {
- GF_FREE(raw);
+ GF_FREE (raw);
goto out;
}
- memset(pending, 0, sizeof(pending));
+ if (need_xattrop)
+ break;
}
if (!need_xattrop) {
+ ret = 0;
goto out;
}
- flock.l_type = F_WRLCK;
- flock.l_start = 0;
- flock.l_len = 0;
-
- ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
- AFR_TA_DOM_NOTIFY, loc, F_SETLKW, &flock, NULL, NULL);
- if (ret)
- goto out;
-
ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
if (ret)
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Xattrop failed.");
- flock.l_type = F_UNLCK;
- syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY,
- loc, F_SETLKW, &flock, NULL, NULL);
-
out:
if (xattr)
dict_unref(xattr);
- return;
+
+ return ret;
}
void
-afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc,
+ struct subvol_healer *healer,
+ dict_t *pre_crawl_xdata)
{
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- struct iatt stbuf = {
- 0,
- };
- int *raw = NULL;
+ int ret_lock = 0;
int ret = 0;
- int i = 0;
+ dict_t *post_crawl_xdata = NULL;
- priv = this->private;
-
- loc->parent = inode_ref(this->itable->root);
- gf_uuid_copy(loc->pargfid, loc->parent->gfid);
- loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
- loc->inode = inode_new(loc->parent->table);
- if (!loc->inode) {
- goto out;
- }
+ ret_lock = afr_ta_post_op_lock(this, loc);
+ if (ret_lock)
+ goto unref;
- ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
- 0, 0, 0);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Failed lookup on file %s.", loc->name);
- goto out;
- }
-
- gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
- gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata);
+ if (ret)
+ goto unref;
- xattr = dict_new();
- if (!xattr) {
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED,
- "Failed to create dict.");
- goto out;
+ if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) {
+ ret = -1;
+ goto unref;
}
- for (i = 0; i < priv->child_count; i++) {
- raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
- if (!raw) {
- goto out;
- }
+ ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol);
- ret = dict_set_bin(xattr, priv->pending_key[i], raw,
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret) {
- GF_FREE(raw);
- goto out;
- }
+unref:
+ if (post_crawl_xdata) {
+ dict_unref(post_crawl_xdata);
+ post_crawl_xdata = NULL;
}
- ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
- GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
- if (ret) {
- gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Xattrop failed.");
- goto out;
- }
- if (!(*xdata))
- gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED,
- "Xdata response is empty.");
+ if (ret || ret_lock)
+ healer->rerun = 1;
-out:
- if (xattr)
- dict_unref(xattr);
- return;
+ if (!ret_lock)
+ afr_ta_post_op_unlock(this, loc);
}
void *
@@ -723,7 +794,7 @@ afr_shd_index_healer(void *data)
xlator_t *this = NULL;
int ret = 0;
afr_private_t *priv = NULL;
- dict_t *xdata = NULL;
+ dict_t *pre_crawl_xdata = NULL;
loc_t loc = {
0,
};
@@ -739,8 +810,7 @@ afr_shd_index_healer(void *data)
priv->local[healer->subvol] = healer->local;
if (priv->thin_arbiter_count) {
- loc_wipe(&loc);
- afr_shd_ta_get_xattrs(this, &loc, &xdata);
+ afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
}
do {
@@ -770,15 +840,14 @@ afr_shd_index_healer(void *data)
sleep(1);
} while (ret > 0);
- if (xdata && !healer->crawl_event.heal_failed_count) {
- afr_shd_ta_set_xattrs(this, &loc, &xdata, healer->subvol);
- dict_unref(xdata);
- xdata = NULL;
+ if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) {
+ afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
+ pre_crawl_xdata);
+ dict_unref(pre_crawl_xdata);
+ pre_crawl_xdata = NULL;
}
}
- loc_wipe(&loc);
-
return NULL;
}
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 568293cdf2c..26950fd7927 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -384,6 +384,7 @@ init(xlator_t *this)
priv->child_count--;
priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
priv->ta_notify_dom_lock_offset = 0;
+ *priv->ta_gfid = 0;
}
INIT_LIST_HEAD(&priv->healing);
INIT_LIST_HEAD(&priv->heal_waiting);