From 6f08b9f2b006a4eafaa176cfd792038eed7f6c98 Mon Sep 17 00:00:00 2001
From: Ravishankar N
Date: Wed, 25 May 2016 21:18:19 +0530
Subject: afr: Automagic unsplit-brain by [ctime|mtime|size|majority]

Backport of http://review.gluster.org/#/c/14026/

Introduce cluster.favorite-child-policy which, when enabled with
[ctime|mtime|size|majority], automatically heals files that are in
split-brain. The majority policy will not pick a source if there is no
majority. The other three policies pick the first brick with a valid
reply and non-zero ctime/mtime/size as source.

Change-Id: I93623a914dce2839957fce87b514050e9d274d4c
BUG: 1339639
Signed-off-by: Ravishankar N
Reviewed-on: http://review.gluster.org/14535
Smoke: Gluster Build System
NetBSD-regression: NetBSD Build System
CentOS-regression: Gluster Build System
Reviewed-by: Pranith Kumar Karampuri
---
 .../basic/afr/split-brain-favorite-child-policy.t | 175 ++++++++++++
 xlators/cluster/afr/src/afr-messages.h            |  10 +-
 xlators/cluster/afr/src/afr-self-heal-common.c    | 292 +++++++++++++++++++--
 xlators/cluster/afr/src/afr-self-heal-data.c      |   8 +-
 xlators/cluster/afr/src/afr-self-heal-metadata.c  |   8 +-
 xlators/cluster/afr/src/afr-self-heal.h           |   1 +
 xlators/cluster/afr/src/afr.c                     |  46 ++++
 xlators/cluster/afr/src/afr.h                     |  13 +
 xlators/mgmt/glusterd/src/glusterd-volume-set.c   |   6 +
 9 files changed, 537 insertions(+), 22 deletions(-)
 create mode 100644 tests/basic/afr/split-brain-favorite-child-policy.t

diff --git a/tests/basic/afr/split-brain-favorite-child-policy.t b/tests/basic/afr/split-brain-favorite-child-policy.t
new file mode 100644
index 00000000000..66fcd67a031
--- /dev/null
+++ b/tests/basic/afr/split-brain-favorite-child-policy.t
@@ -0,0 +1,175 @@
+#!/bin/bash
+
+#Test automatic resolution of split-brain files using cluster.favorite-child-policy.
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+#Create replica 2 volume
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+TEST touch $M0/file
+
+############ Healing using favorite-child-policy = ctime #################
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the first brick has the latest ctime.
+LATEST_CTIME_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy ctime
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = mtime #################
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second brick has the latest mtime.
+LATEST_CTIME_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy mtime
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = size #################
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second brick has the bigger file.
+BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy size
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST [ "$BIGGER_FILE_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = majority on replica-3 #################
+
+#Convert volume to replica-3
+TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+TEST $CLI volume set $V0 cluster.quorum-type none
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second and third bricks agree with each other. Pick any one of them.
+MAJORITY_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1)
+TEST [ "$MAJORITY_MD5" == "$HEALED_MD5" ]
+
+TEST force_umount $M0
+cleanup
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index fac37b8c34b..c7af18d0f25 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -40,7 +40,7 @@
  */
 
 #define GLFS_COMP_BASE_AFR GLFS_MSGID_COMP_AFR
-#define GLFS_NUM_MESSAGES 41
+#define GLFS_NUM_MESSAGES 42
 #define GLFS_MSGID_END (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
 
 #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
@@ -361,5 +361,13 @@
  */
 #define AFR_MSG_TIMER_CREATE_FAIL (GLFS_COMP_BASE_AFR + 41)
 
+/*!
+ * @messageid 108042
+ * @diagnosis Log messages relating to automated resolution of split-brain files
+ * based on favorite child policies.
+ * @recommendedaction
+*/
+#define AFR_MSG_SBRAIN_FAV_CHILD_POLICY (GLFS_COMP_BASE_AFR + 42)
+
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
 #endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 0b92f616030..a4c0e89e434 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -466,28 +466,20 @@ afr_dict_contains_heal_op (call_frame_t *frame)
         return _gf_true;
 }
 
-/* Return a source depending on the type of heal_op, and set sources[source],
- * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
- * only if the following condition is met:
- * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
- * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
- * sinks[node] are 1. This should be the case if the file is in split-brain.
- */
 int
-afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
-                                   unsigned char *sources,
+afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
+                                   xlator_t *this, unsigned char *sources,
                                    unsigned char *sinks,
                                    unsigned char *healed_sinks,
                                    unsigned char *locked_on,
                                    struct afr_reply *replies,
-                                   afr_transaction_type type)
+                                   afr_transaction_type type, int heal_op)
 {
         afr_local_t *local = NULL;
         afr_private_t *priv = NULL;
         dict_t *xdata_req = NULL;
         dict_t *xdata_rsp = NULL;
         int ret = 0;
-        int heal_op = -1;
         int i = 0;
         char *name = NULL;
         int source = -1;
@@ -496,10 +488,6 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
         priv = this->private;
         xdata_req = local->xdata_req;
 
-        ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
-        if (ret)
-                goto out;
-
         for (i = 0; i < priv->child_count; i++) {
                 if (locked_on[i])
                         if (sources[i] || !sinks[i] || !healed_sinks[i]) {
@@ -598,6 +586,280 @@ out:
 }
 
+
+int
+afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,
+                        inode_t *inode)
+{
+        afr_private_t *priv;
+        int vote_count = -1;
+        int fav_child = -1;
+        int i = 0;
+        int k = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                      "mtime_sec = %d, size = %lu for gfid %s",
+                                      priv->children[i]->name,
+                                      replies[i].poststat.ia_mtime,
+                                      replies[i].poststat.ia_size,
+                                      uuid_utoa (inode->gfid));
+                        vote_count = 0;
+                        for (k = 0; k < priv->child_count; k++) {
+                                if ((replies[k].poststat.ia_mtime ==
+                                     replies[i].poststat.ia_mtime) &&
+                                    (replies[k].poststat.ia_size ==
+                                     replies[i].poststat.ia_size)
+                                   ) {
+                                        vote_count++;
+                                }
+                        }
+                        if (vote_count > priv->child_count/2) {
+                                fav_child = i;
+                                break;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
+int
+afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+        afr_private_t *priv;
+        int fav_child = -1;
+        int i = 0;
+        uint32_t cmp_mtime = 0;
+        uint32_t cmp_mtime_nsec = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                      "mtime = %d, mtime_nsec = %d for gfid %s",
+                                      priv->children[i]->name,
+                                      replies[i].poststat.ia_mtime,
+                                      replies[i].poststat.ia_mtime_nsec,
+                                      uuid_utoa (inode->gfid));
+                        if (replies[i].poststat.ia_mtime > cmp_mtime) {
+                                cmp_mtime = replies[i].poststat.ia_mtime;
+                                cmp_mtime_nsec =
+                                        replies[i].poststat.ia_mtime_nsec;
+                                fav_child = i;
+                        } else if ((replies[i].poststat.ia_mtime == cmp_mtime)
+                                   && (replies[i].poststat.ia_mtime_nsec >
+                                       cmp_mtime_nsec)) {
+                                cmp_mtime = replies[i].poststat.ia_mtime;
+                                cmp_mtime_nsec =
+                                        replies[i].poststat.ia_mtime_nsec;
+                                fav_child = i;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
+int
+afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+        afr_private_t *priv;
+        int fav_child = -1;
+        int i = 0;
+        uint32_t cmp_ctime = 0;
+        uint32_t cmp_ctime_nsec = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                      "ctime = %d, ctime_nsec = %d for gfid %s",
+                                      priv->children[i]->name,
+                                      replies[i].poststat.ia_ctime,
+                                      replies[i].poststat.ia_ctime_nsec,
+                                      uuid_utoa (inode->gfid));
+                        if (replies[i].poststat.ia_ctime > cmp_ctime) {
+                                cmp_ctime = replies[i].poststat.ia_ctime;
+                                cmp_ctime_nsec =
+                                        replies[i].poststat.ia_ctime_nsec;
+                                fav_child = i;
+                        } else if ((replies[i].poststat.ia_ctime == cmp_ctime)
+                                   && (replies[i].poststat.ia_ctime_nsec >
+                                       cmp_ctime_nsec)) {
+                                cmp_ctime = replies[i].poststat.ia_ctime;
+                                cmp_ctime_nsec =
+                                        replies[i].poststat.ia_ctime_nsec;
+                                fav_child = i;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size.
+ */
+int
+afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+        afr_private_t *priv;
+        int fav_child = -1;
+        int i = 0;
+        uint64_t cmp_sz = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                      "file size = %lu for gfid %s",
+                                      priv->children[i]->name,
+                                      replies[i].poststat.ia_size,
+                                      uuid_utoa (inode->gfid));
+                        if (replies[i].poststat.ia_size > cmp_sz) {
+                                cmp_sz = replies[i].poststat.ia_size;
+                                fav_child = i;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+
+int
+afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
+                                             xlator_t *this,
+                                             inode_t *inode,
+                                             unsigned char *sources,
+                                             unsigned char *sinks,
+                                             unsigned char *healed_sinks,
+                                             unsigned char *locked_on,
+                                             struct afr_reply *replies,
+                                             afr_transaction_type type)
+{
+        afr_private_t *priv = NULL;
+        int fav_child = -1;
+        char mtime_str[256];
+        char ctime_str[256];
+        char *policy_str = NULL;
+        struct tm *tm_ptr;
+        time_t time;
+
+        priv = this->private;
+        if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MAJORITY) {
+                fav_child = afr_sh_fav_by_majority (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "MAJORITY";
+        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MTIME) {
+                fav_child = afr_sh_fav_by_mtime (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "MTIME";
+        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_CTIME) {
+                fav_child = afr_sh_fav_by_ctime (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "CTIME";
+        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_SIZE) {
+                fav_child = afr_sh_fav_by_size (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "SIZE";
+        }
+
+        if (fav_child > priv->child_count - 1) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Invalid child (%d) "
+                        "selected by policy %s.", fav_child, policy_str);
+        } else if (fav_child >= 0) {
+                time = replies[fav_child].poststat.ia_mtime;
+                tm_ptr = localtime (&time);
+                strftime (mtime_str, sizeof (mtime_str), "%Y-%m-%d %H:%M:%S",
+                          tm_ptr);
+                time = replies[fav_child].poststat.ia_ctime;
+                tm_ptr = localtime (&time);
+                strftime (ctime_str, sizeof (ctime_str), "%Y-%m-%d %H:%M:%S",
+                          tm_ptr);
+
+                gf_msg (this->name, GF_LOG_WARNING, 0,
+                        AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Source %s "
+                        "selected as authentic to resolve conflicting "
+                        "data in file (gfid:%s) by %s (%lu bytes @ %s mtime, "
+                        "%s ctime).",
+                        priv->children[fav_child]->name,
+                        uuid_utoa (inode->gfid),
+                        policy_str,
+                        replies[fav_child].poststat.ia_size,
+                        mtime_str,
+                        ctime_str);
+
+                sources[fav_child] = 1;
+                sinks[fav_child] = 0;
+                healed_sinks[fav_child] = 0;
+        }
+        return fav_child;
+}
+
+/* Return a source depending on the type of heal_op, and set sources[source],
+ * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
+ * only if the following condition is met:
+ * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
+ * sinks[node] are 1. This should be the case if the file is in split-brain.
+ */
+int
+afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+                                   inode_t *inode,
+                                   unsigned char *sources,
+                                   unsigned char *sinks,
+                                   unsigned char *healed_sinks,
+                                   unsigned char *locked_on,
+                                   struct afr_reply *replies,
+                                   afr_transaction_type type)
+{
+        afr_local_t *local = NULL;
+        afr_private_t *priv = NULL;
+        dict_t *xdata_req = NULL;
+        int heal_op = -1;
+        int ret = -1;
+
+        local = frame->local;
+        priv = this->private;
+        xdata_req = local->xdata_req;
+
+        ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
+        if (ret)
+                goto autoheal;
+
+        ret = afr_mark_split_brain_source_sinks_by_heal_op (frame, this,
+                                                            sources, sinks,
+                                                            healed_sinks,
+                                                            locked_on, replies,
+                                                            type, heal_op);
+        return ret;
+
+autoheal:
+        /* Automatically heal if fav_child_policy is set. */
+        if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+                ret = afr_mark_split_brain_source_sinks_by_policy (frame, this,
+                                                                   inode,
+                                                                   sources,
+                                                                   sinks,
+                                                                   healed_sinks,
+                                                                   locked_on,
+                                                                   replies,
+                                                                   type);
+        }
+
+        return ret;
+}
+
 
 gf_boolean_t
 afr_does_witness_exist (xlator_t *this, uint64_t *witness)
 {
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index f4cd16c3a70..2a33e53764c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -568,6 +568,7 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,
 
 static int
 __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
+                                     inode_t *inode,
                                      unsigned char *sources,
                                      unsigned char *sinks,
                                      unsigned char *healed_sinks,
@@ -585,7 +586,7 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
         if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
             || !sources_count) {
                 /* split brain */
-                source = afr_mark_split_brain_source_sinks (frame, this,
+                source = afr_mark_split_brain_source_sinks (frame, this, inode,
                                                             sources, sinks,
                                                             healed_sinks,
                                                             locked_on, replies,
@@ -663,8 +664,9 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
         */
         AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
 
-        source = __afr_selfheal_data_finalize_source (frame, this, sources,
-                                                      sinks, healed_sinks,
+        source = __afr_selfheal_data_finalize_source (frame, this, inode,
+                                                      sources, sinks,
+                                                      healed_sinks,
                                                       locked_on, replies,
                                                       witness);
         if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 25d8b98adda..130a3daa203 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -196,6 +196,7 @@ out:
 
 static int
 __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
+                                         inode_t *inode,
                                          unsigned char *sources,
                                          unsigned char *sinks,
                                          unsigned char *healed_sinks,
@@ -215,7 +216,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
 
         if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
             || !sources_count) {
-                source = afr_mark_split_brain_source_sinks (frame, this,
+                source = afr_mark_split_brain_source_sinks (frame, this, inode,
                                                             sources, sinks,
                                                             healed_sinks,
                                                             locked_on, replies,
@@ -352,8 +353,9 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
                 }
         }
 
-        source = __afr_selfheal_metadata_finalize_source (frame, this, sources,
-                                                          sinks, healed_sinks,
+        source = __afr_selfheal_metadata_finalize_source (frame, this, inode,
+                                                          sources, sinks,
+                                                          healed_sinks,
                                                           locked_on, replies);
 
         if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index be787683c19..ec5337e60b2 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -220,6 +220,7 @@ afr_dict_contains_heal_op (call_frame_t *frame);
 
 int
 afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+                                   inode_t *inode,
                                    unsigned char *sources,
                                    unsigned char *sinks,
                                    unsigned char *healed_sinks,
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index bc417a4a2c8..6f4783c9213 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -20,6 +20,15 @@ struct volume_options options[];
 
+static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
+        [AFR_FAV_CHILD_NONE] = "none",
+        [AFR_FAV_CHILD_BY_SIZE] = "size",
+        [AFR_FAV_CHILD_BY_CTIME] = "ctime",
+        [AFR_FAV_CHILD_BY_MTIME] = "mtime",
+        [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
+        [AFR_FAV_CHILD_POLICY_MAX] = NULL,
+};
+
 int32_t
 notify (xlator_t *this, int32_t event, void *data, ...)
 {
@@ -100,6 +109,19 @@ fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype,
         }
 }
 
+int
+afr_set_favorite_child_policy (afr_private_t *priv, char *policy)
+{
+        int index = -1;
+
+        index = gf_get_index_by_elem (afr_favorite_child_policies, policy);
+        if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
+                return -1;
+
+        priv->fav_child_policy = index;
+
+        return 0;
+}
 int
 reconfigure (xlator_t *this, dict_t *options)
 {
@@ -109,6 +131,7 @@ reconfigure (xlator_t *this, dict_t *options)
         int ret = -1;
         int index = -1;
         char *qtype = NULL;
+        char *fav_child_policy = NULL;
 
         priv = this->private;
 
@@ -228,6 +251,11 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("shd-wait-qlength", priv->shd.wait_qlength, options,
                           uint32, out);
 
+        GF_OPTION_RECONF ("favorite-child-policy", fav_child_policy, options,
+                          str, out);
+        if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1)
+                goto out;
+
         priv->did_discovery = _gf_false;
 
         ret = 0;
@@ -259,6 +287,7 @@ init (xlator_t *this)
         int read_subvol_index = -1;
         xlator_t *fav_child = NULL;
         char *qtype = NULL;
+        char *fav_child_policy = NULL;
 
         if (!this->children) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -338,6 +367,10 @@ init (xlator_t *this)
                         fav_child->name, fav_child->name);
         }
 
+        GF_OPTION_INIT ("favorite-child-policy", fav_child_policy, str, out);
+        if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+                goto out;
+
         GF_OPTION_INIT ("shd-max-threads", priv->shd.max_threads,
                         uint32, out);
 
@@ -886,5 +919,18 @@ struct volume_options options[] = {
                          "granular way of recording changelogs and doing entry "
                          "self-heal.",
        },
+       { .key = {"favorite-child-policy"},
+         .type = GF_OPTION_TYPE_STR,
+         .value = {"none", "size", "ctime", "mtime", "majority"},
+         .default_value = "none",
+         .description = "This option can be used to automatically resolve "
+                        "split-brains using various policies without user "
+                        "intervention. \"size\" picks the file with the "
+                        "biggest size as the source. \"ctime\" and \"mtime\" "
+                        "pick the file with the latest ctime and mtime "
\"majority\" picks a file" + " with identical mtime and size in more than half the " + "number of bricks in the replica.", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 7d270ea94e7..5482dab25b2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -50,6 +50,16 @@ typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); #define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) #define AFR_CMP(a1,a2,len) ({int __cmp = 0; int __i; for (__i = 0; __i < len; __i++) if (a1[__i] != a2[__i]) { __cmp = 1; break;} __cmp;}) #define AFR_IS_ARBITER_BRICK(priv, index) ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX)) + +typedef enum { + AFR_FAV_CHILD_NONE, + AFR_FAV_CHILD_BY_SIZE, + AFR_FAV_CHILD_BY_CTIME, + AFR_FAV_CHILD_BY_MTIME, + AFR_FAV_CHILD_BY_MAJORITY, + AFR_FAV_CHILD_POLICY_MAX, +} afr_favorite_child_policy; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -94,6 +104,9 @@ typedef struct _afr_private { int favorite_child; /* subvolume to be preferred in resolving split-brain cases */ + afr_favorite_child_policy fav_child_policy;/*Policy to use for automatic + resolution of split-brains.*/ + gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 134ea8639f0..36e5483ee87 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1293,6 +1293,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_7_10, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.favorite-child-policy", + .voltype = "cluster/replicate", + .type = DOC, + .op_version = GD_OP_VERSION_3_7_12, + .flags = OPT_FLAG_CLIENT_OPT + }, /* stripe xlator options */ { .key = "cluster.stripe-block-size", -- cgit