 tests/basic/afr/split-brain-favorite-child-policy.t | 175
 xlators/cluster/afr/src/afr-messages.h              |  10
 xlators/cluster/afr/src/afr-self-heal-common.c      | 292
 xlators/cluster/afr/src/afr-self-heal-data.c        |   8
 xlators/cluster/afr/src/afr-self-heal-metadata.c    |   8
 xlators/cluster/afr/src/afr-self-heal.h             |   1
 xlators/cluster/afr/src/afr.c                       |  46
 xlators/cluster/afr/src/afr.h                       |  13
 xlators/mgmt/glusterd/src/glusterd-volume-set.c     |   6
 9 files changed, 537 insertions(+), 22 deletions(-)
diff --git a/tests/basic/afr/split-brain-favorite-child-policy.t b/tests/basic/afr/split-brain-favorite-child-policy.t
new file mode 100644
index 00000000000..66fcd67a031
--- /dev/null
+++ b/tests/basic/afr/split-brain-favorite-child-policy.t
@@ -0,0 +1,175 @@
+#!/bin/bash
+
+#Test the favorite-child-policy based split-brain resolution.
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+#Create replica 2 volume
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+TEST touch $M0/file
+
+############ Healing using favorite-child-policy = ctime #################
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the first brick has the latest ctime.
+LATEST_CTIME_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy ctime
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
+TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = mtime #################
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second brick has the latest mtime.
+LATEST_CTIME_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy mtime
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
+TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = size #################
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second brick has the bigger file.
+BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy size
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
+TEST [ "$BIGGER_FILE_MD5" == "$HEALED_MD5" ]
+
+############ Healing using favorite-child-policy = majority on replica-3 #################
+
+#Convert volume to replica-3
+TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+TEST $CLI volume set $V0 cluster.quorum-type none
+TEST $CLI volume set $V0 cluster.favorite-child-policy none
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=1024
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10240
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+
+#file still in split-brain
+cat $M0/file > /dev/null
+EXPECT "1" echo $?
+
+#We know that the second and third bricks agree with each other. Pick any one of them.
+MAJORITY_MD5=$(md5sum $B0/${V0}1/file | cut -d\  -f1)
+TEST $CLI volume set $V0 cluster.favorite-child-policy majority
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+cat $M0/file > /dev/null
+EXPECT "0" echo $?
+HEALED_MD5=$(md5sum $B0/${V0}0/file | cut -d\  -f1)
+TEST [ "$MAJORITY_MD5" == "$HEALED_MD5" ]
+
+TEST force_umount $M0
+cleanup
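
Note how the test detects an unresolved split-brain: cat on the mount fails with EIO, so the exit status checked by EXPECT is "1". Outside the test harness the same state can be confirmed directly; a minimal sketch, reusing the $V0/$M0 conventions of the harness above:

    # List the files AFR currently considers to be in split-brain.
    gluster volume heal $V0 info split-brain

    # Reads through the mount keep failing until a policy (or a manual
    # CLI resolution) picks a source.
    cat $M0/file > /dev/null || echo "file is still in split-brain"
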
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index fac37b8c34b..c7af18d0f25 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -40,7 +40,7 @@
  */
 #define GLFS_COMP_BASE_AFR      GLFS_MSGID_COMP_AFR
-#define GLFS_NUM_MESSAGES       41
+#define GLFS_NUM_MESSAGES       42
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
 
 #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
@@ -361,5 +361,13 @@
 */
 #define AFR_MSG_TIMER_CREATE_FAIL               (GLFS_COMP_BASE_AFR + 41)
 
+/*!
+ * @messageid 108042
+ * @diagnosis Log messages relating to automated resolution of split-brain
+ * files based on favorite child policies.
+ * @recommendedaction
+*/
+#define AFR_MSG_SBRAIN_FAV_CHILD_POLICY  (GLFS_COMP_BASE_AFR + 42)
+
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
 
 #endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 0b92f616030..a4c0e89e434 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -466,28 +466,20 @@ afr_dict_contains_heal_op (call_frame_t *frame)
         return _gf_true;
 }
 
-/* Return a source depending on the type of heal_op, and set sources[source],
- * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
- * only if the following condition is met:
- * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
- * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
- * sinks[node] are 1. This should be the case if the file is in split-brain.
- */
 int
-afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
-                                   unsigned char *sources,
+afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
+                                   xlator_t *this, unsigned char *sources,
                                    unsigned char *sinks,
                                    unsigned char *healed_sinks,
                                    unsigned char *locked_on,
                                    struct afr_reply *replies,
-                                   afr_transaction_type type)
+                                   afr_transaction_type type, int heal_op)
 {
         afr_local_t   *local     = NULL;
         afr_private_t *priv      = NULL;
         dict_t        *xdata_req = NULL;
         dict_t        *xdata_rsp = NULL;
         int            ret       = 0;
-        int            heal_op   = -1;
         int            i         = 0;
         char          *name      = NULL;
         int            source     = -1;
@@ -496,10 +488,6 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
         priv = this->private;
         xdata_req = local->xdata_req;
 
-        ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
-        if (ret)
-                goto out;
-
         for (i = 0; i < priv->child_count; i++) {
                 if (locked_on[i])
                         if (sources[i] || !sinks[i] || !healed_sinks[i]) {
@@ -598,6 +586,280 @@ out:
 }
 
 
+int
+afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies,
+                        inode_t *inode)
+{
+        afr_private_t   *priv;
+        int             vote_count = -1;
+        int             fav_child = -1;
+        int             i = 0;
+        int             k = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                "mtime_sec = %d, size = %lu for gfid %s",
+                                priv->children[i]->name,
+                                replies[i].poststat.ia_mtime,
+                                replies[i].poststat.ia_size,
+                                uuid_utoa (inode->gfid));
+                        vote_count = 0;
+                        for (k = 0; k < priv->child_count; k++) {
+                                if ((replies[k].poststat.ia_mtime ==
+                                     replies[i].poststat.ia_mtime) &&
+                                    (replies[k].poststat.ia_size ==
+                                     replies[i].poststat.ia_size)) {
+                                        vote_count++;
+                                }
+                        }
+                        if (vote_count > priv->child_count/2) {
+                                fav_child = i;
+                                break;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
+int
+afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+        afr_private_t *priv;
+        int fav_child = -1;
+        int i = 0;
+        uint32_t cmp_mtime = 0;
+        uint32_t cmp_mtime_nsec = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                "mtime = %d, mtime_nsec = %d for gfid %s",
+                                priv->children[i]->name,
+                                replies[i].poststat.ia_mtime,
+                                replies[i].poststat.ia_mtime_nsec,
+                                uuid_utoa (inode->gfid));
+                        if (replies[i].poststat.ia_mtime > cmp_mtime) {
+                                cmp_mtime = replies[i].poststat.ia_mtime;
+                                cmp_mtime_nsec =
+                                        replies[i].poststat.ia_mtime_nsec;
+                                fav_child = i;
+                        } else if ((replies[i].poststat.ia_mtime == cmp_mtime)
+                                    && (replies[i].poststat.ia_mtime_nsec >
+                                    cmp_mtime_nsec)) {
+                                cmp_mtime = replies[i].poststat.ia_mtime;
+                                cmp_mtime_nsec =
+                                        replies[i].poststat.ia_mtime_nsec;
+                                fav_child = i;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
+int
+afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+        afr_private_t *priv;
+        int fav_child = -1;
+        int i = 0;
+        uint32_t cmp_ctime = 0;
+        uint32_t cmp_ctime_nsec = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                "ctime = %d, ctime_nsec = %d for gfid %s",
+                                priv->children[i]->name,
+                                replies[i].poststat.ia_ctime,
+                                replies[i].poststat.ia_ctime_nsec,
+                                uuid_utoa (inode->gfid));
+                        if (replies[i].poststat.ia_ctime > cmp_ctime) {
+                                cmp_ctime = replies[i].poststat.ia_ctime;
+                                cmp_ctime_nsec =
+                                        replies[i].poststat.ia_ctime_nsec;
+                                fav_child = i;
+                        } else if ((replies[i].poststat.ia_ctime == cmp_ctime)
+                                    && (replies[i].poststat.ia_ctime_nsec >
+                                    cmp_ctime_nsec)) {
+                                cmp_ctime = replies[i].poststat.ia_ctime;
+                                cmp_ctime_nsec =
+                                        replies[i].poststat.ia_ctime_nsec;
+                                fav_child = i;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size.
+ */
+int
+afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+        afr_private_t *priv;
+        int fav_child = -1;
+        int i = 0;
+        uint64_t cmp_sz = 0;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].valid == 1) {
+                        gf_msg_debug (this->name, 0, "Child:%s "
+                                "file size = %lu for gfid %s",
+                                priv->children[i]->name,
+                                replies[i].poststat.ia_size,
+                                uuid_utoa (inode->gfid));
+                        if (replies[i].poststat.ia_size > cmp_sz) {
+                                cmp_sz = replies[i].poststat.ia_size;
+                                fav_child = i;
+                        }
+                }
+        }
+        return fav_child;
+}
+
+
+int
+afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
+                                             xlator_t *this,
+                                             inode_t *inode,
+                                             unsigned char *sources,
+                                             unsigned char *sinks,
+                                             unsigned char *healed_sinks,
+                                             unsigned char *locked_on,
+                                             struct afr_reply *replies,
+                                             afr_transaction_type type)
+{
+        afr_private_t *priv = NULL;
+        int fav_child = -1;
+        char mtime_str[256];
+        char ctime_str[256];
+        char *policy_str = NULL;
+        struct tm *tm_ptr;
+        time_t time;
+
+        priv = this->private;
+        if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MAJORITY) {
+                fav_child = afr_sh_fav_by_majority (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "MAJORITY";
+        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MTIME) {
+                fav_child = afr_sh_fav_by_mtime (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "MTIME";
+        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_CTIME) {
+                fav_child = afr_sh_fav_by_ctime (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "CTIME";
+        } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_SIZE) {
+                fav_child = afr_sh_fav_by_size (this, replies, inode);
+                if (fav_child >= 0)
+                        policy_str = "SIZE";
+        }
+
+        if (fav_child > priv->child_count - 1) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Invalid child (%d) "
+                        "selected by policy %s.", fav_child, policy_str);
+        } else if (fav_child >= 0) {
+                time = replies[fav_child].poststat.ia_mtime;
+                tm_ptr = localtime (&time);
+                strftime (mtime_str, sizeof (mtime_str), "%Y-%m-%d %H:%M:%S",
+                          tm_ptr);
+                time = replies[fav_child].poststat.ia_ctime;
+                tm_ptr = localtime (&time);
+                strftime (ctime_str, sizeof (ctime_str), "%Y-%m-%d %H:%M:%S",
+                          tm_ptr);
+
+                gf_msg (this->name, GF_LOG_WARNING, 0,
+                        AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Source %s "
+                        "selected as authentic to resolve conflicting "
+                        "data in file (gfid:%s) by %s (%lu bytes @ %s mtime, "
+                        "%s ctime).",
+                        priv->children[fav_child]->name,
+                        uuid_utoa (inode->gfid),
+                        policy_str,
+                        replies[fav_child].poststat.ia_size,
+                        mtime_str,
+                        ctime_str);
+
+                sources[fav_child] = 1;
+                sinks[fav_child] = 0;
+                healed_sinks[fav_child] = 0;
+        }
+        return fav_child;
+}
+
+/* Return a source depending on the type of heal_op, and set sources[source],
+ * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
+ * only if the following condition is met:
+ * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and
+ * sinks[node] are 1. This should be the case if the file is in split-brain.
+ */
+int
+afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+                                   inode_t *inode,
+                                   unsigned char *sources,
+                                   unsigned char *sinks,
+                                   unsigned char *healed_sinks,
+                                   unsigned char *locked_on,
+                                   struct afr_reply *replies,
+                                   afr_transaction_type type)
+{
+        afr_local_t *local = NULL;
+        afr_private_t *priv = NULL;
+        dict_t *xdata_req = NULL;
+        int heal_op = -1;
+        int ret = -1;
+
+        local = frame->local;
+        priv = this->private;
+        xdata_req = local->xdata_req;
+
+        ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
+        if (ret)
+                goto autoheal;
+
+        ret = afr_mark_split_brain_source_sinks_by_heal_op (frame, this,
+                                                            sources, sinks,
+                                                            healed_sinks,
+                                                            locked_on, replies,
+                                                            type, heal_op);
+        return ret;
+
+autoheal:
+        /* Automatically heal if fav_child_policy is set. */
+        if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+                ret = afr_mark_split_brain_source_sinks_by_policy (frame, this,
+                                                                   inode,
+                                                                   sources,
+                                                                   sinks,
+                                                                   healed_sinks,
+                                                                   locked_on,
+                                                                   replies,
+                                                                   type);
+        }
+
+        return ret;
+}
+
 gf_boolean_t
 afr_does_witness_exist (xlator_t *this, uint64_t *witness)
 {
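
In afr_sh_fav_by_majority() above, brick i receives a vote from every brick whose whole-second mtime (ia_mtime) and size (ia_size) both match its own, and becomes the source once vote_count exceeds child_count/2; sub-second mtime divergence therefore still counts as agreement. The other pickers are plain maxima (latest ctime/mtime with a nanosecond tie-break, or biggest size). Which brick a pending resolution would pick can be previewed from the backend with stat, whose %Y and %s fields are exactly the two values the vote compares — a sketch, assuming the replica-3 brick paths used after the add-brick in the test above:

    # Print "<mtime-seconds> <size>" per brick; bricks printing identical
    # pairs vote for each other under the majority policy.
    for b in $B0/${V0}0 $B0/${V0}1 $B0/${V0}2; do
            echo "$b: $(stat -c '%Y %s' $b/file)"
    done

When a policy does pick a source, afr_mark_split_brain_source_sinks_by_policy() logs the decision at warning level under AFR_MSG_SBRAIN_FAV_CHILD_POLICY, naming the chosen brick, the gfid, the policy, and the file's size and timestamps. Assuming the default log location of the self-heal daemon (it can differ per installation), past decisions can be audited by grepping for the fixed part of that message:

    # "selected as authentic" is the invariant substring of the warning
    # emitted in the code above.
    grep "selected as authentic" /var/log/glusterfs/glustershd.log
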
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index f4cd16c3a70..2a33e53764c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -568,6 +568,7 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,
 
 static int
 __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
+                                     inode_t *inode,
                                      unsigned char *sources,
                                      unsigned char *sinks,
                                      unsigned char *healed_sinks,
@@ -585,7 +586,7 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
         if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
             || !sources_count) {
                 /* split brain */
-                source = afr_mark_split_brain_source_sinks (frame, this,
+                source = afr_mark_split_brain_source_sinks (frame, this, inode,
                                                             sources, sinks,
                                                             healed_sinks,
                                                             locked_on, replies,
@@ -663,8 +664,9 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
         */
         AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
-        source = __afr_selfheal_data_finalize_source (frame, this, sources,
-                                                      sinks, healed_sinks,
+        source = __afr_selfheal_data_finalize_source (frame, this, inode,
+                                                      sources, sinks,
+                                                      healed_sinks,
                                                       locked_on, replies,
                                                       witness);
 
         if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 25d8b98adda..130a3daa203 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -196,6 +196,7 @@ out:
 
 static int
 __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
+                                         inode_t *inode,
                                          unsigned char *sources,
                                          unsigned char *sinks,
                                          unsigned char *healed_sinks,
@@ -215,7 +216,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
 
         if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
             || !sources_count) {
-                source = afr_mark_split_brain_source_sinks (frame, this,
+                source = afr_mark_split_brain_source_sinks (frame, this, inode,
                                                             sources, sinks,
                                                             healed_sinks,
                                                             locked_on, replies,
@@ -352,8 +353,9 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
                 }
         }
 
-        source = __afr_selfheal_metadata_finalize_source (frame, this, sources,
-                                                          sinks, healed_sinks,
+        source = __afr_selfheal_metadata_finalize_source (frame, this, inode,
+                                                          sources, sinks,
+                                                          healed_sinks,
                                                           locked_on, replies);
 
         if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index be787683c19..ec5337e60b2 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -220,6 +220,7 @@ afr_dict_contains_heal_op (call_frame_t *frame);
 
 int
 afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+                                   inode_t *inode,
                                    unsigned char *sources,
                                    unsigned char *sinks,
                                    unsigned char *healed_sinks,
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index bc417a4a2c8..6f4783c9213 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -20,6 +20,15 @@
 
 struct volume_options options[];
 
+static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
+        [AFR_FAV_CHILD_NONE] = "none",
+        [AFR_FAV_CHILD_BY_SIZE] = "size",
+        [AFR_FAV_CHILD_BY_CTIME] = "ctime",
+        [AFR_FAV_CHILD_BY_MTIME] = "mtime",
+        [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
+        [AFR_FAV_CHILD_POLICY_MAX] = NULL,
+};
+
 int32_t
 notify (xlator_t *this, int32_t event,
         void *data, ...)
@@ -101,6 +110,19 @@ fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype,
 }
 
 int
+afr_set_favorite_child_policy (afr_private_t *priv, char *policy)
+{
+        int index = -1;
+
+        index = gf_get_index_by_elem (afr_favorite_child_policies, policy);
+        if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
+                return -1;
+
+        priv->fav_child_policy = index;
+
+        return 0;
+}
+int
 reconfigure (xlator_t *this, dict_t *options)
 {
         afr_private_t *priv        = NULL;
@@ -109,6 +131,7 @@ reconfigure (xlator_t *this, dict_t *options)
         int            ret         = -1;
         int            index       = -1;
         char          *qtype       = NULL;
+        char          *fav_child_policy = NULL;
 
         priv = this->private;
 
@@ -228,6 +251,11 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("shd-wait-qlength", priv->shd.wait_qlength,
                           options, uint32, out);
 
+        GF_OPTION_RECONF ("favorite-child-policy", fav_child_policy, options,
+                          str, out);
+        if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1)
+                goto out;
+
         priv->did_discovery = _gf_false;
 
         ret = 0;
@@ -259,6 +287,7 @@ init (xlator_t *this)
         int            read_subvol_index = -1;
         xlator_t      *fav_child   = NULL;
         char          *qtype       = NULL;
+        char          *fav_child_policy = NULL;
 
         if (!this->children) {
                 gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -338,6 +367,10 @@ init (xlator_t *this)
                         fav_child->name, fav_child->name);
         }
 
+        GF_OPTION_INIT ("favorite-child-policy", fav_child_policy, str, out);
+        if (afr_set_favorite_child_policy (priv, fav_child_policy) == -1)
+                goto out;
+
         GF_OPTION_INIT ("shd-max-threads", priv->shd.max_threads,
                          uint32, out);
 
@@ -886,5 +919,18 @@ struct volume_options options[] = {
                          "granular way of recording changelogs and doing entry "
                          "self-heal.",
         },
+        { .key   = {"favorite-child-policy"},
+          .type  = GF_OPTION_TYPE_STR,
+          .value = {"none", "size", "ctime", "mtime", "majority"},
+          .default_value = "none",
+          .description = "This option can be used to automatically resolve "
+                         "split-brains using various policies without user "
+                         "intervention. \"size\" picks the file with the "
+                         "biggest size as the source. \"ctime\" and \"mtime\" "
+                         "pick the file with the latest ctime and mtime "
+                         "respectively as the source. \"majority\" picks a "
+                         "file with identical mtime and size in more than "
+                         "half the number of bricks in the replica.",
+        },
         { .key  = {NULL} },
 };
\"majority\" picks a file" +                         " with identical mtime and size in more than half the " +                         "number of bricks in the replica.", +        },          { .key  = {NULL} },  }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 7d270ea94e7..5482dab25b2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -50,6 +50,16 @@ typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);  #define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})  #define AFR_CMP(a1,a2,len) ({int __cmp = 0; int __i; for (__i = 0; __i < len; __i++) if (a1[__i] != a2[__i]) { __cmp = 1; break;} __cmp;})  #define AFR_IS_ARBITER_BRICK(priv, index) ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX)) + +typedef enum { +        AFR_FAV_CHILD_NONE, +        AFR_FAV_CHILD_BY_SIZE, +        AFR_FAV_CHILD_BY_CTIME, +        AFR_FAV_CHILD_BY_MTIME, +        AFR_FAV_CHILD_BY_MAJORITY, +        AFR_FAV_CHILD_POLICY_MAX, +} afr_favorite_child_policy; +  typedef struct _afr_private {          gf_lock_t lock;               /* to guard access to child_count, etc */          unsigned int child_count;     /* total number of children   */ @@ -94,6 +104,9 @@ typedef struct _afr_private {          int favorite_child;  /* subvolume to be preferred in resolving                                           split-brain cases */ +        afr_favorite_child_policy fav_child_policy;/*Policy to use for automatic +                                                 resolution of split-brains.*/ +          gf_boolean_t inodelk_trace;          gf_boolean_t entrylk_trace; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 134ea8639f0..36e5483ee87 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -1293,6 +1293,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {            .op_version = GD_OP_VERSION_3_7_10,            .flags      = OPT_FLAG_CLIENT_OPT          }, +        { .key        = "cluster.favorite-child-policy", +          .voltype    = "cluster/replicate", +          .type       = DOC, +          .op_version = GD_OP_VERSION_3_7_12, +          .flags      = OPT_FLAG_CLIENT_OPT +        },          /* stripe xlator options */          { .key         = "cluster.stripe-block-size",  | 
