diff options
| author | Anuradha Talur <atalur@redhat.com> | 2015-08-03 17:09:13 +0530 | 
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-08-25 10:32:49 -0700 | 
| commit | 36349fa250ace6109002dfa41305d9dcd54ce0a9 (patch) | |
| tree | ef9c181a2e165d3ac8255915f384b2bfb4e06ded | |
| parent | f3c7e6eaa9b14b89c1d58c0edcb5664f28914437 (diff) | |
cluster/afr : Examine data/metadata readable for read-subvol
Currently, during lookup and discover, read_subvol is chosen
based only on data_readable. It should instead be decided based
on both data_readable and metadata_readable.
Credit to Ravishankar N for the logic of afr_first_up_child
(taken from http://review.gluster.org/10905/ ).
Change-Id: I98580b23c278172ee2902be08eeaafb6722e830c
BUG: 1240244
Signed-off-by: Anuradha Talur <atalur@redhat.com>
Reviewed-on: http://review.gluster.org/11551
Reviewed-by: Ravishankar N <ravishankar@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
| -rw-r--r-- | tests/bugs/replicate/bug-1238398-split-brain-resolution.t | 48 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 77 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 16 | 
3 files changed, 118 insertions, 23 deletions
diff --git a/tests/bugs/replicate/bug-1238398-split-brain-resolution.t b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t new file mode 100644 index 00000000000..7ba09f0dc5d --- /dev/null +++ b/tests/bugs/replicate/bug-1238398-split-brain-resolution.t @@ -0,0 +1,48 @@ +#!/bin/bash +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +cleanup; + +function get_split_brain_status { +        local path=$1 +        echo `getfattr -n replica.split-brain-status $path` | cut -f2 -d"=" | sed -e 's/^"//'  -e 's/"$//' +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume start $V0 + +#Disable self-heal-daemon +TEST $CLI volume set $V0 cluster.self-heal-daemon off + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; + +TEST `echo "some-data" > $M0/metadata-split-brain.txt` + +#Create metadata split-brain +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST chmod 666 $M0/metadata-split-brain.txt + +TEST $CLI volume start $V0 force +TEST kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 + +TEST chmod 757 $M0/metadata-split-brain.txt + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +EXPECT 2 get_pending_heal_count $V0 + +#Inspect the file in metadata-split-brain +EXPECT "data-split-brain:no metadata-split-brain:yes Choices:patchy-client-0,patchy-client-1" get_split_brain_status $M0/metadata-split-brain.txt +TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/metadata-split-brain.txt + +EXPECT "757" stat -c %a $M0/metadata-split-brain.txt + +TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/metadata-split-brain.txt +EXPECT "666" stat -c %a $M0/metadata-split-brain.txt + +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 
99ecd734c69..e9a7c8c3649 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -363,7 +363,8 @@ afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,          if (inode->ia_type == IA_IFDIR) {                  /* For directories, allow even if it is in data split-brain. */ -                if (type == AFR_METADATA_TRANSACTION) { +                if (type == AFR_METADATA_TRANSACTION || +                    local->op == GF_FOP_STAT || local->op == GF_FOP_FSTAT) {                          if (!metadata_count)                                  return -EIO;                  } @@ -1503,6 +1504,40 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,  } +int +afr_read_subvol_decide (inode_t *inode, xlator_t *this, +                        afr_read_subvol_args_t *args) +{ +        int data_subvol  = -1; +        int mdata_subvol = -1; + +        data_subvol = afr_data_subvol_get (inode, this, +                                           0, 0, args); +        mdata_subvol = afr_metadata_subvol_get (inode, this, +                                                0, 0, args); +        if (data_subvol == -1 || mdata_subvol == -1) +                return -1; + +        return data_subvol; +} + +static inline int +afr_first_up_child (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t       *priv  = NULL; +        afr_local_t         *local = NULL; +        int                  i     = 0; + +        local = frame->local; +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) +                if (local->replies[i].valid && +                    local->replies[i].op_ret == 0) +                        return i; +        return 0; +} +  static void  afr_lookup_done (call_frame_t *frame, xlator_t *this)  { @@ -1618,13 +1653,13 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)                  gf_uuid_copy (args.gfid, read_gfid);                  args.ia_type = ia_type;  	
	if (afr_replies_interpret (frame, this, local->inode)) { -			read_subvol = afr_data_subvol_get (local->inode, this, -							   0, 0, &args); +                        read_subvol = afr_read_subvol_decide (local->inode, +                                                              this, &args);  			afr_inode_read_subvol_reset (local->inode, this);  			goto cant_interpret;  		} else { -			read_subvol = afr_data_subvol_get (local->inode, this, -							   0, 0, &args); +                        read_subvol = afr_data_subvol_get (local->inode, this, +                                                           0, 0, &args);  		}  	} else {  	cant_interpret: @@ -1632,7 +1667,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)                          if (spb_choice >= 0)                                  read_subvol = spb_choice;                          else -                                read_subvol = 0; +                                read_subvol = afr_first_up_child (frame, this);                  }  		dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);  	} @@ -1644,7 +1679,7 @@ unwind:                  if (spb_choice >= 0)                          read_subvol = spb_choice;                  else -                        read_subvol = 0; +                        read_subvol = afr_first_up_child (frame, this);          }          par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,                                                        readable); @@ -2024,11 +2059,15 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)          afr_local_t         *local = NULL;  	int                 i = -1;  	int                 op_errno = 0; -	int                 read_subvol = 0; +	int                 spb_choice = -1; +	int                 read_subvol = -1;          priv  = this->private;          local = frame->local; +        afr_inode_split_brain_choice_get (local->inode, this, +                                          &spb_choice); +  	for (i = 0; 
i < priv->child_count; i++) {  		if (!local->replies[i].valid)  			continue; @@ -2046,27 +2085,25 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)  	afr_replies_interpret (frame, this, local->inode); -	read_subvol = afr_data_subvol_get (local->inode, this, 0, 0, NULL); +	read_subvol = afr_read_subvol_decide (local->inode, this, NULL);  	if (read_subvol == -1) {  	        gf_msg (this->name, GF_LOG_WARNING, 0,                          AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s",  			local->loc.path); -		for (i = 0; i < priv->child_count; i++) { -			if (!local->replies[i].valid || -			    local->replies[i].op_ret == -1) -				continue; -			read_subvol = i; -			break; -		} +                if (spb_choice >= 0) { +                        read_subvol = spb_choice; +                } else { +                        read_subvol = afr_first_up_child (frame, this); +                }  	}  unwind:  	if (read_subvol == -1) { -                afr_inode_split_brain_choice_get (local->inode, this, -                                                        &read_subvol); -                if (read_subvol == -1) -                        read_subvol = 0; +                if (spb_choice >= 0) +                        read_subvol = spb_choice; +                else +                        read_subvol = afr_first_up_child (frame, this);          }  	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 1b2faf31eea..a70565c37a1 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -193,12 +193,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  {  	afr_local_t *local = NULL;  	afr_private_t *priv = NULL; +        unsigned char *data = NULL; +        unsigned char *metadata = NULL;  	int read_subvol = -1;  	int event_generation = 0;  	int ret = -1;  	priv = this->private;  	local = frame->local; + 
       data = alloca0 (priv->child_count); +        metadata = alloca0 (priv->child_count);  	afr_read_txn_wipe (frame, this); @@ -213,10 +217,16 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,                  goto read;          } -  	local->transaction.type = type; -	ret = afr_inode_read_subvol_type_get (inode, this, local->readable, -					      &event_generation, type); +        if (local->op == GF_FOP_FSTAT || local->op == GF_FOP_STAT) { +                ret = afr_inode_read_subvol_get (inode, this, data, metadata, +                                                 &event_generation); +                AFR_INTERSECT (local->readable, data, metadata, +                               priv->child_count); +        } else { +                ret = afr_inode_read_subvol_type_get (inode, this, local->readable, +                                                      &event_generation, type); +        }  	if (ret == -1)  		/* very first transaction on this inode */  		goto refresh;  | 
