diff options
| -rw-r--r-- | tests/basic/afr/quorum.t | 23 | ||||
| -rwxr-xr-x | tests/bugs/replicate/bug-977797.t | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 244 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 7 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 1 | 
6 files changed, 164 insertions, 116 deletions
diff --git a/tests/basic/afr/quorum.t b/tests/basic/afr/quorum.t index 252e25468d7..58116ba49f5 100644 --- a/tests/basic/afr/quorum.t +++ b/tests/basic/afr/quorum.t @@ -31,11 +31,7 @@ TEST $CLI volume set $V0 cluster.quorum-count 2  TEST test_write  TEST kill_brick $V0 $H0 $B0/${V0}1  TEST ! test_write -EXPECT "abc" cat $M0/b -TEST $CLI volume set $V0 cluster.quorum-reads on -EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads  TEST ! cat $M0/b -TEST $CLI volume reset $V0 cluster.quorum-reads  TEST $CLI volume set $V0 cluster.quorum-type auto  EXPECT auto volume_option $V0 cluster.quorum-type @@ -44,11 +40,7 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0  TEST test_write  TEST kill_brick $V0 $H0 $B0/${V0}1  TEST ! test_write -EXPECT "abc" cat $M0/b -TEST $CLI volume set $V0 cluster.quorum-reads on -EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads  TEST ! cat $M0/b -TEST $CLI volume reset $V0 cluster.quorum-reads  TEST $CLI volume set $V0 cluster.quorum-type none  EXPECT none volume_option $V0 cluster.quorum-type @@ -57,11 +49,6 @@ TEST test_write  TEST $CLI volume reset $V0 cluster.quorum-type  TEST test_write  EXPECT "abc" cat $M0/b -TEST $CLI volume set $V0 cluster.quorum-reads on -EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads -EXPECT "abc" cat $M0/b -TEST $CLI volume reset $V0 cluster.quorum-reads -  cleanup;  TEST glusterd; @@ -86,24 +73,14 @@ TEST $CLI volume set $V0 cluster.quorum-count 3  TEST test_write  TEST kill_brick $V0 $H0 $B0/${V0}1  TEST ! test_write -EXPECT "abc" cat $M0/b -TEST $CLI volume set $V0 cluster.quorum-reads on -EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads  TEST ! cat $M0/b -TEST $CLI volume reset $V0 cluster.quorum-reads -  TEST $CLI volume set $V0 cluster.quorum-type auto  EXPECT auto volume_option $V0 cluster.quorum-type  TEST test_write  TEST kill_brick $V0 $H0 $B0/${V0}3  TEST ! test_write -EXPECT "abc" cat $M0/b -TEST $CLI volume set $V0 cluster.quorum-reads on -EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "1" mount_get_option_value $M0 $V0-replicate-0 quorum-reads  TEST ! cat $M0/b -TEST $CLI volume reset $V0 cluster.quorum-reads -  TEST $CLI volume set $V0 cluster.quorum-type none  EXPECT none volume_option $V0 cluster.quorum-type diff --git a/tests/bugs/replicate/bug-977797.t b/tests/bugs/replicate/bug-977797.t index ea9a98adc23..fee82054cc3 100755 --- a/tests/bugs/replicate/bug-977797.t +++ b/tests/bugs/replicate/bug-977797.t @@ -53,6 +53,8 @@ TEST chmod 757 $M0/a/file  TEST $CLI volume start $V0 force  EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1; +#Trigger entry heal of $M0/a +getfattr -n user.nosuchattr $M0/a  dd if=$M0/a/file of=/dev/null bs=1024k  #read fails, but heal is triggered.  TEST [ $? -ne 0 ] diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 58256322a3b..2da2dc271d9 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,9 +45,7 @@  int32_t  afr_quorum_errno (afr_private_t *priv)  { -        if (priv->quorum_reads) -                return ENOTCONN; -        return EROFS; +        return ENOTCONN;  }  int @@ -1091,8 +1089,6 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,  	return ret;  } - -  int  afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque)  { @@ -1663,6 +1659,29 @@ afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,  	return ret;  } +void +afr_readables_intersect_get (inode_t *inode, xlator_t *this, int *event, +                             unsigned char *intersection) +{ +	afr_private_t *priv = NULL; +	unsigned char *data_readable = NULL; +	unsigned char *metadata_readable = NULL; +	unsigned char *intersect = NULL; + +	priv = this->private; +	data_readable = alloca0 (priv->child_count); +	metadata_readable = alloca0 (priv->child_count); +	intersect = alloca0 (priv->child_count); + +	afr_inode_read_subvol_get (inode, this, data_readable, +                                   metadata_readable, event); + +	AFR_INTERSECT (intersect, data_readable, metadata_readable, +		       priv->child_count); +        if (intersection) +                memcpy (intersection, intersect, +                        sizeof (*intersection) * priv->child_count); +}  int  afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, @@ -1671,8 +1690,6 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,                       afr_read_subvol_args_t *args)  {  	afr_private_t *priv = NULL; -	unsigned char *data_readable = NULL; -	unsigned char *metadata_readable = NULL;  	unsigned char *readable = NULL;  	unsigned char *intersection = NULL;  	int subvol = -1; @@ -1681,17 +1698,11 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,  	priv = this->private;  	readable = alloca0 (priv->child_count); -	data_readable = alloca0 (priv->child_count); -	metadata_readable = alloca0 (priv->child_count);  	intersection = alloca0 (priv->child_count);  	afr_inode_read_subvol_type_get (inode, this, readable, &event, type); -	afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, -				   &event); - -	AFR_INTERSECT (intersection, data_readable, metadata_readable, -		       priv->child_count); +        afr_readables_intersect_get (inode, this, &event, intersection);  	if (AFR_COUNT (intersection, priv->child_count) > 0)  		subvol = afr_read_subvol_select_by_policy (inode, this, @@ -2144,18 +2155,28 @@ afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,  int  afr_read_subvol_decide (inode_t *inode, xlator_t *this, -                        afr_read_subvol_args_t *args) +                        afr_read_subvol_args_t *args, unsigned char *readable)  { -        int data_subvol  = -1; -        int mdata_subvol = -1; +        int event = 0; +        afr_private_t *priv = NULL; +        unsigned char *intersection = NULL; -        data_subvol = afr_data_subvol_get (inode, this, NULL, NULL, NULL, args); -        mdata_subvol = afr_metadata_subvol_get (inode, this, -                                                NULL, NULL, NULL, args); -        if (data_subvol == -1 || mdata_subvol == -1) +        priv = this->private; +        intersection = alloca0 (priv->child_count); + +        afr_readables_intersect_get (inode, this, &event, intersection); + +        if (AFR_COUNT (intersection, priv->child_count) <= 0) { +                /* TODO: If we have one brick with valid data_readable and +                 * another with metadata_readable, try to send an iatt with +                 * valid bits from both.*/                  return -1; +        } -        return data_subvol; +        memcpy (readable, intersection, sizeof (*readable) * priv->child_count); + +        return afr_read_subvol_select_by_policy (inode, this, intersection, +                                                 args);  }  static inline int @@ -2172,7 +2193,49 @@ afr_first_up_child (call_frame_t *frame, xlator_t *this)                  if (local->replies[i].valid &&                      local->replies[i].op_ret == 0)                          return i; -        return 0; +        return -1; +} + +static void +afr_attempt_readsubvol_set (call_frame_t *frame, xlator_t *this, +                            unsigned char *success_replies, +                            unsigned char *data_readable, int *read_subvol) +{ +        afr_private_t *priv = NULL; +        afr_local_t *local = NULL; +        int spb_choice = -1; +        int child_count = -1; + +        if (*read_subvol != -1) +                return; + +        priv = this->private; +        local = frame->local; +        child_count = priv->child_count; + +        afr_inode_split_brain_choice_get (local->inode, this, +                                          &spb_choice); +        if ((spb_choice >= 0) && +            (AFR_COUNT(success_replies, child_count) == child_count)) { +                *read_subvol = spb_choice; +        } else if (!priv->quorum_count) { +                *read_subvol = afr_first_up_child (frame, this); +        } else if (priv->quorum_count && +                   afr_has_quorum (data_readable, this)) { +                /* read_subvol is guaranteed to be valid if we hit this path. */ +                *read_subvol = afr_first_up_child (frame, this); +        } else { +               /* If quorum is enabled and we do not have a +                  readable yet, it means all good copies are down. +               */ +               local->op_ret = -1; +               local->op_errno = ENOTCONN; +               gf_msg (this->name, GF_LOG_WARNING, 0, +                       AFR_MSG_READ_SUBVOL_ERROR, "no read " +                       "subvols for %s", local->loc.path); +        } +        if (*read_subvol >= 0) +                dict_del (local->replies[*read_subvol].xdata, GF_CONTENT_KEY);  }  static void @@ -2186,13 +2249,13 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)          int                 par_read_subvol = 0;          int                 ret         = -1;  	unsigned char      *readable = NULL; +        unsigned char      *success_replies = NULL;  	int                 event = 0;  	struct afr_reply   *replies = NULL;  	uuid_t              read_gfid = {0, };  	gf_boolean_t        locked_entry = _gf_false;  	gf_boolean_t        can_interpret = _gf_true;          inode_t            *parent = NULL; -        int                 spb_choice = -1;          ia_type_t           ia_type = IA_INVAL;          afr_read_subvol_args_t args = {0,};          char               *gfid_heal_msg = NULL; @@ -2206,11 +2269,12 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)                                                    this);  	readable = alloca0 (priv->child_count); +	success_replies = alloca0 (priv->child_count);  	afr_inode_read_subvol_get (parent, this, readable, NULL, &event); +        par_read_subvol = afr_get_parent_read_subvol (this, parent, replies, +                                                      readable); -        afr_inode_split_brain_choice_get (local->inode, this, -                                                &spb_choice);  	/* First, check if we have a gfid-change from somewhere,  	   If so, propagate that so that a fresh lookup can be  	   issued @@ -2218,13 +2282,17 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)          if (local->cont.lookup.needs_fresh_lookup) {                  local->op_ret = -1;                  local->op_errno = ESTALE; -                goto unwind; +                goto error;          }  	op_errno = afr_final_errno (frame->local, this->private);  	local->op_errno = op_errno;  	read_subvol = -1; +        for (i = 0; i < priv->child_count; i++) +                if (replies[i].valid && replies[i].op_ret == 0) +                        success_replies[i] = 1; +  	for (i = 0; i < priv->child_count; i++) {  		if (!replies[i].valid)  			continue; @@ -2233,9 +2301,9 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)  		    replies[i].op_errno == ENOENT) {  			/* Second, check entry is still  			   "underway" in creation */ -			local->op_ret = -1; -			local->op_errno = ENOENT; -			goto unwind; +                        local->op_ret = -1; +                        local->op_errno = ENOENT; +                        goto error;  		}  		if (replies[i].op_ret == -1) @@ -2249,8 +2317,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)  		}  	} -	if (read_subvol == -1) -		goto unwind; +        if (read_subvol == -1) +                goto error;  	/* We now have a read_subvol, which is readable[] (if there  	   were any). Next we look for GFID mismatches. We don't  	   consider a GFID mismatch as an error if read_subvol is @@ -2276,58 +2344,61 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)  		if (readable[read_subvol] && !readable[i])  			continue; +                /* If we were called from glfsheal and there is still a gfid +                 * mismatch, succeed the lookup and let glfsheal print the +                 * response via gfid-heal-msg.*/ +                if (!dict_get_str (local->xattr_req, "gfid-heal-msg", +                                   &gfid_heal_msg)) +                        goto cant_interpret; +  		/* LOG ERROR */  		local->op_ret = -1;  		local->op_errno = EIO; -		goto unwind; +		goto error;  	}  	/* Forth, for the finalized GFID, pick the best subvolume  	   to return stats from.  	*/ +        read_subvol = -1; +        memset (readable, 0, sizeof (*readable) * priv->child_count);  	if (can_interpret) { +                if (!afr_has_quorum (success_replies, this)) +                        goto cant_interpret;  		/* It is safe to call afr_replies_interpret() because we have  		   a response from all the UP subvolumes and all of them resolved  		   to the same GFID  		*/                  gf_uuid_copy (args.gfid, read_gfid);                  args.ia_type = ia_type; -		if (afr_replies_interpret (frame, this, local->inode, NULL)) { -                        read_subvol = afr_read_subvol_decide (local->inode, -                                                              this, &args); +		ret = afr_replies_interpret (frame, this, local->inode, NULL); +                read_subvol = afr_read_subvol_decide (local->inode, this, &args, +                                                      readable); +                if (read_subvol == -1) +                        goto cant_interpret; +                if (ret) {  			afr_inode_event_gen_reset (local->inode, this); -			goto cant_interpret; -		} else { -                        read_subvol = afr_data_subvol_get (local->inode, this, -                                                       NULL, NULL, NULL, &args); -		} +                        dict_del (local->replies[read_subvol].xdata, +                                  GF_CONTENT_KEY); +                }  	} else {  	cant_interpret: +                afr_attempt_readsubvol_set (frame, this, success_replies, +                                            readable, &read_subvol);                  if (read_subvol == -1) { -                        if (spb_choice >= 0) -                                read_subvol = spb_choice; -                        else -                                read_subvol = afr_first_up_child (frame, this); +                        goto error;                  } -		dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);  	}  	afr_handle_quota_size (frame, this); -unwind:          afr_set_need_heal (this, local); -        if (read_subvol == -1) { -                if (spb_choice >= 0) -                        read_subvol = spb_choice; -                else -                        read_subvol = afr_first_up_child (frame, this); - -        } -        par_read_subvol = afr_get_parent_read_subvol (this, parent, replies, -                                                      readable);          if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) { -                        local->op_ret = -1; -                        local->op_errno = ENOTCONN; +                local->op_ret = -1; +                local->op_errno = ENOTCONN; +                gf_msg_debug(this->name, 0, "Arbiter cannot be a read subvol " +                             "for %s", local->loc.path); +                goto error;          }          ret = dict_get_str (local->xattr_req, "gfid-heal-msg", &gfid_heal_msg); @@ -2347,6 +2418,11 @@ unwind:  			  local->inode, &local->replies[read_subvol].poststat,  			  local->replies[read_subvol].xdata,  			  &local->replies[par_read_subvol].postparent); +        return; + +error: +        AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, NULL, +                          NULL, NULL, NULL);  }  /* @@ -2764,55 +2840,54 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)          afr_local_t         *local = NULL;  	int                 i = -1;  	int                 op_errno = 0; -	int                 spb_choice = -1;  	int                 read_subvol = -1; +        unsigned char *data_readable = NULL; +        unsigned char *success_replies = NULL;          priv  = this->private;          local = frame->local; - -        afr_inode_split_brain_choice_get (local->inode, this, -                                          &spb_choice); +        data_readable = alloca0 (priv->child_count); +        success_replies = alloca0 (priv->child_count);  	for (i = 0; i < priv->child_count; i++) {  		if (!local->replies[i].valid)  			continue; -		if (local->replies[i].op_ret == 0) +		if (local->replies[i].op_ret == 0) { +                        success_replies[i] = 1;  			local->op_ret = 0; +                }  	}  	op_errno = afr_final_errno (frame->local, this->private);          if (local->op_ret < 0) { -		local->op_errno = op_errno; -		local->op_ret = -1; -                goto unwind; +                AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, +                                  NULL, NULL); +                return;  	} -	afr_replies_interpret (frame, this, local->inode, NULL); +        if (!afr_has_quorum (success_replies, this)) +                goto unwind; -	read_subvol = afr_read_subvol_decide (local->inode, this, NULL); -	if (read_subvol == -1) { -	        gf_msg (this->name, GF_LOG_WARNING, 0, -                        AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s", -			local->loc.path); +	afr_replies_interpret (frame, this, local->inode, NULL); -                if (spb_choice >= 0) { -                        read_subvol = spb_choice; -                } else { -                        read_subvol = afr_first_up_child (frame, this); -                } -	} +	read_subvol = afr_read_subvol_decide (local->inode, this, NULL, +                                              data_readable);  unwind: +        afr_attempt_readsubvol_set (frame, this, success_replies, data_readable, +                                    &read_subvol);  	if (read_subvol == -1) { -                if (spb_choice >= 0) -                        read_subvol = spb_choice; -                else -                        read_subvol = afr_first_up_child (frame, this); +                AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, +                                  NULL, NULL, NULL, NULL); +                return;          } +          if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) { -                        local->op_ret = -1; -                        local->op_errno = ENOTCONN; +                local->op_ret = -1; +                local->op_errno = ENOTCONN; +	        gf_msg_debug (this->name, 0, "Arbiter cannot be a read subvol " +                              "for %s", local->loc.path);          }  	AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -4672,7 +4747,6 @@ afr_priv_dump (xlator_t *this)          gf_proc_dump_write("read_child", "%d", priv->read_child);          gf_proc_dump_write("favorite_child", "%d", priv->favorite_child);          gf_proc_dump_write("wait_count", "%u", priv->wait_count); -        gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads);          gf_proc_dump_write("heal-wait-queue-length", "%d",                             priv->heal_wait_qlen);          gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters); diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 50e8040d33e..f6c491b713e 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -193,8 +193,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  	local->inode = inode_ref (inode);          local->is_read_txn = _gf_true; -        if (priv->quorum_reads && -            priv->quorum_count && !afr_has_quorum (priv->child_up, this)) { +        if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {                  local->op_ret = -1;                  local->op_errno = ENOTCONN;                  read_subvol = -1; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 84dbcc04680..b18f60fcc7c 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -278,8 +278,6 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options,                            int32, out); -        GF_OPTION_RECONF ("quorum-reads", priv->quorum_reads, options, -                          bool, out);          GF_OPTION_RECONF ("consistent-metadata", priv->consistent_metadata,                            options, bool, out); @@ -554,7 +552,6 @@ init (xlator_t *this)  	GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);          GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); -        GF_OPTION_INIT ("quorum-reads", priv->quorum_reads, bool, out);          GF_OPTION_INIT ("consistent-metadata", priv->consistent_metadata, bool,                          out);          GF_OPTION_INIT ("consistent-io", priv->consistent_io, bool, out); @@ -998,8 +995,8 @@ struct volume_options options[] = {          { .key = {"quorum-reads"},            .type = GF_OPTION_TYPE_BOOL,            .default_value = "no", -          .description = "If quorum-reads is \"true\" only allow reads if " -                         "quorum is met when quorum is enabled.", +          .description = "This option has been removed. Reads are not allowed " +                          "if quorum is not met.",          },          { .key  = {"node-uuid"},            .type = GF_OPTION_TYPE_STR, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index b05ec8f6b96..b59b9439273 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -139,7 +139,6 @@ typedef struct _afr_private {          gf_boolean_t      pre_op_compat;      /* on/off */  	uint32_t          post_op_delay_secs;          unsigned int      quorum_count; -        gf_boolean_t      quorum_reads;          char                   vol_uuid[UUID_SIZE + 1];          int32_t                *last_event;  | 
