summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2016-12-09 06:09:14 +0000
committerPranith Kumar Karampuri <pkarampu@redhat.com>2016-12-12 22:25:22 -0800
commit8d36a3cb838b8ff277a4b41f56cd380325cb7837 (patch)
tree56b90447aeb8368d6f98b308f13a23f29d21a530
parentc4eae872b8988ce41f03dd1e7a146ae7dcb0f963 (diff)
afr: allow I/O when favorite-child-policy is enabled
Problem: Currently, I/O on a split-brained file fails even when the favorite-child-policy is set until the self-heal is complete. Fix: If a valid 'source' is found using the set favorite-child-policy,inspect and reset the afr pending xattrs on the 'sinks' (inside appropriate locks),refresh the inode and then proceed with the read or write transaction. The resetting itself happens in the self-heal code and hence can also happen in the client side background-heal or by the shd's index-heal in addition to the txn code path explained above. When it happens in via heal, we also add checks in undo-pending to not reset the sink xattrs again. > Reviewed-on: http://review.gluster.org/15673 > Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com> > Smoke: Gluster Build System <jenkins@build.gluster.org> > Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> > NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> > CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Change-Id: Ic8c1317720cb26bd114b6fe6af4e58c73b864626 BUG: 1403121 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Reported-by: Simon Turcotte-Langevin <simon.turcotte-langevin@ubisoft.com> Reviewed-on: http://review.gluster.org/16088 Smoke: Gluster Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
-rw-r--r--tests/bugs/replicate/bug-1386188-sbrain-fav-child.t82
-rw-r--r--xlators/cluster/afr/src/afr-common.c205
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c13
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c159
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c20
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c3
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c15
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h24
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c9
9 files changed, 469 insertions, 61 deletions
diff --git a/tests/bugs/replicate/bug-1386188-sbrain-fav-child.t b/tests/bugs/replicate/bug-1386188-sbrain-fav-child.t
new file mode 100644
index 00000000000..d049d95ef9a
--- /dev/null
+++ b/tests/bugs/replicate/bug-1386188-sbrain-fav-child.t
@@ -0,0 +1,82 @@
+#!/bin/bash
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 data-self-heal off
+TEST $CLI volume set $V0 entry-self-heal off
+TEST $CLI volume set $V0 metadata-self-heal off
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST touch $M0/data.txt
+TEST touch $M0/mdata.txt
+
+#Create data and metadata split-brain
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST dd if=/dev/urandom of=$M0/data.txt bs=1024 count=1024
+TEST setfattr -n user.value -v value1 $M0/mdata.txt
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/data.txt bs=1024 count=1024
+TEST setfattr -n user.value -v value2 $M0/mdata.txt
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+## Check that the file still in split-brain,
+ ## I/O fails
+ cat $M0/data.txt > /dev/null
+ EXPECT "1" echo $?
+ ## pending xattrs blame each other.
+ brick0_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/data.txt)
+ brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/data.txt)
+ TEST [ $brick0_pending -ne "000000000000000000000000" ]
+ TEST [ $brick1_pending -ne "000000000000000000000000" ]
+
+ ## I/O fails
+ getfattr -n user.value $M0/mdata.txt
+ EXPECT "1" echo $?
+ brick0_pending=$(get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/mdata.txt)
+ brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/mdata.txt)
+ TEST [ $brick0_pending -ne "000000000000000000000000" ]
+ TEST [ $brick1_pending -ne "000000000000000000000000" ]
+
+## Let us use mtime as fav-child policy. So brick0 will be source.
+ # Set dirty (data part) on the sink brick to check if it is reset later along with the pending xattr.
+ TEST setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/${V0}1/data.txt
+ # Set dirty (metadata part) on the sink brick to check if it is reset later along with the pending xattr.
+ TEST setfattr -n trusted.afr.dirty -v 0x000000000000000100000000 $B0/${V0}1/mdata.txt
+
+ TEST $CLI volume set $V0 favorite-child-policy mtime
+
+ # Reading the file should be allowed and sink brick xattrs must be reset.
+ cat $M0/data.txt > /dev/null
+ EXPECT "0" echo $?
+ TEST brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/data.txt)
+ TEST brick1_dirty=$(get_hex_xattr trusted.afr.dirty $B0/${V0}1/data.txt)
+ TEST [ $brick1_dirty -eq "000000000000000000000000" ]
+ TEST [ $brick1_pending -eq "000000000000000000000000" ]
+
+ # Accessing the file should be allowed and sink brick xattrs must be reset.
+ EXPECT "value2" echo $(getfattr --only-values -n user.value $M0/mdata.txt)
+ TEST brick1_pending=$(get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/data.txt)
+ TEST brick1_dirty=$(get_hex_xattr trusted.afr.dirty $B0/${V0}1/data.txt)
+ TEST [ $brick1_dirty -eq "000000000000000000000000" ]
+ TEST [ $brick1_pending -eq "000000000000000000000000" ]
+
+#Enable shd and heal the file.
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+EXPECT 0 get_pending_heal_count $V0
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 176dcbfa13b..ac1da31dc76 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -53,6 +53,13 @@ afr_quorum_errno (afr_private_t *priv)
return EROFS;
}
+int
+afr_fav_child_reset_sink_xattrs (void *opaque);
+
+int
+afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *frame,
+ void *opaque);
+
gf_boolean_t
afr_is_consistent_io_possible (afr_local_t *local, afr_private_t *priv,
int32_t *op_errno)
@@ -1011,6 +1018,82 @@ afr_selfheal_enabled (xlator_t *this)
return data || priv->metadata_self_heal || priv->entry_self_heal;
}
+
+int
+afr_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
+{
+
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int event_generation = 0;
+ int read_subvol = -1;
+ int op_errno = ENOMEM;
+ int ret = 0;
+
+ local = frame->local;
+ inode = local->inode;
+ priv = this->private;
+
+ if (err)
+ goto refresh_done;
+
+ if (local->op == GF_FOP_LOOKUP)
+ goto refresh_done;
+
+ ret = afr_inode_get_readable (frame, inode, this, local->readable,
+ &event_generation,
+ local->transaction.type);
+
+ if (ret == -EIO || !event_generation) {
+ /* No readable subvolume even after refresh ==> splitbrain.*/
+ if (!priv->fav_child_policy) {
+ err = -EIO;
+ goto refresh_done;
+ }
+ read_subvol = afr_sh_get_fav_by_policy (this, local->replies,
+ inode, NULL);
+ if (read_subvol == -1) {
+ err = -EIO;
+ goto refresh_done;
+ }
+
+ heal_frame = copy_frame (frame);
+ if (!heal_frame) {
+ err = -EIO;
+ goto refresh_done;
+ }
+ heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+ heal_local = AFR_FRAME_INIT (heal_frame, op_errno);
+ if (!heal_local) {
+ err = -EIO;
+ AFR_STACK_DESTROY (heal_frame);
+ goto refresh_done;
+ }
+ heal_local->xdata_req = dict_new();
+ if (!heal_local->xdata_req) {
+ err = -EIO;
+ AFR_STACK_DESTROY (heal_frame);
+ goto refresh_done;
+ }
+ heal_local->heal_frame = frame;
+ ret = synctask_new (this->ctx->env,
+ afr_fav_child_reset_sink_xattrs,
+ afr_fav_child_reset_sink_xattrs_cbk,
+ heal_frame,
+ heal_frame);
+ return 0;
+ }
+
+refresh_done:
+ afr_local_replies_wipe (local, this->private);
+ local->refreshfn (frame, this, err);
+
+ return 0;
+}
+
int
afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
{
@@ -1029,8 +1112,6 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
err = afr_inode_refresh_err (frame, this);
- afr_local_replies_wipe (local, this->private);
-
if (ret && afr_selfheal_enabled (this) && start_heal) {
heal_frame = copy_frame (frame);
if (!heal_frame)
@@ -1050,7 +1131,7 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
}
refresh_done:
- local->refreshfn (frame, this, err);
+ afr_txn_refresh_done (frame, this, err);
return 0;
}
@@ -5132,6 +5213,7 @@ afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this,
unsigned char *sources = NULL;
unsigned char *sinks = NULL;
unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
struct afr_reply *locked_replies = NULL;
afr_private_t *priv = this->private;
@@ -5140,6 +5222,7 @@ afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this,
sources = alloca0 (priv->child_count);
sinks = alloca0 (priv->child_count);
healed_sinks = alloca0 (priv->child_count);
+ undid_pending = alloca0 (priv->child_count);
locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
@@ -5156,6 +5239,7 @@ afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this,
ret = __afr_selfheal_metadata_prepare (frame, this, inode,
locked_on, sources,
sinks, healed_sinks,
+ undid_pending,
locked_replies,
pending);
*msh = afr_decide_heal_info (priv, sources, ret);
@@ -5179,6 +5263,7 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
unsigned char *sources = NULL;
unsigned char *sinks = NULL;
unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
afr_private_t *priv = NULL;
fd_t *fd = NULL;
struct afr_reply *locked_replies = NULL;
@@ -5192,6 +5277,7 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
sources = alloca0 (priv->child_count);
sinks = alloca0 (priv->child_count);
healed_sinks = alloca0 (priv->child_count);
+ undid_pending = alloca0 (priv->child_count);
/* Heal-info does an open() on the file being examined so that the
* current eager-lock holding client, if present, at some point sees
@@ -5231,6 +5317,7 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
ret = __afr_selfheal_data_prepare (frame, this, inode,
data_lock, sources,
sinks, healed_sinks,
+ undid_pending,
locked_replies,
pflag);
*dsh = afr_decide_heal_info (priv, sources, ret);
@@ -5818,3 +5905,115 @@ afr_compound_cleanup (compound_args_t *args, dict_t *xdata,
if (newloc_xdata)
dict_unref (newloc_xdata);
}
+
+int
+afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *heal_frame,
+ void *opaque)
+{
+
+ call_frame_t *txn_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ xlator_t *this = NULL;
+
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ local = txn_frame->local;
+ this = txn_frame->this;
+
+ /* Refresh the inode agan and proceed with the transaction.*/
+ afr_inode_refresh (txn_frame, this, local->inode, NULL,
+ local->refreshfn);
+
+ if (heal_frame)
+ AFR_STACK_DESTROY (heal_frame);
+
+ return 0;
+}
+
+int
+afr_fav_child_reset_sink_xattrs (void *opaque)
+{
+ call_frame_t *heal_frame = NULL;
+ call_frame_t *txn_frame = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t d_spb = _gf_false;
+ gf_boolean_t m_spb = _gf_false;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *txn_local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ unsigned char *locked_on = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int ret = 0;
+
+ heal_frame = (call_frame_t *) opaque;
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ txn_local = txn_frame->local;
+ this = txn_frame->this;
+ inode = txn_local->inode;
+ priv = this->private;
+ locked_on = alloca0 (priv->child_count);
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ undid_pending = alloca0 (priv->child_count);
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = _afr_is_split_brain (txn_frame, this, txn_local->replies,
+ AFR_DATA_TRANSACTION, &d_spb);
+
+ ret = _afr_is_split_brain (txn_frame, this, txn_local->replies,
+ AFR_METADATA_TRANSACTION, &m_spb);
+
+ /* Take appropriate locks and reset sink xattrs. */
+ if (d_spb) {
+ ret = afr_selfheal_inodelk (heal_frame, this, inode, this->name,
+ 0, 0, locked_on);
+ {
+ if (ret < AFR_SH_MIN_PARTICIPANTS)
+ goto data_unlock;
+ ret = __afr_selfheal_data_prepare (heal_frame, this,
+ inode, locked_on,
+ sources, sinks,
+ healed_sinks,
+ undid_pending,
+ locked_replies,
+ NULL);
+ }
+data_unlock:
+ afr_selfheal_uninodelk (heal_frame, this, inode, this->name,
+ 0, 0, locked_on);
+ }
+
+ if (m_spb) {
+ memset (locked_on, 0, sizeof (*locked_on) * priv->child_count);
+ memset (undid_pending, 0,
+ sizeof (*undid_pending) * priv->child_count);
+ ret = afr_selfheal_inodelk (heal_frame, this, inode, this->name,
+ LLONG_MAX-1, 0, locked_on);
+ {
+ if (ret < AFR_SH_MIN_PARTICIPANTS)
+ goto mdata_unlock;
+ ret = __afr_selfheal_metadata_prepare (heal_frame, this,
+ inode, locked_on,
+ sources, sinks,
+ healed_sinks,
+ undid_pending,
+ locked_replies,
+ NULL);
+
+ }
+mdata_unlock:
+ afr_selfheal_uninodelk (heal_frame, this, inode, this->name,
+ LLONG_MAX-1, 0, locked_on);
+ }
+
+ return ret;
+
+}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index cb81af42510..26b0f1c2a11 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -64,7 +64,6 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
{
afr_local_t *local = NULL;
int read_subvol = 0;
- int event_generation = 0;
inode_t *inode = NULL;
int ret = -1;
int spb_choice = -1;
@@ -76,18 +75,12 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
local->op_errno = -err;
local->op_ret = -1;
read_subvol = -1;
+ gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,
+ "Failing %s on gfid %s: split-brain observed.",
+ gf_fop_list[local->op], uuid_utoa (inode->gfid));
goto readfn;
}
- ret = afr_inode_get_readable (frame, inode, this, local->readable,
- &event_generation,
- local->transaction.type);
-
- if (ret == -EIO || !event_generation)
- /* Even after refresh, we don't have a good
- read subvolume. Time to bail */
- AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, EIO, -1, readfn);
-
read_subvol = afr_read_subvol_select_by_policy (inode, this,
local->readable, NULL);
if (read_subvol == -1)
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 858880860e6..17e15d760c6 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -145,8 +145,10 @@ err:
int
afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks, afr_transaction_type type,
- struct afr_reply *replies, unsigned char *locked_on)
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -214,6 +216,10 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
and inspected on.
*/
continue;
+ if (undid_pending[i])
+ /* We already unset the pending xattrs in
+ * _afr_fav_child_reset_sink_xattrs(). */
+ continue;
xattr = afr_selfheal_output_xattr (this, local->need_full_crawl,
type, output_dirty,
@@ -735,6 +741,42 @@ afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode)
return fav_child;
}
+int
+afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+
+ priv = this->private;
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ fav_child = afr_sh_fav_by_size (this, replies, inode);
+ if (policy_str && fav_child >= 0)
+ *policy_str = "SIZE";
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ fav_child = afr_sh_fav_by_ctime (this, replies, inode);
+ if (policy_str && fav_child >= 0)
+ *policy_str = "CTIME";
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ fav_child = afr_sh_fav_by_mtime (this, replies, inode);
+ if (policy_str && fav_child >= 0)
+ *policy_str = "MTIME";
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ fav_child = afr_sh_fav_by_majority (this, replies, inode);
+ if (policy_str && fav_child >= 0)
+ *policy_str = "MAJORITY";
+ break;
+ case AFR_FAV_CHILD_NONE:
+ default:
+ break;
+ }
+
+ return fav_child;
+}
int
afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
@@ -756,24 +798,9 @@ afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
time_t time;
priv = this->private;
- if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MAJORITY) {
- fav_child = afr_sh_fav_by_majority (this, replies, inode);
- if (fav_child >= 0)
- policy_str = "MAJORITY";
- } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_MTIME) {
- fav_child = afr_sh_fav_by_mtime (this, replies, inode);
- if (fav_child >= 0)
- policy_str = "MTIME";
- } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_CTIME) {
- fav_child = afr_sh_fav_by_ctime (this, replies, inode);
- if (fav_child >= 0)
- policy_str = "CTIME";
- } else if (priv->fav_child_policy == AFR_FAV_CHILD_BY_SIZE) {
- fav_child = afr_sh_fav_by_size (this, replies, inode);
- if (fav_child >= 0)
- policy_str = "SIZE";
- }
+ fav_child = afr_sh_get_fav_by_policy (this, replies, inode,
+ &policy_str);
if (fav_child > priv->child_count - 1) {
gf_msg (this->name, GF_LOG_ERROR, 0,
AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Invalid child (%d) "
@@ -829,6 +856,7 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
dict_t *xdata_req = NULL;
int heal_op = -1;
int ret = -1;
+ int source = -1;
local = frame->local;
priv = this->private;
@@ -838,27 +866,96 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
if (ret)
goto autoheal;
- ret = afr_mark_split_brain_source_sinks_by_heal_op (frame, this,
+ source = afr_mark_split_brain_source_sinks_by_heal_op (frame, this,
sources, sinks,
healed_sinks,
locked_on, replies,
type, heal_op);
- return ret;
+ return source;
autoheal:
/* Automatically heal if fav_child_policy is set. */
if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
- ret = afr_mark_split_brain_source_sinks_by_policy (frame, this,
- inode,
- sources,
- sinks,
+ source = afr_mark_split_brain_source_sinks_by_policy (frame,
+ this,
+ inode,
+ sources,
+ sinks,
healed_sinks,
- locked_on,
- replies,
- type);
+ locked_on,
+ replies,
+ type);
+ if (source != -1) {
+ ret = dict_set_int32 (xdata_req, "fav-child-policy", 1);
+ if (ret)
+ return -1;
+ }
}
- return ret;
+ return source;
+}
+
+int
+_afr_fav_child_reset_sink_xattrs (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (!dict_get (local->xdata_req, "fav-child-policy"))
+ return 0;
+
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
+
+ input_dirty = alloca0 (priv->child_count * sizeof (int));
+ input_matrix = ALLOC_MATRIX (priv->child_count, int);
+ output_dirty = alloca0 (priv->child_count * sizeof (int));
+ output_matrix = ALLOC_MATRIX (priv->child_count, int);
+
+ afr_selfheal_extract_xattr (this, replies, type, input_dirty,
+ input_matrix);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
+ output_dirty[i] = -input_dirty[i];
+ output_matrix[i][source] = -input_matrix[i][source];
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] || !locked_on[i])
+ continue;
+ xattr = afr_selfheal_output_xattr (this, _gf_false, type,
+ output_dirty, output_matrix,
+ i, NULL);
+
+ afr_selfheal_post_op (frame, this, inode, i, xattr, xdata);
+
+ undid_pending[i] = 1;
+ dict_unref (xattr);
+ }
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
}
gf_boolean_t
@@ -1906,11 +2003,15 @@ afr_selfheal (xlator_t *this, uuid_t gfid)
{
int ret = -1;
call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
frame = afr_frame_create (this);
if (!frame)
return ret;
+ local = frame->local;
+ local->xdata_req = dict_new();
+
ret = afr_selfheal_do (frame, this, gfid);
if (frame)
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index fbbbd192323..d032284926c 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -580,6 +580,7 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *locked_on,
+ unsigned char *undid_pending,
struct afr_reply *replies,
uint64_t *witness)
{
@@ -603,6 +604,11 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
"file=%s", this->name, uuid_utoa(inode->gfid));
return -EIO;
}
+
+ _afr_fav_child_reset_sink_xattrs (frame, this, inode, source,
+ healed_sinks, undid_pending,
+ AFR_DATA_TRANSACTION,
+ locked_on, replies);
return source;
}
@@ -642,6 +648,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
inode_t *inode, unsigned char *locked_on,
unsigned char *sources, unsigned char *sinks,
unsigned char *healed_sinks,
+ unsigned char *undid_pending,
struct afr_reply *replies, gf_boolean_t *pflag)
{
int ret = -1;
@@ -677,8 +684,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
source = __afr_selfheal_data_finalize_source (frame, this, inode,
sources, sinks,
healed_sinks,
- locked_on, replies,
- witness);
+ locked_on, undid_pending,
+ replies, witness);
if (source < 0)
return -EIO;
@@ -696,6 +703,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
unsigned char *sinks = NULL;
unsigned char *data_lock = NULL;
unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
struct afr_reply *locked_replies = NULL;
int source = -1;
gf_boolean_t did_sh = _gf_true;
@@ -707,6 +715,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
sinks = alloca0 (priv->child_count);
healed_sinks = alloca0 (priv->child_count);
data_lock = alloca0 (priv->child_count);
+ undid_pending = alloca0 (priv->child_count);
locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
@@ -726,9 +735,8 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
ret = __afr_selfheal_data_prepare (frame, this, fd->inode,
data_lock, sources, sinks,
- healed_sinks,
- locked_replies,
- NULL);
+ healed_sinks, undid_pending,
+ locked_replies, NULL);
if (ret < 0)
goto unlock;
@@ -787,7 +795,7 @@ restore_time:
}
ret = afr_selfheal_undo_pending (frame, this, fd->inode,
sources, sinks, healed_sinks,
- AFR_DATA_TRANSACTION,
+ undid_pending, AFR_DATA_TRANSACTION,
locked_replies, data_lock);
skip_undo_pending:
afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index a0e361ab987..d8fe5422372 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -913,6 +913,7 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
unsigned char *data_lock = NULL;
unsigned char *postop_lock = NULL;
unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
struct afr_reply *locked_replies = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -924,6 +925,7 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
sources = alloca0 (priv->child_count);
sinks = alloca0 (priv->child_count);
healed_sinks = alloca0 (priv->child_count);
+ undid_pending = alloca0 (priv->child_count);
data_lock = alloca0 (priv->child_count);
postop_lock = alloca0 (priv->child_count);
@@ -996,6 +998,7 @@ unlock:
ret = afr_selfheal_undo_pending (frame, this, fd->inode,
sources, sinks, healed_sinks,
+ undid_pending,
AFR_ENTRY_TRANSACTION,
locked_replies, postop_lock);
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 9dfe4a14e8c..5839ddc2e0f 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -203,6 +203,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
unsigned char *sources,
unsigned char *sinks,
unsigned char *healed_sinks,
+ unsigned char *undid_pending,
unsigned char *locked_on,
struct afr_reply *replies)
{
@@ -224,8 +225,14 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
healed_sinks,
locked_on, replies,
AFR_METADATA_TRANSACTION);
- if (source >= 0)
+ if (source >= 0) {
+ _afr_fav_child_reset_sink_xattrs (frame, this, inode,
+ source, healed_sinks,
+ undid_pending,
+ AFR_METADATA_TRANSACTION,
+ locked_on, replies);
return source;
+ }
/* If this is a directory mtime/ctime only split brain
use the most recent */
@@ -308,6 +315,7 @@ int
__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode,
unsigned char *locked_on, unsigned char *sources,
unsigned char *sinks, unsigned char *healed_sinks,
+ unsigned char *undid_pending,
struct afr_reply *replies, gf_boolean_t *pflag)
{
int ret = -1;
@@ -362,6 +370,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
source = __afr_selfheal_metadata_finalize_source (frame, this, inode,
sources, sinks,
healed_sinks,
+ undid_pending,
locked_on, replies);
if (source < 0)
@@ -379,6 +388,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
unsigned char *sinks = NULL;
unsigned char *data_lock = NULL;
unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
struct afr_reply *locked_replies = NULL;
gf_boolean_t did_sh = _gf_true;
int source = -1;
@@ -388,6 +398,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
sources = alloca0 (priv->child_count);
sinks = alloca0 (priv->child_count);
healed_sinks = alloca0 (priv->child_count);
+ undid_pending = alloca0 (priv->child_count);
data_lock = alloca0 (priv->child_count);
locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
@@ -403,6 +414,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = __afr_selfheal_metadata_prepare (frame, this, inode,
data_lock, sources,
sinks, healed_sinks,
+ undid_pending,
locked_replies, NULL);
if (ret < 0)
goto unlock;
@@ -421,6 +433,7 @@ afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
ret = afr_selfheal_undo_pending (frame, this, inode, sources,
sinks, healed_sinks,
+ undid_pending,
AFR_METADATA_TRANSACTION,
locked_replies, data_lock);
}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 1705c967f60..500227abe24 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -167,8 +167,10 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
int
afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks, afr_transaction_type type,
- struct afr_reply *replies, unsigned char *locked_on);
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on);
int
afr_selfheal_recreate_entry (xlator_t *this, int dst, int source, inode_t *dir,
@@ -229,6 +231,19 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
afr_transaction_type type);
int
+afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str);
+
+int
+_afr_fav_child_reset_sink_xattrs (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies);
+
+int
afr_get_child_index_from_name (xlator_t *this, char *name);
gf_boolean_t
@@ -239,8 +254,8 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
inode_t *inode, unsigned char *locked_on,
unsigned char *sources,
unsigned char *sinks, unsigned char *healed_sinks,
- struct afr_reply *replies,
- gf_boolean_t *flag);
+ unsigned char *undid_pending,
+ struct afr_reply *replies, gf_boolean_t *flag);
int
__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this,
@@ -248,6 +263,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this,
unsigned char *sources,
unsigned char *sinks,
unsigned char *healed_sinks,
+ unsigned char *undid_pending,
struct afr_reply *replies,
gf_boolean_t *flag);
int
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index a8979cc6184..bf4ea944e23 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -2581,19 +2581,12 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
if (err) {
local->op_errno = -err;
local->op_ret = -1;
- goto fail;
- }
- ret = afr_inode_get_readable (frame, local->inode, this,
- local->readable, NULL,
- local->transaction.type);
- if (ret < 0) {
gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_SPLIT_BRAIN,
"Failing %s on gfid %s: split-brain observed.",
gf_fop_list[local->op], uuid_utoa (local->inode->gfid));
- local->op_ret = -1;
- local->op_errno = -ret;
goto fail;
}
+
afr_transaction_start (frame, this);
return 0;
fail: