summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Ravishankar N <ravishankar@redhat.com> 2017-09-14 11:29:15 +0530
committer: Ravishankar N <ravishankar@redhat.com> 2017-09-26 04:04:18 +0000
commit: 1719cffa911c5287715abfdb991bc8862f0c994e (patch)
tree: 3d524fab0d31fe7b3512fab1e4e6fa08b1f29ff8
parent: 898f0b7ce31ddf8ec02e572c5d22eff2e4205b4c (diff)
afr: auto-resolve split-brains for zero-byte files
Problems:
As described in BZ 1491670, renaming hardlinks can result in data/mdata split-brain of the DHT link-to files (T files) without any mismatch of data and metadata.

As described in BZ 1486063, for a zero-byte file with only dirty bits set, the arbiter brick will likely be chosen as the source brick.

Fix:
For zero-byte files in split-brain, pick the first brick as
a) data source if the file size is zero on all bricks.
b) metadata source if the metadata (type, uid, gid, permissions and xattrs) is the same on all bricks.

In the arbiter case, if the file size is zero on all bricks and there are no pending afr xattrs, pick the 1st brick as data source.

Change-Id: I0270a9a2f97c3b21087e280bb890159b43975e04
BUG: 1491670
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reported-by: Rahul Hinduja <rhinduja@redhat.com>
Reported-by: Mabi <mabi@protonmail.ch>
-rw-r--r-- tests/bugs/replicate/bug-1190069-afr-stale-index-entries.t 2
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.c 65
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-data.c 6
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal.h 7
4 files changed, 79 insertions, 1 deletions
diff --git a/tests/bugs/replicate/bug-1190069-afr-stale-index-entries.t b/tests/bugs/replicate/bug-1190069-afr-stale-index-entries.t
index 81655074c9c..fe8e992e8f8 100644
--- a/tests/bugs/replicate/bug-1190069-afr-stale-index-entries.t
+++ b/tests/bugs/replicate/bug-1190069-afr-stale-index-entries.t
@@ -28,7 +28,7 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT '1' afr_child_up_status_meta $M0 $V0-replicate-0
TEST kill_brick $V0 $H0 $B0/$V0"0"
TEST stat $M0/datafile
TEST `echo append>>$M0/datafile`
-TEST chmod +x $M0/mdatafile
+TEST chmod -x $M0/mdatafile
TEST $CLI volume start $V0 force
EXPECT_WITHIN $CHILD_UP_TIMEOUT '1' afr_child_up_status_meta $M0 $V0-replicate-0 0
TEST ! cat $M0/datafile
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 998289711df..6a159dc67d0 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1172,6 +1172,65 @@ afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame,
return fav_child;
}
+int
+afr_mark_source_sinks_if_file_empty (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ int source = -1;
+ int i = 0;
+ afr_private_t *priv = this->private;
+ struct iatt stbuf = {0, };
+
+ if ((AFR_COUNT (locked_on, priv->child_count) < priv->child_count) ||
+ (afr_success_count(replies, priv->child_count) < priv->child_count))
+ return -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].poststat.ia_size != 0)
+ return -1;
+ }
+
+ if (type == AFR_DATA_TRANSACTION)
+ goto mark;
+
+ /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/
+ stbuf = replies[0].poststat;
+ for (i = 1; i < priv->child_count; i++) {
+ if ((!IA_EQUAL (stbuf, replies[i].poststat, type)) ||
+ (!IA_EQUAL (stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL (stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL (stbuf, replies[i].poststat, prot)))
+ return -1;
+ }
+ for (i = 1; i < priv->child_count; i++) {
+ if (!afr_xattrs_are_equal (replies[0].xdata,
+ replies[i].xdata))
+ return -1;
+ }
+
+mark:
+ /* All bricks have a zero-byte file. Pick one of them as source. Rest
+ * are sinks.*/
+ for (i = 0 ; i < priv->child_count; i++) {
+ if (source == -1) {
+ source = i;
+ sources[i] = 1;
+ sinks[i] = 0;
+ healed_sinks[i] = 0;
+ continue;
+ }
+ sources[i] = 0;
+ sinks[i] = 1;
+ healed_sinks[i] = 1;
+ }
+
+ return source;
+}
+
/* Return a source depending on the type of heal_op, and set sources[source],
* sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so
* only if the following condition is met:
@@ -1200,6 +1259,12 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
priv = this->private;
xdata_req = local->xdata_req;
+ source = afr_mark_source_sinks_if_file_empty (this, sources, sinks,
+ healed_sinks, locked_on,
+ replies, type);
+ if (source >= 0)
+ return source;
+
ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
if (ret)
goto autoheal;
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 13679608dfd..2c254e80aa1 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -581,6 +581,12 @@ __afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
afr_mark_largest_file_as_source (this, sources, replies);
afr_mark_biggest_witness_as_source (this, sources, witness);
afr_mark_newest_file_as_source (this, sources, replies);
+ if (priv->arbiter_count)
+ /* Choose non-arbiter brick as source for empty files. */
+ afr_mark_source_sinks_if_file_empty (this, sources, sinks,
+ healed_sinks, locked_on,
+ replies,
+ AFR_DATA_TRANSACTION);
out:
afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index de2970e6c5a..b54da1facfd 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -341,5 +341,12 @@ afr_gfid_split_brain_source (xlator_t *this, struct afr_reply *replies,
inode_t *inode, uuid_t pargfid, const char *bname,
int src_idx, int child_idx,
unsigned char *locked_on, int *src, dict_t *xdata);
+int
+afr_mark_source_sinks_if_file_empty (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type);
#endif /* !_AFR_SELFHEAL_H */