summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libglusterfs/src/common-utils.c24
-rw-r--r--libglusterfs/src/common-utils.h11
-rw-r--r--tests/bugs/replicate/bug-1363721.t112
-rw-r--r--xlators/cluster/afr/src/afr-common.c125
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c3
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c19
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c7
-rw-r--r--xlators/cluster/afr/src/afr.h17
-rw-r--r--xlators/cluster/ec/src/ec-common.c16
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c21
-rw-r--r--xlators/cluster/ec/src/ec-helpers.h2
-rw-r--r--xlators/cluster/ec/src/ec-locks.c2
12 files changed, 308 insertions, 51 deletions
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 3529ad7f897..311ed72d04c 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -4553,3 +4553,27 @@ gfid_to_ino (uuid_t gfid)
return ino;
}
+
+int
+gf_bits_count (uint64_t n)
+{
+ int val = 0;
+#ifdef _GNU_SOURCE
+ val = __builtin_popcountll (n);
+#else
+ n -= (n >> 1) & 0x5555555555555555ULL;
+ n = ((n >> 2) & 0x3333333333333333ULL) + (n & 0x3333333333333333ULL);
+ n = (n + (n >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+ n += n >> 8;
+ n += n >> 16;
+ n += n >> 32;
+ val = n & 0xFF;
+#endif
+ return val;
+}
+
+int
+gf_bits_index (uint64_t n)
+{
+ return ffsll(n) - 1;
+}
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
index fa5d28d92b3..4bcb6f9f2e0 100644
--- a/libglusterfs/src/common-utils.h
+++ b/libglusterfs/src/common-utils.h
@@ -25,6 +25,10 @@
#include <limits.h>
#include <fnmatch.h>
+#ifndef ffsll
+#define ffsll(x) __builtin_ffsll(x)
+#endif
+
void trap (void);
#define GF_UNIVERSAL_ANSWER 42 /* :O */
@@ -849,4 +853,11 @@ is_virtual_xattr (const char *k);
const char *
gf_inode_type_to_str (ia_type_t type);
+
+int32_t
+gf_bits_count (uint64_t n);
+
+int32_t
+gf_bits_index (uint64_t n);
+
#endif /* _COMMON_UTILS_H */
diff --git a/tests/bugs/replicate/bug-1363721.t b/tests/bugs/replicate/bug-1363721.t
new file mode 100644
index 00000000000..ec39889b27e
--- /dev/null
+++ b/tests/bugs/replicate/bug-1363721.t
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+FILE_UPDATE_TIMEOUT=20
+cleanup
+
+function size_increased {
+ local file=$1
+ local size=$2
+ local new_size=$(stat -c%s $file)
+ if [ $new_size -gt $size ];
+ then
+ echo "Y"
+ else
+ echo "N"
+ fi
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.data-self-heal off
+TEST $CLI volume set $V0 cluster.metadata-self-heal off
+TEST $CLI volume set $V0 cluster.entry-self-heal off
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 --direct-io-mode=enable
+
+cd $M0
+
+# Start writing to a file.
+(dd if=/dev/urandom of=$M0/file1 bs=1k 2>/dev/null 1>/dev/null)&
+dd_pid=$!
+
+# Let IO happen
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 0
+
+# Now kill the zeroth brick
+kill_brick $V0 $H0 $B0/${V0}0
+
+# Let IO continue
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1)
+
+# Now bring the brick back up
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+
+# Let IO continue
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1)
+
+# Now kill the first brick
+kill_brick $V0 $H0 $B0/${V0}1
+
+# Let IO continue
+EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1)
+
+# Now bring the brick back up
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+# Let IO continue for 3 seconds
+sleep 3
+
+# Now kill the second brick
+kill_brick $V0 $H0 $B0/${V0}2
+
+# At this point the write should have been failed. But make sure that the second
+# brick is never an accused.
+
+md5sum_2=$(md5sum $B0/${V0}2/file1 | awk '{print $1}')
+
+EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'`
+EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'`
+
+EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.dirty data
+EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.dirty data
+
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 data
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 data
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 data
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 metadata
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 metadata
+EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 metadata
+
+# Now bring the brick back up
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
+
+# Enable shd
+TEST $CLI volume set $V0 self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+
+TEST $CLI volume heal $V0
+
+# Wait for heal to complete
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+EXPECT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'`
+EXPECT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'`
+EXPECT "$md5sum_2" echo `md5sum $B0/${V0}2/file1 | awk '{print $1}'`
+
+cd ~
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 557b2cd8891..9b2c0d7caea 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -155,6 +155,119 @@ out:
*/
int
+__afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
+ inode_t *inode)
+{
+ int i = 0;
+ int ret = -1;
+ int txn_type = 0;
+ int count = 0;
+ int index = -1;
+ uint16_t datamap_old = 0;
+ uint16_t metadatamap_old = 0;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint16_t tmp_map = 0;
+ uint16_t mask = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+
+ priv = this->private;
+ txn_type = local->transaction.type;
+
+ ret = __afr_inode_ctx_get (this, inode, &ctx);
+ if (ret < 0)
+ return ret;
+
+ val = ctx->read_subvol;
+
+ metadatamap_old = metadatamap = (val & 0x000000000000ffff);
+ datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
+ /* Hard-code event to 0 since there is a failure and the inode
+ * needs to be refreshed anyway.
+ */
+ event = 0;
+
+ if (txn_type == AFR_DATA_TRANSACTION)
+ tmp_map = datamap;
+ else if (txn_type == AFR_METADATA_TRANSACTION)
+ tmp_map = metadatamap;
+
+ count = gf_bits_count (tmp_map);
+
+ if (count == 1)
+ index = gf_bits_index (tmp_map);
+
+ for (i = 0; i < priv->child_count; i++) {
+ mask = 0;
+ if (!local->transaction.failed_subvols[i])
+ continue;
+
+ mask = 1 << i;
+ if (txn_type == AFR_METADATA_TRANSACTION)
+ metadatamap &= ~mask;
+ else if (txn_type == AFR_DATA_TRANSACTION)
+ datamap &= ~mask;
+ }
+
+ switch (txn_type) {
+ case AFR_METADATA_TRANSACTION:
+ if ((metadatamap_old != 0) && (metadatamap == 0) &&
+ (count == 1)) {
+ local->transaction.in_flight_sb_errno =
+ local->replies[index].op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ metadatamap |= (1 << index);
+ }
+ break;
+
+ case AFR_DATA_TRANSACTION:
+ if ((datamap_old != 0) && (datamap == 0) && (count == 1)) {
+ local->transaction.in_flight_sb_errno =
+ local->replies[index].op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ datamap |= (1 << index);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ val = ((uint64_t) metadatamap) |
+ (((uint64_t) datamap) << 16) |
+ (((uint64_t) event) << 32);
+
+ ctx->read_subvol = val;
+
+ return ret;
+}
+
+int
+afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, inode_t *inode)
+{
+ int ret = -1;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ /* If this transaction saw no failures, then exit. */
+ if (AFR_COUNT (local->transaction.failed_subvols,
+ priv->child_count) == 0)
+ return 0;
+
+ LOCK (&inode->lock);
+ {
+ ret = __afr_set_in_flight_sb_status (this, local, inode);
+ }
+ UNLOCK (&inode->lock);
+
+ return ret;
+}
+
+int
__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this,
unsigned char *data, unsigned char *metadata,
int *event_p)
@@ -233,12 +346,12 @@ out:
int
__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
{
- int ret = -1;
- uint16_t datamap = 0;
- uint16_t metadatamap = 0;
- uint32_t event = 0;
- uint64_t val = 0;
- afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ afr_inode_ctx_t *ctx = NULL;
ret = __afr_inode_ctx_get (this, inode, &ctx);
if (ret)
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index f3de5352d7e..286a5392da6 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -122,8 +122,7 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
continue;
if (local->replies[i].op_ret < 0) {
if (local->inode)
- afr_inode_read_subvol_reset (local->inode,
- this);
+ afr_inode_read_subvol_reset (local->inode, this);
if (local->parent)
afr_inode_read_subvol_reset (local->parent,
this);
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 323f50c9413..ddc257dbde4 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -38,13 +38,13 @@
static void
__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int read_subvol = 0;
- int i = 0;
- afr_read_subvol_args_t args = {0,};
- struct iatt *stbuf = NULL;
- int ret = 0;
+ int i = 0;
+ int ret = 0;
+ int read_subvol = 0;
+ struct iatt *stbuf = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_read_subvol_args_t args = {0,};
local = frame->local;
priv = this->private;
@@ -94,10 +94,8 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
for (i = 0; i < priv->child_count; i++) {
if (!local->replies[i].valid)
continue;
- if (local->replies[i].op_ret < 0) {
- afr_inode_read_subvol_reset (local->inode, this);
+ if (local->replies[i].op_ret < 0)
continue;
- }
/* Order of checks in the compound conditional
below is important.
@@ -134,6 +132,7 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
}
afr_txn_arbitrate_fop_cbk (frame, this);
+ afr_set_in_flight_sb_status (this, local, local->inode);
}
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 274e90d6f3c..6130ad76543 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -850,6 +850,13 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
goto out;
}
+ if (local->transaction.in_flight_sb) {
+ local->op_ret = -1;
+ local->op_errno = local->transaction.in_flight_sb_errno;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
+
xattr = dict_new ();
if (!xattr) {
local->op_ret = -1;
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 54f988e8f33..29008287e6d 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -735,10 +735,17 @@ typedef struct _afr_local {
gf_boolean_t uninherit_done;
gf_boolean_t uninherit_value;
+ gf_boolean_t in_flight_sb; /* Indicator for occurrence of
+ split-brain while in the middle of
+ a txn. */
+ int32_t in_flight_sb_errno; /* This is where the cause of the
+ failure on the last good copy of
+ the file is stored.
+ */
+
/* @changelog_resume: function to be called after changlogging
(either pre-op or post-op) is done
*/
-
afr_changelog_resume_t changelog_resume;
call_frame_t *main_frame;
@@ -871,6 +878,10 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a)
int
+afr_inode_ctx_reset_unreadable_subvol (inode_t *inode, xlator_t *this,
+ int subvol_idx, int txn_type);
+
+int
afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
uuid_t gfid, afr_inode_refresh_cbk_t cbk);
@@ -1145,4 +1156,8 @@ afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd);
int
afr_get_msg_id (char *op_type);
+
+int
+afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
+ inode_t *inode);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 175f6dfa71f..2e6759a2803 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -74,8 +74,8 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this,
gf_msg (this->name, GF_LOG_INFO, 0,
EC_MSG_HEAL_SUCCESS, "Heal succeeded on %d/%d "
"subvolumes",
- ec_bits_count(mask & ~(good | bad)),
- ec_bits_count(mask & ~good));
+ gf_bits_count(mask & ~(good | bad)),
+ gf_bits_count(mask & ~good));
}
}
@@ -333,7 +333,7 @@ void ec_complete(ec_fop_data_t * fop)
if (fop->answer == NULL) {
if (!list_empty(&fop->cbk_list)) {
cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
- healing_count = ec_bits_count (cbk->mask & fop->healing);
+ healing_count = gf_bits_count (cbk->mask & fop->healing);
/* fop shouldn't be treated as success if it is not
* successful on at least fop->minimum good copies*/
if ((cbk->count - healing_count) >= fop->minimum) {
@@ -424,7 +424,7 @@ int32_t ec_child_select(ec_fop_data_t * fop)
switch (fop->minimum)
{
case EC_MINIMUM_ALL:
- fop->minimum = ec_bits_count(fop->mask);
+ fop->minimum = gf_bits_count(fop->mask);
if (fop->minimum >= ec->fragments)
{
break;
@@ -451,7 +451,7 @@ int32_t ec_child_select(ec_fop_data_t * fop)
ec_trace("SELECT", fop, "");
- num = ec_bits_count(fop->mask);
+ num = gf_bits_count(fop->mask);
if ((num < fop->minimum) && (num < ec->fragments))
{
gf_msg (ec->xl->name, GF_LOG_ERROR, 0,
@@ -500,7 +500,7 @@ void ec_dispatch_mask(ec_fop_data_t * fop, uintptr_t mask)
ec_t * ec = fop->xl->private;
int32_t count, idx;
- count = ec_bits_count(mask);
+ count = gf_bits_count(mask);
LOCK(&fop->lock);
@@ -578,7 +578,7 @@ void ec_dispatch_inc(ec_fop_data_t * fop)
if (ec_child_select(fop))
{
- fop->expected = ec_bits_count(fop->remaining);
+ fop->expected = gf_bits_count(fop->remaining);
fop->first = 0;
ec_dispatch_next(fop, 0);
@@ -591,7 +591,7 @@ ec_dispatch_all (ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
- fop->expected = ec_bits_count(fop->remaining);
+ fop->expected = gf_bits_count(fop->remaining);
fop->first = 0;
ec_dispatch_mask(fop, fop->remaining);
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
index c8f904ac51d..7cf8232353d 100644
--- a/xlators/cluster/ec/src/ec-helpers.c
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -17,10 +17,6 @@
#include "ec-helpers.h"
#include "ec-messages.h"
-#ifndef ffsll
-#define ffsll(x) __builtin_ffsll(x)
-#endif
-
static const char * ec_fop_list[] =
{
[-EC_FOP_HEAL] = "HEAL"
@@ -96,23 +92,6 @@ void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...)
}
}
-int32_t ec_bits_count(uint64_t n)
-{
- n -= (n >> 1) & 0x5555555555555555ULL;
- n = ((n >> 2) & 0x3333333333333333ULL) + (n & 0x3333333333333333ULL);
- n = (n + (n >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
- n += n >> 8;
- n += n >> 16;
- n += n >> 32;
-
- return n & 0xFF;
-}
-
-int32_t ec_bits_index(uint64_t n)
-{
- return ffsll(n) - 1;
-}
-
int32_t ec_bits_consume(uint64_t * n)
{
uint64_t tmp;
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
index 1f39da2c09f..93d77726089 100644
--- a/xlators/cluster/ec/src/ec-helpers.h
+++ b/xlators/cluster/ec/src/ec-helpers.h
@@ -16,8 +16,6 @@
const char * ec_bin(char * str, size_t size, uint64_t value, int32_t digits);
const char * ec_fop_name(int32_t id);
void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...);
-int32_t ec_bits_count(uint64_t n);
-int32_t ec_bits_index(uint64_t n);
int32_t ec_bits_consume(uint64_t * n);
size_t ec_iov_copy_to(void * dst, struct iovec * vector, int32_t count,
off_t offset, size_t size);
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
index 0253b51bf5e..ed835f1aadc 100644
--- a/xlators/cluster/ec/src/ec-locks.c
+++ b/xlators/cluster/ec/src/ec-locks.c
@@ -52,7 +52,7 @@ int32_t ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
}
if (error == -1) {
- if (ec_bits_count(locked | notlocked) >= ec->fragments) {
+ if (gf_bits_count(locked | notlocked) >= ec->fragments) {
if (notlocked == 0) {
if (fop->answer == NULL) {
fop->answer = cbk;