From fcb5b70b1099d0379b40c81f35750df8bb9545a5 Mon Sep 17 00:00:00 2001 From: Krutika Dhananjay Date: Thu, 28 Jul 2016 21:29:59 +0530 Subject: cluster/afr: Prevent split-brain when bricks are brought off and on in cyclic order When the bricks are brought offline and then online in cyclic order while writes are in progress on a file, thanks to inode refresh in write txns, AFR will mostly fail the write attempt when the only good copy is offline. However, there is still a remote possibility that the file will run into split-brain if the brick that has the lone good copy goes offline *after* the inode refresh but *before* the write txn completes (I call it in-flight split-brain in the patch for ease of reference), requiring intervention from admin to resolve the split-brain before the IO can resume normally on the file. To get around this, the patch does the following things: i) retains the dirty xattrs on the file ii) avoids marking the last of the good copies as bad (or accused) in case it is the one to go down during the course of a write. iii) fails that particular write with the appropriate errno. This way, we still have one good copy left despite the split-brain situation which when it is back online, will be chosen as source to do the heal. Change-Id: I9ca634b026ac830b172bac076437cc3bf1ae7d8a BUG: 1363721 Signed-off-by: Krutika Dhananjay Reviewed-on: http://review.gluster.org/15080 Tested-by: Pranith Kumar Karampuri Smoke: Gluster Build System CentOS-regression: Gluster Build System Reviewed-by: Ravishankar N Reviewed-by: Oleksandr Natalenko NetBSD-regression: NetBSD Build System Reviewed-by: Pranith Kumar Karampuri --- libglusterfs/src/common-utils.c | 24 ++++++ libglusterfs/src/common-utils.h | 11 +++ tests/bugs/replicate/bug-1363721.t | 112 ++++++++++++++++++++++++++ xlators/cluster/afr/src/afr-common.c | 125 ++++++++++++++++++++++++++++-- xlators/cluster/afr/src/afr-dir-write.c | 3 +- xlators/cluster/afr/src/afr-inode-write.c | 19 +++-- xlators/cluster/afr/src/afr-transaction.c | 7 ++ xlators/cluster/afr/src/afr.h | 17 +++- xlators/cluster/ec/src/ec-common.c | 16 ++-- xlators/cluster/ec/src/ec-helpers.c | 21 ----- xlators/cluster/ec/src/ec-helpers.h | 2 - xlators/cluster/ec/src/ec-locks.c | 2 +- 12 files changed, 308 insertions(+), 51 deletions(-) create mode 100644 tests/bugs/replicate/bug-1363721.t diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c index 3529ad7f897..311ed72d04c 100644 --- a/libglusterfs/src/common-utils.c +++ b/libglusterfs/src/common-utils.c @@ -4553,3 +4553,27 @@ gfid_to_ino (uuid_t gfid) return ino; } + +int +gf_bits_count (uint64_t n) +{ + int val = 0; +#ifdef _GNU_SOURCE + val = __builtin_popcountll (n); +#else + n -= (n >> 1) & 0x5555555555555555ULL; + n = ((n >> 2) & 0x3333333333333333ULL) + (n & 0x3333333333333333ULL); + n = (n + (n >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + n += n >> 8; + n += n >> 16; + n += n >> 32; + val = n & 0xFF; +#endif + return val; +} + +int +gf_bits_index (uint64_t n) +{ + return ffsll(n) - 1; +} diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h index fa5d28d92b3..4bcb6f9f2e0 100644 --- a/libglusterfs/src/common-utils.h +++ b/libglusterfs/src/common-utils.h @@ -25,6 +25,10 @@ #include #include +#ifndef ffsll +#define ffsll(x) __builtin_ffsll(x) +#endif + void trap (void); #define GF_UNIVERSAL_ANSWER 42 /* :O */ @@ -849,4 +853,11 @@ is_virtual_xattr (const char *k); const char * gf_inode_type_to_str (ia_type_t type); + +int32_t +gf_bits_count (uint64_t n); + +int32_t +gf_bits_index (uint64_t n); + #endif /* _COMMON_UTILS_H */ diff --git a/tests/bugs/replicate/bug-1363721.t b/tests/bugs/replicate/bug-1363721.t new file mode 100644 index 00000000000..ec39889b27e --- /dev/null +++ b/tests/bugs/replicate/bug-1363721.t @@ -0,0 +1,112 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +FILE_UPDATE_TIMEOUT=20 +cleanup + +function size_increased { + local file=$1 + local size=$2 + local new_size=$(stat -c%s $file) + if [ $new_size -gt $size ]; + then + echo "Y" + else + echo "N" + fi +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +TEST $CLI volume set $V0 cluster.entry-self-heal off +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 --direct-io-mode=enable + +cd $M0 + +# Start writing to a file. +(dd if=/dev/urandom of=$M0/file1 bs=1k 2>/dev/null 1>/dev/null)& +dd_pid=$! + +# Let IO happen +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 0 + +# Now kill the zeroth brick +kill_brick $V0 $H0 $B0/${V0}0 + +# Let IO continue +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1) + +# Now bring the brick back up +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 + +# Let IO continue +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1) + +# Now kill the first brick +kill_brick $V0 $H0 $B0/${V0}1 + +# Let IO continue +EXPECT_WITHIN $FILE_UPDATE_TIMEOUT "Y" size_increased file1 $(stat -c%s file1) + +# Now bring the brick back up +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +# Let IO continue for 3 seconds +sleep 3 + +# Now kill the second brick +kill_brick $V0 $H0 $B0/${V0}2 + +# At this point the write should have been failed. But make sure that the second +# brick is never an accused. + +md5sum_2=$(md5sum $B0/${V0}2/file1 | awk '{print $1}') + +EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'` +EXPECT_NOT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'` + +EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.dirty data +EXPECT_NOT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.dirty data + +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 data +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 data +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 data +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}0/file1 trusted.afr.$V0-client-2 metadata +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}1/file1 trusted.afr.$V0-client-2 metadata +EXPECT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file1 trusted.afr.$V0-client-2 metadata + +# Now bring the brick back up +TEST $CLI volume start $V0 force +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2 + +# Enable shd +TEST $CLI volume set $V0 self-heal-daemon on +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 + +TEST $CLI volume heal $V0 + +# Wait for heal to complete +EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 + +EXPECT "$md5sum_2" echo `md5sum $B0/${V0}0/file1 | awk '{print $1}'` +EXPECT "$md5sum_2" echo `md5sum $B0/${V0}1/file1 | awk '{print $1}'` +EXPECT "$md5sum_2" echo `md5sum $B0/${V0}2/file1 | awk '{print $1}'` + +cd ~ + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +cleanup diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 557b2cd8891..9b2c0d7caea 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -154,6 +154,119 @@ out: * to be called to recalculate the bitmaps. */ +int +__afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, + inode_t *inode) +{ + int i = 0; + int ret = -1; + int txn_type = 0; + int count = 0; + int index = -1; + uint16_t datamap_old = 0; + uint16_t metadatamap_old = 0; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint16_t tmp_map = 0; + uint16_t mask = 0; + uint32_t event = 0; + uint64_t val = 0; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + + priv = this->private; + txn_type = local->transaction.type; + + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret < 0) + return ret; + + val = ctx->read_subvol; + + metadatamap_old = metadatamap = (val & 0x000000000000ffff); + datamap_old = datamap = (val & 0x00000000ffff0000) >> 16; + /* Hard-code event to 0 since there is a failure and the inode + * needs to be refreshed anyway. + */ + event = 0; + + if (txn_type == AFR_DATA_TRANSACTION) + tmp_map = datamap; + else if (txn_type == AFR_METADATA_TRANSACTION) + tmp_map = metadatamap; + + count = gf_bits_count (tmp_map); + + if (count == 1) + index = gf_bits_index (tmp_map); + + for (i = 0; i < priv->child_count; i++) { + mask = 0; + if (!local->transaction.failed_subvols[i]) + continue; + + mask = 1 << i; + if (txn_type == AFR_METADATA_TRANSACTION) + metadatamap &= ~mask; + else if (txn_type == AFR_DATA_TRANSACTION) + datamap &= ~mask; + } + + switch (txn_type) { + case AFR_METADATA_TRANSACTION: + if ((metadatamap_old != 0) && (metadatamap == 0) && + (count == 1)) { + local->transaction.in_flight_sb_errno = + local->replies[index].op_errno; + local->transaction.in_flight_sb = _gf_true; + metadatamap |= (1 << index); + } + break; + + case AFR_DATA_TRANSACTION: + if ((datamap_old != 0) && (datamap == 0) && (count == 1)) { + local->transaction.in_flight_sb_errno = + local->replies[index].op_errno; + local->transaction.in_flight_sb = _gf_true; + datamap |= (1 << index); + } + break; + + default: + break; + } + + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); + + ctx->read_subvol = val; + + return ret; +} + +int +afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, inode_t *inode) +{ + int ret = -1; + afr_private_t *priv = NULL; + + priv = this->private; + + /* If this transaction saw no failures, then exit. */ + if (AFR_COUNT (local->transaction.failed_subvols, + priv->child_count) == 0) + return 0; + + LOCK (&inode->lock); + { + ret = __afr_set_in_flight_sb_status (this, local, inode); + } + UNLOCK (&inode->lock); + + return ret; +} + int __afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, @@ -233,12 +346,12 @@ out: int __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) { - int ret = -1; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint32_t event = 0; - uint64_t val = 0; - afr_inode_ctx_t *ctx = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + afr_inode_ctx_t *ctx = NULL; ret = __afr_inode_ctx_get (this, inode, &ctx); if (ret) diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index f3de5352d7e..286a5392da6 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -122,8 +122,7 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) continue; if (local->replies[i].op_ret < 0) { if (local->inode) - afr_inode_read_subvol_reset (local->inode, - this); + afr_inode_read_subvol_reset (local->inode, this); if (local->parent) afr_inode_read_subvol_reset (local->parent, this); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 323f50c9413..ddc257dbde4 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -38,13 +38,13 @@ static void __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int read_subvol = 0; - int i = 0; - afr_read_subvol_args_t args = {0,}; - struct iatt *stbuf = NULL; - int ret = 0; + int i = 0; + int ret = 0; + int read_subvol = 0; + struct iatt *stbuf = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_read_subvol_args_t args = {0,}; local = frame->local; priv = this->private; @@ -94,10 +94,8 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (!local->replies[i].valid) continue; - if (local->replies[i].op_ret < 0) { - afr_inode_read_subvol_reset (local->inode, this); + if (local->replies[i].op_ret < 0) continue; - } /* Order of checks in the compound conditional below is important. @@ -134,6 +132,7 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) } afr_txn_arbitrate_fop_cbk (frame, this); + afr_set_in_flight_sb_status (this, local, local->inode); } diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 274e90d6f3c..6130ad76543 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -850,6 +850,13 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) goto out; } + if (local->transaction.in_flight_sb) { + local->op_ret = -1; + local->op_errno = local->transaction.in_flight_sb_errno; + afr_changelog_post_op_done (frame, this); + goto out; + } + xattr = dict_new (); if (!xattr) { local->op_ret = -1; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 54f988e8f33..29008287e6d 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -735,10 +735,17 @@ typedef struct _afr_local { gf_boolean_t uninherit_done; gf_boolean_t uninherit_value; + gf_boolean_t in_flight_sb; /* Indicator for occurrence of + split-brain while in the middle of + a txn. */ + int32_t in_flight_sb_errno; /* This is where the cause of the + failure on the last good copy of + the file is stored. + */ + /* @changelog_resume: function to be called after changlogging (either pre-op or post-op) is done */ - afr_changelog_resume_t changelog_resume; call_frame_t *main_frame; @@ -870,6 +877,10 @@ afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, #define afr_metadata_subvol_get(i, t, s, r, e, a) \ afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a) +int +afr_inode_ctx_reset_unreadable_subvol (inode_t *inode, xlator_t *this, + int subvol_idx, int txn_type); + int afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, uuid_t gfid, afr_inode_refresh_cbk_t cbk); @@ -1145,4 +1156,8 @@ afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd); int afr_get_msg_id (char *op_type); + +int +afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, + inode_t *inode); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 175f6dfa71f..2e6759a2803 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -74,8 +74,8 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this, gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_HEAL_SUCCESS, "Heal succeeded on %d/%d " "subvolumes", - ec_bits_count(mask & ~(good | bad)), - ec_bits_count(mask & ~good)); + gf_bits_count(mask & ~(good | bad)), + gf_bits_count(mask & ~good)); } } @@ -333,7 +333,7 @@ void ec_complete(ec_fop_data_t * fop) if (fop->answer == NULL) { if (!list_empty(&fop->cbk_list)) { cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); - healing_count = ec_bits_count (cbk->mask & fop->healing); + healing_count = gf_bits_count (cbk->mask & fop->healing); /* fop shouldn't be treated as success if it is not * successful on at least fop->minimum good copies*/ if ((cbk->count - healing_count) >= fop->minimum) { @@ -424,7 +424,7 @@ int32_t ec_child_select(ec_fop_data_t * fop) switch (fop->minimum) { case EC_MINIMUM_ALL: - fop->minimum = ec_bits_count(fop->mask); + fop->minimum = gf_bits_count(fop->mask); if (fop->minimum >= ec->fragments) { break; @@ -451,7 +451,7 @@ int32_t ec_child_select(ec_fop_data_t * fop) ec_trace("SELECT", fop, ""); - num = ec_bits_count(fop->mask); + num = gf_bits_count(fop->mask); if ((num < fop->minimum) && (num < ec->fragments)) { gf_msg (ec->xl->name, GF_LOG_ERROR, 0, @@ -500,7 +500,7 @@ void ec_dispatch_mask(ec_fop_data_t * fop, uintptr_t mask) ec_t * ec = fop->xl->private; int32_t count, idx; - count = ec_bits_count(mask); + count = gf_bits_count(mask); LOCK(&fop->lock); @@ -578,7 +578,7 @@ void ec_dispatch_inc(ec_fop_data_t * fop) if (ec_child_select(fop)) { - fop->expected = ec_bits_count(fop->remaining); + fop->expected = gf_bits_count(fop->remaining); fop->first = 0; ec_dispatch_next(fop, 0); @@ -591,7 +591,7 @@ ec_dispatch_all (ec_fop_data_t *fop) ec_dispatch_start(fop); if (ec_child_select(fop)) { - fop->expected = ec_bits_count(fop->remaining); + fop->expected = gf_bits_count(fop->remaining); fop->first = 0; ec_dispatch_mask(fop, fop->remaining); diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index c8f904ac51d..7cf8232353d 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -17,10 +17,6 @@ #include "ec-helpers.h" #include "ec-messages.h" -#ifndef ffsll -#define ffsll(x) __builtin_ffsll(x) -#endif - static const char * ec_fop_list[] = { [-EC_FOP_HEAL] = "HEAL" @@ -96,23 +92,6 @@ void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...) } } -int32_t ec_bits_count(uint64_t n) -{ - n -= (n >> 1) & 0x5555555555555555ULL; - n = ((n >> 2) & 0x3333333333333333ULL) + (n & 0x3333333333333333ULL); - n = (n + (n >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - n += n >> 8; - n += n >> 16; - n += n >> 32; - - return n & 0xFF; -} - -int32_t ec_bits_index(uint64_t n) -{ - return ffsll(n) - 1; -} - int32_t ec_bits_consume(uint64_t * n) { uint64_t tmp; diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index 1f39da2c09f..93d77726089 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -16,8 +16,6 @@ const char * ec_bin(char * str, size_t size, uint64_t value, int32_t digits); const char * ec_fop_name(int32_t id); void ec_trace(const char * event, ec_fop_data_t * fop, const char * fmt, ...); -int32_t ec_bits_count(uint64_t n); -int32_t ec_bits_index(uint64_t n); int32_t ec_bits_consume(uint64_t * n); size_t ec_iov_copy_to(void * dst, struct iovec * vector, int32_t count, off_t offset, size_t size); diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c index 0253b51bf5e..ed835f1aadc 100644 --- a/xlators/cluster/ec/src/ec-locks.c +++ b/xlators/cluster/ec/src/ec-locks.c @@ -52,7 +52,7 @@ int32_t ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) } if (error == -1) { - if (ec_bits_count(locked | notlocked) >= ec->fragments) { + if (gf_bits_count(locked | notlocked) >= ec->fragments) { if (notlocked == 0) { if (fop->answer == NULL) { fop->answer = cbk; -- cgit