From 786343abca3474ff01aa1017210112d97cbc4843 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Mon, 4 Sep 2017 16:57:25 +0530 Subject: cluster/afr: Fail open on split-brain Problem: Append on a file with split-brain succeeds. Open is intercepted by open-behind, when write comes on the file, open-behind does open+write. Open succeeds because afr doesn't fail it. Then write succeeds because write-behind intercepts it. Flush is also intercepted by write-behind, so the application never gets to know that the write failed. Fix: Fail open on split-brain, so that when open-behind does open+write open fails which leads to write failure. Application will know about this failure. Change-Id: I4bff1c747c97bb2925d6987f4ced5f1ce75dbc15 BUG: 1294051 Signed-off-by: Pranith Kumar K --- xlators/cluster/afr/src/afr-common.c | 77 ++++++++++++++++++-- xlators/cluster/afr/src/afr-inode-write.c | 2 +- xlators/cluster/afr/src/afr-open.c | 93 +++++++++++++++++------- xlators/cluster/afr/src/afr-self-heal-common.c | 11 ++- xlators/cluster/afr/src/afr-self-heal-data.c | 58 ++++++++++++++- xlators/cluster/afr/src/afr-self-heal-metadata.c | 4 +- xlators/cluster/afr/src/afr-self-heal-name.c | 2 +- xlators/cluster/afr/src/afr-self-heal.h | 2 +- xlators/cluster/afr/src/afr-self-heald.c | 6 +- xlators/cluster/afr/src/afr-transaction.c | 43 +---------- xlators/cluster/afr/src/afr.h | 5 +- 11 files changed, 208 insertions(+), 95 deletions(-) (limited to 'xlators/cluster') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 01357fe2e08..f2a165d2b05 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -253,8 +253,9 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, local->transaction.in_flight_sb = _gf_true; metadatamap |= (1 << index); } - if (metadatamap_old != metadatamap) + if (metadatamap_old != metadatamap) { event = 0; + } break; case AFR_DATA_TRANSACTION: @@ -281,19 +282,71 @@ __afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, return ret; } -int -afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, inode_t *inode) +gf_boolean_t +afr_is_symmetric_error (call_frame_t *frame, xlator_t *this) { - int ret = -1; + afr_local_t *local = NULL; afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; + + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least one subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; + + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. + */ + matching_errors = _gf_false; + break; + } + + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; + } + } + + return matching_errors; +} + +int +afr_set_in_flight_sb_status (xlator_t *this, call_frame_t *frame, + inode_t *inode) +{ + int ret = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; priv = this->private; + local = frame->local; /* If this transaction saw no failures, then exit. */ if (AFR_COUNT (local->transaction.failed_subvols, priv->child_count) == 0) return 0; + if (afr_is_symmetric_error (frame, this)) + return 0; + LOCK (&inode->lock); { ret = __afr_set_in_flight_sb_status (this, local, inode); @@ -546,8 +599,9 @@ afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this, } } else { /* For files, abort in case of data/metadata split-brain. */ - if (!data_count || !metadata_count) + if (!data_count || !metadata_count) { return -EIO; + } } if (type == AFR_METADATA_TRANSACTION && readable) @@ -1952,6 +2006,11 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->cont.opendir.checksum); } + { /* open */ + if (local->cont.open.fd) + fd_unref (local->cont.open.fd); + } + { /* readdirp */ if (local->cont.readdir.dict) dict_unref (local->cont.readdir.dict); @@ -2529,9 +2588,11 @@ afr_lookup_metadata_heal_check (call_frame_t *frame, xlator_t *this) if (!afr_can_start_metadata_self_heal (frame, this)) goto out; - heal = afr_frame_create (this); - if (!heal) + heal = afr_frame_create (this, &ret); + if (!heal) { + ret = -ret; goto out; + } ret = synctask_new (this->ctx->env, afr_lookup_sh_metadata_wrap, afr_refresh_selfheal_done, heal, frame); @@ -2624,7 +2685,7 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) } if (need_heal) { - heal = afr_frame_create (this); + heal = afr_frame_create (this, NULL); if (!heal) goto metadata_heal; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 6651e92f482..97397f986b5 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -131,7 +131,7 @@ __afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) } } - afr_set_in_flight_sb_status (this, local, local->inode); + afr_set_in_flight_sb_status (this, frame, local->inode); } diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 7a628350c34..6c625ccf7da 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -66,16 +66,15 @@ afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } - int afr_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = (long) cookie; - afr_fd_ctx_t *fd_ctx = NULL; + afr_local_t *local = NULL; + int call_count = -1; + int child_index = (long) cookie; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; fd_ctx = local->fd_ctx; @@ -103,24 +102,62 @@ afr_open_cbk (call_frame_t *frame, void *cookie, fd, 0, NULL); } else { AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd, - local->xdata_rsp); + local->op_errno, local->cont.open.fd, + local->xdata_rsp); } } return 0; } + +int +afr_open_continue (call_frame_t *frame, xlator_t *this, int err) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + + local = frame->local; + priv = this->private; + + if (err) { + AFR_STACK_UNWIND (open, frame, -1, -err, NULL, NULL); + } else { + local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, + (void *)(long)i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + (local->cont.open.flags & ~O_TRUNC), + local->cont.open.fd, + local->xdata_req); + if (!--call_count) + break; + } + } + } + return 0; +} + int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = 0; - afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int spb_choice = 0; + int event_generation = 0; + int ret = 0; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; //We can't let truncation to happen outside transaction. @@ -140,23 +177,27 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, if (!afr_is_consistent_io_possible (local, priv, &op_errno)) goto out; - local->fd = fd_ref (fd); + local->inode = inode_ref (loc->inode); + loc_copy (&local->loc, loc); local->fd_ctx = fd_ctx; fd_ctx->flags = flags; - - call_count = local->call_count; + if (xdata) + local->xdata_req = dict_ref (xdata); local->cont.open.flags = flags; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - loc, (flags & ~O_TRUNC), fd, xdata); - if (!--call_count) - break; - } + local->cont.open.fd = fd_ref (fd); + + ret = afr_inode_get_readable (frame, local->inode, this, + NULL, &event_generation, + AFR_DATA_TRANSACTION); + if ((ret < 0) && + (afr_inode_split_brain_choice_get (local->inode, + this, &spb_choice) == 0) && + spb_choice < 0) { + afr_inode_refresh (frame, this, local->inode, + local->inode->gfid, afr_open_continue); + } else { + afr_open_continue (frame, this, 0); } return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 20e81dd43e0..26d3860b234 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -66,9 +66,9 @@ afr_lookup_and_heal_gfid (xlator_t *this, inode_t *parent, const char *name, goto out; } - frame = afr_frame_create (this); + frame = afr_frame_create (this, &ret); if (!frame) { - ret = -ENOMEM; + ret = -ret; goto out; } @@ -2349,18 +2349,17 @@ afr_inode_find (xlator_t *this, uuid_t gfid) call_frame_t * -afr_frame_create (xlator_t *this) +afr_frame_create (xlator_t *this, int32_t *op_errno) { call_frame_t *frame = NULL; afr_local_t *local = NULL; - int op_errno = 0; pid_t pid = GF_CLIENT_PID_SELF_HEALD; frame = create_frame (this, this->ctx->pool); if (!frame) return NULL; - local = AFR_FRAME_INIT (frame, op_errno); + local = AFR_FRAME_INIT (frame, (*op_errno)); if (!local) { STACK_DESTROY (frame->root); return NULL; @@ -2490,7 +2489,7 @@ afr_selfheal (xlator_t *this, uuid_t gfid) call_frame_t *frame = NULL; afr_local_t *local = NULL; - frame = afr_frame_create (this); + frame = afr_frame_create (this, NULL); if (!frame) return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 2c254e80aa1..8cf43f2807b 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -776,13 +776,37 @@ out: return ret; } +int +afr_selfheal_data_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd, dict_t *xdata) +{ + afr_local_t *local = NULL; + int i = (long) cookie; + + local = frame->local; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + + syncbarrier_wake (&local->barrier); + + return 0; +} int afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd) { - int ret = 0; - fd_t *fd_tmp = NULL; - loc_t loc = {0,}; + int ret = 0; + fd_t *fd_tmp = NULL; + loc_t loc = {0,}; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; fd_tmp = fd_create (inode, 0); if (!fd_tmp) @@ -791,7 +815,31 @@ afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd) loc.inode = inode_ref (inode); gf_uuid_copy (loc.gfid, inode->gfid); - ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd_tmp, NULL, NULL); + frame = afr_frame_create (this, &ret); + if (!frame) { + ret = -ret; + fd_unref (fd_tmp); + goto out; + } + local = frame->local; + + AFR_ONLIST (local->child_up, frame, afr_selfheal_data_open_cbk, open, + &loc, O_RDWR|O_LARGEFILE, fd_tmp, NULL); + + ret = -ENOTCONN; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + + if (local->replies[i].op_ret < 0) { + ret = -local->replies[i].op_errno; + continue; + } + + ret = 0; + break; + } + if (ret < 0) { fd_unref (fd_tmp); goto out; @@ -802,6 +850,8 @@ afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd) *fd = fd_tmp; out: loc_wipe (&loc); + if (frame) + AFR_STACK_DESTROY (frame); return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index f23cf8ec6ee..199f8961480 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -486,9 +486,9 @@ afr_selfheal_metadata_by_stbuf (xlator_t *this, struct iatt *stbuf) goto out; } - frame = afr_frame_create (this); + frame = afr_frame_create (this, &ret); if (!frame) { - ret = -ENOMEM; + ret = -ret; goto out; } diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index 352d151207e..556d14baf82 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -670,7 +670,7 @@ afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname, if (!parent) goto out; - frame = afr_frame_create (this); + frame = afr_frame_create (this, NULL); if (!frame) goto out; diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 8b0047df28f..8e976905e97 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -209,7 +209,7 @@ afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, int subvol, dict_t *xattr, dict_t *xdata); call_frame_t * -afr_frame_create (xlator_t *this); +afr_frame_create (xlator_t *this, int32_t *op_errno); inode_t * afr_inode_find (xlator_t *this, uuid_t gfid); diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 74c9bb67931..19cde88a6d5 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -260,7 +260,7 @@ afr_shd_zero_xattrop (xlator_t *this, uuid_t gfid) int raw[AFR_NUM_CHANGE_LOGS] = {0}; priv = this->private; - frame = afr_frame_create (this); + frame = afr_frame_create (this, NULL); if (!frame) goto out; inode = afr_inode_find (this, gfid); @@ -457,9 +457,9 @@ afr_shd_index_sweep (struct subvol_healer *healer, char *vgfid) priv = healer->this->private; subvol = priv->children[healer->subvol]; - frame = afr_frame_create (healer->this); + frame = afr_frame_create (healer->this, &ret); if (!frame) { - ret = -ENOMEM; + ret = -ret; goto out; } diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 35621d98493..791c70f6fcc 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -626,51 +626,10 @@ afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) return _gf_true; } - void afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int op_errno = 0; - int i_errno = 0; - gf_boolean_t matching_errors = _gf_true; - int i = 0; - - priv = this->private; - local = frame->local; - - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret != -1) { - /* Operation succeeded on at least on subvol, - so it is not a failed-everywhere situation. - */ - matching_errors = _gf_false; - break; - } - i_errno = local->replies[i].op_errno; - - if (i_errno == ENOTCONN) { - /* ENOTCONN is not a symmetric error. We do not - know if the operation was performed on the - backend or not. - */ - matching_errors = _gf_false; - break; - } - - if (!op_errno) { - op_errno = i_errno; - } else if (op_errno != i_errno) { - /* Mismatching op_errno's */ - matching_errors = _gf_false; - break; - } - } - - if (matching_errors) + if (afr_is_symmetric_error (frame, this)) __mark_all_success (frame, this); } diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index c4ceb66914f..8cd687398f0 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -519,6 +519,7 @@ typedef struct _afr_local { struct { int32_t flags; + fd_t *fd; } open; struct { @@ -1213,7 +1214,7 @@ int afr_get_msg_id (char *op_type); int -afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, +afr_set_in_flight_sb_status (xlator_t *this, call_frame_t *frame, inode_t *inode); int32_t @@ -1262,4 +1263,6 @@ int afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this, char *buf, const char *default_str, int32_t *serz_len, char delimiter); +gf_boolean_t +afr_is_symmetric_error (call_frame_t *frame, xlator_t *this); #endif /* __AFR_H__ */ -- cgit