diff options
Diffstat (limited to 'xlators')
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 14 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-write.c | 16 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 62 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-open.c | 4 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 144 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 9 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 16 | 
8 files changed, 164 insertions, 102 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 164a651bad5..4c7692bd671 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -3144,6 +3144,8 @@ afr_notify (xlator_t *this, int32_t event,          int             up_child            = -1;          dict_t          *input              = NULL;          dict_t          *output             = NULL; +        gf_boolean_t    had_quorum          = _gf_false; +        gf_boolean_t    has_quorum          = _gf_false;          priv = this->private; @@ -3181,6 +3183,8 @@ afr_notify (xlator_t *this, int32_t event,                  goto out;          } +        had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, +                                                           this);          switch (event) {          case GF_EVENT_CHILD_UP:                  LOCK (&priv->lock); @@ -3279,6 +3283,16 @@ afr_notify (xlator_t *this, int32_t event,                  break;          } +        if (priv->quorum_count) { +                has_quorum = afr_has_quorum (priv->child_up, this); +                if (!had_quorum && has_quorum) +                        gf_log (this->name, GF_LOG_INFO, "Client-quorum" +                                " is met"); +                if (had_quorum && !has_quorum) +                        gf_log (this->name, GF_LOG_WARNING, +                                "Client-quorum is not met"); +        } +          /* have all subvolumes reported status once by now? */          have_heard_from_all = 1;          for (i = 0; i < priv->child_count; i++) { diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 465dde54f9c..d59536fcb48 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -447,8 +447,6 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,          priv = this->private; -        QUORUM_CHECK(create,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -585,8 +583,6 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,          priv = this->private; -        QUORUM_CHECK(mknod,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -718,8 +714,6 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,          priv = this->private; -        QUORUM_CHECK(mkdir,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -849,8 +843,6 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,          priv = this->private; -        QUORUM_CHECK(link,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -981,8 +973,6 @@ afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,          priv = this->private; -        QUORUM_CHECK(symlink,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -1116,8 +1106,6 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,          priv = this->private; -        QUORUM_CHECK(rename,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  op_errno = ENOMEM; @@ -1271,8 +1259,6 @@ afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,          priv = this->private; -        QUORUM_CHECK(unlink,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -1400,8 +1386,6 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,          priv = this->private; -        QUORUM_CHECK(rmdir,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 00e0d26768b..0c3ba920937 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -229,7 +229,7 @@ afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this)                          continue;                  if (local->replies[i].op_ret < local->op_ret) -                        afr_transaction_fop_failed(frame, this, i); +                        afr_transaction_fop_failed (frame, this, i);          }  } @@ -402,14 +402,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,              struct iovec *vector, int32_t count, off_t offset,              uint32_t flags, struct iobref *iobref, dict_t *xdata)  { -        afr_private_t *priv = NULL;          afr_local_t *local = NULL;          int op_errno = ENOMEM; -        priv = this->private; - -        QUORUM_CHECK(writev,out); -  	local = AFR_FRAME_INIT (frame, op_errno);  	if (!local)  		goto out; @@ -528,16 +523,11 @@ int  afr_truncate (call_frame_t *frame, xlator_t *this,                loc_t *loc, off_t offset, dict_t *xdata)  { -        afr_private_t * priv  = NULL;          afr_local_t   * local = NULL;          call_frame_t   *transaction_frame = NULL;  	int ret = -1;          int op_errno = ENOMEM; -        priv = this->private; - -        QUORUM_CHECK(truncate,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -652,16 +642,11 @@ int  afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,  	       dict_t *xdata)  { -        afr_private_t *priv = NULL;          afr_local_t *local = NULL;  	call_frame_t *transaction_frame = NULL;  	int ret = -1;          int op_errno = ENOMEM; -        priv = this->private; - -        QUORUM_CHECK(ftruncate,out); -  	transaction_frame = copy_frame (frame);  	if (!frame)  		goto out; @@ -769,16 +754,11 @@ int  afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,  	     int32_t valid, dict_t *xdata)  { -        afr_private_t *priv = NULL;          afr_local_t *local = NULL;          call_frame_t *transaction_frame = NULL;          int ret = -1;          int op_errno = ENOMEM; -        priv = this->private; - -        QUORUM_CHECK(setattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -879,16 +859,11 @@ int  afr_fsetattr (call_frame_t *frame, xlator_t *this,                fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata)  { -        afr_private_t *priv = NULL;          afr_local_t *local = NULL;          call_frame_t *transaction_frame = NULL;          int ret = -1;          int op_errno = ENOMEM; -        priv = this->private; - -        QUORUM_CHECK(fsetattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -991,7 +966,6 @@ int  afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,  	      int32_t flags, dict_t *xdata)  { -        afr_private_t  *priv              = NULL;          afr_local_t    *local             = NULL;          call_frame_t   *transaction_frame = NULL;          int             ret               = -1; @@ -1003,10 +977,6 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,          GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,                                     op_errno, out); -        priv = this->private; - -        QUORUM_CHECK(setxattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -1107,7 +1077,6 @@ int  afr_fsetxattr (call_frame_t *frame, xlator_t *this,                 fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata)  { -        afr_private_t    *priv              = NULL;          afr_local_t      *local             = NULL;          call_frame_t     *transaction_frame = NULL;          int               ret               = -1; @@ -1119,10 +1088,6 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,          GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,                                     op_errno, out); -        priv = this->private; - -        QUORUM_CHECK(fsetxattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -1226,7 +1191,6 @@ int  afr_removexattr (call_frame_t *frame, xlator_t *this,                   loc_t *loc, const char *name, dict_t *xdata)  { -        afr_private_t   *priv              = NULL;          afr_local_t     *local             = NULL;          call_frame_t    *transaction_frame = NULL;          int              ret               = -1; @@ -1238,10 +1202,6 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,          GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",                                   name, op_errno, out); -        priv = this->private; - -        QUORUM_CHECK(removexattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -1339,7 +1299,6 @@ int  afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,  		  const char *name, dict_t *xdata)  { -	afr_private_t *priv = NULL;          afr_local_t *local = NULL;          call_frame_t *transaction_frame = NULL;          int ret = -1; @@ -1351,10 +1310,6 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,          GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",                                   name, op_errno, out); -	priv = this->private; - -        QUORUM_CHECK(fremovexattr, out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -1455,16 +1410,11 @@ int  afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,                 off_t offset, size_t len, dict_t *xdata)  { -	afr_private_t *priv = NULL;          call_frame_t *transaction_frame = NULL;          afr_local_t *local = NULL;          int ret = -1;          int op_errno = ENOMEM; -	priv = this->private; - -        QUORUM_CHECK(fallocate,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame)                  goto out; @@ -1573,16 +1523,11 @@ int  afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,               size_t len, dict_t *xdata)  { -        afr_private_t *priv  = NULL;          afr_local_t *local = NULL;          call_frame_t *transaction_frame = NULL;          int ret = -1;          int op_errno = ENOMEM; -        priv = this->private; - -        QUORUM_CHECK(discard, out); -  	transaction_frame = copy_frame (frame);  	if (!transaction_frame)  		goto out; @@ -1687,16 +1632,11 @@ int  afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,               size_t len, dict_t *xdata)  { -        afr_private_t *priv  = NULL;          afr_local_t *local = NULL;          call_frame_t *transaction_frame = NULL;          int ret = -1;          int op_errno = ENOMEM; -        priv = this->private; - -        QUORUM_CHECK(discard, out); -  	transaction_frame = copy_frame (frame);  	if (!transaction_frame)  		goto out; diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index f86aa7fd80d..83da86053f4 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -131,10 +131,6 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,          priv = this->private; -        if (flags & (O_CREAT|O_TRUNC)) { -                QUORUM_CHECK(open,out); -        } -  	local = AFR_FRAME_INIT (frame, op_errno);  	if (!local)  		goto out; diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 205ff759ef5..270e2212913 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -430,6 +430,135 @@ afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)  		__mark_all_success (frame, this);  } +gf_boolean_t +afr_has_quorum (unsigned char *subvols, xlator_t *this) +{ +        unsigned int  quorum_count = 0; +        afr_private_t *priv  = NULL; +        unsigned int  up_children_count = 0; + +        priv = this->private; +        up_children_count = AFR_COUNT (subvols, priv->child_count); + +        if (priv->quorum_count == AFR_QUORUM_AUTO) { +        /* +         * Special case for even numbers of nodes in auto-quorum: +         * if we have exactly half children up +         * and that includes the first ("senior-most") node, then that counts +         * as quorum even if it wouldn't otherwise.  This supports e.g. N=2 +         * while preserving the critical property that there can only be one +         * such group. +         */ +                if ((priv->child_count % 2 == 0) && +                    (up_children_count == (priv->child_count/2))) +                        return subvols[0]; +        } + +        if (priv->quorum_count == AFR_QUORUM_AUTO) { +                quorum_count = priv->child_count/2 + 1; +        } else { +                quorum_count = priv->quorum_count; +        } + +        if (up_children_count >= quorum_count) +                return _gf_true; + +        return _gf_false; +} + +static gf_boolean_t +afr_has_fop_quorum (call_frame_t *frame) +{ +        xlator_t        *this = frame->this; +        afr_local_t     *local = frame->local; +        unsigned char *locked_nodes = NULL; + +        locked_nodes = afr_locked_nodes_get (local->transaction.type, +                                             &local->internal_lock); +        return afr_has_quorum (locked_nodes, this); +} + +static gf_boolean_t +afr_has_fop_cbk_quorum (call_frame_t *frame) +{ +        afr_local_t   *local   = frame->local; +        xlator_t      *this    = frame->this; +        afr_private_t *priv    = this->private; +        unsigned char *success = alloca0(priv->child_count); +        int           i        = 0; + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) +                        if (!local->transaction.failed_subvols[i]) +                                success[i] = 1; +        } + +        return afr_has_quorum (success, this); +} + +void +afr_handle_quorum (call_frame_t *frame) +{ +        afr_local_t   *local = NULL; +        afr_private_t *priv  = NULL; +        int           i      = 0; +        const char    *file  = NULL; +        uuid_t        gfid   = {0}; + +        local = frame->local; +        priv  = frame->this->private; + +        if (priv->quorum_count == 0) +                return; + +        /* +         * Network split may happen just after the fops are unwound, so check +         * if the fop succeeded in a way it still follows quorum. If it doesn't, +         * mark the fop as failure, mark the changelogs so it reflects that +         * failure. +         * +         * Scenario: +         * There are 3 mounts on 3 machines(node1, node2, node3) all writing to +         * single file. Network split happened in a way that node1 can't see +         * node2, node3. Node2, node3 both of them can't see node1. Now at the +         * time of sending write all the bricks are up. Just after write fop is +         * wound on node1, network split happens. Node1 thinks write fop failed +         * on node2, node3 so marks pending changelog for those 2 extended +         * attributes on node1. Node2, node3 thinks writes failed on node1 so +         * they mark pending changelog for node1. When the network is stable +         * again the file already is in split-brain. These checks prevent +         * marking pending changelog on other subvolumes if the fop doesn't +         * succeed in a way it is still following quorum. So with this fix what +         * is happening is, node1 will have all pending changelog(FOOL) because +         * the write succeeded only on node1 but failed on node2, node3 so +         * instead of marking pending changelogs on node2, node3 it just treats +         * the fop as failure and goes into DIRTY state. Where as node2, node3 +         * say they are sources and have pending changelog to node1 so there is +         * no split-brain with the fix. The problem is eliminated completely. +         */ + +        if (afr_has_fop_cbk_quorum (frame)) +                return; + +        if (local->fd) { +                uuid_copy (gfid, local->fd->inode->gfid); +                file = uuid_utoa (gfid); +        } else { +                loc_path (&local->loc, local->loc.name); +                file = local->loc.path; +        } + +        gf_log (frame->this->name, GF_LOG_WARNING, "%s: Failing %s as quorum " +                "is not met", file, gf_fop_list[local->op]); + +        for (i = 0; i < priv->child_count; i++) { +                if (local->transaction.pre_op[i]) +                        afr_transaction_fop_failed (frame, frame->this, i); +        } + +        local->op_ret = -1; +        local->op_errno = EROFS; +}  int  afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) @@ -443,6 +572,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)          int            nothing_failed = 1;  	gf_boolean_t   need_undirty = _gf_false; +        afr_handle_quorum (frame);          local = frame->local;  	idx = afr_index_for_transaction_type (local->transaction.type); @@ -453,6 +583,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)  	else  		need_undirty = _gf_true; +        //If the fop fails on all the subvols then pending markers are placed +        //for every subvol on all subvolumes. Which is nothing but split-brain. +        //Avoid this by not doing post-op in case of failures. +        if (local->op_ret < 0) { +                afr_changelog_post_op_done (frame, this); +                goto out; +        } +  	if (nothing_failed && !need_undirty) {  		afr_changelog_post_op_done (frame, this);                  goto out; @@ -842,7 +980,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)  		}  	} -	/* TBD: quorum check w/ call_count */ +        if (priv->quorum_count && !afr_has_fop_quorum (frame)) { +                op_errno = EROFS; +                goto err; +        }          if (call_count == 0) {  		op_errno = ENOTCONN; @@ -1057,7 +1198,6 @@ afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this)          return 0;  } -  int  afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)  { diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 77cc8eed019..c3ce333b771 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -49,5 +49,6 @@ int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);  int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this);  int __afr_txn_write_done (call_frame_t *frame, xlator_t *this);  call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); +gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);  #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index ead08425f2c..6af136e9163 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -180,6 +180,9 @@ reconfigure (xlator_t *this, dict_t *options)          GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,                            uint32, out);          fix_quorum_options(this,priv,qtype); +        if (priv->quorum_count && !afr_has_quorum (priv->child_up, this)) +                gf_log (this->name, GF_LOG_WARNING, "Client-quorum is not met"); +  	GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options,  			  uint32, out); @@ -456,9 +459,6 @@ struct xlator_fops fops = {          .finodelk    = afr_finodelk,          .entrylk     = afr_entrylk,          .fentrylk    = afr_fentrylk, -	.fallocate   = afr_fallocate, -	.discard     = afr_discard, -        .zerofill    = afr_zerofill,          /* inode read */          .access      = afr_access, @@ -479,6 +479,9 @@ struct xlator_fops fops = {          .fsetattr    = afr_fsetattr,          .removexattr = afr_removexattr,          .fremovexattr = afr_fremovexattr, +        .fallocate   = afr_fallocate, +        .discard     = afr_discard, +        .zerofill    = afr_zerofill,          /* dir read */          .opendir     = afr_opendir, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 36042f7b2e5..674ea0bf72e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -922,9 +922,6 @@ int  afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,                        int flags); -gf_boolean_t -afr_have_quorum (char *logname, afr_private_t *priv); -  void  afr_matrix_cleanup (int32_t **pending, unsigned int m); @@ -940,19 +937,6 @@ afr_filter_xattrs (dict_t *xattr);   */  #define AFR_QUORUM_AUTO INT_MAX -/* - * Having this as a macro will make debugging a bit weirder, but does reduce - * the probability of functions handling this check inconsistently. - */ -#define QUORUM_CHECK(_func,_label) do {                                  \ -        if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ -                gf_log(this->name,GF_LOG_WARNING,                        \ -                       "failing "#_func" due to lack of quorum");        \ -                op_errno = EROFS;                                        \ -                goto _label;                                             \ -        }                                                                \ -} while (0); -  int  afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);  | 
