diff options
| author | Anand V. Avati <avati@amp.gluster.com> | 2010-10-18 00:16:31 +0000 | 
|---|---|---|
| committer | Vijay Bellur <vijay@dev.gluster.com> | 2010-10-18 03:25:46 -0700 | 
| commit | f213c1b051d7e91e33a2e4631a9ef383ae48921e (patch) | |
| tree | bd4fb173a3f2d099d6b2b7a7b8fe2c5dd32f2435 | |
| parent | 9be13aff89232c5ede11bdb37c49c2e5dca5d840 (diff) | |
replicate: replace first-write-to-flush optimization
Use a changelog piggybacking optimization instead of the first-write-to-flush
optimization, and do other cleanups (removal of the post-post-op hook, etc.).
Signed-off-by: Anand V. Avati <avati@amp.gluster.com>
Signed-off-by: Vijay Bellur <vijay@dev.gluster.com>
BUG: 1235 (Bug for all pump/migrate commits)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1235
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 16 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-open.c | 77 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 742 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 128 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 27 | 
5 files changed, 423 insertions, 567 deletions
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index a7751e68e9c..8ebba6d4ad2 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -305,13 +305,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          fd_ctx = (afr_fd_ctx_t *)(long) ctx; -        if (fd_ctx->down_count < priv->down_count) { -                local->up_down_flush_cbk = afr_do_writev; -                afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH); - -        } else if (fd_ctx->up_count < priv->up_count) { -                local->up_down_flush_cbk = afr_do_writev; -                afr_up_down_flush (frame, this, fd, AFR_CHILD_UP_FLUSH); +        if (fd_ctx->up_count < priv->up_count) { +                local->openfd_flush_cbk = afr_do_writev; +                afr_openfd_flush (frame, this, fd);          } else {                  afr_do_writev (frame, this); @@ -787,9 +783,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,          fd_ctx = (afr_fd_ctx_t *)(long) ctx; -        if (fd_ctx->down_count < priv->down_count) { -                local->up_down_flush_cbk = afr_do_ftruncate; -                afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH); +        if (fd_ctx->up_count < priv->up_count) { +                local->openfd_flush_cbk = afr_do_ftruncate; +                afr_openfd_flush (frame, this, fd);          } else {                  afr_do_ftruncate (frame, this);          } diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 43a38c0b112..00958438a4c 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -215,9 +215,8 @@ out:  int -afr_up_down_flush_open_cbk (call_frame_t *frame, void *cookie, -                            xlator_t *this, int32_t op_ret, int32_t op_errno, -                            fd_t *fd) +afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, +                        int32_t op_ret, int32_t op_errno, fd_t *fd)  {          afr_local_t *local  = NULL;          afr_private_t *priv = NULL; @@ -257,7 +256,7 @@ out:          call_count = afr_frame_return (frame);          if (call_count == 0) { -                local->transaction.post_post_op (frame, this); +                local->transaction.resume (frame, this);          }          return 0; @@ -265,7 +264,7 @@ out:  static int -__unopened_count (int child_count, unsigned char *opened_on, unsigned char *child_up) +__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up)  {          int i;          int count = 0; @@ -280,7 +279,7 @@ __unopened_count (int child_count, unsigned char *opened_on, unsigned char *chil  int -afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this) +afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local  = NULL;          afr_private_t *priv = NULL; @@ -311,8 +310,17 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)          fd_ctx = (afr_fd_ctx_t *)(long) ctx; -        call_count = __unopened_count (priv->child_count, fd_ctx->opened_on, -                                       local->child_up); +        LOCK (&local->fd->lock); +        { +                call_count = __unopened_count (priv->child_count, +                                               fd_ctx->opened_on, +                                               local->child_up); +                for (i = 0; i < priv->child_count; i++) { +                        fd_ctx->pre_op_done[i] = 0; +                        fd_ctx->pre_op_piggyback[i] = 0; +                } +        } +        UNLOCK (&local->fd->lock);          if (call_count == 0) {                  abandon = 1; @@ -332,7 +340,7 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)                                  "opening fd for %s on subvolume %s",                                  local->loc.path, 
priv->children[i]->name); -                        STACK_WIND_COOKIE (frame, afr_up_down_flush_open_cbk, +                        STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk,                                             (void *)(long) i,                                             priv->children[i],                                             priv->children[i]->fops->open, @@ -346,14 +354,14 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)  out:          if (abandon) -                local->transaction.post_post_op (frame, this); +                local->transaction.resume (frame, this);          return 0;  }  int -afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this) +afr_openfd_sh (call_frame_t *frame, xlator_t *this)  {          afr_private_t *priv = NULL;          afr_local_t *local  = NULL; @@ -369,7 +377,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)          if (ret < 0) {                  gf_log (this->name, GF_LOG_TRACE,                          "Inode path failed. 
Possible open-unlink-write detected"); -                afr_up_down_flush_sh_unwind (frame, this); +                afr_openfd_sh_unwind (frame, this);                  return 0;          } @@ -386,7 +394,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)          sh->need_data_self_heal = _gf_true;          sh->mode                = local->fd->inode->st_mode;          sh->background          = _gf_false; -        sh->unwind              = afr_up_down_flush_sh_unwind; +        sh->unwind              = afr_openfd_sh_unwind;          afr_self_heal (frame, this); @@ -395,21 +403,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)  int -afr_up_down_flush_wind (call_frame_t *frame, xlator_t *this) -{ -	afr_local_t *local = NULL; -	afr_private_t *priv = NULL; - -	local = frame->local; -	priv  = this->private; - -        local->transaction.resume (frame, this); -	return 0; -} - - -int -afr_up_down_flush_done (call_frame_t *frame, xlator_t *this) +afr_openfd_flush_done (call_frame_t *frame, xlator_t *this)  {          afr_private_t *priv = NULL;  	afr_local_t *local  = NULL; @@ -418,7 +412,6 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)          afr_fd_ctx_t * fd_ctx = NULL;          int _ret = -1; -        int i    = 0;          priv  = this->private;  	local = frame->local; @@ -435,26 +428,20 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)                  fd_ctx->down_count = priv->down_count;                  fd_ctx->up_count   = priv->up_count; - -                for (i = 0; i < priv->child_count; i++) { -                        if (local->child_up[i]) -                                fd_ctx->pre_op_done[i] = 0; -                }          }  out:          UNLOCK (&local->fd->lock);          afr_local_transaction_cleanup (local, this); -        local->up_down_flush_cbk (frame, this); +        local->openfd_flush_cbk (frame, this);  	return 0;  }  int -afr_up_down_flush (call_frame_t 
*frame, xlator_t *this, fd_t *fd, -                   afr_flush_type type) +afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)  {  	afr_private_t * priv  = NULL;  	afr_local_t   * local = NULL; @@ -473,18 +460,8 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,  //        local->fd = fd_ref (local->fd); -        local->transaction.fop          = afr_up_down_flush_wind; -        local->transaction.done         = afr_up_down_flush_done; - -        switch (type) { -        case AFR_CHILD_UP_FLUSH: -                local->transaction.post_post_op = afr_up_down_flush_post_post_op; -                break; - -        case AFR_CHILD_DOWN_FLUSH: -                local->transaction.post_post_op = NULL; -                break; -        } +        local->transaction.fop          = afr_openfd_sh; +        local->transaction.done         = afr_openfd_flush_done;          local->transaction.start  = 0;          local->transaction.len    = 0; @@ -493,7 +470,7 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,                  "doing up/down flush on fd=%p",                  fd); -        afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); +        afr_transaction (frame, this, AFR_DATA_TRANSACTION);  	op_ret = 0;  out: diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 75a059f303c..53a2b380d70 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -33,6 +33,25 @@  #define LOCKED_LOWER    0x2        /* for lower_path of RENAME */ +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ +        uint64_t       ctx = 0; +        afr_fd_ctx_t  *fd_ctx = NULL; +        int            ret = 0; + +        ret = fd_ctx_get (fd, this, &ctx); + +        if (ret < 0) +                goto out; + +        fd_ctx = (afr_fd_ctx_t *)(long) ctx; + +out: +        return fd_ctx; +} + +  static void  afr_pid_save (call_frame_t *frame)  { 
@@ -82,79 +101,53 @@ __mark_child_dead (int32_t *pending[], int child_count, int child,  static void -__mark_fop_failed_on_fd (fd_t *fd, xlator_t *this, -                         int child_index) -{ -        uint64_t       ctx; -        afr_fd_ctx_t * fd_ctx = NULL; - -        int ret = 0; - -        ret = fd_ctx_get (fd, this, &ctx); - -        if (ret < 0) -                goto out; - -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -        fd_ctx->child_failed[child_index] = 1; -out: -        return; -} - - -static void -__mark_failed_children (int32_t *pending[], int child_count,  -                        xlator_t *this, fd_t *fd, afr_transaction_type type) +__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)  { -        uint64_t       ctx; +        afr_local_t *local = NULL;          afr_fd_ctx_t * fd_ctx = NULL; -        int ret = 0; -        int i   = 0; -        int j   = 0; +        local = frame->local; -        ret = fd_ctx_get (fd, this, &ctx); +        if (!local->fd) +                return; -        if (ret < 0) +        fd_ctx = afr_fd_ctx_get (local->fd, this); +        if (!fd_ctx)                  goto out; -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -        for (i = 0; i < child_count; i++) { -                j = afr_index_for_transaction_type (type); - -                if (fd_ctx->child_failed[i]) -                        pending[i][j] = 0; +        LOCK (&local->fd->lock); +        { +                if (local->transaction.type == AFR_DATA_TRANSACTION) +                        fd_ctx->pre_op_done[child_index]++;          } -         +        UNLOCK (&local->fd->lock); +  out:          return;  }  static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index)  {          afr_local_t *local = NULL; - -        uint64_t       ctx;          afr_fd_ctx_t * fd_ctx = NULL; -        int ret = 0;          
local = frame->local; -        ret = fd_ctx_get (local->fd, this, &ctx); +        if (!local->fd) +                return; -        if (ret < 0) +        fd_ctx = afr_fd_ctx_get (local->fd, this); +        if (!fd_ctx)                  goto out; -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -        if ((local->op == GF_FOP_WRITE) -            || (local->op == GF_FOP_FTRUNCATE)) { -                fd_ctx->pre_op_done[child_index] = 1; +        LOCK (&local->fd->lock); +        { +                if (local->transaction.type == AFR_DATA_TRANSACTION) +                        fd_ctx->pre_op_done[child_index]--;          } +        UNLOCK (&local->fd->lock);  out:          return; @@ -191,115 +184,6 @@ __mark_all_success (int32_t *pending[], int child_count,  } -static int -__is_first_write_on_fd (xlator_t *this, fd_t *fd) -{ -        int op_ret     = 0; -        int _ret       = -1; -        int i          = 0; - -        uint64_t       ctx; -        afr_fd_ctx_t * fd_ctx = NULL; - -        afr_private_t *priv = NULL; - -        priv = this->private; - -        LOCK (&fd->lock); -        { -                _ret = __fd_ctx_get (fd, this, &ctx); -                 -                if (_ret < 0) { -                        gf_log (this->name, GF_LOG_DEBUG, -                                "could not get fd ctx on fd=%p", -                                fd); -                        goto out; -                } - -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                op_ret = 1; -                for (i = 0; i < priv->child_count; i++) { -                        if (fd_ctx->pre_op_done[i] == 0) -                                continue; - -                        op_ret = 0; -                } -        } -out: -        UNLOCK (&fd->lock); - -        return op_ret; -} - - -static int -__if_fd_pre_op_done (xlator_t *this, fd_t *fd, int child_index) -{ -        int op_ret = 0; -        int _ret   = -1; - -        uint64_t       ctx; -        afr_fd_ctx_t * 
fd_ctx = NULL; - -        LOCK (&fd->lock); -        { -                _ret = __fd_ctx_get (fd, this, &ctx); - -                if (_ret < 0) { -                        goto out; -                } - -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                if (fd_ctx->pre_op_done[child_index]) { -                        op_ret = 1; -                } -                fd_ctx->pre_op_done[child_index] = 0; -        } -out: -        UNLOCK (&fd->lock); - -        return op_ret; -} - - -static int -afr_pre_op_done_count (xlator_t *this, fd_t *fd, unsigned char *child_up) -{ -        int i = 0; -        int count = 0; - -        int _ret = 0; -        uint64_t       ctx; -        afr_fd_ctx_t * fd_ctx = NULL; - -        afr_private_t *priv = NULL; - -        priv = this->private; - -        LOCK (&fd->lock); -        { -                _ret = __fd_ctx_get (fd, this, &ctx); - -                if (_ret < 0) { -                        goto out; -                } - -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                for (i = 0; i < priv->child_count; i++) { -                        if (fd_ctx->pre_op_done[i] && child_up[i]) { -                                count++; -                        } -                } -        } -out: -        UNLOCK (&fd->lock); - -        return count; -} -  static int  __changelog_enabled (afr_private_t *priv, afr_transaction_type type) @@ -325,9 +209,6 @@ __changelog_enabled (afr_private_t *priv, afr_transaction_type type)  			ret = 1;  		break; -		 -	case AFR_FLUSH_TRANSACTION: -		ret = 1;  	}  	return ret; @@ -339,7 +220,6 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)  {  	afr_private_t * priv  = NULL;  	afr_local_t   * local = NULL; -	fd_t *          fd    = NULL;  	int op_ret   = 0; @@ -351,15 +231,7 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)  		case GF_FOP_WRITE:  		case GF_FOP_FTRUNCATE: -			/*  -			   if it's a data transaction, we write the changelog -	
		   only on the first write on an fd  -			*/ -			 -			fd = local->fd; -			if (!fd || __is_first_write_on_fd (this, fd)) -				op_ret = 1; - +                        op_ret = 1;  			break;  		case GF_FOP_FLUSH: @@ -395,11 +267,11 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this)                  case GF_FOP_WRITE:                  case GF_FOP_FTRUNCATE: -                        op_ret = 0; +                        op_ret = 1;                          break;                  case GF_FOP_FLUSH: -                        op_ret = 1; +                        op_ret = 0;                          break;                  default: @@ -412,6 +284,39 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this)  static int +afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, +                        afr_transaction_type type) +{ +        int i; +        int ret = 0; +        int *arr = NULL; +        int index = 0; + +        index = afr_index_for_transaction_type (type); + +        for (i = 0; i < priv->child_count; i++) { +                arr = CALLOC (3 * sizeof (int32_t), priv->child_count); +                if (!arr) { +                        ret = -1; +                        goto out; +                } + +                memcpy (arr, pending[i], 3 * sizeof (int32_t)); +                arr[index]++; +                ret = dict_set_bin (xattr, priv->pending_key[i], +                                    arr, 3 * sizeof (int32_t)); +                /* 3 = data+metadata+entry */ + +                if (ret < 0) +                        goto out; +        } + +out: +        return ret; +} + + +static int  afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending)  {          int i; @@ -437,7 +342,6 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)  	int ret = 0;  	switch (type) { -	case AFR_FLUSH_TRANSACTION:  	case AFR_DATA_TRANSACTION:  		ret = priv->data_lock_server_count;  		
break; @@ -562,7 +466,6 @@ afr_unlock (call_frame_t *frame, xlator_t *this)                  switch (local->transaction.type) {                  case AFR_DATA_TRANSACTION:                  case AFR_METADATA_TRANSACTION: -                case AFR_FLUSH_TRANSACTION:                          if (local->transaction.locked_nodes[i] & LOCKED_YES) {                                  if (local->fd) { @@ -671,11 +574,10 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  {  	afr_private_t * priv  = NULL;  	afr_local_t *   local = NULL; +        int             child_index = -1;  	int call_count = -1; -        int (*post_post_op) (call_frame_t *, xlator_t *); -  	priv  = this->private;  	local = frame->local; @@ -685,25 +587,23 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	}  	UNLOCK (&frame->lock); -	if (call_count == 0) { -                if (local->transaction.post_post_op) { -                        post_post_op = local->transaction.post_post_op; +        child_index = (long) cookie; -                        if (afr_lock_server_count (priv, local->transaction.type) == 0) { -                                local->transaction.post_post_op = local->transaction.done; -                        } else { -                                local->transaction.post_post_op = afr_unlock; -                        } +        if (op_ret == 1) { +                /* cached */ +        } + +        if (op_ret == 0) { +                __mark_pre_op_undone_on_fd (frame, this, child_index); +        } -                        post_post_op (frame, this); +	if (call_count == 0) { +                if (afr_lock_server_count (priv, local->transaction.type) == 0) { +                        local->transaction.done (frame, this);                  } else { -                        if (afr_lock_server_count (priv, local->transaction.type) == 0) { -                                local->transaction.done (frame, this); -                     
   } else { -                                afr_unlock (frame, this); -                        } +                        afr_unlock (frame, this);                  } -	} +        }  	return 0;	  } @@ -720,18 +620,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)  	afr_local_t *  local = NULL;	  	dict_t        **xattr = NULL; +        afr_fd_ctx_t  *fdctx = NULL; +        int            piggyback = 0; +        int            index = 0; +        int            nothing_failed = 1; +  	local = frame->local;  	__mark_down_children (local->pending, priv->child_count,                                 local->child_up, local->transaction.type); -        if (local->op == GF_FOP_FLUSH) { -                __mark_failed_children (local->pending, priv->child_count, -                                        this, local->fd, -                                        local->transaction.type); -        } -          xattr = alloca (priv->child_count * sizeof (*xattr));          memset (xattr, 0, (priv->child_count * sizeof (*xattr)));  	for (i = 0; i < priv->child_count; i++) { @@ -739,16 +638,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)                  dict_ref (xattr[i]);          } -        if (local->op == GF_FOP_FLUSH) { -                call_count = afr_pre_op_done_count (this, local->fd, local->child_up); -        } else { -                call_count = afr_up_children_count (priv->child_count, local->child_up);  +        call_count = afr_up_children_count (priv->child_count, local->child_up);  -                if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { -                        call_count *= 2; -                } +        if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { +                call_count *= 2;          } +        if (local->fd) +                fdctx = afr_fd_ctx_get (local->fd, this); +  	local->call_count = call_count;		  	if (call_count == 0) { @@ -761,100 +659,136 @@ afr_changelog_post_op 
(call_frame_t *frame, xlator_t *this)  		return 0;  	} +        /* check if something has failed, to handle piggybacking */ +        nothing_failed = 1; +        index = afr_index_for_transaction_type (local->transaction.type); +        for (i = 0; i < priv->child_count; i++) { +                if (local->pending[i][index] == 0) { +                        nothing_failed = 0; +                        break; +                } +        } + +  	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -                        ret = afr_set_pending_dict (priv, xattr[i],  -                                                    local->pending); - -			if (ret < 0) -				gf_log (this->name, GF_LOG_DEBUG, -					"failed to set pending entry"); - - -			switch (local->transaction.type) { -			case AFR_DATA_TRANSACTION: -			case AFR_METADATA_TRANSACTION: -			{ -				if (local->fd) -					STACK_WIND (frame, afr_changelog_post_op_cbk, -						    priv->children[i], -						    priv->children[i]->fops->fxattrop, -						    local->fd, -						    GF_XATTROP_ADD_ARRAY, xattr[i]); -				else -					STACK_WIND (frame, afr_changelog_post_op_cbk, -						    priv->children[i], -						    priv->children[i]->fops->xattrop, -						    &local->loc, -						    GF_XATTROP_ADD_ARRAY, xattr[i]); -                                call_count--; -			} -			break; +                if (!local->child_up[i]) +                        continue; + +                ret = afr_set_pending_dict (priv, xattr[i],  +                                            local->pending); + +                if (ret < 0) +                        gf_log (this->name, GF_LOG_DEBUG, +                                "failed to set pending entry"); + + +                switch (local->transaction.type) { +                case AFR_DATA_TRANSACTION: +                { +                        if (!fdctx) { +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            
priv->children[i], +                                            priv->children[i]->fops->xattrop, +                                            &local->loc, +                                            GF_XATTROP_ADD_ARRAY, xattr[i]); +                                break; +                        } -			case AFR_FLUSH_TRANSACTION: -			{ -				if (__if_fd_pre_op_done (this, local->fd, i)) { -					STACK_WIND (frame, afr_changelog_post_op_cbk, -						    priv->children[i],  -						    priv->children[i]->fops->fxattrop, -						    local->fd, -						    GF_XATTROP_ADD_ARRAY, xattr[i]); -                                        call_count--; +                        LOCK (&local->fd->lock); +                        { +                                piggyback = 0; +                                if (fdctx->pre_op_piggyback[i]) { +                                        fdctx->pre_op_piggyback[i]--; +                                        piggyback = 1;                                  } -			} -			break; +                        } +                        UNLOCK (&local->fd->lock); + +                        if (piggyback && !nothing_failed) +                                ret = afr_set_piggyback_dict (priv, xattr[i], +                                                              local->pending, +                                                              local->transaction.type); +                        if (nothing_failed && piggyback) { +                                afr_changelog_post_op_cbk (frame, (void *)(long)i, +                                                           this, 1, 0, xattr[i]); +                        } else { +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_post_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   
priv->children[i]->fops->fxattrop, +                                                   local->fd, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i]); +                        } +                } +                break; -			case AFR_ENTRY_RENAME_TRANSACTION: -			{ -				STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, -						   (void *) (long) i, -						   priv->children[i], -						   priv->children[i]->fops->xattrop, -						   &local->transaction.new_parent_loc, -						   GF_XATTROP_ADD_ARRAY, xattr[i]); -				 -				call_count--; -			} +                case AFR_METADATA_TRANSACTION: +                { +                        if (local->fd) +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i], +                                            priv->children[i]->fops->fxattrop, +                                            local->fd, +                                            GF_XATTROP_ADD_ARRAY, xattr[i]); +                        else +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i], +                                            priv->children[i]->fops->xattrop, +                                            &local->loc, +                                            GF_XATTROP_ADD_ARRAY, xattr[i]); +                        call_count--; +                } +                break; -			/*  -			   set it again because previous stack_wind -			   might have already returned (think of case -			   where subvolume is posix) and would have -			   used the dict as placeholder for return -			   value -			*/ -                         -			ret = afr_set_pending_dict (priv, xattr[i],  -                                                    local->pending); - -			if (ret < 0) -				gf_log (this->name, GF_LOG_DEBUG, -					"failed to set pending entry"); - -			/* fall through */ 
- -			case AFR_ENTRY_TRANSACTION: -			{ -				if (local->fd) -					STACK_WIND (frame, afr_changelog_post_op_cbk, -						    priv->children[i],  -						    priv->children[i]->fops->fxattrop, -						    local->fd,  -						    GF_XATTROP_ADD_ARRAY, xattr[i]); -				else  -					STACK_WIND (frame, afr_changelog_post_op_cbk, -						    priv->children[i],  -						    priv->children[i]->fops->xattrop, -						    &local->transaction.parent_loc,  -						    GF_XATTROP_ADD_ARRAY, xattr[i]); -                                call_count--; -			} -			break; -			} +                case AFR_ENTRY_RENAME_TRANSACTION: +                { +                        STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, +                                           (void *) (long) i, +                                           priv->children[i], +                                           priv->children[i]->fops->xattrop, +                                           &local->transaction.new_parent_loc, +                                           GF_XATTROP_ADD_ARRAY, xattr[i]); +                        call_count--; +                } -			if (!call_count) -				break; -		} +                /*  +                   set it again because previous stack_wind +                   might have already returned (think of case +                   where subvolume is posix) and would have +                   used the dict as placeholder for return +                   value +                */ +                ret = afr_set_pending_dict (priv, xattr[i],  +                                            local->pending); + +                if (ret < 0) +                        gf_log (this->name, GF_LOG_DEBUG, +                                "failed to set pending entry"); + +                /* fall through */ + +                case AFR_ENTRY_TRANSACTION: +                { +                        if (local->fd) +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                         
                   priv->children[i],  +                                            priv->children[i]->fops->fxattrop, +                                            local->fd,  +                                            GF_XATTROP_ADD_ARRAY, xattr[i]); +                        else +                                STACK_WIND (frame, afr_changelog_post_op_cbk, +                                            priv->children[i],  +                                            priv->children[i]->fops->xattrop, +                                            &local->transaction.parent_loc,  +                                            GF_XATTROP_ADD_ARRAY, xattr[i]); +                        call_count--; +                } +                break; +                } + +                if (!call_count) +                        break;  	}          for (i = 0; i < priv->child_count; i++) { @@ -881,6 +815,10 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  	LOCK (&frame->lock);  	{ +                if (op_ret == 1) { +                        /* special op_ret for piggyback */ +                } +                  if (op_ret == 0) {                          __mark_pre_op_done_on_fd (frame, this, child_index);                  } @@ -934,6 +872,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)  	int ret = 0;  	int call_count = 0;		       	dict_t **xattr = NULL; +        afr_fd_ctx_t *fdctx = NULL; +        int           piggyback = 0;  	afr_local_t *local = NULL; @@ -969,97 +909,139 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)  	__mark_all_pending (local->pending, priv->child_count,                              local->transaction.type); +        if (local->fd) +                fdctx = afr_fd_ctx_get (local->fd, this); +  	for (i = 0; i < priv->child_count; i++) { -		if (local->child_up[i]) { -			ret = afr_set_pending_dict (priv, xattr[i],  -                                                    local->pending); - -			if (ret 
< 0) -				gf_log (this->name, GF_LOG_DEBUG, -					"failed to set pending entry"); - - -			switch (local->transaction.type) { -			case AFR_DATA_TRANSACTION: -			case AFR_METADATA_TRANSACTION: -			case AFR_FLUSH_TRANSACTION: -			{ -				if (local->fd) -					STACK_WIND_COOKIE (frame,  -							   afr_changelog_pre_op_cbk, -							   (void *) (long) i, -							   priv->children[i],  -							   priv->children[i]->fops->fxattrop, -							   local->fd, -							   GF_XATTROP_ADD_ARRAY, xattr[i]); -				else -					STACK_WIND_COOKIE (frame,  -							   afr_changelog_pre_op_cbk, -							   (void *) (long) i, -							   priv->children[i],  -							   priv->children[i]->fops->xattrop, -							   &(local->loc),  -							   GF_XATTROP_ADD_ARRAY, xattr[i]); -			} -			break; +                if (!local->child_up[i]) +                        continue; + +                ret = afr_set_pending_dict (priv, xattr[i],  +                                            local->pending); + +                if (ret < 0) +                        gf_log (this->name, GF_LOG_DEBUG, +                                "failed to set pending entry"); + + +                switch (local->transaction.type) { +                case AFR_DATA_TRANSACTION: +                { +                        if (!fdctx) { +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->xattrop, +                                                   &(local->loc), +                                                   GF_XATTROP_ADD_ARRAY, xattr[i]); +                                break; +                        } + +                        LOCK (&local->fd->lock); +                        { +                                piggyback = 0; +     
                           if (fdctx->pre_op_done[i]) { +                                        fdctx->pre_op_piggyback[i]++; +                                        piggyback = 1; +                                        fdctx->hit++; +                                } else { +                                        fdctx->miss++; +                                } +                        } +                        UNLOCK (&local->fd->lock); + +                        if (piggyback) +                                afr_changelog_pre_op_cbk (frame, (void *)(long)i, +                                                          this, 1, 0, xattr[i]); +                        else +                                STACK_WIND_COOKIE (frame, +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i], +                                                   priv->children[i]->fops->fxattrop, +                                                   local->fd, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i]); +                } +                break; + +                case AFR_METADATA_TRANSACTION: +                { +                        if (local->fd) +                                STACK_WIND_COOKIE (frame,  +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i],  +                                                   priv->children[i]->fops->fxattrop, +                                                   local->fd, +                                                   GF_XATTROP_ADD_ARRAY, xattr[i]); +                        else +                                STACK_WIND_COOKIE (frame,  +                                        
           afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i],  +                                                   priv->children[i]->fops->xattrop, +                                                   &(local->loc),  +                                                   GF_XATTROP_ADD_ARRAY, xattr[i]); +                } +                break; -			case AFR_ENTRY_RENAME_TRANSACTION:  -			{ -				STACK_WIND_COOKIE (frame,  -						   afr_changelog_pre_op_cbk, -						   (void *) (long) i, -						   priv->children[i],  -						   priv->children[i]->fops->xattrop, -						   &local->transaction.new_parent_loc,  -						   GF_XATTROP_ADD_ARRAY, xattr[i]); - -				call_count--; -			} +                case AFR_ENTRY_RENAME_TRANSACTION:  +                { +                        STACK_WIND_COOKIE (frame,  +                                           afr_changelog_pre_op_cbk, +                                           (void *) (long) i, +                                           priv->children[i],  +                                           priv->children[i]->fops->xattrop, +                                           &local->transaction.new_parent_loc,  +                                           GF_XATTROP_ADD_ARRAY, xattr[i]); + +                        call_count--; +                } -			/*  -			   set it again because previous stack_wind -			   might have already returned (think of case -			   where subvolume is posix) and would have -			   used the dict as placeholder for return -			   value -			*/ +                /*  +                   set it again because previous stack_wind +                   might have already returned (think of case +                   where subvolume is posix) and would have +                   used the dict as placeholder for return +                   value +                */ -			ret = afr_set_pending_dict (priv, xattr[i],  -    
                                                local->pending); +                ret = afr_set_pending_dict (priv, xattr[i],  +                                            local->pending); -			if (ret < 0) -				gf_log (this->name, GF_LOG_DEBUG, -					"failed to set pending entry"); +                if (ret < 0) +                        gf_log (this->name, GF_LOG_DEBUG, +                                "failed to set pending entry"); -			/* fall through */ +                /* fall through */ -			case AFR_ENTRY_TRANSACTION: -			{ -				if (local->fd) -					STACK_WIND_COOKIE (frame,  -							   afr_changelog_pre_op_cbk, -							   (void *) (long) i, -							   priv->children[i],  -							   priv->children[i]->fops->fxattrop, -							   local->fd,  -							   GF_XATTROP_ADD_ARRAY, xattr[i]); -				else -					STACK_WIND_COOKIE (frame,  -							   afr_changelog_pre_op_cbk, -							   (void *) (long) i, -							   priv->children[i],  -							   priv->children[i]->fops->xattrop, -							   &local->transaction.parent_loc,  -							   GF_XATTROP_ADD_ARRAY, xattr[i]); -			} +                case AFR_ENTRY_TRANSACTION: +                { +                        if (local->fd) +                                STACK_WIND_COOKIE (frame,  +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   priv->children[i],  +                                                   priv->children[i]->fops->fxattrop, +                                                   local->fd,  +                                                   GF_XATTROP_ADD_ARRAY, xattr[i]); +                        else +                                STACK_WIND_COOKIE (frame,  +                                                   afr_changelog_pre_op_cbk, +                                                   (void *) (long) i, +                                                   
priv->children[i],  +                                                   priv->children[i]->fops->xattrop, +                                                   &local->transaction.parent_loc,  +                                                   GF_XATTROP_ADD_ARRAY, xattr[i]); +                } -			break; -			} +                break; +                } -			if (!--call_count) -				break; -		} +                if (!--call_count) +                        break;  	}          for (i = 0; i < priv->child_count; i++) { @@ -1294,7 +1276,6 @@ int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index)  	switch (local->transaction.type) {  	case AFR_DATA_TRANSACTION:		  	case AFR_METADATA_TRANSACTION: -	case AFR_FLUSH_TRANSACTION:  		if (local->fd) {  			STACK_WIND_COOKIE (frame, afr_lock_cbk, @@ -1421,15 +1402,8 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index  	local = frame->local;  	priv  = this->private; -        switch (local->op) { -        case GF_FOP_WRITE: -                __mark_fop_failed_on_fd (local->fd, this, child_index); -                break; -        default: -                __mark_child_dead (local->pending, priv->child_count, -                                   child_index, local->transaction.type); -                break; -        } +        __mark_child_dead (local->pending, priv->child_count, +                           child_index, local->transaction.type);  } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 90c7d7ab511..ced89bdb1c9 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -1121,10 +1121,10 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)                          goto unlock;                  } -                fd_ctx->child_failed = CALLOC (sizeof (*fd_ctx->child_failed), -                                               priv->child_count); +                fd_ctx->pre_op_piggyback = CALLOC (sizeof (*fd_ctx->pre_op_piggyback), +        
                                           priv->child_count); -                if (!fd_ctx->child_failed) { +                if (!fd_ctx->pre_op_piggyback) {                          gf_log (this->name, GF_LOG_ERROR,                                  "Out of memory"); @@ -1278,73 +1278,6 @@ afr_flush_done (call_frame_t *frame, xlator_t *this)  int -afr_plain_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                     int32_t op_ret, int32_t op_errno) - -{ -	afr_local_t *local = NULL; - -	int call_count = -1; - -	local = frame->local; - -	LOCK (&frame->lock); -	{ -		if (op_ret == 0) -			local->op_ret = 0; - -		local->op_errno = op_errno; -	} -	UNLOCK (&frame->lock); - -	call_count = afr_frame_return (frame); - -	if (call_count == 0) -		AFR_STACK_UNWIND (flush, frame, local->op_ret, local->op_errno); - -	return 0; -} - - -static int -__no_pre_op_done (xlator_t *this, fd_t *fd) -{ -        int i      = 0; -        int op_ret = 1; - -        int _ret = 0; -        uint64_t       ctx; -        afr_fd_ctx_t * fd_ctx = NULL; - -        afr_private_t *priv = NULL; - -        priv = this->private; - -        LOCK (&fd->lock); -        { -                _ret = __fd_ctx_get (fd, this, &ctx); - -                if (_ret < 0) { -                        goto out; -                } - -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                for (i = 0; i < priv->child_count; i++) { -                        if (fd_ctx->pre_op_done[i]) { -                                op_ret = 0; -                                break; -                        } -                } -        } -out: -        UNLOCK (&fd->lock); - -        return op_ret; -} - - -int  afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)  {  	afr_private_t * priv  = NULL; @@ -1357,7 +1290,6 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)  	int op_ret   = -1;  	int op_errno = 0; -        int i          = 0;          int call_count = 0;  	VALIDATE_OR_GOTO 
(frame, out); @@ -1376,45 +1308,29 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)          call_count = afr_up_children_count (priv->child_count, local->child_up); -        if (__no_pre_op_done (this, fd)) { -                frame->local = local; - -                for (i = 0; i < priv->child_count; i++) { -                        if (local->child_up[i]) { -                                STACK_WIND_COOKIE (frame, afr_plain_flush_cbk, -                                                   (void *) (long) i, -                                                   priv->children[i], -                                                   priv->children[i]->fops->flush, -                                                   fd); -                                if (!--call_count) -                                        break; -                        } -                } -        } else { -                transaction_frame = copy_frame (frame); -                if (!transaction_frame) { -                        op_errno = ENOMEM; -                        gf_log (this->name, GF_LOG_ERROR, -                                "Out of memory."); -                        goto out; -                } +        transaction_frame = copy_frame (frame); +        if (!transaction_frame) { +                op_errno = ENOMEM; +                gf_log (this->name, GF_LOG_ERROR, +                        "Out of memory."); +                goto out; +        } -                transaction_frame->local = local; +        transaction_frame->local = local; -                local->op = GF_FOP_FLUSH; +        local->op = GF_FOP_FLUSH; -                local->transaction.fop    = afr_flush_wind; -                local->transaction.done   = afr_flush_done; -                local->transaction.unwind = afr_flush_unwind; +        local->transaction.fop    = afr_flush_wind; +        local->transaction.done   = afr_flush_done; +        local->transaction.unwind = afr_flush_unwind; -                
local->fd                 = fd_ref (fd); +        local->fd                 = fd_ref (fd); -                local->transaction.main_frame = frame; -                local->transaction.start  = 0; -                local->transaction.len    = 0; +        local->transaction.main_frame = frame; +        local->transaction.start  = 0; +        local->transaction.len    = 0; -                afr_transaction (transaction_frame, this, AFR_FLUSH_TRANSACTION); -        } +        afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);  	op_ret = 0;  out: @@ -1446,8 +1362,8 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)          fd_ctx = (afr_fd_ctx_t *)(long) ctx;          if (fd_ctx) { -                if (fd_ctx->child_failed) -                        FREE (fd_ctx->child_failed); +                if (fd_ctx->pre_op_piggyback) +                        FREE (fd_ctx->pre_op_piggyback);                  if (fd_ctx->pre_op_done)                          FREE (fd_ctx->pre_op_done); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 6e1f810b67d..17310c8fd96 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -162,7 +162,6 @@ typedef enum {  	AFR_METADATA_TRANSACTION,      /* chmod, chown, ... */  	AFR_ENTRY_TRANSACTION,         /* create, rmdir, ... 
*/  	AFR_ENTRY_RENAME_TRANSACTION,  /* rename */ -	AFR_FLUSH_TRANSACTION,         /* flush */  } afr_transaction_type; @@ -179,7 +178,6 @@ afr_index_for_transaction_type (afr_transaction_type type)          switch (type) {          case AFR_DATA_TRANSACTION: -        case AFR_FLUSH_TRANSACTION:                  return 0;          case AFR_METADATA_TRANSACTION: @@ -194,12 +192,6 @@ afr_index_for_transaction_type (afr_transaction_type type)  } -typedef enum { -        AFR_CHILD_UP_FLUSH, -        AFR_CHILD_DOWN_FLUSH, -} afr_flush_type; - -  typedef struct _afr_local {  	unsigned int call_count;  	unsigned int success_count; @@ -235,7 +227,7 @@ typedef struct _afr_local {  	int32_t  inodelk_count;  	int32_t  entrylk_count; -        int (*up_down_flush_cbk) (call_frame_t *, xlator_t *); +        int (*openfd_flush_cbk) (call_frame_t *, xlator_t *);  	/*   	   This struct contains the arguments for the "continuation" @@ -535,8 +527,6 @@ typedef struct _afr_local {  		int (*unwind) (call_frame_t *frame, xlator_t *this); -                /* post-op hook */ -                int (*post_post_op) (call_frame_t *frame, xlator_t *this);  	} transaction;  	afr_self_heal_t self_heal; @@ -544,15 +534,17 @@ typedef struct _afr_local {  typedef struct { -        unsigned char *pre_op_done; -        unsigned char *opened_on;     /* which subvolumes the fd is open on */ -        unsigned char *child_failed; +        unsigned int *pre_op_done; +        unsigned int *opened_on;     /* which subvolumes the fd is open on */ +        unsigned int *pre_op_piggyback;          int flags;          int32_t wbflags;          uint64_t up_count;   /* number of CHILD_UPs this fd has seen */          uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */          int32_t last_tried; + +        int hit, miss;          gf_boolean_t failed_over;          struct list_head entries; /* needed for readdir failover */  } afr_fd_ctx_t; @@ -623,9 +615,6 @@ int  afr_open (call_frame_t *frame, 
xlator_t *this, loc_t *loc, int32_t flags,            fd_t *fd, int32_t wbflags); -int -afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, afr_flush_type type); -  void  afr_set_opendir_done (xlator_t *this, inode_t *inode); @@ -638,6 +627,10 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);  int  afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); +int +afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); + +  #define AFR_STACK_UNWIND(fop, frame, params ...)        \  	do {						\  		afr_local_t *__local = NULL;		\  | 
