From 75c70231fe7e13a68239dac889568d243a79a75e Mon Sep 17 00:00:00 2001 From: Vikas Gorur Date: Thu, 2 Apr 2009 08:31:23 -0700 Subject: Load balance read operations among subvolumes in afr Signed-off-by: Anand V. Avati --- xlators/cluster/afr/src/afr-dir-write.c | 225 +++++++++++++++++++++-- xlators/cluster/afr/src/afr-inode-read.c | 305 +++++++++++++++++++++++-------- xlators/cluster/afr/src/afr.c | 85 ++++++--- xlators/cluster/afr/src/afr.h | 8 +- 4 files changed, 513 insertions(+), 110 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index af72c6440b6..faaf75e45b6 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -72,6 +72,40 @@ afr_build_parent_loc (loc_t *parent, loc_t *child) } +afr_inode_ctx_t * +afr_get_inode_ctx (xlator_t *this, inode_t *inode) +{ + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + + int ret = 0; + + LOCK (&inode->lock); + { + ret = __inode_ctx_get (inode, this, &ctx); + + if (ret < 0) { + inode_ctx = CALLOC (1, sizeof (afr_inode_ctx_t)); + + ret = __inode_ctx_put (inode, this, + (uint64_t)(long) inode_ctx); + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set inode ctx"); + FREE (inode_ctx); + inode_ctx = NULL; + } + } else { + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + } + } + UNLOCK (&inode->lock); + + return inode_ctx; +} + + /* {{{ create */ int @@ -91,11 +125,13 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) } UNLOCK (&frame->lock); - if (main_frame) + if (main_frame) { AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, local->cont.create.fd, local->cont.create.inode, &local->cont.create.buf); + } + return 0; } @@ -107,6 +143,8 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { afr_local_t * local = NULL; afr_private_t * priv = NULL; + + afr_inode_ctx_t * inode_ctx = NULL; int call_count = -1; int child_index = -1; @@ -124,14 +162,36 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if ((local->success_count == 0) - || (child_index == priv->read_child)) { + if (local->success_count == 0) { local->cont.create.buf = *buf; local->cont.create.buf.st_ino = afr_itransform (buf->st_ino, priv->child_count, child_index); + + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } } + + if (child_index == local->read_child_index) { + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } + } + local->cont.create.inode = inode; local->success_count++; @@ -246,6 +306,13 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_copy (&local->loc, loc); + LOCK (&priv->read_child_lock); + { + local->read_child_index = (++priv->read_child_rr) + % (priv->child_count); + } + UNLOCK (&priv->read_child_lock); + local->cont.create.flags = flags; local->cont.create.mode = mode; local->cont.create.fd = fd_ref (fd); @@ -294,10 +361,12 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) } UNLOCK (&frame->lock); - if (main_frame) + if (main_frame) { AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, local->cont.mknod.inode, &local->cont.mknod.buf); + } + return 0; } @@ -310,6 +379,8 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t * local = NULL; afr_private_t * priv = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + int call_count = -1; int child_index = -1; @@ -326,14 +397,36 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if ((local->success_count == 0) - || (child_index == priv->read_child)) { + if (local->success_count == 0){ local->cont.mknod.buf = *buf; local->cont.mknod.buf.st_ino = afr_itransform (buf->st_ino, priv->child_count, child_index); + + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } } + + if (child_index == local->read_child_index) { + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } + } + local->cont.mknod.inode = inode; local->success_count++; @@ -444,6 +537,13 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_copy (&local->loc, loc); + LOCK (&priv->read_child_lock); + { + local->read_child_index = (++priv->read_child_rr) + % (priv->child_count); + } + UNLOCK (&priv->read_child_lock); + local->cont.mknod.mode = mode; local->cont.mknod.dev = dev; @@ -492,10 +592,12 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) } UNLOCK (&frame->lock); - if (main_frame) + if (main_frame) { AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, local->cont.mkdir.inode, &local->cont.mkdir.buf); + } + return 0; } @@ -508,6 +610,8 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t * local = NULL; afr_private_t * priv = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + int call_count = -1; int child_index = -1; @@ -524,13 +628,35 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if ((local->success_count == 0) - || (child_index == priv->read_child)) { + if (local->success_count == 0) { local->cont.mkdir.buf = *buf; local->cont.mkdir.buf.st_ino = afr_itransform (buf->st_ino, priv->child_count, child_index); + + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } } + + if (child_index == local->read_child_index) { + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } + } + local->cont.mkdir.inode = inode; local->success_count++; @@ -642,6 +768,13 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_copy (&local->loc, loc); + LOCK (&priv->read_child_lock); + { + local->read_child_index = (++priv->read_child_rr) + % (priv->child_count); + } + UNLOCK (&priv->read_child_lock); + local->cont.mkdir.mode = mode; local->transaction.fop = afr_mkdir_wind; @@ -710,6 +843,8 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t * local = NULL; afr_private_t * priv = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + int call_count = -1; int child_index = -1; @@ -726,13 +861,35 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if ((local->success_count == 0) - || (child_index == priv->read_child)) { + if (local->success_count == 0) { local->cont.link.buf = *buf; local->cont.link.buf.st_ino = afr_itransform (buf->st_ino, priv->child_count, child_index); + + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } } + + if (child_index == local->read_child_index) { + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } + } + local->cont.link.inode = inode; local->success_count++; @@ -844,6 +1001,13 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); + LOCK (&priv->read_child_lock); + { + local->read_child_index = (++priv->read_child_rr) + % (priv->child_count); + } + UNLOCK (&priv->read_child_lock); + local->cont.link.ino = oldloc->inode->ino; local->transaction.fop = afr_link_wind; @@ -892,10 +1056,12 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) } UNLOCK (&frame->lock); - if (main_frame) + if (main_frame) { AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, local->cont.symlink.inode, &local->cont.symlink.buf); + } + return 0; } @@ -908,6 +1074,8 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t * local = NULL; afr_private_t * priv = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + int call_count = -1; int child_index = -1; @@ -924,13 +1092,35 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if ((local->success_count == 0) - || (child_index == priv->read_child)) { + if (local->success_count == 0) { local->cont.symlink.buf = *buf; local->cont.symlink.buf.st_ino = afr_itransform (buf->st_ino, priv->child_count, child_index); + + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } } + + if (child_index == local->read_child_index) { + inode_ctx = afr_get_inode_ctx (this, inode); + + if (inode_ctx) { + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } + } + local->cont.symlink.inode = inode; local->success_count++; @@ -1043,6 +1233,13 @@ afr_symlink (call_frame_t *frame, xlator_t *this, loc_copy (&local->loc, loc); + LOCK (&priv->read_child_lock); + { + local->read_child_index = (++priv->read_child_rr) + % (priv->child_count); + } + UNLOCK (&priv->read_child_lock); + local->cont.symlink.ino = loc->inode->ino; local->cont.symlink.linkpath = strdup (linkpath); diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index fd1edc3b593..97b429049c7 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -49,7 +49,7 @@ /** * Common algorithm for inode read calls: - * + * * - Try the fop on the first child that is up * - if we have failed due to ENOTCONN: * try the next child @@ -70,13 +70,17 @@ afr_access_cbk (call_frame_t *frame, void *cookie, int unwind = 1; int last_tried = -1; int this_try = -1; + int read_child = -1; priv = this->private; children = priv->children; local = frame->local; + read_child = (long) cookie; + if (op_ret == -1) { + retry: last_tried = local->cont.access.last_tried; if (all_tried (last_tried, priv->child_count)) { @@ -84,11 +88,15 @@ afr_access_cbk (call_frame_t *frame, void *cookie, } this_try = ++local->cont.access.last_tried; + if (this_try == read_child) { + goto retry; + } + unwind = 0; STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) this_try, - children[this_try], + (void *) (long) read_child, + children[this_try], children[this_try]->fops->access, &local->loc, local->cont.access.mask); } @@ -111,6 +119,10 @@ afr_access (call_frame_t *frame, xlator_t *this, int call_child = 0; afr_local_t *local = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + int ret = 0; + int32_t op_ret = -1; int32_t op_errno = 0; @@ -125,15 +137,34 @@ afr_access (call_frame_t *frame, xlator_t *this, ALLOC_OR_GOTO (local, afr_local_t, out); - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_ERROR, - "no child is up :("); - goto out; - } + ret = inode_ctx_get (loc->inode, this, + &ctx); + if (ret < 0) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, + "inode ctx not set!"); + goto out; + } + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (inode_ctx->read_child >= 0) { + call_child = inode_ctx->read_child; + + local->cont.access.last_tried = -1; + + } else { + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.access.last_tried = call_child; + } - local->cont.access.last_tried = call_child; loc_copy (&local->loc, loc); local->cont.access.mask = mask; @@ -164,16 +195,15 @@ afr_stat_cbk (call_frame_t *frame, void *cookie, afr_local_t * local = NULL; xlator_t ** children = NULL; - int deitransform_child = -1; - int unwind = 1; int last_tried = -1; int this_try = -1; + int read_child = -1; priv = this->private; children = priv->children; - deitransform_child = (long) cookie; + read_child = (long) cookie; local = frame->local; @@ -186,15 +216,15 @@ afr_stat_cbk (call_frame_t *frame, void *cookie, } this_try = ++local->cont.stat.last_tried; - if (this_try == deitransform_child) { + if (this_try == read_child) { goto retry; } unwind = 0; STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) deitransform_child, - children[this_try], + (void *) (long) read_child, + children[this_try], children[this_try]->fops->stat, &local->loc); } @@ -219,6 +249,10 @@ afr_stat (call_frame_t *frame, xlator_t *this, afr_local_t * local = NULL; xlator_t ** children = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + int ret = 0; + int call_child = 0; int32_t op_ret = -1; @@ -237,14 +271,36 @@ afr_stat (call_frame_t *frame, xlator_t *this, frame->local = local; - call_child = afr_deitransform (loc->inode->ino, priv->child_count); + ret = inode_ctx_get (loc->inode, this, + &ctx); + if (ret < 0) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, + "inode ctx not set!"); + goto out; + } + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (inode_ctx->read_child >= 0) { + call_child = inode_ctx->read_child; + + local->cont.stat.last_tried = -1; + + } else { + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.stat.last_tried = call_child; + } + loc_copy (&local->loc, loc); - /* - if stat fails from the deitranform'd child, we try - all children starting with the first one - */ - local->cont.stat.last_tried = -1; local->cont.stat.ino = loc->inode->ino; STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, @@ -275,19 +331,18 @@ afr_fstat_cbk (call_frame_t *frame, void *cookie, afr_local_t * local = NULL; xlator_t ** children = NULL; - int deitransform_child = -1; - int unwind = 1; int last_tried = -1; int this_try = -1; + int read_child = -1; priv = this->private; children = priv->children; - deitransform_child = (long) cookie; - local = frame->local; + read_child = (long) cookie; + if (op_ret == -1) { retry: last_tried = local->cont.fstat.last_tried; @@ -297,20 +352,15 @@ afr_fstat_cbk (call_frame_t *frame, void *cookie, } this_try = ++local->cont.fstat.last_tried; - if (this_try == deitransform_child) { - /* - skip the deitransform'd child since if we are here - we must have already tried that child - */ + if (this_try == read_child) { goto retry; } - unwind = 0; STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) deitransform_child, - children[this_try], + (void *) (long) read_child, + children[this_try], children[this_try]->fops->fstat, local->fd); } @@ -337,6 +387,10 @@ afr_fstat (call_frame_t *frame, xlator_t *this, int call_child = 0; + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + int ret = 0; + int32_t op_ret = -1; int32_t op_errno = 0; @@ -356,13 +410,35 @@ afr_fstat (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd->inode, out); - call_child = afr_deitransform (fd->inode->ino, priv->child_count); + ret = inode_ctx_get (fd->inode, this, + &ctx); + + if (ret < 0) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, + "inode ctx not set!"); + goto out; + } + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (inode_ctx->read_child >= 0) { + call_child = inode_ctx->read_child; + + local->cont.fstat.last_tried = -1; + } else { + call_child = afr_first_up_child (priv); + + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.fstat.last_tried = call_child; + } - /* - if fstat fails from the deitranform'd child, we try - all children starting with the first one - */ - local->cont.fstat.last_tried = -1; local->cont.fstat.ino = fd->inode->ino; local->fd = fd_ref (fd); @@ -396,13 +472,17 @@ afr_readlink_cbk (call_frame_t *frame, void *cookie, int unwind = 1; int last_tried = -1; int this_try = -1; + int read_child = -1; priv = this->private; children = priv->children; local = frame->local; + read_child = (long) cookie; + if (op_ret == -1) { + retry: last_tried = local->cont.readlink.last_tried; if (all_tried (last_tried, priv->child_count)) { @@ -410,10 +490,14 @@ afr_readlink_cbk (call_frame_t *frame, void *cookie, } this_try = ++local->cont.readlink.last_tried; + if (this_try == read_child) { + goto retry; + } + unwind = 0; STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) this_try, - children[this_try], + (void *) (long) read_child, + children[this_try], children[this_try]->fops->readlink, &local->loc, local->cont.readlink.size); @@ -437,6 +521,10 @@ afr_readlink (call_frame_t *frame, xlator_t *this, int call_child = 0; afr_local_t *local = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + int ret = 0; + int32_t op_ret = -1; int32_t op_errno = 0; @@ -453,15 +541,35 @@ afr_readlink (call_frame_t *frame, xlator_t *this, frame->local = local; - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_ERROR, - "no child is up :("); - goto out; - } + ret = inode_ctx_get (loc->inode, this, + &ctx); + if (ret < 0) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, + "inode ctx not set!"); + goto out; + } + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (inode_ctx->read_child >= 0) { + call_child = inode_ctx->read_child; + + local->cont.readlink.last_tried = -1; + + } else { + call_child = afr_first_up_child (priv); + + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readlink.last_tried = call_child; + } - local->cont.readlink.last_tried = call_child; loc_copy (&local->loc, loc); local->cont.readlink.size = size; @@ -495,13 +603,17 @@ afr_getxattr_cbk (call_frame_t *frame, void *cookie, int unwind = 1; int last_tried = -1; int this_try = -1; + int read_child = -1; priv = this->private; children = priv->children; local = frame->local; + read_child = (long) cookie; + if (op_ret == -1) { + retry: last_tried = local->cont.getxattr.last_tried; if (all_tried (last_tried, priv->child_count)) { @@ -509,10 +621,14 @@ afr_getxattr_cbk (call_frame_t *frame, void *cookie, } this_try = ++local->cont.getxattr.last_tried; + if (this_try == read_child) { + goto retry; + } + unwind = 0; STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) this_try, - children[this_try], + (void *) (long) read_child, + children[this_try], children[this_try]->fops->getxattr, &local->loc, local->cont.getxattr.name); @@ -536,6 +652,10 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, int call_child = 0; afr_local_t * local = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + int ret = 0; + int32_t op_ret = -1; int32_t op_errno = 0; @@ -551,15 +671,34 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, ALLOC_OR_GOTO (local, afr_local_t, out); frame->local = local; - call_child = afr_first_up_child (priv); - if (call_child == -1) { - op_errno = ENOTCONN; - gf_log (this->name, GF_LOG_ERROR, - "no child is up :("); - goto out; - } + ret = inode_ctx_get (loc->inode, this, &ctx); + + if (ret < 0) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, + "inode ctx not set!"); + goto out; + } + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (inode_ctx->read_child >= 0) { + call_child = inode_ctx->read_child; + + local->cont.getxattr.last_tried = -1; + } else { + call_child = afr_first_up_child (priv); + + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getxattr.last_tried = call_child; + } - local->cont.getxattr.last_tried = call_child; loc_copy (&local->loc, loc); if (name) local->cont.getxattr.name = strdup (name); @@ -584,7 +723,7 @@ out: /** * read algorithm: - * + * * if the user has specified a read subvolume, use it * otherwise - * use the inode number to hash it to one of the subvolumes, and @@ -593,7 +732,7 @@ out: * if any of the above read's fail, try the children in sequence * beginning at the beginning */ - + int32_t afr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -605,7 +744,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie, int unwind = 1; int last_tried = -1; - int this_try = -1; + int this_try = -1; + int read_child = -1; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -618,6 +758,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie, local = frame->local; + read_child = (long) cookie; + if (op_ret == -1) { retry: last_tried = local->cont.readv.last_tried; @@ -627,8 +769,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie, } this_try = ++local->cont.readv.last_tried; - if (this_try == priv->read_child) { - /* + if (this_try == read_child) { + /* skip the read child since if we are here we must have already tried that child */ @@ -638,8 +780,8 @@ afr_readv_cbk (call_frame_t *frame, void *cookie, unwind = 0; STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) this_try, - children[this_try], + (void *) (long) read_child, + children[this_try], children[this_try]->fops->readv, local->fd, local->cont.readv.size, local->cont.readv.offset); @@ -662,6 +804,10 @@ afr_readv (call_frame_t *frame, xlator_t *this, afr_local_t * local = NULL; xlator_t ** children = NULL; + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + int ret = 0; + int call_child = 0; int32_t op_ret = -1; @@ -679,15 +825,28 @@ afr_readv (call_frame_t *frame, xlator_t *this, frame->local = local; - if (priv->read_child != -1) { - call_child = priv->read_child; + ret = inode_ctx_get (fd->inode, this, + &ctx); - /* + if (ret < 0) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, + "inode ctx not set!"); + goto out; + } + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (inode_ctx->read_child >= 0) { + call_child = inode_ctx->read_child; + + /* if read fails from the read child, we try all children starting with the first one */ - local->cont.readv.last_tried = -1; - } else { + local->cont.readv.last_tried = -1; + + } else { call_child = afr_first_up_child (priv); if (call_child == -1) { op_errno = ENOTCONN; diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 4ae302deb7c..e3526087ae7 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -345,10 +345,13 @@ afr_lookup_cbk (call_frame_t *frame, void *cookie, struct stat * lookup_buf = NULL; int call_count = -1; int child_index = -1; - int prev_child_index = -1; + uint32_t open_fd_count = 0; int ret = 0; + afr_inode_ctx_t * inode_ctx = NULL; + uint64_t ctx; + child_index = (long) cookie; priv = this->private; @@ -409,7 +412,52 @@ afr_lookup_cbk (call_frame_t *frame, void *cookie, lookup_buf->st_ino = afr_itransform (buf->st_ino, priv->child_count, child_index); + + ret = inode_ctx_get (local->cont.lookup.inode, this, + &ctx); + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = child_index; + } + } else { + if ((local->op_ret == 0) + && (child_index == local->read_child_index)) { + + /* + lookup has succeeded on the read child. + So use its inode number + */ + + local->op_ret = op_ret; + + if (local->cont.lookup.xattr) + dict_unref (local->cont.lookup.xattr); + + local->cont.lookup.inode = inode; + local->cont.lookup.xattr = dict_ref (xattr); + + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + + ret = inode_ctx_get (local->cont.lookup.inode, this, + &ctx); + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (priv->read_child >= 0) { + inode_ctx->read_child = priv->read_child; + } else { + inode_ctx->read_child = local->read_child_index; + } + } + if (FILETYPE_DIFFERS (buf, lookup_buf)) { /* mismatching filetypes with same name -- Govinda !! GOvinda !!! @@ -431,15 +479,6 @@ afr_lookup_cbk (call_frame_t *frame, void *cookie, && S_ISREG (buf->st_mode)) { local->need_data_self_heal = 1; } - - prev_child_index = afr_deitransform_orig (lookup_buf->st_ino, - priv->child_count); - if (child_index < prev_child_index) { - *lookup_buf = *buf; - lookup_buf->st_ino = afr_itransform (buf->st_ino, - priv->child_count, - child_index); - } } local->success_count++; @@ -465,9 +504,13 @@ unlock: } if (local->success_count) { - /* check for govinda_gOvinda case in previous lookup */ - if (!inode_ctx_get (local->cont.lookup.inode, - this, NULL)) + /* check for split-brain case in previous lookup */ + ret = inode_ctx_get (local->cont.lookup.inode, this, + &ctx); + + inode_ctx = (afr_inode_ctx_t *)(long) ctx; + + if (inode_ctx->split_brain) local->need_data_self_heal = 1; } @@ -544,7 +587,12 @@ afr_lookup (call_frame_t *frame, xlator_t *this, } } - local->reval_child_index = 0; + LOCK (&priv->read_child_lock); + { + local->read_child_index = (++priv->read_child_rr) + % (priv->child_count); + } + UNLOCK (&priv->read_child_lock); local->call_count = priv->child_count; @@ -2187,17 +2235,12 @@ init (xlator_t *this) trav = trav->next; } - /* XXX: return inode numbers from 1st subvolume till - afr supports read-subvolume based on inode's ctx - (and not itransform) for this reason afr_deitransform() - returns 0 always - */ - priv->read_child = 0; - priv->wait_count = 1; priv->child_count = child_count; + LOCK_INIT (&priv->lock); + LOCK_INIT (&priv->read_child_lock); priv->child_up = CALLOC (sizeof (unsigned char), child_count); if (!priv->child_up) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 5db6e98092a..a447b74f47b 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -35,6 +35,9 @@ typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ + unsigned int read_child_rr; /* round-robin index of the read_child */ + gf_lock_t read_child_lock; /* lock to protect above */ + xlator_t **children; unsigned char *child_up; @@ -48,7 +51,7 @@ typedef struct _afr_private { gf_boolean_t metadata_change_log; /* on/off */ gf_boolean_t entry_change_log; /* on/off */ - unsigned int read_child; /* read-subvolume */ + int read_child; /* read-subvolume */ unsigned int favorite_child; /* subvolume to be preferred in resolving split-brain cases */ @@ -110,7 +113,8 @@ typedef struct _afr_local { unsigned int need_data_self_heal; unsigned int govinda_gOvinda; - unsigned int reval_child_index; + unsigned int read_child_index; + int32_t op_ret; int32_t op_errno; -- cgit