diff options
20 files changed, 1582 insertions, 1727 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 94335bd0298..19c5a83d2da 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -706,7 +706,7 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)          if (sh->locked_nodes)                  GF_FREE (sh->locked_nodes); -        if (sh->healing_fd && !sh->healing_fd_opened) { +        if (sh->healing_fd) {                  fd_unref (sh->healing_fd);                  sh->healing_fd = NULL;          } @@ -724,6 +724,14 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)                  GF_FREE (sh->fresh_parent_dirs);          loc_wipe (&sh->parent_loc); + +        if (sh->checksum) +                GF_FREE (sh->checksum); + +        if (sh->write_needed) +                GF_FREE (sh->write_needed); +        if (sh->healing_fd) +                fd_unref (sh->healing_fd);  } @@ -795,6 +803,9 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)          if (local->fresh_children)                  GF_FREE (local->fresh_children); +        if (local->fd_open_on) +                GF_FREE (local->fd_open_on); +          { /* lookup */                  if (local->cont.lookup.xattrs) {                          afr_reset_xattr (local->cont.lookup.xattrs, @@ -897,23 +908,34 @@ afr_frame_return (call_frame_t *frame)          return call_count;  } - -/** - * up_children_count - return the number of children that are up - */ -  int -afr_up_children_count (int child_count, unsigned char *child_up) +afr_set_elem_count_get (unsigned char *elems, int child_count)  {          int i   = 0;          int ret = 0;          for (i = 0; i < child_count; i++) -                if (child_up[i]) +                if (elems[i])                          ret++;          return ret;  } +/** + * up_children_count - return the number of children that are up + */ + +unsigned int +afr_up_children_count (unsigned char *child_up, unsigned int child_count) +{ +        return afr_set_elem_count_get (child_up, child_count); +} + +unsigned int +afr_locked_children_count (unsigned char *children, unsigned int child_count) +{ +        return afr_set_elem_count_get (children, child_count); +} +  gf_boolean_t  afr_is_fresh_lookup (loc_t *loc, xlator_t *this)  { @@ -1172,7 +1194,7 @@ afr_is_transaction_running (afr_local_t *local)          return ((local->inodelk_count > 0) || (local->entrylk_count > 0));  } -static void +void  afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,                        gf_boolean_t is_background, ia_type_t ia_type,                        void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, @@ -1186,6 +1208,7 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,          GF_ASSERT (frame);          GF_ASSERT (this);          GF_ASSERT (inode); +        GF_ASSERT (ia_type != IA_INVAL);          local = frame->local;          local->self_heal.background = is_background; @@ -1444,7 +1467,7 @@ static void  afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this,                                          gf_boolean_t *sh_launched)  { -        size_t              up_count = 0; +        unsigned int         up_count = 0;          afr_private_t       *priv    = NULL;          afr_local_t         *local   = NULL; @@ -1453,7 +1476,7 @@ afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this,          priv         = this->private;          local        = frame->local; -        up_count  = afr_up_children_count (priv->child_count, local->child_up); +        up_count  = afr_up_children_count (local->child_up, priv->child_count);          if (up_count == 1) {                  gf_log (this->name, GF_LOG_DEBUG,                          "Only 1 child up - do not attempt to detect self heal"); @@ -1591,8 +1614,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)          if (local->op_ret < 0)                  goto unwind;          gfid_miss_count = afr_lookup_gfid_missing_count (local, this); -        up_children_count = afr_up_children_count (priv->child_count, -                                                   local->child_up); +        up_children_count = afr_up_children_count (local->child_up, +                                                   priv->child_count);          enotconn_count = priv->child_count - up_children_count;          if ((gfid_miss_count == local->success_count) &&              (enotconn_count > 0)) { @@ -1871,7 +1894,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this,          if (loc->parent)                  local->cont.lookup.parent_ino = loc->parent->ino; -        local->child_up = memdup (priv->child_up, priv->child_count); +        local->child_up = memdup (priv->child_up, +                                  sizeof (*local->child_up) * priv->child_count);          if (NULL == local->child_up) {                  op_errno = ENOMEM;                  goto out; @@ -1883,8 +1907,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this,                  goto out;          } -        local->call_count = afr_up_children_count (priv->child_count, -                                                   local->child_up); +        local->call_count = afr_up_children_count (local->child_up, +                                                   priv->child_count);          call_count = local->call_count;          if (local->call_count == 0) { @@ -1994,7 +2018,7 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)                  fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),                                                 priv->child_count, -                                               gf_afr_mt_char); +                                               gf_afr_mt_int32_t);                  if (!fd_ctx->opened_on) {                          ret = -ENOMEM;                          goto unlock; @@ -2011,12 +2035,13 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)                          goto unlock;                  } +                INIT_LIST_HEAD (&fd_ctx->paused_calls); +                INIT_LIST_HEAD (&fd_ctx->entries); +                  ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);                  if (ret)                          gf_log (this->name, GF_LOG_DEBUG,                                  "failed to set fd ctx (%p)", fd); - -                INIT_LIST_HEAD (&fd_ctx->entries);          }  unlock:          UNLOCK (&fd->lock); @@ -2108,7 +2133,7 @@ afr_flush_wind (call_frame_t *frame, xlator_t *this)          local = frame->local;          priv = this->private; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_up_children_count (local->child_up, priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -2174,7 +2199,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)                  goto out;          } -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_up_children_count (local->child_up, priv->child_count);          transaction_frame = copy_frame (frame);          if (!transaction_frame) { @@ -2196,6 +2221,12 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)          local->transaction.start  = 0;          local->transaction.len    = 0; +        ret = afr_open_fd_fix (transaction_frame, this, _gf_false); +        if (ret) { +                op_ret = -1; +                op_errno = -ret; +                goto out; +        }          afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); @@ -3474,8 +3505,8 @@ AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)  {          local->op_ret = -1;          local->op_errno = EUCLEAN; -        local->call_count = afr_up_children_count (priv->child_count, -                                                   priv->child_up); +        local->call_count = afr_up_children_count (priv->child_up, +                                                   priv->child_count);          if (local->call_count == 0) {                  gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up");                  return -ENOTCONN; @@ -3531,19 +3562,22 @@ out:  }  int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +afr_transaction_local_init (afr_local_t *local, xlator_t *this)  { -        int i; -        int child_up_count = 0; -        int ret = -ENOMEM; +        int            i = 0; +        int            child_up_count = 0; +        int            ret = -ENOMEM; +        afr_private_t *priv = NULL; +        priv = this->private;          ret = afr_internal_lock_init (&local->internal_lock, priv->child_count,                                        AFR_TRANSACTION_LK);          if (ret < 0)                  goto out;          ret = -ENOMEM; -        child_up_count = afr_up_children_count (priv->child_count, local->child_up); +        child_up_count = afr_up_children_count (local->child_up, +                                                priv->child_count);          if (priv->optimistic_change_log && child_up_count == priv->child_count)                  local->optimistic_change_log = 1; @@ -3567,6 +3601,14 @@ afr_transaction_local_init (afr_local_t *local, afr_private_t *priv)          if (!local->fresh_children)                  goto out; +        if (local->fd) { +                local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on), +                                               priv->child_count, +                                               gf_afr_mt_int32_t); +                if (!local->fd_open_on) +                        goto out; +        } +          for (i = 0; i < priv->child_count; i++) {                  local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]),                                                 3, /* data + metadata + entry */ diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 645da2a6c57..ec3639ff73b 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -164,9 +164,6 @@ out:                          sh->need_entry_self_heal  = _gf_true;                          sh->forced_merge          = _gf_true; -                        sh->type                  = local->fd->inode->ia_type; -                        sh->background            = _gf_false; -                        sh->unwind                = afr_examine_dir_sh_unwind;                          afr_self_heal_type_str_get(&local->self_heal,                                                     sh_type_str, @@ -177,7 +174,9 @@ out:                                  " forced merge option set",                                  sh_type_str, local->loc.path); -                        afr_self_heal (frame, this, local->fd->inode); +                        afr_launch_self_heal (frame, this, local->fd->inode, +                                              _gf_false, local->fd->inode->ia_type, +                                              NULL, afr_examine_dir_sh_unwind);                  } else {                          afr_set_opendir_done (this, local->fd->inode); @@ -205,7 +204,7 @@ afr_examine_dir (call_frame_t *frame, xlator_t *this)                                                    sizeof (*local->cont.opendir.checksum),                                                    gf_afr_mt_int32_t); -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_up_children_count (local->child_up, priv->child_count);          local->call_count = call_count; @@ -240,8 +239,8 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,          priv  = this->private;          local = frame->local; -        up_children_count = afr_up_children_count (priv->child_count, -                                                   local->child_up); +        up_children_count = afr_up_children_count (local->child_up, +                                                   priv->child_count);          LOCK (&frame->lock);          { diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 21287f8b8c2..1c25d160689 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -166,7 +166,7 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          fd_ctx = (afr_fd_ctx_t *)(long) ctx; -                        fd_ctx->opened_on[child_index] = 1; +                        fd_ctx->opened_on[child_index] = AFR_FD_OPENED;                          fd_ctx->flags                  = local->cont.create.flags;                          if (local->success_count == 0) @@ -212,13 +212,16 @@ afr_create_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -228,7 +231,7 @@ afr_create_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_create_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -442,13 +445,16 @@ afr_mknod_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv  = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -458,7 +464,7 @@ afr_mknod_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->mknod, @@ -667,13 +673,16 @@ afr_mkdir_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv  = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -683,7 +692,7 @@ afr_mkdir_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -894,13 +903,16 @@ afr_link_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv  = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -910,7 +922,7 @@ afr_link_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->link, @@ -1117,13 +1129,16 @@ afr_symlink_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -1133,7 +1148,7 @@ afr_symlink_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -1338,13 +1353,16 @@ afr_rename_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -1354,7 +1372,7 @@ afr_rename_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -1543,13 +1561,16 @@ afr_unlink_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv  = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -1559,7 +1580,7 @@ afr_unlink_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -1741,13 +1762,16 @@ afr_rmdir_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv  = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -1757,7 +1781,7 @@ afr_rmdir_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->entry_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index f8157482758..1258afe09fc 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -44,9 +44,6 @@  #include "compat-errno.h"  #include "compat.h" -#include "afr.h" - -  /**   * Common algorithm for inode read calls:   * @@ -399,6 +396,12 @@ afr_fstat (call_frame_t *frame, xlator_t *this,          local->cont.fstat.ino = fd->inode->ino;          local->fd = fd_ref (fd); +        op_ret = afr_open_fd_fix (frame, this, _gf_false); +        if (op_ret) { +                op_errno = -op_ret; +                op_ret = -1; +                goto out; +        }          STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,                             children[call_child],                             children[call_child]->fops->fstat, @@ -1036,6 +1039,12 @@ afr_readv (call_frame_t *frame, xlator_t *this,          local->cont.readv.size       = size;          local->cont.readv.offset     = offset; +        op_ret = afr_open_fd_fix (frame, this, _gf_false); +        if (op_ret) { +                op_errno = -op_ret; +                op_ret = -1; +                goto out; +        }          STACK_WIND_COOKIE (frame, afr_readv_cbk,                             (void *) (long) call_child,                             children[call_child], diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index c292b7493f6..faf3db40041 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -46,6 +46,7 @@  #include "afr.h"  #include "afr-transaction.h" +#include "afr-self-heal-common.h"  /* {{{ writev */ @@ -125,19 +126,21 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          return 0;  } -  int  afr_writev_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int i = 0;          int call_count = -1;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -147,7 +150,7 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->inode_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -188,10 +191,10 @@ afr_writev_done (call_frame_t *frame, xlator_t *this)  int  afr_do_writev (call_frame_t *frame, xlator_t *this)  { -        call_frame_t * transaction_frame = NULL; -        afr_local_t *  local             = NULL; -        int op_ret   = -1; -        int op_errno = 0; +        call_frame_t    *transaction_frame = NULL; +        afr_local_t     *local             = NULL; +        int             op_ret   = -1; +        int             op_errno = 0;          local = frame->local; @@ -235,6 +238,202 @@ out:          return 0;  } +static int +afr_prepare_loc (call_frame_t *frame, fd_t *fd) +{ +        afr_local_t    *local = NULL; +        char           *name = NULL; +        char           *path = NULL; +        int             ret = 0; + +        if ((!fd) || (!fd->inode)) +                return -1; + +        local = frame->local; +        ret = inode_path (fd->inode, NULL, (char **)&path); +        if (ret <= 0) { +                gf_log (frame->this->name, GF_LOG_DEBUG, +                        "Unable to get path for gfid: %s", +                        uuid_utoa (fd->inode->gfid)); +                return -1; +        } + +        if (local->loc.path) { +                if (strcmp (path, local->loc.path)) +                        gf_log (frame->this->name, GF_LOG_DEBUG, +                                "overwriting old loc->path %s with %s", +                                local->loc.path, path); +                GF_FREE ((char *)local->loc.path); +        } +        local->loc.path = path; + +        name = strrchr (local->loc.path, '/'); +        if (name) +                name++; +        local->loc.name = name; + +        if (local->loc.inode) { +                inode_unref (local->loc.inode); +        } +        local->loc.inode = inode_ref (fd->inode); + +        if (local->loc.parent) { +                inode_unref (local->loc.parent); +        } + +        local->loc.parent = inode_parent (local->loc.inode, 0, NULL); + +        return 0; +} + +afr_fd_paused_call_t* +afr_paused_call_create (call_frame_t *frame) +{ +        afr_local_t             *local = NULL; +        afr_fd_paused_call_t   *paused_call = NULL; + +        local = frame->local; +        GF_ASSERT (local->fop_call_continue); + +        paused_call = GF_CALLOC (1, sizeof (*paused_call), +                                  gf_afr_fd_paused_call_t); +        if (paused_call) { +                INIT_LIST_HEAD (&paused_call->call_list); +                paused_call->frame = frame; +        } + +        return paused_call; +} + +static int +afr_pause_fd_fop (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx) +{ +        afr_fd_paused_call_t *paused_call = NULL; +        int                    ret = 0; + +        paused_call = afr_paused_call_create (frame); +        if (paused_call) +                list_add (&paused_call->call_list, &fd_ctx->paused_calls); +        else +                ret = -ENOMEM; + +        return ret; +} + +static void +afr_trigger_open_fd_self_heal (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t             *local = NULL; +        afr_self_heal_t         *sh = NULL; +        char                    sh_type_str[256] = {0}; + +        local = frame->local; +        sh    = &local->self_heal; + +        sh->need_missing_entry_self_heal = _gf_true; +        sh->need_gfid_self_heal = _gf_true; +        sh->need_data_self_heal = _gf_true; +        afr_self_heal_type_str_get(&local->self_heal, sh_type_str, +                                   sizeof(sh_type_str)); +        gf_log (this->name, GF_LOG_INFO, "%s self-heal triggered. " +                "path: %s, reason: Replicate up down flush, data lock " +                "is held", sh_type_str, local->loc.path); + +        afr_launch_self_heal (frame, this, local->fd->inode, _gf_true, +                              local->fd->inode->ia_type, NULL, NULL); +} + +int +afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) +{ +        int                     ret = 0; +        int                     i   = 0; +        afr_fd_ctx_t            *fd_ctx = NULL; +        gf_boolean_t            need_self_heal = _gf_false; +        int                     *need_open = NULL; +        int                     need_open_count = 0; +        afr_local_t             *local = NULL; +        afr_private_t           *priv = NULL; +        gf_boolean_t            fop_continue = _gf_true; +        gf_boolean_t            queue_fop = _gf_false; + +        local = frame->local; +        priv  = this->private; + +        GF_ASSERT (local->fd); +        if (pause_fop) +                GF_ASSERT (local->fop_call_continue); + +        ret = afr_prepare_loc (frame, local->fd); +        if (ret < 0) { +                //File does not exist we cant open it. +                ret = 0; +                goto out; +        } + +        fd_ctx = afr_fd_ctx_get (local->fd, this); +        if (!fd_ctx) { +                ret = -EINVAL; +                goto unlock; +        } + +        LOCK (&local->fd->lock); +        { +                if (fd_ctx->up_count < priv->up_count) { +                        need_self_heal = _gf_true; +                        fd_ctx->up_count   = priv->up_count; +                        fd_ctx->down_count = priv->down_count; +                } +                for (i = 0; i < priv->child_count; i++) { +                        if ((fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED) && +                            local->child_up[i]) { +                                fd_ctx->opened_on[i] = AFR_FD_OPENING; +                                if (!need_open) +                                        need_open = GF_CALLOC (priv->child_count, +                                                               sizeof (*need_open), +                                                               gf_afr_mt_int32_t); +                                need_open[i] = 1; +                                need_open_count++; +                        } else if (pause_fop && local->child_up[i] && +                                   (fd_ctx->opened_on[i] == AFR_FD_OPENING)) { +                                queue_fop = _gf_true; +                        } +                } + +                if (queue_fop) { +                        GF_ASSERT (pause_fop); +                        gf_log (this->name, GF_LOG_INFO, "Pause fd %p", +                                local->fd); +                        ret = afr_pause_fd_fop (frame, this, fd_ctx); +                        if (ret) +                                goto unlock; +                } +        } +unlock: +        UNLOCK (&local->fd->lock); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "Failed to fix fd for %s", +                        local->loc.path); +                fop_continue = _gf_false; +                goto out; +        } + +        if (need_self_heal) +                afr_trigger_open_fd_self_heal (frame, this); + +        if (!need_open_count) +                goto out; + +        gf_log (this->name, GF_LOG_INFO, "Opening fd %p", local->fd); +        afr_fix_open (frame, this, fd_ctx, need_open_count, need_open); +        fop_continue = _gf_false; +out: +        if (need_open) +                GF_FREE (need_open); +        if (fop_continue && local->fop_call_continue) +                local->fop_call_continue (frame, this); +        return ret; +}  int  afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, @@ -246,8 +445,6 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          int ret = -1;          int op_ret   = -1;          int op_errno = 0; -        uint64_t ctx = 0; -        afr_fd_ctx_t *fd_ctx = NULL;          VALIDATE_OR_GOTO (frame, out);          VALIDATE_OR_GOTO (this, out); @@ -272,21 +469,14 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          local->cont.writev.iobref     = iobref_ref (iobref);          local->fd                = fd_ref (fd); +        local->fop_call_continue = afr_do_writev; -        ret = fd_ctx_get (fd, this, &ctx); -        if (ret < 0) { +        ret = afr_open_fd_fix (frame, this, _gf_true); +        if (ret) { +                op_errno = -ret;                  goto out;          } -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -        if (fd_ctx->up_count < priv->up_count) { -                local->openfd_flush_cbk = afr_do_writev; -                afr_openfd_flush (frame, this, fd); -        } else { -                afr_do_writev (frame, this); -        } -          op_ret = 0;  out:          if (op_ret == -1) { @@ -395,13 +585,16 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -411,7 +604,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->inode_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -602,13 +795,16 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -618,7 +814,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->inode_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -702,8 +898,6 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,          int ret = -1;          int op_ret   = -1;          int op_errno = 0; -        uint64_t ctx = 0; -        afr_fd_ctx_t *fd_ctx = NULL;          VALIDATE_OR_GOTO (frame, out);          VALIDATE_OR_GOTO (this, out); @@ -725,21 +919,14 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,          local->cont.ftruncate.ino     = fd->inode->ino;          local->fd = fd_ref (fd); +        local->fop_call_continue = afr_do_ftruncate; -        ret = fd_ctx_get (fd, this, &ctx); -        if (ret < 0) { +        ret = afr_open_fd_fix (frame, this, _gf_true); +        if (ret) { +                op_errno = -ret;                  goto out;          } -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -        if (fd_ctx->up_count < priv->up_count) { -                local->openfd_flush_cbk = afr_do_ftruncate; -                afr_openfd_flush (frame, this, fd); -        } else { -                afr_do_ftruncate (frame, this); -        } -          op_ret = 0;  out:          if (op_ret == -1) { @@ -849,13 +1036,16 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -865,7 +1055,7 @@ afr_setattr_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->inode_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -1056,13 +1246,16 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -1072,7 +1265,7 @@ afr_fsetattr_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->inode_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -1104,7 +1297,6 @@ afr_fsetattr_done (call_frame_t *frame, xlator_t *this)          return 0;  } -  int  afr_fsetattr (call_frame_t *frame, xlator_t *this,                fd_t *fd, struct iatt *buf, int32_t valid) @@ -1124,10 +1316,12 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,          transaction_frame = copy_frame (frame);          if (!transaction_frame) { +                op_errno = ENOMEM;                  goto out;          }          ALLOC_OR_GOTO (local, afr_local_t, out); +        transaction_frame->local = local;          ret = AFR_LOCAL_INIT (local, priv);          if (ret < 0) { @@ -1135,12 +1329,9 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,                  goto out;          } -        transaction_frame->local = local; -          local->op_ret = -1;          local->cont.fsetattr.ino     = fd->inode->ino; -          local->cont.fsetattr.in_buf = *buf;          local->cont.fsetattr.valid  = valid; @@ -1150,6 +1341,13 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,          local->fd                 = fd_ref (fd); +        op_ret = afr_open_fd_fix (frame, this, _gf_false); +        if (ret) { +                op_errno = -op_ret; +                op_ret = -1; +                goto out; +        } +          local->transaction.main_frame = frame;          local->transaction.start   = LLONG_MAX - 1;          local->transaction.len     = 0; @@ -1242,13 +1440,16 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -1258,7 +1459,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->inode_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], @@ -1424,13 +1625,16 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL;          afr_private_t *priv = NULL; +        afr_internal_lock_t *int_lock = NULL;          int call_count = -1;          int i = 0;          local = frame->local;          priv = this->private; +        int_lock = &local->internal_lock; -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                priv->child_count);          if (call_count == 0) {                  local->transaction.resume (frame, this); @@ -1440,7 +1644,7 @@ afr_removexattr_wind (call_frame_t *frame, xlator_t *this)          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { +                if (local->child_up[i] && int_lock->inode_locked_nodes[i]) {                          STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,                                             (void *) (long) i,                                             priv->children[i], diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 17651add96d..2168ee26f69 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -88,8 +88,7 @@ is_afr_lock_selfheal (afr_local_t *local)  }  int32_t -internal_lock_count (call_frame_t *frame, xlator_t *this, -                     afr_fd_ctx_t *fd_ctx) +internal_lock_count (call_frame_t *frame, xlator_t *this)  {          afr_local_t   *local = NULL;          afr_private_t *priv  = NULL; @@ -99,10 +98,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this,          local = frame->local;          priv  = this->private; -        if (fd_ctx) { -                GF_ASSERT (local->fd); +        if (local->fd) {                  for (i = 0; i < priv->child_count; i++) { -                        if (local->child_up[i] && fd_ctx->opened_on[i]) +                        if (local->child_up[i] && local->fd_open_on[i])                                  ++call_count;                  }          } else { @@ -552,20 +550,24 @@ static int32_t  afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno)  { -        afr_local_t *local = NULL; +        afr_local_t         *local = NULL; +        afr_internal_lock_t *int_lock = NULL; +        int32_t             child_index = (long)cookie;          local = frame->local; +        int_lock = &local->internal_lock;          afr_trace_inodelk_out (frame, AFR_INODELK_TRANSACTION,                                 AFR_UNLOCK_OP, NULL, op_ret, -                               op_errno, (long) cookie); +                               op_errno, child_index);          if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {                  gf_log (this->name, GF_LOG_ERROR, -                        "%s: unlock failed %s", -                        local->loc.path, strerror (op_errno)); +                        "%s: unlock failed on %d, reason: %s", +                        local->loc.path, child_index, strerror (op_errno));          } +        int_lock->inode_locked_nodes[child_index] &= LOCKED_NO;          afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno);          return 0; @@ -590,6 +592,9 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)          flock.l_len   = int_lock->lk_flock.l_len;          flock.l_type  = F_UNLCK; +        gf_log (this->name, GF_LOG_DEBUG, "attempting data unlock range %"PRIu64 +                " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, +                frame->root->lk_owner);          call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes,                                               priv->child_count); @@ -646,9 +651,22 @@ static int32_t  afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          int32_t op_ret, int32_t op_errno)  { +        afr_local_t         *local = NULL; +        afr_internal_lock_t *int_lock = NULL; +        int32_t             child_index = (long)cookie; + +        local = frame->local; +        int_lock = &local->internal_lock; +          afr_trace_entrylk_out (frame, AFR_ENTRYLK_TRANSACTION,                                 AFR_UNLOCK_OP, NULL, op_ret, -                               op_errno, (long) cookie); +                               op_errno, child_index); + +        if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { +                gf_log (this->name, GF_LOG_ERROR, +                        "%s: unlock failed on %d, reason: %s", +                        local->loc.path, child_index, strerror (op_errno)); +        }          afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno); @@ -747,8 +765,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                  afr_unlock (frame, this);          } else {                  if (op_ret == 0) { -                        int_lock->locked_nodes[child_index] -                                |= LOCKED_YES; +                        int_lock->locked_nodes[child_index] |= LOCKED_YES;                          int_lock->lock_count++;                  }                  afr_lock_blocking (frame, this, child_index + 1); @@ -940,8 +957,8 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)                     or don't have the fd open */                  while ((child_index < priv->child_count) -                       && (!local->child_up[child_index] -                           || !fd_ctx->opened_on[child_index])) +                       && (!local->child_up[child_index] || +                          !local->fd_open_on[child_index]))                          child_index++;          } else { @@ -969,9 +986,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)          }          if ((child_index == priv->child_count) -            || (int_lock->lock_count == -                afr_up_children_count (priv->child_count, -                                       local->child_up))) { +            || (int_lock->lock_count == int_lock->lk_expected_count)) {                  /* we're done locking */ @@ -1081,8 +1096,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)          }          return 0; - -  }  int32_t @@ -1091,6 +1104,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this)          afr_internal_lock_t *int_lock = NULL;          afr_local_t         *local    = NULL;          afr_private_t       *priv     = NULL; +        int                  up_count = 0;          priv     = this->private;          local    = frame->local; @@ -1103,6 +1117,10 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this)                  break;          case AFR_ENTRY_RENAME_TRANSACTION: +                up_count = afr_up_children_count (local->child_up, +                                                  priv->child_count); +                int_lock->lk_expected_count = 2 * up_count; +                //fallthrough          case AFR_ENTRY_TRANSACTION:                  initialize_entrylk_variables (frame, this);                  break; @@ -1151,8 +1169,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          local->op_errno              = op_errno;                  }          } else if (op_ret == 0) { -                int_lock->entry_locked_nodes[child_index] -                        |= LOCKED_YES; +                int_lock->entry_locked_nodes[child_index] |= LOCKED_YES;                  int_lock->entrylk_lock_count++;          } @@ -1161,7 +1178,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          "Last locking reply received");                  /* all locks successfull. Proceed to call FOP */                  if (int_lock->entrylk_lock_count == -                    afr_up_children_count (priv->child_count, local->child_up)) { +                                int_lock->lk_expected_count) {                          gf_log (this->name, GF_LOG_TRACE,                                  "All servers locked. Calling the cbk");                          int_lock->lock_op_ret = 0; @@ -1181,6 +1198,20 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          return 0;  } +void +afr_mark_fd_open_on (afr_local_t *local, afr_fd_ctx_t *fd_ctx, +                       size_t child_count) +{ +        int             i = 0; + +        GF_ASSERT (local->fd_open_on); + +        memset (local->fd_open_on, 0, sizeof (*local->fd_open_on)*child_count); +        for (i = 0; i < child_count; i++) +                if (fd_ctx->opened_on[i] == AFR_FD_OPENED) +                        local->fd_open_on[i] = 1; +} +  int  afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)  { @@ -1192,8 +1223,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)          loc_t               *loc      = NULL;          int32_t call_count = 0;          int i = 0; -        uint64_t  ctx = 0; -        int ret = 0;          local    = frame->local;          int_lock = &local->internal_lock; @@ -1206,9 +1235,8 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)                  loc = int_lock->lk_loc;          if (local->fd) { -                ret = fd_ctx_get (local->fd, this, &ctx); - -                if (ret < 0) { +                fd_ctx = afr_fd_ctx_get (local->fd, this); +                if (!fd_ctx) {                          gf_log (this->name, GF_LOG_INFO,                                  "unable to get fd ctx for fd=%p",                                  local->fd); @@ -1221,10 +1249,10 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)                          return -1;                  } -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                call_count = internal_lock_count (frame, this, fd_ctx); +                afr_mark_fd_open_on (local, fd_ctx, priv->child_count); +                call_count = internal_lock_count (frame, this);                  int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count;                  if (!call_count) {                          gf_log (this->name, GF_LOG_INFO, @@ -1236,7 +1264,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)                  /* Send non-blocking entrylk calls only on up children                     and where the fd has been opened */                  for (i = 0; i < priv->child_count; i++) { -                        if (local->child_up[i] && fd_ctx->opened_on[i]) { +                        if (local->child_up[i] && local->fd_open_on[i]) {                                  afr_trace_entrylk_in (frame, AFR_ENTRYLK_NB_TRANSACTION,                                                        AFR_LOCK_OP, basename, i); @@ -1252,8 +1280,9 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)          } else {                  GF_ASSERT (loc); -                call_count = internal_lock_count (frame, this, NULL); +                call_count = internal_lock_count (frame, this);                  int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count;                  for (i = 0; i < priv->child_count; i++) {                          if (local->child_up[i]) { @@ -1314,8 +1343,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          local->op_errno              = op_errno;                  }          } else if (op_ret == 0) { -                int_lock->inode_locked_nodes[child_index] -                        |= LOCKED_YES; +                int_lock->inode_locked_nodes[child_index] |= LOCKED_YES;                  int_lock->inodelk_lock_count++;          } @@ -1324,7 +1352,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          "Last inode locking reply received");                  /* all locks successfull. Proceed to call FOP */                  if (int_lock->inodelk_lock_count == -                    afr_up_children_count (priv->child_count, local->child_up)) { +                                int_lock->lk_expected_count) {                          gf_log (this->name, GF_LOG_TRACE,                                  "All servers locked. Calling the cbk");                          int_lock->lock_op_ret = 0; @@ -1352,7 +1380,6 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)          afr_private_t       *priv     = NULL;          afr_fd_ctx_t        *fd_ctx   = NULL;          int32_t  call_count = 0; -        uint64_t ctx        = 0;          int      i          = 0;          int      ret        = 0;          struct gf_flock flock = {0,}; @@ -1365,12 +1392,14 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)          flock.l_len   = int_lock->lk_flock.l_len;          flock.l_type  = int_lock->lk_flock.l_type; +        gf_log (this->name, GF_LOG_DEBUG, "attempting data lock range %"PRIu64 +                " %"PRIu64" by %"PRIu64, flock.l_start, flock.l_len, +                frame->root->lk_owner);          initialize_inodelk_variables (frame, this);          if (local->fd) { -                ret = fd_ctx_get (local->fd, this, &ctx); - -                if (ret < 0) { +                fd_ctx = afr_fd_ctx_get (local->fd, this); +                if (!fd_ctx) {                          gf_log (this->name, GF_LOG_INFO,                                  "unable to get fd ctx for fd=%p",                                  local->fd); @@ -1384,10 +1413,10 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)                          goto out;                  } -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                call_count = internal_lock_count (frame, this, fd_ctx); +                afr_mark_fd_open_on (local, fd_ctx, priv->child_count); +                call_count = internal_lock_count (frame, this);                  int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count;                  if (!call_count) {                          gf_log (this->name, GF_LOG_INFO, @@ -1399,7 +1428,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)                  /* Send non-blocking inodelk calls only on up children                     and where the fd has been opened */                  for (i = 0; i < priv->child_count; i++) { -                        if (local->child_up[i] && fd_ctx->opened_on[i]) { +                        if (local->child_up[i] && local->fd_open_on[i]) {                                  afr_trace_inodelk_in (frame, AFR_INODELK_NB_TRANSACTION,                                                        AFR_LOCK_OP, &flock, F_SETLK, i); @@ -1417,8 +1446,9 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)                  }          } else { -                call_count = internal_lock_count (frame, this, NULL); +                call_count = internal_lock_count (frame, this);                  int_lock->lk_call_count = call_count; +                int_lock->lk_expected_count = call_count;                  for (i = 0; i < priv->child_count; i++) {                          if (local->child_up[i]) { @@ -1989,7 +2019,7 @@ afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index)          fdctx = (afr_fd_ctx_t *) (long) tmp; -        fdctx->opened_on[child_index] = 1; +        fdctx->opened_on[child_index] = AFR_FD_OPENED;  out:          return ret; @@ -2083,7 +2113,7 @@ is_fd_opened (fd_t *fd, int32_t child_index)          fdctx = (afr_fd_ctx_t *) (long) tmp; -        if (fdctx->opened_on[child_index]) +        if (fdctx->opened_on[child_index] == AFR_FD_OPENED)                  ret = 1;  out: diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 98e86574031..d5a988708b8 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -43,6 +43,7 @@ enum gf_afr_mem_types_ {          gf_afr_mt_pump_priv,          gf_afr_mt_locked_fd,          gf_afr_mt_inode_ctx_t, +        gf_afr_fd_paused_call_t,          gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 306f5a85af0..02d8f3ded3b 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -116,7 +116,7 @@ afr_open_cbk (call_frame_t *frame, void *cookie,                          fd_ctx = (afr_fd_ctx_t *)(long) ctx; -                        fd_ctx->opened_on[child_index] = 1; +                        fd_ctx->opened_on[child_index] = AFR_FD_OPENED;                          fd_ctx->flags                  = local->cont.open.flags;                          fd_ctx->wbflags                = local->cont.open.wbflags;                  } @@ -141,7 +141,6 @@ unlock:          return 0;  } -  int  afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,            fd_t *fd, int32_t wbflags) @@ -180,7 +179,6 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,          frame->local = local;          call_count   = local->call_count; -          loc_copy (&local->loc, loc);          local->cont.open.flags   = flags; @@ -209,446 +207,165 @@ out:          return 0;  } - -int -afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                        int32_t op_ret, int32_t op_errno, fd_t *fd) +//NOTE: this function should be called with holding the lock on +//fd to which fd_ctx belongs +void +afr_get_resumable_calls (xlator_t *this, afr_fd_ctx_t *fd_ctx, +                         struct list_head *list)  { -        afr_internal_lock_t *int_lock    = NULL; -        afr_local_t         *local       = NULL; -        afr_private_t       *priv        = NULL; -        afr_fd_ctx_t        *fd_ctx      = NULL; -        uint64_t             ctx         = 0; -        int                  ret         = 0; -        int                  call_count  = 0; -        int                  child_index = (long) cookie; - -        priv     = this->private; -        local    = frame->local; -        int_lock = &local->internal_lock; - -        LOCK (&frame->lock); -        { -                if (op_ret >= 0) { -                        ret = fd_ctx_get (fd, this, &ctx); +        afr_fd_paused_call_t *paused_call = NULL; +        afr_fd_paused_call_t *tmp = NULL; +        afr_local_t           *call_local  = NULL; +        afr_private_t         *priv        = NULL; +        int                    i = 0; +        gf_boolean_t           call = _gf_false; -                        if (ret < 0) { -                                gf_log (this->name, GF_LOG_WARNING, -                                        "failed to get fd context, %p", fd); -                                goto out; -                        } - -                        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                        fd_ctx->opened_on[child_index] = 1; - -                        gf_log (this->name, GF_LOG_TRACE, -                                "fd for %s opened successfully on subvolume %s", -                                local->loc.path, priv->children[child_index]->name); +        priv = this->private; +        list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, +                                  call_list) { +                call = _gf_true; +                call_local = paused_call->frame->local; +                for (i = 0; i < priv->child_count; i++) { +                        if (call_local->child_up[i] && +                            (fd_ctx->opened_on[i] == AFR_FD_OPENING)) +                                call = _gf_false;                  } -        } -out: -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); -        if (call_count == 0) { -                int_lock->lock_cbk = local->transaction.done; -                local->transaction.resume (frame, this); +                if (call) { +                        list_del_init (&paused_call->call_list); +                        list_add (&paused_call->call_list, list); +                }          } - -        return 0;  } - -static int -__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up) +void +afr_resume_calls (xlator_t *this, struct list_head *list)  { -        int i = 0; -        int count = 0; - -        for (i = 0; i < child_count; i++) { -                if (!opened_on[i] && child_up[i]) -                        count++; +        afr_fd_paused_call_t *paused_call = NULL; +        afr_fd_paused_call_t *tmp = NULL; +        afr_local_t           *call_local  = NULL; + +        list_for_each_entry_safe (paused_call, tmp, list, call_list) { +                list_del_init (&paused_call->call_list); +                call_local = paused_call->frame->local; +                call_local->fop_call_continue (paused_call->frame, this); +                GF_FREE (paused_call);          } - -        return count;  } -  int -afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, -                      int32_t op_errno) +afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int32_t op_ret, int32_t op_errno, fd_t *fd)  { -        afr_local_t   *local      = NULL; -        afr_private_t *priv       = NULL; -        uint64_t       ctx        = 0; -        afr_fd_ctx_t  *fd_ctx     = NULL; -        int            abandon    = 0; -        int            ret        = 0; -        int            i          = 0; -        int            call_count = 0; +        afr_local_t           *local       = NULL; +        afr_private_t         *priv        = NULL; +        afr_fd_ctx_t          *fd_ctx      = NULL; +        int                    call_count  = 0; +        int                    child_index = (long) cookie; +        struct list_head       paused_calls = {0}; -        priv  = this->private; -        local = frame->local; +        priv     = this->private; +        local    = frame->local; -        /* -         * Some subvolumes might have come up on which we never -         * opened this fd in the first place. Re-open fd's on those -         * subvolumes now. -         */ +        call_count = afr_frame_return (frame); -        ret = fd_ctx_get (local->fd, this, &ctx); -        if (ret < 0) { +        //Note: No frame locking needed for this block of code +        fd_ctx = afr_fd_ctx_get (local->fd, this); +        if (!fd_ctx) {                  gf_log (this->name, GF_LOG_WARNING, -                        "failed to get fd context %p (%s)", -                        local->fd, local->loc.path); -                abandon = 1; +                        "failed to get fd context, %p", local->fd);                  goto out;          } -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; -          LOCK (&local->fd->lock);          { -                call_count = __unopened_count (priv->child_count, -                                               fd_ctx->opened_on, -                                               local->child_up); -                for (i = 0; i < priv->child_count; i++) { -                        fd_ctx->pre_op_done[i] = 0; -                        fd_ctx->pre_op_piggyback[i] = 0; +                if (op_ret >= 0) { +                        fd_ctx->opened_on[child_index] = AFR_FD_OPENED; +                        gf_log (this->name, GF_LOG_INFO, "fd for %s opened " +                                "successfully on subvolume %s", local->loc.path, +                                priv->children[child_index]->name); +                } else { +                        //Change open status from OPENING to NOT OPENED. +                        fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;                  } -        } -        UNLOCK (&local->fd->lock); - -        if (call_count == 0) { -                gf_log (this->name, GF_LOG_WARNING, -                        "fd not open on any subvolume %p (%s)", -                        local->fd, local->loc.path); -                abandon = 1; -                goto out; -        } - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (!fd_ctx->opened_on[i] && local->child_up[i]) { -                        gf_log (this->name, GF_LOG_TRACE, -                                "opening fd for %s on subvolume %s", -                                local->loc.path, priv->children[i]->name); - -                        STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk, -                                           (void *)(long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->open, -                                           &local->loc, fd_ctx->flags, local->fd, -                                           fd_ctx->wbflags); - -                        if (!--call_count) -                                break; +                if (call_count == 0) { +                        INIT_LIST_HEAD (&paused_calls); +                        afr_get_resumable_calls (this, fd_ctx, &paused_calls);                  }          } - +        UNLOCK (&local->fd->lock);  out: -        if (abandon) -                local->transaction.resume (frame, this); - -        return 0; -} - - -static int -afr_prepare_loc (call_frame_t *frame, fd_t *fd) -{ -        afr_local_t    *local = NULL; -        char           *name = NULL; -        char           *path = NULL; -        int             ret = 0; - -        if ((!fd) || (!fd->inode)) -                return -1; - -        local = frame->local; -        ret = inode_path (fd->inode, NULL, (char **)&path); -        if (ret <= 0) { -                gf_log (frame->this->name, GF_LOG_DEBUG, -                        "Unable to get path for gfid: %s", -                        uuid_utoa (fd->inode->gfid)); -                return -1; -        } - -        if (local->loc.path) { -                if (strcmp (path, local->loc.path)) -                        gf_log (frame->this->name, GF_LOG_DEBUG, -                                "overwriting old loc->path %s with %s", -                                local->loc.path, path); -                GF_FREE ((char *)local->loc.path); -        } -        local->loc.path = path; - -        name = strrchr (local->loc.path, '/'); -        if (name) -                name++; -        local->loc.name = name; - -        if (local->loc.inode) { -                inode_unref (local->loc.inode); -        } -        local->loc.inode = inode_ref (fd->inode); - -        if (local->loc.parent) { -                inode_unref (local->loc.parent); +        if (call_count == 0) { +                afr_resume_calls (this, &paused_calls); +                if (local->fop_call_continue) +                        local->fop_call_continue (frame, this); +                else +                        AFR_STACK_DESTROY (frame);          } -        local->loc.parent = inode_parent (local->loc.inode, 0, NULL); -          return 0;  } -  int -afr_openfd_sh (call_frame_t *frame, xlator_t *this) +afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, +              int need_open_count, int *need_open)  { -        afr_local_t     *local  = NULL; -        afr_self_heal_t *sh = NULL; -        char            sh_type_str[256] = {0,}; +        afr_local_t     *local = NULL; +        afr_private_t   *priv  = NULL; +        int             i      = 0; +        call_frame_t    *open_frame = NULL; +        afr_local_t    *open_local = NULL; +        int             ret    = -1; +        int32_t         op_errno = 0; + +        GF_ASSERT (fd_ctx); +        GF_ASSERT (need_open_count > 0); +        GF_ASSERT (need_open);          local = frame->local; -        sh    = &local->self_heal; - -        GF_ASSERT (local->loc.path); -        /* forcibly trigger missing-entries self-heal */ - -        sh->need_missing_entry_self_heal = _gf_true; -        sh->need_gfid_self_heal = _gf_true; -        sh->data_lock_held      = _gf_true; -        sh->need_data_self_heal = _gf_true; -        sh->type                = local->fd->inode->ia_type; -        sh->background          = _gf_false; -        sh->unwind              = afr_openfd_sh_unwind; - -        afr_self_heal_type_str_get(&local->self_heal, -                                   sh_type_str, -                                   sizeof(sh_type_str)); -        gf_log (this->name, GF_LOG_INFO, "%s self-heal triggered. " -                "path: %s, reason: Replicate up down flush, data lock is held", -                sh_type_str, local->loc.path); - -        afr_self_heal (frame, this, local->fd->inode); - -        return 0; -} - - -int -afr_openfd_flush_done (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t *priv = NULL; -        afr_local_t *local  = NULL; - -        uint64_t       ctx; -        afr_fd_ctx_t * fd_ctx = NULL; - -        int _ret = -1; -          priv  = this->private; -        local = frame->local; - -        LOCK (&local->fd->lock); -        { -                _ret = __fd_ctx_get (local->fd, this, &ctx); -                if (_ret < 0) { -                        gf_log (this->name, GF_LOG_WARNING, -                                "failed to get fd context %p (%s)", -                                local->fd, local->loc.path); +        if (!local->fop_call_continue) { +                open_frame = copy_frame (frame); +                if (!open_frame) { +                        ret = -ENOMEM;                          goto out;                  } - -                fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                fd_ctx->down_count = priv->down_count; -                fd_ctx->up_count   = priv->up_count; -        } -out: -        UNLOCK (&local->fd->lock); - -        afr_local_transaction_cleanup (local, this); - -        gf_log (this->name, GF_LOG_TRACE, -                "The up/down flush is over"); - -        fd_unref (local->fd); -        local->openfd_flush_cbk (frame, this); - -        return 0; -} - - - -int -afr_openfd_xaction (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ -        afr_local_t   * local = NULL; - -        VALIDATE_OR_GOTO (frame, out); -        VALIDATE_OR_GOTO (this, out); -        VALIDATE_OR_GOTO (this->private, out); - -        local = frame->local; - -        local->op = GF_FOP_FLUSH; - -        local->transaction.fop    = afr_openfd_sh; -        local->transaction.done   = afr_openfd_flush_done; - -        local->transaction.start  = 0; -        local->transaction.len    = 0; - -        gf_log (this->name, GF_LOG_TRACE, -                "doing up/down flush on fd=%p", fd); - -        afr_transaction (frame, this, AFR_DATA_TRANSACTION); - -out: -        return 0; -} - - - -int -afr_openfd_xaction_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                             int32_t op_ret, int32_t op_errno, fd_t *fd) -{ -        afr_internal_lock_t *int_lock    = NULL; -        afr_local_t         *local       = NULL; -        afr_private_t       *priv        = NULL; -        int                  ret         = 0; -        uint64_t             ctx         = 0; -        afr_fd_ctx_t        *fd_ctx      = NULL; -        int                  call_count  = 0; -        int                  child_index = (long) cookie; - -        priv     = this->private; -        local    = frame->local; -        int_lock = &local->internal_lock; - -        LOCK (&frame->lock); -        { -                if (op_ret >= 0) { -                        ret = fd_ctx_get (fd, this, &ctx); - -                        if (ret < 0) { -                                gf_log (this->name, GF_LOG_WARNING, -                                        "failed to get fd context %p (%s)", -                                        fd, local->loc.path); -                                goto out; -                        } - -                        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -                        fd_ctx->opened_on[child_index] = 1; - -                        gf_log (this->name, GF_LOG_TRACE, -                                "fd for %s opened successfully on subvolume %s", -                                local->loc.path, priv->children[child_index]->name); +                ALLOC_OR_GOTO (open_local, afr_local_t, out); +                ret = AFR_LOCAL_INIT (open_local, priv); +                if (ret < 0) { +                        op_errno = -ret; +                        goto out;                  } +                loc_copy (&open_local->loc, &local->loc); +                open_local->fd = fd_ref (local->fd); +        } else { +                ret = 0; +                open_frame = frame; +                open_local = local;          } -out: -        UNLOCK (&frame->lock); -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                afr_openfd_xaction (frame, this, local->fd); -        } - -        return 0; -} - - -int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) -{ -        afr_local_t   *local      = NULL; -        afr_private_t *priv       = NULL; -        uint64_t       ctx        = 0; -        afr_fd_ctx_t  *fd_ctx     = NULL; -        int            no_open    = 0; -        int            ret        = 0; -        int            i          = 0; -        int            call_count = 0; - -        priv  = this->private; -        local = frame->local; - -        /* -         * If the file is already deleted while the fd is open, no need to -         * perform the openfd flush, call the flush_cbk and get out. -         */ -        ret = afr_prepare_loc (frame, fd); -        if (ret < 0) { -                local->openfd_flush_cbk (frame, this); -                goto out; -        } +        open_local->call_count = need_open_count; -        /* -         * Some subvolumes might have come up on which we never -         * opened this fd in the first place. Re-open fd's on those -         * subvolumes now. -         */ - -        local->fd = fd_ref (fd); - -        ret = fd_ctx_get (fd, this, &ctx); -        if (ret < 0) { -                gf_log (this->name, GF_LOG_WARNING, -                        "failed to get fd context %p (%s)", -                        fd, local->loc.path); -                no_open = 1; -                goto out; -        } - -        fd_ctx = (afr_fd_ctx_t *)(long) ctx; - -        LOCK (&local->fd->lock); -        { -                call_count = __unopened_count (priv->child_count, -                                               fd_ctx->opened_on, -                                               local->child_up); -        } -        UNLOCK (&local->fd->lock); - -        if (call_count == 0) { -                gf_log (this->name, GF_LOG_WARNING, -                        "fd not open on any subvolume %p (%s)", -                        fd, local->loc.path); -                no_open = 1; -                goto out; -        } - -        local->call_count = call_count; +        gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", +                need_open_count);          for (i = 0; i < priv->child_count; i++) { -                if (!fd_ctx->opened_on[i] && local->child_up[i]) { -                        gf_log (this->name, GF_LOG_TRACE, +                if (need_open[i]) { +                        gf_log (this->name, GF_LOG_DEBUG,                                  "opening fd for %s on subvolume %s",                                  local->loc.path, priv->children[i]->name); -                        STACK_WIND_COOKIE (frame, afr_openfd_xaction_open_cbk, +                        STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk,                                             (void *)(long) i,                                             priv->children[i],                                             priv->children[i]->fops->open, -                                           &local->loc, fd_ctx->flags, fd, -                                           fd_ctx->wbflags); +                                           &open_local->loc, fd_ctx->flags, +                                           open_local->fd, fd_ctx->wbflags); -                        if (!--call_count) -                                break;                  }          } -  out: -        if (no_open) -                afr_openfd_xaction (frame, this, fd); - -        return 0; +        if (ret && open_frame) +                AFR_STACK_DESTROY (open_frame); +        return ret;  } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 04b388fe052..1c7cdf41819 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -44,279 +44,249 @@    This file contains the various self-heal algorithms  */ +static int +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, +                gf_boolean_t is_first_call, call_frame_t *old_loop_frame); +static int +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, +                int32_t op_ret, int32_t op_errno); +static int +sh_destroy_frame (call_frame_t *frame, xlator_t *this) +{ +        if (!frame) +                goto out; -/* -  The "full" algorithm. Copies the entire file from -  source to sinks. -*/ - +        AFR_STACK_DESTROY (frame); +out: +        return 0; +}  static void -sh_full_private_cleanup (call_frame_t *frame, xlator_t *this) +sh_private_cleanup (call_frame_t *frame, xlator_t *this)  { -        afr_local_t *               local   = NULL; -        afr_self_heal_t *           sh      = NULL; -        afr_sh_algo_full_private_t *sh_priv = NULL; +        afr_local_t             *local   = NULL; +        afr_self_heal_t         *sh      = NULL; +        afr_sh_algo_private_t   *sh_priv = NULL;          local = frame->local;          sh    = &local->self_heal;          sh_priv = sh->private; -          if (sh_priv)                  GF_FREE (sh_priv);  } -  static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call); +sh_number_of_writes_needed (unsigned char *write_needed, int child_count) +{ +        int writes = 0; +        int i      = 0; + +        for (i = 0; i < child_count; i++) { +                if (write_needed[i]) +                        writes++; +        } + +        return writes; +} +  static int -sh_full_loop_driver_done (call_frame_t *frame, xlator_t *this) +sh_loop_driver_done (call_frame_t *frame, xlator_t *this, +                     call_frame_t *last_loop_frame)  { -        afr_private_t * priv = NULL; -        afr_local_t * local  = NULL; -        afr_self_heal_t *sh  = NULL; -        afr_sh_algo_full_private_t *sh_priv = NULL; +        afr_private_t           *priv         = NULL; +        afr_local_t             *local        = NULL; +        afr_self_heal_t         *sh           = NULL; +        afr_sh_algo_private_t   *sh_priv      = NULL; +        int32_t                 total_blocks = 0; +        int32_t                 diff_blocks  = 0; -        priv    = this->private; -        local   = frame->local; -        sh      = &local->self_heal; -        sh_priv = sh->private; +        priv         = this->private; +        local        = frame->local; +        sh           = &local->self_heal; +        sh_priv      = sh->private; +        total_blocks = sh_priv->total_blocks; +        diff_blocks  = sh_priv->diff_blocks; -        sh_full_private_cleanup (frame, this); +        sh_private_cleanup (frame, this);          if (sh->op_failed) { +                GF_ASSERT (!last_loop_frame); +                //loop_finish should have happened and the old_loop should be NULL                  gf_log (this->name, GF_LOG_INFO, -                        "full self-heal aborting on %s", +                        "self-heal aborting on %s",                          local->loc.path);                  local->self_heal.algo_abort_cbk (frame, this);          } else { -                gf_log (this->name, GF_LOG_INFO, -                        "full self-heal completed on %s", -                        local->loc.path); +                GF_ASSERT (last_loop_frame); +                if (diff_blocks == total_blocks) { +                        gf_log (this->name, GF_LOG_INFO, "full self-heal " +                                "completed on %s",local->loc.path); +                } else { +                        gf_log (this->name, GF_LOG_INFO, +                                "diff self-heal on %s: completed. " +                                "(%d blocks of %d were different (%.2f%%))", +                                local->loc.path, diff_blocks, total_blocks, +                                ((diff_blocks * 1.0)/total_blocks) * 100); +                } +                sh->old_loop_frame = last_loop_frame;                  local->self_heal.algo_completion_cbk (frame, this);          } +          return 0;  } -static int -sh_full_loop_return (call_frame_t *rw_frame, xlator_t *this, off_t offset) +int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this)  { -        afr_local_t *               rw_local = NULL; -        afr_self_heal_t *           rw_sh    = NULL; -        call_frame_t               *sh_frame = NULL; -        afr_local_t *               sh_local = NULL; -        afr_self_heal_t            *sh       = NULL; -        afr_sh_algo_full_private_t *sh_priv  = NULL; - -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; +        afr_local_t             *loop_local = NULL; +        afr_self_heal_t         *loop_sh = NULL; -        sh_frame = rw_sh->sh_frame; -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; -        sh_priv  = sh->private; - -        AFR_STACK_DESTROY (rw_frame); +        if (!loop_frame) +                goto out; -        sh_full_loop_driver (sh_frame, this, _gf_false); +        loop_local = loop_frame->local; +        if (loop_local) { +                loop_sh = &loop_local->self_heal; +        } +        if (loop_sh && loop_sh->loop_completion_cbk) { +                if (loop_sh->data_lock_held) { +                        afr_sh_data_unlock (loop_frame, this, +                                            loop_sh->loop_completion_cbk); +                } else { +                        loop_sh->loop_completion_cbk (loop_frame, this); +                } +        } else { +                //default loop_completion_cbk destroys the loop_frame +                if (loop_sh && !loop_sh->loop_completion_cbk) +                        GF_ASSERT (!loop_sh->data_lock_held); +                sh_destroy_frame (loop_frame, this); +        } +out:          return 0;  } -  static int -sh_full_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, -                   int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -                   struct iatt *postbuf) +sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this)  { -        afr_private_t *  priv        = NULL; -        afr_local_t *    rw_local    = NULL; -        afr_self_heal_t *rw_sh       = NULL; -        call_frame_t    *sh_frame    = NULL; -        afr_local_t *    sh_local    = NULL; -        afr_self_heal_t *sh          = NULL; -        int              child_index = (long) cookie; -        int              call_count  = 0; - -        priv = this->private; - -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; - -        sh_frame = rw_sh->sh_frame; -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; - -        gf_log (this->name, GF_LOG_TRACE, -                "wrote %d bytes of data from %s to child %d, offset %"PRId64"", -                op_ret, sh_local->loc.path, child_index, -                rw_sh->offset - op_ret); - -        LOCK (&sh_frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "write to %s failed on subvolume %s (%s)", -                                sh_local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); +        afr_local_t                 *loop_local = NULL; +        afr_self_heal_t             *loop_sh    = NULL; -                        sh->op_failed = 1; -                } -        } -        UNLOCK (&sh_frame->lock); +        loop_local = loop_frame->local; +        loop_sh = &loop_local->self_heal; -        call_count = afr_frame_return (rw_frame); - -        if (call_count == 0) { -                sh_full_loop_return (rw_frame, this, rw_sh->offset - op_ret); -        } +        sh_loop_finish (loop_sh->old_loop_frame, this); +        loop_sh->old_loop_frame = NULL; +        gf_log (this->name, GF_LOG_DEBUG, "Aquired lock for range %"PRIu64 +                " %"PRIu64, loop_sh->offset, loop_sh->block_size); +        loop_sh->data_lock_held = _gf_true; +        loop_sh->sh_data_algo_start (loop_frame, this);          return 0;  } -  static int -sh_full_read_cbk (call_frame_t *rw_frame, void *cookie, -                  xlator_t *this, int32_t op_ret, int32_t op_errno, -                  struct iovec *vector, int32_t count, struct iatt *buf, -                  struct iobref *iobref) +sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this)  { -        afr_private_t *  priv       = NULL; -        afr_local_t *    rw_local   = NULL; -        afr_self_heal_t *rw_sh      = NULL; -        call_frame_t    *sh_frame   = NULL; -        afr_local_t *    sh_local   = NULL; -        afr_self_heal_t *sh         = NULL; -        int              i          = 0; -        int              call_count = 0; -        off_t            offset     = (long) cookie; - -        priv = this->private; -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; - -        sh_frame = rw_sh->sh_frame; -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; - -        call_count = sh->active_sinks; - -        rw_local->call_count = call_count; - -        gf_log (this->name, GF_LOG_TRACE, -                "read %d bytes of data from %s, offset %"PRId64"", -                op_ret, sh_local->loc.path, offset); - -        if (op_ret <= 0) { -                gf_log (this->name, GF_LOG_ERROR, -                        "read from %s failed on subvolume %s (%s)", -                        sh_local->loc.path, -                        priv->children[sh->source]->name, -                        strerror (op_errno)); -                sh->op_failed = 1; -                sh_full_loop_return (rw_frame, this, offset); -                return 0; -        } - -        rw_sh->offset += op_ret; - -        if (sh->file_has_holes) { -                if (iov_0filled (vector, count) == 0) { -                        /* the iter function depends on the -                           sh->offset already being updated -                           above -                        */ -                        gf_log (this->name, GF_LOG_DEBUG, -                                "block has all 0 filled"); -                        sh_full_loop_return (rw_frame, this, offset); -                        goto out; -                } -        } - -        for (i = 0; i < priv->child_count; i++) { -                if (sh->sources[i] || !sh_local->child_up[i]) -                        continue; - -                /* this is a sink, so write to it */ - -                STACK_WIND_COOKIE (rw_frame, sh_full_write_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->writev, -                                   sh->healing_fd, vector, count, offset, -                                   iobref); - -                if (!--call_count) -                        break; -        } - -out: +        call_frame_t                *sh_frame = NULL; +        afr_local_t                 *loop_local = NULL; +        afr_self_heal_t             *loop_sh    = NULL; + +        loop_local = loop_frame->local; +        loop_sh = &loop_local->self_heal; +        sh_frame = loop_sh->sh_frame; + +        gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 +                " %"PRIu64, loop_sh->offset, loop_sh->block_size); +        sh_loop_finish (loop_sh->old_loop_frame, this); +        loop_sh->old_loop_frame = NULL; +        sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN);          return 0;  } -  static int -sh_full_read_write (call_frame_t *frame, xlator_t *this, off_t offset) +sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, +               call_frame_t *old_loop_frame)  { -        afr_private_t *  priv     = NULL; -        afr_local_t *    local    = NULL; -        afr_local_t *    rw_local = NULL; -        afr_self_heal_t *rw_sh    = NULL; -        afr_self_heal_t *sh       = NULL; -        call_frame_t    *rw_frame = NULL; -        int32_t          op_errno = 0; +        call_frame_t                *new_loop_frame = NULL; +        afr_local_t                 *local          = NULL; +        afr_self_heal_t             *sh             = NULL; +        afr_local_t                 *new_loop_local = NULL; +        afr_self_heal_t             *new_loop_sh    = NULL; +        afr_private_t               *priv           = NULL; -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; - -        rw_frame = copy_frame (frame); -        if (!rw_frame) -                goto out; +        GF_ASSERT (sh_frame); -        ALLOC_OR_GOTO (rw_local, afr_local_t, out); - -        rw_frame->local = rw_local; -        rw_sh           = &rw_local->self_heal; +        local   = sh_frame->local; +        sh      = &local->self_heal; +        priv    = this->private; -        rw_sh->offset       = offset; -        rw_sh->sh_frame     = frame; +        new_loop_frame = copy_frame (sh_frame); +        if (!new_loop_frame) +                goto out; +        //We want the frame to have same lk_oner as sh_frame +        new_loop_local = afr_local_copy (local, this); +        if (!new_loop_local) +                goto out; +        new_loop_frame->local = new_loop_local; -        STACK_WIND_COOKIE (rw_frame, sh_full_read_cbk, -                           (void *) (long) offset, -                           priv->children[sh->source], -                           priv->children[sh->source]->fops->readv, -                           sh->healing_fd, sh->block_size, -                           offset); +        new_loop_sh = &new_loop_local->self_heal; +        new_loop_sh->sources = memdup (sh->sources, +                                       priv->child_count * sizeof (*sh->sources)); +        if (!new_loop_sh->sources) +                goto out; +        new_loop_sh->write_needed = GF_CALLOC (priv->child_count, +                                               sizeof (*new_loop_sh->write_needed), +                                               gf_afr_mt_char); +        if (!new_loop_sh->write_needed) +                goto out; +        new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LEN, +                                           gf_afr_mt_uint8_t); +        if (!new_loop_sh->checksum) +                goto out; +        new_loop_sh->offset = offset; +        new_loop_sh->block_size = sh->block_size; +        new_loop_sh->old_loop_frame = old_loop_frame; +        new_loop_sh->sh_frame = sh_frame; +        new_loop_sh->inode      = inode_ref (sh->inode); +        new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start; +        new_loop_sh->source = sh->source; +        new_loop_sh->active_sinks = sh->active_sinks; +        new_loop_sh->healing_fd = fd_ref (sh->healing_fd); +        new_loop_sh->file_has_holes = sh->file_has_holes; +        new_loop_sh->loop_completion_cbk = sh_destroy_frame; +        afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, +                          sh_loop_lock_success, sh_loop_lock_failure);          return 0; -  out:          sh->op_failed = 1; - -        sh_full_loop_driver (frame, this, _gf_false); - +        if (new_loop_frame) { +                new_loop_frame->local = new_loop_local; +        } +        if (old_loop_frame) +                sh_loop_finish (old_loop_frame, this); +        sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM);          return 0;  } -  static int -sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_call) +sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, +                gf_boolean_t is_first_call, call_frame_t *old_loop_frame)  {          afr_private_t *             priv           = NULL;          afr_local_t *               local          = NULL; -        afr_self_heal_t            *sh             = NULL; -        afr_sh_algo_full_private_t *sh_priv        = NULL; +        afr_self_heal_t *           sh             = NULL; +        afr_sh_algo_private_t       *sh_priv        = NULL;          gf_boolean_t                is_driver_done = _gf_false;          blksize_t                   block_size     = 0; -        off_t                       offset         = 0;          int                         loop           = 0; +        off_t                       offset         = 0;          priv    = this->private; -        local   = frame->local; +        local   = sh_frame->local;          sh      = &local->self_heal;          sh_priv = sh->private; @@ -324,23 +294,18 @@ sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_          {                  if (_gf_false == is_first_call)                          sh_priv->loops_running--; -                offset           = sh_priv->offset; -                block_size       = sh->block_size; -                while ((sh->op_failed == 0) && +                offset = sh_priv->offset; +                block_size = sh->block_size; +                while ((!sh->eof_reached) && (0 == sh->op_failed) &&                         (sh_priv->loops_running < priv->data_self_heal_window_size)                         && (sh_priv->offset < sh->file_size)) {                          loop++; -                        gf_log (this->name, GF_LOG_TRACE, -                                "spawning a loop for offset %"PRId64, -                                sh_priv->offset); - -                        sh_priv->offset += sh->block_size; +                        sh_priv->offset += block_size;                          sh_priv->loops_running++;                          if (_gf_false == is_first_call)                                  break; -                  }                  if (0 == sh_priv->loops_running) {                          is_driver_done = _gf_true; @@ -348,274 +313,130 @@ sh_full_loop_driver (call_frame_t *frame, xlator_t *this, gf_boolean_t is_first_          }          UNLOCK (&sh_priv->lock); +        if (0 == loop) { +                //loop finish does unlock, but the erasing of the pending +                //xattrs needs to happen before that so do not finish the loop +                if (is_driver_done && !sh->op_failed) +                        goto driver_done; +                if (old_loop_frame) { +                        sh_loop_finish (old_loop_frame, this); +                        old_loop_frame = NULL; +                } +        } + +        //If we have more loops to form we should finish previous loop after +        //the next loop lock          while (loop--) {                  if (sh->op_failed) {                          // op failed in other loop, stop spawning more loops -                        sh_full_loop_driver (frame, this, _gf_false); +                        if (old_loop_frame) { +                                sh_loop_finish (old_loop_frame, this); +                                old_loop_frame = NULL; +                        } +                        sh_loop_driver (sh_frame, this, _gf_false, NULL);                  } else { -                        sh_full_read_write (frame, this, offset); +                        gf_log (this->name, GF_LOG_TRACE, "spawning a loop " +                                "for offset %"PRId64, offset); + +                        sh_loop_start (sh_frame, this, offset, old_loop_frame); +                        old_loop_frame = NULL;                          offset += block_size;                  }          } +driver_done:          if (is_driver_done) { -                sh_full_loop_driver_done (frame, this); +                sh_loop_driver_done (sh_frame, this, old_loop_frame);          } - -        return 0; -} - - -int -afr_sh_algo_full (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t *               local   = NULL; -        afr_self_heal_t *           sh      = NULL; -        afr_sh_algo_full_private_t *sh_priv = NULL; - -        local = frame->local; -        sh    = &local->self_heal; - -        sh_priv = GF_CALLOC (1, sizeof (*sh_priv), -                             gf_afr_mt_afr_private_t); -        if (!sh_priv) -                goto out; - -        LOCK_INIT (&sh_priv->lock); - -        sh->private = sh_priv; - -        local->call_count = 0; - -        sh_full_loop_driver (frame, this, _gf_true); -out:          return 0;  } - -/* - * The "diff" algorithm. Copies only those blocks whose checksums - * don't match with those of source. - */ - - -static void -sh_diff_private_cleanup (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t *             priv    = NULL; -        afr_local_t *               local   = NULL; -        afr_self_heal_t *           sh      = NULL; -        afr_sh_algo_diff_private_t *sh_priv = NULL; -        int                         i       = 0; - -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; - -        sh_priv = sh->private; - -        for (i = 0; i < priv->data_self_heal_window_size; i++) { -                if (sh_priv->loops[i]) { -                        if (sh_priv->loops[i]->write_needed) -                                GF_FREE (sh_priv->loops[i]->write_needed); - -                        if (sh_priv->loops[i]->checksum) -                                GF_FREE (sh_priv->loops[i]->checksum); - -                        GF_FREE (sh_priv->loops[i]); -                } -        } - -        if (sh_priv) { -                if (sh_priv->loops) -                        GF_FREE (sh_priv->loops); - -                GF_FREE (sh_priv); -        } - - -} - - -static uint32_t -__make_cookie (int loop_index, int child_index) -{ -        uint32_t ret = ((loop_index << 16) | child_index); -        return ret; -} - - -static int -__loop_index (uint32_t cookie) -{ -        return ((cookie & 0xFFFF0000) >> 16); -} - - -static int -__child_index (uint32_t cookie) -{ -        return (cookie & 0x0000FFFF); -} - - -static void -sh_diff_loop_state_reset (struct sh_diff_loop_state *loop_state, int child_count) -{ -        loop_state->active = _gf_false; -//        loop_state->offset = 0; - -        memset (loop_state->write_needed, -                0, sizeof (*loop_state->write_needed) * child_count); - -        memset (loop_state->checksum, -                0, MD5_DIGEST_LEN * child_count); -} - -  static int -sh_diff_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ -        int writes = 0; -        int i      = 0; - -        for (i = 0; i < child_count; i++) { -                if (write_needed[i]) -                        writes++; -        } - -        return writes; -} - - -static int -sh_diff_loop_driver_done (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t *             priv         = NULL; -        afr_local_t *               local        = NULL; -        afr_self_heal_t *           sh           = NULL; -        afr_sh_algo_diff_private_t *sh_priv      = NULL; -        int32_t                     total_blocks = 0; -        int32_t                     diff_blocks  = 0; - -        priv         = this->private; -        local        = frame->local; -        sh           = &local->self_heal; -        sh_priv      = sh->private; -        total_blocks = sh_priv->total_blocks; -        diff_blocks  = sh_priv->diff_blocks; - -        sh_diff_private_cleanup (frame, this); -        if (sh->op_failed) { -                gf_log (this->name, GF_LOG_INFO, -                        "diff self-heal aborting on %s", -                        local->loc.path); - -                local->self_heal.algo_abort_cbk (frame, this); -        } else { -                gf_log (this->name, GF_LOG_INFO, -                        "diff self-heal on %s: completed. " -                        "(%d blocks of %d were different (%.2f%%))", -                        local->loc.path, diff_blocks, total_blocks, -                        ((diff_blocks * 1.0)/total_blocks) * 100); - -                local->self_heal.algo_completion_cbk (frame, this); -        } - -        return 0; -} - -static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, -                     gf_boolean_t is_first_call, -                     struct sh_diff_loop_state *loop_state); - -static int -sh_diff_loop_return (call_frame_t *rw_frame, xlator_t *this, -                     struct sh_diff_loop_state *loop_state) +sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, +                int32_t op_ret, int32_t op_errno)  {          afr_private_t *             priv     = NULL; -        afr_local_t *               rw_local = NULL; -        afr_self_heal_t *           rw_sh    = NULL; -        call_frame_t               *sh_frame = NULL; +        afr_local_t *               loop_local = NULL; +        afr_self_heal_t *           loop_sh    = NULL;          afr_local_t *               sh_local = NULL;          afr_self_heal_t            *sh       = NULL; -        afr_sh_algo_diff_private_t *sh_priv  = NULL; +        afr_sh_algo_private_t      *sh_priv  = NULL;          priv  = this->private; -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; - -        sh_frame = rw_sh->sh_frame;          sh_local = sh_frame->local;          sh       = &sh_local->self_heal;          sh_priv  = sh->private; -        gf_log (this->name, GF_LOG_TRACE, -                "loop for offset %"PRId64" returned", loop_state->offset); +        if (loop_frame) { +                loop_local = loop_frame->local; +                if (loop_local) +                        loop_sh    = &loop_local->self_heal; +                if (loop_sh) +                        gf_log (this->name, GF_LOG_TRACE, "loop for offset " +                                "%"PRId64" returned", loop_sh->offset); +        } -        AFR_STACK_DESTROY (rw_frame); +        if (op_ret == -1) { +                sh->op_failed = 1; +                afr_sh_set_error (sh, op_errno); +                if (loop_frame) { +                        sh_loop_finish (loop_frame, this); +                        loop_frame = NULL; +                } +        } -        sh_diff_loop_driver (sh_frame, this, _gf_false, loop_state); +        sh_loop_driver (sh_frame, this, _gf_false, loop_frame);          return 0;  } -  static int -sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, +sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,                     int32_t op_ret, int32_t op_errno, struct iatt *buf,                     struct iatt *postbuf)  {          afr_private_t *             priv        = NULL; -        afr_local_t *               rw_local    = NULL; -        afr_self_heal_t *           rw_sh       = NULL; +        afr_local_t *               loop_local    = NULL; +        afr_self_heal_t *           loop_sh       = NULL;          call_frame_t               *sh_frame    = NULL;          afr_local_t *               sh_local    = NULL;          afr_self_heal_t            *sh          = NULL; -        afr_sh_algo_diff_private_t *sh_priv     = NULL; -        struct sh_diff_loop_state  *loop_state  = NULL; +        afr_sh_algo_private_t      *sh_priv     = NULL;          int                         call_count  = 0;          int                         child_index = 0; -        int                         loop_index  = 0;          priv     = this->private; -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; -        sh_frame = rw_sh->sh_frame; +        sh_frame = loop_sh->sh_frame;          sh_local = sh_frame->local;          sh       = &sh_local->self_heal;          sh_priv  = sh->private; -        child_index = __child_index ((uint32_t) (long) cookie); -        loop_index  = __loop_index ((uint32_t) (long) cookie); -        loop_state  = sh_priv->loops[loop_index]; +        child_index =  (long) cookie;          gf_log (this->name, GF_LOG_TRACE,                  "wrote %d bytes of data from %s to child %d, offset %"PRId64"", -                op_ret, sh_local->loc.path, child_index, -                loop_state->offset); +                op_ret, sh_local->loc.path, child_index, loop_sh->offset); -        LOCK (&sh_frame->lock); -        { -                if (op_ret == -1) { -                        gf_log (this->name, GF_LOG_ERROR, -                                "write to %s failed on subvolume %s (%s)", -                                sh_local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); +        if (op_ret == -1) { +                gf_log (this->name, GF_LOG_ERROR, +                        "write to %s failed on subvolume %s (%s)", +                        sh_local->loc.path, +                        priv->children[child_index]->name, +                        strerror (op_errno)); -                        sh->op_failed = 1; -                } +                sh->op_failed = 1; +                afr_sh_set_error (loop_sh, op_errno);          } -        UNLOCK (&sh_frame->lock); -        call_count = afr_frame_return (rw_frame); +        call_count = afr_frame_return (loop_frame);          if (call_count == 0) { -                sh_diff_loop_return (rw_frame, this, loop_state); +                sh_loop_return (sh_frame, this, loop_frame, +                                loop_sh->op_ret, loop_sh->op_errno);          }          return 0; @@ -623,74 +444,71 @@ sh_diff_write_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,  static int -sh_diff_read_cbk (call_frame_t *rw_frame, void *cookie, +sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,                    xlator_t *this, int32_t op_ret, int32_t op_errno,                    struct iovec *vector, int32_t count, struct iatt *buf,                    struct iobref *iobref)  {          afr_private_t *               priv       = NULL; -        afr_local_t *                 rw_local   = NULL; -        afr_self_heal_t *             rw_sh      = NULL; -        afr_sh_algo_diff_private_t *  sh_priv    = NULL; +        afr_local_t *                 loop_local   = NULL; +        afr_self_heal_t *             loop_sh      = NULL;          call_frame_t                 *sh_frame   = NULL; -        afr_local_t *                 sh_local   = NULL; -        afr_self_heal_t              *sh         = NULL; -        int                           loop_index = 0; -        struct sh_diff_loop_state    *loop_state = NULL; -        uint32_t                      wcookie    = 0;          int                           i          = 0;          int                           call_count = 0; +        afr_local_t *                 sh_local   = NULL; +        afr_self_heal_t *             sh      = NULL; -        priv     = this->private; -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; +        priv       = this->private; +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; -        sh_frame = rw_sh->sh_frame; +        sh_frame = loop_sh->sh_frame;          sh_local = sh_frame->local;          sh       = &sh_local->self_heal; -        sh_priv  = sh->private; - -        loop_index = __loop_index ((uint32_t) (long) cookie); -        loop_state = sh_priv->loops[loop_index]; - -        call_count = sh_diff_number_of_writes_needed (loop_state->write_needed, -                                                      priv->child_count); - -        rw_local->call_count = call_count;          gf_log (this->name, GF_LOG_TRACE,                  "read %d bytes of data from %s, offset %"PRId64"", -                op_ret, sh_local->loc.path, loop_state->offset); - -        if ((op_ret <= 0) || -            (call_count == 0)) { -                sh_diff_loop_return (rw_frame, this, loop_state); +                op_ret, loop_local->loc.path, loop_sh->offset); -                return 0; +        if (op_ret <= 0) { +                if (op_ret < 0) { +                        sh->op_failed = 1; +                        gf_log (this->name, GF_LOG_ERROR, "read failed on %d " +                                "for %s reason :%s", sh->source, +                                sh_local->loc.path, strerror (errno)); +                } else { +                        sh->eof_reached = _gf_true; +                        gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", +                                sh_local->loc.path); +                } +                sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); +                goto out;          } -        if (sh->file_has_holes) { -                if (iov_0filled (vector, count) == 0) { +        if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) {                          gf_log (this->name, GF_LOG_DEBUG, "0 filled block"); -                        sh_diff_loop_return (rw_frame, this, loop_state); +                        sh_loop_return (sh_frame, this, loop_frame, +                                        op_ret, op_errno);                          goto out; -                }          } -        for (i = 0; i < priv->child_count; i++) { -                if (loop_state->write_needed[i]) { -                        wcookie = __make_cookie (loop_index, i); +        call_count = sh_number_of_writes_needed (loop_sh->write_needed, +                                                 priv->child_count); +        GF_ASSERT (call_count > 0); +        loop_local->call_count = call_count; -                        STACK_WIND_COOKIE (rw_frame, sh_diff_write_cbk, -                                           (void *) (long) wcookie, -                                           priv->children[i], -                                           priv->children[i]->fops->writev, -                                           sh->healing_fd, vector, count, -                                           loop_state->offset, iobref); +        for (i = 0; i < priv->child_count; i++) { +                if (!loop_sh->write_needed[i]) +                        continue; +                STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->writev, +                                   loop_sh->healing_fd, vector, count, +                                   loop_sh->offset, iobref); -                        if (!--call_count) -                                break; -                } +                if (!--call_count) +                        break;          }  out: @@ -699,100 +517,77 @@ out:  static int -sh_diff_read (call_frame_t *rw_frame, xlator_t *this, -              int loop_index) +sh_loop_read (call_frame_t *loop_frame, xlator_t *this)  { -        afr_private_t *               priv       = NULL; -        afr_local_t *                 rw_local   = NULL; -        afr_self_heal_t *             rw_sh      = NULL; -        afr_sh_algo_diff_private_t *  sh_priv    = NULL; -        struct sh_diff_loop_state    *loop_state = NULL; -        call_frame_t                 *sh_frame   = NULL; -        afr_local_t *                 sh_local   = NULL; -        afr_self_heal_t              *sh         = NULL; -        uint32_t                      cookie     = 0; +        afr_private_t           *priv       = NULL; +        afr_local_t             *loop_local   = NULL; +        afr_self_heal_t         *loop_sh      = NULL;          priv     = this->private; -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; - -        sh_frame = rw_sh->sh_frame; -        sh_local = sh_frame->local; -        sh       = &sh_local->self_heal; -        sh_priv  = sh->private; - -        loop_state = sh_priv->loops[loop_index]; +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; -        cookie = __make_cookie (loop_index, sh->source); - -        STACK_WIND_COOKIE (rw_frame, sh_diff_read_cbk, -                           (void *) (long) cookie, -                           priv->children[sh->source], -                           priv->children[sh->source]->fops->readv, -                           sh->healing_fd, sh_priv->block_size, -                           loop_state->offset); +        STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk, +                           (void *) (long) loop_sh->source, +                           priv->children[loop_sh->source], +                           priv->children[loop_sh->source]->fops->readv, +                           loop_sh->healing_fd, loop_sh->block_size, +                           loop_sh->offset);          return 0;  }  static int -sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this, +sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno,                        uint32_t weak_checksum, uint8_t *strong_checksum)  { -        afr_private_t *               priv         = NULL; -        afr_local_t *                 rw_local     = NULL; -        afr_self_heal_t              *rw_sh        = NULL; -        call_frame_t                 *sh_frame     = NULL; -        afr_local_t *                 sh_local     = NULL; -        afr_self_heal_t              *sh           = NULL; -        afr_sh_algo_diff_private_t *  sh_priv      = NULL; -        int                           loop_index   = 0; +        afr_private_t                 *priv         = NULL; +        afr_local_t                   *loop_local   = NULL; +        afr_self_heal_t               *loop_sh      = NULL; +        call_frame_t                  *sh_frame     = NULL; +        afr_local_t                   *sh_local     = NULL; +        afr_self_heal_t               *sh           = NULL; +        afr_sh_algo_private_t         *sh_priv      = NULL;          int                           child_index  = 0; -        struct sh_diff_loop_state    *loop_state   = NULL;          int                           call_count   = 0;          int                           i            = 0;          int                           write_needed = 0;          priv  = this->private; -        rw_local = rw_frame->local; -        rw_sh    = &rw_local->self_heal; +        loop_local = loop_frame->local; +        loop_sh    = &loop_local->self_heal; -        sh_frame = rw_sh->sh_frame; +        sh_frame = loop_sh->sh_frame;          sh_local = sh_frame->local;          sh       = &sh_local->self_heal;          sh_priv = sh->private; -        child_index = __child_index ((uint32_t) (long) cookie); -        loop_index  = __loop_index ((uint32_t) (long) cookie); - -        loop_state  = sh_priv->loops[loop_index]; +        child_index = (long) cookie;          if (op_ret < 0) {                  gf_log (this->name, GF_LOG_ERROR,                          "checksum on %s failed on subvolume %s (%s)",                          sh_local->loc.path, priv->children[child_index]->name,                          strerror (op_errno)); -                  sh->op_failed = 1;          } else { -                memcpy (loop_state->checksum + child_index * MD5_DIGEST_LEN, -                        strong_checksum, -                        MD5_DIGEST_LEN); +                memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LEN, +                        strong_checksum, MD5_DIGEST_LEN);          } -        call_count = afr_frame_return (rw_frame); +        call_count = afr_frame_return (loop_frame);          if (call_count == 0) {                  for (i = 0; i < priv->child_count; i++) {                          if (sh->sources[i] || !sh_local->child_up[i])                                  continue; -                        if (memcmp (loop_state->checksum + (i * MD5_DIGEST_LEN), -                                    loop_state->checksum + (sh->source * MD5_DIGEST_LEN), +                        if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LEN), +                                    loop_sh->checksum + (sh->source * MD5_DIGEST_LEN),                                      MD5_DIGEST_LEN)) {                                  /*                                    Checksums differ, so this block @@ -802,9 +597,9 @@ sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,                                  gf_log (this->name, GF_LOG_DEBUG,                                          "checksum on subvolume %s at offset %"                                          PRId64" differs from that on source", -                                        priv->children[i]->name, loop_state->offset); +                                        priv->children[i]->name, loop_sh->offset); -                                write_needed = loop_state->write_needed[i] = 1; +                                write_needed = loop_sh->write_needed[i] = 1;                          }                  } @@ -817,271 +612,130 @@ sh_diff_checksum_cbk (call_frame_t *rw_frame, void *cookie, xlator_t *this,                  UNLOCK (&sh_priv->lock);                  if (write_needed && !sh->op_failed) { -                        sh_diff_read (rw_frame, this, loop_index); +                        sh_loop_read (loop_frame, this);                  } else { -                        sh->offset += sh_priv->block_size; - -                        sh_diff_loop_return (rw_frame, this, loop_state); +                        sh_loop_return (sh_frame, this, loop_frame, +                                        op_ret, op_errno);                  }          }          return 0;  } - -static int -sh_diff_find_unused_loop (afr_sh_algo_diff_private_t *sh_priv, int max) -{ -        int i = 0; - -        LOCK (&sh_priv->lock); -        { -                for (i = 0; i < max; i++) { -                        if (sh_priv->loops[i]->active == _gf_false) { -                                sh_priv->loops[i]->active = _gf_true; -                                break; -                        } -                } -        } -        UNLOCK (&sh_priv->lock); - -        if (i == max) { -                gf_log ("[sh-diff]", GF_LOG_ERROR, -                        "no free loops found! This shouldn't happen. Please" -                        " report this to gluster-devel@nongnu.org"); -        } - -        return i; -} - -  static int -sh_diff_checksum (call_frame_t *frame, xlator_t *this, off_t offset) +sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this)  { -        afr_private_t *               priv       = NULL; -        afr_local_t *                 local      = NULL; -        afr_local_t *                 rw_local   = NULL; -        afr_self_heal_t *             sh         = NULL; -        afr_self_heal_t *             rw_sh      = NULL; -        afr_sh_algo_diff_private_t *  sh_priv    = NULL; -        call_frame_t                 *rw_frame   = NULL; -        uint32_t                      cookie     = 0; -        int                           loop_index = 0; -        struct sh_diff_loop_state    *loop_state = NULL; -        int32_t                       op_errno   = 0; -        int                           call_count = 0; -        int                           i          = 0; - -        priv    = this->private; -        local   = frame->local; -        sh      = &local->self_heal; - -        sh_priv = sh->private; - -        rw_frame = copy_frame (frame); -        if (!rw_frame) -                goto out; - -        ALLOC_OR_GOTO (rw_local, afr_local_t, out); +        afr_private_t           *priv         = NULL; +        afr_local_t             *loop_local   = NULL; +        afr_self_heal_t         *loop_sh      = NULL; +        afr_sh_algo_private_t   *loop_sh_priv = NULL; +        int                     call_count    = 0; +        int                     i             = 0; -        rw_frame->local = rw_local; -        rw_sh           = &rw_local->self_heal; - -        rw_sh->offset       = sh->offset; -        rw_sh->sh_frame     = frame; - -        call_count = sh->active_sinks + 1;  /* sinks and source */ - -        rw_local->call_count = call_count; - -        loop_index = sh_diff_find_unused_loop (sh_priv, priv->data_self_heal_window_size); +        priv         = this->private; +        loop_local   = loop_frame->local; +        loop_sh      = &loop_local->self_heal; -        loop_state = sh_priv->loops[loop_index]; -        loop_state->offset       = offset; +        loop_sh_priv = loop_sh->private; -        /* we need to send both the loop index and child index, -           so squeeze them both into a 32-bit number */ +        call_count = loop_sh->active_sinks + 1;  /* sinks and source */ -        cookie = __make_cookie (loop_index, sh->source); +        loop_local->call_count = call_count; -        STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, -                           (void *) (long) cookie, -                           priv->children[sh->source], -                           priv->children[sh->source]->fops->rchecksum, -                           sh->healing_fd, -                           offset, sh_priv->block_size); +        STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, +                           (void *) (long) loop_sh->source, +                           priv->children[loop_sh->source], +                           priv->children[loop_sh->source]->fops->rchecksum, +                           loop_sh->healing_fd, +                           loop_sh->offset, loop_sh->block_size);          for (i = 0; i < priv->child_count; i++) { -                if (sh->sources[i] || !local->child_up[i]) +                if (loop_sh->sources[i] || !loop_local->child_up[i])                          continue; -                cookie = __make_cookie (loop_index, i); - -                STACK_WIND_COOKIE (rw_frame, sh_diff_checksum_cbk, -                                   (void *) (long) cookie, +                STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, +                                   (void *) (long) i,                                     priv->children[i],                                     priv->children[i]->fops->rchecksum, -                                   sh->healing_fd, -                                   offset, sh_priv->block_size); +                                   loop_sh->healing_fd, +                                   loop_sh->offset, loop_sh->block_size);                  if (!--call_count)                          break;          }          return 0; - -out: -        sh->op_failed = 1; - -        sh_diff_loop_driver (frame, this, _gf_false, loop_state); - -        return 0;  } -  static int -sh_diff_loop_driver (call_frame_t *frame, xlator_t *this, -                     gf_boolean_t is_first_call, -                     struct sh_diff_loop_state *loop_state) +sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this)  { -        afr_private_t *             priv           = NULL; -        afr_local_t *               local          = NULL; -        afr_self_heal_t *           sh             = NULL; -        afr_sh_algo_diff_private_t *sh_priv        = NULL; -        gf_boolean_t                is_driver_done = _gf_false; -        blksize_t                   block_size     = 0; -        int                         loop           = 0; -        off_t                       offset         = 0; -        char                        sh_type_str[256] = {0,}; - -        priv    = this->private; -        local   = frame->local; -        sh      = &local->self_heal; -        sh_priv = sh->private; - -        afr_self_heal_type_str_get(sh, sh_type_str, sizeof(sh_type_str)); - -        LOCK (&sh_priv->lock); -        { -                if (loop_state) -                        sh_diff_loop_state_reset (loop_state, priv->child_count); -                if (_gf_false == is_first_call) -                        sh_priv->loops_running--; -                offset = sh_priv->offset; -                block_size = sh_priv->block_size; -                while ((0 == sh->op_failed) && -                       (sh_priv->loops_running < priv->data_self_heal_window_size) -                       && (sh_priv->offset < sh->file_size)) { - -                        loop++; -                        gf_log (this->name, GF_LOG_TRACE, -                                "spawning a loop for offset %"PRId64, -                                sh_priv->offset); - -                        sh_priv->offset += sh_priv->block_size; -                        sh_priv->loops_running++; - -                        if (_gf_false == is_first_call) -                                break; +        afr_private_t           *priv         = NULL; +        afr_local_t             *loop_local   = NULL; +        afr_self_heal_t         *loop_sh      = NULL; +        int                     i             = 0; -                } -                if (0 == sh_priv->loops_running) { -                        is_driver_done = _gf_true; -                } -        } -        UNLOCK (&sh_priv->lock); - -        while (loop--) { -                if (sh->op_failed) { -                        // op failed in other loop, stop spawning more loops -                        sh_diff_loop_driver (frame, this, _gf_false, NULL); -                } else { -                        sh_diff_checksum (frame, this, offset); -                        offset += block_size; -                } -        } +        priv         = this->private; +        loop_local   = loop_frame->local; +        loop_sh      = &loop_local->self_heal; -        if (is_driver_done) { -                sh_diff_loop_driver_done (frame, this); +        for (i = 0; i < priv->child_count; i++) { +                if (loop_sh->sources[i] || !loop_local->child_up[i]) +                        continue; +                loop_sh->write_needed[i] = 1;          } +        sh_loop_read (loop_frame, this);          return 0;  } +static int +sh_do_nothing (call_frame_t *frame, xlator_t *this) +{ +        return 0; +}  int -afr_sh_algo_diff (call_frame_t *frame, xlator_t *this) +afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, +                    afr_sh_algo_fn sh_data_algo_start)  { -        afr_private_t *             priv    = NULL; -        afr_local_t *               local   = NULL; -        afr_self_heal_t *           sh      = NULL; -        afr_sh_algo_diff_private_t *sh_priv = NULL; -        int                         i       = 0; +        afr_local_t             *sh_local   = NULL; +        afr_self_heal_t         *sh      = NULL; +        afr_sh_algo_private_t   *sh_priv = NULL; -        priv  = this->private; -        local = frame->local; -        sh    = &local->self_heal; +        sh_local = sh_frame->local; +        sh    = &sh_local->self_heal;          sh_priv = GF_CALLOC (1, sizeof (*sh_priv),                               gf_afr_mt_afr_private_t);          if (!sh_priv) -                goto err; - -        sh_priv->block_size = this->ctx->page_size; - -        sh->private = sh_priv; +                goto out;          LOCK_INIT (&sh_priv->lock); -        local->call_count = 0; - -        sh_priv->loops = GF_CALLOC (priv->data_self_heal_window_size, -                                    sizeof (*sh_priv->loops), -                                    gf_afr_mt_sh_diff_loop_state); -        if (!sh_priv->loops) -                goto err; - -        for (i = 0; i < priv->data_self_heal_window_size; i++) { -                sh_priv->loops[i]               = GF_CALLOC (1, sizeof (*sh_priv->loops[i]), -                                                             gf_afr_mt_sh_diff_loop_state); -                if (!sh_priv->loops[i]) -                        goto err; - -                sh_priv->loops[i]->checksum     = GF_CALLOC (priv->child_count, -                                                             MD5_DIGEST_LEN, gf_afr_mt_uint8_t); -                if (!sh_priv->loops[i]->checksum) -                        goto err; - -                sh_priv->loops[i]->write_needed = GF_CALLOC (priv->child_count, -                                                             sizeof (*sh_priv->loops[i]->write_needed), -                                                             gf_afr_mt_char); -                if (!sh_priv->loops[i]->write_needed) -                        goto err; - -        } +        sh->private = sh_priv; +        sh->sh_data_algo_start = sh_data_algo_start; -        sh_diff_loop_driver (frame, this, _gf_true, NULL); +        sh_local->call_count = 0; +        sh->loop_completion_cbk = sh_do_nothing; +        sh_loop_driver (sh_frame, this, _gf_true, sh_frame); +out:          return 0; -err: -        if (sh_priv) { -                if (sh_priv->loops) { -                        for (i = 0; i < priv->data_self_heal_window_size; i++) { -                                if (sh_priv->loops[i]->write_needed) -                                        GF_FREE (sh_priv->loops[i]->write_needed); -                                if (sh_priv->loops[i]->checksum) -                                        GF_FREE (sh_priv->loops[i]->checksum); -                                if (sh_priv->loops[i]) -                                        GF_FREE (sh_priv->loops[i]); -                        } - -                        GF_FREE (sh_priv->loops); -                } +} -                GF_FREE (sh_priv); -        } +int +afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) +{ +        afr_sh_start_loops (sh_frame, this, sh_diff_checksum);          return 0;  } +int +afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) +{ +        afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); +        return 0; +}  struct afr_sh_algorithm afr_self_heal_algorithms[] = {          {.name = "full",  .fn = afr_sh_algo_full}, diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h index 2790dbc6a52..04d8e8a6ce7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.h @@ -30,31 +30,13 @@ struct afr_sh_algorithm {  };  extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; - -typedef struct { -        gf_lock_t lock; -        unsigned int loops_running; -        off_t offset; -} afr_sh_algo_full_private_t; - -struct sh_diff_loop_state { -        off_t   offset; -        unsigned char *write_needed; -        uint8_t *checksum; -        gf_boolean_t active; -}; -  typedef struct { -        size_t block_size; -          gf_lock_t lock;          unsigned int loops_running;          off_t offset;          int32_t total_blocks;          int32_t diff_blocks; - -        struct sh_diff_loop_state **loops; -} afr_sh_algo_diff_private_t; +} afr_sh_algo_private_t;  #endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index f66bdff8446..0846184c29f 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -57,6 +57,29 @@ afr_sh_select_source (int sources[], int child_count)          return -1;  } +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) +{ +        int              i = 0; +        afr_local_t     *local      = NULL; +        afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int              active_sinks = 0; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        for (i = 0; i < priv->child_count; i++) { +                if (sh->sources[i] == 0 && local->child_up[i] == 1) { +                        active_sinks++; +                        sh->success[i] = 1; +                } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { +                        sh->success[i] = 1; +                } +        } +        sh->active_sinks = active_sinks; +}  /**   * sink_count - return number of sinks in sources array @@ -112,7 +135,7 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)                          ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);                  }                  sprintf (ptr, "]"); -                gf_log (this->name, GF_LOG_TRACE, +                gf_log (this->name, GF_LOG_DEBUG,                          "pending_matrix: %s", buf);          } @@ -718,7 +741,7 @@ out:  void  afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, -                         int32_t *delta_matrix[], int success[], +                         int32_t *delta_matrix[], unsigned char success[],                           int child_count, afr_transaction_type type)  {          /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ @@ -970,12 +993,13 @@ afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)          return 0;  } -static void +void  afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,                                     xlator_t *this,                                     int32_t op_ret, int32_t op_errno,                                     inode_t *inode, struct iatt *buf, -                                   dict_t *xattr, struct iatt *postparent) +                                   dict_t *xattr, struct iatt *postparent, +                                   loc_t *loc)  {          int              child_index = 0;          afr_local_t     *local = NULL; @@ -991,15 +1015,13 @@ afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,          {                  if (op_ret == 0) {                          sh->buf[child_index] = *buf; -                        sh->parentbuf        = *postparent;                          sh->parentbufs[child_index] = *postparent;                          sh->success_children[sh->success_count] = child_index;                          sh->success_count++;                          sh->xattr[child_index] = dict_ref (xattr);                  } else { -                        gf_log (this->name, GF_LOG_ERROR, -                                "path %s on subvolume %s => -1 (%s)", -                                local->loc.path, +                        gf_log (this->name, GF_LOG_ERROR, "path %s on subvolume" +                                " %s => -1 (%s)", loc->path,                                  priv->children[child_index]->name,                                  strerror (op_errno));                          local->self_heal.child_errno[child_index] = op_errno; @@ -1201,6 +1223,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this)          if (sh->gfid_sh_success_cbk)                  sh->gfid_sh_success_cbk (frame, this); +        sh->type = sh->buf[sh->source].ia_type;          sh_missing_entries_create (frame, this);          return;  out: @@ -1227,7 +1250,7 @@ afr_sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie,          afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret,                                             op_errno, inode, buf, xattr, -                                           postparent); +                                           postparent, &local->loc);          call_count = afr_frame_return (frame);          if (call_count == 0) @@ -1417,6 +1440,8 @@ afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this,          for (i = 0; i < priv->child_count; i++) {                  if (!purge_condition (local, priv, i))                          continue; +                gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " +                        "on %d", local->loc.path, i);                  afr_sh_call_entry_expunge_remove (frame, this,                                                    (long) i, &sh->buf[i],                                                    afr_sh_remove_entry_cbk); @@ -1536,6 +1561,8 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this)                                                 sh->child_errno,                                                 priv->child_count, ENOENT);          if (fresh_child_enoents == fresh_parent_count) { +                gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s", +                        local->loc.path);                  afr_sh_set_error (sh, ENOENT);                  sh->op_failed = 1;                  afr_sh_purge_entry (frame, this); @@ -1570,10 +1597,13 @@ afr_sh_children_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                              struct iatt *postparent)  {          int              call_count = 0; +        afr_local_t     *local = NULL; + +        local = frame->local;          afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret,                                             op_errno, inode, buf, xattr, -                                           postparent); +                                           postparent, &local->loc);          call_count = afr_frame_return (frame);          if (call_count == 0) @@ -1669,10 +1699,15 @@ afr_sh_conflicting_entry_lookup_cbk (call_frame_t *frame, void *cookie,                                       dict_t *xattr, struct iatt *postparent)  {          int              call_count = 0; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal;          afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret,                                             op_errno, inode, buf, xattr, -                                           postparent); +                                           postparent, &sh->parent_loc);          call_count = afr_frame_return (frame);          if (call_count == 0) @@ -1716,8 +1751,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,          priv  = this->private;          sh    = &local->self_heal; -        call_count = afr_up_children_count (priv->child_count, -                                            local->child_up); +        call_count = afr_up_children_count (local->child_up, priv->child_count);          local->call_count = call_count; @@ -1728,7 +1762,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,                  if (set_gfid) {                          gf_log (this->name, GF_LOG_DEBUG,                                  "looking up %s with gfid: %s", -                                local->loc.path, uuid_utoa (sh->sh_gfid_req)); +                                loc->path, uuid_utoa (sh->sh_gfid_req));                          GF_ASSERT (!uuid_is_null (sh->sh_gfid_req));                          afr_set_dict_gfid (xattr_req, sh->sh_gfid_req);                  } @@ -1739,7 +1773,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,                  if (local->child_up[i]) {                          gf_log (this->name, GF_LOG_DEBUG,                                  "looking up %s on subvolume %s", -                                local->loc.path, priv->children[i]->name); +                                loc->path, priv->children[i]->name);                          STACK_WIND_COOKIE (frame,                                             lookup_cbk, @@ -1906,12 +1940,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)          shc->need_metadata_self_heal = sh->need_metadata_self_heal;          shc->need_entry_self_heal = sh->need_entry_self_heal;          shc->forced_merge = sh->forced_merge; -        shc->healing_fd_opened = sh->healing_fd_opened;          shc->data_lock_held = sh->data_lock_held; -        if (sh->healing_fd && !sh->healing_fd_opened) -                shc->healing_fd = fd_ref (sh->healing_fd); -        else -                shc->healing_fd = sh->healing_fd;          shc->background = sh->background;          shc->type = sh->type; @@ -1919,7 +1948,8 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)          if (l->loc.path)                  loc_copy (&lc->loc, &l->loc); -        lc->child_up  = memdup (l->child_up, priv->child_count); +        lc->child_up  = memdup (l->child_up, +                                sizeof (*lc->child_up) * priv->child_count);          if (l->xattr_req)                  lc->xattr_req = dict_ref (l->xattr_req); @@ -1930,7 +1960,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)          if (l->internal_lock.inode_locked_nodes)                  lc->internal_lock.inode_locked_nodes =                          memdup (l->internal_lock.inode_locked_nodes, -                                priv->child_count); +                                sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count);          else                  lc->internal_lock.inode_locked_nodes =                          GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), @@ -1939,7 +1969,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)          if (l->internal_lock.entry_locked_nodes)                  lc->internal_lock.entry_locked_nodes =                          memdup (l->internal_lock.entry_locked_nodes, -                                priv->child_count); +                                sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count);          else                  lc->internal_lock.entry_locked_nodes =                          GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), @@ -1948,7 +1978,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)          if (l->internal_lock.locked_nodes)                  lc->internal_lock.locked_nodes =                          memdup (l->internal_lock.locked_nodes, -                                priv->child_count); +                                sizeof (*lc->internal_lock.locked_nodes) * priv->child_count);          else                  lc->internal_lock.locked_nodes =                          GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), @@ -1994,7 +2024,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)          FRAME_SU_UNDO (bgsh_frame, afr_local_t); -        if (!sh->unwound) { +        if (!sh->unwound && sh->unwind) {                  sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno);          } @@ -2068,8 +2098,8 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)                                      gf_afr_mt_iatt);          sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int),                                       gf_afr_mt_int); -        sh->success = GF_CALLOC (priv->child_count, sizeof (int), -                                 gf_afr_mt_int); +        sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), +                                 gf_afr_mt_char);          sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *),                                 gf_afr_mt_dict_t);          sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 043ebea2da6..3df5f0a0aa6 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -53,7 +53,7 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,  void  afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, -                         int32_t *delta_matrix[], int success[], +                         int32_t *delta_matrix[], unsigned char success[],                           int child_count, afr_transaction_type type);  int @@ -82,6 +82,15 @@ afr_build_sources (xlator_t *xlator, dict_t **xattr, struct iatt *bufs,                     int32_t *success_children, afr_transaction_type type);  void  afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); + +void +afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, +                                   xlator_t *this, +                                   int32_t op_ret, int32_t op_errno, +                                   inode_t *inode, struct iatt *buf, +                                   dict_t *xattr, struct iatt *postparent, +                                   loc_t *loc); +  int  afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,                        afr_lookup_cbk_t lookup_cbk, gf_boolean_t set_gfid); @@ -95,4 +104,22 @@ int  afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,                               int child_index, struct iatt *buf,                               struct iatt *postparent); +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +                    afr_lock_cbk_t lock_cbk); +afr_local_t * +afr_local_copy (afr_local_t *l, xlator_t *this); +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, +                  off_t start, off_t len, +                  afr_lock_cbk_t success_handler, +                  afr_lock_cbk_t failure_handler); +void +afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); +void +afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); +typedef int +(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, +                       xlator_t *this, int32_t op_ret, int32_t op_errno, +                       dict_t *xattr);  #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index dcaad9c8b47..5db2d94f5cb 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -50,6 +50,18 @@  #include "afr-self-heal-algorithm.h" +extern int +sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); + +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); + +int +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this); +  int  afr_sh_data_done (call_frame_t *frame, xlator_t *this)  { @@ -61,20 +73,6 @@ afr_sh_data_done (call_frame_t *frame, xlator_t *this)          sh = &local->self_heal;          priv = this->private; -        /* -          TODO: cleanup sh->* -        */ - -        if (sh->healing_fd && !sh->healing_fd_opened) { -                /* unref only if we created the fd ourselves */ - -                fd_unref (sh->healing_fd); -                sh->healing_fd = NULL; -        } - -        /* for (i = 0; i < priv->child_count; i++) */ -        /*        sh->locked_nodes[i] = 0; */ -          sh->completion_cbk (frame, this);          return 0; @@ -97,7 +95,7 @@ afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          {                  if (op_ret == -1) {                          gf_log (this->name, GF_LOG_INFO, -                                "flush or setattr failed on %s on subvolume %s: %s", +                                "flush failed on %s on subvolume %s: %s",                                  local->loc.path, priv->children[child_index]->name,                                  strerror (op_errno));                  } @@ -113,18 +111,6 @@ afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          return 0;  } - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                         int32_t op_ret, int32_t op_errno, struct iatt *statpre, -                         struct iatt *statpost) -{ -        afr_sh_data_flush_cbk (frame, cookie, this, op_ret, op_errno); - -        return 0; -} - -  int  afr_sh_data_close (call_frame_t *frame, xlator_t *this)  { @@ -134,8 +120,6 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)          int              i          = 0;          int              call_count = 0;          int              source     = 0; -        int32_t          valid      = 0; -        struct iatt      stbuf      = {0,};          local = frame->local;          sh    = &local->self_heal; @@ -143,30 +127,11 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)          source = sh->source; -        valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - -        stbuf.ia_atime = sh->buf[source].ia_atime; -        stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; -        stbuf.ia_mtime = sh->buf[source].ia_mtime; -        stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - -        if (sh->healing_fd_opened) { -                /* not our job to close the fd */ - -                afr_sh_data_done (frame, this); -                return 0; -        } - -        if (!sh->healing_fd) { -                afr_sh_data_done (frame, this); -                return 0; -        } - -        call_count        = (sh->active_sinks + 1) * 2; +        call_count        = (sh->active_sinks + 1);          local->call_count = call_count;          /* closed source */ -        gf_log (this->name, GF_LOG_TRACE, +        gf_log (this->name, GF_LOG_DEBUG,                  "closing fd of %s on %s",                  local->loc.path, priv->children[sh->source]->name); @@ -177,14 +142,6 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)                             sh->healing_fd);          call_count--; -        STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, -                           (void *) (long) sh->source, -                           priv->children[sh->source], -                           priv->children[sh->source]->fops->setattr, -                           &local->loc, &stbuf, valid); - -        call_count--; -          if (call_count == 0)                  return 0; @@ -192,7 +149,7 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)                  if (sh->sources[i] || !local->child_up[i])                          continue; -                gf_log (this->name, GF_LOG_TRACE, +                gf_log (this->name, GF_LOG_DEBUG,                          "closing fd of %s on %s",                          local->loc.path, priv->children[i]->name); @@ -202,14 +159,6 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)                                     priv->children[i]->fops->flush,                                     sh->healing_fd); -                call_count--; - -                STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->setattr, -                                   &local->loc, &stbuf, valid); -                  if (!--call_count)                          break;          } @@ -217,28 +166,27 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this)          return 0;  } -  int -afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                       int32_t op_ret, int32_t op_errno) +afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int32_t op_ret, int32_t op_errno, struct iatt *statpre, +                         struct iatt *statpost)  { -        afr_local_t * local = NULL; -        int           call_count = 0; -        int           child_index = (long) cookie; + +        afr_local_t   *local       = NULL; +        afr_private_t *priv        = NULL; +        int            call_count  = 0; +        int            child_index = (long) cookie;          local = frame->local; +        priv = this->private;          LOCK (&frame->lock);          {                  if (op_ret == -1) {                          gf_log (this->name, GF_LOG_INFO, -                                "locking inode of %s on child %d failed: %s", -                                local->loc.path, child_index, +                                "setattr failed on %s on subvolume %s: %s", +                                local->loc.path, priv->children[child_index]->name,                                  strerror (op_errno)); -                } else { -                        gf_log (this->name, GF_LOG_TRACE, -                                "inode of %s on child %d locked", -                                local->loc.path, child_index);                  }          }          UNLOCK (&frame->lock); @@ -246,15 +194,114 @@ afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          call_count = afr_frame_return (frame);          if (call_count == 0) { -                afr_sh_data_close (frame, this); +                afr_sh_data_finish (frame, this);          }          return 0;  } +int +afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local      = NULL; +        afr_private_t   *priv       = NULL; +        afr_self_heal_t *sh         = NULL; +        int              i          = 0; +        int              call_count = 0; +        int              source     = 0; +        int32_t          valid      = 0; +        struct iatt      stbuf      = {0,}; + +        local = frame->local; +        sh    = &local->self_heal; +        priv  = this->private; + +        source = sh->source; + +        valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); + +        stbuf.ia_atime = sh->buf[source].ia_atime; +        stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; +        stbuf.ia_mtime = sh->buf[source].ia_mtime; +        stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; + +        call_count        = afr_set_elem_count_get (sh->success, +                                                    priv->child_count); +        local->call_count = call_count; + +        if (call_count == 0) { +                GF_ASSERT (0); +                afr_sh_data_finish (frame, this); +                return 0; +        } + +        for (i = 0; i < priv->child_count; i++) { +                if (!sh->success[i]) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->setattr, +                                   &local->loc, &stbuf, valid); + +                if (!--call_count) +                        break; +        } + +        return 0; +}  int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, +                               xlator_t *this, int32_t op_ret, int32_t op_errno, +                               struct iatt *buf) +{ +        afr_private_t   *priv  = NULL; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; +        int child_index = (long) cookie; + +        local = frame->local; +        sh = &local->self_heal; +        priv = this->private; + +        GF_ASSERT (sh->source == child_index); +        if (op_ret != -1) +                sh->buf[child_index] = *buf; +        afr_sh_data_setattr (frame, this); + +        return 0; +} + +/* + * If there are any writes after the self-heal is triggered then the + * stbuf stored in local->self_heal.buf[] will be invalid so we do one more + * stat on the source and then set the [am]times + */ +int +afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local      = NULL; +        afr_private_t   *priv       = NULL; +        afr_self_heal_t *sh         = NULL; + +        local = frame->local; +        sh    = &local->self_heal; +        priv  = this->private; + +        STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, +                           (void *) (long) sh->source, +                           priv->children[sh->source], +                           priv->children[sh->source]->fops->fstat, +                           sh->healing_fd); +        return 0; +} + +//Fun fact, lock_cbk is being used for both lock & unlock +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +                    afr_lock_cbk_t lock_cbk)  {          afr_local_t         *local    = NULL;          afr_internal_lock_t *int_lock = NULL; @@ -264,15 +311,15 @@ afr_sh_data_unlock (call_frame_t *frame, xlator_t *this)          int_lock = &local->internal_lock;          sh       = &local->self_heal; -        GF_ASSERT (!sh->data_lock_held); +        GF_ASSERT (sh->data_lock_held); -        int_lock->lock_cbk = afr_sh_data_close; +        sh->data_lock_held = _gf_false; +        int_lock->lock_cbk = lock_cbk;          afr_unlock (frame, this);          return 0;  } -  int  afr_sh_data_finish (call_frame_t *frame, xlator_t *this)  { @@ -285,44 +332,52 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this)          gf_log (this->name, GF_LOG_DEBUG,                  "finishing data selfheal of %s", local->loc.path); -        if (!sh->data_lock_held) -                afr_sh_data_unlock (frame, this); +        if (sh->data_lock_held) +                afr_sh_data_unlock (frame, this, afr_sh_data_close);          else                  afr_sh_data_close (frame, this);          return 0;  } +int +afr_sh_data_fail (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        gf_log (this->name, GF_LOG_DEBUG, +                "finishing failed data selfheal of %s", local->loc.path); + +        sh->op_failed = 1; +        if (sh->data_lock_held) +                afr_sh_data_unlock (frame, this, afr_sh_data_close); +        else +                afr_sh_data_close (frame, this); +        return 0; +}  int  afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,                                 xlator_t *this, int32_t op_ret,                                 int32_t op_errno, dict_t *xattr)  { -        afr_local_t     *local     = NULL;          int             call_count = 0; -        long            i          = 0; -        afr_self_heal_t *sh        = NULL; -        afr_private_t   *priv      = NULL; - -        local = frame->local; -        priv  = this->private; -        sh = &local->self_heal; -        i = (long)cookie; -        afr_children_add_child (sh->fresh_children, i, priv->child_count);          call_count = afr_frame_return (frame);          if (call_count == 0) { -                afr_inode_set_read_ctx (this, sh->inode, sh->source, -                                        sh->fresh_children); -                afr_sh_data_finish (frame, this); +                afr_sh_data_lock (frame, this, 0, 0, +                                  afr_post_sh_big_lock_success, +                                  afr_post_sh_big_lock_failure);          }          return 0;  } -  int  afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)  { @@ -339,6 +394,9 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)          afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, sh->success,                                   priv->child_count, AFR_DATA_TRANSACTION); +        gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %"PRIu64, +                frame->root->lk_owner); +        afr_sh_print_pending_matrix (sh->delta_matrix, this);          erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,                                   gf_afr_mt_dict_t); @@ -355,12 +413,13 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)          afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,                                 priv->child_count, AFR_DATA_TRANSACTION); +        GF_ASSERT (call_count);          local->call_count = call_count;          for (i = 0; i < priv->child_count; i++) {                  if (!erase_xattr[i])                          continue; -                gf_log (this->name, GF_LOG_TRACE, +                gf_log (this->name, GF_LOG_DEBUG,                          "erasing pending flags from %s on %s",                          local->loc.path, priv->children[i]->name); @@ -385,85 +444,6 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)  } -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, -                      struct iatt *postbuf) -{ -        afr_private_t * priv = NULL; -        afr_local_t * local  = NULL; -        int              call_count = 0; -        int              child_index = 0; - -        priv = this->private; -        local = frame->local; - -        child_index = (long) cookie; - -        LOCK (&frame->lock); -        { -                if (op_ret == -1) -                        gf_log (this->name, GF_LOG_INFO, -                                "ftruncate of %s on subvolume %s failed (%s)", -                                local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); -                else -                        gf_log (this->name, GF_LOG_TRACE, -                                "ftruncate of %s on subvolume %s completed", -                                local->loc.path, -                                priv->children[child_index]->name); -        } -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                afr_sh_data_erase_pending (frame, this); -        } - -        return 0; -} - - -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) -{ -        afr_private_t * priv = NULL; -        afr_local_t * local  = NULL; -        afr_self_heal_t *sh  = NULL; -        int             *sources = NULL; -        int              call_count = 0; -        int              i = 0; - - -        priv = this->private; -        local = frame->local; -        sh = &local->self_heal; - -        sources = sh->sources; -        call_count = sh->active_sinks; - -        local->call_count = call_count; - -        for (i = 0; i < priv->child_count; i++) { -                if (sources[i] || !local->child_up[i]) -                        continue; - -                STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, -                                   (void *) (long) i, -                                   priv->children[i], -                                   priv->children[i]->fops->ftruncate, -                                   sh->healing_fd, sh->file_size); - -                if (!--call_count) -                        break; -        } - -        return 0; -} - -  static struct afr_sh_algorithm *  sh_algo_from_name (xlator_t *this, char *name)  { @@ -549,64 +529,138 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)          afr_local_t     *local = NULL;          afr_self_heal_t *sh = NULL;          afr_private_t   *priv = NULL; -        int              active_sinks = 0; -        int              source = 0; -        int              i = 0;          struct afr_sh_algorithm *sh_algo = NULL;          local = frame->local;          sh = &local->self_heal;          priv = this->private; -        source = sh->source; +        sh->algo_completion_cbk = afr_sh_data_erase_pending; +        sh->algo_abort_cbk      = afr_sh_data_fail; -        for (i = 0; i < priv->child_count; i++) { -                if (sh->sources[i] == 0 && local->child_up[i] == 1) { -                        active_sinks++; -                        sh->success[i] = 1; -                } -        } -        sh->success[source] = 1; +        sh_algo = afr_sh_data_pick_algo (frame, this); -        if (active_sinks == 0) { -                gf_log (this->name, GF_LOG_INFO, -                        "no active sinks for performing self-heal on file %s", -                        local->loc.path); -                afr_sh_data_finish (frame, this); -                return 0; +        sh_algo->fn (frame, this); + +        return 0; +} + +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                      struct iatt *postbuf) +{ +        afr_private_t * priv = NULL; +        afr_local_t * local  = NULL; +        int              call_count = 0; +        int              child_index = 0; + +        priv = this->private; +        local = frame->local; + +        child_index = (long) cookie; + +        LOCK (&frame->lock); +        { +                if (op_ret == -1) +                        gf_log (this->name, GF_LOG_INFO, +                                "ftruncate of %s on subvolume %s failed (%s)", +                                local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                else +                        gf_log (this->name, GF_LOG_DEBUG, +                                "ftruncate of %s on subvolume %s completed", +                                local->loc.path, +                                priv->children[child_index]->name);          } -        sh->active_sinks = active_sinks; +        UNLOCK (&frame->lock); -        gf_log (this->name, GF_LOG_DEBUG, -                "self-healing file %s from subvolume %s to %d other", -                local->loc.path, priv->children[source]->name, active_sinks); +        call_count = afr_frame_return (frame); -        sh->algo_completion_cbk = afr_sh_data_trim_sinks; -        sh->algo_abort_cbk      = afr_sh_data_finish; +        if (call_count == 0) +                afr_sh_data_sync_prepare (frame, this); -        sh_algo = afr_sh_data_pick_algo (frame, this); +        return 0; +} -        sh_algo->fn (frame, this); + +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ +        afr_private_t * priv = NULL; +        afr_local_t * local  = NULL; +        afr_self_heal_t *sh  = NULL; +        int             *sources = NULL; +        int              call_count = 0; +        int              i = 0; + + +        priv = this->private; +        local = frame->local; +        sh = &local->self_heal; + +        sources = sh->sources; +        call_count = sh->active_sinks; + +        local->call_count = call_count; + +        for (i = 0; i < priv->child_count; i++) { +                if (sources[i] || !local->child_up[i]) +                        continue; + +                STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, +                                   (void *) (long) i, +                                   priv->children[i], +                                   priv->children[i]->fops->ftruncate, +                                   sh->healing_fd, sh->file_size); + +                if (!--call_count) +                        break; +        }          return 0;  } +int +afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) +{ +        afr_private_t   *priv = NULL; +        int             ret = 0; + +        priv = this->private; +        sh->source = afr_sh_select_source (sh->sources, priv->child_count); +        if (sh->source < 0) { +                ret = -1; +                goto out; +        } + +        afr_reset_children (sh->fresh_children, priv->child_count); +        afr_get_fresh_children (sh->success_children, sh->sources, +                                sh->fresh_children, priv->child_count); +        afr_inode_set_read_ctx (this, sh->inode, sh->source, +                                sh->fresh_children); +out: +        return ret; +}  int  afr_sh_data_fix (call_frame_t *frame, xlator_t *this)  {          afr_local_t     *local      = NULL; -        afr_local_t *    orig_local = NULL;          afr_self_heal_t *sh = NULL;          afr_private_t   *priv = NULL;          int              nsources = 0;          int              source = 0;          int              i = 0; +        int              ret = 0;          local = frame->local;          sh = &local->self_heal;          priv = this->private; +        gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %"PRIu64, +                frame->root->lk_owner);          nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix,                                        sh->sources, sh->success_children,                                        AFR_DATA_TRANSACTION); @@ -643,30 +697,26 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)                  local->govinda_gOvinda = 1; -                afr_sh_data_finish (frame, this); +                afr_sh_data_fail (frame, this);                  return 0;          } -        source = afr_sh_select_source (sh->sources, priv->child_count); - -        if (source == -1) { +        ret = afr_sh_inode_set_read_ctx (sh, this); +        if (ret) {                  gf_log (this->name, GF_LOG_DEBUG,                          "No active sources found."); -                afr_sh_data_finish (frame, this); +                afr_sh_data_fail (frame, this);                  return 0;          } -        sh->source     = source; -        sh->block_size = 65536; /* TODO: make it configurable or use macro */ +        source     = sh->source; +        sh->block_size = this->ctx->page_size;          sh->file_size  = sh->buf[source].ia_size;          if (FILE_HAS_HOLES (&sh->buf[source]))                  sh->file_has_holes = 1; -        orig_local = sh->orig_frame->local; -        orig_local->cont.lookup.buf.ia_size = sh->buf[source].ia_size; -          /* detect changes not visible through pending flags -- JIC */          for (i = 0; i < priv->child_count; i++) {                  if (i == source || sh->child_errno[i]) @@ -676,27 +726,25 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)                          sh->sources[i] = 0;          } -        afr_reset_children (sh->fresh_children, priv->child_count); -        afr_get_fresh_children (sh->success_children, sh->sources, -                                sh->fresh_children, priv->child_count); -        afr_inode_set_read_ctx (this, sh->inode, sh->source, -                                sh->fresh_children); - -        /* -          quick-read might have read the file, so send xattr from -          the source subvolume (http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=815) -        */ - -        dict_unref (orig_local->cont.lookup.xattr); -        if (orig_local->cont.lookup.xattrs) -                orig_local->cont.lookup.xattr = dict_ref (orig_local->cont.lookup.xattrs[sh->source]); - -        if (sh->background) { +        if (sh->background && sh->unwind) {                  sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno);                  sh->unwound = _gf_true;          } -        afr_sh_data_sync_prepare (frame, this); +        afr_sh_mark_source_sinks (frame, this); +        if (sh->active_sinks == 0) { +                gf_log (this->name, GF_LOG_INFO, +                        "no active sinks for performing self-heal on file %s", +                        local->loc.path); +                afr_sh_data_finish (frame, this); +                return 0; +        } + +        gf_log (this->name, GF_LOG_DEBUG, +                "self-healing file %s from subvolume %s to %d other", +                local->loc.path, priv->children[sh->source]->name, +                sh->active_sinks); +        afr_sh_data_trim_sinks (frame, this);          return 0;  } @@ -855,8 +903,8 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)          local = frame->local;          sh    = &local->self_heal; -        call_count = afr_up_children_count (priv->child_count, -                                            local->child_up); +        call_count = afr_up_children_count (local->child_up, +                                            priv->child_count);          local->call_count = call_count; @@ -878,16 +926,14 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)          return 0;  } - -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, -                          xlator_t *this, int32_t op_ret, int32_t op_errno, -                          dict_t *xattr) +void +afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, +                                     xlator_t *this, int32_t op_ret, +                                     int32_t op_errno, dict_t *xattr)  {          afr_private_t   *priv  = NULL;          afr_local_t     *local = NULL;          afr_self_heal_t *sh = NULL; -        int call_count  = -1;          int child_index = (long) cookie;          local = frame->local; @@ -903,12 +949,55 @@ afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,                                  priv->children[child_index]->name);                          sh->xattr[child_index] = dict_ref (xattr); +                        sh->success_children[sh->success_count] = child_index; +                        sh->success_count++;                  }          }          UNLOCK (&frame->lock); +} + +int +afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, +                               xlator_t *this, int32_t op_ret, int32_t op_errno, +                               dict_t *xattr) +{ +        int             call_count  = -1; +        int             ret = 0; +        afr_local_t     *local = NULL; +        afr_self_heal_t *sh = NULL; + +        afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, +                                             op_errno, xattr); +        local = frame->local; +        sh = &local->self_heal;          call_count = afr_frame_return (frame); +        if (call_count == 0) { +                (void) afr_build_sources (this, sh->xattr, NULL, +                                          sh->pending_matrix, +                                          sh->sources, sh->success_children, +                                          AFR_DATA_TRANSACTION); +                ret = afr_sh_inode_set_read_ctx (sh, this); +                if (ret) +                        afr_sh_data_fail (frame, this); +                else +                        afr_sh_set_timestamps (frame, this); +        } +        return 0; +} + +int +afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, +                          xlator_t *this, int32_t op_ret, int32_t op_errno, +                          dict_t *xattr) +{ +        int call_count  = -1; + +        afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, +                                             op_errno, xattr); + +        call_count = afr_frame_return (frame);          if (call_count == 0) {                  afr_sh_data_fstat (frame, this);          } @@ -918,7 +1007,8 @@ afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,  int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) +afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this, +                      afr_fxattrop_cbk_t fxattrop_cbk)  {          afr_self_heal_t *sh    = NULL;          afr_local_t     *local = NULL; @@ -933,8 +1023,8 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this)          local = frame->local;          sh    = &local->self_heal; -        call_count = afr_up_children_count (priv->child_count, -                                            local->child_up); +        call_count = afr_up_children_count (local->child_up, +                                            priv->child_count);          local->call_count = call_count; @@ -963,9 +1053,12 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this)                  }          } +        afr_reset_xattr (sh->xattr, priv->child_count); +        afr_reset_children (sh->success_children, priv->child_count); +        sh->success_count = 0;          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { -                        STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, +                        STACK_WIND_COOKIE (frame, fxattrop_cbk,                                             (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->fxattrop, @@ -992,7 +1085,45 @@ out:  }  int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this); +afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t   *local = NULL; +        afr_self_heal_t *sh = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        sh->data_lock_held = _gf_true; +        afr_sh_data_fxattrop (frame, this, afr_sh_data_fxattrop_cbk); +        return 0; +} + +int +afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +{ +        afr_internal_lock_t *int_lock = NULL; +        afr_local_t         *local    = NULL; +        afr_self_heal_t     *sh       = NULL; + +        local    = frame->local; +        int_lock = &local->internal_lock; +        sh       = &local->self_heal; + +        if (int_lock->lock_op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " +                        "failed for %s. by %"PRIu64, +                        local->loc.path, frame->root->lk_owner); +                sh->data_lock_failure_handler (frame, this); +        } else { + +                gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " +                        "done for %s by %"PRIu64". Proceding to self-heal", +                        local->loc.path, frame->root->lk_owner); +                sh->data_lock_success_handler (frame, this); +        } + +        return 0; +}  int  afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) @@ -1006,22 +1137,24 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)          sh       = &local->self_heal;          if (int_lock->lock_op_ret < 0) { -                gf_log (this->name, GF_LOG_ERROR, "Non Blocking data inodelks " -                        "failed for %s.", local->loc.path); -                sh->op_failed = 1; -                afr_sh_data_done (frame, this); +                gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " +                        "failed for %s. by %"PRIu64, +                        local->loc.path, frame->root->lk_owner); +                int_lock->lock_cbk = afr_sh_data_post_blocking_inodelk_cbk; +                afr_blocking_lock (frame, this);          } else {                  gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " -                        "done for %s. Proceeding to FOP", local->loc.path); -                afr_sh_data_fxattrop (frame, this); +                        "done for %s by %"PRIu64". Proceeding to self-heal", +                        local->loc.path, frame->root->lk_owner); +                sh->data_lock_success_handler (frame, this);          }          return 0;  }  int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len)  {          afr_internal_lock_t *int_lock = NULL;          afr_local_t         *local    = NULL; @@ -1036,8 +1169,8 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this)          afr_set_lock_number (frame, this); -        int_lock->lk_flock.l_start = 0; -        int_lock->lk_flock.l_len   = 0; +        int_lock->lk_flock.l_start = start; +        int_lock->lk_flock.l_len   = len;          int_lock->lk_flock.l_type  = F_WRLCK;          int_lock->lock_cbk         = afr_sh_data_post_nonblocking_inodelk_cbk; @@ -1046,9 +1179,45 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this)          return 0;  } +int +afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local     = NULL; +        afr_self_heal_t *sh        = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        GF_ASSERT (sh->old_loop_frame); +        sh_loop_finish (sh->old_loop_frame, this); +        sh->old_loop_frame = NULL; +        sh->data_lock_held = _gf_true; +        afr_sh_data_fxattrop (frame, this, afr_post_sh_data_fxattrop_cbk); +        return 0; +}  int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t     *local     = NULL; +        afr_self_heal_t *sh        = NULL; + +        local = frame->local; +        sh = &local->self_heal; + +        GF_ASSERT (sh->old_loop_frame); +        sh_loop_finish (sh->old_loop_frame, this); +        sh->old_loop_frame = NULL; +        afr_sh_set_timestamps (frame, this); +        return 0; +} + + +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this, +                  off_t start, off_t len, +                  afr_lock_cbk_t success_handler, +                  afr_lock_cbk_t failure_handler)  {          afr_local_t *   local = NULL;          afr_private_t * priv  = NULL; @@ -1059,18 +1228,11 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this)          sh    = &local->self_heal;          priv  = this->private; -        if (sh->data_lock_held) { -                /* caller has held the lock already, -                   so skip locking */ - -                afr_sh_data_fxattrop (frame, this); -                return 0; -        } - -        return afr_sh_data_lock_rec (frame, this); +        sh->data_lock_success_handler = success_handler; +        sh->data_lock_failure_handler = failure_handler; +        return afr_sh_data_lock_rec (frame, this, start, len);  } -  int  afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, fd_t *fd) @@ -1113,7 +1275,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          if (call_count == 0) {                  if (sh->op_failed) { -                        afr_sh_data_finish (frame, this); +                        afr_sh_data_fail (frame, this);                          return 0;                  } @@ -1121,7 +1283,9 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          "fd for %s opened, commencing sync",                          local->loc.path); -                afr_sh_data_lock (frame, this); +                afr_sh_data_lock (frame, this, 0, 0, +                                  afr_sh_data_big_lock_success, +                                  afr_sh_data_fail);          }          return 0; @@ -1142,14 +1306,7 @@ afr_sh_data_open (call_frame_t *frame, xlator_t *this)          sh = &local->self_heal;          priv = this->private; -        if (sh->healing_fd_opened) { -                /* caller has opened the fd for us already, so skip open */ - -                afr_sh_data_lock (frame, this); -                return 0; -        } - -        call_count = afr_up_children_count (priv->child_count, local->child_up); +        call_count = afr_up_children_count (local->child_up, priv->child_count);          local->call_count = call_count;          fd = fd_create (local->loc.inode, frame->root->pid); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 9e80cb3d5a5..ddca2619db8 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -60,18 +60,10 @@ afr_sh_entry_done (call_frame_t *frame, xlator_t *this)          sh = &local->self_heal;          priv = this->private; -        /* -          TODO: cleanup sh->* -        */ -          if (sh->healing_fd)                  fd_unref (sh->healing_fd);          sh->healing_fd = NULL; -        /* for (i = 0; i < priv->child_count; i++) { */ -        /*        sh->locked_nodes[i] = 0; */ -        /* } */ -          sh->completion_cbk (frame, this);          return 0; @@ -2192,9 +2184,7 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)          afr_local_t     *local = NULL;          afr_self_heal_t *sh = NULL;          afr_private_t   *priv = NULL; -        int              active_sinks = 0;          int              source = 0; -        int              i = 0;          local = frame->local;          sh = &local->self_heal; @@ -2202,37 +2192,31 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)          source = sh->source; -        for (i = 0; i < priv->child_count; i++) { -                if (sh->sources[i] == 0 && local->child_up[i] == 1) { -                        active_sinks++; -                        sh->success[i] = 1; -                } -        } +        afr_sh_mark_source_sinks (frame, this);          if (source != -1)                  sh->success[source] = 1; -        if (active_sinks == 0) { +        if (sh->active_sinks == 0) {                  gf_log (this->name, GF_LOG_TRACE,                          "no active sinks for self-heal on dir %s",                          local->loc.path);                  afr_sh_entry_finish (frame, this);                  return 0;          } -        if (source == -1 && active_sinks < 2) { +        if (source == -1 && sh->active_sinks < 2) {                  gf_log (this->name, GF_LOG_TRACE,                          "cannot sync with 0 sources and 1 sink on dir %s",                          local->loc.path);                  afr_sh_entry_finish (frame, this);                  return 0;          } -        sh->active_sinks = active_sinks;          if (source != -1)                  gf_log (this->name, GF_LOG_DEBUG,                          "self-healing directory %s from subvolume %s to "                          "%d other",                          local->loc.path, priv->children[source]->name, -                        active_sinks); +                        sh->active_sinks);          else                  gf_log (this->name, GF_LOG_DEBUG,                          "no active sources for %s found. " @@ -2302,25 +2286,13 @@ afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,                           inode_t *inode, struct iatt *buf, dict_t *xattr,                           struct iatt *postparent)  { +        int             call_count  = 0;          afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; - -        int call_count  = -1; -        int child_index = (long) cookie;          local = frame->local; -        sh = &local->self_heal; - -        LOCK (&frame->lock); -        { -                if (op_ret != -1) { -                        sh->xattr[child_index] = dict_ref (xattr); -                        sh->buf[child_index] = *buf; -                        sh->success_children[sh->success_count] = child_index; -                        sh->success_count++; -                } -        } -        UNLOCK (&frame->lock); +        afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, +                                           op_errno, inode, buf, xattr, +                                           postparent, &local->loc);          call_count = afr_frame_return (frame); diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 5445132ab8c..fb92cd999ed 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -55,7 +55,6 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)          afr_local_t     *local = NULL;          afr_self_heal_t *sh = NULL;          afr_private_t   *priv = NULL; -        int              i = 0;          local = frame->local;          sh = &local->self_heal; @@ -63,18 +62,9 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)  //      memset (sh->child_errno, 0, sizeof (int) * priv->child_count);          memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); -        memset (sh->success, 0, sizeof (int) * priv->child_count); - -/*         for (i = 0; i < priv->child_count; i++) { */ -/*                 sh->locked_nodes[i] = 1; */ -/*         } */ - -        for (i = 0; i < priv->child_count; i++) { -                if (sh->xattr[i]) -                        dict_unref (sh->xattr[i]); -                sh->xattr[i] = NULL; -        } +        memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); +        afr_reset_xattr (sh->xattr, priv->child_count);          if (local->govinda_gOvinda) {                  gf_log (this->name, GF_LOG_INFO,                          "split-brain detected, aborting selfheal of %s", @@ -438,9 +428,7 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)          afr_local_t     *local = NULL;          afr_self_heal_t *sh = NULL;          afr_private_t   *priv = NULL; -        int              active_sinks = 0;          int              source = 0; -        int              i = 0;          local = frame->local;          sh = &local->self_heal; @@ -448,26 +436,19 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)          source = sh->source; -        for (i = 0; i < priv->child_count; i++) { -                if (sh->sources[i] == 0 && local->child_up[i] == 1) { -                        active_sinks++; -                        sh->success[i] = 1; -                } -        } -        sh->success[source] = 1; - -        if (active_sinks == 0) { +        afr_sh_mark_source_sinks (frame, this); +        if (sh->active_sinks == 0) {                  gf_log (this->name, GF_LOG_DEBUG,                          "no active sinks for performing self-heal on file %s",                          local->loc.path);                  afr_sh_metadata_finish (frame, this);                  return 0;          } -        sh->active_sinks = active_sinks;          gf_log (this->name, GF_LOG_TRACE,                  "syncing metadata of %s from subvolume %s to %d active sinks", -                local->loc.path, priv->children[source]->name, active_sinks); +                local->loc.path, priv->children[source]->name, +                sh->active_sinks);          STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,                      priv->children[source], @@ -558,8 +539,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)          if ((!IA_ISREG (sh->buf[source].ia_type)) &&              (!IA_ISDIR (sh->buf[source].ia_type))) { -                afr_reset_children (sh->fresh_children, -                                          priv->child_count); +                afr_reset_children (sh->fresh_children, priv->child_count);                  afr_get_fresh_children (sh->success_children, sh->sources,                                          sh->fresh_children, priv->child_count);                  afr_inode_set_read_ctx (this, sh->inode, sh->source, @@ -579,43 +559,17 @@ afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                              struct iatt *postparent)  {          afr_local_t     *local = NULL; -        afr_self_heal_t *sh = NULL; -        afr_private_t   *priv = NULL;          int              call_count = 0;          int              child_index = 0;          local = frame->local; -        sh = &local->self_heal; -        priv = this->private;          child_index = (long) cookie; -        LOCK (&frame->lock); -        { -                if (op_ret == 0) { -                        gf_log (this->name, GF_LOG_TRACE, -                                "path %s on subvolume %s is of mode 0%o", -                                local->loc.path, -                                priv->children[child_index]->name, -                                buf->ia_type); - -                        sh->buf[child_index] = *buf; -                        if (xattr) -                                sh->xattr[child_index] = dict_ref (xattr); -                        sh->success_children[sh->success_count] = child_index; -                        sh->success_count++; -                } else { -                        gf_log (this->name, GF_LOG_INFO, -                                "path %s on subvolume %s => -1 (%s)", -                                local->loc.path, -                                priv->children[child_index]->name, -                                strerror (op_errno)); - -                        sh->child_errno[child_index] = op_errno; -                } -        } -        UNLOCK (&frame->lock); +        afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, +                                           op_errno, inode, buf, xattr, +                                           postparent, &local->loc);          call_count = afr_frame_return (frame); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index fc030433b69..6b0e9a7cfe5 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -322,7 +322,7 @@ afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,                  memcpy (arr, pending[i], pending_xattr_size); -                arr[index]++; +                arr[index] = hton32 (ntoh32(arr[index]) + 1);                  ret = dict_set_bin (xattr, priv->pending_key[i],                                      arr, pending_xattr_size); @@ -468,6 +468,32 @@ out:          return;  } +int +afr_fxattrop_call_count (afr_transaction_type type, afr_internal_lock_t *int_lock, +                         unsigned int child_count) +{ +        int call_count = 0; + +        switch (type) { +        case AFR_DATA_TRANSACTION: +        case AFR_METADATA_TRANSACTION: +                call_count = afr_locked_children_count (int_lock->inode_locked_nodes, +                                                        child_count); +        break; + +        case AFR_ENTRY_TRANSACTION: +        case AFR_ENTRY_RENAME_TRANSACTION: +                call_count = afr_locked_children_count (int_lock->entry_locked_nodes, +                                                        child_count); +        break; +        } + +        if (type == AFR_ENTRY_RENAME_TRANSACTION) { +                call_count *= 2; +        } +        return call_count; +} +  int  afr_changelog_post_op (call_frame_t *frame, xlator_t *this) @@ -503,12 +529,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)                  dict_ref (xattr[i]);          } -        call_count = afr_up_children_count (priv->child_count, local->child_up); - -        if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { -                call_count *= 2; -        } - +        call_count = afr_fxattrop_call_count (local->transaction.type, int_lock, +                                              priv->child_count);          local->call_count = call_count;          if (local->fd) @@ -546,6 +568,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)          for (i = 0; i < priv->child_count; i++) {                  if (!local->child_up[i])                          continue; +                if (local->fd && !local->fd_open_on[i]) +                        continue;                  ret = afr_set_pending_dict (priv, xattr[i],                                              local->pending); @@ -750,7 +774,6 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          return 0;  } -  int  afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)  { @@ -762,8 +785,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)          afr_fd_ctx_t *fdctx = NULL;          afr_local_t *local = NULL;          int          piggyback = 0; +        afr_internal_lock_t *int_lock = NULL;          local = frame->local; +        int_lock = &local->internal_lock;          xattr = alloca (priv->child_count * sizeof (*xattr));          memset (xattr, 0, (priv->child_count * sizeof (*xattr))); @@ -773,13 +798,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)                  dict_ref (xattr[i]);          } -        call_count = afr_up_children_count (priv->child_count, -                                            local->child_up); - -        if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { -                call_count *= 2; -        } - +        call_count = afr_fxattrop_call_count (local->transaction.type, int_lock, +                                              priv->child_count);          if (call_count == 0) {                  /* no child is up */                  for (i = 0; i < priv->child_count; i++) { @@ -803,6 +823,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)          for (i = 0; i < priv->child_count; i++) {                  if (!local->child_up[i])                          continue; +                if (local->fd && !local->fd_open_on[i]) +                        continue; +                  ret = afr_set_pending_dict (priv, xattr[i],                                              local->pending); @@ -1246,7 +1269,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)          local = frame->local;          priv  = this->private; -        afr_transaction_local_init (local, priv); +        afr_transaction_local_init (local, this);          local->transaction.resume = afr_transaction_resume;          local->transaction.type   = type; diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 4b4428cc59e..10f274feccc 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -30,4 +30,6 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);  int32_t  afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this);  #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 236a24a6057..a392dbefa97 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -29,6 +29,7 @@  #include "call-stub.h"  #include "compat-errno.h"  #include "afr-mem-types.h" +#include "afr-self-heal-algorithm.h"  #include "libxlator.h" @@ -147,17 +148,8 @@ typedef struct {          gf_boolean_t forced_merge;        /* Is this a self-heal triggered to                                               forcibly merge the directories? */ -        gf_boolean_t healing_fd_opened;   /* true if caller has already -                                             opened fd */ - -        gf_boolean_t data_lock_held;      /* true if caller has already -                                             acquired 0-0 lock */ - -        fd_t *healing_fd;                 /* set if callers has opened fd */ -          gf_boolean_t background;          /* do self-heal in background                                               if possible */ -          ia_type_t type;                   /* st_mode of the entry we're doing                                               self-heal on */          inode_t   *inode;                 /* inode on which the self-heal is @@ -208,7 +200,7 @@ typedef struct {          int source;          int active_source;          int active_sinks; -        int *success; +        unsigned char *success;          unsigned char *locked_nodes;          int lock_count; @@ -217,24 +209,31 @@ typedef struct {          int   op_failed; +        gf_boolean_t data_lock_held; +        gf_boolean_t eof_reached; +        fd_t  *healing_fd;          int   file_has_holes;          blksize_t block_size;          off_t file_size;          off_t offset; +        unsigned char *write_needed; +        uint8_t *checksum;          afr_post_remove_call_t post_remove_call;          loc_t parent_loc;          call_frame_t *orig_frame; +        call_frame_t *old_loop_frame;          gf_boolean_t unwound; -        /* private data for the particular self-heal algorithm */ -        void *private; - -        int (*flush_self_heal_cbk) (call_frame_t *frame, xlator_t *this); +        afr_sh_algo_private_t *private; +        afr_lock_cbk_t data_lock_success_handler; +        afr_lock_cbk_t data_lock_failure_handler;          int (*completion_cbk) (call_frame_t *frame, xlator_t *this); +        int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this);          int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); +        afr_lock_cbk_t loop_completion_cbk;          int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);          void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); @@ -327,12 +326,11 @@ typedef struct {          uint64_t lock_number;          int32_t lk_call_count; +        int32_t lk_expected_count;          int32_t lock_op_ret;          int32_t lock_op_errno; - -        int (*lock_cbk) (call_frame_t *, xlator_t *); - +        afr_lock_cbk_t lock_cbk;  } afr_internal_lock_t;  typedef struct _afr_locked_fd { @@ -365,6 +363,7 @@ typedef struct _afr_local {          loc_t newloc;          fd_t *fd; +        int32_t *fd_open_on;          glusterfs_fop_t fop; @@ -387,7 +386,7 @@ typedef struct _afr_local {          dict_t  *dict;          int      optimistic_change_log; -        int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this); +        int (*fop_call_continue) (call_frame_t *frame, xlator_t *this);          /*            This struct contains the arguments for the "continuation" @@ -685,10 +684,20 @@ typedef struct _afr_local {          struct marker_str     marker;  } afr_local_t; +typedef enum { +        AFR_FD_NOT_OPENED, +        AFR_FD_OPENED, +        AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { +        struct list_head call_list; +        call_frame_t    *frame; +} afr_fd_paused_call_t;  typedef struct {          unsigned int *pre_op_done; -        unsigned int *opened_on;     /* which subvolumes the fd is open on */ +        afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */          unsigned int *pre_op_piggyback;          int flags; @@ -703,6 +712,7 @@ typedef struct {          struct list_head entries; /* needed for readdir failover */          unsigned char *locked_on; /* which subvolumes locks have been successful */ +	struct list_head  paused_calls; /* queued calls while fix_open happens  */  } afr_fd_ctx_t; @@ -790,8 +800,11 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,  void  afr_build_parent_loc (loc_t *parent, loc_t *child); -int -afr_up_children_count (int child_count, unsigned char *child_up); +unsigned int +afr_up_children_count (unsigned char *child_up, unsigned int child_count); + +unsigned int +afr_locked_children_count (unsigned char *children, unsigned int child_count);  gf_boolean_t  afr_is_fresh_lookup (loc_t *loc, xlator_t *this); @@ -831,7 +844,7 @@ int  afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);  int -afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd); +afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);  #define AFR_STACK_UNWIND(fop, frame, params ...)                \          do {                                                    \ @@ -872,7 +885,7 @@ AFR_BASENAME (const char *str)  }  int -afr_transaction_local_init (afr_local_t *local, afr_private_t *priv); +afr_transaction_local_init (afr_local_t *local, xlator_t *this);  int32_t  afr_marker_getxattr (call_frame_t *frame, xlator_t *this, @@ -957,4 +970,18 @@ afr_resultant_errno_get (int32_t *children,  void  afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, int32_t read_child,                               int32_t *stale_children); +void +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, +                      gf_boolean_t is_background, ia_type_t ia_type, +                      void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, +                                                   xlator_t *this), +                      int (*unwind) (call_frame_t *frame, xlator_t *this, +                                     int32_t op_ret, int32_t op_errno)); +int +afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, +              int need_open_count, int *need_open); +int +afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop); +int +afr_set_elem_count_get (unsigned char *elems, int child_count);  #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 769e03b14ea..d27ecd21526 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -1654,7 +1654,7 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this)  	local = frame->local;  	priv = this->private; -	call_count = afr_up_children_count (priv->child_count, local->child_up); +	call_count = afr_up_children_count (local->child_up, priv->child_count);  	if (call_count == 0) {  		local->transaction.resume (frame, this); diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 71775439010..1fd6a7e0685 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -143,7 +143,8 @@ __inodelk_grantable (pl_dom_list_t *dom, pl_inode_lock_t *lock)          if (list_empty (&dom->inodelk_list))                  goto out;          list_for_each_entry (l, &dom->inodelk_list, list){ -                if (inodelk_conflict (lock, l)) { +                if (inodelk_conflict (lock, l) && +                    !same_inodelk_owner (lock, l)) {                          ret = l;                          goto out;                  }  | 
