diff options
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 820 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-read.c | 14 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 441 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.h | 21 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 207 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 20 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 21 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 13 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 16 | 
9 files changed, 983 insertions, 590 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a435a38b1..f49d8c55e 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -328,6 +328,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)          if (sh->linkname)                  GF_FREE ((char *)sh->linkname); +        if (sh->child_success) +                GF_FREE (sh->child_success);          loc_wipe (&sh->parent_loc);  } @@ -417,6 +419,18 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)                  if (local->cont.lookup.inode) {                          inode_unref (local->cont.lookup.inode);                  } + +                if (local->cont.lookup.postparents) +                        GF_FREE (local->cont.lookup.postparents); + +                if (local->cont.lookup.bufs) +                        GF_FREE (local->cont.lookup.bufs); + +                if (local->cont.lookup.child_success) +                        GF_FREE (local->cont.lookup.child_success); + +                if (local->cont.lookup.sources) +                        GF_FREE (local->cont.lookup.sources);          }          { /* getxattr */ @@ -509,6 +523,22 @@ afr_up_children_count (int child_count, unsigned char *child_up)          return ret;  } +gf_boolean_t +afr_is_fresh_lookup (loc_t *loc, xlator_t *this) +{ +        uint64_t          ctx = 0; +        int32_t           ret = 0; + +        GF_ASSERT (loc); +        GF_ASSERT (this); +        GF_ASSERT (loc->inode); + +        ret = inode_ctx_get (loc->inode, this, &ctx); +        if (0 == ret) +                return _gf_false; +        return _gf_true; +} +  void  afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)  { @@ -567,68 +597,96 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this)          }          AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, -                          local->cont.lookup.inode, -                          &local->cont.lookup.buf, +                          local->cont.lookup.inode, &local->cont.lookup.buf,                            local->cont.lookup.xattr,                            &local->cont.lookup.postparent);          return 0;  } +void +afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) +{ +        int32_t         read_child = -1; +        struct iatt     *buf = NULL; +        struct iatt     *postparent = NULL; +        dict_t          **xattr = NULL; -static void -afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, -                          int child_index, dict_t *xattr) +        GF_ASSERT (local); +        GF_ASSERT (local->cont.lookup.read_child >= 0); + +        buf = &local->cont.lookup.buf; +        postparent = &local->cont.lookup.postparent; +        xattr = &local->cont.lookup.xattr; + +        read_child = local->cont.lookup.read_child; +        *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); +        *buf = local->cont.lookup.bufs[read_child]; +        *postparent = local->cont.lookup.postparents[read_child]; + +        if (IA_INVAL == local->cont.lookup.inode->ia_type) { +                /* fix for RT #602 */ +                local->cont.lookup.inode->ia_type = buf->ia_type; +        } +} + + + static void +afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, +                            int child_index, dict_t *xattr)  {          uint32_t inodelk_count = 0;          uint32_t entrylk_count = 0; -        int      ret           = 0; +        int      ret           = -1; + +        GF_ASSERT (local); +        GF_ASSERT (this); +        GF_ASSERT (xattr); +        GF_ASSERT (child_index >= 0); + +        ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, +                               &inodelk_count); +        if (ret == 0) +                local->inodelk_count += inodelk_count; -        if (afr_sh_has_metadata_pending (xattr, child_index, this)) { +        ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, +                               &entrylk_count); +        if (ret == 0) +                local->entrylk_count += entrylk_count; +} + +static void +afr_lookup_detect_self_heal_by_xattr (afr_local_t *local, xlator_t *this, +                                      dict_t *xattr) +{ +        GF_ASSERT (local); +        GF_ASSERT (this); +        GF_ASSERT (xattr); + +        if (afr_sh_has_metadata_pending (xattr, this)) {                  local->self_heal.need_metadata_self_heal = _gf_true;                  gf_log(this->name, GF_LOG_DEBUG,                         "metadata self-heal is pending for %s.",                         local->loc.path);          } -        if (afr_sh_has_entry_pending (xattr, child_index, this)) { +        if (afr_sh_has_entry_pending (xattr, this)) {                  local->self_heal.need_entry_self_heal = _gf_true;                  gf_log(this->name, GF_LOG_DEBUG,                         "entry self-heal is pending for %s.", local->loc.path);          } -        if (afr_sh_has_data_pending (xattr, child_index, this)) { +        if (afr_sh_has_data_pending (xattr, this)) {                  local->self_heal.need_data_self_heal = _gf_true;                  gf_log(this->name, GF_LOG_DEBUG,                         "data self-heal is pending for %s.", local->loc.path);          } - -        ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, -                               &inodelk_count); -        if (ret == 0) -                local->inodelk_count += inodelk_count; - -        ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, -                               &entrylk_count); -        if (ret == 0) -                local->entrylk_count += entrylk_count;  } -  static void -afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local, +afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this,                              struct iatt *buf, struct iatt *lookup_buf)  { -        if (FILETYPE_DIFFERS (buf, lookup_buf)) { -                /* mismatching filetypes with same name -                 */ - -                gf_log (this->name, GF_LOG_INFO, -                        "filetype differs for %s ", local->loc.path); - -                local->govinda_gOvinda = 1; -        } -          if (PERMISSION_DIFFERS (buf, lookup_buf)) {                  /* mismatching permissions */                  gf_log (this->name, GF_LOG_INFO, @@ -674,157 +732,299 @@ out:          return valid;  } -void -afr_lookup_set_read_child (xlator_t *this, afr_local_t *local) +static void +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this)  { -        ia_type_t               ia_type = IA_INVAL; -        afr_transaction_type    transaction_type = AFR_DATA_TRANSACTION; -        afr_private_t           *priv = NULL; -        int32_t                 read_child = -1; -        afr_self_heal_t         *sh = NULL; +        GF_ASSERT (local); +        GF_ASSERT (this); -        priv = this->private; -        sh   = &local->self_heal; +        if ((local->success_count > 0) && (local->enoent_count > 0)) { +                local->self_heal.need_metadata_self_heal = _gf_true; +                local->self_heal.need_data_self_heal     = _gf_true; +                local->self_heal.need_entry_self_heal    = _gf_true; +                gf_log(this->name, GF_LOG_INFO, +                       "entries are missing in lookup of %s.", +                       local->loc.path); +                //If all self-heals are needed no need to check for other rules +                goto out; +        } -        ia_type = local->cont.lookup.inode->ia_type; -        if (IA_ISREG (ia_type)) { -                transaction_type = AFR_DATA_TRANSACTION; -        } else if IA_ISDIR (ia_type) { -                transaction_type = AFR_ENTRY_TRANSACTION; -        } else { -                transaction_type = AFR_METADATA_TRANSACTION; +        if (local->success_count > 0) { +                if (afr_is_split_brain (this, local->cont.lookup.inode) && +                    IA_ISREG (local->cont.lookup.inode->ia_type)) { +                        local->self_heal.need_data_self_heal = _gf_true; +                        gf_log (this->name, GF_LOG_WARNING, +                                "split brain detected during lookup of %s.", +                                local->loc.path); +                }          } -        afr_self_heal_find_sources (this, local, -                                    local->cont.lookup.xattrs, -                                    transaction_type); -        if (!sh->sources) -                goto out; -        read_child = local->read_child_index; -        if (afr_is_valid_read_child (sh->sources, priv->child_count, -                                     read_child)) -                goto out; +out: +        return; +} + +gf_boolean_t +afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +{ +        GF_ASSERT (sh); +        GF_ASSERT (priv); + +        return ((priv->data_self_heal && sh->need_data_self_heal) +                || (priv->metadata_self_heal && sh->need_metadata_self_heal) +                || (priv->entry_self_heal && sh->need_entry_self_heal)); +} -        read_child = afr_read_child (this, local->loc.inode); -        if (afr_is_valid_read_child (sh->sources, priv->child_count, -                                     read_child)) +gf_boolean_t +afr_is_self_heal_enabled (afr_private_t *priv) +{ +        GF_ASSERT (priv); + +        return (priv->data_self_heal || priv->metadata_self_heal +                || priv->entry_self_heal); +} + +int +afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, +                              int32_t *read_child) +{ +        int32_t                 source = -1; +        ia_type_t               ia_type = 0; +        int                     ret = -1; +        afr_transaction_type    type = AFR_METADATA_TRANSACTION; +        dict_t                  **xattrs = NULL; +        int32_t                 *child_success = NULL; +        struct iatt             *bufs = NULL; + +        GF_ASSERT (local); +        GF_ASSERT (this); + +        bufs = local->cont.lookup.bufs; +        child_success = local->cont.lookup.child_success; +        ia_type = local->cont.lookup.bufs[child_success[0]].ia_type; +        if (IA_ISDIR (ia_type)) { +                type = AFR_ENTRY_TRANSACTION; +        } else if (IA_ISREG (ia_type)) { +                type = AFR_DATA_TRANSACTION; +        } +        xattrs = local->cont.lookup.xattrs; +        source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, +                                                           type); +        if (source < 0)                  goto out; -        read_child = afr_sh_select_source (sh->sources, priv->child_count); +        *read_child = source; +        ret = 0;  out: -        if (read_child >= 0) { -                afr_set_read_child (this, -                                    local->cont.lookup.inode, -                                    read_child); -        } +        return ret; +} + +static inline gf_boolean_t +afr_is_self_heal_running (afr_local_t *local) +{ +        GF_ASSERT (local); +        return ((local->inodelk_count > 0) || (local->entrylk_count > 0));  }  static void -afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf) +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, +                      gf_boolean_t is_background, ia_type_t ia_type, +                      int (*unwind) (call_frame_t *frame, xlator_t *this))  { -        int                     unwind = 1; -        int                     up_count = 0; -        char                    sh_type_str[256] = {0,}; -        afr_private_t           *priv  = NULL;          afr_local_t             *local = NULL; +        char                    sh_type_str[256] = {0,}; + +        GF_ASSERT (frame); +        GF_ASSERT (this); -        priv  = this->private;          local = frame->local; +        local->self_heal.background = is_background; +        local->self_heal.type       = ia_type; +        local->self_heal.unwind     = unwind; -        if (local->op_ret != 0) -                goto unwind; +        afr_self_heal_type_str_get (&local->self_heal, +                                    sh_type_str, +                                    sizeof (sh_type_str)); -        local->cont.lookup.postparent.ia_ino  = local->cont.lookup.parent_ino; +        gf_log (this->name, GF_LOG_INFO, +                "background %s self-heal triggered. path: %s", +                sh_type_str, local->loc.path); -        if (local->cont.lookup.ino) { -                local->cont.lookup.buf.ia_ino = local->cont.lookup.ino; +        afr_self_heal (frame, this); +} + +static void +afr_lookup_detect_self_heal (afr_local_t *local, xlator_t *this) +{ +        int                     i = 0; +        struct iatt             *bufs = NULL; +        dict_t                  **xattr = NULL; +        afr_private_t           *priv = NULL; +        int32_t                 child1 = -1; +        int32_t                 child2 = -1; + +        afr_detect_self_heal_by_lookup_status (local, this); + +        bufs = local->cont.lookup.bufs; +        for (i = 1; i < local->success_count; i++) { +                child1 = local->cont.lookup.child_success[i-1]; +                child2 = local->cont.lookup.child_success[i];; +                afr_detect_self_heal_by_iatt (local, this, +                                              &bufs[child1], &bufs[child2]);          } -        if (local->op_ret == 0) { -                /* KLUDGE: assuming DHT will not itransform in -                   revalidate */ -                if (local->cont.lookup.inode->ino) { -                        local->cont.lookup.buf.ia_ino = -                                local->cont.lookup.inode->ino; -                } +        xattr = local->cont.lookup.xattrs; +        priv  = this->private; +        for (i = 0; i < local->success_count; i++) { +                child1 = local->cont.lookup.child_success[i];; +                afr_lookup_detect_self_heal_by_xattr (local, this, +                                                      xattr[child1]);          } +} + +static void +afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this, +                                        gf_boolean_t *sh_launched) +{ +        size_t              up_count = 0; +        afr_private_t       *priv    = NULL; +        afr_local_t         *local   = NULL; + +        GF_ASSERT (sh_launched); +        *sh_launched = _gf_false; +        priv         = this->private; +        local        = frame->local; -        afr_lookup_set_read_child (this, local); -        up_count = afr_up_children_count (priv->child_count, priv->child_up); +        up_count  = afr_up_children_count (priv->child_count, local->child_up);          if (up_count == 1) {                  gf_log (this->name, GF_LOG_DEBUG,                          "Only 1 child up - do not attempt to detect self heal"); +                goto out; +        } -                goto unwind; +        if (_gf_false == afr_is_self_heal_enabled (priv)) { +                gf_log (this->name, GF_LOG_DEBUG, +                        "Self heal is not enabled"); +                goto out;          } -        if (local->success_count && local->enoent_count) { -                local->self_heal.need_metadata_self_heal = _gf_true; -                local->self_heal.need_data_self_heal     = _gf_true; -                local->self_heal.need_entry_self_heal    = _gf_true; -                gf_log(this->name, GF_LOG_INFO, -                       "entries are missing in lookup of %s.", -                       local->loc.path); +        afr_lookup_detect_self_heal (local, this); +        if (afr_can_self_heal_proceed (&local->self_heal, priv)) { +                if  (afr_is_self_heal_running (local)) { +                        goto out; +                } + +                afr_launch_self_heal (frame, this, _gf_true, +                                      local->cont.lookup.buf.ia_type, +                                      afr_self_heal_lookup_unwind); +                *sh_launched = _gf_true;          } +out: +        return; +} -        if (local->success_count) { -                /* check for split-brain case in previous lookup */ -                if (afr_is_split_brain (this, local->cont.lookup.inode)) { -                        local->self_heal.need_data_self_heal = _gf_true; -                        gf_log(this->name, GF_LOG_WARNING, -                               "split brain detected during lookup of %s.", -                               local->loc.path); +static gf_boolean_t +afr_lookup_split_brain (afr_local_t *local, xlator_t *this) +{ +        int             i              = 0; +        gf_boolean_t    symptom        = _gf_false; +        struct iatt     *bufs          = NULL; +        int32_t         *child_success = NULL; +        struct iatt     *child1        = NULL; +        struct iatt     *child2        = NULL; +        const char      *path          = NULL; + +        bufs = local->cont.lookup.bufs; +        child_success = local->cont.lookup.child_success; +        for (i = 1; i < local->success_count; i++) { +                child1 = &bufs[child_success[i-1]]; +                child2 = &bufs[child_success[i]]; +                /* +                 * TODO: gfid self-heal +                 * if (uuid_compare (child1->ia_gfid, child2->ia_gfid)) { +                 *        gf_log (this->name, GF_LOG_WARNING, "%s: gfid differs" +                 *                " on subvolumes (%d, %d)", local->loc.path, +                 *                child_success[i-1], child_success[i]); +                 *        symptom = _gf_true; +                 * } +                 */ + +                if (FILETYPE_DIFFERS (child1, child2)) { +                        path = local->loc.path; +                        gf_log (this->name, GF_LOG_WARNING, "%s: filetype " +                                "differs on subvolumes (%d, %d)", path, +                                child_success[i-1], child_success[i]); +                        symptom = _gf_true; +                        local->govinda_gOvinda = 1;                  } +                if (symptom) +                        break;          } +        return symptom; +} -        if (local->self_heal.need_metadata_self_heal -             || local->self_heal.need_data_self_heal -             || local->self_heal.need_entry_self_heal) -             { +static int +afr_lookup_set_read_child (afr_local_t *local, xlator_t *this, int32_t read_child) +{ +        GF_ASSERT (read_child >= 0); -                if (local->inodelk_count || local->entrylk_count) { +        afr_set_read_child (this, local->cont.lookup.inode, read_child); +        local->cont.lookup.read_child = read_child; -                        /* Someone else is doing self-heal on this file. -                           return */ +        return 0; +} -                        goto unwind; -                } +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this) +{ +        int                 unwind = 1; +        afr_private_t       *priv  = NULL; +        afr_local_t         *local = NULL; +        int                 ret = -1; +        gf_boolean_t        sh_launched = _gf_false; +        int32_t             read_child = -1; -                if (!local->cont.lookup.inode->ia_type) { -                        /* fix for RT #602 */ -                        local->cont.lookup.inode->ia_type = -                                lookup_buf->ia_type; -                } +        priv  = this->private; +        local = frame->local; -                local->self_heal.background = _gf_true; -                local->self_heal.type       = local->cont.lookup.buf.ia_type; -                local->self_heal.unwind     = afr_self_heal_lookup_unwind; +        if (local->op_ret < 0) +                goto unwind; -                unwind = 0; +        if (_gf_true == afr_lookup_split_brain (local, this)) { +                local->op_ret = -1; +                local->op_errno = EIO; +                goto unwind; +        } -                afr_self_heal_type_str_get(&local->self_heal, -                                           sh_type_str, -                                           sizeof(sh_type_str)); +        ret = afr_lookup_select_read_child (local, this, &read_child); +        if (ret) { +                local->op_ret = -1; +                local->op_errno = EIO; +                goto unwind; +        } -                gf_log (this->name, GF_LOG_INFO, -                        "background %s self-heal triggered. path: %s", -                        sh_type_str, local->loc.path); +        ret = afr_lookup_set_read_child (local, this, read_child); +        if (ret) +                goto unwind; -                afr_self_heal (frame, this); +        afr_lookup_build_response_params (local, this); +        if (afr_is_fresh_lookup (&local->loc, this)) { +                afr_update_loc_gfids (&local->loc, &local->cont.lookup.buf, +                                      &local->cont.lookup.postparent);          } -unwind: -        if (unwind) { -                AFR_STACK_UNWIND (lookup, frame, local->op_ret, -                                  local->op_errno, -                                  local->cont.lookup.inode, -                                  &local->cont.lookup.buf, -                                  local->cont.lookup.xattr, -                                  &local->cont.lookup.postparent); +        afr_lookup_perform_self_heal_if_needed (frame, this, &sh_launched); +        if (sh_launched) +                unwind = 0; + unwind: +         if (unwind) { +                 AFR_STACK_UNWIND (lookup, frame, local->op_ret, +                                  local->op_errno, local->cont.lookup.inode, +                                   &local->cont.lookup.buf, +                                   local->cont.lookup.xattr, +                                   &local->cont.lookup.postparent);          }  } -  /*   * During a lookup, some errors are more "important" than   * others in that they must be given higher priority while @@ -850,259 +1050,169 @@ __error_more_important (int32_t old_errno, int32_t new_errno)          return ret;  } - -int -afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie, -                      xlator_t *this,  int32_t op_ret,  int32_t op_errno, -                      inode_t *inode,   struct iatt *buf, dict_t *xattr, -                      struct iatt *postparent) +static void +afr_lookup_handle_error (afr_local_t *local, int32_t op_ret,  int32_t op_errno)  { -        afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -        struct iatt *   lookup_buf = NULL; -        int             call_count      = -1; -        int             child_index     = -1; -        int             first_up_child  = -1; +        GF_ASSERT (local); +        if (op_errno == ENOENT) +                local->enoent_count++; -        child_index = (long) cookie; -        priv = this->private; - -        LOCK (&frame->lock); -        { -                local = frame->local; - -                lookup_buf = &local->cont.lookup.buf; - -                if (op_ret == -1) { -                        if (op_errno == ENOENT) -                                local->enoent_count++; - -                        if (__error_more_important (local->op_errno, op_errno)) -                                local->op_errno = op_errno; - -                        if (local->op_errno == ESTALE) { -                                local->op_ret = -1; -                        } - -                        goto unlock; -                } - -                afr_lookup_collect_xattr (local, this, child_index, xattr); - -                first_up_child = afr_first_up_child (priv); - -                if (child_index == first_up_child) { -                        local->cont.lookup.ino = -                                afr_itransform (buf->ia_ino, -                                                priv->child_count, -                                                first_up_child); -                } - -                if (local->success_count == 0) { -                        if (local->op_errno != ESTALE) -                                local->op_ret = op_ret; - -                        local->cont.lookup.inode               = inode_ref (inode); -                        local->cont.lookup.xattr               = dict_ref (xattr); -                        local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                        local->cont.lookup.postparent          = *postparent; - -                        if (priv->first_lookup && inode->ino == 1) { -                                gf_log (this->name, GF_LOG_INFO, -                                        "added root inode"); -                                priv->root_inode = inode_ref (inode); -                                priv->first_lookup = 0; -                        } - -                        *lookup_buf = *buf; - -                        uuid_copy (local->loc.gfid, buf->ia_gfid); -                        uuid_copy (local->loc.pargfid, -                                   postparent->ia_gfid); - -                        lookup_buf->ia_ino = afr_itransform (buf->ia_ino, -                                                             priv->child_count, -                                                             child_index); -                        if (priv->read_child >= 0) { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode, -                                                    priv->read_child); -                        } else { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode, -                                                    child_index); -                                //prefer fast child -                                local->read_child_index = child_index; -                        } - -                } else { -                        afr_lookup_self_heal_check (this, local, buf, lookup_buf); +        if (__error_more_important (local->op_errno, op_errno)) +                local->op_errno = op_errno; +        if (local->op_errno == ESTALE) { +                local->op_ret = -1; +        } +} -                        local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                        if (child_index == local->read_child_index) { -                                /* -                                  lookup has succeeded on the read child. -                                  So use its inode number -                                */ -                                if (local->cont.lookup.xattr) -                                        dict_unref (local->cont.lookup.xattr); +static void +afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, +                                    inode_t *inode) +{ +        afr_private_t           *priv = NULL; +        GF_ASSERT (inode); -                                local->cont.lookup.xattr = dict_ref (xattr); -                                local->cont.lookup.postparent          = *postparent; +        if (inode->ino != 1) +                goto out; +        if (!afr_is_fresh_lookup (&local->loc, this)) +                goto out; +        priv = this->private; +        if ((priv->first_lookup)) { +                gf_log (this->name, GF_LOG_INFO, "added root inode"); +                priv->root_inode = inode_ref (inode); +                priv->first_lookup = 0; +        } +out: +        return; +} -                                *lookup_buf = *buf; +static void +afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, +                       struct iatt *buf, struct iatt *postparent) +{ +        GF_ASSERT (child_index >= 0); +        local->cont.lookup.xattrs[child_index] = dict_ref (xattr); +        local->cont.lookup.postparents[child_index] = *postparent; +        local->cont.lookup.bufs[child_index] = *buf; +} -                                uuid_copy (local->loc.gfid, buf->ia_gfid); -                                uuid_copy (local->loc.pargfid, -                                           postparent->ia_gfid); -                        } +static void +afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, +                                 inode_t *inode, struct iatt *buf) +{ +        local->cont.lookup.inode      = inode_ref (inode); +        local->cont.lookup.buf        = *buf; +        afr_set_root_inode_on_first_lookup (local, this, inode); +} +static void +afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, +                           int32_t op_ret, int32_t op_errno, inode_t *inode, +                           struct iatt *buf, dict_t *xattr, +                           struct iatt *postparent) +{ +        if (local->success_count == 0) { +                if (local->op_errno != ESTALE) { +                        local->op_ret = op_ret; +                        local->op_errno = 0;                  } - -                local->success_count++; -        } -unlock: -        UNLOCK (&frame->lock); - -        call_count = afr_frame_return (frame); - -        if (call_count == 0) { -                afr_lookup_done (frame, this, lookup_buf); +                afr_lookup_handle_first_success (local, this, inode, buf);          } +        afr_lookup_update_lk_counts (local, this, +                                     child_index, xattr); -        return 0; +        afr_lookup_cache_args (local, child_index, xattr, +                               buf, postparent); +        local->cont.lookup.child_success[local->success_count] = child_index; +        local->success_count++;  } -  int -afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie, -                           xlator_t *this, int32_t op_ret, int32_t op_errno, -                           inode_t *inode, struct iatt *buf, dict_t *xattr, -                           struct iatt *postparent) +afr_lookup_cbk (call_frame_t *frame, void *cookie, +                xlator_t *this,  int32_t op_ret,  int32_t op_errno, +                inode_t *inode,   struct iatt *buf, dict_t *xattr, +                struct iatt *postparent)  {          afr_local_t *   local = NULL; -        afr_private_t * priv  = NULL; -        struct iatt *   lookup_buf = NULL;          int             call_count      = -1;          int             child_index     = -1; -        int             first_up_child  = -1; -        child_index = (long) cookie; -        priv = this->private; +         child_index = (long) cookie;          LOCK (&frame->lock);          {                  local = frame->local; -                lookup_buf = &local->cont.lookup.buf; -                  if (op_ret == -1) { -                        if (op_errno == ENOENT) -                                local->enoent_count++; - -                        if (__error_more_important (local->op_errno, op_errno)) -                                local->op_errno = op_errno; - -                        if (local->op_errno == ESTALE) { -                                local->op_ret = -1; -                        } - +                        afr_lookup_handle_error (local, op_ret, op_errno);                          goto unlock;                  } +                afr_lookup_handle_success (local, this, child_index, op_ret, +                                           op_errno, inode, buf, xattr, +                                           postparent); -                afr_lookup_collect_xattr (local, this, child_index, xattr); - -                first_up_child = afr_first_up_child (priv); - -                if (child_index == first_up_child) { -                        local->cont.lookup.ino = -                                afr_itransform (buf->ia_ino, -                                                priv->child_count, -                                                first_up_child); -                } - -                /* in case of revalidate, we need to send stat of the -                 * child whose stat was sent during the first lookup. -                 * (so that time stamp does not vary with revalidate. -                 * in case it is down, stat of the fist success will -                 * be replied */ - -                /* inode number should be preserved across revalidates */ - -                if (local->success_count == 0) { -                        if (local->op_errno != ESTALE) -                                local->op_ret = op_ret; - -                        local->cont.lookup.inode               = inode_ref (inode); -                        local->cont.lookup.xattr               = dict_ref (xattr); -                        local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                        local->cont.lookup.postparent          = *postparent; - -                        *lookup_buf = *buf; - -                        lookup_buf->ia_ino = afr_itransform (buf->ia_ino, -                                                             priv->child_count, -                                                             child_index); - -                        if (priv->read_child >= 0) { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode, -                                                    priv->read_child); -                        } else { -                                afr_set_read_child (this, -                                                    local->cont.lookup.inode, -                                                    child_index); -                        } - -                } else { -                        afr_lookup_self_heal_check (this, local, buf, lookup_buf); - -                        local->cont.lookup.xattrs[child_index] = dict_ref (xattr); -                        if (child_index == local->read_child_index) { +         } +unlock: +        UNLOCK (&frame->lock); -                                /* -                                  lookup has succeeded on the read child. -                                  So use its inode number -                                */ +        call_count = afr_frame_return (frame); +        if (call_count == 0) { +               afr_lookup_done (frame, this); +        } -                                if (local->cont.lookup.xattr) -                                        dict_unref (local->cont.lookup.xattr); +         return 0; +} -                                local->cont.lookup.xattr               = dict_ref (xattr); -                                local->cont.lookup.postparent          = *postparent; +int +afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +{ +        int               ret            = -ENOMEM; +        int32_t           *child_success = NULL; +        struct iatt       *iatts         = NULL; +        int               i              = 0; -                                *lookup_buf = *buf; -                        } +        GF_ASSERT (local); +        local->cont.lookup.xattrs = GF_CALLOC (child_count, +                                               sizeof (*local->cont.lookup.xattr), +                                               gf_afr_mt_dict_t); +        if (NULL == local->cont.lookup.xattrs) +                goto out; -                } +        iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); +        if (NULL == iatts) +                goto out; +        local->cont.lookup.postparents = iatts; -                local->success_count++; -        } -unlock: -        UNLOCK (&frame->lock); +        iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); +        if (NULL == iatts) +                goto out; +        local->cont.lookup.bufs = iatts; -        call_count = afr_frame_return (frame); +        child_success = GF_CALLOC (child_count, sizeof (*child_success), +                                   gf_afr_mt_char); +        if (NULL == child_success) +                goto out; +        for (i = 0; i < child_count; i++) +                child_success[i] = -1; -        if (call_count == 0) { -                afr_lookup_done (frame, this, lookup_buf); -        } +        local->cont.lookup.child_success = child_success; -        return 0; +        local->cont.lookup.read_child = -1; +        ret = 0; +out: +        return ret;  } -  int  afr_lookup (call_frame_t *frame, xlator_t *this,              loc_t *loc, dict_t *xattr_req)  { -        afr_private_t    *priv       = NULL; -        afr_local_t      *local      = NULL; -        int               ret        = -1; -        int               i          = 0; -        fop_lookup_cbk_t  callback   = NULL; -        int               call_count = 0; -        uint64_t          ctx        = 0; -        int32_t           op_errno   = 0; +        afr_private_t    *priv           = NULL; +        afr_local_t      *local          = NULL; +        int               ret            = -1; +        int               i              = 0; +        int               call_count     = 0; +        uint64_t          ctx            = 0; +        int32_t           op_errno       = 0;          priv = this->private; @@ -1123,14 +1233,9 @@ afr_lookup (call_frame_t *frame, xlator_t *this,          if (ret == 0) {                  /* lookup is a revalidate */ -                callback = afr_revalidate_lookup_cbk; - -                local->cont.lookup.is_revalidate = _gf_true;                  local->read_child_index          = afr_read_child (this,                                                                     loc->inode);          } else { -                callback = afr_fresh_lookup_cbk; -                  LOCK (&priv->read_child_lock);                  {                          local->read_child_index = (++priv->read_child_rr) @@ -1143,10 +1248,16 @@ afr_lookup (call_frame_t *frame, xlator_t *this,                  local->cont.lookup.parent_ino = loc->parent->ino;          local->child_up = memdup (priv->child_up, priv->child_count); +        if (NULL == local->child_up) { +                op_errno = ENOMEM; +                goto out; +        } -        local->cont.lookup.xattrs = GF_CALLOC (priv->child_count, -                                               sizeof (*local->cont.lookup.xattr), -                                               gf_afr_mt_dict_t); +        ret = afr_lookup_cont_init (local, priv->child_count); +        if (ret < 0) { +                op_errno = -ret; +                goto out; +        }          local->call_count = afr_up_children_count (priv->child_count,                                                     local->child_up); @@ -1192,7 +1303,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this,          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { -                        STACK_WIND_COOKIE (frame, callback, (void *) (long) i, +                        STACK_WIND_COOKIE (frame, afr_lookup_cbk, +                                           (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->lookup,                                             loc, local->xattr_req); diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index e626a6f1a..3f056b686 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -70,15 +70,19 @@ gf_boolean_t  __checksums_differ (uint32_t *checksum, int child_count,                      unsigned char *child_up)  { -        int      ret   = _gf_false; -        int      i     = 0; -        uint32_t cksum = 0; - -        cksum = checksum[0]; +        int          ret            = _gf_false; +        int          i              = 0; +        uint32_t     cksum          = 0; +        gf_boolean_t activate_check = _gf_false;          for (i = 0; i < child_count; i++) {                  if (!child_up[i])                          continue; +                if (_gf_false == activate_check) { +                        cksum          = checksum[i]; +                        activate_check = _gf_true; +                        continue; +                }                  if (cksum != checksum[i]) {                          ret = _gf_true; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 4f50f3577..b29deb8bc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -70,23 +70,6 @@ afr_sh_source_count (int sources[], int child_count)          return nsource;  } - -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], -                                  int child_count) -{ -        int i = 0; - -        for (i = 0; i < child_count; i++) { -                if (child_errno[i] && sources[i]) { -                        sources[i] = 0; -                } -        } - -        return 0; -} - -  void  afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)  { @@ -113,11 +96,46 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)          GF_FREE (buf);  } +void +afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) +{ +        int             i   = 0; +        int             j   = 0; + +        GF_ASSERT (pending_matrix); + +        for (i = 0; i < child_count; i++) { +                for (j = 0; j < child_count; j++) { +                        pending_matrix[i][j] = 0; +                } +        } +}  void -afr_sh_build_pending_matrix (afr_private_t *priv, -                             int32_t *pending_matrix[], dict_t *xattr[], -                             int child_count, afr_transaction_type type) +afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, +                                      unsigned char *ignorant_subvols, +                                      size_t  child_count) +{ +        int            i                = 0; +        int            j                = 0; + +        GF_ASSERT (pending_matrix); +        GF_ASSERT (ignorant_subvols); + +        for (i = 0; i < child_count; i++) { +                if (ignorant_subvols[i]) { +                        for (j = 0; j < child_count; j++) { +                                if (!ignorant_subvols[j]) +                                        pending_matrix[j][i] += 1; +                        } +                } +        } +} + +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, +                          dict_t *xattr[], afr_transaction_type type, +                          size_t child_count)  {          /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */          int32_t        pending[3]       = {0,}; @@ -130,19 +148,16 @@ afr_sh_build_pending_matrix (afr_private_t *priv,          ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count,                                        gf_afr_mt_char); +        if (NULL == ignorant_subvols) +                goto out; -        /* start clean */ -        for (i = 0; i < child_count; i++) { -                for (j = 0; j < child_count; j++) { -                        pending_matrix[i][j] = 0; -                } -        } +        afr_init_pending_matrix (pending_matrix, child_count);          for (i = 0; i < child_count; i++) {                  pending_raw = NULL;                  for (j = 0; j < child_count; j++) { -                        ret = dict_get_ptr (xattr[i], priv->pending_key[j], +                        ret = dict_get_ptr (xattr[i], pending_key[j],                                              &pending_raw);                          if (ret != 0) { @@ -163,21 +178,12 @@ afr_sh_build_pending_matrix (afr_private_t *priv,                  }          } -        /* -         * Make all non-ignorant subvols point towards the ignorant -         * subvolumes. -         */ - -        for (i = 0; i < child_count; i++) { -                if (ignorant_subvols[i]) { -                        for (j = 0; j < child_count; j++) { -                                if (!ignorant_subvols[j]) -                                        pending_matrix[j][i] += 1; -                        } -                } -        } - +        afr_mark_ignorant_subvols_as_pending (pending_matrix, +                                              ignorant_subvols, +                                              child_count);          GF_FREE (ignorant_subvols); +out: +        return ret;  } @@ -208,7 +214,8 @@ afr_sh_build_pending_matrix (afr_private_t *priv,  typedef enum {          AFR_NODE_INNOCENT,          AFR_NODE_FOOL, -        AFR_NODE_WISE +        AFR_NODE_WISE, +        AFR_NODE_INVALID = -1,  } afr_node_type;  typedef struct { @@ -353,182 +360,276 @@ afr_sh_mark_wisest_as_sources (int sources[],          return nsources;  } - -static int -afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count) +static void +afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, +                              afr_node_character *characters, +                              int32_t child_count)  { -        int32_t ** pending_matrix = NULL; -        int        i              = 0; -        int        j              = 0; -        int        size_differs   = 0; +        int i       = 0; +        int j       = 0; +        int witness = 0; -        pending_matrix = sh->pending_matrix; +        GF_ASSERT (witnesses); +        GF_ASSERT (pending_matrix); +        GF_ASSERT (characters); +        GF_ASSERT (child_count > 0);          for (i = 0; i < child_count; i++) { +                if (characters[i].type != AFR_NODE_FOOL) +                        continue; + +                witness = 0;                  for (j = 0; j < child_count; j++) { -                        if (!sh->buf) -                                break; +                        if (i == j) +                                continue; +                        witness += pending_matrix[i][j]; +                } +                witnesses[i] = witness; +        } +} -                        if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j]) -                            && (pending_matrix[i][j] == 0) -                            && (pending_matrix[j][i] == 0)) { +static int32_t +afr_find_biggest_witness_among_fools (int32_t *witnesses, +                                      afr_node_character *characters, +                                      int32_t child_count) +{ +        int i               = 0; +        int biggest_witness = -1; -                                pending_matrix[i][j] = 1; -                                pending_matrix[j][i] = 1; +        GF_ASSERT (witnesses); +        GF_ASSERT (characters); +        GF_ASSERT (child_count > 0); -                                size_differs = 1; -                        } -                } -        } +        for (i = 0; i < child_count; i++) { +                if (characters[i].type != AFR_NODE_FOOL) +                        continue; -        return size_differs; +                if (biggest_witness < witnesses[i]) +                        biggest_witness = witnesses[i]; +        } +        return biggest_witness;  } - -static int -afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh, +int +afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,                                      afr_node_character *characters, -                                    int child_count) +                                    int32_t child_count, int32_t witness)  { -        int i       = 0; -        int biggest = 0; +        int i        = 0; +        int nsources = 0; -        for (i = 0; i < child_count; i++) { -                if (characters[i].type == AFR_NODE_FOOL) { -                        biggest = i; -                        break; -                } -        } +        GF_ASSERT (sources); +        GF_ASSERT (witnesses); +        GF_ASSERT (characters); +        GF_ASSERT (child_count > 0);          for (i = 0; i < child_count; i++) {                  if (characters[i].type != AFR_NODE_FOOL)                          continue; -                if (!sh->buf) -                        break; - -                if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { -                        biggest = i; +                if (witness == witnesses[i]) { +                        sources[i] = 1; +                        nsources++;                  }          } +        return nsources; +} + +static int +afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, +                                     afr_node_character *characters, +                                     int child_count) +{ +        int32_t       biggest_witness = 0; +        int           nsources        = 0; +        int32_t       *witnesses      = NULL; -        sh->sources[biggest] = 1; +        GF_ASSERT (child_count > 0); -        return 1; -} +        witnesses = GF_CALLOC (child_count, sizeof (*witnesses), +                               gf_afr_mt_int32_t); +        if (NULL == witnesses) { +                nsources = -1; +                goto out; +        } +        afr_compute_witness_of_fools (witnesses, pending_matrix, characters, +                                      child_count); +        biggest_witness = afr_find_biggest_witness_among_fools (witnesses, +                                                                characters, +                                                                child_count); +        nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, +                                                       characters, child_count, +                                                       biggest_witness); +out: +        if (witnesses) +                GF_FREE (witnesses); +        return nsources; +} -static int -afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count) +int +afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, +                                 int32_t *valid_children, int child_count, +                                 uint32_t uid)  { -        int biggest = 0; -        int i       = 0; +        int     i        = 0; +        int     nsources = 0; +        int     child    = 0; + +        GF_ASSERT (bufs); +        GF_ASSERT (valid_children); +        GF_ASSERT (sources); +        GF_ASSERT (child_count > 0);          for (i = 0; i < child_count; i++) { -                if (!sh->buf) -                        break; +                if (-1 == valid_children[i]) +                        continue; -                if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { -                        biggest = i; +                child = valid_children[i]; +                if (uid == bufs[child].ia_uid) { +                        sources[child] = 1; +                        nsources++;                  }          } +        return nsources; +} -        sh->sources[biggest] = 1; +int +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, +                               int child_count) +{ +        int     i        = 0; +        int     smallest = -1; +        int     child    = 0; -        return 1; -} +        GF_ASSERT (bufs); +        GF_ASSERT (valid_children); +        GF_ASSERT (child_count > 0); +        for (i = 0; i < child_count; i++) { +                if (-1 == valid_children[i]) +                        continue; +                child = valid_children[i]; +                if ((smallest == -1) || +                    (bufs[child].ia_uid < bufs[smallest].ia_uid)) { +                        smallest = child; +                } +        } +        return smallest; +}  static int -afr_sh_mark_loweia_uid_as_source (afr_self_heal_t *sh, int child_count) +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, +                                  int child_count, int32_t *sources)  { -        uid_t smallest = 0; -        int   i        = 0; +        int   nsources              = 0; +        int   smallest              = 0; -        for (i = 0; i < child_count; i++) { -                if (!sh->buf) -                        break; - -                if (sh->buf[i].ia_uid < sh->buf[smallest].ia_uid) { -                        smallest = i; -                } +        smallest = afr_get_child_with_lowest_uid (bufs, valid_children, +                                                  child_count); +        if (smallest < 0) { +                nsources = -1; +                goto out;          } +        nsources = afr_mark_child_as_source_by_uid (sources, bufs, +                                                    valid_children, child_count, +                                                    bufs[smallest].ia_uid); +out: +        return nsources; +} -        sh->sources[smallest] = 1; +char * +afr_get_character_str (afr_node_type type) +{ +        char *character = NULL; -        return 1; +        switch (type) { +        case AFR_NODE_INNOCENT: +                character = "innocent"; +                break; +        case AFR_NODE_FOOL: +                character = "fool"; +                break; +        case AFR_NODE_WISE: +                character = "wise"; +                break; +        default: +                character = "invalid"; +                break; +        } +        return character;  } +afr_node_type +afr_find_child_character_type (int32_t *pending_row, int32_t child, +                               int32_t child_count, const char *xlator_name) +{ +        afr_node_type type = AFR_NODE_INVALID; + +        GF_ASSERT (pending_row); +        GF_ASSERT (child_count > 0); +        GF_ASSERT ((child >= 0) && (child < child_count)); + +        if (afr_sh_is_innocent (pending_row, child_count)) +                type = AFR_NODE_INNOCENT; +        else if (afr_sh_is_fool (pending_row, child, child_count)) +                type = AFR_NODE_FOOL; +        else if (afr_sh_is_wise (pending_row, child, child_count)) +                type = AFR_NODE_WISE; +        else +                GF_ASSERT (0); + +        gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", +                child, afr_get_character_str (type)); +        return type; +}  int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, -                     afr_self_heal_type type) +afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, +                  int32_t child_count, afr_self_heal_type type, +                  int32_t *valid_children, const char *xlator_name)  {          /* stores the 'characters' (innocent, fool, wise) of the nodes */ -        afr_node_character *characters =  NULL; +        afr_node_character *characters =  NULL;          int            i              = 0; -        int32_t **     pending_matrix = NULL; -        int *          sources        = NULL; -        int            size_differs   = 0; -        int            nsources       = 0; +        int            nsources       = -1;          xlator_t      *this           = NULL; -        afr_private_t *priv           = NULL;          characters = GF_CALLOC (sizeof (afr_node_character), -                                        child_count, -                                        gf_afr_mt_afr_node_character) ; +                                child_count, gf_afr_mt_afr_node_character);          if (!characters)                  goto out;          this = THIS; -        priv = this->private; -        pending_matrix = sh->pending_matrix; -        sources        = sh->sources;          /* start clean */          for (i = 0; i < child_count; i++) {                  sources[i] = 0;          } +        nsources = 0;          for (i = 0; i < child_count; i++) { -                if (afr_sh_is_innocent (pending_matrix[i], child_count)) { -                        characters[i].type = AFR_NODE_INNOCENT; - -                } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { -                        characters[i].type = AFR_NODE_FOOL; - -                } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) { -                        characters[i].type = AFR_NODE_WISE; - -                } else { -                        gf_log (this->name, GF_LOG_CRITICAL, -                                "Could not determine the state of subvolume %s!" -                                " (This message should never appear." -                                " Please file a bug report to " -                                "<gluster-devel@nongnu.org>.)", -                                priv->children[i]->name); -                } -        } - -        if (type == AFR_SELF_HEAL_DATA) { -                size_differs = afr_sh_mark_if_size_differs (sh, child_count); +                characters[i].type = +                        afr_find_child_character_type (pending_matrix[i], i, +                                                       child_count, +                                                       xlator_name); +                if (AFR_NODE_INVALID == characters[i].type) +                        gf_log (xlator_name, GF_LOG_WARNING, +                                "child %d had invalid xattrs", i);          }          if ((type == AFR_SELF_HEAL_METADATA)              && afr_sh_all_nodes_innocent (characters, child_count)) { -                nsources = afr_sh_mark_loweia_uid_as_source (sh, child_count); +                nsources = afr_sh_mark_lowest_uid_as_source (bufs, +                                                             valid_children, +                                                             child_count, +                                                             sources);                  goto out;          } -        if (afr_sh_all_nodes_innocent (characters, child_count)) { -                if (size_differs) { -                        nsources = afr_sh_mark_biggest_as_source (sh, -                                                                  child_count); -                } - -        } else if (afr_sh_wise_nodes_exist (characters, child_count)) { +        if (afr_sh_wise_nodes_exist (characters, child_count)) {                  afr_sh_compute_wisdom (pending_matrix, characters, child_count);                  if (afr_sh_wise_nodes_conflict (characters, child_count)) { @@ -536,7 +637,6 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,                          gf_log (this->name, GF_LOG_INFO,                                  "split-brain possible, no source detected");                          nsources = -1; -                        goto out;                  } else {                          nsources = afr_sh_mark_wisest_as_sources (sources, @@ -544,18 +644,26 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,                                                                    child_count);                  }          } else { -                nsources = afr_sh_mark_biggest_fool_as_source (sh, characters, -                                                               child_count); +                nsources = afr_mark_biggest_of_fools_as_source (sources, +                                                                pending_matrix, +                                                                characters, +                                                                child_count);          }  out: +        if (nsources == 0) { +                for (i = 0; i < child_count; i++) { +                        if (valid_children[i] != -1) +                                sources[valid_children[i]] = 1; +                } +        }          if (characters)                  GF_FREE (characters); +        gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources);          return nsources;  } -  void  afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,                           int32_t *delta_matrix[], int success[], @@ -643,7 +751,7 @@ afr_sh_delta_to_xattr (afr_private_t *priv,  int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this)  {          /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */          int32_t        pending[3]  = {0,}; @@ -674,7 +782,7 @@ afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)  int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_has_data_pending (dict_t *xattr, xlator_t *this)  {          /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */          int32_t        pending[3]  = {0,}; @@ -705,7 +813,7 @@ afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)  int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this)  {          /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */          int32_t        pending[3]  = {0,}; @@ -1655,6 +1763,9 @@ afr_self_heal (call_frame_t *frame, xlator_t *this)                                                   priv->child_count,                                                   gf_afr_mt_int32_t);          } +        sh->child_success = GF_CALLOC (sizeof (*sh->child_success), +                                       priv->child_count, gf_afr_mt_int32_t); +          FRAME_SU_DO (sh_frame, afr_local_t);          if (local->success_count && local->enoent_count) { @@ -1688,3 +1799,25 @@ afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,                  snprintf(str + strlen(str), size - strlen(str), " entry");          }  } + +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type) +{ +        afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + +        switch (type) { +        case AFR_DATA_TRANSACTION: +                sh_type = AFR_SELF_HEAL_DATA; +                break; +        case AFR_METADATA_TRANSACTION: +                sh_type = AFR_SELF_HEAL_METADATA; +                break; +        case AFR_ENTRY_TRANSACTION: +                sh_type = AFR_SELF_HEAL_ENTRY; +                break; +        case AFR_ENTRY_RENAME_TRANSACTION: +                GF_ASSERT (0); +                break; +        } +        return sh_type; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 87856761c..676e933ae 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -26,6 +26,7 @@ typedef enum {          AFR_SELF_HEAL_ENTRY,          AFR_SELF_HEAL_METADATA,          AFR_SELF_HEAL_DATA, +        AFR_SELF_HEAL_INVALID = -1,  } afr_self_heal_type;  int @@ -37,17 +38,13 @@ afr_sh_sink_count (int sources[], int child_count);  int  afr_sh_source_count (int sources[], int child_count); -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], -				  int child_count); -  void  afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); -void -afr_sh_build_pending_matrix (afr_private_t *priv, -                             int32_t *pending_matrix[], dict_t *xattr[], -			     int child_count, afr_transaction_type type); +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, +                          dict_t *xattr[], afr_transaction_type type, +                          size_t child_count);  void  afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, @@ -55,8 +52,9 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,                           int child_count, afr_transaction_type type);  int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, -                     afr_self_heal_type type); +afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, +                  int32_t child_count, afr_self_heal_type type, +                  int32_t *valid_children, const char *xlator_name);  int  afr_sh_delta_to_xattr (afr_private_t *priv, @@ -70,4 +68,7 @@ void  afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,                              size_t size); +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type); +  #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 0b9e4218c..38799db70 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -594,16 +594,15 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)          sh = &local->self_heal;          priv = this->private; -        afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, -                                     priv->child_count, AFR_DATA_TRANSACTION); +        afr_build_pending_matrix (priv->pending_key, sh->pending_matrix, +                                  sh->xattr, AFR_DATA_TRANSACTION, +                                  priv->child_count);          afr_sh_print_pending_matrix (sh->pending_matrix, this); -        nsources = afr_sh_mark_sources (sh, priv->child_count, -                                        AFR_SELF_HEAL_DATA); - -        afr_sh_supress_errenous_children (sh->sources, sh->child_errno, -                                          priv->child_count); +        nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, +                                     priv->child_count, AFR_SELF_HEAL_DATA, +                                     sh->child_success, this->name);          if (nsources == 0) {                  gf_log (this->name, GF_LOG_TRACE, @@ -692,53 +691,164 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)          return 0;  } +gf_boolean_t +afr_is_fresh_read_child (int32_t *sources, int32_t child_count, +                         int32_t read_child) +{ +        gf_boolean_t             is_fresh_child = _gf_false; + +        GF_ASSERT (read_child < child_count); -void -afr_self_heal_find_sources (xlator_t *this, afr_local_t *local, dict_t **xattr, -                            afr_transaction_type transaction_type) +        if ((read_child >= 0) && (read_child < child_count) && +             sources[read_child]) { +                is_fresh_child = _gf_true; +        } +        return is_fresh_child; +} + +static int +afr_select_read_child_from_policy (int32_t *sources, int32_t child_count, +                                   int32_t prev_read_child, +                                   int32_t config_read_child, +                                   int32_t *valid_children)  { -        afr_self_heal_t *sh   = NULL; -        afr_private_t   *priv = NULL; -        int              i = 0; -        afr_self_heal_type sh_type = AFR_SELF_HEAL_DATA; -        int              nsources = 0; +        int32_t                  read_child = -1; +        int                      i          = 0; -        sh   = &local->self_heal; -        priv = this->private; +        GF_ASSERT (sources); -        sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, -                                        gf_afr_mt_int32_t); -        for (i = 0; i < priv->child_count; i++) { -                sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), -                                                   priv->child_count, -                                                   gf_afr_mt_int32_t); +        read_child = prev_read_child; +        if (_gf_true == afr_is_fresh_read_child (sources, child_count, +                                                 read_child)) +                goto out; + +        read_child = config_read_child; +        if (_gf_true == afr_is_fresh_read_child (sources, child_count, +                                                 read_child)) +                goto out; + +        for (i = 0; i < child_count; i++) { +                read_child = valid_children[i]; +                if (read_child < 0) +                        break; +                if (_gf_true == afr_is_fresh_read_child (sources, child_count, +                                                         read_child)) +                        goto out;          } +        read_child = -1; + +out: +        return read_child; +} -        sh->sources = GF_CALLOC (priv->child_count, sizeof (*sh->sources), -                                 gf_afr_mt_int32_t); - -        afr_sh_build_pending_matrix (priv, sh->pending_matrix, xattr, -                                     priv->child_count, transaction_type); - -        switch (transaction_type) { -        case AFR_DATA_TRANSACTION: -                sh_type = AFR_SELF_HEAL_DATA; -                break; -        case AFR_ENTRY_TRANSACTION: -                sh_type = AFR_SELF_HEAL_ENTRY; -                break; -        case AFR_METADATA_TRANSACTION: -                sh_type = AFR_SELF_HEAL_METADATA; -                break; -        default: -                sh_type = AFR_SELF_HEAL_METADATA; -                break; +static void +afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count) +{ +        int             i = 0; +        GF_ASSERT (child_count > 0); +        if (pending_matrix) { +                for (i = 0; i < child_count; i++) { +                        if (pending_matrix[i]) +                                GF_FREE (pending_matrix[i]); +                } +                GF_FREE (pending_matrix);          } -        nsources = afr_sh_mark_sources (sh, priv->child_count, sh_type); -        if (nsources == 0) { -                for (i = 0; i < priv->child_count; i++) -                        sh->sources[i] = 1; +} + +static int32_t** +afr_create_pending_matrix (int32_t child_count) +{ +        gf_boolean_t            cleanup = _gf_false; +        int32_t                 **pending_matrix = NULL; +        int                     i = 0; + +        GF_ASSERT (child_count > 0); + +        pending_matrix = GF_CALLOC (sizeof (*pending_matrix), child_count, +                                    gf_afr_mt_int32_t); +        if (NULL == pending_matrix) +                goto out; +        for (i = 0; i < child_count; i++) { +                pending_matrix[i] = GF_CALLOC (sizeof (**pending_matrix), +                                               child_count, +                                               gf_afr_mt_int32_t); +                if (NULL == pending_matrix[i]) { +                        cleanup = _gf_true; +                        goto out; +                } +        } +out: +        if (_gf_true == cleanup) { +                afr_destroy_pending_matrix (pending_matrix, child_count); +                pending_matrix = NULL; +        } +        return pending_matrix; +} + +int +afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, +                                          dict_t **xattr, +                                          afr_transaction_type txn_type) +{ +        afr_private_t            *priv      = NULL; +        int                      read_child = -1; +        int                      ret        = -1; +        afr_self_heal_type       sh_type    = AFR_SELF_HEAL_INVALID; +        int32_t                  **pending_matrix = NULL; +        int32_t                  *sources         = NULL; +        int32_t                  *valid_children  = NULL; +        struct iatt              *bufs            = NULL; +        int32_t                  nsources         = 0; +        int32_t                  prev_read_child  = -1; +        int32_t                  config_read_child = -1; +        afr_self_heal_t          *sh = NULL; + +        priv = this->private; +        bufs = local->cont.lookup.bufs; +        valid_children = local->cont.lookup.child_success; +        sh = &local->self_heal; + +        pending_matrix = afr_create_pending_matrix (priv->child_count); +        if (NULL == pending_matrix) +                goto out; + +        sources = GF_CALLOC (sizeof (*sources), priv->child_count, +                             gf_afr_mt_int32_t); +        if (NULL == sources) +                goto out; + +        afr_build_pending_matrix (priv->pending_key, pending_matrix, +                                  xattr, txn_type, priv->child_count); + +        sh_type = afr_self_heal_type_for_transaction (txn_type); +        if (AFR_SELF_HEAL_INVALID == sh_type) +                goto out; + +        nsources = afr_mark_sources (sources, pending_matrix, bufs, +                                     priv->child_count, sh_type, +                                     valid_children, this->name); +        if (nsources < 0) { +                ret = -1; +                goto out; +        } + +        prev_read_child = local->read_child_index; +        config_read_child = priv->read_child; +        read_child = afr_select_read_child_from_policy (sources, +                                                        priv->child_count, +                                                        prev_read_child, +                                                        config_read_child, +                                                        valid_children); +        ret = 0; +        local->cont.lookup.sources = sources; +out: +        afr_destroy_pending_matrix (pending_matrix, priv->child_count); +        if (-1 == ret) { +                if (sources) +                        GF_FREE (sources);          } +        gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child); +        return read_child;  } @@ -766,6 +876,8 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,                                  priv->children[child_index]->name);                          sh->buf[child_index] = *buf; +                        sh->child_success[sh->success_count] = child_index; +                        sh->success_count++;                  }          }          UNLOCK (&frame->lock); @@ -798,6 +910,9 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)          local->call_count = call_count; +        for (i = 0; i < priv->child_count; i++) +                sh->child_success[i] = -1; +        sh->success_count = 0;          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 0504da17c..8c619ff45 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -2159,13 +2159,15 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)                  goto heal;          } -        afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, -                                     priv->child_count, AFR_ENTRY_TRANSACTION); +        afr_build_pending_matrix (priv->pending_key, sh->pending_matrix, +                                  sh->xattr, AFR_ENTRY_TRANSACTION, +                                  priv->child_count);          afr_sh_print_pending_matrix (sh->pending_matrix, this); -        nsources = afr_sh_mark_sources (sh, priv->child_count, -                                        AFR_SELF_HEAL_ENTRY); +        nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, +                                     priv->child_count, AFR_SELF_HEAL_ENTRY, +                                     sh->child_success, this->name);          if (nsources == 0) {                  gf_log (this->name, GF_LOG_TRACE, @@ -2176,9 +2178,6 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)                  return 0;          } -        afr_sh_supress_errenous_children (sh->sources, sh->child_errno, -                                          priv->child_count); -          source = afr_sh_select_source (sh->sources, priv->child_count);          sh->source = source; @@ -2211,6 +2210,8 @@ afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,                  if (op_ret != -1) {                          sh->xattr[child_index] = dict_ref (xattr);                          sh->buf[child_index] = *buf; +                        sh->child_success[sh->success_count] = child_index; +                        sh->success_count++;                  }          }          UNLOCK (&frame->lock); @@ -2235,9 +2236,11 @@ afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)          int ret = 0;          int call_count = 0;          int i = 0; +        afr_self_heal_t *sh = NULL;          priv  = this->private;          local = frame->local; +        sh = &local->self_heal;          call_count = afr_up_children_count (priv->child_count,                                              local->child_up); @@ -2257,6 +2260,9 @@ afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)                  }          } +        for (i = 0; i < priv->child_count; i++) +                sh->child_success[i] = -1; +        sh->success_count = 0;          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          STACK_WIND_COOKIE (frame, diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index a4f037c04..1214eefe2 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -475,17 +475,15 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)          sh = &local->self_heal;          priv = this->private; -        afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, -                                     priv->child_count, -                                     AFR_METADATA_TRANSACTION); +        afr_build_pending_matrix (priv->pending_key, sh->pending_matrix, +                                  sh->xattr, AFR_METADATA_TRANSACTION, +                                  priv->child_count);          afr_sh_print_pending_matrix (sh->pending_matrix, this); -        nsources = afr_sh_mark_sources (sh, priv->child_count, -                                        AFR_SELF_HEAL_METADATA); - -        afr_sh_supress_errenous_children (sh->sources, sh->child_errno, -                                          priv->child_count); +        nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, +                                     priv->child_count, AFR_SELF_HEAL_METADATA, +                                     sh->child_success, this->name);          if (nsources == 0) {                  gf_log (this->name, GF_LOG_TRACE, @@ -584,6 +582,8 @@ afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                          sh->buf[child_index] = *buf;                          if (xattr)                                  sh->xattr[child_index] = dict_ref (xattr); +                        sh->child_success[sh->success_count] = child_index; +                        sh->success_count++;                  } else {                          gf_log (this->name, GF_LOG_INFO,                                  "path %s on subvolume %s => -1 (%s)", @@ -614,9 +614,11 @@ afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)          int              call_count = 0;          dict_t          *xattr_req = NULL;          int              ret = 0; +        afr_self_heal_t *sh = NULL;          local = frame->local;          priv = this->private; +        sh = &local->self_heal;          call_count = afr_up_children_count (priv->child_count,                                              local->child_up); @@ -635,6 +637,9 @@ afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)                  }          } +        for (i = 0; i < priv->child_count; i++) +                sh->child_success[i] = -1; +        sh->success_count = 0;          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) {                          gf_log (this->name, GF_LOG_TRACE, diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 8bf484b48..00e9a1b1e 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -30,11 +30,11 @@  #define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size)  int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this);  int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this);  int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_sh_has_data_pending (dict_t *xattr, xlator_t *this);  int  afr_self_heal_entry (call_frame_t *frame, xlator_t *this); @@ -52,4 +52,11 @@ afr_self_heal_find_sources (xlator_t *this, afr_local_t *local, dict_t **xattr,  int  afr_self_heal (call_frame_t *frame, xlator_t *this); +gf_boolean_t +afr_is_fresh_read_child (int32_t *sources, int32_t child_count, +                         int32_t read_child); +int +afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, +                                          dict_t **xattr, +                                          afr_transaction_type txn_type);  #endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 6f40ded12..8f7f54faf 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -138,6 +138,10 @@ typedef struct {          /* array of xattr's, one for each child */          dict_t **xattr; +        /* array containing if the lookups succeeded in the order of response +         */ +        int32_t *child_success; +        int     success_count;          /* array of errno's, one for each child */          int *child_errno; @@ -341,14 +345,17 @@ typedef struct _afr_local {                  struct {                          inode_t *inode;                          struct iatt buf; -                        struct iatt read_child_buf;                          struct iatt postparent;                          ino_t ino;                          uint64_t gen;                          ino_t parent_ino; -                        dict_t *xattr;                          dict_t **xattrs; -                        gf_boolean_t is_revalidate; +                        dict_t *xattr; +                        struct iatt *postparents; +                        struct iatt *bufs; +                        int32_t read_child; +                        int32_t *child_success;//in the order of response +                        int32_t *sources;                  } lookup;                  struct { @@ -737,6 +744,9 @@ afr_build_parent_loc (loc_t *parent, loc_t *child);  int  afr_up_children_count (int child_count, unsigned char *child_up); +gf_boolean_t +afr_is_fresh_lookup (loc_t *loc, xlator_t *this); +  void  afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent);  | 
