From ab2b2fefdcb10aa29749190b9e908b3bc81d86e5 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Mon, 18 Jul 2011 12:30:17 +0530 Subject: cluster/afr: Handle lookups when self-heal is off Change-Id: Ibc23fef417bcf613850e03dc4dadcc88f89e2b6f BUG: 2586 Reviewed-on: http://review.gluster.com/59 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/afr-common.c | 820 +++++++++++++---------- xlators/cluster/afr/src/afr-dir-read.c | 14 +- xlators/cluster/afr/src/afr-self-heal-common.c | 441 +++++++----- xlators/cluster/afr/src/afr-self-heal-common.h | 21 +- xlators/cluster/afr/src/afr-self-heal-data.c | 207 ++++-- xlators/cluster/afr/src/afr-self-heal-entry.c | 20 +- xlators/cluster/afr/src/afr-self-heal-metadata.c | 21 +- xlators/cluster/afr/src/afr-self-heal.h | 13 +- xlators/cluster/afr/src/afr.h | 16 +- 9 files changed, 983 insertions(+), 590 deletions(-) diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a435a38b1..f49d8c55e 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -328,6 +328,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) if (sh->linkname) GF_FREE ((char *)sh->linkname); + if (sh->child_success) + GF_FREE (sh->child_success); loc_wipe (&sh->parent_loc); } @@ -417,6 +419,18 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->cont.lookup.inode) { inode_unref (local->cont.lookup.inode); } + + if (local->cont.lookup.postparents) + GF_FREE (local->cont.lookup.postparents); + + if (local->cont.lookup.bufs) + GF_FREE (local->cont.lookup.bufs); + + if (local->cont.lookup.child_success) + GF_FREE (local->cont.lookup.child_success); + + if (local->cont.lookup.sources) + GF_FREE (local->cont.lookup.sources); } { /* getxattr */ @@ -509,6 +523,22 @@ afr_up_children_count (int child_count, unsigned char *child_up) return ret; } +gf_boolean_t +afr_is_fresh_lookup (loc_t *loc, xlator_t *this) +{ + uint64_t ctx = 0; + int32_t ret = 0; + + GF_ASSERT (loc); + GF_ASSERT (this); + GF_ASSERT (loc->inode); + + ret = inode_ctx_get (loc->inode, this, &ctx); + if (0 == ret) + return _gf_false; + return _gf_true; +} + void afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) { @@ -567,68 +597,96 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this) } AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, + local->cont.lookup.inode, &local->cont.lookup.buf, local->cont.lookup.xattr, &local->cont.lookup.postparent); return 0; } +void +afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) +{ + int32_t read_child = -1; + struct iatt *buf = NULL; + struct iatt *postparent = NULL; + dict_t **xattr = NULL; -static void -afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) + GF_ASSERT (local); + GF_ASSERT (local->cont.lookup.read_child >= 0); + + buf = &local->cont.lookup.buf; + postparent = &local->cont.lookup.postparent; + xattr = &local->cont.lookup.xattr; + + read_child = local->cont.lookup.read_child; + *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + *buf = local->cont.lookup.bufs[read_child]; + *postparent = local->cont.lookup.postparents[read_child]; + + if (IA_INVAL == local->cont.lookup.inode->ia_type) { + /* fix for RT #602 */ + local->cont.lookup.inode->ia_type = buf->ia_type; + } +} + + + static void +afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, + int child_index, dict_t *xattr) { uint32_t inodelk_count = 0; uint32_t entrylk_count = 0; - int ret = 0; + int ret = -1; + + GF_ASSERT (local); + GF_ASSERT (this); + GF_ASSERT (xattr); + GF_ASSERT (child_index >= 0); + + ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, + &inodelk_count); + if (ret == 0) + local->inodelk_count += inodelk_count; - if (afr_sh_has_metadata_pending (xattr, child_index, this)) { + ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, + &entrylk_count); + if (ret == 0) + local->entrylk_count += entrylk_count; +} + +static void +afr_lookup_detect_self_heal_by_xattr (afr_local_t *local, xlator_t *this, + dict_t *xattr) +{ + GF_ASSERT (local); + GF_ASSERT (this); + GF_ASSERT (xattr); + + if (afr_sh_has_metadata_pending (xattr, this)) { local->self_heal.need_metadata_self_heal = _gf_true; gf_log(this->name, GF_LOG_DEBUG, "metadata self-heal is pending for %s.", local->loc.path); } - if (afr_sh_has_entry_pending (xattr, child_index, this)) { + if (afr_sh_has_entry_pending (xattr, this)) { local->self_heal.need_entry_self_heal = _gf_true; gf_log(this->name, GF_LOG_DEBUG, "entry self-heal is pending for %s.", local->loc.path); } - if (afr_sh_has_data_pending (xattr, child_index, this)) { + if (afr_sh_has_data_pending (xattr, this)) { local->self_heal.need_data_self_heal = _gf_true; gf_log(this->name, GF_LOG_DEBUG, "data self-heal is pending for %s.", local->loc.path); } - - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; - - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; } - static void -afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local, +afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, struct iatt *buf, struct iatt *lookup_buf) { - if (FILETYPE_DIFFERS (buf, lookup_buf)) { - /* mismatching filetypes with same name - */ - - gf_log (this->name, GF_LOG_INFO, - "filetype differs for %s ", local->loc.path); - - local->govinda_gOvinda = 1; - } - if (PERMISSION_DIFFERS (buf, lookup_buf)) { /* mismatching permissions */ gf_log (this->name, GF_LOG_INFO, @@ -674,157 +732,299 @@ out: return valid; } -void -afr_lookup_set_read_child (xlator_t *this, afr_local_t *local) +static void +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) { - ia_type_t ia_type = IA_INVAL; - afr_transaction_type transaction_type = AFR_DATA_TRANSACTION; - afr_private_t *priv = NULL; - int32_t read_child = -1; - afr_self_heal_t *sh = NULL; + GF_ASSERT (local); + GF_ASSERT (this); - priv = this->private; - sh = &local->self_heal; + if ((local->success_count > 0) && (local->enoent_count > 0)) { + local->self_heal.need_metadata_self_heal = _gf_true; + local->self_heal.need_data_self_heal = _gf_true; + local->self_heal.need_entry_self_heal = _gf_true; + gf_log(this->name, GF_LOG_INFO, + "entries are missing in lookup of %s.", + local->loc.path); + //If all self-heals are needed no need to check for other rules + goto out; + } - ia_type = local->cont.lookup.inode->ia_type; - if (IA_ISREG (ia_type)) { - transaction_type = AFR_DATA_TRANSACTION; - } else if IA_ISDIR (ia_type) { - transaction_type = AFR_ENTRY_TRANSACTION; - } else { - transaction_type = AFR_METADATA_TRANSACTION; + if (local->success_count > 0) { + if (afr_is_split_brain (this, local->cont.lookup.inode) && + IA_ISREG (local->cont.lookup.inode->ia_type)) { + local->self_heal.need_data_self_heal = _gf_true; + gf_log (this->name, GF_LOG_WARNING, + "split brain detected during lookup of %s.", + local->loc.path); + } } - afr_self_heal_find_sources (this, local, - local->cont.lookup.xattrs, - transaction_type); - if (!sh->sources) - goto out; - read_child = local->read_child_index; - if (afr_is_valid_read_child (sh->sources, priv->child_count, - read_child)) - goto out; +out: + return; +} + +gf_boolean_t +afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) +{ + GF_ASSERT (sh); + GF_ASSERT (priv); + + return ((priv->data_self_heal && sh->need_data_self_heal) + || (priv->metadata_self_heal && sh->need_metadata_self_heal) + || (priv->entry_self_heal && sh->need_entry_self_heal)); +} - read_child = afr_read_child (this, local->loc.inode); - if (afr_is_valid_read_child (sh->sources, priv->child_count, - read_child)) +gf_boolean_t +afr_is_self_heal_enabled (afr_private_t *priv) +{ + GF_ASSERT (priv); + + return (priv->data_self_heal || priv->metadata_self_heal + || priv->entry_self_heal); +} + +int +afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, + int32_t *read_child) +{ + int32_t source = -1; + ia_type_t ia_type = 0; + int ret = -1; + afr_transaction_type type = AFR_METADATA_TRANSACTION; + dict_t **xattrs = NULL; + int32_t *child_success = NULL; + struct iatt *bufs = NULL; + + GF_ASSERT (local); + GF_ASSERT (this); + + bufs = local->cont.lookup.bufs; + child_success = local->cont.lookup.child_success; + ia_type = local->cont.lookup.bufs[child_success[0]].ia_type; + if (IA_ISDIR (ia_type)) { + type = AFR_ENTRY_TRANSACTION; + } else if (IA_ISREG (ia_type)) { + type = AFR_DATA_TRANSACTION; + } + xattrs = local->cont.lookup.xattrs; + source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, + type); + if (source < 0) goto out; - read_child = afr_sh_select_source (sh->sources, priv->child_count); + *read_child = source; + ret = 0; out: - if (read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - read_child); - } + return ret; +} + +static inline gf_boolean_t +afr_is_self_heal_running (afr_local_t *local) +{ + GF_ASSERT (local); + return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); } static void -afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf) +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, + gf_boolean_t is_background, ia_type_t ia_type, + int (*unwind) (call_frame_t *frame, xlator_t *this)) { - int unwind = 1; - int up_count = 0; - char sh_type_str[256] = {0,}; - afr_private_t *priv = NULL; afr_local_t *local = NULL; + char sh_type_str[256] = {0,}; + + GF_ASSERT (frame); + GF_ASSERT (this); - priv = this->private; local = frame->local; + local->self_heal.background = is_background; + local->self_heal.type = ia_type; + local->self_heal.unwind = unwind; - if (local->op_ret != 0) - goto unwind; + afr_self_heal_type_str_get (&local->self_heal, + sh_type_str, + sizeof (sh_type_str)); - local->cont.lookup.postparent.ia_ino = local->cont.lookup.parent_ino; + gf_log (this->name, GF_LOG_INFO, + "background %s self-heal triggered. path: %s", + sh_type_str, local->loc.path); - if (local->cont.lookup.ino) { - local->cont.lookup.buf.ia_ino = local->cont.lookup.ino; + afr_self_heal (frame, this); +} + +static void +afr_lookup_detect_self_heal (afr_local_t *local, xlator_t *this) +{ + int i = 0; + struct iatt *bufs = NULL; + dict_t **xattr = NULL; + afr_private_t *priv = NULL; + int32_t child1 = -1; + int32_t child2 = -1; + + afr_detect_self_heal_by_lookup_status (local, this); + + bufs = local->cont.lookup.bufs; + for (i = 1; i < local->success_count; i++) { + child1 = local->cont.lookup.child_success[i-1]; + child2 = local->cont.lookup.child_success[i];; + afr_detect_self_heal_by_iatt (local, this, + &bufs[child1], &bufs[child2]); } - if (local->op_ret == 0) { - /* KLUDGE: assuming DHT will not itransform in - revalidate */ - if (local->cont.lookup.inode->ino) { - local->cont.lookup.buf.ia_ino = - local->cont.lookup.inode->ino; - } + xattr = local->cont.lookup.xattrs; + priv = this->private; + for (i = 0; i < local->success_count; i++) { + child1 = local->cont.lookup.child_success[i];; + afr_lookup_detect_self_heal_by_xattr (local, this, + xattr[child1]); } +} + +static void +afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this, + gf_boolean_t *sh_launched) +{ + size_t up_count = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + GF_ASSERT (sh_launched); + *sh_launched = _gf_false; + priv = this->private; + local = frame->local; - afr_lookup_set_read_child (this, local); - up_count = afr_up_children_count (priv->child_count, priv->child_up); + up_count = afr_up_children_count (priv->child_count, local->child_up); if (up_count == 1) { gf_log (this->name, GF_LOG_DEBUG, "Only 1 child up - do not attempt to detect self heal"); + goto out; + } - goto unwind; + if (_gf_false == afr_is_self_heal_enabled (priv)) { + gf_log (this->name, GF_LOG_DEBUG, + "Self heal is not enabled"); + goto out; } - if (local->success_count && local->enoent_count) { - local->self_heal.need_metadata_self_heal = _gf_true; - local->self_heal.need_data_self_heal = _gf_true; - local->self_heal.need_entry_self_heal = _gf_true; - gf_log(this->name, GF_LOG_INFO, - "entries are missing in lookup of %s.", - local->loc.path); + afr_lookup_detect_self_heal (local, this); + if (afr_can_self_heal_proceed (&local->self_heal, priv)) { + if (afr_is_self_heal_running (local)) { + goto out; + } + + afr_launch_self_heal (frame, this, _gf_true, + local->cont.lookup.buf.ia_type, + afr_self_heal_lookup_unwind); + *sh_launched = _gf_true; } +out: + return; +} - if (local->success_count) { - /* check for split-brain case in previous lookup */ - if (afr_is_split_brain (this, local->cont.lookup.inode)) { - local->self_heal.need_data_self_heal = _gf_true; - gf_log(this->name, GF_LOG_WARNING, - "split brain detected during lookup of %s.", - local->loc.path); +static gf_boolean_t +afr_lookup_split_brain (afr_local_t *local, xlator_t *this) +{ + int i = 0; + gf_boolean_t symptom = _gf_false; + struct iatt *bufs = NULL; + int32_t *child_success = NULL; + struct iatt *child1 = NULL; + struct iatt *child2 = NULL; + const char *path = NULL; + + bufs = local->cont.lookup.bufs; + child_success = local->cont.lookup.child_success; + for (i = 1; i < local->success_count; i++) { + child1 = &bufs[child_success[i-1]]; + child2 = &bufs[child_success[i]]; + /* + * TODO: gfid self-heal + * if (uuid_compare (child1->ia_gfid, child2->ia_gfid)) { + * gf_log (this->name, GF_LOG_WARNING, "%s: gfid differs" + * " on subvolumes (%d, %d)", local->loc.path, + * child_success[i-1], child_success[i]); + * symptom = _gf_true; + * } + */ + + if (FILETYPE_DIFFERS (child1, child2)) { + path = local->loc.path; + gf_log (this->name, GF_LOG_WARNING, "%s: filetype " + "differs on subvolumes (%d, %d)", path, + child_success[i-1], child_success[i]); + symptom = _gf_true; + local->govinda_gOvinda = 1; } + if (symptom) + break; } + return symptom; +} - if (local->self_heal.need_metadata_self_heal - || local->self_heal.need_data_self_heal - || local->self_heal.need_entry_self_heal) - { +static int +afr_lookup_set_read_child (afr_local_t *local, xlator_t *this, int32_t read_child) +{ + GF_ASSERT (read_child >= 0); - if (local->inodelk_count || local->entrylk_count) { + afr_set_read_child (this, local->cont.lookup.inode, read_child); + local->cont.lookup.read_child = read_child; - /* Someone else is doing self-heal on this file. - return */ + return 0; +} - goto unwind; - } +static void +afr_lookup_done (call_frame_t *frame, xlator_t *this) +{ + int unwind = 1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = -1; + gf_boolean_t sh_launched = _gf_false; + int32_t read_child = -1; - if (!local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = - lookup_buf->ia_type; - } + priv = this->private; + local = frame->local; - local->self_heal.background = _gf_true; - local->self_heal.type = local->cont.lookup.buf.ia_type; - local->self_heal.unwind = afr_self_heal_lookup_unwind; + if (local->op_ret < 0) + goto unwind; - unwind = 0; + if (_gf_true == afr_lookup_split_brain (local, this)) { + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } - afr_self_heal_type_str_get(&local->self_heal, - sh_type_str, - sizeof(sh_type_str)); + ret = afr_lookup_select_read_child (local, this, &read_child); + if (ret) { + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } - gf_log (this->name, GF_LOG_INFO, - "background %s self-heal triggered. path: %s", - sh_type_str, local->loc.path); + ret = afr_lookup_set_read_child (local, this, read_child); + if (ret) + goto unwind; - afr_self_heal (frame, this); + afr_lookup_build_response_params (local, this); + if (afr_is_fresh_lookup (&local->loc, this)) { + afr_update_loc_gfids (&local->loc, &local->cont.lookup.buf, + &local->cont.lookup.postparent); } -unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, - local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); + afr_lookup_perform_self_heal_if_needed (frame, this, &sh_launched); + if (sh_launched) + unwind = 0; + unwind: + if (unwind) { + AFR_STACK_UNWIND (lookup, frame, local->op_ret, + local->op_errno, local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr, + &local->cont.lookup.postparent); } } - /* * During a lookup, some errors are more "important" than * others in that they must be given higher priority while @@ -850,259 +1050,169 @@ __error_more_important (int32_t old_errno, int32_t new_errno) return ret; } - -int -afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +static void +afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; - int call_count = -1; - int child_index = -1; - int first_up_child = -1; + GF_ASSERT (local); + if (op_errno == ENOENT) + local->enoent_count++; - child_index = (long) cookie; - priv = this->private; - - LOCK (&frame->lock); - { - local = frame->local; - - lookup_buf = &local->cont.lookup.buf; - - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; - - if (__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; - - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } - - goto unlock; - } - - afr_lookup_collect_xattr (local, this, child_index, xattr); - - first_up_child = afr_first_up_child (priv); - - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); - } - - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; - - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; - - if (priv->first_lookup && inode->ino == 1) { - gf_log (this->name, GF_LOG_INFO, - "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; - } - - *lookup_buf = *buf; - - uuid_copy (local->loc.gfid, buf->ia_gfid); - uuid_copy (local->loc.pargfid, - postparent->ia_gfid); - - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - //prefer fast child - local->read_child_index = child_index; - } - - } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); + if (__error_more_important (local->op_errno, op_errno)) + local->op_errno = op_errno; + if (local->op_errno == ESTALE) { + local->op_ret = -1; + } +} - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - if (child_index == local->read_child_index) { - /* - lookup has succeeded on the read child. - So use its inode number - */ - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); +static void +afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, + inode_t *inode) +{ + afr_private_t *priv = NULL; + GF_ASSERT (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; + if (inode->ino != 1) + goto out; + if (!afr_is_fresh_lookup (&local->loc, this)) + goto out; + priv = this->private; + if ((priv->first_lookup)) { + gf_log (this->name, GF_LOG_INFO, "added root inode"); + priv->root_inode = inode_ref (inode); + priv->first_lookup = 0; + } +out: + return; +} - *lookup_buf = *buf; +static void +afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, + struct iatt *buf, struct iatt *postparent) +{ + GF_ASSERT (child_index >= 0); + local->cont.lookup.xattrs[child_index] = dict_ref (xattr); + local->cont.lookup.postparents[child_index] = *postparent; + local->cont.lookup.bufs[child_index] = *buf; +} - uuid_copy (local->loc.gfid, buf->ia_gfid); - uuid_copy (local->loc.pargfid, - postparent->ia_gfid); - } +static void +afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, + inode_t *inode, struct iatt *buf) +{ + local->cont.lookup.inode = inode_ref (inode); + local->cont.lookup.buf = *buf; + afr_set_root_inode_on_first_lookup (local, this, inode); +} +static void +afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + if (local->success_count == 0) { + if (local->op_errno != ESTALE) { + local->op_ret = op_ret; + local->op_errno = 0; } - - local->success_count++; - } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); + afr_lookup_handle_first_success (local, this, inode, buf); } + afr_lookup_update_lk_counts (local, this, + child_index, xattr); - return 0; + afr_lookup_cache_args (local, child_index, xattr, + buf, postparent); + local->cont.lookup.child_success[local->success_count] = child_index; + local->success_count++; } - int -afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, dict_t *xattr, + struct iatt *postparent) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; - struct iatt * lookup_buf = NULL; int call_count = -1; int child_index = -1; - int first_up_child = -1; - child_index = (long) cookie; - priv = this->private; + child_index = (long) cookie; LOCK (&frame->lock); { local = frame->local; - lookup_buf = &local->cont.lookup.buf; - if (op_ret == -1) { - if (op_errno == ENOENT) - local->enoent_count++; - - if (__error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; - - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } - + afr_lookup_handle_error (local, op_ret, op_errno); goto unlock; } + afr_lookup_handle_success (local, this, child_index, op_ret, + op_errno, inode, buf, xattr, + postparent); - afr_lookup_collect_xattr (local, this, child_index, xattr); - - first_up_child = afr_first_up_child (priv); - - if (child_index == first_up_child) { - local->cont.lookup.ino = - afr_itransform (buf->ia_ino, - priv->child_count, - first_up_child); - } - - /* in case of revalidate, we need to send stat of the - * child whose stat was sent during the first lookup. - * (so that time stamp does not vary with revalidate. - * in case it is down, stat of the fist success will - * be replied */ - - /* inode number should be preserved across revalidates */ - - if (local->success_count == 0) { - if (local->op_errno != ESTALE) - local->op_ret = op_ret; - - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; - - *lookup_buf = *buf; - - lookup_buf->ia_ino = afr_itransform (buf->ia_ino, - priv->child_count, - child_index); - - if (priv->read_child >= 0) { - afr_set_read_child (this, - local->cont.lookup.inode, - priv->read_child); - } else { - afr_set_read_child (this, - local->cont.lookup.inode, - child_index); - } - - } else { - afr_lookup_self_heal_check (this, local, buf, lookup_buf); - - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - if (child_index == local->read_child_index) { + } +unlock: + UNLOCK (&frame->lock); - /* - lookup has succeeded on the read child. - So use its inode number - */ + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_done (frame, this); + } - if (local->cont.lookup.xattr) - dict_unref (local->cont.lookup.xattr); + return 0; +} - local->cont.lookup.xattr = dict_ref (xattr); - local->cont.lookup.postparent = *postparent; +int +afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +{ + int ret = -ENOMEM; + int32_t *child_success = NULL; + struct iatt *iatts = NULL; + int i = 0; - *lookup_buf = *buf; - } + GF_ASSERT (local); + local->cont.lookup.xattrs = GF_CALLOC (child_count, + sizeof (*local->cont.lookup.xattr), + gf_afr_mt_dict_t); + if (NULL == local->cont.lookup.xattrs) + goto out; - } + iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); + if (NULL == iatts) + goto out; + local->cont.lookup.postparents = iatts; - local->success_count++; - } -unlock: - UNLOCK (&frame->lock); + iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); + if (NULL == iatts) + goto out; + local->cont.lookup.bufs = iatts; - call_count = afr_frame_return (frame); + child_success = GF_CALLOC (child_count, sizeof (*child_success), + gf_afr_mt_char); + if (NULL == child_success) + goto out; + for (i = 0; i < child_count; i++) + child_success[i] = -1; - if (call_count == 0) { - afr_lookup_done (frame, this, lookup_buf); - } + local->cont.lookup.child_success = child_success; - return 0; + local->cont.lookup.read_child = -1; + ret = 0; +out: + return ret; } - int afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int ret = -1; - int i = 0; - fop_lookup_cbk_t callback = NULL; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = -1; + int i = 0; + int call_count = 0; + uint64_t ctx = 0; + int32_t op_errno = 0; priv = this->private; @@ -1123,14 +1233,9 @@ afr_lookup (call_frame_t *frame, xlator_t *this, if (ret == 0) { /* lookup is a revalidate */ - callback = afr_revalidate_lookup_cbk; - - local->cont.lookup.is_revalidate = _gf_true; local->read_child_index = afr_read_child (this, loc->inode); } else { - callback = afr_fresh_lookup_cbk; - LOCK (&priv->read_child_lock); { local->read_child_index = (++priv->read_child_rr) @@ -1143,10 +1248,16 @@ afr_lookup (call_frame_t *frame, xlator_t *this, local->cont.lookup.parent_ino = loc->parent->ino; local->child_up = memdup (priv->child_up, priv->child_count); + if (NULL == local->child_up) { + op_errno = ENOMEM; + goto out; + } - local->cont.lookup.xattrs = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); + ret = afr_lookup_cont_init (local, priv->child_count); + if (ret < 0) { + op_errno = -ret; + goto out; + } local->call_count = afr_up_children_count (priv->child_count, local->child_up); @@ -1192,7 +1303,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this, for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, callback, (void *) (long) i, + STACK_WIND_COOKIE (frame, afr_lookup_cbk, + (void *) (long) i, priv->children[i], priv->children[i]->fops->lookup, loc, local->xattr_req); diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index e626a6f1a..3f056b686 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -70,15 +70,19 @@ gf_boolean_t __checksums_differ (uint32_t *checksum, int child_count, unsigned char *child_up) { - int ret = _gf_false; - int i = 0; - uint32_t cksum = 0; - - cksum = checksum[0]; + int ret = _gf_false; + int i = 0; + uint32_t cksum = 0; + gf_boolean_t activate_check = _gf_false; for (i = 0; i < child_count; i++) { if (!child_up[i]) continue; + if (_gf_false == activate_check) { + cksum = checksum[i]; + activate_check = _gf_true; + continue; + } if (cksum != checksum[i]) { ret = _gf_true; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 4f50f3577..b29deb8bc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -70,23 +70,6 @@ afr_sh_source_count (int sources[], int child_count) return nsource; } - -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) { - if (child_errno[i] && sources[i]) { - sources[i] = 0; - } - } - - return 0; -} - - void afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) { @@ -113,11 +96,46 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) GF_FREE (buf); } +void +afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) +{ + int i = 0; + int j = 0; + + GF_ASSERT (pending_matrix); + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = 0; + } + } +} void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) +afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, + unsigned char *ignorant_subvols, + size_t child_count) +{ + int i = 0; + int j = 0; + + GF_ASSERT (pending_matrix); + GF_ASSERT (ignorant_subvols); + + for (i = 0; i < child_count; i++) { + if (ignorant_subvols[i]) { + for (j = 0; j < child_count; j++) { + if (!ignorant_subvols[j]) + pending_matrix[j][i] += 1; + } + } + } +} + +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + dict_t *xattr[], afr_transaction_type type, + size_t child_count) { /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ int32_t pending[3] = {0,}; @@ -130,19 +148,16 @@ afr_sh_build_pending_matrix (afr_private_t *priv, ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, gf_afr_mt_char); + if (NULL == ignorant_subvols) + goto out; - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } + afr_init_pending_matrix (pending_matrix, child_count); for (i = 0; i < child_count; i++) { pending_raw = NULL; for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], + ret = dict_get_ptr (xattr[i], pending_key[j], &pending_raw); if (ret != 0) { @@ -163,21 +178,12 @@ afr_sh_build_pending_matrix (afr_private_t *priv, } } - /* - * Make all non-ignorant subvols point towards the ignorant - * subvolumes. - */ - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j < child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } - + afr_mark_ignorant_subvols_as_pending (pending_matrix, + ignorant_subvols, + child_count); GF_FREE (ignorant_subvols); +out: + return ret; } @@ -208,7 +214,8 @@ afr_sh_build_pending_matrix (afr_private_t *priv, typedef enum { AFR_NODE_INNOCENT, AFR_NODE_FOOL, - AFR_NODE_WISE + AFR_NODE_WISE, + AFR_NODE_INVALID = -1, } afr_node_type; typedef struct { @@ -353,182 +360,276 @@ afr_sh_mark_wisest_as_sources (int sources[], return nsources; } - -static int -afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count) +static void +afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, + afr_node_character *characters, + int32_t child_count) { - int32_t ** pending_matrix = NULL; - int i = 0; - int j = 0; - int size_differs = 0; + int i = 0; + int j = 0; + int witness = 0; - pending_matrix = sh->pending_matrix; + GF_ASSERT (witnesses); + GF_ASSERT (pending_matrix); + GF_ASSERT (characters); + GF_ASSERT (child_count > 0); for (i = 0; i < child_count; i++) { + if (characters[i].type != AFR_NODE_FOOL) + continue; + + witness = 0; for (j = 0; j < child_count; j++) { - if (!sh->buf) - break; + if (i == j) + continue; + witness += pending_matrix[i][j]; + } + witnesses[i] = witness; + } +} - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j]) - && (pending_matrix[i][j] == 0) - && (pending_matrix[j][i] == 0)) { +static int32_t +afr_find_biggest_witness_among_fools (int32_t *witnesses, + afr_node_character *characters, + int32_t child_count) +{ + int i = 0; + int biggest_witness = -1; - pending_matrix[i][j] = 1; - pending_matrix[j][i] = 1; + GF_ASSERT (witnesses); + GF_ASSERT (characters); + GF_ASSERT (child_count > 0); - size_differs = 1; - } - } - } + for (i = 0; i < child_count; i++) { + if (characters[i].type != AFR_NODE_FOOL) + continue; - return size_differs; + if (biggest_witness < witnesses[i]) + biggest_witness = witnesses[i]; + } + return biggest_witness; } - -static int -afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh, +int +afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, afr_node_character *characters, - int child_count) + int32_t child_count, int32_t witness) { - int i = 0; - int biggest = 0; + int i = 0; + int nsources = 0; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_FOOL) { - biggest = i; - break; - } - } + GF_ASSERT (sources); + GF_ASSERT (witnesses); + GF_ASSERT (characters); + GF_ASSERT (child_count > 0); for (i = 0; i < child_count; i++) { if (characters[i].type != AFR_NODE_FOOL) continue; - if (!sh->buf) - break; - - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; + if (witness == witnesses[i]) { + sources[i] = 1; + nsources++; } } + return nsources; +} + +static int +afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, + afr_node_character *characters, + int child_count) +{ + int32_t biggest_witness = 0; + int nsources = 0; + int32_t *witnesses = NULL; - sh->sources[biggest] = 1; + GF_ASSERT (child_count > 0); - return 1; -} + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), + gf_afr_mt_int32_t); + if (NULL == witnesses) { + nsources = -1; + goto out; + } + afr_compute_witness_of_fools (witnesses, pending_matrix, characters, + child_count); + biggest_witness = afr_find_biggest_witness_among_fools (witnesses, + characters, + child_count); + nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, + characters, child_count, + biggest_witness); +out: + if (witnesses) + GF_FREE (witnesses); + return nsources; +} -static int -afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count) +int +afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, + int32_t *valid_children, int child_count, + uint32_t uid) { - int biggest = 0; - int i = 0; + int i = 0; + int nsources = 0; + int child = 0; + + GF_ASSERT (bufs); + GF_ASSERT (valid_children); + GF_ASSERT (sources); + GF_ASSERT (child_count > 0); for (i = 0; i < child_count; i++) { - if (!sh->buf) - break; + if (-1 == valid_children[i]) + continue; - if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) { - biggest = i; + child = valid_children[i]; + if (uid == bufs[child].ia_uid) { + sources[child] = 1; + nsources++; } } + return nsources; +} - sh->sources[biggest] = 1; +int +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, + int child_count) +{ + int i = 0; + int smallest = -1; + int child = 0; - return 1; -} + GF_ASSERT (bufs); + GF_ASSERT (valid_children); + GF_ASSERT (child_count > 0); + for (i = 0; i < child_count; i++) { + if (-1 == valid_children[i]) + continue; + child = valid_children[i]; + if ((smallest == -1) || + (bufs[child].ia_uid < bufs[smallest].ia_uid)) { + smallest = child; + } + } + return smallest; +} static int -afr_sh_mark_loweia_uid_as_source (afr_self_heal_t *sh, int child_count) +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, + int child_count, int32_t *sources) { - uid_t smallest = 0; - int i = 0; + int nsources = 0; + int smallest = 0; - for (i = 0; i < child_count; i++) { - if (!sh->buf) - break; - - if (sh->buf[i].ia_uid < sh->buf[smallest].ia_uid) { - smallest = i; - } + smallest = afr_get_child_with_lowest_uid (bufs, valid_children, + child_count); + if (smallest < 0) { + nsources = -1; + goto out; } + nsources = afr_mark_child_as_source_by_uid (sources, bufs, + valid_children, child_count, + bufs[smallest].ia_uid); +out: + return nsources; +} - sh->sources[smallest] = 1; +char * +afr_get_character_str (afr_node_type type) +{ + char *character = NULL; - return 1; + switch (type) { + case AFR_NODE_INNOCENT: + character = "innocent"; + break; + case AFR_NODE_FOOL: + character = "fool"; + break; + case AFR_NODE_WISE: + character = "wise"; + break; + default: + character = "invalid"; + break; + } + return character; } +afr_node_type +afr_find_child_character_type (int32_t *pending_row, int32_t child, + int32_t child_count, const char *xlator_name) +{ + afr_node_type type = AFR_NODE_INVALID; + + GF_ASSERT (pending_row); + GF_ASSERT (child_count > 0); + GF_ASSERT ((child >= 0) && (child < child_count)); + + if (afr_sh_is_innocent (pending_row, child_count)) + type = AFR_NODE_INNOCENT; + else if (afr_sh_is_fool (pending_row, child, child_count)) + type = AFR_NODE_FOOL; + else if (afr_sh_is_wise (pending_row, child, child_count)) + type = AFR_NODE_WISE; + else + GF_ASSERT (0); + + gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", + child, afr_get_character_str (type)); + return type; +} int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type) +afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, + int32_t child_count, afr_self_heal_type type, + int32_t *valid_children, const char *xlator_name) { /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; + afr_node_character *characters = NULL; int i = 0; - int32_t ** pending_matrix = NULL; - int * sources = NULL; - int size_differs = 0; - int nsources = 0; + int nsources = -1; xlator_t *this = NULL; - afr_private_t *priv = NULL; characters = GF_CALLOC (sizeof (afr_node_character), - child_count, - gf_afr_mt_afr_node_character) ; + child_count, gf_afr_mt_afr_node_character); if (!characters) goto out; this = THIS; - priv = this->private; - pending_matrix = sh->pending_matrix; - sources = sh->sources; /* start clean */ for (i = 0; i < child_count; i++) { sources[i] = 0; } + nsources = 0; for (i = 0; i < child_count; i++) { - if (afr_sh_is_innocent (pending_matrix[i], child_count)) { - characters[i].type = AFR_NODE_INNOCENT; - - } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_FOOL; - - } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) { - characters[i].type = AFR_NODE_WISE; - - } else { - gf_log (this->name, GF_LOG_CRITICAL, - "Could not determine the state of subvolume %s!" - " (This message should never appear." - " Please file a bug report to " - ".)", - priv->children[i]->name); - } - } - - if (type == AFR_SELF_HEAL_DATA) { - size_differs = afr_sh_mark_if_size_differs (sh, child_count); + characters[i].type = + afr_find_child_character_type (pending_matrix[i], i, + child_count, + xlator_name); + if (AFR_NODE_INVALID == characters[i].type) + gf_log (xlator_name, GF_LOG_WARNING, + "child %d had invalid xattrs", i); } if ((type == AFR_SELF_HEAL_METADATA) && afr_sh_all_nodes_innocent (characters, child_count)) { - nsources = afr_sh_mark_loweia_uid_as_source (sh, child_count); + nsources = afr_sh_mark_lowest_uid_as_source (bufs, + valid_children, + child_count, + sources); goto out; } - if (afr_sh_all_nodes_innocent (characters, child_count)) { - if (size_differs) { - nsources = afr_sh_mark_biggest_as_source (sh, - child_count); - } - - } else if (afr_sh_wise_nodes_exist (characters, child_count)) { + if (afr_sh_wise_nodes_exist (characters, child_count)) { afr_sh_compute_wisdom (pending_matrix, characters, child_count); if (afr_sh_wise_nodes_conflict (characters, child_count)) { @@ -536,7 +637,6 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, gf_log (this->name, GF_LOG_INFO, "split-brain possible, no source detected"); nsources = -1; - goto out; } else { nsources = afr_sh_mark_wisest_as_sources (sources, @@ -544,18 +644,26 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, child_count); } } else { - nsources = afr_sh_mark_biggest_fool_as_source (sh, characters, - child_count); + nsources = afr_mark_biggest_of_fools_as_source (sources, + pending_matrix, + characters, + child_count); } out: + if (nsources == 0) { + for (i = 0; i < child_count; i++) { + if (valid_children[i] != -1) + sources[valid_children[i]] = 1; + } + } if (characters) GF_FREE (characters); + gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); return nsources; } - void afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int32_t *delta_matrix[], int success[], @@ -643,7 +751,7 @@ afr_sh_delta_to_xattr (afr_private_t *priv, int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this) { /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ int32_t pending[3] = {0,}; @@ -674,7 +782,7 @@ afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_has_data_pending (dict_t *xattr, xlator_t *this) { /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ int32_t pending[3] = {0,}; @@ -705,7 +813,7 @@ afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this) { /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ int32_t pending[3] = {0,}; @@ -1655,6 +1763,9 @@ afr_self_heal (call_frame_t *frame, xlator_t *this) priv->child_count, gf_afr_mt_int32_t); } + sh->child_success = GF_CALLOC (sizeof (*sh->child_success), + priv->child_count, gf_afr_mt_int32_t); + FRAME_SU_DO (sh_frame, afr_local_t); if (local->success_count && local->enoent_count) { @@ -1688,3 +1799,25 @@ afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, snprintf(str + strlen(str), size - strlen(str), " entry"); } } + +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type) +{ + afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + + switch (type) { + case AFR_DATA_TRANSACTION: + sh_type = AFR_SELF_HEAL_DATA; + break; + case AFR_METADATA_TRANSACTION: + sh_type = AFR_SELF_HEAL_METADATA; + break; + case AFR_ENTRY_TRANSACTION: + sh_type = AFR_SELF_HEAL_ENTRY; + break; + case AFR_ENTRY_RENAME_TRANSACTION: + GF_ASSERT (0); + break; + } + return sh_type; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 87856761c..676e933ae 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -26,6 +26,7 @@ typedef enum { AFR_SELF_HEAL_ENTRY, AFR_SELF_HEAL_METADATA, AFR_SELF_HEAL_DATA, + AFR_SELF_HEAL_INVALID = -1, } afr_self_heal_type; int @@ -37,17 +38,13 @@ afr_sh_sink_count (int sources[], int child_count); int afr_sh_source_count (int sources[], int child_count); -int -afr_sh_supress_errenous_children (int sources[], int child_errno[], - int child_count); - void afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); -void -afr_sh_build_pending_matrix (afr_private_t *priv, - int32_t *pending_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); +int +afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + dict_t *xattr[], afr_transaction_type type, + size_t child_count); void afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, @@ -55,8 +52,9 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int child_count, afr_transaction_type type); int -afr_sh_mark_sources (afr_self_heal_t *sh, int child_count, - afr_self_heal_type type); +afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, + int32_t child_count, afr_self_heal_type type, + int32_t *valid_children, const char *xlator_name); int afr_sh_delta_to_xattr (afr_private_t *priv, @@ -70,4 +68,7 @@ void afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, size_t size); +afr_self_heal_type +afr_self_heal_type_for_transaction (afr_transaction_type type); + #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 0b9e4218c..38799db70 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -594,16 +594,15 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_DATA_TRANSACTION); + afr_build_pending_matrix (priv->pending_key, sh->pending_matrix, + sh->xattr, AFR_DATA_TRANSACTION, + priv->child_count); afr_sh_print_pending_matrix (sh->pending_matrix, this); - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_DATA); - - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); + nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, + priv->child_count, AFR_SELF_HEAL_DATA, + sh->child_success, this->name); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, @@ -692,53 +691,164 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) return 0; } +gf_boolean_t +afr_is_fresh_read_child (int32_t *sources, int32_t child_count, + int32_t read_child) +{ + gf_boolean_t is_fresh_child = _gf_false; + + GF_ASSERT (read_child < child_count); -void -afr_self_heal_find_sources (xlator_t *this, afr_local_t *local, dict_t **xattr, - afr_transaction_type transaction_type) + if ((read_child >= 0) && (read_child < child_count) && + sources[read_child]) { + is_fresh_child = _gf_true; + } + return is_fresh_child; +} + +static int +afr_select_read_child_from_policy (int32_t *sources, int32_t child_count, + int32_t prev_read_child, + int32_t config_read_child, + int32_t *valid_children) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; - afr_self_heal_type sh_type = AFR_SELF_HEAL_DATA; - int nsources = 0; + int32_t read_child = -1; + int i = 0; - sh = &local->self_heal; - priv = this->private; + GF_ASSERT (sources); - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); + read_child = prev_read_child; + if (_gf_true == afr_is_fresh_read_child (sources, child_count, + read_child)) + goto out; + + read_child = config_read_child; + if (_gf_true == afr_is_fresh_read_child (sources, child_count, + read_child)) + goto out; + + for (i = 0; i < child_count; i++) { + read_child = valid_children[i]; + if (read_child < 0) + break; + if (_gf_true == afr_is_fresh_read_child (sources, child_count, + read_child)) + goto out; } + read_child = -1; + +out: + return read_child; +} - sh->sources = GF_CALLOC (priv->child_count, sizeof (*sh->sources), - gf_afr_mt_int32_t); - - afr_sh_build_pending_matrix (priv, sh->pending_matrix, xattr, - priv->child_count, transaction_type); - - switch (transaction_type) { - case AFR_DATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_DATA; - break; - case AFR_ENTRY_TRANSACTION: - sh_type = AFR_SELF_HEAL_ENTRY; - break; - case AFR_METADATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_METADATA; - break; - default: - sh_type = AFR_SELF_HEAL_METADATA; - break; +static void +afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count) +{ + int i = 0; + GF_ASSERT (child_count > 0); + if (pending_matrix) { + for (i = 0; i < child_count; i++) { + if (pending_matrix[i]) + GF_FREE (pending_matrix[i]); + } + GF_FREE (pending_matrix); } - nsources = afr_sh_mark_sources (sh, priv->child_count, sh_type); - if (nsources == 0) { - for (i = 0; i < priv->child_count; i++) - sh->sources[i] = 1; +} + +static int32_t** +afr_create_pending_matrix (int32_t child_count) +{ + gf_boolean_t cleanup = _gf_false; + int32_t **pending_matrix = NULL; + int i = 0; + + GF_ASSERT (child_count > 0); + + pending_matrix = GF_CALLOC (sizeof (*pending_matrix), child_count, + gf_afr_mt_int32_t); + if (NULL == pending_matrix) + goto out; + for (i = 0; i < child_count; i++) { + pending_matrix[i] = GF_CALLOC (sizeof (**pending_matrix), + child_count, + gf_afr_mt_int32_t); + if (NULL == pending_matrix[i]) { + cleanup = _gf_true; + goto out; + } + } +out: + if (_gf_true == cleanup) { + afr_destroy_pending_matrix (pending_matrix, child_count); + pending_matrix = NULL; + } + return pending_matrix; +} + +int +afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, + dict_t **xattr, + afr_transaction_type txn_type) +{ + afr_private_t *priv = NULL; + int read_child = -1; + int ret = -1; + afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; + int32_t **pending_matrix = NULL; + int32_t *sources = NULL; + int32_t *valid_children = NULL; + struct iatt *bufs = NULL; + int32_t nsources = 0; + int32_t prev_read_child = -1; + int32_t config_read_child = -1; + afr_self_heal_t *sh = NULL; + + priv = this->private; + bufs = local->cont.lookup.bufs; + valid_children = local->cont.lookup.child_success; + sh = &local->self_heal; + + pending_matrix = afr_create_pending_matrix (priv->child_count); + if (NULL == pending_matrix) + goto out; + + sources = GF_CALLOC (sizeof (*sources), priv->child_count, + gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + + afr_build_pending_matrix (priv->pending_key, pending_matrix, + xattr, txn_type, priv->child_count); + + sh_type = afr_self_heal_type_for_transaction (txn_type); + if (AFR_SELF_HEAL_INVALID == sh_type) + goto out; + + nsources = afr_mark_sources (sources, pending_matrix, bufs, + priv->child_count, sh_type, + valid_children, this->name); + if (nsources < 0) { + ret = -1; + goto out; + } + + prev_read_child = local->read_child_index; + config_read_child = priv->read_child; + read_child = afr_select_read_child_from_policy (sources, + priv->child_count, + prev_read_child, + config_read_child, + valid_children); + ret = 0; + local->cont.lookup.sources = sources; +out: + afr_destroy_pending_matrix (pending_matrix, priv->child_count); + if (-1 == ret) { + if (sources) + GF_FREE (sources); } + gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child); + return read_child; } @@ -766,6 +876,8 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, priv->children[child_index]->name); sh->buf[child_index] = *buf; + sh->child_success[sh->success_count] = child_index; + sh->success_count++; } } UNLOCK (&frame->lock); @@ -798,6 +910,9 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) + sh->child_success[i] = -1; + sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 0504da17c..8c619ff45 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -2159,13 +2159,15 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) goto heal; } - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, AFR_ENTRY_TRANSACTION); + afr_build_pending_matrix (priv->pending_key, sh->pending_matrix, + sh->xattr, AFR_ENTRY_TRANSACTION, + priv->child_count); afr_sh_print_pending_matrix (sh->pending_matrix, this); - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_ENTRY); + nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, + priv->child_count, AFR_SELF_HEAL_ENTRY, + sh->child_success, this->name); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, @@ -2176,9 +2178,6 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) return 0; } - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); - source = afr_sh_select_source (sh->sources, priv->child_count); sh->source = source; @@ -2211,6 +2210,8 @@ afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, if (op_ret != -1) { sh->xattr[child_index] = dict_ref (xattr); sh->buf[child_index] = *buf; + sh->child_success[sh->success_count] = child_index; + sh->success_count++; } } UNLOCK (&frame->lock); @@ -2235,9 +2236,11 @@ afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) int ret = 0; int call_count = 0; int i = 0; + afr_self_heal_t *sh = NULL; priv = this->private; local = frame->local; + sh = &local->self_heal; call_count = afr_up_children_count (priv->child_count, local->child_up); @@ -2257,6 +2260,9 @@ afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) } } + for (i = 0; i < priv->child_count; i++) + sh->child_success[i] = -1; + sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index a4f037c04..1214eefe2 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -475,17 +475,15 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr, - priv->child_count, - AFR_METADATA_TRANSACTION); + afr_build_pending_matrix (priv->pending_key, sh->pending_matrix, + sh->xattr, AFR_METADATA_TRANSACTION, + priv->child_count); afr_sh_print_pending_matrix (sh->pending_matrix, this); - nsources = afr_sh_mark_sources (sh, priv->child_count, - AFR_SELF_HEAL_METADATA); - - afr_sh_supress_errenous_children (sh->sources, sh->child_errno, - priv->child_count); + nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, + priv->child_count, AFR_SELF_HEAL_METADATA, + sh->child_success, this->name); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, @@ -584,6 +582,8 @@ afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, sh->buf[child_index] = *buf; if (xattr) sh->xattr[child_index] = dict_ref (xattr); + sh->child_success[sh->success_count] = child_index; + sh->success_count++; } else { gf_log (this->name, GF_LOG_INFO, "path %s on subvolume %s => -1 (%s)", @@ -614,9 +614,11 @@ afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) int call_count = 0; dict_t *xattr_req = NULL; int ret = 0; + afr_self_heal_t *sh = NULL; local = frame->local; priv = this->private; + sh = &local->self_heal; call_count = afr_up_children_count (priv->child_count, local->child_up); @@ -635,6 +637,9 @@ afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) } } + for (i = 0; i < priv->child_count; i++) + sh->child_success[i] = -1; + sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { gf_log (this->name, GF_LOG_TRACE, diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 8bf484b48..00e9a1b1e 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -30,11 +30,11 @@ #define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) int -afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this); int -afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this); int -afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); +afr_sh_has_data_pending (dict_t *xattr, xlator_t *this); int afr_self_heal_entry (call_frame_t *frame, xlator_t *this); @@ -52,4 +52,11 @@ afr_self_heal_find_sources (xlator_t *this, afr_local_t *local, dict_t **xattr, int afr_self_heal (call_frame_t *frame, xlator_t *this); +gf_boolean_t +afr_is_fresh_read_child (int32_t *sources, int32_t child_count, + int32_t read_child); +int +afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, + dict_t **xattr, + afr_transaction_type txn_type); #endif /* __AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 6f40ded12..8f7f54faf 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -138,6 +138,10 @@ typedef struct { /* array of xattr's, one for each child */ dict_t **xattr; + /* array containing if the lookups succeeded in the order of response + */ + int32_t *child_success; + int success_count; /* array of errno's, one for each child */ int *child_errno; @@ -341,14 +345,17 @@ typedef struct _afr_local { struct { inode_t *inode; struct iatt buf; - struct iatt read_child_buf; struct iatt postparent; ino_t ino; uint64_t gen; ino_t parent_ino; - dict_t *xattr; dict_t **xattrs; - gf_boolean_t is_revalidate; + dict_t *xattr; + struct iatt *postparents; + struct iatt *bufs; + int32_t read_child; + int32_t *child_success;//in the order of response + int32_t *sources; } lookup; struct { @@ -737,6 +744,9 @@ afr_build_parent_loc (loc_t *parent, loc_t *child); int afr_up_children_count (int child_count, unsigned char *child_up); +gf_boolean_t +afr_is_fresh_lookup (loc_t *loc, xlator_t *this); + void afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); -- cgit