From 84c8cc9c5936a2a7539f343c180f06312c8f6d39 Mon Sep 17 00:00:00 2001 From: Krutika Dhananjay Date: Wed, 14 Oct 2015 14:14:51 +0530 Subject: cluster/afr: Entry self-heal performance enhancements Change-Id: I52da41dff5619492b656c2217f4716a6cdadebe0 BUG: 1269461 Signed-off-by: Krutika Dhananjay Reviewed-on: http://review.gluster.org/12442 Reviewed-by: Pranith Kumar Karampuri NetBSD-regression: NetBSD Build System Smoke: Gluster Build System CentOS-regression: Gluster Build System --- xlators/cluster/afr/src/afr-self-heal-entry.c | 305 ++++++++++++++++++++++++-- 1 file changed, 289 insertions(+), 16 deletions(-) (limited to 'xlators/cluster/afr/src/afr-self-heal-entry.c') diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index fccffa7dbac..00af8e9f2e6 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -14,6 +14,7 @@ #include "byte-order.h" #include "afr-transaction.h" #include "afr-messages.h" +#include "syncop-utils.h" /* Max file name length is 255 this filename is of length 256. No file with * this name can ever come, entry-lock with this name is going to prevent @@ -349,6 +350,82 @@ __afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, return ret; } +static gf_boolean_t +is_full_heal_marker_present (xlator_t *this, dict_t *xdata, int idx) +{ + int i = 0; + int pending[3] = {0,}; + void *pending_raw = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + if (!xdata) + return _gf_false; + + /* Iterate over each of the priv->pending_keys[] elements and then + * see if any of them have data segment non-zero. If they do, return + * true. Else return false. + */ + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; + + if (!pending_raw) + continue; + + memcpy (pending, pending_raw, sizeof (pending)); + if (ntoh32 (pending[idx])) + return _gf_true; + } + + return _gf_false; +} + +static gf_boolean_t +afr_need_full_heal (xlator_t *this, struct afr_reply *replies, int source, + unsigned char *healed_sinks, afr_transaction_type type) +{ + int i = 0; + int idx = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + if (!priv->esh_granular) + return _gf_true; + + if (type != AFR_ENTRY_TRANSACTION) + return _gf_true; + + priv = this->private; + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + + /* If there is a clear source, check whether the full-heal-indicator + * is present in its xdata. Otherwise, we need to examine all the + * participating bricks and then figure if *even* one of them has a + * full-heal-indicator. + */ + + if (source != -1) { + if (is_full_heal_marker_present (this, replies[source].xdata, + idx)) + return _gf_true; + } + + /* else ..*/ + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + if (is_full_heal_marker_present (this, replies[i].xdata, idx)) + return _gf_true; + } + + return _gf_false; +} + static int __afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, unsigned char *healed_sinks, @@ -431,7 +508,8 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, static int afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, - fd_t *fd, char *name) + fd_t *fd, char *name, inode_t *parent_idx_inode, + xlator_t *subvol, gf_boolean_t full_crawl) { int ret = 0; int source = -1; @@ -486,10 +564,15 @@ afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode, source, sources, healed_sinks, locked_on, replies); + + if ((ret == 0) && (priv->esh_granular) && (!full_crawl)) + ret = afr_shd_index_purge (subvol, parent_idx_inode, + name); } + unlock: afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, - locked_on); + locked_on, NULL); if (inode) inode_unref (inode); if (replies) @@ -513,12 +596,16 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; afr_private_t *priv = NULL; gf_boolean_t mismatch = _gf_false; + afr_local_t *iter_local = NULL; + afr_local_t *local = NULL; priv = this->private; subvol = priv->children[child]; INIT_LIST_HEAD (&entries.list); + local = frame->local; + iter_frame = afr_copy_frame (frame); if (!iter_frame) return -ENOMEM; @@ -539,7 +626,9 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, continue; ret = afr_selfheal_entry_dirent (iter_frame, this, fd, - entry->d_name); + entry->d_name, NULL, + NULL, + local->need_full_crawl); AFR_STACK_RESET (iter_frame); if (iter_frame->local == NULL) { ret = -ENOTCONN; @@ -567,36 +656,210 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, return ret; } +static inode_t * +afr_shd_entry_changes_index_inode (xlator_t *this, xlator_t *subvol, + uuid_t pargfid) +{ + int ret = -1; + void *index_gfid = NULL; + loc_t rootloc = {0,}; + loc_t loc = {0,}; + dict_t *xattr = NULL; + inode_t *inode = NULL; + struct iatt iatt = {0,}; + + rootloc.inode = inode_ref (this->itable->root); + gf_uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr (xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid); + if (ret) { + errno = EINVAL; + goto out; + } + + loc.inode = inode_new (this->itable); + if (!loc.inode) { + errno = ENOMEM; + goto out; + } + + gf_uuid_copy (loc.pargfid, index_gfid); + loc.name = gf_strdup (uuid_utoa (pargfid)); + + ret = syncop_lookup (subvol, &loc, &iatt, NULL, NULL, NULL); + if (ret < 0) { + errno = -ret; + goto out; + } + + inode = inode_link (loc.inode, NULL, NULL, &iatt); + +out: + if (xattr) + dict_unref (xattr); + loc_wipe (&rootloc); + GF_FREE ((char *)loc.name); + loc_wipe (&loc); + + return inode; +} + +static int +afr_selfheal_entry_granular_dirent (xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data) +{ + int ret = 0; + loc_t loc = {0,}; + struct iatt iatt = {0,}; + afr_granular_esh_args_t *args = data; + + /* Look up the actual inode associated with entry. If the lookup returns + * ESTALE or ENOENT, then it means we have a stale index. Remove it. + * This is analogous to the check in afr_shd_index_heal() except that + * here it is achieved through LOOKUP and in afr_shd_index_heal() through + * a GETXATTR. + */ + + loc.inode = inode_new (args->xl->itable); + loc.parent = inode_ref (args->heal_fd->inode); + gf_uuid_copy (loc.pargfid, loc.parent->gfid); + loc.name = entry->d_name; + + ret = syncop_lookup (args->xl, &loc, &iatt, NULL, NULL, NULL); + if ((ret == -ENOENT) || (ret == -ESTALE)) { + afr_shd_index_purge (subvol, parent->inode, entry->d_name); + ret = 0; + goto out; + } + /* TBD: afr_shd_zero_xattrop? */ + + ret = afr_selfheal_entry_dirent (args->frame, args->xl, args->heal_fd, + entry->d_name, parent->inode, subvol, + _gf_false); + AFR_STACK_RESET (args->frame); + if (args->frame->local == NULL) + ret = -ENOTCONN; + + if (ret == -1) + args->mismatch = _gf_true; + +out: + loc_wipe (&loc); + return 0; +} + +static int +afr_selfheal_entry_granular (call_frame_t *frame, xlator_t *this, fd_t *fd, + int subvol_idx, gf_boolean_t is_src) +{ + int ret = 0; + loc_t loc = {0,}; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + afr_granular_esh_args_t args = {0,}; + + priv = this->private; + subvol = priv->children[subvol_idx]; + + args.frame = afr_copy_frame (frame); + args.xl = this; + /* args.heal_fd represents the fd associated with the original directory + * on which entry heal is being attempted. + */ + args.heal_fd = fd; + + /* @subvol here represents the subvolume of AFR where + * indices/entry-changes/ will be processed + */ + loc.inode = afr_shd_entry_changes_index_inode (this, subvol, + fd->inode->gfid); + if (!loc.inode) { + /* If granular heal failed on the sink (as it might sometimes + * because it is the src that would mostly contain the granular + * changelogs and the sink's entry-changes would be empty), + * do not treat heal as failure. + */ + if (is_src) + return -errno; + else + return 0; + } + + ret = syncop_dir_scan (subvol, &loc, GF_CLIENT_PID_SELF_HEALD, + &args, afr_selfheal_entry_granular_dirent); + + loc_wipe (&loc); + + if (args.mismatch == _gf_true) + ret = -1; + + return ret; +} + static int afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd, int source, unsigned char *sources, unsigned char *healed_sinks) { - int i = 0; - afr_private_t *priv = NULL; - gf_boolean_t mismatch = _gf_false; - int ret = 0; + int i = 0; + int ret = 0; + gf_boolean_t mismatch = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; priv = this->private; + local = frame->local; gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, "performing entry selfheal on %s", uuid_utoa (fd->inode->gfid)); for (i = 0; i < priv->child_count; i++) { + /* Expunge */ if (!healed_sinks[i]) continue; - ret = afr_selfheal_entry_do_subvol (frame, this, fd, i); + + if (!local->need_full_crawl) + /* Why call afr_selfheal_entry_granular() on a "healed sink", + * given that it is the source that contains the granular + * indices? + * If the index for this directory is non-existent or empty on + * this subvol (=> clear sink), then it will return early + * without failure status. + * If the index is non-empty and it is yet a 'healed sink', then + * it is due to a split-brain in which case we anyway need to + * crawl the indices/entry-changes/pargfid directory. + */ + ret = afr_selfheal_entry_granular (frame, this, fd, i, + _gf_false); + else + ret = afr_selfheal_entry_do_subvol (frame, this, fd, i); + if (ret == -1) { /* gfid or type mismatch. */ mismatch = _gf_true; ret = 0; } - if (ret) - break; + if (ret) + break; } - if (!ret && source != -1) - ret = afr_selfheal_entry_do_subvol (frame, this, fd, source); + + if (!ret && source != -1) { + /* Impunge */ + if (local->need_full_crawl) + ret = afr_selfheal_entry_do_subvol (frame, this, fd, + source); + else + ret = afr_selfheal_entry_granular (frame, this, fd, + source, _gf_true); + } if (mismatch == _gf_true) /* undo pending will be skipped */ @@ -616,10 +879,12 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, unsigned char *postop_lock = NULL; unsigned char *healed_sinks = NULL; struct afr_reply *locked_replies = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; gf_boolean_t did_sh = _gf_true; priv = this->private; + local = frame->local; sources = alloca0 (priv->child_count); sinks = alloca0 (priv->child_count); @@ -651,10 +916,16 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, did_sh = _gf_false; goto unlock; } + + local->need_full_crawl = afr_need_full_heal (this, + locked_replies, + source, + healed_sinks, + AFR_ENTRY_TRANSACTION); } unlock: afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, - data_lock); + data_lock, NULL); if (ret < 0) goto out; @@ -695,7 +966,7 @@ unlock: } postop_unlock: afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, - postop_lock); + postop_lock, NULL); out: if (did_sh) afr_log_selfheal (fd->inode->gfid, this, ret, "entry", source, @@ -796,10 +1067,12 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) } if (!granular_locks) afr_selfheal_unentrylk (frame, this, inode, this->name, - LONG_FILENAME, long_name_locked); + LONG_FILENAME, long_name_locked, + NULL); } unlock: - afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); + afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, + locked_on, NULL); if (fd) fd_unref (fd); -- cgit