diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-metadata.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 744 |
1 files changed, 505 insertions, 239 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 83628297fd1..03f43bad16e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -8,273 +8,539 @@ cases as published by the Free Software Foundation. */ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - #include "afr.h" #include "afr-self-heal.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> +#include "protocol-common.h" +#include <glusterfs/events.h> + +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE) + +static gf_boolean_t +_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata) +{ + return afr_is_xattr_ignorable(k); +} -#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) +void +afr_delete_ignorable_xattrs(dict_t *xattr) +{ + dict_foreach_match(xattr, _afr_ignorable_key_match, NULL, + dict_remove_foreach_fn, NULL); +} int -afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode, - int source, unsigned char *healed_sinks, - struct afr_reply *locked_replies) +__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - int ret = -1; - loc_t loc = {0,}; - dict_t *xattr = NULL; - dict_t *old_xattr = NULL; - afr_private_t *priv = NULL; - int i = 0; - - priv = this->private; - - loc.inode = inode_ref (inode); - uuid_copy (loc.gfid, inode->gfid); - - gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s", - uuid_utoa (inode->gfid)); - - ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL); - if (ret < 0) { - loc_wipe (&loc); - return -EIO; - } - - afr_filter_xattrs (xattr); - dict_del (xattr, GF_SELINUX_XATTR_KEY); - - for (i = 0; i < priv->child_count; i++) { - if (!healed_sinks[i]) - continue; - - ret = syncop_setattr (priv->children[i], &loc, - &locked_replies[source].poststat, - AFR_HEAL_ATTR, NULL, NULL); - if (ret) - healed_sinks[i] = 0; - - old_xattr = NULL; - ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0); - if (old_xattr) { - dict_del (old_xattr, GF_SELINUX_XATTR_KEY); - afr_filter_xattrs (old_xattr); - ret = syncop_removexattr (priv->children[i], &loc, "", - old_xattr); - } - - ret = syncop_setxattr (priv->children[i], &loc, xattr, 0); - if (ret) - healed_sinks[i] = 0; - } - - loc_wipe (&loc); - if (xattr) - dict_unref (xattr); - - return 0; + int ret = -1; + loc_t loc = { + 0, + }; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing metadata selfheal on %s", uuid_utoa(inode->gfid)); + + ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL, + NULL); + if (ret < 0) { + ret = -EIO; + goto out; + } + + afr_delete_ignorable_xattrs(xattr); + + for (i = 0; i < priv->child_count; i++) { + if (old_xattr) { + dict_unref(old_xattr); + old_xattr = NULL; + } + + if (!healed_sinks[i]) + continue; + + ret = syncop_setattr(priv->children[i], &loc, + &locked_replies[source].poststat, AFR_HEAL_ATTR, + NULL, NULL, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL, + NULL); + if (old_xattr) { + afr_delete_ignorable_xattrs(old_xattr); + ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr, + NULL); + if (ret) + healed_sinks[i] = 0; + } + + ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + } + ret = 0; + +out: + loc_wipe(&loc); + if (xattr) + dict_unref(xattr); + if (old_xattr) + dict_unref(old_xattr); + + return ret; } +static uint64_t +mtime_ns(struct iatt *ia) +{ + uint64_t ret; + + ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) + + (uint64_t)(ia->ia_mtime_nsec); + + return ret; +} /* - * Look for mismatching uid/gid or mode even if xattrs don't say so, and - * pick one arbitrarily as winner. + * When directory content is modified, [mc]time is updated. On + * Linux, the filesystem does it, while at least on NetBSD, the + * kernel file-system independent code does it. This means that + * when entries are added while bricks are down, the kernel sends + * a SETATTR [mc]time which will cause metadata split brain for + * the directory. In this case, clear the split brain by finding + * the source with the most recent modification date. */ - static int -__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources, - unsigned char *sinks, - unsigned char *locked_on, - struct afr_reply *replies) +afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + unsigned char *locked_on) { - int i = 0; - afr_private_t *priv = NULL; - struct iatt first = {0, }; - int source = -1; - int locked_count = 0; - int sources_count = 0; - int sinks_count = 0; - - priv = this->private; - - locked_count = AFR_COUNT (locked_on, priv->child_count); - sources_count = AFR_COUNT (sources, priv->child_count); - sinks_count = AFR_COUNT (sinks, priv->child_count); - - if (locked_count == sinks_count || !sources_count) { - if (!priv->metadata_splitbrain_forced_heal) { - return -EIO; - } - /* Metadata split brain, select one subvol - arbitrarily */ - for (i = 0; i < priv->child_count; i++) { - if (locked_on[i] && sinks[i]) { - sources[i] = 1; - sinks[i] = 0; - break; - } - } - } - - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (source == -1) { - source = i; - first = replies[i].poststat; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (!IA_EQUAL (first, replies[i].poststat, type) || - !IA_EQUAL (first, replies[i].poststat, uid) || - !IA_EQUAL (first, replies[i].poststat, gid) || - !IA_EQUAL (first, replies[i].poststat, prot)) { - sources[i] = 0; - sinks[i] = 1; - } - } - - return source; + afr_private_t *priv = NULL; + int source = -1; + struct iatt source_ia; + struct iatt child_ia; + uint64_t mtime = 0; + int i; + int ret = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + if (mtime_ns(&replies[i].poststat) <= mtime) + continue; + + mtime = mtime_ns(&replies[i].poststat); + source = i; + } + + if (source == -1) + goto out; + + source_ia = replies[source].poststat; + if (source_ia.ia_type != IA_IFDIR) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + + if (!replies[i].valid) + continue; + + if (replies[i].op_ret != 0) + continue; + + child_ia = replies[i].poststat; + + if (!IA_EQUAL(source_ia, child_ia, gfid) || + !IA_EQUAL(source_ia, child_ia, type) || + !IA_EQUAL(source_ia, child_ia, prot) || + !IA_EQUAL(source_ia, child_ia, uid) || + !IA_EQUAL(source_ia, child_ia, gid) || + !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) + goto out; + } + + /* + * Metadata split brain is just about [amc]time + * We return our source. + */ + ret = source; +out: + return ret; } - static int -__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode, - unsigned char *locked_on, unsigned char *sources, - unsigned char *sinks, unsigned char *healed_sinks, - struct afr_reply *replies) +__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, + struct afr_reply *replies, + unsigned char *sources) { - int ret = -1; - int source = -1; - afr_private_t *priv = NULL; - int i = 0; - - priv = this->private; - - ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, - replies); - if (ret) - return ret; - - ret = afr_selfheal_find_direction (frame, this, replies, - AFR_METADATA_TRANSACTION, - locked_on, sources, sinks); - if (ret) - return ret; - - source = __afr_selfheal_metadata_finalize_source (this, sources, sinks, - locked_on, replies); - if (source < 0) - return -EIO; - - for (i = 0; i < priv->child_count; i++) - /* Initialize the healed_sinks[] array optimistically to - the intersection of to-be-healed (i.e sinks[]) and - the list of servers which are up (i.e locked_on[]). - - As we encounter failures in the healing process, we - will unmark the respective servers in the healed_sinks[] - array. - */ - healed_sinks[i] = sinks[i] && locked_on[i]; - - return source; + int ret = 0; + int i = 0; + int m_idx = 0; + afr_private_t *priv = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + dict_t *xattr = NULL; + + priv = this->private; + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + raw[m_idx] = 1; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + ret = -1; + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO, + "Failed to set pending metadata xattr on child %d for %s", i, + uuid_utoa(inode->gfid)); + goto out; + } + } + + afr_replies_wipe(replies, priv->child_count); + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + +out: + if (xattr) + dict_unref(xattr); + return ret; } +/* + * Look for mismatching uid/gid or mode or user xattrs even if + * AFR xattrs don't say so, and pick one arbitrarily as winner. */ static int -__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, - unsigned char *locked_on) +__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - int ret = -1; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *data_lock = NULL; - unsigned char *healed_sinks = NULL; - struct afr_reply *locked_replies = NULL; - int source = -1; - - priv = this->private; - - sources = alloca0 (priv->child_count); - sinks = alloca0 (priv->child_count); - healed_sinks = alloca0 (priv->child_count); - data_lock = alloca0 (priv->child_count); - - locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); - - ret = afr_selfheal_inodelk (frame, this, inode, this->name, - LLONG_MAX - 1, 0, data_lock); - { - if (ret < 2) { - ret = -ENOTCONN; - goto unlock; - } - - ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock, - sources, sinks, healed_sinks, - locked_replies); - if (ret < 0) - goto unlock; - - source = ret; - ret = 0; - } -unlock: - afr_selfheal_uninodelk (frame, this, inode, this->name, - LLONG_MAX -1, 0, data_lock); - if (ret < 0) - goto out; - - ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks, - locked_replies); - if (ret) - goto out; - - ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks, - healed_sinks, AFR_METADATA_TRANSACTION, - locked_replies, data_lock); + int i = 0; + afr_private_t *priv = NULL; + struct iatt srcstat = { + 0, + }; + int source = -1; + int sources_count = 0; + int ret = 0; + + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_METADATA_TRANSACTION); + if (source >= 0) { + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_on, replies); + goto out; + } + + /* If this is a directory mtime/ctime only split brain + use the most recent */ + source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on); + if (source != -1) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN, + "clear time " + "split brain on %s", + uuid_utoa(replies[source].poststat.ia_gfid)); + sources[source] = 1; + healed_sinks[source] = 0; + goto out; + } + + if (!priv->metadata_splitbrain_forced_heal) { + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=metadata;file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; + } + + /* Metadata split brain, select one subvol + arbitrarily */ + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i] && healed_sinks[i]) { + sources[i] = 1; + healed_sinks[i] = 0; + break; + } + } + } + + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + source = afr_choose_source_by_policy(priv, sources, + AFR_METADATA_TRANSACTION); + srcstat = replies[source].poststat; + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!IA_EQUAL(srcstat, replies[i].poststat, type) || + !IA_EQUAL(srcstat, replies[i].poststat, uid) || + !IA_EQUAL(srcstat, replies[i].poststat, gid) || + !IA_EQUAL(srcstat, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "%s: iatt mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] || i == source) + continue; + if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) { + gf_msg_debug(this->name, 0, + "%s: xattr mismatch " + "for source(%d) vs (%d)", + uuid_utoa(replies[source].poststat.ia_gfid), source, + i); + sources[i] = 0; + healed_sinks[i] = 1; + } + } + if ((sources_count == priv->child_count) && (source > -1) && + (AFR_COUNT(healed_sinks, priv->child_count) != 0)) { + ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode, + replies, sources); + if (ret < 0) + return ret; + } out: - return ret; + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + return source; } +int +__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this, + inode_t *inode, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) +{ + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + uint64_t *witness = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + if (ret) + return ret; + + witness = alloca0(sizeof(*witness) * priv->child_count); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, locked_on, + sources, sinks, witness, pflag); + if (ret) + return ret; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + /* If any source has witness, pick first + * witness source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) { + source = i; + break; + } + } + + if (source != -1) { + for (i = 0; i < priv->child_count; i++) { + if (i != source && sources[i]) { + sources[i] = 0; + healed_sinks[i] = 1; + } + } + } + + source = __afr_selfheal_metadata_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + locked_on, replies); + + if (source < 0) + return -EIO; + + return source; +} int -afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode) +afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_private_t *priv = NULL; - unsigned char *locked_on = NULL; - int ret = 0; - - priv = this->private; - - locked_on = alloca0 (priv->child_count); - - ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, - locked_on); - { - if (ret < 2) { - /* Either less than two subvols available, or another - selfheal (from another server) is in progress. Skip - for now in any case there isn't anything to do. - */ - ret = -ENOTCONN; - goto unlock; - } - - ret = __afr_selfheal_metadata (frame, this, inode, locked_on); - } + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + gf_boolean_t did_sh = _gf_true; + int source = -1; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + data_lock = alloca0(priv->child_count); + + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + { + if (ret < priv->child_count) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata_prepare( + frame, this, inode, data_lock, sources, sinks, healed_sinks, + undid_pending, locked_replies, NULL); + if (ret < 0) + goto unlock; + + source = ret; + + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + did_sh = _gf_false; + goto unlock; + } + + ret = __afr_selfheal_metadata_do(frame, this, inode, source, + healed_sinks, locked_replies); + if (ret) + goto unlock; + + afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, + locked_replies); + + ret = afr_selfheal_undo_pending( + frame, this, inode, sources, sinks, healed_sinks, undid_pending, + AFR_METADATA_TRANSACTION, locked_replies, data_lock); + } unlock: - afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); + afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0, + data_lock); + + if (did_sh) + afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources, + healed_sinks); + else + ret = 1; + + if (locked_replies) + afr_replies_wipe(locked_replies, priv->child_count); + return ret; +} - return ret; +int +afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf) +{ + inode_t *inode = NULL; + inode_t *link_inode = NULL; + call_frame_t *frame = NULL; + int ret = 0; + + if (gf_uuid_is_null(stbuf->ia_gfid)) { + ret = -EINVAL; + goto out; + } + + inode = inode_new(this->itable); + if (!inode) { + ret = -ENOMEM; + goto out; + } + + link_inode = inode_link(inode, NULL, NULL, stbuf); + if (!link_inode) { + ret = -ENOMEM; + goto out; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = afr_selfheal_metadata(frame, this, link_inode); +out: + if (inode) + inode_unref(inode); + if (link_inode) + inode_unref(link_inode); + if (frame) + AFR_STACK_DESTROY(frame); + return ret; } |
