/*
  Copyright (c) 2013 Red Hat, Inc.
  This file is part of GlusterFS.

  This file is licensed to you under your choice of the GNU Lesser
  General Public License, version 3 or any later version (LGPLv3 or
  later), or the GNU General Public License, version 2 (GPLv2), in all
  cases as published by the Free Software Foundation.
*/

#include "afr.h"
#include "afr-self-heal.h"
#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
#include "afr-messages.h"
#include <glusterfs/syncop-utils.h>
#include <glusterfs/events.h>

static int
afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
                          inode_t *inode, int child, struct afr_reply *replies)
{
    afr_private_t *priv = NULL;
    xlator_t *subvol = NULL;
    int ret = 0;
    loc_t loc = {
        0,
    };
    char g[64];

    priv = this->private;

    subvol = priv->children[child];

    loc.parent = inode_ref(dir);
    gf_uuid_copy(loc.pargfid, dir->gfid);
    loc.name = name;
    loc.inode = inode_ref(inode);

    if (replies[child].valid && replies[child].op_ret == 0) {
        switch (replies[child].poststat.ia_type) {
            case IA_IFDIR:
                gf_msg(this->name, GF_LOG_WARNING, 0,
                       AFR_MSG_EXPUNGING_FILE_OR_DIR,
                       "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid),
                       name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
                       subvol->name);
                ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
                break;
            default:
                gf_msg(this->name, GF_LOG_WARNING, 0,
                       AFR_MSG_EXPUNGING_FILE_OR_DIR,
                       "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
                       name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
                       subvol->name);
                ret = syncop_unlink(subvol, &loc, NULL, NULL);
                break;
        }
    }

    loc_wipe(&loc);
    return ret;
}
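
/* Re-create the entry 'name' on the sink brick 'dst' so that it matches the
 * 'source' brick's version: any mismatching entry on the sink is deleted
 * first, and the entry is then re-created as a directory, symlink, hard link
 * or (for other file types) via mknod, with "gfid-req" set to the source's
 * gfid.
 */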

int
afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
                            unsigned char *sources, inode_t *dir,
                            const char *name, inode_t *inode,
                            struct afr_reply *replies)
{
    int ret = 0;
    loc_t loc = {
        0,
    };
    loc_t srcloc = {
        0,
    };
    xlator_t *this = frame->this;
    afr_private_t *priv = NULL;
    dict_t *xdata = NULL;
    struct iatt *iatt = NULL;
    char *linkname = NULL;
    mode_t mode = 0;
    struct iatt newent = {
        0,
    };
    unsigned char *newentry = NULL;

    priv = this->private;
    iatt = &replies[source].poststat;
    if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
               "Invalid ia_type (%d) or gfid(%s). source brick=%d, "
               "pargfid=%s, name=%s", iatt->ia_type, uuid_utoa(iatt->ia_gfid),
               source, uuid_utoa(dir->gfid), name);
        ret = -EINVAL;
        goto out;
    }

    xdata = dict_new();
    if (!xdata)
        return -ENOMEM;

    newentry = alloca0(priv->child_count);
    loc.parent = inode_ref(dir);
    gf_uuid_copy(loc.pargfid, dir->gfid);
    loc.name = name;
    loc.inode = inode_ref(inode);

    ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies);
    if (ret)
        goto out;

    ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid,
                          true);
    if (ret)
        goto out;

    srcloc.inode = inode_ref(inode);
    gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
    if (iatt->ia_type != IA_IFDIR)
        ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
    if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) {
        newentry[dst] = 1;
        ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
                                         sources, newentry);
        if (ret)
            goto out;
    }

    mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);

    switch (iatt->ia_type) {
        case IA_IFDIR:
            ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata,
                               NULL);
            break;
        case IA_IFLNK:
            if (!newentry[dst]) {
                ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent,
                                  NULL, NULL);
            } else {
                ret = syncop_readlink(priv->children[source], &srcloc,
                                      &linkname, 4096, NULL, NULL);
                if (ret <= 0)
                    goto out;
                ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL,
                                     xdata, NULL);
            }
            break;
        default:
            ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
            if (ret)
                goto out;
            ret = syncop_mknod(priv->children[dst], &loc, mode,
                               makedev(ia_major(iatt->ia_rdev),
                                       ia_minor(iatt->ia_rdev)),
                               &newent, xdata, NULL);
            break;
    }

out:
    if (xdata)
        dict_unref(xdata);

    GF_FREE(linkname);

    loc_wipe(&loc);
    loc_wipe(&srcloc);

    return ret;
}
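
/* Heal one directory entry when a definite source brick exists: if the entry
 * is absent on the source (lookup returned ENOENT), expunge it from every
 * healed sink; otherwise re-create it on every healed sink whose gfid does
 * not match the source's.
 */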

static int
__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
                           char *name, inode_t *inode, int source,
                           unsigned char *sources, unsigned char *healed_sinks,
                           unsigned char *locked_on, struct afr_reply *replies)
{
    int ret = 0;
    afr_private_t *priv = NULL;
    int i = 0;

    priv = this->private;

    if (!replies[source].valid)
        return -EIO;

    /* Skip healing this entry if the last lookup on it failed for reasons
     * other than ENOENT.
     */
    if ((replies[source].op_ret < 0) && (replies[source].op_errno != ENOENT))
        return -replies[source].op_errno;

    if (replies[source].op_ret == 0) {
        ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
                                       source, sources,
                                       &replies[source].poststat.ia_gfid,
                                       NULL);
        if (ret)
            return ret;
    }

    for (i = 0; i < priv->child_count; i++) {
        if (!healed_sinks[i])
            continue;
        if (replies[source].op_ret == -1 &&
            replies[source].op_errno == ENOENT) {
            ret = afr_selfheal_entry_delete(this, fd->inode, name, inode, i,
                                            replies);
        } else {
            if (!gf_uuid_compare(replies[i].poststat.ia_gfid,
                                 replies[source].poststat.ia_gfid))
                continue;

            ret = afr_selfheal_recreate_entry(frame, i, source, sources,
                                              fd->inode, name, inode, replies);
        }
        if (ret < 0)
            break;
    }

    return ret;
}

static int
afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
                                           struct afr_reply *replies,
                                           inode_t *inode, uuid_t pargfid,
                                           char *bname, int src_idx,
                                           unsigned char *locked_on, int *src)
{
    int i = 0;
    int ret = -1;
    afr_private_t *priv = NULL;
    void *gfid = NULL;
    ia_type_t ia_type = IA_INVAL;

    priv = this->private;
    gfid = &replies[src_idx].poststat.ia_gfid;
    ia_type = replies[src_idx].poststat.ia_type;

    for (i = 0; i < priv->child_count; i++) {
        if (i == src_idx)
            continue;

        if (!replies[i].valid)
            continue;

        if (replies[i].op_ret != 0)
            continue;

        if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) &&
            (ia_type == replies[i].poststat.ia_type)) {
            ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
                                              bname, src_idx, i, locked_on,
                                              src, NULL);
            if (ret)
                gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
                       "Skipping conservative merge on the file.");
            return ret;
        }

        if (ia_type != replies[i].poststat.ia_type) {
            gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
                   "Type mismatch detected for <gfid:%s>/%s>, %s on %s and "
                   "%s on %s. Skipping conservative merge on the file.",
                   uuid_utoa(pargfid), bname,
                   gf_inode_type_to_str(replies[i].poststat.ia_type),
                   priv->children[i]->name,
                   gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
                   priv->children[src_idx]->name);
            gf_event(EVENT_AFR_SPLIT_BRAIN,
                     "client-pid=%d;subvol=%s;type=file;"
                     "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-%d=%s;"
                     "child-%d=%s;type-%d=%s",
                     this->ctx->cmd_args.client_pid, this->name,
                     uuid_utoa(pargfid), bname, i, priv->children[i]->name, i,
                     gf_inode_type_to_str(replies[i].poststat.ia_type),
                     src_idx, priv->children[src_idx]->name, src_idx,
                     gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
            return -1;
        }
    }

    return 0;
}
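
/* Conservative-merge path used when no single source could be resolved: any
 * brick with a successful lookup is treated as a source, gfid/type mismatches
 * are checked (resolving gfid split-brain where possible), and the entry is
 * then re-created on the bricks where it is missing or has a divergent gfid.
 */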

static int
__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
                            char *name, inode_t *inode, unsigned char *sources,
                            unsigned char *healed_sinks,
                            unsigned char *locked_on,
                            struct afr_reply *replies)
{
    int ret = 0;
    int i = 0;
    int source = -1;
    int src = -1;
    afr_private_t *priv = NULL;

    priv = this->private;

    for (i = 0; i < priv->child_count; i++) {
        if (replies[i].valid && replies[i].op_ret == 0) {
            source = i;
            break;
        }
    }

    if (source == -1) {
        /* entry got deleted in the mean time? */
        return 0;
    }

    /* Set all the sources as 1, otherwise newentry_mark won't be set */
    for (i = 0; i < priv->child_count; i++) {
        if (replies[i].valid && replies[i].op_ret == 0) {
            sources[i] = 1;
        }
    }

    ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
                                   source, sources,
                                   &replies[source].poststat.ia_gfid, NULL);
    if (ret)
        return ret;

    /* In case of type mismatch / unable to resolve gfid mismatch on the
     * entry, return -1.*/
    ret = afr_selfheal_detect_gfid_and_type_mismatch(
        this, replies, inode, fd->inode->gfid, name, source, locked_on, &src);

    if (ret < 0)
        return ret;

    if (src != -1) {
        source = src;
        for (i = 0; i < priv->child_count; i++) {
            if (i != src && replies[i].valid &&
                gf_uuid_compare(replies[src].poststat.ia_gfid,
                                replies[i].poststat.ia_gfid)) {
                sources[i] = 0;
            }
        }
    }

    for (i = 0; i < priv->child_count; i++) {
        if (i == source || !healed_sinks[i])
            continue;

        if (src != -1) {
            if (!gf_uuid_compare(replies[src].poststat.ia_gfid,
                                 replies[i].poststat.ia_gfid))
                continue;
        } else if (replies[i].op_errno != ENOENT) {
            continue;
        }

        ret |= afr_selfheal_recreate_entry(frame, i, source, sources,
                                           fd->inode, name, inode, replies);
    }

    return ret;
}

static int
__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
                            char *name, inode_t *inode, int source,
                            unsigned char *sources,
                            unsigned char *healed_sinks,
                            unsigned char *locked_on,
                            struct afr_reply *replies)
{
    int ret = -1;

    if (source < 0)
        ret = __afr_selfheal_merge_dirent(frame, this, fd, name, inode,
                                          sources, healed_sinks, locked_on,
                                          replies);
    else
        ret = __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source,
                                         sources, healed_sinks, locked_on,
                                         replies);

    return ret;
}

static gf_boolean_t
is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx)
{
    int i = 0;
    int pending[3] = {
        0,
    };
    void *pending_raw = NULL;
    afr_private_t *priv = NULL;

    priv = this->private;

    if (!xdata)
        return _gf_false;

    /* Iterate over each of the priv->pending_keys[] elements and then
     * see if any of them have data segment non-zero. If they do, return
     * true. Else return false.
     */
    for (i = 0; i < priv->child_count; i++) {
        if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
            continue;

        if (!pending_raw)
            continue;

        memcpy(pending, pending_raw, sizeof(pending));
        if (ntoh32(pending[idx]))
            return _gf_true;
    }

    return _gf_false;
}
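
/* Decide whether this directory needs a full readdir-based crawl or whether
 * the granular entry-changes indices are sufficient: a full heal is needed
 * when granular entry self-heal is disabled, or when any participating brick
 * carries a non-zero "full heal" marker at the data-transaction index of its
 * pending xattrs (see is_full_heal_marker_present() above).
 */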

static gf_boolean_t
afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source,
                   unsigned char *healed_sinks, afr_transaction_type type)
{
    int i = 0;
    int idx = 0;
    afr_private_t *priv = NULL;

    priv = this->private;

    if (!priv->esh_granular)
        return _gf_true;

    if (type != AFR_ENTRY_TRANSACTION)
        return _gf_true;

    priv = this->private;
    idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);

    /* If there is a clear source, check whether the full-heal-indicator
     * is present in its xdata. Otherwise, we need to examine all the
     * participating bricks and then figure if *even* one of them has a
     * full-heal-indicator.
     */
    if (source != -1) {
        if (is_full_heal_marker_present(this, replies[source].xdata, idx))
            return _gf_true;
    }

    /* else ..*/

    for (i = 0; i < priv->child_count; i++) {
        if (!healed_sinks[i])
            continue;

        if (is_full_heal_marker_present(this, replies[i].xdata, idx))
            return _gf_true;
    }

    return _gf_false;
}

static int
__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
                                     unsigned char *healed_sinks,
                                     unsigned char *locked_on,
                                     struct afr_reply *replies,
                                     uint64_t *witness)
{
    afr_private_t *priv = NULL;
    int source = -1;
    int sources_count = 0;

    priv = this->private;

    sources_count = AFR_COUNT(sources, priv->child_count);

    if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
        !sources_count || afr_does_witness_exist(this, witness)) {
        memset(sources, 0, sizeof(*sources) * priv->child_count);
        afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
        return -1;
    }

    source = afr_choose_source_by_policy(priv, sources,
                                         AFR_ENTRY_TRANSACTION);

    return source;
}
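
/* Inspect the directory on all bricks and work out sources, sinks and
 * healed_sinks for the entry transaction; *source_p is set to the chosen
 * source brick, or to -1 when no source can be finalized (in which case the
 * caller falls back to a conservative merge of the entries).
 */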

int
__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
                             inode_t *inode, unsigned char *locked_on,
                             unsigned char *sources, unsigned char *sinks,
                             unsigned char *healed_sinks,
                             struct afr_reply *replies, int *source_p,
                             unsigned char *pflag)
{
    int ret = -1;
    int source = -1;
    afr_private_t *priv = NULL;
    uint64_t *witness = NULL;

    priv = this->private;

    ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
    if (ret)
        return ret;

    witness = alloca0(sizeof(*witness) * priv->child_count);
    ret = afr_selfheal_find_direction(frame, this, replies,
                                      AFR_ENTRY_TRANSACTION, locked_on,
                                      sources, sinks, witness, pflag);
    if (ret)
        return ret;

    /* Initialize the healed_sinks[] array optimistically to
       the intersection of to-be-healed (i.e sinks[]) and
       the list of servers which are up (i.e locked_on[]).

       As we encounter failures in the healing process, we
       will unmark the respective servers in the healed_sinks[]
       array.
    */
    AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);

    source = __afr_selfheal_entry_finalize_source(this, sources, healed_sinks,
                                                  locked_on, replies, witness);

    if (source < 0) {
        /* If source is < 0 (typically split-brain), we perform a
           conservative merge of entries rather than erroring out */
    }
    *source_p = source;

    return ret;
}
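
/* Heal a single name under fd->inode: take entry locks on the parent in the
 * xlator domain, prepare sources/sinks from the parent's replies, look the
 * name up on all locked bricks and dispatch to the heal or merge routine.
 * On success the name's granular index entry, if any, is purged.
 */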

static int
afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
                          char *name, inode_t *parent_idx_inode,
                          xlator_t *subvol, gf_boolean_t full_crawl)
{
    int ret = 0;
    int source = -1;
    unsigned char *locked_on = NULL;
    unsigned char *sources = NULL;
    unsigned char *sinks = NULL;
    unsigned char *healed_sinks = NULL;
    inode_t *inode = NULL;
    struct afr_reply *replies = NULL;
    struct afr_reply *par_replies = NULL;
    afr_private_t *priv = NULL;
    dict_t *xattr = NULL;

    priv = this->private;

    xattr = dict_new();
    if (!xattr)
        return -ENOMEM;
    ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
    if (ret) {
        dict_unref(xattr);
        return -1;
    }

    sources = alloca0(priv->child_count);
    sinks = alloca0(priv->child_count);
    healed_sinks = alloca0(priv->child_count);
    locked_on = alloca0(priv->child_count);

    replies = alloca0(priv->child_count * sizeof(*replies));
    par_replies = alloca0(priv->child_count * sizeof(*par_replies));

    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                               locked_on);
    {
        if (ret < AFR_SH_MIN_PARTICIPANTS) {
            gf_msg_debug(this->name, 0,
                         "%s: Skipping "
                         "entry self-heal as only %d sub-volumes "
                         "could be locked in %s domain",
                         uuid_utoa(fd->inode->gfid), ret, this->name);
            ret = -ENOTCONN;
            goto unlock;
        }

        ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on,
                                           sources, sinks, healed_sinks,
                                           par_replies, &source, NULL);
        if (ret < 0)
            goto unlock;

        inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name,
                                                replies, locked_on, xattr);
        if (!inode) {
            ret = -ENOMEM;
            goto unlock;
        }

        ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source,
                                          sources, healed_sinks, locked_on,
                                          replies);

        if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
            ret = afr_shd_index_purge(subvol, parent_idx_inode, name,
                                      inode->ia_type);
            /* Why is ret force-set to 0? We do not care about
             * index purge failing for full heal as it is quite
             * possible during replace-brick that not all files
             * and directories have their name indices present in
             * entry-changes/.
             */
            ret = 0;
        }
    }

unlock:
    afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on,
                           NULL);
    if (inode)
        inode_unref(inode);
    if (replies)
        afr_replies_wipe(replies, priv->child_count);
    if (par_replies)
        afr_replies_wipe(par_replies, priv->child_count);
    if (xattr)
        dict_unref(xattr);

    return ret;
}

static inode_t *
afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol,
                                  uuid_t pargfid)
{
    int ret = -1;
    void *index_gfid = NULL;
    loc_t rootloc = {
        0,
    };
    loc_t loc = {
        0,
    };
    dict_t *xattr = NULL;
    inode_t *inode = NULL;
    struct iatt iatt = {
        0,
    };

    rootloc.inode = inode_ref(this->itable->root);
    gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);

    ret = syncop_getxattr(subvol, &rootloc, &xattr,
                          GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
    if (ret || !xattr) {
        errno = -ret;
        goto out;
    }

    ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
    if (ret) {
        errno = EINVAL;
        goto out;
    }

    loc.inode = inode_new(this->itable);
    if (!loc.inode) {
        errno = ENOMEM;
        goto out;
    }

    gf_uuid_copy(loc.pargfid, index_gfid);
    loc.name = gf_strdup(uuid_utoa(pargfid));

    ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL);
    if (ret < 0) {
        errno = -ret;
        goto out;
    }

    inode = inode_link(loc.inode, NULL, NULL, &iatt);

out:
    if (xattr)
        dict_unref(xattr);
    loc_wipe(&rootloc);
    GF_FREE((char *)loc.name);
    loc_wipe(&loc);

    return inode;
}

static int
afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
                             int child)
{
    int ret = 0;
    gf_dirent_t entries;
    gf_dirent_t *entry = NULL;
    off_t offset = 0;
    call_frame_t *iter_frame = NULL;
    xlator_t *subvol = NULL;
    afr_private_t *priv = NULL;
    gf_boolean_t mismatch = _gf_false;
    afr_local_t *local = NULL;
    loc_t loc = {
        0,
    };

    priv = this->private;
    subvol = priv->children[child];

    INIT_LIST_HEAD(&entries.list);

    local = frame->local;

    iter_frame = afr_copy_frame(frame);
    if (!iter_frame)
        return -ENOMEM;

    loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
                                                  fd->inode->gfid);

    while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL,
                                 NULL))) {
        if (ret > 0)
            ret = 0;
        list_for_each_entry(entry, &entries.list, list)
        {
            offset = entry->d_off;

            if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
                continue;

            if (__is_root_gfid(fd->inode->gfid) &&
                !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR))
                continue;

            ret = afr_selfheal_entry_dirent(iter_frame, this, fd,
                                            entry->d_name, loc.inode, subvol,
                                            local->need_full_crawl);
            AFR_STACK_RESET(iter_frame);
            if (iter_frame->local == NULL) {
                ret = -ENOTCONN;
                break;
            }

            if (ret == -1) {
                /* gfid or type mismatch. */
                mismatch = _gf_true;
                ret = 0;
            }
            if (ret)
                break;
        }

        gf_dirent_free(&entries);
        if (ret)
            break;
    }

    loc_wipe(&loc);
    AFR_STACK_DESTROY(iter_frame);

    if (mismatch == _gf_true)
        /* undo pending will be skipped */
        ret = -1;

    return ret;
}
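
/* syncop_dir_scan() callback for granular entry self-heal: invoked once for
 * every name index found under indices/entry-changes/<pargfid> on @subvol.
 */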

static int
afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
                                   loc_t *parent, void *data)
{
    int ret = 0;
    loc_t loc = {
        0,
    };
    struct iatt iatt = {
        0,
    };
    afr_granular_esh_args_t *args = data;

    /* Look up the actual inode associated with entry. If the lookup returns
     * ESTALE or ENOENT, then it means we have a stale index. Remove it.
     * This is analogous to the check in afr_shd_index_heal() except that
     * here it is achieved through LOOKUP and in afr_shd_index_heal() through
     * a GETXATTR.
     */
    loc.inode = inode_new(args->xl->itable);
    loc.parent = inode_ref(args->heal_fd->inode);
    gf_uuid_copy(loc.pargfid, loc.parent->gfid);
    loc.name = entry->d_name;

    ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL);
    if ((ret == -ENOENT) || (ret == -ESTALE)) {
        /* The name indices under the pgfid index dir are guaranteed
         * to be regular files. Hence the hardcoding.
         */
        afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
        ret = 0;
        goto out;
    }
    /* TBD: afr_shd_zero_xattrop? */

    ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd,
                                    entry->d_name, parent->inode, subvol,
                                    _gf_false);
    AFR_STACK_RESET(args->frame);
    if (args->frame->local == NULL)
        ret = -ENOTCONN;

    if (ret == -1)
        args->mismatch = _gf_true;

out:
    loc_wipe(&loc);
    return 0;
}

static int
afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
                            int subvol_idx, gf_boolean_t is_src)
{
    int ret = 0;
    loc_t loc = {
        0,
    };
    xlator_t *subvol = NULL;
    afr_private_t *priv = NULL;
    afr_granular_esh_args_t args = {
        0,
    };

    priv = this->private;
    subvol = priv->children[subvol_idx];

    args.frame = afr_copy_frame(frame);
    args.xl = this;
    /* args.heal_fd represents the fd associated with the original directory
     * on which entry heal is being attempted.
     */
    args.heal_fd = fd;

    /* @subvol here represents the subvolume of AFR where
     * indices/entry-changes/ will be processed
     */
    loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
                                                  fd->inode->gfid);
    if (!loc.inode) {
        /* If granular heal failed on the sink (as it might sometimes
         * because it is the src that would mostly contain the granular
         * changelogs and the sink's entry-changes would be empty),
         * do not treat heal as failure.
         */
        if (is_src)
            return -errno;
        else
            return 0;
    }

    ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
                          afr_selfheal_entry_granular_dirent);

    loc_wipe(&loc);

    if (args.mismatch == _gf_true)
        ret = -1;

    return ret;
}
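
/* Crawl and heal the contents of the directory: first expunge/re-create
 * entries on each healed sink (either granularly via its entry-changes index
 * or through a full readdir of the sink when a full crawl is needed), and
 * then impunge missing entries by crawling the source the same way.
 */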

static int
afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd,
                      int source, unsigned char *sources,
                      unsigned char *healed_sinks)
{
    int i = 0;
    int ret = 0;
    gf_boolean_t mismatch = _gf_false;
    afr_local_t *local = NULL;
    afr_private_t *priv = NULL;

    priv = this->private;
    local = frame->local;

    gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
           "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid));

    for (i = 0; i < priv->child_count; i++) {
        /* Expunge */
        if (!healed_sinks[i])
            continue;

        if (!local->need_full_crawl)
            /* Why call afr_selfheal_entry_granular() on a "healed sink",
             * given that it is the source that contains the granular
             * indices?
             * If the index for this directory is non-existent or empty on
             * this subvol (=> clear sink), then it will return early
             * without failure status.
             * If the index is non-empty and it is yet a 'healed sink', then
             * it is due to a split-brain in which case we anyway need to
             * crawl the indices/entry-changes/pargfid directory.
             */
            ret = afr_selfheal_entry_granular(frame, this, fd, i, _gf_false);
        else
            ret = afr_selfheal_entry_do_subvol(frame, this, fd, i);

        if (ret == -1) {
            /* gfid or type mismatch. */
            mismatch = _gf_true;
            ret = 0;
        }
        if (ret)
            break;
    }

    if (!ret && source != -1) {
        /* Impunge */
        if (local->need_full_crawl)
            ret = afr_selfheal_entry_do_subvol(frame, this, fd, source);
        else
            ret = afr_selfheal_entry_granular(frame, this, fd, source,
                                              _gf_true);
    }

    if (mismatch == _gf_true)
        /* undo pending will be skipped */
        ret = -1;

    return ret;
}
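
/* Core of entry self-heal on an opened directory: take entry locks in the
 * xlator domain, prepare sources/sinks, heal the directory's contents, and
 * then take locks again to undo the pending changelog (post-op) on the
 * healed bricks.
 */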

static int
__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
                     unsigned char *locked_on)
{
    int ret = -1;
    int source = -1;
    unsigned char *sources = NULL;
    unsigned char *sinks = NULL;
    unsigned char *data_lock = NULL;
    unsigned char *postop_lock = NULL;
    unsigned char *healed_sinks = NULL;
    unsigned char *undid_pending = NULL;
    struct afr_reply *locked_replies = NULL;
    afr_local_t *local = NULL;
    afr_private_t *priv = NULL;
    gf_boolean_t did_sh = _gf_true;

    priv = this->private;
    local = frame->local;

    sources = alloca0(priv->child_count);
    sinks = alloca0(priv->child_count);
    healed_sinks = alloca0(priv->child_count);
    undid_pending = alloca0(priv->child_count);
    data_lock = alloca0(priv->child_count);
    postop_lock = alloca0(priv->child_count);

    locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);

    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                               data_lock);
    {
        if (ret < AFR_SH_MIN_PARTICIPANTS) {
            gf_msg_debug(this->name, 0,
                         "%s: Skipping "
                         "entry self-heal as only %d sub-volumes could "
                         "be locked in %s domain",
                         uuid_utoa(fd->inode->gfid), ret, this->name);
            ret = -ENOTCONN;
            goto unlock;
        }

        ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock,
                                           sources, sinks, healed_sinks,
                                           locked_replies, &source, NULL);
        if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
            did_sh = _gf_false;
            goto unlock;
        }

        local->need_full_crawl = afr_need_full_heal(this, locked_replies,
                                                    source, healed_sinks,
                                                    AFR_ENTRY_TRANSACTION);
    }
unlock:
    afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock,
                           NULL);
    if (ret < 0)
        goto out;

    if (!did_sh)
        goto out;

    ret = afr_selfheal_entry_do(frame, this, fd, source, sources,
                                healed_sinks);
    if (ret)
        goto out;

    /* Take entrylks in xlator domain before doing post-op (undo-pending) in
     * entry self-heal. This is to prevent a parallel name self-heal on
     * an entry under @fd->inode from reading pending xattrs while it is
     * being modified by SHD after entry sh below, given that
     * name self-heal takes locks ONLY in xlator domain and is free to read
     * pending changelog in the absence of the following locking.
     */
    ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
                               postop_lock);
    {
        if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) {
            gf_msg_debug(this->name, 0,
                         "%s: Skipping "
                         "post-op after entry self-heal as %d "
                         "sub-volumes, as opposed to %d, "
                         "could be locked in %s domain",
                         uuid_utoa(fd->inode->gfid), ret,
                         AFR_COUNT(data_lock, priv->child_count), this->name);
            ret = -ENOTCONN;
            goto postop_unlock;
        }

        ret = afr_selfheal_undo_pending(
            frame, this, fd->inode, sources, sinks, healed_sinks,
            undid_pending, AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
    }
postop_unlock:
    afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL,
                           postop_lock, NULL);
out:
    if (did_sh)
        afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources,
                         healed_sinks);
    else
        ret = 1;

    if (locked_replies)
        afr_replies_wipe(locked_replies, priv->child_count);
    return ret;
}

static fd_t *
afr_selfheal_data_opendir(xlator_t *this, inode_t *inode)
{
    loc_t loc = {
        0,
    };
    int ret = 0;
    fd_t *fd = NULL;

    fd = fd_create(inode, 0);
    if (!fd)
        return NULL;

    loc.inode = inode_ref(inode);
    gf_uuid_copy(loc.gfid, inode->gfid);

    ret = syncop_opendir(this, &loc, fd, NULL, NULL);
    if (ret) {
        fd_unref(fd);
        fd = NULL;
    } else {
        fd_bind(fd);
    }

    loc_wipe(&loc);
    return fd;
}

int
afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
    afr_private_t *priv = NULL;
    unsigned char *locked_on = NULL;
    fd_t *fd = NULL;
    int ret = 0;

    priv = this->private;

    fd = afr_selfheal_data_opendir(this, inode);
    if (!fd)
        return -EIO;

    locked_on = alloca0(priv->child_count);

    ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
                                           NULL, locked_on);
    {
        if (ret < AFR_SH_MIN_PARTICIPANTS) {
            gf_msg_debug(this->name, 0,
                         "%s: Skipping "
                         "entry self-heal as only %d sub-volumes could "
                         "be locked in %s domain",
                         uuid_utoa(fd->inode->gfid), ret, priv->sh_domain);
            /* Either less than two subvols available, or another
               selfheal (from another server) is in progress. Skip
               for now in any case there isn't anything to do.
            */
            ret = -ENOTCONN;
            goto unlock;
        }

        ret = __afr_selfheal_entry(frame, this, fd, locked_on);
    }
unlock:
    afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on,
                           NULL);

    if (fd)
        fd_unref(fd);

    return ret;
}