summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2020-04-13 19:31:51 +0530
committerRavishankar N <ravishankar@redhat.com>2020-10-01 12:03:25 +0000
commit9ecbd69127d373ac000e9e1be00c1829e49e64a4 (patch)
tree66e5276ca1fbcc50913e29fe004a3c6ffbc919ae /xlators
parent4ec05d087e07dc1ae2ada9d36ab2597c175890b4 (diff)
cluster/afr: Heal directory rename without rmdir/mkdir
Problem1: When a directory is renamed while a brick is down entry-heal always did an rm -rf on that directory on the sink on old location and did mkdir and created the directory hierarchy again in the new location. This is inefficient. Problem2: Renamedir heal order may lead to a scenario where directory in the new location could be created before deleting it from old location leading to 2 directories with same gfid in posix. Fix: As part of heal, if oldlocation is healed first and is not present in source-brick always rename it into a hidden directory inside the sink-brick so that when heal is triggered in new-location shd can rename it from this hidden directory to the new-location. If new-location heal is triggered first and it detects that the directory already exists in the brick, then it should skip healing the directory until it appears in the hidden directory. Credits: Ravi for rename-data-loss.t script Fixes: #1211 Change-Id: I0cba2006f35cd03d314d18211ce0bd530e254843 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/cluster/afr/src/afr-common.c46
-rw-r--r--xlators/cluster/afr/src/afr-dir-read.c12
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c182
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c205
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c33
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h5
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c178
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h2
-rw-r--r--xlators/cluster/afr/src/afr.c41
-rw-r--r--xlators/cluster/afr/src/afr.h11
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c39
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c5
12 files changed, 677 insertions, 82 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 4c8fa31..1a4341a 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,6 +45,41 @@ afr_quorum_errno(afr_private_t *priv)
return ENOTCONN;
}
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid)
+{
+ if (!__is_root_gfid(pargfid)) {
+ return _gf_false;
+ }
+
+ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+ /*For backward compatibility /.landfill is private*/
+ return _gf_true;
+ }
+
+ if (pid == GF_CLIENT_PID_GSYNCD) {
+ /*geo-rep needs to create/sync private directory on slave because
+ * it appears in changelog*/
+ return _gf_false;
+ }
+
+ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+ if (strcmp(name, priv->anon_inode_name) == 0) {
+ /* anonymous-inode dir is private*/
+ return _gf_true;
+ }
+ } else {
+ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+ 0) {
+ /* anonymous-inode dir prefix is private for geo-rep to work*/
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
+}
+
void
afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
unsigned char *replies)
@@ -3978,11 +4013,10 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
return 0;
}
- if (__is_root_gfid(loc->parent->gfid)) {
- if (!strcmp(loc->name, GF_REPLICATE_TRASH_DIR)) {
- op_errno = EPERM;
- goto out;
- }
+ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+ frame->root->pid)) {
+ op_errno = EPERM;
+ goto out;
}
local = AFR_FRAME_INIT(frame, op_errno);
@@ -5660,6 +5694,7 @@ afr_priv_dump(xlator_t *this)
priv->background_self_heal_count);
gf_proc_dump_write("healers", "%d", priv->healers);
gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
if (priv->quorum_count == AFR_QUORUM_AUTO) {
gf_proc_dump_write("quorum-type", "auto");
} else if (priv->quorum_count == 0) {
@@ -6653,6 +6688,7 @@ afr_priv_destroy(afr_private_t *priv)
GF_FREE(priv->local);
GF_FREE(priv->pending_key);
GF_FREE(priv->children);
+ GF_FREE(priv->anon_inode);
GF_FREE(priv->child_up);
GF_FREE(priv->halo_child_up);
GF_FREE(priv->child_latency);
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index f69013f..f8bf834 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -164,8 +164,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
}
static void
-afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
- gf_dirent_t *entries, fd_t *fd)
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+ int subvol, gf_dirent_t *entries, fd_t *fd)
{
int ret = -1;
gf_dirent_t *entry = NULL;
@@ -183,8 +183,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol,
list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
{
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+ frame->root->pid)) {
continue;
}
@@ -228,8 +228,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
if (op_ret >= 0)
- afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries,
- local->fd);
+ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+ &entries, local->fd);
AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index f35c41d..a580a15 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -2750,3 +2750,185 @@ afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
out:
return source;
}
+
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
+ }
+ }
+
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
+
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
+ }
+ goto out;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index ac31751..64893f4 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -16,54 +16,170 @@
#include <glusterfs/syncop-utils.h>
#include <glusterfs/events.h>
-static int
-afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
- inode_t *inode, int child, struct afr_reply *replies)
+int
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child,
+ struct afr_reply *replies,
+ gf_boolean_t *anon_inode)
{
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
xlator_t *subvol = NULL;
int ret = 0;
+ int i = 0;
+ char g[64] = {0};
+ unsigned char *lookup_success = NULL;
+ call_frame_t *frame = NULL;
+ loc_t loc2 = {
+ 0,
+ };
loc_t loc = {
0,
};
- char g[64];
priv = this->private;
-
subvol = priv->children[child];
+ lookup_success = alloca0(priv->child_count);
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (replies[child].poststat.ia_type == IA_IFDIR) {
+ /* This directory may have sub-directory hierarchy which may need to
+ * be preserved for subsequent heals. So unconditionally move the
+ * directory to anonymous-inode directory*/
+ *anon_inode = _gf_true;
+ goto anon_inode;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ lookup_success[i] = 1;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ ret = -local->replies[i].op_errno;
+ }
+ }
+
+ if (priv->quorum_count) {
+ if (afr_has_quorum(lookup_success, this, NULL)) {
+ *anon_inode = _gf_true;
+ }
+ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+ *anon_inode = _gf_true;
+ } else if (ret) {
+ goto out;
+ }
+
+anon_inode:
+ if (!*anon_inode) {
+ ret = 0;
+ goto out;
+ }
loc.parent = inode_ref(dir);
gf_uuid_copy(loc.pargfid, dir->gfid);
loc.name = name;
- loc.inode = inode_ref(inode);
- if (replies[child].valid && replies[child].op_ret == 0) {
- switch (replies[child].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
- name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_unlink(subvol, &loc, NULL, NULL);
- break;
- }
+ ret = afr_anon_inode_create(this, child, &loc2.parent);
+ if (ret < 0)
+ goto out;
+
+ loc2.name = g;
+ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s failed",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s successful",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
}
+out:
loc_wipe(&loc);
+ loc_wipe(&loc2);
+ if (frame) {
+ AFR_STACK_DESTROY(frame);
+ }
return ret;
}
int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies)
+{
+ char g[64] = {0};
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ gf_boolean_t anon_inode = _gf_false;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ if ((!replies[child].valid) || (replies[child].op_ret < 0)) {
+ /*Nothing to do*/
+ ret = 0;
+ goto out;
+ }
+
+ if (priv->use_anon_inode) {
+ ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+ replies, &anon_inode);
+ if (ret < 0 || anon_inode)
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ break;
+ default:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
+ name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+ break;
+ }
+
+out:
+ loc_wipe(&loc);
+ return ret;
+}
+
+int
afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
unsigned char *sources, inode_t *dir,
const char *name, inode_t *inode,
@@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
loc_t srcloc = {
0,
};
+ loc_t anonloc = {
+ 0,
+ };
xlator_t *this = frame->this;
afr_private_t *priv = NULL;
dict_t *xdata = NULL;
@@ -86,15 +205,17 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
0,
};
unsigned char *newentry = NULL;
- char dir_uuid_str[64] = {0}, iatt_uuid_str[64] = {0};
+ char iatt_uuid_str[64] = {0};
+ char dir_uuid_str[64] = {0};
priv = this->private;
iatt = &replies[source].poststat;
+ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
"Invalid ia_type (%d) or gfid(%s). source brick=%d, "
"pargfid=%s, name=%s",
- iatt->ia_type, uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str), source,
+ iatt->ia_type, iatt_uuid_str, source,
uuid_utoa_r(dir->gfid, dir_uuid_str), name);
ret = -EINVAL;
goto out;
@@ -120,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
srcloc.inode = inode_ref(inode);
gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
- if (iatt->ia_type != IA_IFDIR)
- ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
- if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) {
+ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == -ENOENT || ret == -ESTALE) {
newentry[dst] = 1;
ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
sources, newentry);
if (ret)
goto out;
+ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+ // Try rename from hidden directory
+ ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+ if (ret < 0)
+ goto out;
+ anonloc.inode = inode_ref(inode);
+ anonloc.name = iatt_uuid_str;
+ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = -1; /*This sets 'mismatch' to true*/
+ goto out;
}
mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
@@ -166,6 +297,7 @@ out:
GF_FREE(linkname);
loc_wipe(&loc);
loc_wipe(&srcloc);
+ loc_wipe(&anonloc);
return ret;
}
@@ -578,6 +710,11 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (afr_is_private_directory(priv, fd->inode->gfid, name,
+ GF_CLIENT_PID_SELF_HEALD)) {
+ return 0;
+ }
+
xattr = dict_new();
if (!xattr)
return -ENOMEM;
@@ -626,7 +763,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
replies);
if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
- ret = afr_shd_index_purge(subvol, parent_idx_inode, name,
+ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
inode->ia_type);
/* Why is ret force-set to 0? We do not care about
* index purge failing for full heal as it is quite
@@ -756,10 +893,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
continue;
- if (__is_root_gfid(fd->inode->gfid) &&
- !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR))
- continue;
-
ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
loc.inode, subvol,
local->need_full_crawl);
@@ -822,7 +955,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
/* The name indices under the pgfid index dir are guaranteed
* to be regular files. Hence the hardcoding.
*/
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
ret = 0;
goto out;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index dd40c57..834aac8 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
const char *bname, inode_t *inode,
struct afr_reply *replies)
{
- loc_t loc = {
- 0,
- };
int i = 0;
afr_private_t *priv = NULL;
- char g[64];
int ret = 0;
priv = this->private;
- loc.parent = inode_ref(parent);
- gf_uuid_copy(loc.pargfid, pargfid);
- loc.name = bname;
- loc.inode = inode_ref(inode);
-
for (i = 0; i < priv->child_count; i++) {
if (!replies[i].valid)
continue;
@@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
if (replies[i].op_ret)
continue;
- switch (replies[i].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_rmdir(priv->children[i], &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg(this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid),
- bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL);
- break;
- }
+ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+ replies);
}
- loc_wipe(&loc);
-
return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 7a038fa..48e6dbc 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -369,4 +369,9 @@ gf_boolean_t
afr_is_file_empty_on_all_children(afr_private_t *priv,
struct afr_reply *replies);
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies);
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index f2e0890..109fd4b 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -222,7 +222,7 @@ out:
}
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type)
{
int ret = 0;
@@ -424,7 +424,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
ret = afr_shd_selfheal(healer, healer->subvol, gfid);
if (ret == -ENOENT || ret == -ESTALE)
- afr_shd_index_purge(subvol, parent->inode, entry->d_name, val);
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
if (ret == 2)
/* If bricks crashed in pre-op after creating indices/xattrop
@@ -843,6 +843,176 @@ out:
return need_heal;
}
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = healer->this->private;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int count = 0;
+ int i = 0;
+ int op_errno = 0;
+ struct iatt *iatt = NULL;
+ gf_boolean_t multiple_links = _gf_false;
+ unsigned char *gfid_present = alloca0(priv->child_count);
+ unsigned char *entry_present = alloca0(priv->child_count);
+ char *type = "file";
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ gf_msg_debug(healer->this->name, 0,
+ "Not all bricks are up. Skipping "
+ "cleanup of %s on %s",
+ entry->d_name, subvol->name);
+ ret = 0;
+ goto out;
+ }
+
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = gf_uuid_parse(entry->d_name, loc.gfid);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ gfid_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ if (iatt->ia_type == IA_IFDIR) {
+ type = "dir";
+ }
+
+ if (i == healer->subvol) {
+ if (local->replies[i].poststat.ia_nlink > 1) {
+ multiple_links = _gf_true;
+ }
+ }
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /*Inode is deleted from subvol*/
+ if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+ priv->anon_inode_name, entry->d_name, subvol->name);
+ ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+ iatt->ia_type);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = 0;
+ } else if (count > 1) {
+ loc_wipe(&loc);
+ loc.parent = inode_ref(parent->inode);
+ loc.name = entry->d_name;
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+ &loc, NULL);
+ count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ entry_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (gfid_present[i] && !entry_present[i]) {
+ /*Entry is not anonymous on at least one subvol*/
+ gf_msg_debug(healer->this->name, 0,
+ "Valid entry present on %s "
+ "Skipping cleanup of %s on %s",
+ priv->children[i]->name, entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
+ }
+ }
+
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+ entry->d_name);
+ ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+ entry->d_name, iatt->ia_type);
+ if (op_errno != ENOENT && op_errno != ESTALE) {
+ ret |= -op_errno;
+ }
+ }
+ }
+
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return ret;
+}
+
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_private_t *priv = healer->this->private;
+ loc_t loc = {0};
+
+ ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode);
+ if (ret)
+ goto out;
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer,
+ afr_shd_anon_inode_cleaner, NULL,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return;
+}
+
void *
afr_shd_index_healer(void *data)
{
@@ -900,6 +1070,10 @@ afr_shd_index_healer(void *data)
sleep(1);
} while (ret > 0);
+ if (ret == 0) {
+ afr_cleanup_anon_inode_dir(healer);
+ }
+
if (ret == 0 && pre_crawl_xdata &&
!healer->crawl_event.heal_failed_count) {
afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 687c28e..18db728 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -70,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
char **path_p);
int
-afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name,
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
ia_type_t type);
#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 67e0a4d..df7366f 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -135,6 +135,27 @@ set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
}
}
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
+{
+ char *volfile_id_str = NULL;
+ uuid_t anon_inode_gfid = {0};
+
+ /*If volume id is not present don't enable anything*/
+ if (dict_get_str(options, "volume-id", &volfile_id_str))
+ return;
+ GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX);
+ /*anon_inode_name is not supposed to change once assigned*/
+ if (!priv->anon_inode_name[0]) {
+ snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+ AFR_ANON_DIR_PREFIX, volfile_id_str);
+ gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+ /*Flip a bit to make sure volfile-id and anon-gfid are not same*/
+ anon_inode_gfid[0] ^= 1;
+ uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+ }
+}
+
int
reconfigure(xlator_t *this, dict_t *options)
{
@@ -290,6 +311,10 @@ reconfigure(xlator_t *this, dict_t *options)
consistent_io = _gf_false;
priv->consistent_io = consistent_io;
+ afr_handle_anon_inode_options(priv, options);
+
+ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+ out);
if (priv->shd.enabled) {
if ((priv->shd.enabled != enabled_old) ||
(timeout_old != priv->shd.timeout))
@@ -541,7 +566,9 @@ init(xlator_t *this)
GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+ afr_handle_anon_inode_options(priv, this->options);
+ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
if (priv->quorum_count != 0)
priv->consistent_io = _gf_false;
@@ -553,6 +580,9 @@ init(xlator_t *this)
goto out;
}
+ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
+
priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
gf_afr_mt_char);
@@ -561,7 +591,8 @@ init(xlator_t *this)
priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
gf_afr_mt_char);
- if (!priv->child_up || !priv->child_latency || !priv->halo_child_up) {
+ if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+ !priv->anon_inode) {
ret = -ENOMEM;
goto out;
}
@@ -1286,6 +1317,14 @@ struct volume_options options[] = {
.tags = {"replicate"},
.description = "This option exists only for backward compatibility "
"and configuring it doesn't have any effect"},
+ {.key = {"use-anonymous-inode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "Setting this option heals directory renames efficiently"},
+
{.key = {NULL}},
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 2e60708..d62f9a9 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -42,6 +42,7 @@
#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
#define PFLAG_PENDING (1 << 0)
#define PFLAG_SBRAIN (1 << 1)
@@ -190,6 +191,7 @@ typedef struct _afr_private {
struct list_head ta_waitq;
struct list_head ta_onwireq;
+ unsigned char *anon_inode;
unsigned char *child_up;
unsigned char *halo_child_up;
int64_t *child_latency;
@@ -275,10 +277,15 @@ typedef struct _afr_private {
gf_boolean_t esh_granular;
gf_boolean_t consistent_io;
gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t use_anon_inode;
/*For lock healing.*/
struct list_head saved_locks;
struct list_head lk_healq;
+
+ /*For anon-inode handling */
+ char anon_inode_name[NAME_MAX + 1];
+ char anon_gfid_str[UUID_SIZE + 1];
} afr_private_t;
typedef enum {
@@ -1409,4 +1416,8 @@ afr_dom_lock_release(call_frame_t *frame);
void
afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
unsigned char *replies);
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid);
#endif /* __AFR_H__ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 4e5712e..8d6fb5e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -3810,6 +3810,38 @@ out:
}
static int
+set_volfile_id_option(volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+ int clusters)
+{
+ xlator_t *xlator = NULL;
+ int i = 0;
+ int ret = -1;
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_VALIDATE_OR_GOTO("glusterd", this, out);
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+ if (conf->op_version < GD_OP_VERSION_9_0)
+ return 0;
+ xlator = first_of(graph);
+
+ for (i = 0; i < clusters; i++) {
+ ret = xlator_set_fixed_option(xlator, "volume-id",
+ uuid_utoa(volinfo->volume_id));
+ if (ret)
+ goto out;
+
+ xlator = xlator->next;
+ }
+
+out:
+ return ret;
+}
+
+static int
volgen_graph_build_afr_clusters(volgen_graph_t *graph,
glusterd_volinfo_t *volinfo)
{
@@ -3851,6 +3883,13 @@ volgen_graph_build_afr_clusters(volgen_graph_t *graph,
clusters = -1;
goto out;
}
+
+ ret = set_volfile_id_option(graph, volinfo, clusters);
+ if (ret) {
+ clusters = -1;
+ goto out;
+ }
+
if (!volinfo->arbiter_count && !volinfo->thin_arbiter_count)
goto out;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index f74876e..398b4d7 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3138,4 +3138,9 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = NO_DOC,
},
+ {.key = "cluster.use-anonymous-inode",
+ .voltype = "cluster/replicate",
+ .op_version = GD_OP_VERSION_9_0,
+ .value = "yes",
+ .flags = VOLOPT_FLAG_CLIENT_OPT},
{.key = NULL}};