summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cli/src/cli-cmd-parser.c91
-rw-r--r--cli/src/cli-cmd-volume.c79
-rw-r--r--cli/src/cli-rpc-ops.c6
-rw-r--r--heal/src/glfs-heal.c416
-rw-r--r--libglusterfs/src/glusterfs.h1
-rw-r--r--rpc/rpc-lib/src/protocol-common.h2
-rw-r--r--tests/basic/afr/split-brain-healing.t183
-rw-r--r--xlators/cluster/afr/src/afr-common.c76
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c5
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c191
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c62
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c34
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h18
-rw-r--r--xlators/cluster/afr/src/afr.h4
-rw-r--r--xlators/cluster/dht/src/dht-common.c12
15 files changed, 1039 insertions, 141 deletions
diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c
index 28888ba656d..53b14d27708 100644
--- a/cli/src/cli-cmd-parser.c
+++ b/cli/src/cli-cmd-parser.c
@@ -2929,6 +2929,43 @@ out:
return ret;
}
+/*
+ * Split @token with extract_hostname_path_from_token() and store the two
+ * resulting strings in @dict under the key pair that belongs to @heal_op.
+ *
+ * Returns 0 on success; non-zero if the token cannot be split, if no key
+ * pair exists for @heal_op, or if a dict insertion fails.
+ */
+static int
+set_hostname_path_in_dict (const char *token, dict_t *dict, int heal_op)
+{
+        char *host_key = NULL;
+        char *path_key = NULL;
+        char *host     = NULL;
+        char *brick    = NULL;
+        int   rc       = 0;
+
+        rc = extract_hostname_path_from_token (token, &host, &brick);
+        if (rc)
+                return rc;
+
+        if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) {
+                host_key = "heal-source-hostname";
+                path_key = "heal-source-brickpath";
+        } else if (heal_op == GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA) {
+                host_key = "per-replica-cmd-hostname";
+                path_key = "per-replica-cmd-path";
+        } else {
+                /* No key pair is defined for any other operation. */
+                return -1;
+        }
+
+        rc = dict_set_dynstr (dict, host_key, host);
+        if (rc)
+                return rc;
+
+        return dict_set_dynstr (dict, path_key, brick);
+}
int
cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
@@ -2936,8 +2973,6 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
{
int ret = 0;
dict_t *dict = NULL;
- char *hostname = NULL;
- char *path = NULL;
dict = dict_new ();
if (!dict)
@@ -3008,6 +3043,35 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
ret = -1;
goto out;
}
+ if (wordcount == 6) {
+ if (strcmp (words[3], "split-brain")) {
+ ret = -1;
+ goto out;
+ }
+ if (!strcmp (words[4], "bigger-file")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE);
+ if (ret)
+ goto out;
+ ret = dict_set_str (dict, "file", (char *)words[5]);
+ if (ret)
+ goto out;
+ goto done;
+ }
+ if (!strcmp (words[4], "source-brick")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK);
+ if (ret)
+ goto out;
+ ret = set_hostname_path_in_dict (words[5], dict,
+ GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK);
+ if (ret)
+ goto out;
+ goto done;
+ }
+ ret = -1;
+ goto out;
+ }
if (wordcount == 7) {
if (!strcmp (words[3], "statistics")
&& !strcmp (words[4], "heal-count")
@@ -3017,21 +3081,26 @@ cli_cmd_volume_heal_options_parse (const char **words, int wordcount,
GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA);
if (ret)
goto out;
- ret = extract_hostname_path_from_token (words[6],
- &hostname, &path);
+ ret = set_hostname_path_in_dict (words[6], dict,
+ GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA);
if (ret)
goto out;
- ret = dict_set_dynstr (dict, "per-replica-cmd-hostname",
- hostname);
+ goto done;
+
+ }
+ if (!strcmp (words[3], "split-brain") &&
+ !strcmp (words[4], "source-brick")) {
+ ret = dict_set_int32 (dict, "heal-op",
+ GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK);
+ ret = set_hostname_path_in_dict (words[5], dict,
+ GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK);
if (ret)
goto out;
- ret = dict_set_dynstr (dict, "per-replica-cmd-path",
- path);
+ ret = dict_set_str (dict, "file",
+ (char *) words[6]);
if (ret)
goto out;
- else
- goto done;
-
+ goto done;
}
}
ret = -1;
diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c
index 238c8673d75..501b5776dec 100644
--- a/cli/src/cli-cmd-volume.c
+++ b/cli/src/cli-cmd-volume.c
@@ -1879,6 +1879,60 @@ cli_print_brick_status (cli_volume_status_t *status)
return 0;
}
+/* True when @op is one of the heal operations handed to cli_launch_glfs_heal().
+ * The argument is parenthesized so that an expression may be passed safely;
+ * note that it is still evaluated up to three times. */
+#define NEEDS_GLFS_HEAL(op) (((op) == GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE) || \
+                             ((op) == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) ||       \
+                             ((op) == GF_AFR_OP_INDEX_SUMMARY))
+
+/*
+ * Run SBIN_DIR/glfsheal for @heal_op and copy its standard output to ours.
+ *
+ * The command line is built from @options: "volname" is always required,
+ * "file" is required for bigger-file, and "heal-source-hostname" plus
+ * "heal-source-brickpath" are required for source-brick (where "file" is
+ * optional).
+ *
+ * Returns -1 when a required option is missing, when @heal_op is not one
+ * of the operations handled here, or when the child cannot be started;
+ * otherwise WEXITSTATUS() of the value returned by runner_end().
+ */
+int
+cli_launch_glfs_heal (int heal_op, dict_t *options)
+{
+        char      buff[PATH_MAX] = {0};
+        runner_t  runner         = {0};
+        char     *filename       = NULL;
+        char     *hostname       = NULL;
+        char     *path           = NULL;
+        char     *volname        = NULL;
+        char     *out            = NULL;
+        int       ret            = -1;
+
+        /* Validate every option before touching the runner, so that an
+         * incomplete request never launches glfsheal with NULL arguments. */
+        if (dict_get_str (options, "volname", &volname))
+                goto out;
+
+        switch (heal_op) {
+        case GF_AFR_OP_INDEX_SUMMARY:
+                break;
+        case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+                if (dict_get_str (options, "file", &filename))
+                        goto out;
+                break;
+        case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK:
+                if (dict_get_str (options, "heal-source-hostname",
+                                  &hostname) ||
+                    dict_get_str (options, "heal-source-brickpath",
+                                  &path))
+                        goto out;
+                /* <FILE> is optional for source-brick. */
+                if (dict_get_str (options, "file", &filename))
+                        filename = NULL;
+                break;
+        default:
+                /* Unknown operation: do not launch anything. */
+                goto out;
+        }
+
+        runinit (&runner);
+        runner_add_args (&runner, SBIN_DIR"/glfsheal", volname, NULL);
+        runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+
+        if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE) {
+                runner_add_args (&runner, "bigger-file", filename, NULL);
+        } else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) {
+                runner_add_args (&runner, "source-brick", NULL);
+                runner_argprintf (&runner, "%s:%s", hostname, path);
+                /* The file name comes from the command line; it must be
+                 * passed as data, never as the format string itself. */
+                if (filename)
+                        runner_argprintf (&runner, "%s", filename);
+        }
+
+        ret = runner_start (&runner);
+        if (ret == -1)
+                goto out;
+        while ((out = fgets (buff, sizeof(buff),
+                             runner_chio (&runner, STDOUT_FILENO)))) {
+                printf ("%s", out);
+        }
+        ret = runner_end (&runner);
+        ret = WEXITSTATUS (ret);
+
+out:
+        return ret;
+}
int
cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word,
const char **words, int wordcount)
@@ -1892,9 +1946,6 @@ cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word,
xlator_t *this = NULL;
cli_local_t *local = NULL;
int heal_op = 0;
- runner_t runner = {0};
- char buff[PATH_MAX] = {0};
- char *out = NULL;
this = THIS;
frame = create_frame (this, this->ctx->pool);
@@ -1916,21 +1967,10 @@ cli_cmd_volume_heal_cbk (struct cli_state *state, struct cli_cmd_word *word,
ret = dict_get_int32 (options, "heal-op", &heal_op);
if (ret < 0)
goto out;
-
- if (heal_op == GF_AFR_OP_INDEX_SUMMARY) {
- runinit (&runner);
- runner_add_args (&runner, SBIN_DIR"/glfsheal", words[2], NULL);
- runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
- ret = runner_start (&runner);
+ if (NEEDS_GLFS_HEAL (heal_op)) {
+ ret = cli_launch_glfs_heal (heal_op, options);
if (ret == -1)
goto out;
- while ((out = fgets(buff, sizeof(buff),
- runner_chio (&runner, STDOUT_FILENO)))) {
- printf ("%s", out);
- }
-
- ret = runner_end (&runner);
- ret = WEXITSTATUS (ret);
}
else {
proc = &cli_rpc_prog->proctable[GLUSTER_CLI_HEAL_VOLUME];
@@ -1946,7 +1986,7 @@ out:
if (ret) {
cli_cmd_sent_status_get (&sent);
if ((sent == 0) && (parse_error == 0))
- cli_out ("Volume heal failed");
+ cli_out ("Volume heal failed.");
}
CLI_STACK_DESTROY (frame);
@@ -2316,7 +2356,10 @@ struct cli_cmd volume_cmds[] = {
cli_cmd_volume_status_cbk,
"display status of all or specified volume(s)/brick"},
- { "volume heal <VOLNAME> [{full | statistics {heal-count {replica <hostname:brickname>}} |info {healed | heal-failed | split-brain}}]",
+ { "volume heal <VOLNAME> [full | statistics [heal-count "\
+ "[replica <HOSTNAME:BRICKNAME>]] |info [healed | heal-failed | "\
+ "split-brain]| split-brain {bigger-file <FILE> |source-brick "\
+ "<HOSTNAME:BRICKNAME> [<FILE>]}]",
cli_cmd_volume_heal_cbk,
"self-heal commands on volume specified by <VOLNAME>"},
diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c
index 1d8cf23ff42..72ffaf4129a 100644
--- a/cli/src/cli-rpc-ops.c
+++ b/cli/src/cli-rpc-ops.c
@@ -7358,6 +7358,12 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
heal_op_str = "count of entries to be healed per replica";
break;
+ /* The below 2 cases are never hit; they're coded only to make
+ * compiler warnings go away.*/
+ case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK:
+ break;
+
case GF_AFR_OP_INVALID:
heal_op_str = "invalid heal op";
break;
diff --git a/heal/src/glfs-heal.c b/heal/src/glfs-heal.c
index a9baad3ac56..f49f3a58afc 100644
--- a/heal/src/glfs-heal.c
+++ b/heal/src/glfs-heal.c
@@ -14,11 +14,17 @@
#include "glfs.h"
#include "glfs-handles.h"
#include "glfs-internal.h"
+#include "protocol-common.h"
#include "syncop.h"
#include <string.h>
#include <time.h>
#define DEFAULT_HEAL_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
+#define USAGE_STR "Usage: %s <VOLNAME> [bigger-file <FILE> | "\
+ "source-brick <HOSTNAME:BRICKNAME> [<FILE>]]\n"
+
+int glfsh_heal_splitbrain_file (glfs_t *fs, xlator_t *top_subvol,
+ loc_t *rootloc, char *file, dict_t *xattr_req);
int
glfsh_link_inode_update_loc (loc_t *loc, struct iatt *iattr)
@@ -83,6 +89,37 @@ out:
return ret;
}
+/*
+ * Create an fd for the directory described by @loc and open it through @xl.
+ *
+ * If the opendir fails, the fd is released; on GF_LINUX_HOST_OS builds an
+ * anonymous fd on the same inode is used instead and the call succeeds.
+ *
+ * Returns 0 with *fd holding a referenced fd on success. On failure a
+ * non-zero value is returned and *fd is NULL.
+ */
+int
+glfsh_get_index_dir_fd (xlator_t *xl, loc_t *loc, fd_t **fd)
+{
+        int ret = -1;
+
+        *fd = fd_create (loc->inode, GF_CLIENT_PID_GLFS_HEAL);
+        if (!*fd) {
+                printf ("fd_create failed: %s", strerror(errno));
+                goto out;
+        }
+        ret = syncop_opendir (xl, loc, *fd);
+        if (ret) {
+                fd_unref(*fd);
+                /* Never hand a released fd back to the caller. */
+                *fd = NULL;
+#ifdef GF_LINUX_HOST_OS /* See comment in afr_shd_index_opendir() */
+                *fd = fd_anonymous (loc->inode);
+                if (!*fd) {
+                        printf ("fd_anonymous failed: %s",
+                                strerror(errno));
+                        goto out;
+                }
+                ret = 0;
+#else
+                /* The syncop result carries the negated errno (the other
+                 * syncop callers in this file derive op_errno as -ret). */
+                printf ("opendir failed: %s", strerror(-ret));
+                goto out;
+#endif
+        }
+
+out:
+        return ret;
+}
+
static xlator_t*
_get_afr_ancestor (xlator_t *xl)
{
@@ -185,6 +222,33 @@ glfsh_print_heal_status (dict_t *dict, char *path, uuid_t gfid,
}
static int
+glfsh_heal_entries (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc,
+                    gf_dirent_t *entries, uint64_t *offset,
+                    uint64_t *num_entries, dict_t *xattr_req)
+{
+        /*
+         * Attempt a split-brain heal on every name in @entries (skipping
+         * "." and ".."), addressing each one as "gfid:<name>".
+         * *offset tracks the d_off of the last entry visited and
+         * *num_entries is bumped once per successful heal. The value
+         * returned is the result of the last heal attempted (0 if none).
+         */
+        gf_dirent_t *cur           = NULL;
+        gf_dirent_t *nxt           = NULL;
+        char         gfid_name[64] = {0};
+        int          rc            = 0;
+
+        list_for_each_entry_safe (cur, nxt, &entries->list, list) {
+                *offset = cur->d_off;
+
+                if (!strcmp (cur->d_name, ".") || !strcmp (cur->d_name, ".."))
+                        continue;
+
+                snprintf (gfid_name, sizeof(gfid_name), "gfid:%s",
+                          cur->d_name);
+                rc = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc,
+                                                 gfid_name, xattr_req);
+                if (!rc)
+                        (*num_entries)++;
+        }
+
+        return rc;
+}
+
+static int
glfsh_process_entries (xlator_t *xl, fd_t *fd, gf_dirent_t *entries,
uint64_t *offset, uint64_t *num_entries)
{
@@ -240,15 +304,21 @@ glfsh_process_entries (xlator_t *xl, fd_t *fd, gf_dirent_t *entries,
}
static int
-glfsh_crawl_directory (xlator_t *readdir_xl, fd_t *fd, loc_t *loc)
+glfsh_crawl_directory (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc,
+ xlator_t *readdir_xl, fd_t *fd, loc_t *loc,
+ dict_t *xattr_req)
{
uint64_t offset = 0;
gf_dirent_t entries;
int ret = 0;
gf_boolean_t free_entries = _gf_false;
uint64_t num_entries = 0;
+ int heal_op = -1;
INIT_LIST_HEAD (&entries.list);
+ ret = dict_get_int32 (xattr_req, "heal-op", &heal_op);
+ if (ret)
+ return ret;
while (1) {
ret = syncop_readdir (readdir_xl, fd, 131072, offset, &entries);
@@ -260,11 +330,16 @@ glfsh_crawl_directory (xlator_t *readdir_xl, fd_t *fd, loc_t *loc)
if (list_empty (&entries.list))
goto out;
- ret = glfsh_process_entries (readdir_xl, fd, &entries, &offset,
- &num_entries);
- if (ret < 0)
- goto out;
-
+ if (heal_op == GF_AFR_OP_INDEX_SUMMARY) {
+ ret = glfsh_process_entries (readdir_xl, fd, &entries,
+ &offset, &num_entries);
+ if (ret < 0)
+ goto out;
+ } else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) {
+ ret = glfsh_heal_entries (fs, top_subvol, rootloc,
+ &entries, &offset,
+ &num_entries, xattr_req);
+ }
gf_dirent_free (&entries);
free_entries = _gf_false;
}
@@ -275,9 +350,12 @@ out:
if (ret < 0) {
printf ("Failed to complete gathering info. "
"Number of entries so far: %"PRIu64"\n", num_entries);
- }
- else {
- printf ("Number of entries: %"PRIu64"\n", num_entries);
+ } else {
+ if (heal_op == GF_AFR_OP_INDEX_SUMMARY)
+ printf ("Number of entries: %"PRIu64"\n", num_entries);
+ else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK)
+ printf ("Number of healed entries: %"PRIu64"\n",
+ num_entries);
}
return ret;
}
@@ -333,13 +411,22 @@ out:
}
void
-glfsh_print_pending_heals (xlator_t *xl, loc_t *rootloc)
+glfsh_print_pending_heals (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc,
+ xlator_t *xl)
{
int ret = 0;
loc_t dirloc = {0};
fd_t *fd = NULL;
int32_t op_errno = 0;
+ dict_t *xattr_req = NULL;
+ xattr_req = dict_new();
+ if (!xattr_req)
+ goto out;
+
+ ret = dict_set_int32 (xattr_req, "heal-op", GF_AFR_OP_INDEX_SUMMARY);
+ if (ret)
+ goto out;
ret = glfsh_print_brick (xl, rootloc);
if (ret < 0) {
glfsh_print_brick_from_xl (xl);
@@ -356,30 +443,16 @@ glfsh_print_pending_heals (xlator_t *xl, loc_t *rootloc)
goto out;
}
- fd = fd_create (dirloc.inode, GF_CLIENT_PID_GLFS_HEAL);
- if (!fd) {
- printf ("fd_create failed: %s", strerror(errno));
- goto out;
- }
- ret = syncop_opendir (xl, &dirloc, fd);
- if (ret) {
- fd_unref(fd);
-#ifdef GF_LINUX_HOST_OS /* See comment in afr_shd_index_opendir() */
- fd = fd_anonymous (dirloc.inode);
- if (!fd) {
- printf ("fd_anonymous failed: %s",
- strerror(errno));
- goto out;
- }
-#else
- printf ("opendir failed: %s", strerror(errno));
+ ret = glfsh_get_index_dir_fd (xl, &dirloc, &fd);
+ if (ret)
goto out;
-#endif
- }
- ret = glfsh_crawl_directory (xl, fd, &dirloc);
+ ret = glfsh_crawl_directory (fs, top_subvol, rootloc, xl, fd, &dirloc,
+ xattr_req);
if (fd)
fd_unref (fd);
+ if (xattr_req)
+ dict_unref (xattr_req);
if (ret < 0)
printf ("Failed to find entries with pending self-heal\n");
out:
@@ -411,6 +484,209 @@ glfsh_validate_replicate_volume (xlator_t *xl)
return ret;
}
+/*
+ * Find the protocol/client xlator whose "remote-host" and
+ * "remote-subvolume" options equal @hostname and @brickpath.
+ *
+ * The search starts at the tail of the ->next chain beginning at
+ * @top_subvol and walks back through ->prev. Returns the matching xlator,
+ * or NULL if there is no match or a client xlator lacks either option.
+ */
+static xlator_t*
+_brick_path_to_client_xlator (xlator_t *top_subvol, char *hostname,
+                              char *brickpath)
+{
+        xlator_t *cur    = top_subvol;
+        char     *host   = NULL;
+        char     *subvol = NULL;
+
+        while (cur->next)
+                cur = cur->next;
+
+        for (; cur; cur = cur->prev) {
+                if (strcmp (cur->type, "protocol/client") != 0)
+                        continue;
+
+                if (dict_get_str (cur->options, "remote-host", &host) < 0)
+                        return NULL;
+                if (dict_get_str (cur->options, "remote-subvolume",
+                                  &subvol) < 0)
+                        return NULL;
+
+                if (strcmp (hostname, host) == 0 &&
+                    strcmp (brickpath, subvol) == 0)
+                        return cur;
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Print the pending-heal listing for every protocol/client xlator that has
+ * an AFR ancestor.
+ *
+ * The xlator list is walked from the tail of the ->next chain back through
+ * ->prev. While a brick is being processed THIS is switched to its AFR
+ * ancestor and restored afterwards. Always returns 0.
+ */
+int
+glfsh_gather_heal_info (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc)
+{
+        xlator_t *xl       = NULL;
+        xlator_t *afr_xl   = NULL;
+        xlator_t *old_THIS = NULL;
+
+        xl = top_subvol;
+        while (xl->next)
+                xl = xl->next;
+        while (xl) {
+                if (strcmp (xl->type, "protocol/client") == 0) {
+                        afr_xl = _get_afr_ancestor (xl);
+                        /* Everything below depends on having an AFR
+                         * ancestor; without the braces THIS would be set
+                         * to NULL and then "restored" to a stale value. */
+                        if (afr_xl) {
+                                old_THIS = THIS;
+                                THIS = afr_xl;
+                                glfsh_print_pending_heals (fs, top_subvol,
+                                                           rootloc, xl);
+                                THIS = old_THIS;
+                                printf ("\n");
+                        }
+                }
+
+                xl = xl->prev;
+        }
+
+        return 0;
+}
+
+/*
+ * Ask the volume to heal one split-brained file.
+ *
+ * @file is either "gfid:<uuid>" or an absolute path within the volume
+ * (starting with '/'). After the file has been resolved, a getxattr for
+ * GF_AFR_HEAL_SBRAIN carrying @xattr_req is issued on @top_subvol. A
+ * "sh-fail-msg" key in the reply means the heal was refused.
+ *
+ * Progress and errors are printed to stdout. Returns 0 when the heal is
+ * reported as done and non-zero otherwise.
+ */
+int
+glfsh_heal_splitbrain_file (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc,
+                            char *file, dict_t *xattr_req)
+{
+        int          ret         = -1;
+        int          reval       = 0;
+        loc_t        loc         = {0, };
+        struct iatt  iatt        = {0, };
+        xlator_t    *xl          = top_subvol;
+        dict_t      *xattr_rsp   = NULL;
+        char        *sh_fail_msg = NULL;
+        int32_t      op_errno    = 0;
+
+        if (!strncmp (file, "gfid:", 5)) {
+                /* Parse the uuid in place; no scratch copy is needed and
+                 * a malformed or missing uuid is rejected up front. */
+                if (uuid_parse (file + 5, loc.gfid)) {
+                        printf ("%s is not a valid gfid string.\n", file);
+                        ret = -1;
+                        goto out;
+                }
+                loc.path = gf_strdup (uuid_utoa (loc.gfid));
+                if (!loc.path) {
+                        printf ("Error allocating memory for path\n");
+                        ret = -1;
+                        goto out;
+                }
+                loc.inode = inode_new (rootloc->inode->table);
+                if (!loc.inode) {
+                        printf ("Error allocating inode\n");
+                        ret = -1;
+                        goto out;
+                }
+                ret = syncop_lookup (xl, &loc, xattr_req, 0, &xattr_rsp, 0);
+                if (ret) {
+                        op_errno = -ret;
+                        printf ("Lookup failed on %s:%s.\n", file,
+                                strerror(op_errno));
+                        goto out;
+                }
+                /* The lookup reply is not used; drop it so the getxattr
+                 * below does not overwrite (and leak) the reference. */
+                if (xattr_rsp) {
+                        dict_unref (xattr_rsp);
+                        xattr_rsp = NULL;
+                }
+        } else {
+                if (file[0] != '/') {
+                        printf ("<FILE> must be absolute path w.r.t. the "
+                                "volume, starting with '/'\n");
+                        ret = -1;
+                        goto out;
+                }
+retry:
+                ret = glfs_resolve (fs, xl, file, &loc, &iatt, reval);
+                ESTALE_RETRY (ret, errno, reval, &loc, retry);
+                if (ret) {
+                        printf("Lookup failed on %s:%s\n",
+                               file, strerror (errno));
+                        goto out;
+                }
+        }
+
+        ret = syncop_getxattr (xl, &loc, &xattr_rsp, GF_AFR_HEAL_SBRAIN,
+                               xattr_req);
+        if (ret) {
+                op_errno = -ret;
+                printf ("Healing %s failed:%s.\n", file, strerror(op_errno));
+                goto out;
+        }
+        ret = dict_get_str (xattr_rsp, "sh-fail-msg", &sh_fail_msg);
+        if (!ret) {
+                printf ("Healing %s failed: %s.\n", file, sh_fail_msg);
+                ret = -1;
+                goto out;
+        }
+        printf ("Healed %s.\n", file);
+        ret = 0;
+out:
+        if (xattr_rsp)
+                dict_unref (xattr_rsp);
+        /* Release the path and inode held by loc on every exit path. */
+        loc_wipe (&loc);
+        return ret;
+}
+
+/*
+ * Heal split-brained files using the brick @hostname:@brickpath as source.
+ *
+ * With a non-NULL @file only that file is healed. Otherwise the index
+ * directory of the matching client xlator is crawled and a heal is
+ * attempted on each entry found there.
+ *
+ * Returns 0 on success and non-zero on failure (including when no brick
+ * of the volume matches @hostname:@brickpath).
+ */
+int
+glfsh_heal_from_brick (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc,
+                      char *hostname, char *brickpath, char *file)
+{
+        int       ret       = -1;
+        dict_t   *xattr_req = NULL;
+        xlator_t *client    = NULL;
+        fd_t     *fd        = NULL;
+        loc_t     dirloc    = {0};
+        int32_t   op_errno  = 0;
+
+        xattr_req = dict_new();
+        if (!xattr_req)
+                goto out;
+        ret = dict_set_int32 (xattr_req, "heal-op",
+                              GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK);
+        if (ret)
+                goto out;
+        client = _brick_path_to_client_xlator (top_subvol, hostname, brickpath);
+        if (!client) {
+                printf("\"%s:%s\"- No such brick available in the volume.\n",
+                       hostname, brickpath);
+                ret = -1;
+                goto out;
+        }
+        ret = dict_set_str (xattr_req, "child-name", client->name);
+        if (ret)
+                goto out;
+        if (file)
+                ret = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc, file,
+                                                  xattr_req);
+        else {
+                ret = glfsh_get_index_dir_loc (rootloc, client, &dirloc,
+                                               &op_errno);
+                /* Without a resolved dirloc there is no inode to open;
+                 * stop here instead of passing an empty loc along. */
+                if (ret < 0) {
+                        printf ("Failed to locate the index directory of "
+                                "%s:%s.\n", hostname, brickpath);
+                        goto out;
+                }
+                ret = glfsh_get_index_dir_fd (client, &dirloc, &fd);
+                if (ret)
+                        goto out;
+                ret = glfsh_crawl_directory (fs, top_subvol, rootloc, client,
+                                             fd, &dirloc, xattr_req);
+                if (fd)
+                        fd_unref (fd);
+        }
+out:
+        if (xattr_req)
+                dict_unref (xattr_req);
+        loc_wipe (&dirloc);
+        return ret;
+}
+
+/*
+ * Heal @file with the "heal-op" request key set to
+ * GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE.
+ *
+ * Returns the result of glfsh_heal_splitbrain_file(), or non-zero if the
+ * request dict cannot be created or populated.
+ */
+int
+glfsh_heal_from_bigger_file (glfs_t *fs, xlator_t *top_subvol, loc_t *rootloc,
+                             char *file)
+{
+        dict_t *req = dict_new();
+        int     rc  = -1;
+
+        if (!req)
+                return rc;
+
+        rc = dict_set_int32 (req, "heal-op",
+                             GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE);
+        if (!rc)
+                rc = glfsh_heal_splitbrain_file (fs, top_subvol, rootloc,
+                                                 file, req);
+
+        dict_unref (req);
+        return rc;
+}
+
int
main (int argc, char **argv)
{
@@ -418,18 +694,54 @@ main (int argc, char **argv)
int ret = 0;
char *volname = NULL;
xlator_t *top_subvol = NULL;
- xlator_t *xl = NULL;
loc_t rootloc = {0};
char logfilepath[PATH_MAX] = {0};
- xlator_t *old_THIS = NULL;
- xlator_t *afr_xl = NULL;
+ char *hostname = NULL;
+ char *path = NULL;
+ char *file = NULL;
+ gf_xl_afr_op_t heal_op = -1;
- if (argc != 2) {
- printf ("Usage: %s <volname>\n", argv[0]);
+ if (argc < 2) {
+ printf (USAGE_STR, argv[0]);
ret = -1;
goto out;
}
volname = argv[1];
+ switch (argc) {
+ case 2:
+ heal_op = GF_AFR_OP_INDEX_SUMMARY;
+ break;
+ case 4:
+ if (!strcmp (argv[2], "bigger-file")) {
+ heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE;
+ file = argv[3];
+ } else if (!strcmp (argv[2], "source-brick")) {
+ heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK;
+ hostname = strtok (argv[3], ":");
+ path = strtok (NULL, ":");
+ } else {
+ printf (USAGE_STR, argv[0]);
+ ret = -1;
+ goto out;
+ }
+ break;
+ case 5:
+ if (!strcmp (argv[2], "source-brick")) {
+ heal_op = GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK;
+ hostname = strtok (argv[3], ":");
+ path = strtok (NULL, ":");
+ file = argv[4];
+ } else {
+ printf (USAGE_STR, argv[0]);
+ ret = -1;
+ goto out;
+ }
+ break;
+ default:
+ printf (USAGE_STR, argv[0]);
+ ret = -1;
+ goto out;
+ }
fs = glfs_new (volname);
if (!fs) {
@@ -485,30 +797,28 @@ main (int argc, char **argv)
rootloc.inode = inode_ref (top_subvol->itable->root);
glfs_loc_touchup (&rootloc);
- xl = top_subvol;
- while (xl->next)
- xl = xl->next;
-
- while (xl) {
- if (strcmp (xl->type, "protocol/client") == 0) {
- afr_xl = _get_afr_ancestor (xl);
- if (afr_xl) {
- old_THIS = THIS;
- THIS = afr_xl;
- glfsh_print_pending_heals (xl, &rootloc);
- THIS = old_THIS;
- printf("\n");
- }
- }
-
- xl = xl->prev;
+ switch (heal_op) {
+ case GF_AFR_OP_INDEX_SUMMARY:
+ ret = glfsh_gather_heal_info (fs, top_subvol, &rootloc);
+ break;
+ case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ ret = glfsh_heal_from_bigger_file (fs, top_subvol,
+ &rootloc, file);
+ break;
+ case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = glfsh_heal_from_brick (fs, top_subvol, &rootloc,
+ hostname, path, file);
+ break;
+ default:
+ ret = -1;
+ break;
}
loc_wipe (&rootloc);
glfs_subvol_done (fs, top_subvol);
glfs_fini (fs);
- return 0;
+ return ret;
out:
if (fs && top_subvol)
glfs_subvol_done (fs, top_subvol);
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index 4c213f41576..73945e578fe 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -138,6 +138,7 @@
#define GF_XATTROP_INDEX_COUNT "glusterfs.xattrop_index_count"
#define GF_AFR_HEAL_INFO "glusterfs.heal-info"
+#define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain"
#define GF_GFIDLESS_LOOKUP "gfidless-lookup"
/* replace-brick and pump related internal xattrs */
diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h
index 1fd063aec25..f560c103acd 100644
--- a/rpc/rpc-lib/src/protocol-common.h
+++ b/rpc/rpc-lib/src/protocol-common.h
@@ -231,6 +231,8 @@ typedef enum {
GF_AFR_OP_STATISTICS,
GF_AFR_OP_STATISTICS_HEAL_COUNT,
GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA,
+ GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE,
+ GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK,
} gf_xl_afr_op_t ;
struct gf_gsync_detailed_status_ {
diff --git a/tests/basic/afr/split-brain-healing.t b/tests/basic/afr/split-brain-healing.t
new file mode 100644
index 00000000000..1dc317df8dd
--- /dev/null
+++ b/tests/basic/afr/split-brain-healing.t
@@ -0,0 +1,183 @@
+#!/bin/bash
+
+#Test the split-brain resolution CLI commands.
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+function get_replicate_subvol_number {
+ local filename=$1
+ #get_backend_paths
+ if [ -f $B0/${V0}1/$filename ]
+ then
+ echo 0
+ elif [ -f $B0/${V0}3/$filename ]
+ then echo 1
+ else
+ echo -1
+ fi
+}
+
+cleanup;
+
+AREQUAL_PATH=$(dirname $0)/../../utils
+CFLAGS=""
+test "`uname -s`" != "Linux" && {
+ CFLAGS="$CFLAGS -I$(dirname $0)/../../../contrib/argp-standalone ";
+ CFLAGS="$CFLAGS -L$(dirname $0)/../../../contrib/argp-standalone -largp ";
+ CFLAGS="$CFLAGS -lintl";
+}
+build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4}
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+
+cd $M0
+for i in {1..10}
+do
+ echo "Initial content">>file$i
+done
+
+replica_0_files_list=(`ls $B0/${V0}1`)
+replica_1_files_list=(`ls $B0/${V0}3`)
+
+############ Create data split-brain in the files. ###########################
+TEST kill_brick $V0 $H0 $B0/${V0}1
+for file in ${!replica_0_files_list[*]}
+do
+ echo "B1 is down">>${replica_0_files_list[$file]}
+done
+TEST kill_brick $V0 $H0 $B0/${V0}3
+for file in ${!replica_1_files_list[*]}
+do
+ echo "B3 is down">>${replica_1_files_list[$file]}
+done
+
+SMALLER_FILE_SIZE=$(stat -c %s file1)
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+for file in ${!replica_0_files_list[*]}
+do
+ echo "B2 is down">>${replica_0_files_list[$file]}
+ echo "appending more content to make it the bigger file">>${replica_0_files_list[$file]}
+done
+TEST kill_brick $V0 $H0 $B0/${V0}4
+for file in ${!replica_1_files_list[*]}
+do
+ echo "B4 is down">>${replica_1_files_list[$file]}
+ echo "appending more content to make it the bigger file">>${replica_1_files_list[$file]}
+done
+
+BIGGER_FILE_SIZE=$(stat -c %s file1)
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 3
+
+
+############### Accessing the files should now give EIO. ###############################
+TEST ! cat file1
+TEST ! cat file2
+TEST ! cat file3
+TEST ! cat file4
+TEST ! cat file5
+TEST ! cat file6
+TEST ! cat file7
+TEST ! cat file8
+TEST ! cat file9
+TEST ! cat file10
+###################
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3
+
+################ Heal file1 using the bigger-file option ##############
+$CLI volume heal $V0 split-brain bigger-file /file1
+EXPECT "0" echo $?
+EXPECT $BIGGER_FILE_SIZE stat -c %s file1
+
+################ Heal file2 using the bigger-file option and its gfid ##############
+subvolume=$(get_replicate_subvol_number file2)
+if [ $subvolume == 0 ]
+then
+ GFID=$(gf_get_gfid_xattr $B0/${V0}1/file2)
+elif [ $subvolume == 1 ]
+then
+ GFID=$(gf_get_gfid_xattr $B0/${V0}3/file2)
+fi
+GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)"
+$CLI volume heal $V0 split-brain bigger-file $GFIDSTR
+EXPECT "0" echo $?
+
+################ Heal file3 using the source-brick option ##############
+################ Use the brick having smaller file size as source #######
+subvolume=$(get_replicate_subvol_number file3)
+if [ $subvolume == 0 ]
+then
+        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file3
+# The space before ']' is required; "1]" makes the test fail with "missing ]".
+elif [ $subvolume == 1 ]
+then
+        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 /file3
+fi
+EXPECT "0" echo $?
+EXPECT $SMALLER_FILE_SIZE stat -c %s file3
+
+################ Heal file4 using the source-brick option and its gfid ##############
+################ Use the brick having smaller file size as source #######
+subvolume=$(get_replicate_subvol_number file4)
+if [ $subvolume == 0 ]
+then
+        GFID=$(gf_get_gfid_xattr $B0/${V0}1/file4)
+        GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)"
+        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 $GFIDSTR
+# The space before ']' is required; "1]" makes the test fail with "missing ]".
+elif [ $subvolume == 1 ]
+then
+        GFID=$(gf_get_gfid_xattr $B0/${V0}3/file4)
+        GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)"
+        $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 $GFIDSTR
+fi
+EXPECT "0" echo $?
+EXPECT $SMALLER_FILE_SIZE stat -c %s file4
+
+################ Heal remaining SB'ed files of replica_0 using B1 as source ##############
+$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}1
+EXPECT "0" echo $?
+
+################ Heal remaining SB'ed files of replica_1 using B3 as source ##############
+$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}3
+EXPECT "0" echo $?
+
+############### Reading the files should now succeed. ###############################
+TEST cat file1
+TEST cat file2
+TEST cat file3
+TEST cat file4
+TEST cat file5
+TEST cat file6
+TEST cat file7
+TEST cat file8
+TEST cat file9
+TEST cat file10
+
+################ File contents on the bricks must be same. ################################
+TEST diff <(arequal-checksum -p $B0/$V01 -i .glusterfs) <(arequal-checksum -p $B0/$V02 -i .glusterfs)
+TEST diff <(arequal-checksum -p $B0/$V03 -i .glusterfs) <(arequal-checksum -p $B0/$V04 -i .glusterfs)
+
+############### Trying to heal files not in SB should fail. ###############################
+$CLI volume heal $V0 split-brain bigger-file /file1
+EXPECT "1" echo $?
+$CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}4 /file3
+EXPECT "1" echo $?
+
+cd -
+TEST rm $AREQUAL_PATH/arequal-checksum
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index f39db802588..e6d45add4e8 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4471,5 +4471,81 @@ out:
AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
if (dict)
dict_unref (dict);
+ if (inode) {
+ inode_forget (inode, 1);
+ inode_unref (inode);
+ }
+ return ret;
+}
+
+/*
+ * Serve a getxattr on GF_AFR_HEAL_SBRAIN: inspect @loc->gfid and run the
+ * data, metadata and entry self-heals that the inspection asks for.
+ *
+ * The getxattr frame is always unwound from here. When nothing needs
+ * healing, or when local->xdata_rsp was populated during the self-heal,
+ * the call unwinds successfully and the reason is carried in the reply
+ * dict under "sh-fail-msg". Any other self-heal failure is unwound as an
+ * error.
+ */
+int32_t
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        gf_boolean_t data_selfheal = _gf_false;
+        gf_boolean_t metadata_selfheal = _gf_false;
+        gf_boolean_t entry_selfheal = _gf_false;
+        dict_t *dict = NULL;
+        afr_local_t *local = NULL;
+        inode_t *inode = NULL;
+        int entry_ret = 0, metadata_ret = 0, data_ret = 0;
+        int ret = 0, op_errno = 0;
+
+        local = frame->local;
+        dict = dict_new ();
+        if (!dict) {
+                op_errno = ENOMEM;
+                ret = -1;
+                goto out;
+        }
+
+        ret = afr_selfheal_unlocked_inspect (frame, this, loc->gfid, &inode,
+                                             &data_selfheal,
+                                             &metadata_selfheal,
+                                             &entry_selfheal);
+        if (ret) {
+                op_errno = -ret;
+                ret = -1;
+                goto out;
+        }
+
+        if (!data_selfheal && !metadata_selfheal && !entry_selfheal) {
+                ret = dict_set_str (dict, "sh-fail-msg",
+                                    "File not in split-brain");
+                if (ret)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Failed to set sh-fail-msg in dict");
+                ret = 0;
+                goto out;
+        }
+
+        if (data_selfheal)
+                data_ret = afr_selfheal_data (frame, this, inode);
+
+        if (metadata_selfheal)
+                metadata_ret = afr_selfheal_metadata (frame, this, inode);
+
+        if (entry_selfheal)
+                entry_ret = afr_selfheal_entry (frame, this, inode);
+
+        /* Report the first failure. The results are negated below to form
+         * op_errno, so OR-ing several of them together would produce a
+         * value that corresponds to none of the actual errors. */
+        if (data_ret)
+                ret = data_ret;
+        else if (metadata_ret)
+                ret = metadata_ret;
+        else
+                ret = entry_ret;
+
+        if (local->xdata_rsp) {
+                /* 'sh-fail-msg' has been set in the dict during self-heal.*/
+                dict_copy (local->xdata_rsp, dict);
+                ret = 0;
+        } else if (ret) {
+                /*Some other error during self-heal. Just propagate it.*/
+                op_errno = -ret;
+                ret = -1;
+        }
+
+out:
+        AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
+        if (dict)
+                dict_unref(dict);
+        if (inode) {
+                inode_forget (inode, 1);
+                inode_unref (inode);
+        }
return ret;
}
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index e64070e1bcd..78dd65f30e7 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -1380,6 +1380,11 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
+ if (!strcmp (name, GF_AFR_HEAL_SBRAIN)) {
+ afr_heal_splitbrain_file (frame, this, loc);
+ return 0;
+ }
+
/*
* if we are doing getxattr with pathinfo as the key then we
* collect information from all childs
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 6198d4cf72c..e9d853c4ecd 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -17,7 +17,7 @@
#include "afr.h"
#include "afr-self-heal.h"
#include "byte-order.h"
-
+#include "protocol-common.h"
int
afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -287,6 +287,39 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
return 0;
}
+/*
+ * Among the children currently flagged in @sources, keep only those whose
+ * reply reports the greatest poststat size; every smaller one is cleared.
+ *
+ * Sources with differing sizes can show up if data was modified directly
+ * in the backend, or for snapshots.
+ */
+void
+afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
+                                 struct afr_reply *replies)
+{
+        afr_private_t *priv    = this->private;
+        uint64_t       largest = 0;
+        int            child   = 0;
+
+        /* Pass 1: find the biggest size among the sources. */
+        for (child = 0; child < priv->child_count; child++) {
+                if (sources[child] &&
+                    replies[child].poststat.ia_size > largest)
+                        largest = replies[child].poststat.ia_size;
+        }
+
+        /* Pass 2: demote every source that is smaller than that. */
+        for (child = 0; child < priv->child_count; child++) {
+                if (sources[child] &&
+                    replies[child].poststat.ia_size < largest)
+                        sources[child] = 0;
+        }
+}
+
void
afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
unsigned char *locked_on, unsigned char *sinks)
@@ -304,6 +337,154 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
}
gf_boolean_t
+afr_dict_contains_heal_op (call_frame_t *frame)
+{
+        /*
+         * Tell whether the request dictionary of this frame carries a
+         * "heal-op" key. When it does, "sh-fail-msg" in the response
+         * dictionary is additionally set to "File not in split-brain"
+         * (the response dictionary is created if it does not exist yet).
+         */
+        afr_local_t *local   = frame->local;
+        int          heal_op = -1;
+        int          ret     = 0;
+
+        ret = dict_get_int32 (local->xdata_req, "heal-op", &heal_op);
+        if (ret != 0)
+                return _gf_false;
+
+        if (!local->xdata_rsp)
+                local->xdata_rsp = dict_new ();
+
+        /* The message is best effort: neither a failed allocation nor a
+         * failed dict_set_str() changes the answer. */
+        if (local->xdata_rsp)
+                ret = dict_set_str (local->xdata_rsp, "sh-fail-msg",
+                                    "File not in split-brain");
+
+        return _gf_true;
+}
+
+/* Pick the heal source for a file in split-brain, as directed by the
+ * "heal-op" value found in local->xdata_req.
+ *
+ * Precondition (verified below): for every i with locked_on[i] set,
+ * sources[i] is 0 while sinks[i] and healed_sinks[i] are non-zero, i.e.
+ * ∀i (locked_on[i] ==> (sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)).
+ * This should be the case if the file is in split-brain.
+ *
+ * heal-op handling:
+ *  - GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: refused for
+ *    AFR_METADATA_TRANSACTION. Otherwise every locked child becomes a
+ *    candidate and afr_mark_largest_file_as_source() prunes the smaller
+ *    ones; exactly one candidate must remain, else "No bigger file".
+ *  - GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: the child named by "child-name" in
+ *    xdata_req is used; the name must resolve to a child index and that
+ *    child must be locked.
+ *  - any other value fails.
+ *
+ * On success the chosen index (>= 0) is returned and sources[source],
+ * sinks[source] and healed_sinks[source] are set to 1, 0 and 0
+ * respectively. On failure a non-zero status is returned: -1, or whatever
+ * the failing dict call returned (presumably negative -- the callers in
+ * this patch only test for < 0). Where a reason is known it is stored
+ * under "sh-fail-msg" in local->xdata_rsp, which is created on demand.
+ *
+ * NOTE: in the bigger-file case sources[] is modified before the
+ * uniqueness check, so after a "No bigger file" failure sources[] still
+ * has the tied candidates set.
+ */
+int
+afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+ unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int ret = 0;
+ int heal_op = -1;
+ int i = 0;
+ char *name = NULL;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+ ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
+ if (ret) /* no "heal-op" in the request */
+ goto out;
+ for (i = 0; i < priv->child_count; i++) { /* split-brain precondition */
+ if (locked_on[i])
+ if (sources[i] || !sinks[i] || !healed_sinks[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+ if (local->xdata_rsp == NULL) { /* failure messages go here */
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ ret = -1;
+ goto out;
+ }
+ }
+ xdata_rsp = local->xdata_rsp;
+
+ switch (heal_op) {
+ case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_str (xdata_rsp, "sh-fail-msg",
+ "Use source-brick option to"
+ " heal metadata split-brain");
+ if (!ret) /* message stored; still report failure */
+ ret = -1;
+ goto out;
+ }
+ for (i = 0 ; i < priv->child_count; i++) /* all locked children are candidates */
+ if (locked_on[i])
+ sources[i] = 1;
+ afr_mark_largest_file_as_source (this, sources, replies);
+ if (AFR_COUNT (sources, priv->child_count) != 1) { /* largest size is not unique */
+ ret = dict_set_str (xdata_rsp, "sh-fail-msg",
+ "No bigger file");
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ for (i = 0 ; i < priv->child_count; i++) /* locate the single survivor */
+ if (sources[i])
+ source = i;
+ sinks[source] = 0;
+ healed_sinks[source] = 0;
+ break;
+ case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str (xdata_req, "child-name", &name);
+ if (ret)
+ goto out;
+ source = afr_get_child_index_from_name (this, name);
+ if (source < 0) {
+ ret = dict_set_str (xdata_rsp, "sh-fail-msg",
+ "Invalid brick name");
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ if (locked_on[source] != 1) {
+ ret = dict_set_str (xdata_rsp, "sh-fail-msg",
+ "Brick is not up");
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ sources[source] = 1;
+ sinks[source] = 0;
+ healed_sinks[source] = 0;
+ break;
+ default:
+ ret = -1;
+ goto out;
+ }
+ ret = source; /* success: hand back the chosen child index */
+out:
+ return ret;
+
+}
+
+/*
+ * Look up the child whose xlator name equals 'name'.
+ *
+ * Returns the position of the first match in priv->children[], or -1 when
+ * no child carries that name.
+ */
+int
+afr_get_child_index_from_name (xlator_t *this, char *name)
+{
+        afr_private_t *priv = this->private;
+        int            i    = 0;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (strcmp (priv->children[i]->name, name) == 0)
+                        return i;
+        }
+
+        return -1;
+}
+
+
+gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness)
{
int i = 0;
@@ -427,6 +608,14 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
}
}
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT (sources, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ sinks[i] = 1;
+ }
+ }
+
/* In afr-v1 if a file is self-accused but didn't have any pending
* operations on others then it is similar to 'dirty' in afr-v2.
* Consider such cases as witness.
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index a434b9e6ba1..45a099cec86 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -17,6 +17,7 @@
#include "afr.h"
#include "afr-self-heal.h"
#include "byte-order.h"
+#include "protocol-common.h"
enum {
AFR_SELFHEAL_DATA_FULL = 0,
@@ -426,41 +427,6 @@ afr_does_size_mismatch (xlator_t *this, unsigned char *sources,
return _gf_false;
}
-/*
- * If by chance there are multiple sources with differing sizes, select
- * the largest file as the source.
- *
- * This can happen if data was directly modified in the backend or for snapshots
- */
-
-static void
-afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
- struct afr_reply *replies)
-{
- int i = 0;
- afr_private_t *priv = NULL;
- uint64_t size = 0;
-
- /* Find source with biggest file size */
- priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (size <= replies[i].poststat.ia_size) {
- size = replies[i].poststat.ia_size;
- }
- }
-
- /* Mark sources with less size as not source */
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (size > replies[i].poststat.ia_size)
- sources[i] = 0;
- }
-
- return;
-}
static void
afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources,
@@ -518,7 +484,9 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,
}
static int
-__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
+__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
+ unsigned char *sources,
+ unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *locked_on,
struct afr_reply *replies,
@@ -528,7 +496,6 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
afr_private_t *priv = NULL;
int source = -1;
int sources_count = 0;
-
priv = this->private;
sources_count = AFR_COUNT (sources, priv->child_count);
@@ -536,9 +503,21 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
/* split brain */
- return -EIO;
+ source = afr_mark_split_brain_source_sinks (frame, this,
+ sources, sinks,
+ healed_sinks,
+ locked_on, replies,
+ AFR_DATA_TRANSACTION);
+ if (source < 0)
+ return -EIO;
+ return source;
}
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
/* If there are no witnesses/size-mismatches on sources we are done*/
if (!afr_does_size_mismatch (this, sources, replies) &&
!afr_has_source_witnesses (this, sources, witness))
@@ -605,9 +584,10 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
*/
AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
- source = __afr_selfheal_data_finalize_source (this, sources,
- healed_sinks, locked_on,
- replies, witness);
+ source = __afr_selfheal_data_finalize_source (frame, this, sources,
+ sinks, healed_sinks,
+ locked_on, replies,
+ witness);
if (source < 0)
return -EIO;
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 0518c1821e3..05d9f2b4917 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -17,6 +17,7 @@
#include "afr.h"
#include "afr-self-heal.h"
#include "byte-order.h"
+#include "protocol-common.h"
#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE)
@@ -199,6 +200,7 @@ out:
static int
__afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
unsigned char *sources,
+ unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *locked_on,
struct afr_reply *replies)
@@ -208,13 +210,26 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
struct iatt first = {0, };
int source = -1;
int sources_count = 0;
+ dict_t *xdata_req = NULL;
+ afr_local_t *local = NULL;
priv = this->private;
+ local = frame->local;
+ xdata_req = local->xdata_req;
sources_count = AFR_COUNT (sources, priv->child_count);
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
+
+ source = afr_mark_split_brain_source_sinks (frame, this,
+ sources, sinks,
+ healed_sinks,
+ locked_on, replies,
+ AFR_METADATA_TRANSACTION);
+ if (source >= 0)
+ return source;
+
/* If this is a directory mtime/ctime only split brain
use the most recent */
source = afr_dirtime_splitbrain_source (frame, this,
@@ -224,17 +239,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
"split brain on %s",
uuid_utoa (replies[source].poststat.ia_gfid));
sources[source] = 1;
-
- for (i = 0; i < priv->child_count; i++) {
- if (i == source)
- continue;
-
- if (!locked_on[i])
- continue;
-
- healed_sinks[i] = 1;
- }
-
+ healed_sinks[source] = 0;
return source;
}
@@ -253,6 +258,11 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
}
}
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
for (i = 0; i < priv->child_count; i++) {
if (!sources[i])
continue;
@@ -352,7 +362,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
}
source = __afr_selfheal_metadata_finalize_source (frame, this, sources,
- healed_sinks,
+ sinks, healed_sinks,
locked_on, replies);
if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 50cff91ccb3..74cc9608cf6 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -193,10 +193,28 @@ afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
int source, unsigned char *healed_sinks);
void
+afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies);
+void
afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
unsigned char *locked_on, unsigned char *sinks);
gf_boolean_t
+afr_dict_contains_heal_op (call_frame_t *frame);
+
+int
+afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+ unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type);
+
+int
+afr_get_child_index_from_name (xlator_t *this, char *name);
+
+gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness);
int
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 4fdc5f774cc..09821b724fe 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1021,4 +1021,8 @@ afr_is_xattr_ignorable (char *key);
int
afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc,
dict_t *xdata);
+
+int
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 82b527e9141..866e3faf629 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -2636,8 +2636,10 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
this_call_cnt = dht_frame_return (frame);
- if (!xattr || (op_ret == -1))
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
goto out;
+ }
if (dict_get (xattr, conf->xattr_name)) {
dict_del (xattr, conf->xattr_name);
@@ -2808,7 +2810,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
subvol = layout->list[i].xlator;
STACK_WIND (frame, dht_vgetxattr_dir_cbk,
subvol, subvol->fops->getxattr,
- loc, key, NULL);
+ loc, key, xdata);
}
return 0;
}
@@ -2821,7 +2823,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
local->call_cnt = 1;
STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol,
- cached_subvol->fops->getxattr, loc, key, NULL);
+ cached_subvol->fops->getxattr, loc, key, xdata);
return 0;
}
@@ -2854,7 +2856,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (hashed_subvol) {
STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
hashed_subvol->fops->getxattr, loc,
- GF_XATTR_PATHINFO_KEY, NULL);
+ GF_XATTR_PATHINFO_KEY, xdata);
return 0;
}
op_errno = ENODATA;
@@ -2933,7 +2935,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
subvol = layout->list[i].xlator;
STACK_WIND (frame, dht_getxattr_cbk,
subvol, subvol->fops->getxattr,
- loc, key, NULL);
+ loc, key, xdata);
}
return 0;