summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNithya Balachandran <nbalacha@redhat.com>2015-04-27 21:18:10 +0530
committerRaghavendra G <rgowdapp@redhat.com>2015-05-05 09:21:37 -0700
commitdbaaacf720baedc7f94b3acb6a479db394f54f57 (patch)
treeb0a4cdec5a19b29c08c85380ba49ca0a9f080bba
parenta5fe0f594d41e1a11661d9074bb19e9c2e2c4776 (diff)
geo-rep: rename handling in dht volume
Background: Glusterfs changelogs are stored in each brick and record the changes that happened in that brick. Georep runs on all the nodes of the master and processes changelogs "independently". Changelogs are processed at brick level, but all the fops are replayed on the "slave mount" point. Problem: With a DHT volume, "internal fops" are NOT recorded in the changelog. In the rename case, the rename is recorded in the "hashed" brick's changelog. (DHT's internal fops, like creating a linkto file or unlink, are NOT recorded). This leads to inconsistent rename operations. For example, consider a distribute volume created with two bricks B1, B2. //Consider master volume mounted @ /mnt/master and following operations executed: cd /mnt/master touch f1 // f1 falls on B1 Hash mv f1 f2 // f2 falls on B2 Hash // Here, Changelogs are recorded as below: @B1 CREATE f1 @B2 RENAME f1 f2 Here, a race exists between bricks B1 and B2; say B2 gets executed first. The source file f1 itself is "NOT PRESENT", so it will go ahead and create f2 (current implementation). We have this problem when the rename falls on another brick and the file is unlinked on the master. A similar issue exists in the following case too (multiple renames): CREATE f1 RENAME f1 f2 RENAME f2 f1 Solution: Instead of carrying out "changelogging" at the "HASHED volume", carry it out at the "CACHED volume". This way rename operations are carried out where the actual files are present. So, the changelog is recorded as: @B1 CREATE f1 RENAME f1 f2 credit: sarumuga@redhat.com PS: Some races, such as the one below, are _NOT_ fixed by this patch * f1 and f2 exist. B1 and B2 are their respective cached subvols. For both files hashed-subvol == cached-subvol * mv f1 f2 on master. * B1 has change-log entry of rename f1 f2 * rebalance migrates f2 from B1 to B2 * mv f2 f1 on master. * B2 has change-log entry of rename f2 f1 Since changelog entries (rename f1 f2) and (rename f2 f1) are processed independently by gsyncds, which of f1 and f2 survives on the slave is subject to a race. 
Note that on master it is file f1 with name f1 which survived. On slave it can be either file f1 with name f1 or file f2 with name f2, depending on who wins the race of processing the changelog. Change-Id: Iebc222f582613924c3a7cba37fb6d3e2d8332eda BUG: 1141379 Signed-off-by: Nithya Balachandran <nbalacha@redhat.com> Reviewed-on: http://review.gluster.org/10410 Tested-by: Gluster Build System <jenkins@build.gluster.com> Tested-by: NetBSD Build System Reviewed-by: Raghavendra G <rgowdapp@redhat.com> Tested-by: Raghavendra G <rgowdapp@redhat.com>
-rw-r--r--libglusterfs/src/common-utils.h25
-rw-r--r--libglusterfs/src/glusterfs.h1
-rw-r--r--xlators/cluster/dht/src/dht-rename.c82
3 files changed, 108 insertions, 0 deletions
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
index e8b5fc83591..c361405d5eb 100644
--- a/libglusterfs/src/common-utils.h
+++ b/libglusterfs/src/common-utils.h
@@ -130,6 +130,31 @@ enum _gf_xlator_ipc_targets {
typedef enum _gf_boolean gf_boolean_t;
typedef enum _gf_client_pid gf_client_pid_t;
typedef enum _gf_xlator_ipc_targets _gf_xlator_ipc_targets_t;
+
+/* The DHT file rename operation is not a straightforward rename.
+ * It involves creating linkto and linkfiles, and can unlink or rename the
+ * source file depending on the hashed and cached subvols for the source
+ * and target files. This makes it difficult for geo-rep to figure out that
+ * a rename operation has taken place.
+ *
+ * We now send a special key and the values of the source and target pargfids
+ * and basenames to indicate to changelog that the operation in question
+ * should be treated as a rename. We are explicitly filling and sending this
+ * as a binary value in the dictionary as the unlink op will not have the
+ * source file information. The lengths of the src and target basenames
+ * are used to calculate where to start reading the names in the structure.
+ * XFS allows a max of 255 chars for filenames but other file systems might
+ * not have such restrictions.
+ *
+ * Layout of the trailing data, as filled in by
+ * dht_rename_track_for_changelog(): the old basename starts at buffer[0]
+ * and occupies oldname_len bytes; the new basename starts at
+ * buffer[oldname_len] and occupies newname_len bytes. Both lengths include
+ * the terminating NUL byte.
+ */
+typedef struct dht_changelog_rename_info {
+ uuid_t old_pargfid; /* pargfid of the rename source (oldloc->pargfid) */
+ uuid_t new_pargfid; /* pargfid of the rename target (newloc->pargfid) */
+ int32_t oldname_len; /* strlen (old basename) + 1 */
+ int32_t newname_len; /* strlen (new basename) + 1 */
+ char buffer[1]; /* start of the variable-length name data */
+ } dht_changelog_rename_info_t;
+
+
typedef int (*gf_cmp) (void *, void *);
void gf_global_variable_init(void);
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index ba59e4d54db..3373ef6f7b8 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -140,6 +140,7 @@
#define GLUSTERFS_VERSION_XCHG_KEY "glusterfs.version.xchg"
#define GLUSTERFS_INTERNAL_FOP_KEY "glusterfs-internal-fop"
+#define DHT_CHANGELOG_RENAME_OP_KEY "changelog.rename-op"
#define ZR_FILE_CONTENT_STR "glusterfs.file."
#define ZR_FILE_CONTENT_STRLEN 15
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index 0594945203e..4e4e9869685 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -319,6 +319,56 @@ err:
NULL, NULL);
return 0;
}
+
+
+
+/*
+ * Pack the rename details (pargfids and basenames of the source and the
+ * target) into a dht_changelog_rename_info_t and store it in @xattr as a
+ * binary value under DHT_CHANGELOG_RENAME_OP_KEY.
+ *
+ * info->buffer holds the old basename (oldname_len bytes, NUL included)
+ * immediately followed by the new basename (newname_len bytes, NUL
+ * included).
+ *
+ * @this:   xlator used for logging.
+ * @xattr:  dictionary that receives the packed value.
+ * @oldloc: rename source; its pargfid and name are copied.
+ * @newloc: rename target; its pargfid and name are copied.
+ *
+ * Returns 0 on success and non-zero on invalid arguments, allocation
+ * failure or failure to set the dictionary value.
+ */
+static int
+dht_rename_track_for_changelog (xlator_t *this, dict_t *xattr,
+                                loc_t *oldloc, loc_t *newloc)
+{
+        int                          ret  = -1;
+        dht_changelog_rename_info_t *info = NULL;
+        char                        *name = NULL;
+        int                          len1 = 0;
+        int                          len2 = 0;
+        int                          size = 0;
+
+        if (!xattr || !oldloc || !newloc || !this)
+                return ret;
+
+        /* strlen () on a NULL basename would be undefined behaviour. */
+        if (!oldloc->name || !newloc->name)
+                return ret;
+
+        /* Both lengths include the terminating NUL byte. */
+        len1 = strlen (oldloc->name) + 1;
+        len2 = strlen (newloc->name) + 1;
+        size = sizeof (dht_changelog_rename_info_t) + len1 + len2;
+
+        info = GF_CALLOC (size, sizeof(char), gf_common_mt_char);
+        if (!info) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_DICT_SET_FAILED,
+                        "Failed to calloc memory");
+                return ret;
+        }
+
+        gf_uuid_copy (info->old_pargfid, oldloc->pargfid);
+        gf_uuid_copy (info->new_pargfid, newloc->pargfid);
+
+        info->oldname_len = len1;
+        info->newname_len = len2;
+
+        /* The names are stored back to back; len1/len2 already cover the
+         * NUL terminators, so a plain byte copy is sufficient. */
+        memcpy (info->buffer, oldloc->name, len1);
+        name = info->buffer + len1;
+        memcpy (name, newloc->name, len2);
+
+        ret = dict_set_bin (xattr, DHT_CHANGELOG_RENAME_OP_KEY,
+                            info, size);
+        if (ret) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+                        "Failed to set dictionary value: key = %s,"
+                        " path = %s", DHT_CHANGELOG_RENAME_OP_KEY,
+                        oldloc->name);
+                /* The dictionary owns the buffer only when the set
+                 * succeeds; on failure it is still ours to release. */
+                GF_FREE (info);
+        }
+        return ret;
+}
+
+
#define DHT_MARK_FOP_INTERNAL(xattr) do { \
int tmp = -1; \
if (!xattr) { \
@@ -354,6 +404,32 @@ err:
} \
}while (0)
+
+/*
+ * Attach the rename-tracking value for oldloc -> newloc to xattr.
+ *
+ * If xattr is NULL a new dictionary is created with dict_new () and
+ * assigned to it, so xattr must be a modifiable lvalue. If that creation
+ * fails, an error is logged and the macro body is left without tracking
+ * anything. Otherwise dht_rename_track_for_changelog () is called and an
+ * error is logged when it returns non-zero.
+ *
+ * The expansion refers to a variable named `this`, which must be in scope
+ * at the point of use. oldloc is also dereferenced directly for logging.
+ */
+#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new (); \
+ if (!xattr) { \
+ gf_msg (this->name, GF_LOG_ERROR, 0, \
+ DHT_MSG_DICT_SET_FAILED, \
+ "Failed to create dictionary to " \
+ "track rename"); \
+ break; \
+ } \
+ } \
+ \
+ tmp = dht_rename_track_for_changelog (this, xattr, \
+ oldloc, newloc); \
+ \
+ if (tmp) { \
+ gf_msg (this->name, GF_LOG_ERROR, 0, \
+ DHT_MSG_DICT_SET_FAILED, \
+ "Failed to set dictionary value: key = %s," \
+ " path = %s", DHT_CHANGELOG_RENAME_OP_KEY, \
+ (oldloc)->path); \
+ } \
+ } while (0)
+
int
dht_rename_unlock_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
@@ -745,6 +821,8 @@ err:
DHT_MARKER_DONT_ACCOUNT(xattr_new);
}
+ DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc,
+ &local->loc2);
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
&local->loc, 0, xattr_new);
@@ -831,6 +909,10 @@ dht_do_rename (call_frame_t *frame)
DHT_MARKER_DONT_ACCOUNT(dict);
}
+ if (rename_subvol == src_cached) {
+ DHT_CHANGELOG_TRACK_AS_RENAME(dict, &local->loc, &local->loc2);
+ }
+
gf_msg_trace (this->name, 0,
"renaming %s => %s (%s)",
local->loc.path, local->loc2.path, rename_subvol->name);