From eef0737ca6ae8056d906c7bff0a9280cc748270e Mon Sep 17 00:00:00 2001
From: Pranith Kumar K
Date: Mon, 22 Jul 2013 16:44:09 +0530
Subject: cluster/afr: Handle parallel hardlinks self-heal

Change-Id: Ieda11870c65edae500140b6c061f15a7b3f264f3
BUG: 986905
Signed-off-by: Pranith Kumar K
Reviewed-on: http://review.gluster.org/5370
Tested-by: Gluster Build System
Reviewed-by: Vijay Bellur
---
 tests/bugs/bug-986905.t                       | 27 +++++++++++++++++++++++++
 xlators/cluster/afr/src/afr-self-heal-entry.c | 29 +++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100755 tests/bugs/bug-986905.t

diff --git a/tests/bugs/bug-986905.t b/tests/bugs/bug-986905.t
new file mode 100755
index 000000000..0fac40fb4
--- /dev/null
+++ b/tests/bugs/bug-986905.t
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+#This script checks that hard links created while a brick is down are
+#healed properly.
+
+cleanup;
+function get_inum {
+        ls -i $1 | awk '{print $1}'
+}
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST touch $M0/a
+TEST ln $M0/a $M0/link_a
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0
+TEST ls -l $M0
+inum=$(get_inum $B0/${V0}0/a)
+EXPECT "$inum" get_inum $B0/${V0}0/link_a
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 3598f79d1..db17052cb 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -1256,6 +1256,35 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
                 gf_log (this->name, GF_LOG_INFO,
                         "%s: gfid set failed",
                         impunge_local->loc.path);
+
+        /*
+         * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY:
+         *
+         * Problem:
+         * While one brick of a replica pair is down, say the user creates
+         * a file (file-A) and a hard link to it (h-file-A). After the
+         * brick comes back up, entry self-heal is attempted on the parent
+         * directory of these two files. As part of its readdir, self-heal
+         * reads both entries, file-A and h-file-A, and performs a nameless
+         * (gfid-based) lookup for each to check whether any hard link is
+         * already present on the destination brick. It finds none for
+         * either entry, so it issues a mknod for both file-A and
+         * h-file-A. As a result, file-A and h-file-A are no longer hard
+         * links of each other on that brick.
+         *
+         * Fix: (this shrinks the race window rather than closing it; the
+         * race itself is still present in posix_mknod)
+         * When mknod arrives with GLUSTERFS_INTERNAL_FOP_KEY set,
+         * posix_mknod checks whether a gfid-link already exists and, if
+         * so, performs a link() instead of a mknod(). A race remains:
+         * two parallel posix_mknod calls for the same gfid may both see
+         * that the gfid-link is absent, both proceed with mknod, and
+         * create two different files with the same gfid.
+         */
+        ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+        if (ret)
+                gf_log (this->name, GF_LOG_INFO, "%s: %s set failed",
+                        impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY);
+
         STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
                            (void *) (long) child_index,
                            priv->children[child_index],
--
cgit
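
Editor's note: the posix_mknod side of the fix that the comment above relies
on is not part of this patch. As a rough illustration of the "link to the
existing gfid handle instead of creating a new inode" idea, here is a minimal
standalone C sketch. The helper name create_or_link and the gfid_path
argument are hypothetical; the real logic lives in posix_mknod() in the
posix xlator, and gfid_path stands in for the .glusterfs/<xx>/<yy>/<gfid>
hard-link handle the brick maintains per inode.

#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Sketch only: if a hard link for this gfid already exists on the
 * brick, link() the new name to it instead of mknod()-ing a fresh
 * inode. */
static int
create_or_link (const char *gfid_path, const char *new_name,
                mode_t mode, dev_t dev)
{
        struct stat st;

        if (stat (gfid_path, &st) == 0)
                /* gfid already present: make new_name another hard
                 * link to the same inode. */
                return link (gfid_path, new_name);

        if (errno != ENOENT)
                return -1;

        /* No gfid handle yet: create the first inode. Two racing
         * callers can both reach this point; that is the residual
         * window the comment in the patch concedes. */
        return mknod (new_name, mode, dev);
}

The check-then-act pair (stat followed by mknod) is exactly where the
remaining race lives, which is why the commit message describes the fix as
shrinking the window rather than closing it.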