summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVenkatesh Somyajulu <vsomyaju@redhat.com>2014-06-17 15:09:50 +0530
committerVijay Bellur <vbellur@redhat.com>2014-06-17 05:39:22 -0700
commit93832829016a0a51a8938c0c89c6bd09b3229c9f (patch)
tree46bf0526f0585e1046141cd265dfb3afd1844cd5
parent3a499d170de2c7df06b127b709d27c64cef98886 (diff)
cluster/dht: Do layout self healing of directory for nameless lookup
Problem: Currently in the nameless lookup code path, if at the end of the lookup, even if it detects that layout anamolies are there, layout healing will not be done as there is no code to heal it. So there can be race between mkdir and lookup. Assume mkdir is going on from some other mount point, Say, M1. Directories are created on some nodes but layout is not set yet. Now from M2, nameless lookup goes, lookup will be success full as the directory is present on some of the nodes, but it won't heal layout. Now if create goes after lookup fop, because layout is absent, file creation will fail. Fix: Included the code of layout self-heal in the nameless lookup path. At the end of lookup, layout will be computed as it would have been in the named lookup, but it will be set to those node only, where directory is present. So after that if create fop goes, the probabiliy to get the subvolume with proper hash-range is high now, so reduces the race window. Other: Whenever a directory is created, we have to choose a brick from which we start allocating layout in a circular fashion. To calculate this starting brick, I have changed the candidate from name of the directory to gfid of the directory But to compute where a given file belongs, we will still use the name of the file. Hash computed from the name of the file should belong to any one of the directory-hash-range Calculation of hash for a file is acting as a consumer and the setting of directory layout based on gfid is acting as a producer, which are independent from each other. Change-Id: I3808c55082cd1b5c72d2c77cbbc063f55aa38bee BUG: 1095888 Signed-off-by: Venkatesh Somyajulu <vsomyaju@redhat.com> Reviewed-on: http://review.gluster.org/7493 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rwxr-xr-xtests/bugs/bug-1088231.t161
-rw-r--r--xlators/cluster/dht/src/dht-common.c23
-rw-r--r--xlators/cluster/dht/src/dht-common.h6
-rw-r--r--xlators/cluster/dht/src/dht-layout.c1
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c197
5 files changed, 387 insertions, 1 deletions
diff --git a/tests/bugs/bug-1088231.t b/tests/bugs/bug-1088231.t
new file mode 100755
index 00000000000..da6adaeb173
--- /dev/null
+++ b/tests/bugs/bug-1088231.t
@@ -0,0 +1,161 @@
+#!/bin/bash
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../fileio.rc
+. $(dirname $0)/../dht.rc
+
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 cluster.randomize-hash-range-by-gfid on
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --aux-gfid-mount --volfile-server=$H0 $M0
+TEST mkdir $M0/a
+
+
+## Bug Description: In case of dht_discover code path, which is triggered
+## when lookup done is nameless lookup, at the end of the lookup, even if
+## it finds that self-heal is needed to fix-the layout it wont heal because
+## healing code path is not added under nameless lookup.
+
+## What to test: With Patch, Even in case of nameless lookup, if layout
+## needs to be fixed, the it will be fixed wherever lookup is successfull
+## and it will not create any directory for subvols having ENOENT as it is
+## nameless lookup.
+
+gfid_with_hyphen=`getfattr -n glusterfs.gfid.string $M0/a 2>/dev/null \
+ | grep glusterfs.gfid.string | cut -d '"' -f 2`
+
+TEST setfattr -x trusted.glusterfs.dht $B0/$V0"0"/a
+
+TEST stat $M0/.gfid/$gfid_with_hyphen
+
+## Assuming that we have two bricks, we can have two permutations of layout
+## Case 1: Brick - A Brick - B
+## 0 - 50 51-100
+##
+## Case 2: Brick - A Brick - B
+## 51 - 100 0 - 50
+##
+## To ensure layout is assigned properly, the following tests should be
+## performed.
+##
+## Case 1: Layout_b0_s = 0; Layout_b0_e = 50, Layout_b1_s=51,
+## Layout_b1_e = 100;
+##
+## layout_b1_s = layout_b0_e + 1;
+## layout_b0_s = layout_b1_e + 1; but b0_s is 0, so change to 101
+## then compare
+## Case 2: Layout_b0_s = 51, Layout_b0_e = 100, Layout_b1_s=0,
+## Layout_b1_e = 51
+##
+## layout_b0_s = Layout_b1_e + 1;
+## layout_b1_s = Layout_b0_e + 1; but b1_s is 0, so chage to 101.
+
+
+##Extract Layout
+layout_b0_s=`get_layout $B0/$V0"0"/a | cut -c19-26`
+layout_b0_e=`get_layout $B0/$V0"0"/a | cut -c27-34`
+layout_b1_s=`get_layout $B0/$V0"1"/a | cut -c19-26`
+layout_b1_e=`get_layout $B0/$V0"1"/a | cut -c27-34`
+
+
+##Add 0X to perform Hex arithematic
+layout_b0_s="0x"$layout_b0_s
+layout_b0_e="0x"$layout_b0_e
+layout_b1_s="0x"$layout_b1_s
+layout_b1_e="0x"$layout_b1_e
+
+
+
+## Logic of converting starting layout "0" to "Max_value of layout + 1"
+comp1=$(($layout_b0_s + 0))
+if [ "$comp1" == "0" ];then
+ comp1=4294967296
+fi
+
+comp2=$(($layout_b1_s + 0))
+if [ "$comp2" == "0" ];then
+ comp2=4294967296
+fi
+
+diff1=$(($layout_b0_e + 1))
+diff2=$(($layout_b1_e + 1))
+
+
+healed=0
+
+if [ "$comp1" == "$diff1" ] && [ "$comp2" == "$diff2" ]; then
+ healed=$(($healed + 1))
+fi
+
+if [ "$comp1" == "$diff2" ] && [ "$comp2" == "$diff1" ]; then
+ healed=$(($healed + 1))
+fi
+
+TEST [ $healed == 1 ]
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1}
+TEST $CLI volume set $V0 cluster.randomize-hash-range-by-gfid on
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --aux-gfid-mount --volfile-server=$H0 $M0
+TEST mkdir $M0/a
+
+gfid_with_hyphen=`getfattr -n glusterfs.gfid.string $M0/a 2>/dev/null \
+ | grep glusterfs.gfid.string | cut -d '"' -f 2`
+
+TEST setfattr -x trusted.glusterfs.dht $B0/$V0"0"/a
+TEST setfattr -x trusted.glusterfs.dht $B0/$V0"1"/a
+
+TEST stat $M0/.gfid/$gfid_with_hyphen
+
+##Extract Layout
+layout_b0_s=`get_layout $B0/$V0"0"/a | cut -c19-26`
+layout_b0_e=`get_layout $B0/$V0"0"/a | cut -c27-34`
+layout_b1_s=`get_layout $B0/$V0"1"/a | cut -c19-26`
+layout_b1_e=`get_layout $B0/$V0"1"/a | cut -c27-34`
+
+
+##Add 0X to perform Hex arithematic
+layout_b0_s="0x"$layout_b0_s
+layout_b0_e="0x"$layout_b0_e
+layout_b1_s="0x"$layout_b1_s
+layout_b1_e="0x"$layout_b1_e
+
+
+
+## Logic of converting starting layout "0" to "Max_value of layout + 1"
+comp1=$(($layout_b0_s + 0))
+if [ "$comp1" == "0" ];then
+ comp1=4294967296
+fi
+
+comp2=$(($layout_b1_s + 0))
+if [ "$comp2" == "0" ];then
+ comp2=4294967296
+fi
+
+diff1=$(($layout_b0_e + 1))
+diff2=$(($layout_b1_e + 1))
+
+
+healed=0
+
+if [ "$comp1" == "$diff1" ] && [ "$comp2" == "$diff2" ]; then
+ healed=$(($healed + 1))
+fi
+
+if [ "$comp1" == "$diff2" ] && [ "$comp2" == "$diff1" ]; then
+ healed=$(($healed + 1))
+fi
+
+TEST [ $healed == 1 ]
+cleanup
+
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 5c4fe2d4ca4..7eb40faa9a3 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -212,6 +212,17 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
op_errno = ESTALE;
goto out;
}
+
+ /* For fixing the directory layout, we need to choose
+ * the subvolume on which layout will be set first.
+ * Because in nameless lookup, we have gfid only,
+ * we are dependent on gfid. Therefore if conf->
+ * randomize_by_gfid is set, then only we proceed for
+ * healing layout of directory otherwise we don't heal.
+ */
+
+ if (local->inode && conf->randomize_by_gfid)
+ goto selfheal;
}
if (local->inode)
@@ -227,6 +238,18 @@ out:
NULL);
return ret;
+
+selfheal:
+
+ main_frame->local = local;
+ discover_frame->local = NULL;
+ FRAME_SU_DO (main_frame, dht_local_t);
+ uuid_copy (local->loc.gfid, local->gfid);
+ ret = dht_selfheal_directory_for_nameless_lookup (main_frame,
+ dht_lookup_selfheal_cbk,
+ &local->loc, layout);
+ return ret;
+
}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index e1e3a5a4e38..ba66306dd13 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -456,6 +456,12 @@ int dht_lookup_everywhere (call_frame_t *f
int
dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
loc_t *loc, dht_layout_t *layout);
+
+int
+dht_selfheal_directory_for_nameless_lookup (call_frame_t *frame,
+ dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+
int
dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
dht_layout_t *layout);
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 82127adf6e5..34892983a55 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -442,7 +442,6 @@ dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)
layout->list[j].xlator->name));
}
-
gf_boolean_t
dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator)
{
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 04e58903147..1e2d7438350 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -332,6 +332,144 @@ out:
return 0;
}
+gf_boolean_t
+dht_is_subvol_part_of_layout (dht_layout_t *layout, xlator_t *xlator)
+{
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp (layout->list[i].xlator->name, xlator->name)) {
+ ret = _gf_true;
+ break;
+
+ }
+ }
+
+ return ret;
+}
+
+int
+dht_layout_index_from_conf (dht_layout_t *layout, xlator_t *xlator)
+{
+ int i = -1;
+ int j = 0;
+
+ for (j = 0; j < layout->cnt; j++) {
+ if (!strcmp (layout->list[j].xlator->name, xlator->name)) {
+ i = j;
+ break;
+ }
+ }
+
+ return i;
+}
+
+
+static int
+dht_selfheal_dir_xattr_for_nameless_lookup (call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ int missing_xattr = 0;
+ int i = 0;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
+ int j = 0;
+
+ local = frame->local;
+ this = frame->this;
+ conf = this->private;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err != -1 || !layout->list[i].stop) {
+ /* err != -1 would mean xattr present on the directory
+ or the directory is non existent.
+ !layout->list[i].stop would mean layout absent
+ */
+
+ continue;
+ }
+ missing_xattr++;
+ }
+
+ /* Also account for subvolumes with no-layout. Used for zero'ing out
+ the layouts and for setting quota key's if present */
+
+ /* Send where either the subvol is not part of layout,
+ * or it is part of the layout but error is non-zero but error
+ * is not equal to -1 or ENOENT.
+ */
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (dht_is_subvol_part_of_layout (layout, conf->subvolumes[i])
+ == _gf_false) {
+ missing_xattr++;
+ continue;
+ }
+
+ j = dht_layout_index_from_conf (layout, conf->subvolumes[i]);
+
+ if ((j != -1) && (layout->list[j].err != -1) &&
+ (layout->list[j].err != 0) &&
+ (layout->list[j].err != ENOENT)) {
+ missing_xattr++;
+ }
+
+ }
+
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "%d subvolumes missing xattr for %s",
+ missing_xattr, loc->path);
+
+ if (missing_xattr == 0) {
+ dht_selfheal_dir_finish (frame, this, 0);
+ return 0;
+ }
+
+ local->call_cnt = missing_xattr;
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err != -1 || !layout->list[i].stop)
+ continue;
+
+ dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL);
+
+ if (--missing_xattr == 0)
+ break;
+ }
+
+ dummy = dht_layout_new (this, 1);
+ if (!dummy)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) {
+ if (dht_is_subvol_part_of_layout (layout, conf->subvolumes[i])
+ == _gf_false) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ missing_xattr--;
+ continue;
+ }
+
+ j = dht_layout_index_from_conf (layout, conf->subvolumes[i]);
+
+ if ((j != -1) && (layout->list[j].err != -1) &&
+ (layout->list[j].err != ENOENT) &&
+ (layout->list[j].err != 0)) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ missing_xattr--;
+ }
+ }
+
+ dht_layout_unref (this, dummy);
+out:
+ return 0;
+
+}
+
int
dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *statpre,
@@ -1030,6 +1168,65 @@ sorry_no_fix:
return 0;
}
+int
+dht_selfheal_directory_for_nameless_lookup (call_frame_t *frame,
+ dht_selfheal_dir_cbk_t dir_cbk,
+ loc_t *loc, dht_layout_t *layout)
+{
+ dht_local_t *local = NULL;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+ int ret = 0;
+ xlator_t *this = NULL;
+
+ local = frame->local;
+ this = frame->this;
+ dht_layout_anomalies (this, loc, layout,
+ &local->selfheal.hole_cnt,
+ &local->selfheal.overlaps_cnt,
+ NULL, &local->selfheal.down,
+ &local->selfheal.misc, NULL);
+
+ down = local->selfheal.down;
+ misc = local->selfheal.misc;
+
+ local->selfheal.dir_cbk = dir_cbk;
+ local->selfheal.layout = dht_layout_ref (this, layout);
+
+ if (down) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%d subvolumes down -- not fixing", down);
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ if (misc) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%d subvolumes have unrecoverable errors", misc);
+ ret = 0;
+ goto sorry_no_fix;
+ }
+
+ dht_layout_sort_volname (layout);
+ ret = dht_selfheal_dir_getafix (frame, loc, layout);
+
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "not able to form layout for the directory");
+ goto sorry_no_fix;
+ }
+
+ dht_selfheal_dir_xattr_for_nameless_lookup (frame, &local->loc, layout);
+ return 0;
+
+sorry_no_fix:
+ /* TODO: need to put appropriate local->op_errno */
+ dht_selfheal_dir_finish (frame, this, ret);
+
+ return 0;
+
+
+}
int
dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,