summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoseph Fernandes <josferna@redhat.com>2015-08-04 20:38:06 +0530
committerDan Lambright <dlambrig@redhat.com>2015-10-10 10:42:07 -0700
commitf6618acd4f7642dab19445e35cf2c7fbc8244a5e (patch)
treedf8997f7e472cb5ad1f7d95cef29e24ab5071923
parent672baab88fb7f32e844cd4be22e0924e4e0e83fc (diff)
tier/ctr: CTR DB named lookup heal of cold tier during attach tier
Heal hardlink in the db for already existing data in the cold tier during attach tier. i.e during fix layout do lookup to files in the cold tier. CTR xlator on the brick/server side does db update/insert of the hardlink on a namelookup. Currently the namedlookup is done synchronous to the fixlayout that is triggered by attach tier. This is not performant, adding more time to fixlayout. The performant approach is record the hardlinks on a compressed datastore and then do the namelookup asynchronously later, giving the ctr db eventual consistency Change-Id: I4ffc337fffe7d447804786851a9183a51b5044a9 BUG: 1252586 Signed-off-by: Joseph Fernandes <josferna@redhat.com> Reviewed-on: http://review.gluster.org/11828 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Dan Lambright <dlambrig@redhat.com> Tested-by: Dan Lambright <dlambrig@redhat.com>
-rw-r--r--libglusterfs/src/glusterfs.h2
-rw-r--r--tests/basic/tier/legacy-many.t122
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c130
-rw-r--r--xlators/features/changetimerecorder/src/changetimerecorder.c1
-rw-r--r--xlators/features/changetimerecorder/src/ctr-helper.h8
5 files changed, 257 insertions, 6 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index fae7d490af8..3bc76f6622a 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -239,6 +239,8 @@
#define CTR_RESPONSE_LINK_COUNT_XDATA "ctr_response_link_count"
#define CTR_REQUEST_LINK_COUNT_XDATA "ctr_request_link_count"
+#define CTR_ATTACH_TIER_LOOKUP "ctr_attach_tier_lookup"
+
#define GF_LOG_LRU_BUFSIZE_DEFAULT 5
#define GF_LOG_LRU_BUFSIZE_MIN 0
#define GF_LOG_LRU_BUFSIZE_MAX 20
diff --git a/tests/basic/tier/legacy-many.t b/tests/basic/tier/legacy-many.t
new file mode 100644
index 00000000000..17275494aba
--- /dev/null
+++ b/tests/basic/tier/legacy-many.t
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+LAST_BRICK=3
+CACHE_BRICK_FIRST=4
+CACHE_BRICK_LAST=5
+DEMOTE_TIMEOUT=12
+PROMOTE_TIMEOUT=5
+MIGRATION_TIMEOUT=10
+DEMOTE_FREQ=60
+PROMOTE_FREQ=4
+TEST_DIR="test_files"
+NUM_FILES=20
+
+
+# Grab md5sum without file path (failed attempt notifications are discarded)
+function fingerprint {
+ md5sum $1 2> /dev/null | grep --only-matching -m 1 '^[0-9a-f]*'
+}
+
+# Create a large number of files. Store their md5 signatures.
+function create_many_files {
+ mkdir ${TEST_DIR}
+ for i in `seq 1 $NUM_FILES`; do
+ dd if=/dev/urandom of=./${TEST_DIR}/i$i bs=1048576 count=1;
+ id[i]=$(fingerprint "./${TEST_DIR}/i$i");
+ done
+}
+
+function confirm_tier_removed {
+ $CLI system getspec $V0 | grep $1
+ if [ $? == 0 ]; then
+ echo "1"
+ else
+ echo "0"
+ fi
+}
+
+function confirm_vol_stopped {
+ $CLI volume stop $1
+ if [ $? == 0 ]; then
+ echo "0"
+ else
+ echo "1"
+ fi
+}
+
+function check_counters {
+ index=0
+ ret=0
+ rm -f /tmp/tc*.txt
+ echo "0" > /tmp/tc2.txt
+
+ $CLI volume rebalance $V0 tier status | grep localhost > /tmp/tc.txt
+
+ promote=`cat /tmp/tc.txt |awk '{print $2}'`
+ demote=`cat /tmp/tc.txt |awk '{print $3}'`
+ if [ "${promote}" != "${1}" ]; then
+ echo "1" > /tmp/tc2.txt
+
+ elif [ "${demote}" != "${2}" ]; then
+ echo "2" > /tmp/tc2.txt
+ fi
+
+ # temporarily disable non-Linux tests.
+ case $OSTYPE in
+ NetBSD | FreeBSD | Darwin)
+ echo "0" > /tmp/tc2.txt
+ ;;
+ esac
+ cat /tmp/tc2.txt
+}
+
+function read_all {
+ for file in *
+ do
+ cat $file
+ done
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+
+# Create distributed replica volume
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0..$LAST_BRICK}
+TEST $CLI volume start $V0
+
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 features.ctr-enabled on
+
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+
+# Create a number of "legacy" files before attaching tier
+cd $M0
+TEST create_many_files
+wait
+
+# Attach tier
+TEST $CLI volume attach-tier $V0 replica 2 $H0:$B0/${V0}$CACHE_BRICK_FIRST $H0:$B0/${V0}$CACHE_BRICK_LAST
+TEST $CLI volume rebalance $V0 tier status
+
+TEST $CLI volume set $V0 cluster.tier-demote-frequency $DEMOTE_FREQ
+TEST $CLI volume set $V0 cluster.tier-promote-frequency $PROMOTE_FREQ
+TEST $CLI volume set $V0 cluster.read-freq-threshold 0
+TEST $CLI volume set $V0 cluster.write-freq-threshold 0
+
+# Read "legacy" files
+drop_cache $M0
+cd ${TEST_DIR}
+TEST read_all
+
+# Test to make sure files were promoted as expected
+sleep $DEMOTE_TIMEOUT
+EXPECT_WITHIN $DEMOTE_TIMEOUT "0" check_counters 20 0
+
+cd;
+cleanup
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 7dc89d8a069..a6c14b085fa 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -2562,6 +2562,118 @@ gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag,
return 0;
}
+
+
+/* Function for doing a named lookup on file inodes during an attach tier
+ * So that a hardlink lookup heal i.e gfid to parent gfid lookup heal
+ * happens on pre-existing data. This is required so that the ctr database has
+ * hardlinks of all the exisitng file in the volume. CTR xlator on the
+ * brick/server side does db update/insert of the hardlink on a namelookup.
+ * Currently the namedlookup is done synchronous to the fixlayout that is
+ * triggered by attach tier. This is not performant, adding more time to
+ * fixlayout. The performant approach is record the hardlinks on a compressed
+ * datastore and then do the namelookup asynchronously later, giving the ctr db
+ * eventual consistency
+ * */
+int
+gf_fix_layout_tier_attach_lookup (xlator_t *this,
+ loc_t *parent_loc,
+ gf_dirent_t *file_dentry)
+{
+ int ret = -1;
+ dict_t *lookup_xdata = NULL;
+ dht_conf_t *conf = NULL;
+ loc_t file_loc = {0,};
+ struct iatt iatt = {0,};
+
+ GF_VALIDATE_OR_GOTO ("tier", this, out);
+
+ GF_VALIDATE_OR_GOTO (this->name, parent_loc, out);
+
+ GF_VALIDATE_OR_GOTO (this->name, file_dentry, out);
+
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+
+ if (!parent_loc->inode) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "%s/%s parent is NULL", parent_loc->path,
+ file_dentry->d_name);
+ goto out;
+ }
+
+
+ conf = this->private;
+
+ loc_wipe (&file_loc);
+
+ if (gf_uuid_is_null (file_dentry->d_stat.ia_gfid)) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "%s/%s gfid not present", parent_loc->path,
+ file_dentry->d_name);
+ goto out;
+ }
+
+ gf_uuid_copy (file_loc.gfid, file_dentry->d_stat.ia_gfid);
+
+ if (gf_uuid_is_null (parent_loc->gfid)) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "%s/%s"
+ " gfid not present", parent_loc->path,
+ file_dentry->d_name);
+ goto out;
+ }
+
+ gf_uuid_copy (file_loc.pargfid, parent_loc->gfid);
+
+
+ ret = dht_build_child_loc (this, &file_loc, parent_loc,
+ file_dentry->d_name);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Child loc build failed");
+ ret = -1;
+ goto out;
+ }
+
+ lookup_xdata = dict_new ();
+ if (!lookup_xdata) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed creating lookup dict for %s",
+ file_dentry->d_name);
+ goto out;
+ }
+
+ ret = dict_set_int32 (lookup_xdata, CTR_ATTACH_TIER_LOOKUP, 1);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "Failed to set lookup flag");
+ goto out;
+ }
+
+ gf_uuid_copy (file_loc.parent->gfid, parent_loc->gfid);
+
+ /* Sending lookup to cold tier only */
+ ret = syncop_lookup (conf->subvolumes[0], &file_loc, &iatt,
+ NULL, lookup_xdata, NULL);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
+ "%s lookup failed", file_loc.path);
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+
+ loc_wipe (&file_loc);
+
+ if (lookup_xdata)
+ dict_unref (lookup_xdata);
+
+ return ret;
+}
+
+
int
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout, dict_t *migrate_data)
@@ -2577,6 +2689,8 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
struct iatt iatt = {0,};
inode_t *linked_inode = NULL, *inode = NULL;
+
+
ret = syncop_lookup (this, loc, &iatt, NULL, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s",
@@ -2638,10 +2752,22 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (!strcmp (entry->d_name, ".") ||
!strcmp (entry->d_name, ".."))
continue;
+ if (!IA_ISDIR (entry->d_stat.ia_type)) {
+
+ /* If its a fix layout during the attach
+ * tier operation do lookups on files
+ * on cold subvolume so that there is a
+ * CTR DB Lookup Heal triggered on existing
+ * data.
+ * */
+ if (defrag->cmd ==
+ GF_DEFRAG_CMD_START_TIER) {
+ gf_fix_layout_tier_attach_lookup
+ (this, loc, entry);
+ }
- if (!IA_ISDIR (entry->d_stat.ia_type))
continue;
-
+ }
loc_wipe (&entry_loc);
ret =dht_build_child_loc (this, &entry_loc, loc,
diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c
index 89445b47bca..7305f68c4c5 100644
--- a/xlators/features/changetimerecorder/src/changetimerecorder.c
+++ b/xlators/features/changetimerecorder/src/changetimerecorder.c
@@ -214,7 +214,6 @@ ctr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_boolean_t _is_heal_needed = _gf_false;
CTR_IS_DISABLED_THEN_GOTO(this, out);
- CTR_IF_INTERNAL_FOP_THEN_GOTO (frame, dict, out);
/* if the lookup failed lookup dont do anything*/
if (op_ret == -1) {
diff --git a/xlators/features/changetimerecorder/src/ctr-helper.h b/xlators/features/changetimerecorder/src/ctr-helper.h
index dbad3029462..4f650350c94 100644
--- a/xlators/features/changetimerecorder/src/ctr-helper.h
+++ b/xlators/features/changetimerecorder/src/ctr-helper.h
@@ -284,10 +284,12 @@ do {\
* */
#define CTR_IS_INTERNAL_FOP(frame, dict)\
(AFR_SELF_HEAL_FOP (frame) \
- || REBALANCE_FOP (frame) \
- || TIER_REBALANCE_FOP (frame) \
+ || (REBALANCE_FOP (frame) && dict && \
+ !dict_get (dict, CTR_ATTACH_TIER_LOOKUP)) \
+ || (TIER_REBALANCE_FOP (frame) && dict && \
+ !dict_get (dict, CTR_ATTACH_TIER_LOOKUP)) \
|| (dict && \
- dict_get (dict, GLUSTERFS_INTERNAL_FOP_KEY)))
+ dict_get (dict, GLUSTERFS_INTERNAL_FOP_KEY)))
/**
* ignore internal fops for all clients except AFR self-heal daemon