summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2017-03-02 07:14:14 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2017-03-04 07:37:56 -0500
commit78c5c5637104cf79578d0fb9173647c9c3421177 (patch)
tree4d45320f5487f900dcc1a21b7087313162da9652
parentaaa5b2ec2f0ef1a62047c9ab91d957c7b0a1552a (diff)
cluster/ec: Introduce optimistic changelog in EC
Problem: Fix to https://bugzilla.redhat.com/show_bug.cgi?id=1316873 has made changes to set dirty flag before every update fop, data or metadata, and unset it after successful operation. That makes some of the fops very slow such as entry operations or metadata operations. Solution: File data operations are the only operation which take some time and setting dirty flag before a fop and unsetting it after serves the purpose as probability of failure of a fop is high when the time duration is more. For all the other operations, set dirty flag at the end of the fop, if any brick is down and need heal. Providing following option to choose between high performance or better heal marking for metadata and entry fops. Set/Unset dirty flag for every update fop at the start of the fop. If ON, this option impacts performance of entry operations or metadata operations as it will set dirty flag at the start and unset it at the end of ALL update fop. If OFF and all the bricks are good, dirty flag will be set at the start only for file fops For metadata and entry fops dirty flag will not be set at the start, if all the bricks are good. This does not impact performance for metadata operations and entry operation but has a very small window to miss marking entry as dirty in case it is required to be healed. Thanks to Xavi and Ashish for the design Picked the .t file from Ashish' patch https://review.gluster.org/16298 BUG: 1408809 Change-Id: I3ce860063f0e2901e50754dcfc3e4ed22daf819f Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: https://review.gluster.org/16821 Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Xavier Hernandez <xhernandez@datalab.es> Tested-by: Xavier Hernandez <xhernandez@datalab.es> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r--libglusterfs/src/globals.h2
-rw-r--r--tests/basic/ec/ec-optimistic-changelog.t152
-rw-r--r--xlators/cluster/ec/src/ec-common.c49
-rw-r--r--xlators/cluster/ec/src/ec-generic.c14
-rw-r--r--xlators/cluster/ec/src/ec-types.h4
-rw-r--r--xlators/cluster/ec/src/ec.c22
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c6
7 files changed, 243 insertions, 6 deletions
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
index a058a0f4ecd..1c8547265d1 100644
--- a/libglusterfs/src/globals.h
+++ b/libglusterfs/src/globals.h
@@ -78,6 +78,8 @@
#define GD_OP_VERSION_3_10_0 31000 /* Op-version for GlusterFS 3.10.0 */
+#define GD_OP_VERSION_3_10_1 31001 /* Op-version for GlusterFS 3.10.1 */
+
#define GD_OP_VERSION_4_0_0 40000 /* Op-version for GlusterFS 4.0.0 */
#define GD_OP_VER_PERSISTENT_AFR_XATTRS GD_OP_VERSION_3_6_0
diff --git a/tests/basic/ec/ec-optimistic-changelog.t b/tests/basic/ec/ec-optimistic-changelog.t
new file mode 100644
index 00000000000..1277da6ca1b
--- /dev/null
+++ b/tests/basic/ec/ec-optimistic-changelog.t
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# This test checks optimistic-change-log option
+
+cleanup
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume heal $V0 disable
+
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 disperse.background-heals 0
+TEST $CLI volume set $V0 disperse.optimistic-change-log off
+TEST $CLI volume set $V0 disperse.eager-lock off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 background-heals
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 heal-wait-qlength
+
+TEST $CLI volume set $V0 disperse.background-heals 1
+TEST touch $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}1
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}2
+
+
+
+### optimistic-change-log = off ; All bricks good. Test file operation
+echo abc > $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick . Test file operation
+TEST kill_brick $V0 $H0 $B0/${V0}2
+echo abc > $M0/a
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+#Accessing file should heal the file now
+EXPECT "abc" cat $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; All bricks good. Test entry operation
+TEST touch $M0/b
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; All bricks good. Test metadata operation
+TEST chmod 0777 $M0/b
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick. Test entry operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST touch $M0/c
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick. Test metadata operation
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST chmod 0777 $M0/c
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+TEST $CLI volume set $V0 disperse.optimistic-change-log on
+
+### optimistic-change-log = on ; All bricks good. Test file operation
+
+echo abc > $M0/aa
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test file operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+echo abc > $M0/aa
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+#Accessing file should heal the file now
+getfattr -d -m. -e hex $M0/aa 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; All bricks good. Test entry operation
+
+TEST touch $M0/bb
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; All bricks good. Test metadata operation
+
+TEST chmod 0777 $M0/bb
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test entry operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST touch $M0/cc
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test metadata operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST chmod 0777 $M0/cc
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+cleanup
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 823922542a0..3ae7f110d99 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -932,16 +932,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
}
gf_boolean_t
-ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx,
+ uint64_t *dirty)
{
gf_boolean_t set_dirty = _gf_false;
if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+ if (!link->optimistic_changelog)
dirty[EC_DATA_TXN] = 1;
}
if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+ if (!link->optimistic_changelog)
dirty[EC_METADATA_TXN] = 1;
}
@@ -962,6 +965,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
ec_lock_link_t *link = fop->data;
ec_lock_t *lock = NULL;
ec_inode_t *ctx;
+ gf_boolean_t release = _gf_false;
lock = link->lock;
parent = link->fop;
@@ -1055,6 +1059,26 @@ unlock:
UNLOCK(&lock->loc.inode->lock);
if (op_errno == 0) {
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away if dirty was not set before.
+ */
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ release = _gf_true;
+ }
+
+ /* lock->release is a critical field that is checked and modified most
+ * of the time inside a locked region. This use here is safe because we
+ * are in a modifying fop and we currently don't allow two modifying
+ * fops to be processed concurrently, so no one else could be checking
+ * or modifying it.*/
+ if (link->update[0] && !link->dirty[0]) {
+ lock->release |= release;
+ }
+
+ if (link->update[1] && !link->dirty[1]) {
+ lock->release |= release;
+ }
+
/* We don't allow the main fop to be executed on bricks that have not
* succeeded the initial xattrop. */
parent->mask &= fop->good;
@@ -1097,6 +1121,7 @@ void ec_get_size_version(ec_lock_link_t *link)
ec_inode_t *ctx;
ec_fop_data_t *fop;
dict_t *dict = NULL;
+ ec_t *ec = NULL;
int32_t error = 0;
gf_boolean_t getting_xattr;
gf_boolean_t set_dirty = _gf_false;
@@ -1105,6 +1130,17 @@ void ec_get_size_version(ec_lock_link_t *link)
lock = link->lock;
ctx = lock->ctx;
fop = link->fop;
+ ec = fop->xl->private;
+
+ if (ec->optimistic_changelog &&
+ !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id))
+ link->optimistic_changelog = _gf_true;
+
+ /* If ctx->have_info is false and lock->query is true, it means that we'll
+ * send the xattrop anyway, so we can use it to update dirty counts, even
+ * if it's not necessary to do it right now. */
+ if (!ctx->have_info && lock->query)
+ link->optimistic_changelog = _gf_false;
set_dirty = ec_set_dirty_flag (link, ctx, dirty);
@@ -1714,6 +1750,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
if (link->update[1]) {
ctx->post_version[1]++;
}
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away. */
+ if (link->update[0] || link->update[1]) {
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ lock->release = _gf_true;
+ }
+ }
}
ec_lock_update_good(lock, fop);
@@ -2028,9 +2071,13 @@ ec_update_info(ec_lock_link_t *link)
if (ctx->dirty[1] != 0) {
dirty[1] = -1;
}
+ } else {
+ link->optimistic_changelog = _gf_false;
+ ec_set_dirty_flag (link, ctx, dirty);
}
memset(ctx->dirty, 0, sizeof(ctx->dirty));
}
+
if ((version[0] != 0) || (version[1] != 0) ||
(dirty[0] != 0) || (dirty[1] != 0)) {
ec_update_size_version(link, version, size, dirty);
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 3ce3c2ab02a..ddb90ce39cc 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -697,6 +697,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
ec_fop_data_t * fop = NULL;
ec_cbk_data_t * cbk = NULL;
int32_t idx = (int32_t)(uintptr_t)cookie;
+ uint64_t dirty[2] = {0};
VALIDATE_OR_GOTO(this, out);
GF_VALIDATE_OR_GOTO(this->name, frame, out);
@@ -746,8 +747,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
goto out;
}
- ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty,
- EC_VERSION_SIZE);
+ ec_dict_del_array (xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
}
ec_combine(cbk, ec_combine_lookup);
@@ -1142,7 +1142,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata)
{
ec_fop_data_t *fop = NULL;
+ ec_lock_link_t *link = NULL;
ec_cbk_data_t *cbk = NULL;
+ uint64_t dirty[2] = {0};
data_t *data;
uint64_t *version;
int32_t idx = (int32_t)(uintptr_t)cookie;
@@ -1178,8 +1180,14 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
}
- ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,
+ ec_dict_del_array (xattr, EC_XATTR_DIRTY, dirty,
EC_VERSION_SIZE);
+ link = fop->data;
+ if (link) {
+ /*Keep a note of if the dirty is already set or not*/
+ link->dirty[0] |= (dirty[0] != 0);
+ link->dirty[1] |= (dirty[1] != 0);
+ }
}
if (xdata)
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index d701fe7d25e..f184f459c2e 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -246,6 +246,8 @@ struct _ec_lock_link {
struct list_head owner_list;
struct list_head wait_list;
gf_boolean_t update[2];
+ gf_boolean_t dirty[2];
+ gf_boolean_t optimistic_changelog;
loc_t *base;
uint64_t size;
};
@@ -331,7 +333,6 @@ struct _ec_cbk_data {
int32_t op_errno;
int32_t count;
uintptr_t mask;
- uint64_t dirty[2];
dict_t *xdata;
dict_t *dict;
@@ -561,6 +562,7 @@ struct _ec {
gf_timer_t *timer;
gf_boolean_t shutdown;
gf_boolean_t eager_lock;
+ gf_boolean_t optimistic_changelog;
uint32_t background_heals;
uint32_t heal_wait_qlen;
struct list_head pending_fops;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index e467fea28b8..01f1473f96d 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -301,6 +301,8 @@ reconfigure (xlator_t *this, dict_t *options)
ret = -1;
}
+ GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
+ options, bool, failed);
failed:
return ret;
}
@@ -611,6 +613,7 @@ init (xlator_t *this)
this->private = ec;
ec->xl = this;
+ ec->optimistic_changelog = _gf_true;
LOCK_INIT(&ec->lock);
INIT_LIST_HEAD(&ec->pending_fops);
@@ -669,6 +672,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
+ GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -1463,5 +1467,21 @@ struct volume_options options[] =
.description = "force the cpu extensions to be used to accelerate the "
"galois field computations."
},
- { }
+ { .key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Set/Unset dirty flag for every update fop at the start"
+ "of the fop. If OFF, this option impacts performance of"
+ "entry operations or metadata operations as it will"
+ "set dirty flag at the start and unset it at the end of"
+ "ALL update fop. If ON and all the bricks are good,"
+ "dirty flag will be set at the start only for file fops"
+ "For metadata and entry fops dirty flag will not be set"
+ "at the start, if all the bricks are good. This does"
+ "not impact performance for metadata operations and"
+ "entry operation but has a very small window to miss"
+ "marking entry as dirty in case it is required to be"
+ "healed"
+ },
+ { .key = {NULL} }
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 5cb4b6c9702..b3f6c40d7f0 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3153,6 +3153,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_10_0,
.validate_fn = validate_boolean
},
+ { .key = "disperse.optimistic-change-log",
+ .voltype = "cluster/disperse",
+ .type = NO_DOC,
+ .op_version = GD_OP_VERSION_3_10_1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = NULL
}
};