summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libglusterfs/src/globals.h2
-rw-r--r--tests/basic/ec/ec-optimistic-changelog.t152
-rw-r--r--xlators/cluster/ec/src/ec-common.c49
-rw-r--r--xlators/cluster/ec/src/ec-generic.c14
-rw-r--r--xlators/cluster/ec/src/ec-types.h4
-rw-r--r--xlators/cluster/ec/src/ec.c22
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c6
7 files changed, 243 insertions, 6 deletions
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
index a058a0f4ecd..1c8547265d1 100644
--- a/libglusterfs/src/globals.h
+++ b/libglusterfs/src/globals.h
@@ -78,6 +78,8 @@
#define GD_OP_VERSION_3_10_0 31000 /* Op-version for GlusterFS 3.10.0 */
+#define GD_OP_VERSION_3_10_1 31001 /* Op-version for GlusterFS 3.10.1 */
+
#define GD_OP_VERSION_4_0_0 40000 /* Op-version for GlusterFS 4.0.0 */
#define GD_OP_VER_PERSISTENT_AFR_XATTRS GD_OP_VERSION_3_6_0
diff --git a/tests/basic/ec/ec-optimistic-changelog.t b/tests/basic/ec/ec-optimistic-changelog.t
new file mode 100644
index 00000000000..1277da6ca1b
--- /dev/null
+++ b/tests/basic/ec/ec-optimistic-changelog.t
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# This test checks the disperse.optimistic-change-log option
+
+cleanup
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume heal $V0 disable
+
+TEST $CLI volume set $V0 performance.stat-prefetch off
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume set $V0 performance.quick-read off
+TEST $CLI volume set $V0 performance.read-ahead off
+TEST $CLI volume set $V0 performance.io-cache off
+TEST $CLI volume set $V0 disperse.background-heals 0
+TEST $CLI volume set $V0 disperse.optimistic-change-log off
+TEST $CLI volume set $V0 disperse.eager-lock off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 background-heals
+EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 heal-wait-qlength
+
+TEST $CLI volume set $V0 disperse.background-heals 1
+TEST touch $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}1
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" count_sh_entries $B0/${V0}2
+
+
+
+### optimistic-change-log = off ; All bricks good. Test file operation
+echo abc > $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick. Test file operation
+TEST kill_brick $V0 $H0 $B0/${V0}2
+echo abc > $M0/a
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+#Accessing file should heal the file now
+EXPECT "abc" cat $M0/a
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; All bricks good. Test entry operation
+TEST touch $M0/b
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; All bricks good. Test metadata operation
+TEST chmod 0777 $M0/b
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick. Test entry operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST touch $M0/c
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = off ; Kill one brick. Test metadata operation
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST chmod 0777 $M0/c
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0/c 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+TEST $CLI volume set $V0 disperse.optimistic-change-log on
+
+### optimistic-change-log = on ; All bricks good. Test file operation
+
+echo abc > $M0/aa
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test file operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+echo abc > $M0/aa
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+#Accessing file should heal the file now
+getfattr -d -m. -e hex $M0/aa 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; All bricks good. Test entry operation
+
+TEST touch $M0/bb
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; All bricks good. Test metadata operation
+
+TEST chmod 0777 $M0/bb
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test entry operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST touch $M0/cc
+EXPECT 4 get_pending_heal_count $V0 #two for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0 2>&1 > /dev/null
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+## optimistic-change-log = on ; Kill one brick. Test metadata operation
+
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST chmod 0777 $M0/cc
+EXPECT 2 get_pending_heal_count $V0 #One for each active brick
+$CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+getfattr -d -m. -e hex $M0/cc 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############################################################
+
+cleanup
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 823922542a0..3ae7f110d99 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -932,16 +932,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
}
gf_boolean_t
-ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx,
+ uint64_t *dirty)
{
gf_boolean_t set_dirty = _gf_false;
if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+ if (!link->optimistic_changelog)
dirty[EC_DATA_TXN] = 1;
}
if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+ if (!link->optimistic_changelog)
dirty[EC_METADATA_TXN] = 1;
}
@@ -962,6 +965,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
ec_lock_link_t *link = fop->data;
ec_lock_t *lock = NULL;
ec_inode_t *ctx;
+ gf_boolean_t release = _gf_false;
lock = link->lock;
parent = link->fop;
@@ -1055,6 +1059,26 @@ unlock:
UNLOCK(&lock->loc.inode->lock);
if (op_errno == 0) {
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away if dirty was not set before.
+ */
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ release = _gf_true;
+ }
+
+ /* lock->release is a critical field that is checked and modified most
+ * of the time inside a locked region. This use here is safe because we
+ * are in a modifying fop and we currently don't allow two modifying
+ * fops to be processed concurrently, so no one else could be checking
+ * or modifying it.*/
+ if (link->update[0] && !link->dirty[0]) {
+ lock->release |= release;
+ }
+
+ if (link->update[1] && !link->dirty[1]) {
+ lock->release |= release;
+ }
+
/* We don't allow the main fop to be executed on bricks that have not
* succeeded the initial xattrop. */
parent->mask &= fop->good;
@@ -1097,6 +1121,7 @@ void ec_get_size_version(ec_lock_link_t *link)
ec_inode_t *ctx;
ec_fop_data_t *fop;
dict_t *dict = NULL;
+ ec_t *ec = NULL;
int32_t error = 0;
gf_boolean_t getting_xattr;
gf_boolean_t set_dirty = _gf_false;
@@ -1105,6 +1130,17 @@ void ec_get_size_version(ec_lock_link_t *link)
lock = link->lock;
ctx = lock->ctx;
fop = link->fop;
+ ec = fop->xl->private;
+
+ if (ec->optimistic_changelog &&
+ !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id))
+ link->optimistic_changelog = _gf_true;
+
+ /* If ctx->have_info is false and lock->query is true, it means that we'll
+ * send the xattrop anyway, so we can use it to update dirty counts, even
+ * if it's not necessary to do it right now. */
+ if (!ctx->have_info && lock->query)
+ link->optimistic_changelog = _gf_false;
set_dirty = ec_set_dirty_flag (link, ctx, dirty);
@@ -1714,6 +1750,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
if (link->update[1]) {
ctx->post_version[1]++;
}
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away. */
+ if (link->update[0] || link->update[1]) {
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ lock->release = _gf_true;
+ }
+ }
}
ec_lock_update_good(lock, fop);
@@ -2028,9 +2071,13 @@ ec_update_info(ec_lock_link_t *link)
if (ctx->dirty[1] != 0) {
dirty[1] = -1;
}
+ } else {
+ link->optimistic_changelog = _gf_false;
+ ec_set_dirty_flag (link, ctx, dirty);
}
memset(ctx->dirty, 0, sizeof(ctx->dirty));
}
+
if ((version[0] != 0) || (version[1] != 0) ||
(dirty[0] != 0) || (dirty[1] != 0)) {
ec_update_size_version(link, version, size, dirty);
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 3ce3c2ab02a..ddb90ce39cc 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -697,6 +697,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
ec_fop_data_t * fop = NULL;
ec_cbk_data_t * cbk = NULL;
int32_t idx = (int32_t)(uintptr_t)cookie;
+ uint64_t dirty[2] = {0};
VALIDATE_OR_GOTO(this, out);
GF_VALIDATE_OR_GOTO(this->name, frame, out);
@@ -746,8 +747,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
goto out;
}
- ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty,
- EC_VERSION_SIZE);
+ ec_dict_del_array (xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
}
ec_combine(cbk, ec_combine_lookup);
@@ -1142,7 +1142,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata)
{
ec_fop_data_t *fop = NULL;
+ ec_lock_link_t *link = NULL;
ec_cbk_data_t *cbk = NULL;
+ uint64_t dirty[2] = {0};
data_t *data;
uint64_t *version;
int32_t idx = (int32_t)(uintptr_t)cookie;
@@ -1178,8 +1180,14 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
}
- ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,
+ ec_dict_del_array (xattr, EC_XATTR_DIRTY, dirty,
EC_VERSION_SIZE);
+ link = fop->data;
+ if (link) {
+ /* Record whether the dirty flag was already set. */
+ link->dirty[0] |= (dirty[0] != 0);
+ link->dirty[1] |= (dirty[1] != 0);
+ }
}
if (xdata)
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index d701fe7d25e..f184f459c2e 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -246,6 +246,8 @@ struct _ec_lock_link {
struct list_head owner_list;
struct list_head wait_list;
gf_boolean_t update[2];
+ gf_boolean_t dirty[2];
+ gf_boolean_t optimistic_changelog;
loc_t *base;
uint64_t size;
};
@@ -331,7 +333,6 @@ struct _ec_cbk_data {
int32_t op_errno;
int32_t count;
uintptr_t mask;
- uint64_t dirty[2];
dict_t *xdata;
dict_t *dict;
@@ -561,6 +562,7 @@ struct _ec {
gf_timer_t *timer;
gf_boolean_t shutdown;
gf_boolean_t eager_lock;
+ gf_boolean_t optimistic_changelog;
uint32_t background_heals;
uint32_t heal_wait_qlen;
struct list_head pending_fops;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index e467fea28b8..01f1473f96d 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -301,6 +301,8 @@ reconfigure (xlator_t *this, dict_t *options)
ret = -1;
}
+ GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
+ options, bool, failed);
failed:
return ret;
}
@@ -611,6 +613,7 @@ init (xlator_t *this)
this->private = ec;
ec->xl = this;
+ ec->optimistic_changelog = _gf_true;
LOCK_INIT(&ec->lock);
INIT_LIST_HEAD(&ec->pending_fops);
@@ -669,6 +672,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
+ GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -1463,5 +1467,21 @@ struct volume_options options[] =
.description = "force the cpu extensions to be used to accelerate the "
"galois field computations."
},
- { }
+ { .key = {"optimistic-change-log"}, /* description pieces are concatenated as-is, so each ends in a space */
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Set/Unset dirty flag for every update fop at the start "
+ "of the fop. If OFF, this option impacts performance of "
+ "entry operations or metadata operations as it will "
+ "set dirty flag at the start and unset it at the end of "
+ "ALL update fop. If ON and all the bricks are good, "
+ "dirty flag will be set at the start only for file fops. "
+ "For metadata and entry fops dirty flag will not be set "
+ "at the start, if all the bricks are good. This does "
+ "not impact performance for metadata operations and "
+ "entry operation but has a very small window to miss "
+ "marking entry as dirty in case it is required to be "
+ "healed"
+ },
+ { .key = {NULL} }
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 5cb4b6c9702..b3f6c40d7f0 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3153,6 +3153,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_10_0,
.validate_fn = validate_boolean
},
+ { .key = "disperse.optimistic-change-log",
+ .voltype = "cluster/disperse",
+ .type = NO_DOC,
+ .op_version = GD_OP_VERSION_3_10_1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = NULL
}
};