summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/ec
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2017-03-02 07:14:14 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2017-03-04 07:37:56 -0500
commit78c5c5637104cf79578d0fb9173647c9c3421177 (patch)
tree4d45320f5487f900dcc1a21b7087313162da9652 /xlators/cluster/ec
parentaaa5b2ec2f0ef1a62047c9ab91d957c7b0a1552a (diff)
cluster/ec: Introduce optimistic changelog in EC
Problem: Fix to https://bugzilla.redhat.com/show_bug.cgi?id=1316873 has made changes to set dirty flag before every update fop, data or metadata, and unset it after successful operation. That makes some of the fops very slow such as entry operations or metadata operations. Solution: File data operations are the only operation which take some time and setting dirty flag before a fop and unsetting it after serves the purpose as probability of failure of a fop is high when the time duration is more. For all the other operations, set dirty flag at the end of the fop, if any brick is down and need heal. Providing following option to choose between high performance or better heal marking for metadata and entry fops. Set/Unset dirty flag for every update fop at the start of the fop. If ON, this option impacts performance of entry operations or metadata operations as it will set dirty flag at the start and unset it at the end of ALL update fop. If OFF and all the bricks are good, dirty flag will be set at the start only for file fops For metadata and entry fops dirty flag will not be set at the start, if all the bricks are good. This does not impact performance for metadata operations and entry operation but has a very small window to miss marking entry as dirty in case it is required to be healed. Thanks to Xavi and Ashish for the design Picked the .t file from Ashish' patch https://review.gluster.org/16298 BUG: 1408809 Change-Id: I3ce860063f0e2901e50754dcfc3e4ed22daf819f Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: https://review.gluster.org/16821 Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Xavier Hernandez <xhernandez@datalab.es> Tested-by: Xavier Hernandez <xhernandez@datalab.es> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators/cluster/ec')
-rw-r--r--xlators/cluster/ec/src/ec-common.c49
-rw-r--r--xlators/cluster/ec/src/ec-generic.c14
-rw-r--r--xlators/cluster/ec/src/ec-types.h4
-rw-r--r--xlators/cluster/ec/src/ec.c22
4 files changed, 83 insertions, 6 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 823922542a0..3ae7f110d99 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -932,16 +932,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
}
gf_boolean_t
-ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx,
+ uint64_t *dirty)
{
gf_boolean_t set_dirty = _gf_false;
if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+ if (!link->optimistic_changelog)
dirty[EC_DATA_TXN] = 1;
}
if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+ if (!link->optimistic_changelog)
dirty[EC_METADATA_TXN] = 1;
}
@@ -962,6 +965,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
ec_lock_link_t *link = fop->data;
ec_lock_t *lock = NULL;
ec_inode_t *ctx;
+ gf_boolean_t release = _gf_false;
lock = link->lock;
parent = link->fop;
@@ -1055,6 +1059,26 @@ unlock:
UNLOCK(&lock->loc.inode->lock);
if (op_errno == 0) {
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away if dirty was not set before.
+ */
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ release = _gf_true;
+ }
+
+ /* lock->release is a critical field that is checked and modified most
+ * of the time inside a locked region. This use here is safe because we
+ * are in a modifying fop and we currently don't allow two modifying
+ * fops to be processed concurrently, so no one else could be checking
+ * or modifying it.*/
+ if (link->update[0] && !link->dirty[0]) {
+ lock->release |= release;
+ }
+
+ if (link->update[1] && !link->dirty[1]) {
+ lock->release |= release;
+ }
+
/* We don't allow the main fop to be executed on bricks that have not
* succeeded the initial xattrop. */
parent->mask &= fop->good;
@@ -1097,6 +1121,7 @@ void ec_get_size_version(ec_lock_link_t *link)
ec_inode_t *ctx;
ec_fop_data_t *fop;
dict_t *dict = NULL;
+ ec_t *ec = NULL;
int32_t error = 0;
gf_boolean_t getting_xattr;
gf_boolean_t set_dirty = _gf_false;
@@ -1105,6 +1130,17 @@ void ec_get_size_version(ec_lock_link_t *link)
lock = link->lock;
ctx = lock->ctx;
fop = link->fop;
+ ec = fop->xl->private;
+
+ if (ec->optimistic_changelog &&
+ !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id))
+ link->optimistic_changelog = _gf_true;
+
+ /* If ctx->have_info is false and lock->query is true, it means that we'll
+ * send the xattrop anyway, so we can use it to update dirty counts, even
+ * if it's not necessary to do it right now. */
+ if (!ctx->have_info && lock->query)
+ link->optimistic_changelog = _gf_false;
set_dirty = ec_set_dirty_flag (link, ctx, dirty);
@@ -1714,6 +1750,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
if (link->update[1]) {
ctx->post_version[1]++;
}
+ /* If the fop fails on any of the good bricks, it is important to mark
+ * it dirty and update versions right away. */
+ if (link->update[0] || link->update[1]) {
+ if (lock->good_mask & ~(fop->good | fop->remaining)) {
+ lock->release = _gf_true;
+ }
+ }
}
ec_lock_update_good(lock, fop);
@@ -2028,9 +2071,13 @@ ec_update_info(ec_lock_link_t *link)
if (ctx->dirty[1] != 0) {
dirty[1] = -1;
}
+ } else {
+ link->optimistic_changelog = _gf_false;
+ ec_set_dirty_flag (link, ctx, dirty);
}
memset(ctx->dirty, 0, sizeof(ctx->dirty));
}
+
if ((version[0] != 0) || (version[1] != 0) ||
(dirty[0] != 0) || (dirty[1] != 0)) {
ec_update_size_version(link, version, size, dirty);
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 3ce3c2ab02a..ddb90ce39cc 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -697,6 +697,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
ec_fop_data_t * fop = NULL;
ec_cbk_data_t * cbk = NULL;
int32_t idx = (int32_t)(uintptr_t)cookie;
+ uint64_t dirty[2] = {0};
VALIDATE_OR_GOTO(this, out);
GF_VALIDATE_OR_GOTO(this->name, frame, out);
@@ -746,8 +747,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
goto out;
}
- ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty,
- EC_VERSION_SIZE);
+ ec_dict_del_array (xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
}
ec_combine(cbk, ec_combine_lookup);
@@ -1142,7 +1142,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata)
{
ec_fop_data_t *fop = NULL;
+ ec_lock_link_t *link = NULL;
ec_cbk_data_t *cbk = NULL;
+ uint64_t dirty[2] = {0};
data_t *data;
uint64_t *version;
int32_t idx = (int32_t)(uintptr_t)cookie;
@@ -1178,8 +1180,14 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
}
- ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,
+ ec_dict_del_array (xattr, EC_XATTR_DIRTY, dirty,
EC_VERSION_SIZE);
+ link = fop->data;
+ if (link) {
+ /*Keep a note of if the dirty is already set or not*/
+ link->dirty[0] |= (dirty[0] != 0);
+ link->dirty[1] |= (dirty[1] != 0);
+ }
}
if (xdata)
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index d701fe7d25e..f184f459c2e 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -246,6 +246,8 @@ struct _ec_lock_link {
struct list_head owner_list;
struct list_head wait_list;
gf_boolean_t update[2];
+ gf_boolean_t dirty[2];
+ gf_boolean_t optimistic_changelog;
loc_t *base;
uint64_t size;
};
@@ -331,7 +333,6 @@ struct _ec_cbk_data {
int32_t op_errno;
int32_t count;
uintptr_t mask;
- uint64_t dirty[2];
dict_t *xdata;
dict_t *dict;
@@ -561,6 +562,7 @@ struct _ec {
gf_timer_t *timer;
gf_boolean_t shutdown;
gf_boolean_t eager_lock;
+ gf_boolean_t optimistic_changelog;
uint32_t background_heals;
uint32_t heal_wait_qlen;
struct list_head pending_fops;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index e467fea28b8..01f1473f96d 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -301,6 +301,8 @@ reconfigure (xlator_t *this, dict_t *options)
ret = -1;
}
+ GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog,
+ options, bool, failed);
failed:
return ret;
}
@@ -611,6 +613,7 @@ init (xlator_t *this)
this->private = ec;
ec->xl = this;
+ ec->optimistic_changelog = _gf_true;
LOCK_INIT(&ec->lock);
INIT_LIST_HEAD(&ec->pending_fops);
@@ -669,6 +672,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed);
GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed);
+ GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed);
this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this);
if (!this->itable)
@@ -1463,5 +1467,21 @@ struct volume_options options[] =
.description = "force the cpu extensions to be used to accelerate the "
"galois field computations."
},
- { }
+ { .key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Set/Unset dirty flag for every update fop at the start"
+ "of the fop. If OFF, this option impacts performance of"
+ "entry operations or metadata operations as it will"
+ "set dirty flag at the start and unset it at the end of"
+ "ALL update fop. If ON and all the bricks are good,"
+ "dirty flag will be set at the start only for file fops"
+ "For metadata and entry fops dirty flag will not be set"
+ "at the start, if all the bricks are good. This does"
+ "not impact performance for metadata operations and"
+ "entry operation but has a very small window to miss"
+ "marking entry as dirty in case it is required to be"
+ "healed"
+ },
+ { .key = {NULL} }
};