summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src
diff options
context:
space:
mode:
authorAnand Avati <avati@gluster.com>2010-11-09 05:27:02 +0000
committerAnand V. Avati <avati@dev.gluster.com>2010-11-09 03:07:07 -0800
commit6fb49f18a9bbfd1266b4773e757e459519c6719c (patch)
treefff8ff41717114ead7a7e2b848e83058d6d8b15a /xlators/cluster/afr/src
parent667c5e22467cbecd371bfc052e7f65b6b6b41e2d (diff)
replicate: optimistic changelog
The standard way of maintaining the changelog in replicate has been to write out pending flags in pre-op and to unset the pending flags after the actual operation. This new optimization kicks in only when all subvolumes are up. The optimization is that, during pre-op, no changelog is written for METADATA and ENTRY/RENAME operations. If nothing failed during the operation, no changelog is updated in post-op either. If, however, something does fail during an operation, then pending flags get written during post-op, pointing only towards the failed nodes. DATA transactions continue to work the way they did before. If one subvolume is down, pending flags are written in the pre-op changelog itself, as before. The impact of this optimization is limited to the case where both servers die or the client dies while the 'FOP' stage of the transaction is in progress. By the nature of METADATA and ENTRY operations, detecting a mismatch later does not depend on the presence of a changelog. The changelog only determines the direction in which self-heal happens for these types of transactions. For the direction, too, this optimization does not have a major impact, because in the cases of failure (both servers dying or the client dying) the final state (direction of self-heal) would be arbitrary anyway, as the syscall wouldn't have completed. Signed-off-by: Anand V. Avati <avati@blackhole.gluster.com> Signed-off-by: Anand V. Avati <avati@dev.gluster.com> BUG: 2068 (performance enhancements) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2068
Diffstat (limited to 'xlators/cluster/afr/src')
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c68
-rw-r--r--xlators/cluster/afr/src/afr.c38
-rw-r--r--xlators/cluster/afr/src/afr.h8
3 files changed, 101 insertions, 13 deletions
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index ff9c88badd3..d48d6eb72f4 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -513,6 +513,14 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
}
}
+ index = afr_index_for_transaction_type (local->transaction.type);
+ if (local->optimistic_change_log &&
+ local->transaction.type != AFR_DATA_TRANSACTION) {
+ /* if nothing_failed, then local->pending[..] == {0 .. 0} */
+ for (i = 0; i < priv->child_count; i++)
+ local->pending[i][index]++;
+ }
+
for (i = 0; i < priv->child_count; i++) {
if (!local->child_up[i])
continue;
@@ -568,6 +576,12 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
break;
case AFR_METADATA_TRANSACTION:
{
+ if (nothing_failed) {
+ afr_changelog_post_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ break;
+ }
+
if (local->fd)
STACK_WIND (frame, afr_changelog_post_op_cbk,
priv->children[i],
@@ -585,12 +599,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_RENAME_TRANSACTION:
{
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
+ if (nothing_failed) {
+ afr_changelog_post_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ } else {
+ STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
call_count--;
}
@@ -613,6 +632,12 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_TRANSACTION:
{
+ if (nothing_failed) {
+ afr_changelog_post_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ break;
+ }
+
if (local->fd)
STACK_WIND (frame, afr_changelog_post_op_cbk,
priv->children[i],
@@ -808,6 +833,12 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
break;
case AFR_METADATA_TRANSACTION:
{
+ if (local->optimistic_change_log) {
+ afr_changelog_pre_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ break;
+ }
+
if (local->fd)
STACK_WIND_COOKIE (frame,
afr_changelog_pre_op_cbk,
@@ -829,13 +860,18 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_RENAME_TRANSACTION:
{
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
+ if (local->optimistic_change_log) {
+ afr_changelog_pre_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ } else {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
call_count--;
}
@@ -860,6 +896,12 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_TRANSACTION:
{
+ if (local->optimistic_change_log) {
+ afr_changelog_pre_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ break;
+ }
+
if (local->fd)
STACK_WIND_COOKIE (frame,
afr_changelog_pre_op_cbk,
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 775a53a8fba..cb458250597 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -71,6 +71,7 @@ validate_options (xlator_t *this, dict_t *options, char **op_errstr)
gf_boolean_t metadata_change_log;
gf_boolean_t entry_change_log;
gf_boolean_t strict_readdir;
+ gf_boolean_t optimistic_change_log;
xlator_list_t * trav = NULL;
@@ -257,6 +258,26 @@ validate_options (xlator_t *this, dict_t *options, char **op_errstr)
"change-log %s'.", change_log);
}
+
+ dict_ret = dict_get_str (options, "optimistic-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ temp_ret = gf_string2boolean (change_log, &optimistic_change_log);
+ if (temp_ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Validation faled for optimistic-change-log");
+ *op_errstr = gf_strdup ("Error, option should be boolean");
+ ret = -1;
+ goto out;
+ }
+
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Validated 'option optimistic-"
+ "change-log %s'.", change_log);
+ }
+
+
read_ret = dict_get_str (options, "read-subvolume", &read_subvol);
if (read_ret)
@@ -674,6 +695,7 @@ init (xlator_t *this)
priv->data_change_log = 1;
priv->metadata_change_log = 1;
priv->entry_change_log = 1;
+ priv->optimistic_change_log = 1;
dict_ret = dict_get_str (this->options, "data-change-log",
&change_log);
@@ -715,6 +737,19 @@ init (xlator_t *this)
}
}
+ dict_ret = dict_get_str (this->options, "optimistic-change-log",
+ &change_log);
+ if (dict_ret == 0) {
+ ret = gf_string2boolean (change_log, &priv->optimistic_change_log);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Invalid 'option optimistic-change-log %s'. "
+ "Defaulting to optimistic-change-log as 'on'.",
+ change_log);
+ priv->optimistic_change_log = 1;
+ }
+ }
+
/* Locking options */
priv->inodelk_trace = 0;
@@ -994,6 +1029,9 @@ struct volume_options options[] = {
{ .key = {"entry-change-log"},
.type = GF_OPTION_TYPE_BOOL
},
+ { .key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL
+ },
{ .key = {"data-lock-server-count"},
.type = GF_OPTION_TYPE_INT,
.min = 0
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 758ac789aff..a7359f26963 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -88,6 +88,7 @@ typedef struct _afr_private {
pthread_mutex_t mutex;
struct list_head saved_fds; /* list of fds on which locks have succeeded */
+ gf_boolean_t optimistic_change_log;
} afr_private_t;
typedef struct {
@@ -312,6 +313,7 @@ typedef struct _afr_local {
int32_t lock_recovery_child;
dict_t *dict;
+ int optimistic_change_log;
int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this);
@@ -805,6 +807,8 @@ AFR_BASENAME (const char *str)
static inline int
AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
{
+ int child_up_count = 0;
+
local->child_up = GF_CALLOC (sizeof (*local->child_up),
priv->child_count,
gf_afr_mt_char);
@@ -815,6 +819,10 @@ AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv)
memcpy (local->child_up, priv->child_up,
sizeof (*local->child_up) * priv->child_count);
+ child_up_count = afr_up_children_count (priv->child_count, local->child_up);
+
+ if (priv->optimistic_change_log && child_up_count == priv->child_count)
+ local->optimistic_change_log = 1;
local->call_count = afr_up_children_count (priv->child_count, local->child_up);
if (local->call_count == 0)