summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Pranith Kumar K <pkarampu@redhat.com> 2016-08-16 16:04:37 +0530
committer Pranith Kumar Karampuri <pkarampu@redhat.com> 2016-08-22 13:55:42 -0700
commit 413594ed647400f1b39e05d4f1b12ad846e48800 (patch)
tree fd7072bc983754156eb66ff2348c7ac99773c9e2
parent eddada59f7ad3cf21463a558a5f62591f4b72c68 (diff)
cluster/afr: Give option to do consistent-io
Problem: When tiering/rebalance performs migrations and afr with a 2-way replica is in the picture, the migration can read stale data if the source brick goes down, and then write that data to the destination. Deleting the file after the migration then leads to permanent loss of data.

Fix: Rebalance/tiering should migrate only when the data is definitely not stale. So introduce an option in afr called consistent-io, which will be enabled in the migration daemons.

BUG: 1306398
Change-Id: I750f65091cc70a3ed4bf3c12f83d0949af43920a
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/13425
Reviewed-by: Anuradha Talur <atalur@redhat.com>
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 193
-rw-r--r-- xlators/cluster/afr/src/afr-dir-read.c 4
-rw-r--r-- xlators/cluster/afr/src/afr-lk-common.c 2
-rw-r--r-- xlators/cluster/afr/src/afr-messages.h 4
-rw-r--r-- xlators/cluster/afr/src/afr-open.c 4
-rw-r--r-- xlators/cluster/afr/src/afr-read-txn.c 6
-rw-r--r-- xlators/cluster/afr/src/afr-transaction.c 6
-rw-r--r-- xlators/cluster/afr/src/afr.c 26
-rw-r--r-- xlators/cluster/afr/src/afr.h 25
9 files changed, 229 insertions, 41 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 9b2c0d7caea..dec667fd460 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -43,6 +43,20 @@
#include "afr-self-heald.h"
#include "afr-messages.h"
+gf_boolean_t
+afr_is_consistent_io_possible (afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno)
+{ /* Answers whether a fop may proceed under the consistent-io option.
+   * Returns _gf_true when priv->consistent_io is off or when
+   * local->call_count equals priv->child_count. Otherwise it logs an
+   * INFO message, stores ENOTCONN in *op_errno (only when op_errno is
+   * non-NULL) and returns _gf_false. */
+ if (priv->consistent_io && local->call_count != priv->child_count) { /* call_count is presumably the number of children currently up -- TODO confirm against afr_local_init */
+ gf_msg (THIS->name, GF_LOG_INFO, 0,
+ AFR_MSG_SUBVOLS_DOWN, "All subvolumes are not up");
+ if (op_errno) /* op_errno is optional; callers may pass NULL */
+ *op_errno = ENOTCONN;
+ return _gf_false;
+ }
+ return _gf_true;
+}
+
call_frame_t *
afr_copy_frame (call_frame_t *base)
{
@@ -1555,6 +1569,100 @@ afr_remove_eager_lock_stub (afr_local_t *local)
UNLOCK (&local->fd->lock);
}
+static gf_boolean_t
+afr_entrylk_is_unlock (entrylk_cmd cmd)
+{ /* Returns _gf_true only when the entrylk command is ENTRYLK_UNLOCK;
+   * every other command yields _gf_false. */
+ if (ENTRYLK_UNLOCK == cmd)
+ return _gf_true;
+ return _gf_false;
+}
+
+static gf_boolean_t
+afr_inodelk_is_unlock (int32_t cmd, struct gf_flock *flock)
+{ /* Returns _gf_true when an inodelk request is an unlock, i.e. cmd is
+   * F_SETLK or F_SETLKW and flock->l_type is F_UNLCK. Any other cmd, or
+   * any other lock type, yields _gf_false. flock is dereferenced only
+   * for the two set-lock commands and is not NULL-checked. */
+ switch (cmd) {
+ case F_SETLKW:
+ case F_SETLK:
+ if (F_UNLCK == flock->l_type)
+ return _gf_true;
+ break; /* set-lock with a non-unlock type: falls to the final return */
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
+
+static gf_boolean_t
+afr_lk_is_unlock (int32_t cmd, struct gf_flock *flock)
+{ /* Returns _gf_true when an lk request is an unlock: either cmd is
+   * F_RESLK_UNLCK, or cmd is one of the set-lock commands and
+   * flock->l_type is F_UNLCK. Everything else yields _gf_false. flock
+   * is dereferenced only for the set-lock commands and is not
+   * NULL-checked. */
+ switch (cmd) {
+ case F_RESLK_UNLCK:
+ return _gf_true;
+ break; /* unreachable: follows an unconditional return */
+
+#if F_SETLKW != F_SETLKW64
+ case F_SETLKW64: /* separate label only when the 64-bit value differs, avoiding a duplicate case */
+#endif
+ case F_SETLKW:
+
+#if F_SETLK != F_SETLK64
+ case F_SETLK64: /* same guard as above for the non-blocking variant */
+#endif
+ case F_SETLK:
+ if (F_UNLCK == flock->l_type)
+ return _gf_true;
+ break; /* set-lock with a non-unlock type: falls to the final return */
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
+
+void
+afr_handle_inconsistent_fop (call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno)
+{ /* Turns a successful fop result into a failure (-1 / ENOTCONN) when
+   * consistent-io is enabled and the event generation stored in the
+   * frame's local no longer matches priv->event_generation. Results
+   * that already failed, and lookup/lock fops, are left untouched.
+   * op_ret and op_errno are in/out parameters and are dereferenced
+   * without a NULL check. */
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ if (!frame || !frame->this || !frame->local || !frame->this->private) /* nothing to inspect: leave the result as is */
+ return;
+
+ if (*op_ret < 0) /* already a failure: keep the original errno */
+ return;
+
+ /* Failing inodelk/entrylk/lk here is not a good idea because we
+ * would need to clean up the locks on the other bricks if we chose to
+ * fail the fop here. The brick may also go down just after the unwind
+ * happens, in which case the next fop sent will fail anyway, so this
+ * is left as is for now. */
+ local = frame->local;
+ switch (local->op) {
+ case GF_FOP_LOOKUP: /* lookup is exempted as well; the reason is not stated here */
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ case GF_FOP_LK:
+ return;
+ default:
+ break;
+ }
+
+ priv = frame->this->private;
+ if (!priv->consistent_io) /* option off: never override the result */
+ return;
+
+ if (local->event_generation && /* a zero generation in local is never treated as a mismatch */
+ (local->event_generation != priv->event_generation)) /* presumably the set of up children changed while the fop was in flight -- TODO confirm */
+ goto inconsistent;
+
+ return;
+inconsistent:
+ *op_ret = -1;
+ *op_errno = ENOTCONN;
+}
+
void
afr_local_cleanup (afr_local_t *local, xlator_t *this)
{
@@ -2997,10 +3105,9 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
if (!local)
goto out;
- if (!local->call_count) {
- op_errno = ENOTCONN;
+ local->op = GF_FOP_FLUSH;
+ if (!afr_is_consistent_io_possible (local, this->private, &op_errno))
goto out;
- }
local->fd = fd_ref(fd);
@@ -3126,11 +3233,9 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
if (!local)
goto out;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
+ local->op = GF_FOP_FSYNC;
+ if (!afr_is_consistent_io_possible (local, priv, &op_errno))
goto out;
- }
local->fd = fd_ref (fd);
@@ -3140,6 +3245,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
local->inode = inode_ref (fd->inode);
+ call_count = local->call_count;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_fsync_cbk,
@@ -3210,12 +3316,11 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
if (!local)
goto out;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
+ local->op = GF_FOP_FSYNCDIR;
+ if (!afr_is_consistent_io_possible (local, priv, &op_errno))
goto out;
- }
+ call_count = local->call_count;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND (frame, afr_fsyncdir_cbk,
@@ -3506,6 +3611,11 @@ afr_inodelk (call_frame_t *frame, xlator_t *this,
if (!local)
goto out;
+ local->op = GF_FOP_INODELK;
+ if (!afr_inodelk_is_unlock (cmd, flock) &&
+ !afr_is_consistent_io_possible (local, this->private, &op_errno))
+ goto out;
+
loc_copy (&local->loc, loc);
local->cont.inodelk.volume = gf_strdup (volume);
if (!local->cont.inodelk.volume) {
@@ -3589,12 +3699,23 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
if (!local)
goto out;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ local->op = GF_FOP_FINODELK;
+ if (!afr_inodelk_is_unlock (cmd, flock) &&
+ !afr_is_consistent_io_possible (local, this->private, &op_errno))
+ goto out;
+ local->cont.inodelk.volume = gf_strdup (volume);
+ if (!local->cont.inodelk.volume) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->fd = fd_ref (fd);
+ local->cont.inodelk.cmd = cmd;
+ local->cont.inodelk.flock = *flock;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
+ call_count = local->call_count;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND (frame, afr_finodelk_cbk,
@@ -3610,7 +3731,6 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
return 0;
out:
AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
-
return 0;
}
@@ -3642,7 +3762,6 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
int
afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume,
loc_t *loc, const char *basename, entrylk_cmd cmd,
@@ -3660,12 +3779,13 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume,
if (!local)
goto out;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ local->op = GF_FOP_ENTRYLK;
+ if (!afr_entrylk_is_unlock (cmd) &&
+ !afr_is_consistent_io_possible (local, priv, &op_errno))
+ goto out;
+ local->cont.entrylk.cmd = cmd;
+ call_count = local->call_count;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND (frame, afr_entrylk_cbk,
@@ -3733,12 +3853,13 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
if (!local)
goto out;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ local->op = GF_FOP_FENTRYLK;
+ if (!afr_entrylk_is_unlock (cmd) &&
+ !afr_is_consistent_io_possible (local, priv, &op_errno))
+ goto out;
+ local->cont.entrylk.cmd = cmd;
+ call_count = local->call_count;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND (frame, afr_fentrylk_cbk,
@@ -3823,6 +3944,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
if (!local)
goto out;
+ local->op = GF_FOP_STATFS;
+ if (!afr_is_consistent_io_possible (local, priv, &op_errno))
+ goto out;
+
if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX])
local->call_count--;
call_count = local->call_count;
@@ -3963,7 +4088,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
int
afr_lk (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
@@ -3979,6 +4103,11 @@ afr_lk (call_frame_t *frame, xlator_t *this,
if (!local)
goto out;
+ local->op = GF_FOP_LK;
+ if (!afr_lk_is_unlock (cmd, flock) &&
+ !afr_is_consistent_io_possible (local, priv, &op_errno))
+ goto out;
+
local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,
sizeof (*local->cont.lk.locked_nodes),
gf_afr_mt_char);
@@ -4311,7 +4440,7 @@ afr_notify (xlator_t *this, int32_t event,
down_children++;
if (down_children == priv->child_count) {
gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN,
+ AFR_MSG_SUBVOLS_DOWN,
"All subvolumes are down. Going offline "
"until atleast one of them comes back up.");
} else {
@@ -4399,7 +4528,6 @@ out:
return ret;
}
-
int
afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
{
@@ -4422,11 +4550,12 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
local->call_count = AFR_COUNT (local->child_up, priv->child_count);
if (local->call_count == 0) {
gf_msg (THIS->name, GF_LOG_INFO, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN, "no subvolumes up");
+ AFR_MSG_SUBVOLS_DOWN, "no subvolumes up");
if (op_errno)
*op_errno = ENOTCONN;
goto out;
}
+
local->event_generation = priv->event_generation;
local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char),
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 2260e5dac26..4e29171482a 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -88,6 +88,10 @@ afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
if (!local)
goto out;
+ local->op = GF_FOP_OPENDIR;
+ if (!afr_is_consistent_io_possible (local, priv, &op_errno))
+ goto out;
+
fd_ctx = afr_fd_ctx_get (fd, this);
if (!fd_ctx)
goto out;
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index c2a5f526c08..718ba318cfe 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -1622,7 +1622,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
if (!call_count) {
gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN,
+ AFR_MSG_SUBVOLS_DOWN,
"All bricks are down, aborting.");
afr_unlock (frame, this);
goto out;
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index c7af18d0f25..5fb81c696d8 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -93,11 +93,11 @@
/*!
* @messageid 108006
- * @diagnosis All bricks of a replica set are down. Data residing in that
+ * @diagnosis Bricks of a replica set are down. Data residing in that
* replica cannot be accessed until one of the bricks come back up.
* @recommendedaction Ensure that the bricks are up.
*/
-#define AFR_MSG_ALL_SUBVOLS_DOWN (GLFS_COMP_BASE_AFR + 6)
+#define AFR_MSG_SUBVOLS_DOWN (GLFS_COMP_BASE_AFR + 6)
/*!
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 059d3f9bd71..7a628350c34 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -130,12 +130,16 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
if (!local)
goto out;
+ local->op = GF_FOP_OPEN;
fd_ctx = afr_fd_ctx_get (fd, this);
if (!fd_ctx) {
op_errno = ENOMEM;
goto out;
}
+ if (!afr_is_consistent_io_possible (local, priv, &op_errno))
+ goto out;
+
local->fd = fd_ref (fd);
local->fd_ctx = fd_ctx;
fd_ctx->flags = flags;
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 74749f029c8..cb81af42510 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -217,6 +217,12 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
goto read;
}
+ if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) {
+ local->op_ret = -1;
+ read_subvol = -1;
+ goto read;
+ }
+
local->transaction.type = type;
ret = afr_inode_read_subvol_get (inode, this, data, metadata,
&event_generation);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 6130ad76543..64a42d9fc7e 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -207,6 +207,7 @@ afr_transaction_detach_fop_frame (call_frame_t *frame)
local = frame->local;
+ afr_handle_inconsistent_fop (frame, &local->op_ret, &local->op_errno);
LOCK (&frame->lock);
{
fop_frame = local->transaction.main_frame;
@@ -2238,6 +2239,11 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local->transaction.resume = afr_transaction_resume;
local->transaction.type = type;
+ if (!afr_is_consistent_io_possible (local, priv, &ret)) {
+ ret = -ret; /*op_errno to ret conversion*/
+ goto out;
+ }
+
ret = afr_transaction_local_init (local, this);
if (ret < 0)
goto out;
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index da62564e93a..48beaf24a6e 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -132,6 +132,7 @@ reconfigure (xlator_t *this, dict_t *options)
int index = -1;
char *qtype = NULL;
char *fav_child_policy = NULL;
+ gf_boolean_t consistent_io = _gf_false;
priv = this->private;
@@ -258,6 +259,11 @@ reconfigure (xlator_t *this, dict_t *options)
priv->did_discovery = _gf_false;
+ GF_OPTION_RECONF ("consistent-io", consistent_io, options, bool, out);
+ if (priv->quorum_count != 0)
+ consistent_io = _gf_false;
+ priv->consistent_io = consistent_io;
+
ret = 0;
out:
return ret;
@@ -494,6 +500,10 @@ init (xlator_t *this)
GF_OPTION_INIT ("quorum-reads", priv->quorum_reads, bool, out);
GF_OPTION_INIT ("consistent-metadata", priv->consistent_metadata, bool,
out);
+ GF_OPTION_INIT ("consistent-io", priv->consistent_io, bool, out);
+
+ if (priv->quorum_count != 0)
+ priv->consistent_io = _gf_false;
priv->wait_count = 1;
@@ -594,14 +604,11 @@ fini (xlator_t *this)
struct xlator_fops fops = {
.lookup = afr_lookup,
- .open = afr_open,
.lk = afr_lk,
.flush = afr_flush,
.statfs = afr_statfs,
.fsync = afr_fsync,
.fsyncdir = afr_fsyncdir,
- .xattrop = afr_xattrop,
- .fxattrop = afr_fxattrop,
.inodelk = afr_inodelk,
.finodelk = afr_finodelk,
.entrylk = afr_entrylk,
@@ -629,9 +636,14 @@ struct xlator_fops fops = {
.fallocate = afr_fallocate,
.discard = afr_discard,
.zerofill = afr_zerofill,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
- /* dir read */
+ /*inode open*/
.opendir = afr_opendir,
+ .open = afr_open,
+
+ /* dir read */
.readdir = afr_readdir,
.readdirp = afr_readdirp,
@@ -986,5 +998,11 @@ struct volume_options options[] = {
" with identical mtime and size in more than half the "
"number of bricks in the replica.",
},
+ { .key = {"consistent-io"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .description = "If this option is enabled, i/o will fail even if "
+ "one of the bricks is down in the replicas",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 29008287e6d..983f07fcce9 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -152,6 +152,7 @@ typedef struct _afr_private {
gf_boolean_t use_afr_in_pump;
char *locking_scheme;
gf_boolean_t esh_granular;
+ gf_boolean_t consistent_io;
} afr_private_t;
@@ -663,6 +664,10 @@ typedef struct _afr_local {
} inodelk;
struct {
+ entrylk_cmd cmd;
+ } entrylk;
+
+ struct {
off_t offset;
gf_seek_what_t what;
} seek;
@@ -965,16 +970,25 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
int
afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
-#define AFR_STACK_UNWIND(fop, frame, params ...) \
+#define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params ...)\
do { \
afr_local_t *__local = NULL; \
xlator_t *__this = NULL; \
+ int32_t __op_ret = 0; \
+ int32_t __op_errno = 0; \
+ \
+ __op_ret = op_ret; \
+ __op_errno = op_errno; \
if (frame) { \
__local = frame->local; \
__this = frame->this; \
+ afr_handle_inconsistent_fop (frame, &__op_ret,\
+ &__op_errno);\
frame->local = NULL; \
} \
- STACK_UNWIND_STRICT (fop, frame, params); \
+ \
+ STACK_UNWIND_STRICT (fop, frame, __op_ret, \
+ __op_errno, params); \
if (__local) { \
afr_local_cleanup (__local, __this); \
mem_put (__local); \
@@ -1160,4 +1174,11 @@ afr_get_msg_id (char *op_type);
int
afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local,
inode_t *inode);
+
+gf_boolean_t
+afr_is_consistent_io_possible (afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno);
+void
+afr_handle_inconsistent_fop (call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno);
#endif /* __AFR_H__ */