summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/ec/src/ec-common.c
diff options
context:
space:
mode:
authorXavi Hernandez <xhernandez@redhat.com>2019-04-12 17:54:44 +0200
committerPranith Kumar Karampuri <pkarampu@redhat.com>2019-04-23 11:28:58 +0000
commit4f0db1373be93e05b6fc451d2f07514704d3c4ca (patch)
tree82f4d82fbe914bcee2e900a87281a681b6cd40fc /xlators/cluster/ec/src/ec-common.c
parent132f03a0b77980b25a8b499ad1fe1e4ab2aee40e (diff)
cluster/ec: fix fd reopen
Currently EC tries to reopen fd's that have been opened while a brick was down. This is done as part of regular write operations, just after having acquired the locks, and it's sent as a sub-fop of the main write fop. There were two problems: 1. The reopen was attempted on all UP bricks, even if a previous lock didn't succeed. This is incorrect because most probably the open will fail. 2. If reopen is sent and fails, the error is propagated to the main operation, causing it to fail when it shouldn't. To fix this, we only attempt reopens on bricks where the current fop owns a lock, and we prevent any error to be propagated to the main fop. To implement this behaviour an argument used to indicate the minimum number of required answers has overloaded to also include some flags. To make the change consistent, it has been necessary to rename the argument, which means that a lot of files have been changed. However there are no functional changes. This change has also uncovered a problem in discard code, which didn't correctely process requests of small sizes because no real discard fop was being processed, only a write of 0's on some region. In this case some fields of the fop remained uninitialized or with incorrect values. To fix this, a new function has been created to simulate success on a fop and it's used in the discard case. Thanks to Pranith for providing a test script that has also detected an issue in this patch. This patch includes a small modification of this script to force data to be written into bricks before stopping them. Change-Id: If272343873369186c2fb8f43c1d9c52c3ea304ec Fixes: bz#1699866 Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-common.c')
-rw-r--r--xlators/cluster/ec/src/ec-common.c73
1 files changed, 53 insertions, 20 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 518368073e8..1454ae23b23 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -44,16 +44,16 @@ ec_update_fd_status(fd_t *fd, xlator_t *xl, int idx, int32_t ret_status)
UNLOCK(&fd->lock);
}
-static int
-ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open)
+static uintptr_t
+ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t mask)
{
int i = 0;
int count = 0;
ec_t *ec = NULL;
ec_fd_t *fd_ctx = NULL;
+ uintptr_t need_open = 0;
ec = this->private;
- *need_open = 0;
fd_ctx = ec_fd_get(fd, this);
if (!fd_ctx)
@@ -63,9 +63,9 @@ ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open)
{
for (i = 0; i < ec->nodes; i++) {
if ((fd_ctx->fd_status[i] == EC_FD_NOT_OPENED) &&
- (ec->xl_up & (1 << i))) {
+ ((ec->xl_up & (1 << i)) != 0) && ((mask & (1 << i)) != 0)) {
fd_ctx->fd_status[i] = EC_FD_OPENING;
- *need_open |= (1 << i);
+ need_open |= (1 << i);
count++;
}
}
@@ -76,10 +76,11 @@ ec_fd_ctx_need_open(fd_t *fd, xlator_t *this, uintptr_t *need_open)
* then ignore fixing the fd as it has been
* requested from heal operation.
*/
- if (count >= ec->fragments)
- count = 0;
+ if (count >= ec->fragments) {
+ need_open = 0;
+ }
- return count;
+ return need_open;
}
static gf_boolean_t
@@ -96,9 +97,8 @@ ec_is_fd_fixable(fd_t *fd)
}
static void
-ec_fix_open(ec_fop_data_t *fop)
+ec_fix_open(ec_fop_data_t *fop, uintptr_t mask)
{
- int call_count = 0;
uintptr_t need_open = 0;
int ret = 0;
loc_t loc = {
@@ -109,9 +109,10 @@ ec_fix_open(ec_fop_data_t *fop)
goto out;
/* Evaluate how many remote fd's to be opened */
- call_count = ec_fd_ctx_need_open(fop->fd, fop->xl, &need_open);
- if (!call_count)
+ need_open = ec_fd_ctx_need_open(fop->fd, fop->xl, mask);
+ if (need_open == 0) {
goto out;
+ }
loc.inode = inode_ref(fop->fd->inode);
gf_uuid_copy(loc.gfid, fop->fd->inode->gfid);
@@ -121,11 +122,13 @@ ec_fix_open(ec_fop_data_t *fop)
}
if (IA_IFDIR == fop->fd->inode->ia_type) {
- ec_opendir(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, NULL, NULL,
+ ec_opendir(fop->frame, fop->xl, need_open,
+ EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL,
&fop->loc[0], fop->fd, NULL);
} else {
- ec_open(fop->frame, fop->xl, need_open, EC_MINIMUM_ONE, NULL, NULL,
- &loc, fop->fd->flags, fop->fd, NULL);
+ ec_open(fop->frame, fop->xl, need_open,
+ EC_MINIMUM_ONE | EC_FOP_NO_PROPAGATE_ERROR, NULL, NULL, &loc,
+ fop->fd->flags, fop->fd, NULL);
}
out:
@@ -495,12 +498,16 @@ ec_resume(ec_fop_data_t *fop, int32_t error)
}
void
-ec_resume_parent(ec_fop_data_t *fop, int32_t error)
+ec_resume_parent(ec_fop_data_t *fop)
{
ec_fop_data_t *parent;
+ int32_t error = 0;
parent = fop->parent;
if (parent != NULL) {
+ if ((fop->fop_flags & EC_FOP_NO_PROPAGATE_ERROR) == 0) {
+ error = fop->error;
+ }
ec_trace("RESUME_PARENT", fop, "error=%u", error);
fop->parent = NULL;
ec_resume(parent, error);
@@ -593,6 +600,8 @@ ec_internal_op(ec_fop_data_t *fop)
return _gf_true;
if (fop->id == GF_FOP_FXATTROP)
return _gf_true;
+ if (fop->id == GF_FOP_OPEN)
+ return _gf_true;
return _gf_false;
}
@@ -631,7 +640,7 @@ ec_msg_str(ec_fop_data_t *fop)
return fop->errstr;
}
-int32_t
+static int32_t
ec_child_select(ec_fop_data_t *fop)
{
ec_t *ec = fop->xl->private;
@@ -693,8 +702,6 @@ ec_child_select(ec_fop_data_t *fop)
return 0;
}
- ec_sleep(fop);
-
return 1;
}
@@ -773,6 +780,8 @@ ec_dispatch_one(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = 1;
fop->first = ec_select_first_by_read_policy(fop->xl->private, fop);
@@ -807,6 +816,8 @@ ec_dispatch_inc(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = gf_bits_count(fop->remaining);
fop->first = 0;
@@ -820,6 +831,8 @@ ec_dispatch_all(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = gf_bits_count(fop->remaining);
fop->first = 0;
@@ -838,6 +851,8 @@ ec_dispatch_min(ec_fop_data_t *fop)
ec_dispatch_start(fop);
if (ec_child_select(fop)) {
+ ec_sleep(fop);
+
fop->expected = count = ec->fragments;
fop->first = ec_select_first_by_read_policy(fop->xl->private, fop);
idx = fop->first - 1;
@@ -852,6 +867,23 @@ ec_dispatch_min(ec_fop_data_t *fop)
}
}
+void
+ec_succeed_all(ec_fop_data_t *fop)
+{
+ ec_dispatch_start(fop);
+
+ if (ec_child_select(fop)) {
+ fop->expected = gf_bits_count(fop->remaining);
+ fop->first = 0;
+
+ /* Simulate a successful execution on all bricks */
+ ec_trace("SUCCEED", fop, "");
+
+ fop->good = fop->remaining;
+ fop->remaining = 0;
+ }
+}
+
ec_lock_t *
ec_lock_allocate(ec_fop_data_t *fop, loc_t *loc)
{
@@ -1825,7 +1857,8 @@ ec_lock_acquired(ec_lock_link_t *link)
if (fop->use_fd &&
(link->update[EC_DATA_TXN] || link->update[EC_METADATA_TXN])) {
- ec_fix_open(fop);
+ /* Try to reopen closed fd's only if lock has succeeded. */
+ ec_fix_open(fop, lock->mask);
}
ec_lock_resume_shared(&list);