author    Xavier Hernandez <xhernandez@datalab.es>      2015-05-14 20:07:10 +0200
committer Pranith Kumar Karampuri <pkarampu@redhat.com> 2015-05-20 01:00:01 -0700
commit    61cfcf65f0d4ad70fc8a47395c583d4b5bf1efbe (patch)
tree      9d3134df974f4ab0160008dbd877e317d471d2c7 /xlators/cluster/ec/src
parent    8f788528e64c4c13e16f7ad2d9f667a3813e08cc (diff)
cluster/ec: Correctly cleanup delayed locks
When a delayed lock is pending, a graph switch doesn't correctly terminate
it. This means that the update of the version and size xattrs is lost,
causing EIO errors.

This patch handles the GF_EVENT_PARENT_DOWN event to correctly finish
pending updates before completing the graph switch.

Change-Id: I394f3b8d41df8d83cdd36636aeb62330f30a66d5
BUG: 1188145
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/10787
Tested-by: NetBSD Build System
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
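The mechanics of the fix: every fop registers itself on a new ec->pending_fops
list when it is allocated and removes itself when it reaches EC_STATE_END, and
the PARENT_DOWN notification is only propagated once that list is empty. Below
is a minimal, self-contained sketch of that pattern, not the GlusterFS code
itself: ec_like_t, fop_begin(), fop_end() and parent_down() are invented names,
and a plain counter stands in for the pending_fops list.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    pthread_mutex_t lock;
    int             pending;   /* number of in-flight fops               */
    bool            shutdown;  /* PARENT_DOWN received, waiting to drain */
} ec_like_t;

static void propagate_parent_down(void)
{
    puts("PARENT_DOWN propagated to children");
}

/* A fop starts: register it (in the real code ec_fop_data_allocate() adds
 * the fop to ec->pending_fops). */
static void fop_begin(ec_like_t *ec)
{
    pthread_mutex_lock(&ec->lock);
    ec->pending++;
    pthread_mutex_unlock(&ec->lock);
}

/* A fop reaches its end state: deregister it and, if it was the last one
 * and a shutdown is pending, propagate the deferred event outside the
 * lock (the shape of the new code in __ec_manager()). */
static void fop_end(ec_like_t *ec)
{
    bool notify;

    pthread_mutex_lock(&ec->lock);
    ec->pending--;
    notify = (ec->pending == 0) && ec->shutdown;
    pthread_mutex_unlock(&ec->lock);

    if (notify)
        propagate_parent_down();
}

/* PARENT_DOWN arrives: mark the shutdown and propagate immediately only
 * if nothing is in flight (the shape of ec_notify()/ec_force_unlocks()). */
static void parent_down(ec_like_t *ec)
{
    bool done;

    pthread_mutex_lock(&ec->lock);
    ec->shutdown = true;
    done = (ec->pending == 0);
    pthread_mutex_unlock(&ec->lock);

    if (done)
        propagate_parent_down();
}

int main(void)
{
    ec_like_t ec = { PTHREAD_MUTEX_INITIALIZER, 0, false };

    fop_begin(&ec);   /* a write still owns a delayed lock               */
    parent_down(&ec); /* graph switch starts; propagation is deferred    */
    fop_end(&ec);     /* delayed update finishes -> PARENT_DOWN goes out */
    return 0;
}

Deferring the notification until the list drains is what guarantees the
delayed version/size updates are flushed before the old graph is torn down.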
Diffstat (limited to 'xlators/cluster/ec/src')
-rw-r--r--  xlators/cluster/ec/src/ec-common.c | 74
-rw-r--r--  xlators/cluster/ec/src/ec-common.h |  1
-rw-r--r--  xlators/cluster/ec/src/ec-data.c   | 13
-rw-r--r--  xlators/cluster/ec/src/ec-data.h   | 31
-rw-r--r--  xlators/cluster/ec/src/ec-heal.c   |  7
-rw-r--r--  xlators/cluster/ec/src/ec.c        | 97
-rw-r--r--  xlators/cluster/ec/src/ec.h        |  5
7 files changed, 163 insertions, 65 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index afd46c095f3..9f312e0c37c 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1531,31 +1531,46 @@ void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock)
ec_resume(fop, 0);
}
-void ec_unlock_timer_cbk(void *data)
+void
+ec_unlock_timer_del(ec_fop_data_t *fop, ec_lock_t *lock)
{
- ec_lock_link_t *link = data;
- ec_lock_t *lock = link->lock;
- ec_fop_data_t *fop = NULL;
+ inode_t *inode;
+ gf_boolean_t now = _gf_false;
+
+ /* A race condition can happen if the timer expires and calls this
+ * function while the lock is being released (lock->loc is wiped) but
+ * the fop has not fully completed yet (it's still on the list of
+ * pending fops). In that case this function can also be called from
+ * ec_unlock_force(). */
+ inode = lock->loc.inode;
+ if (inode == NULL) {
+ return;
+ }
- LOCK(&lock->loc.inode->lock);
+ LOCK(&inode->lock);
- if (lock->timer != NULL) {
- fop = link->fop;
+ if (lock->timer != NULL) {
+ ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock);
- ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock);
+ gf_timer_call_cancel(fop->xl->ctx, lock->timer);
+ lock->timer = NULL;
+ *lock->plock = NULL;
- GF_ASSERT(lock->refs == 1);
+ now = _gf_true;
+ }
- gf_timer_call_cancel(fop->xl->ctx, lock->timer);
- lock->timer = NULL;
- *lock->plock = NULL;
- }
+ UNLOCK(&inode->lock);
- UNLOCK(&lock->loc.inode->lock);
+ if (now) {
+ ec_unlock_now(fop, lock);
+ }
+}
- if (fop != NULL) {
- ec_unlock_now(fop, lock);
- }
+void ec_unlock_timer_cbk(void *data)
+{
+ ec_lock_link_t *link = data;
+
+ ec_unlock_timer_del(link->fop, link->lock);
}
void ec_unlock_timer_add(ec_lock_link_t *link)
@@ -1626,6 +1641,18 @@ void ec_unlock(ec_fop_data_t *fop)
}
}
+void
+ec_unlock_force(ec_fop_data_t *fop)
+{
+ int32_t i;
+
+ for (i = 0; i < fop->lock_count; i++) {
+ ec_trace("UNLOCK_FORCED", fop, "lock=%p", &fop->locks[i]);
+
+ ec_unlock_timer_del(fop, fop->locks[i].lock);
+ }
+}
+
void ec_flush_size_version(ec_fop_data_t * fop)
{
ec_lock_t * lock;
@@ -1740,8 +1767,21 @@ void __ec_manager(ec_fop_data_t * fop, int32_t error)
}
if ((fop->state == EC_STATE_END) || (fop->state == -EC_STATE_END)) {
+ gf_boolean_t notify;
+
+ LOCK(&ec->lock);
+
+ list_del_init(&fop->pending_list);
+ notify = list_empty(&ec->pending_fops);
+
+ UNLOCK(&ec->lock);
+
ec_fop_data_release(fop);
+ if (notify) {
+ ec_pending_fops_completed(ec);
+ }
+
break;
}
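The new ec_unlock_timer_del() above decides, under the inode lock, whether the
delayed-unlock timer is still armed; only the caller that actually cancelled
the timer goes on to call ec_unlock_now() after dropping the lock, while a
caller that loses the race (including ec_unlock_force() arriving after
lock->loc was wiped) simply returns. A hedged sketch of that shape, using
invented stand-ins (delayed_lock_t, cancel_timer(), unlock_now()) rather than
the real inode lock and gf_timer APIs:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
    pthread_mutex_t  guard;    /* stands in for inode->lock                */
    void            *timer;    /* non-NULL while a delayed unlock is armed */
    bool             released; /* lock->loc already wiped by another path  */
} delayed_lock_t;

/* Stubs standing in for gf_timer_call_cancel() and ec_unlock_now(). */
static void cancel_timer(void *timer)      { (void)timer; }
static void unlock_now(delayed_lock_t *dl) { (void)dl; puts("unlocked"); }

static void unlock_timer_del(delayed_lock_t *dl)
{
    bool now = false;

    /* Mirrors the inode == NULL check: if the lock is already being
     * released there is nothing left to cancel here. */
    if (dl->released)
        return;

    pthread_mutex_lock(&dl->guard);
    if (dl->timer != NULL) {
        cancel_timer(dl->timer);
        dl->timer = NULL;
        now = true;            /* we won the race and own the unlock */
    }
    pthread_mutex_unlock(&dl->guard);

    if (now)
        unlock_now(dl);        /* heavy work done outside the lock */
}

int main(void)
{
    int fake_timer = 0;
    delayed_lock_t dl = { PTHREAD_MUTEX_INITIALIZER, &fake_timer, false };

    unlock_timer_del(&dl); /* cancels the armed timer, then unlocks */
    unlock_timer_del(&dl); /* second caller finds no timer: no-op   */
    return 0;
}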
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index 9e0aaa0f079..08993f03c8f 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -87,6 +87,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update);
void ec_lock(ec_fop_data_t * fop);
void ec_lock_reuse(ec_fop_data_t *fop);
void ec_unlock(ec_fop_data_t * fop);
+void ec_unlock_force(ec_fop_data_t *fop);
void ec_get_size_version(ec_fop_data_t * fop);
void ec_prepare_update(ec_fop_data_t *fop);
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
index fb47aea90a8..b747fc42348 100644
--- a/xlators/cluster/ec/src/ec-data.c
+++ b/xlators/cluster/ec/src/ec-data.c
@@ -165,17 +165,18 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
parent = frame->local;
if (parent != NULL)
{
- LOCK(&parent->lock);
-
- parent->jobs++;
- parent->refs++;
-
- UNLOCK(&parent->lock);
+ ec_sleep(parent);
}
fop->parent = parent;
}
+ LOCK(&ec->lock);
+
+ list_add_tail(&fop->pending_list, &ec->pending_fops);
+
+ UNLOCK(&ec->lock);
+
return fop;
}
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 9e5c92dd5b8..8a58ffb288b 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -172,13 +172,14 @@ struct _ec_fop_data
int32_t winds;
int32_t jobs;
int32_t error;
- ec_fop_data_t * parent;
- xlator_t * xl;
- call_frame_t * req_frame; // frame of the calling xlator
- call_frame_t * frame; // frame used by this fop
- struct list_head cbk_list; // sorted list of groups of answers
- struct list_head answer_list; // list of answers
- ec_cbk_data_t * answer; // accepted answer
+ ec_fop_data_t *parent;
+ xlator_t *xl;
+ call_frame_t *req_frame; /* frame of the calling xlator */
+ call_frame_t *frame; /* frame used by this fop */
+ struct list_head cbk_list; /* sorted list of groups of answers */
+ struct list_head answer_list; /* list of answers */
+ struct list_head pending_list; /* member of ec_t.pending_fops */
+ ec_cbk_data_t *answer; /* accepted answer */
int32_t lock_count;
int32_t locked;
ec_lock_link_t locks[2];
@@ -203,7 +204,7 @@ struct _ec_fop_data
ec_handler_f handler;
ec_resume_f resume;
ec_cbk_t cbks;
- void * data;
+ void *data;
ec_heal_t *heal;
uint64_t user_size;
@@ -211,8 +212,8 @@ struct _ec_fop_data
int32_t use_fd;
- dict_t * xdata;
- dict_t * dict;
+ dict_t *xdata;
+ dict_t *dict;
int32_t int32;
uint32_t uint32;
uint64_t size;
@@ -222,14 +223,14 @@ struct _ec_fop_data
entrylk_type entrylk_type;
gf_xattrop_flags_t xattrop_flags;
dev_t dev;
- inode_t * inode;
- fd_t * fd;
+ inode_t *inode;
+ fd_t *fd;
struct iatt iatt;
- char * str[2];
+ char *str[2];
loc_t loc[2];
struct gf_flock flock;
- struct iovec * vector;
- struct iobref * buffers;
+ struct iovec *vector;
+ struct iobref *buffers;
};
struct _ec_cbk_data
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index ceddfeb6ac7..4fe5754cbb8 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -532,12 +532,7 @@ ec_heal_init (ec_fop_data_t * fop)
gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path,
uuid_utoa(heal->loc.gfid));
} else {
- LOCK(&fop->lock);
-
- fop->jobs++;
- fop->refs++;
-
- UNLOCK(&fop->lock);
+ ec_sleep(fop);
}
list_add_tail(&heal->list, &ctx->heal);
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 3dd04299541..4028aa4d2bb 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -278,6 +278,7 @@ ec_notify_cbk (void *data)
{
ec_t *ec = data;
glusterfs_event_t event = GF_EVENT_MAXVAL;
+ gf_boolean_t propagate = _gf_false;
LOCK(&ec->lock);
{
@@ -309,10 +310,14 @@ ec_notify_cbk (void *data)
/* CHILD_DOWN should not come here as no grace period is given
* for notifying CHILD_DOWN. */
- default_notify (ec->xl, event, NULL);
+ propagate = _gf_true;
}
unlock:
UNLOCK(&ec->lock);
+
+ if (propagate) {
+ default_notify (ec->xl, event, NULL);
+ }
}
void
@@ -360,6 +365,49 @@ ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx)
}
}
+gf_boolean_t
+ec_force_unlocks(ec_t *ec)
+{
+ struct list_head list;
+ ec_fop_data_t *fop;
+
+ if (list_empty(&ec->pending_fops)) {
+ return _gf_true;
+ }
+
+ INIT_LIST_HEAD(&list);
+
+ /* All pending fops when GF_EVENT_PARENT_DOWN is received should only
+ * be fops waiting for a delayed unlock. However, the unlock can
+ * generate new fops. We don't want to traverse these new fops while
+ * forcing unlocks, so we move all fops to a temporary list to process
+ * them without interference. */
+ list_splice_init(&ec->pending_fops, &list);
+
+ while (!list_empty(&list)) {
+ fop = list_entry(list.next, ec_fop_data_t, pending_list);
+ list_move_tail(&fop->pending_list, &ec->pending_fops);
+
+ UNLOCK(&ec->lock);
+
+ ec_unlock_force(fop);
+
+ LOCK(&ec->lock);
+ }
+
+ ec->shutdown = _gf_true;
+
+ return list_empty(&ec->pending_fops);
+}
+
+void
+ec_pending_fops_completed(ec_t *ec)
+{
+ if (ec->shutdown) {
+ default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL);
+ }
+}
+
int32_t
ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
{
@@ -367,14 +415,16 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
int32_t idx = 0;
int32_t error = 0;
glusterfs_event_t old_event = GF_EVENT_MAXVAL;
- glusterfs_event_t new_event = GF_EVENT_MAXVAL;
dict_t *input = NULL;
dict_t *output = NULL;
+ gf_boolean_t propagate = _gf_true;
+
+ gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %p",
+ event, data, data2);
if (event == GF_EVENT_TRANSLATOR_OP) {
if (!ec->up) {
error = -1;
- goto out;
} else {
input = data;
output = data2;
@@ -400,13 +450,14 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
*/
ec_launch_notify_timer (this, ec);
goto unlock;
+ } else if (event == GF_EVENT_PARENT_DOWN) {
+ /* If there are no pending fops still running after we have woken
+ * them up, we immediately propagate the notification. */
+ propagate = ec_force_unlocks(ec);
+ goto unlock;
}
- gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d",
- event, data, idx);
-
if (idx < ec->nodes) { /* CHILD_* events */
-
old_event = ec_get_event_from_state (ec);
if (event == GF_EVENT_CHILD_UP) {
@@ -415,28 +466,30 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
ec_handle_down (this, ec, idx);
}
- new_event = ec_get_event_from_state (ec);
+ event = ec_get_event_from_state (ec);
- if (new_event == GF_EVENT_CHILD_UP && !ec->up) {
+ if (event == GF_EVENT_CHILD_UP && !ec->up) {
ec_up (this, ec);
- } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) {
+ } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
ec_down (this, ec);
}
- if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL))
- new_event = GF_EVENT_CHILD_MODIFIED;
-
- event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */
- if (new_event != GF_EVENT_MAXVAL)
- error = default_notify (this, new_event, data);
+ if (event != GF_EVENT_MAXVAL) {
+ if (event == old_event) {
+ event = GF_EVENT_CHILD_MODIFIED;
+ }
+ } else {
+ propagate = _gf_false;
+ }
}
- unlock:
- UNLOCK (&ec->lock);
+unlock:
+ UNLOCK (&ec->lock);
- if (event != GF_EVENT_MAXVAL)
- return default_notify (this, event, data);
+ if (propagate) {
+ error = default_notify (this, event, data);
+ }
out:
- return error;
+ return error;
}
int32_t
@@ -478,6 +531,8 @@ init (xlator_t *this)
ec->xl = this;
LOCK_INIT(&ec->lock);
+ INIT_LIST_HEAD(&ec->pending_fops);
+
ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024);
ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096);
ec->lock_pool = mem_pool_new(ec_lock_t, 1024);
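ec_force_unlocks() above first splices ec->pending_fops onto a local list so
that fops created by the forced unlocks themselves are not traversed; it then
puts each entry back on the shared list and drops ec->lock around the
ec_unlock_force() call. A rough sketch of that list handling under simplified
assumptions (pending_fop_t, force_unlock() and ec_like_t are invented for the
example; the real code uses the list_head macros and is entered with ec->lock
already held):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct pending_fop {
    struct pending_fop *next;
} pending_fop_t;

typedef struct {
    pthread_mutex_t  lock;
    pending_fop_t   *pending;   /* stands in for ec->pending_fops */
    bool             shutdown;
} ec_like_t;

/* Stub for ec_unlock_force(); in the real code this may itself create new
 * fops, which land back on the shared pending list. */
static void force_unlock(pending_fop_t *fop) { (void)fop; }

/* Returns true if nothing is pending afterwards, i.e. PARENT_DOWN can be
 * propagated right away (the contract of ec_force_unlocks()). */
static bool force_unlocks(ec_like_t *ec)
{
    pending_fop_t *local, *fop;
    bool           empty;

    pthread_mutex_lock(&ec->lock);

    /* Splice the whole shared list onto a local one so that fops created
     * while we work are not traversed here. */
    local = ec->pending;
    ec->pending = NULL;

    while (local != NULL) {
        fop = local;
        local = fop->next;

        /* Put the fop back on the shared list (in the real code a
         * completing fop removes itself from there), then drop the lock
         * around the forced unlock. */
        fop->next = ec->pending;
        ec->pending = fop;

        pthread_mutex_unlock(&ec->lock);
        force_unlock(fop);
        pthread_mutex_lock(&ec->lock);
    }

    ec->shutdown = true;
    empty = (ec->pending == NULL);

    pthread_mutex_unlock(&ec->lock);

    return empty;
}

int main(void)
{
    pending_fop_t fop = { NULL };
    ec_like_t ec = { PTHREAD_MUTEX_INITIALIZER, &fop, false };

    /* The stubbed force_unlock() does not complete the fop, so this
     * returns false: propagation stays deferred until the fop ends. */
    return force_unlocks(&ec) ? 1 : 0;
}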
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index b8f3e288197..fdedb89ec18 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -44,6 +44,8 @@ struct _ec
xlator_t ** xl_list;
gf_lock_t lock;
gf_timer_t * timer;
+ gf_boolean_t shutdown;
+ struct list_head pending_fops;
struct mem_pool * fop_pool;
struct mem_pool * cbk_pool;
struct mem_pool * lock_pool;
@@ -51,4 +53,7 @@ struct _ec
char vol_uuid[UUID_SIZE + 1];
dict_t *leaf_to_subvolid;
};
+
+void ec_pending_fops_completed(ec_t *ec);
+
#endif /* __EC_H__ */