summaryrefslogtreecommitdiffstats
path: root/transport
diff options
context:
space:
mode:
authorRaghavendra G <raghavendra@gluster.com>2009-11-13 16:08:15 +0000
committerAnand V. Avati <avati@dev.gluster.com>2009-11-16 00:41:36 -0800
commit0c7597a1339f1a6762505cfe7292202a0db196a6 (patch)
tree72995a0867829c47662a558ea05c27f33222cc89 /transport
parentd67322fdfa27746f82fb6e37ffe57efde9bfa021 (diff)
transport/ib-verbs: fix race-condition resulting in freeing of transport while it was still being used.
- while handling a failed work completion status, the transport is disconnected. But further down in the code, the transport is still used. There can be a possibility that transport might've been freed by the time control reaches this point. More detailed description of events leading to this situation can be found at http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=381 Signed-off-by: Raghavendra G <raghavendra@gluster.com> Signed-off-by: Anand V. Avati <avati@dev.gluster.com> BUG: 381 (glusterfs crash in ib-verbs) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=381
Diffstat (limited to 'transport')
-rw-r--r--transport/ib-verbs/src/ib-verbs.c81
1 files changed, 70 insertions, 11 deletions
diff --git a/transport/ib-verbs/src/ib-verbs.c b/transport/ib-verbs/src/ib-verbs.c
index 1dd00f2..e0e2593 100644
--- a/transport/ib-verbs/src/ib-verbs.c
+++ b/transport/ib-verbs/src/ib-verbs.c
@@ -691,22 +691,43 @@ ib_verbs_unregister_peer (ib_verbs_device_t *device,
static ib_verbs_peer_t *
-ib_verbs_lookup_peer (ib_verbs_device_t *device,
- int32_t qp_num)
+__ib_verbs_lookup_peer (ib_verbs_device_t *device, int32_t qp_num)
{
- struct _qpent *ent;
- ib_verbs_qpreg_t *qpreg = &device->qpreg;
- ib_verbs_peer_t *peer;
- int32_t hash = qp_num % 42;
+ struct _qpent *ent = NULL;
+ ib_verbs_peer_t *peer = NULL;
+ ib_verbs_qpreg_t *qpreg = NULL;
+ int32_t hash = 0;
- pthread_mutex_lock (&qpreg->lock);
+ hash = qp_num % 42;
ent = qpreg->ents[hash].next;
while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num))
ent = ent->next;
- peer = ent->peer;
+
+ if (ent != &qpreg->ents[hash]) {
+ peer = ent->peer;
+ }
+
+ return peer;
+}
+
+/*
+static ib_verbs_peer_t *
+ib_verbs_lookup_peer (ib_verbs_device_t *device,
+ int32_t qp_num)
+{
+ ib_verbs_qpreg_t *qpreg = NULL;
+ ib_verbs_peer_t *peer = NULL;
+
+ qpreg = &device->qpreg;
+ pthread_mutex_lock (&qpreg->lock);
+ {
+ peer = __ib_verbs_lookup_peer (device, qp_num);
+ }
pthread_mutex_unlock (&qpreg->lock);
+
return peer;
}
+*/
static void
@@ -1115,7 +1136,23 @@ ib_verbs_recv_completion_proc (void *data)
while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) {
post = (ib_verbs_post_t *) (long) wc.wr_id;
- peer = ib_verbs_lookup_peer (device, wc.qp_num);
+
+ pthread_mutex_lock (&device->qpreg.lock);
+ {
+ peer = __ib_verbs_lookup_peer (device,
+ wc.qp_num);
+
+ /*
+ * keep a refcount on transport so that it
+ * doesnot get freed because of some error
+ * indicated by wc.status till we are done
+ * with usage of peer and thereby that of trans.
+ */
+ if (peer != NULL) {
+ transport_ref (peer->trans);
+ }
+ }
+ pthread_mutex_unlock (&device->qpreg.lock);
if (wc.status != IBV_WC_SUCCESS) {
gf_log ("transport/ib-verbs", GF_LOG_ERROR,
@@ -1123,8 +1160,10 @@ ib_verbs_recv_completion_proc (void *data)
"error (%d)",
device->device_name,
wc.status);
- if (peer)
+ if (peer) {
+ transport_unref (peer->trans);
transport_disconnect (peer->trans);
+ }
if (post) {
ib_verbs_post_recv (device->srq, post);
@@ -1159,6 +1198,8 @@ ib_verbs_recv_completion_proc (void *data)
peer->trans->xl->name);
transport_disconnect (peer->trans);
}
+
+ transport_unref (peer->trans);
} else {
gf_log ("transport/ib-verbs",
GF_LOG_DEBUG,
@@ -1216,7 +1257,23 @@ ib_verbs_send_completion_proc (void *data)
while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) {
post = (ib_verbs_post_t *) (long) wc.wr_id;
- peer = ib_verbs_lookup_peer (device, wc.qp_num);
+
+ pthread_mutex_lock (&device->qpreg.lock);
+ {
+ peer = __ib_verbs_lookup_peer (device,
+ wc.qp_num);
+
+ /*
+ * keep a refcount on transport so that it
+ * doesnot get freed because of some error
+ * indicated by wc.status till we are done
+ * with usage of peer and thereby that of trans.
+ */
+ if (peer != NULL) {
+ transport_ref (peer->trans);
+ }
+ }
+ pthread_mutex_unlock (&device->qpreg.lock);
if (wc.status != IBV_WC_SUCCESS) {
gf_log ("transport/ib-verbs", GF_LOG_ERROR,
@@ -1254,6 +1311,8 @@ ib_verbs_send_completion_proc (void *data)
"failed to send message");
}
+
+ transport_unref (peer->trans);
} else {
gf_log ("transport/ib-verbs", GF_LOG_DEBUG,
"could not lookup peer for qp_num: %d",