summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Xavier Hernandez <jahernan@redhat.com> 2018-01-19 12:18:13 +0100
committer: Raghavendra G <rgowdapp@redhat.com> 2019-05-11 14:25:53 +0000
commit: 59841f7e1ff0511b04884015441a181a56d07bea (patch)
tree: 7b4f16752014cf0cfc0ba1aad1847a43325e28a9
parent: da4601d536da761ce908a2461a0930857f99f171 (diff)
rpc: implement reconnect back-off strategy
When a connection failure happens, gluster tries to reconnect every 3 seconds. In some cases the failure is spurious, so a delay of 3 seconds can be unnecessarily long. This patch implements a back-off strategy in which the first reconnect is attempted after only one tenth of a second. If that attempt fails, the delay is doubled after each further failure until it is around 3 seconds. After that, the reconnect is attempted roughly every 3 seconds, as before.

Change-Id: Icb3fbe20d618f50cbbb599dce542b4e871c22149
Updates: bz#1193929
Signed-off-by: Xavier Hernandez <xhernandez@redhat.com>
-rw-r--r-- rpc/rpc-lib/src/rpc-clnt.c | 33
-rw-r--r-- rpc/rpc-lib/src/rpc-clnt.h | 1
2 files changed, 18 insertions, 16 deletions
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
index 8ef05378351..c1945dfb6ec 100644
--- a/rpc/rpc-lib/src/rpc-clnt.c
+++ b/rpc/rpc-lib/src/rpc-clnt.c
@@ -392,8 +392,16 @@ rpc_clnt_reconnect(void *conn_ptr)
conn->reconnect = 0;
if ((conn->connected == 0) && !clnt->disabled) {
- ts.tv_sec = 3;
- ts.tv_nsec = 0;
+ if (conn->reconnect_delay.tv_sec < 3) {
+ conn->reconnect_delay.tv_sec *= 2;
+ int64_t ns = conn->reconnect_delay.tv_nsec * 2;
+ if (ns >= 1000000000ULL) {
+ conn->reconnect_delay.tv_sec++;
+ ns -= 1000000000ULL;
+ }
+ conn->reconnect_delay.tv_nsec = ns;
+ }
+ ts = conn->reconnect_delay;
gf_log(conn->name, GF_LOG_TRACE, "attempting reconnect");
(void)rpc_transport_connect(trans, conn->config.remote_port);
@@ -838,9 +846,11 @@ rpc_clnt_handle_disconnect(struct rpc_clnt *clnt, rpc_clnt_connection_t *conn)
pthread_mutex_lock(&conn->lock);
{
+ conn->reconnect_delay.tv_sec = 0;
+ conn->reconnect_delay.tv_nsec = 100000000;
+
if (!conn->rpc_clnt->disabled && (conn->reconnect == NULL)) {
- ts.tv_sec = 3;
- ts.tv_nsec = 0;
+ ts = conn->reconnect_delay;
rpc_clnt_ref(clnt);
conn->reconnect = gf_timer_call_after(clnt->ctx, ts,
@@ -1160,6 +1170,8 @@ rpc_clnt_start(struct rpc_clnt *rpc)
* rpc_clnt_reconnect fire event.
*/
rpc_clnt_ref(rpc);
+ conn->reconnect_delay.tv_sec = 0;
+ conn->reconnect_delay.tv_nsec = 50000000;
rpc_clnt_reconnect(conn);
return 0;
@@ -1177,18 +1189,7 @@ rpc_clnt_cleanup_and_start(struct rpc_clnt *rpc)
rpc_clnt_connection_cleanup(conn);
- pthread_mutex_lock(&conn->lock);
- {
- rpc->disabled = 0;
- }
- pthread_mutex_unlock(&conn->lock);
- /* Corresponding unref will be either on successful timer cancel or last
- * rpc_clnt_reconnect fire event.
- */
- rpc_clnt_ref(rpc);
- rpc_clnt_reconnect(conn);
-
- return 0;
+ return rpc_clnt_start(rpc);
}
int
diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h
index b46feed50c8..2c252d5ff86 100644
--- a/rpc/rpc-lib/src/rpc-clnt.h
+++ b/rpc/rpc-lib/src/rpc-clnt.h
@@ -136,6 +136,7 @@ struct rpc_clnt_connection {
struct saved_frames *saved_frames;
struct timespec last_sent;
struct timespec last_received;
+ struct timespec reconnect_delay;
uint64_t pingcnt;
uint64_t msgcnt;
uint64_t cleanup_gen;