From 59841f7e1ff0511b04884015441a181a56d07bea Mon Sep 17 00:00:00 2001 From: Xavier Hernandez Date: Fri, 19 Jan 2018 12:18:13 +0100 Subject: rpc: implement reconnect back-off strategy When a connection failure happens, gluster tries to reconnect every 3 seconds. In some cases the failure is spurious, so a delay of 3 seconds could be unnecessarily long. This patch implements a back-off strategy in which the first reconnect is attempted one tenth of a second after the failure. If this fails, the delay is doubled on each attempt until it's around 3 seconds. After that, the reconnect is attempted roughly every 3 seconds, as before. Change-Id: Icb3fbe20d618f50cbbb599dce542b4e871c22149 Updates: bz#1193929 Signed-off-by: Xavier Hernandez --- rpc/rpc-lib/src/rpc-clnt.c | 33 +++++++++++++++++---------------- rpc/rpc-lib/src/rpc-clnt.h | 1 + 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'rpc/rpc-lib/src') diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c index 8ef05378351..c1945dfb6ec 100644 --- a/rpc/rpc-lib/src/rpc-clnt.c +++ b/rpc/rpc-lib/src/rpc-clnt.c @@ -392,8 +392,16 @@ rpc_clnt_reconnect(void *conn_ptr) conn->reconnect = 0; if ((conn->connected == 0) && !clnt->disabled) { - ts.tv_sec = 3; - ts.tv_nsec = 0; + if (conn->reconnect_delay.tv_sec < 3) { + conn->reconnect_delay.tv_sec *= 2; + int64_t ns = conn->reconnect_delay.tv_nsec * 2; + if (ns >= 1000000000ULL) { + conn->reconnect_delay.tv_sec++; + ns -= 1000000000ULL; + } + conn->reconnect_delay.tv_nsec = ns; + } + ts = conn->reconnect_delay; gf_log(conn->name, GF_LOG_TRACE, "attempting reconnect"); (void)rpc_transport_connect(trans, conn->config.remote_port); @@ -838,9 +846,11 @@ rpc_clnt_handle_disconnect(struct rpc_clnt *clnt, rpc_clnt_connection_t *conn) pthread_mutex_lock(&conn->lock); { + conn->reconnect_delay.tv_sec = 0; + conn->reconnect_delay.tv_nsec = 100000000; + if (!conn->rpc_clnt->disabled && (conn->reconnect == NULL)) { - ts.tv_sec = 3; - ts.tv_nsec = 0; + ts = conn->reconnect_delay; rpc_clnt_ref(clnt); conn->reconnect = 
gf_timer_call_after(clnt->ctx, ts, @@ -1160,6 +1170,8 @@ rpc_clnt_start(struct rpc_clnt *rpc) * rpc_clnt_reconnect fire event. */ rpc_clnt_ref(rpc); + conn->reconnect_delay.tv_sec = 0; + conn->reconnect_delay.tv_nsec = 50000000; rpc_clnt_reconnect(conn); return 0; @@ -1177,18 +1189,7 @@ rpc_clnt_cleanup_and_start(struct rpc_clnt *rpc) rpc_clnt_connection_cleanup(conn); - pthread_mutex_lock(&conn->lock); - { - rpc->disabled = 0; - } - pthread_mutex_unlock(&conn->lock); - /* Corresponding unref will be either on successful timer cancel or last - * rpc_clnt_reconnect fire event. - */ - rpc_clnt_ref(rpc); - rpc_clnt_reconnect(conn); - - return 0; + return rpc_clnt_start(rpc); } int diff --git a/rpc/rpc-lib/src/rpc-clnt.h b/rpc/rpc-lib/src/rpc-clnt.h index b46feed50c8..2c252d5ff86 100644 --- a/rpc/rpc-lib/src/rpc-clnt.h +++ b/rpc/rpc-lib/src/rpc-clnt.h @@ -136,6 +136,7 @@ struct rpc_clnt_connection { struct saved_frames *saved_frames; struct timespec last_sent; struct timespec last_received; + struct timespec reconnect_delay; uint64_t pingcnt; uint64_t msgcnt; uint64_t cleanup_gen; -- cgit