From 3e255fb10f404dbc0d3add7164c95b0721231312 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 3 Dec 2012 10:45:04 -0500 Subject: afr: use data trylock mode in read/write self-heal trigger paths Self-heal data lock contention between clients and glustershd instances can lead to long wait and user response times if the client's lock request ends up pending behind a glustershd self-heal of a large file. We have reports of guest VM instances going completely unresponsive during self-heal of virtual disk images. Optimize the read/write self-heal trigger codepath (i.e., afr_open_fd_fix()) to trylock for self-heal and skip the self-heal otherwise to minimize the likelihood of a running/active guest competing with glustershd on arrival of a brick. Note that lock contention is still possible from the client (e.g., via lookup). BUG: 874045 Change-Id: I077e2c0aaa424b80734a471284173bda8871cdc3 Signed-off-by: Brian Foster Reviewed-on: https://code.engineering.redhat.com/gerrit/1911 Reviewed-by: Vijay Bellur Tested-by: Vijay Bellur --- xlators/cluster/afr/src/afr-self-heal-data.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index d2a205ee4ab..29951537eee 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1296,6 +1296,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_private_t *priv = NULL; int call_count = 0; int child_index = 0; + gf_boolean_t block = _gf_true; local = frame->local; sh = &local->self_heal; @@ -1337,7 +1338,13 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, + /* + * The read and write self-heal trigger codepaths do not provide + * an unwind callback. 
We run a trylock in these codepaths + * because we are sensitive to locking latency. + */ + block = sh->unwind ? _gf_true : _gf_false; + afr_sh_data_lock (frame, this, 0, 0, block, afr_sh_data_big_lock_success, afr_sh_data_fail); } -- cgit