summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKotresh HR <khiremat@redhat.com>2014-10-03 17:35:47 +0530
committerVenky Shankar <vshankar@redhat.com>2014-11-12 04:28:02 -0800
commitbeedf68266f19ac77b77f2ec5f9533f3e63c159f (patch)
tree69db2ab739e9175aedc140af6ad7c1294645d2d5
parent1ffdf112f707a13c9fd74bbf17f99d28f84f0f0c (diff)
glusterd/geo-rep: Fix race in updating status file
When geo-rep is in paused state and a node in a cluster is rebooted, the geo-rep status goes to "faulty (Paused)" and no worker processes are started on that node yet. In this state, when geo-rep is resumed, there is a race in updating the status file between glusterd and gsyncd itself, because geo-rep is resumed first and the status is updated only afterwards. glusterd tries to update the status to the previous state, while gsyncd tries to update it to "Initializing...(Paused)" on restart, as it was paused previously. If gsyncd on restart wins, the state is always paused but the process is not actually paused. So the solution is for glusterd to update the status file first and then resume. BUG: 1159195 Change-Id: I4c06f42226db98f5a3c49b90f31ecf6cf2b6d0cb Reviewed-on: http://review.gluster.org/8911 Signed-off-by: Kotresh HR <khiremat@redhat.com> Reviewed-on: http://review.gluster.org/9021 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Venky Shankar <vshankar@redhat.com> Tested-by: Venky Shankar <vshankar@redhat.com>
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c44
1 files changed, 26 insertions, 18 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
index 74d28a71898..a160314134e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
+++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
@@ -2821,32 +2821,40 @@ gd_pause_or_resume_gsync (dict_t *dict, char *master, char *slave,
goto out;
}
} else {
- ret = kill (-pid, SIGCONT);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to resume gsyncd. Error: %s",
- strerror (errno));
- goto out;
- }
token = strtok (monitor_status, "(");
ret = glusterd_create_status_file (master, slave,
slave_host,
slave_vol, token);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "Unable to update state_file."
- " Error : %s", strerror (errno));
- /* If status cannot be updated pause back */
- if (kill (-pid, SIGSTOP)) {
+ "Resume Failed: Unable to update "
+ "state_file. Error : %s",
+ strerror (errno));
+ goto out;
+ }
+ ret = kill (-pid, SIGCONT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Resumed Failed: Unable to send"
+ " SIGCONT. Error: %s",
+ strerror (errno));
+ /* Process can't be resumed, update status
+ * back to paused. */
+ ret = glusterd_create_status_file (master,
+ slave,
+ slave_host,
+ slave_vol,
+ monitor_status);
+ if (ret) {
snprintf (errmsg, sizeof(errmsg),
- "Resume successful but could "
- "not update status file."
- " Please use 'pause force' to"
- " pause back and retry resume"
- " to reflect in status");
+ "Resume failed!!! Status "
+ "inconsistent. Please use "
+ "'resume force' to resume and"
+ " reach consistent state");
gf_log (this->name, GF_LOG_ERROR,
- "Pause back Failed. Error: %s",
- strerror (errno));
+ "Updating status back to paused"
+ " Failed. Error: %s",
+ strerror (errno));
*op_errstr = gf_strdup (errmsg);
}
goto out;