From a3f90eeb0ad97e4f86aef603f95b0562ab18f36d Mon Sep 17 00:00:00 2001 From: Pranith K Date: Fri, 1 Oct 2010 08:00:12 +0000 Subject: mgmt/glusterd: handle glusterfs crashes for start/stop of bricks Signed-off-by: Pranith Kumar K Signed-off-by: Vijay Bellur BUG: 1741 (gluster volume stop after one process crashed.) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1741 --- xlators/mgmt/glusterd/src/glusterd-utils.c | 127 +++++++++++++++-------------- 1 file changed, 66 insertions(+), 61 deletions(-) (limited to 'xlators/mgmt/glusterd/src/glusterd-utils.c') diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 5b5d98750d0..3fa965c8800 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -763,6 +763,7 @@ glusterd_service_stop (const char *service, char *pidfile, int sig, int32_t ret = -1; pid_t pid = -1; FILE *file = NULL; + gf_boolean_t is_locked = _gf_false; file = fopen (pidfile, "r+"); @@ -778,6 +779,17 @@ glusterd_service_stop (const char *service, char *pidfile, int sig, ret = -1; goto out; } + ret = lockf (fileno (file), F_TLOCK, 0); + if (!ret) { + is_locked = _gf_true; + ret = unlink (pidfile); + if (ret && (ENOENT != errno)) { + gf_log ("", GF_LOG_ERROR, "Unable to " + "unlink stale pidfile: %s", pidfile); + } + goto out; + } + ret = fscanf (file, "%d", &pid); if (ret <= 0) { @@ -796,8 +808,13 @@ glusterd_service_stop (const char *service, char *pidfile, int sig, if (force_kill) { sleep (1); - ret = access (pidfile, F_OK); - if (!ret) { + file = fopen (pidfile, "r+"); + if (!file) { + ret = 0; + goto out; + } + ret = lockf (fileno (file), F_TLOCK, 0); + if (ret && ((EAGAIN == errno) || (EACCES == errno))) { ret = kill (pid, SIGKILL); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to " @@ -805,17 +822,22 @@ glusterd_service_stop (const char *service, char *pidfile, int sig, strerror(errno)); goto out; } - ret = unlink (pidfile); - if (ret && (ENOENT != errno)) { - gf_log ("", GF_LOG_ERROR, "Unable to " - "unlink pidfile: %s", pidfile); - goto out; - } + + } else if (0 == ret){ + is_locked = _gf_true; + } + ret = unlink (pidfile); + if (ret && (ENOENT != errno)) { + gf_log ("", GF_LOG_ERROR, "Unable to " + "unlink pidfile: %s", pidfile); + goto out; } } ret = 0; out: + if (is_locked && file) + lockf (fileno (file), F_ULOCK, 0); if (file) fclose (file); return ret; @@ -836,6 +858,8 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, char exp_path[PATH_MAX] = {0,}; char logfile[PATH_MAX] = {0,}; int port = 0; + FILE *file = NULL; + gf_boolean_t is_locked = _gf_false; GF_ASSERT (volinfo); GF_ASSERT (brickinfo); @@ -855,13 +879,31 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, goto out; } - port = brickinfo->port; - if (!port) - port = pmap_registry_alloc (THIS); - GLUSTERD_GET_BRICK_PIDFILE (pidfile, path, brickinfo->hostname, brickinfo->path); + ret = pmap_registry_search (this, brickinfo->path, + GF_PMAP_PORT_BRICKSERVER); + if (ret) { + ret = 0; + file = fopen (pidfile, "r+"); + if (file) { + ret = lockf (fileno (file), F_TLOCK, 0); + if (ret && ((EAGAIN == errno) || (EACCES == errno))) { + ret = 0; + gf_log ("", GF_LOG_NORMAL, "brick %s:%s " + "already started", brickinfo->hostname, + brickinfo->path); + goto out; + } else if (0 == ret) { + is_locked = _gf_true; + } + } + } + unlink (pidfile); + gf_log ("", GF_LOG_NORMAL, "About to start glusterfs" + " for brick %s:%s", brickinfo->hostname, + brickinfo->path); GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname, brickinfo->hostname, exp_path); @@ -872,6 +914,10 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, brickinfo->logfile = gf_strdup (logfile); } + port = brickinfo->port; + if (!port) + port = pmap_registry_alloc (THIS); + snprintf (cmd_str, 8192, "%s/sbin/glusterfsd --xlator-option %s-server.listen-port=%d " "-s localhost --volfile-id %s -p %s --brick-name %s " @@ -887,6 +933,10 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, brickinfo->port = port; } out: + if (is_locked && file) + lockf (fileno (file), F_ULOCK, 0); + if (file) + fclose (file); return ret; } @@ -1750,9 +1800,6 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo, int ret = -1; xlator_t *this = NULL; glusterd_conf_t *conf = NULL; - char path[PATH_MAX] = {0,}; - char pidfile[PATH_MAX] = {0,}; - struct stat stbuf = {0,}; if ((!brickinfo) || (!volinfo)) goto out; @@ -1776,47 +1823,13 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo, ret = 0; goto out; } - - if (!glusterd_is_brick_started (brickinfo)) { - gf_log ("", GF_LOG_DEBUG, "brick: %s:%s, of volume: %s already" - " started", brickinfo->hostname, brickinfo->path, - volinfo->volname); - ret = 0; + ret = glusterd_volume_start_glusterfs (volinfo, brickinfo); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to start " + "glusterfs, ret: %d", ret); goto out; } - GLUSTERD_GET_VOLUME_DIR (path, volinfo, conf); - GLUSTERD_GET_BRICK_PIDFILE (pidfile, path, brickinfo->hostname, - brickinfo->path); - ret = stat (pidfile, &stbuf); - if (ret && errno == ENOENT) { - gf_log ("", GF_LOG_NORMAL, "About to start glusterfs" - " for brick %s:%s", brickinfo->hostname, - brickinfo->path); - ret = glusterd_volume_start_glusterfs (volinfo, brickinfo); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to start " - "glusterfs, ret: %d", ret); - goto out; - } - } else if (!ret) { - ret = pmap_registry_search (this, brickinfo->path, - GF_PMAP_PORT_BRICKSERVER); - if (ret) { - ret = 0; - goto out; - } - ret = unlink (pidfile); - gf_log ("", GF_LOG_NORMAL, "About to start glusterfs" - " for brick %s:%s", brickinfo->hostname, - brickinfo->path); - ret = glusterd_volume_start_glusterfs (volinfo, brickinfo); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to start " - "glusterfs, ret: %d", ret); - goto out; - } - } out: gf_log ("", GF_LOG_DEBUG, "returning %d ", ret); @@ -2095,14 +2108,6 @@ glusterd_brick_stop (glusterd_volinfo_t *volinfo, goto out; } - if (glusterd_is_brick_started (brickinfo)) { - gf_log ("", GF_LOG_DEBUG, "brick: %s:%s, of volume: %s not" - " started", brickinfo->hostname, brickinfo->path, - volinfo->volname); - ret = 0; - goto out; - } - gf_log ("", GF_LOG_NORMAL, "About to stop glusterfs" " for brick %s:%s", brickinfo->hostname, brickinfo->path); -- cgit