summary | refs | log | tree | commit | diff | stats
path: root/extras/ganesha/ocf/ganesha_nfsd
diff options
context:
space:
mode:
author: Kaleb S KEITHLEY <kkeithle@redhat.com> 2015-12-14 09:24:57 -0500
committer: Kaleb KEITHLEY <kkeithle@redhat.com> 2016-03-14 21:34:27 -0700
commit: 40a24f5ab917863d1549508ae9cf31085955d174 (patch)
tree: 04568d9bb2c39b6ad9abc5059d298b3457a65f6a /extras/ganesha/ocf/ganesha_nfsd
parent: 7240923a55eab2fc237d951ec0d89c51676a2aae (diff)
common-ha: reliable grace using pacemaker notify actions
Using *-dead_ip-1 resources to track on which nodes the ganesha.nfsd had died was found to be unreliable. Running `pcs status` in the ganesha_grace monitor action was seen to time out during failover; the HA devs opined that it was, generally, not a good idea to run `pcs status` in a monitor action in any event. They suggested using the notify feature, where the resources on all the nodes are notified when a clone resource agent dies. This change adds a notify action to the ganesha_grace RA. The ganesha_mon RA monitors its ganesha.nfsd daemon. While the daemon is running, it creates two attributes: ganesha-active and grace-active. When the daemon stops for any reason, the attributes are deleted. Deleting the ganesha-active attribute triggers the failover of the virtual IP (the IPaddr RA) to another node where ganesha.nfsd is still running. The ganesha_grace RA monitors the grace-active attribute. When the grace-active attribute is deleted, the ganesha_grace RA stops, and will not restart. This causes pacemaker to trigger the notify action in the ganesha_grace RAs on the other nodes in the cluster, which send a DBUS message to their ganesha.nfsd. (N.B. grace-active is a bit of a misnomer. While the grace-active attribute exists, everything is normal and healthy. Deleting the attribute triggers putting the surviving ganesha.nfsds into GRACE.) To ensure that the remaining/surviving ganesha.nfsds are put into NFS-GRACE before the IPaddr (virtual IP) fails over, there is a short delay (sleep) between deleting the grace-active attribute and the ganesha-active attribute. To summarize: 1. on node 2 ganesha_mon:monitor notices that ganesha.nfsd has died 2. on node 2 ganesha_mon:monitor deletes its grace-active attribute 3. on node 2 ganesha_grace:monitor notices that grace-active is gone and returns OCF_ERR_GENERIC, a.k.a. new error. When pacemaker tries to (re)start ganesha_grace, its start action will return OCF_NOT_RUNNING, a.k.a.
known error, don't attempt further restarts. 4. on nodes 1, 3, etc., ganesha_grace:notify receives a post-stop notification indicating that node 2 is gone, and sends a DBUS message to its ganesha.nfsd putting it into NFS-GRACE. 5. on node 2 ganesha_mon:monitor waits a short period, then deletes its ganesha-active attribute. This triggers the IPaddr (virt IP) failover according to constraint location rules. ganesha_nfsd is modified to run for the duration: the start action is invoked to set up the /var/lib/nfs symlink, and the stop action is invoked to restore it. ganesha-ha.sh is modified accordingly to create it as a clone resource. BUG: 1290865 Change-Id: I1ba24f38fa4338b3aeb17c65645e9f439387ff57 Signed-off-by: Kaleb S KEITHLEY <kkeithle@redhat.com> Reviewed-on: http://review.gluster.org/12964 Smoke: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> Reviewed-on: http://review.gluster.org/13725
Diffstat (limited to 'extras/ganesha/ocf/ganesha_nfsd')
-rw-r--r-- extras/ganesha/ocf/ganesha_nfsd | 89
1 files changed, 47 insertions, 42 deletions
diff --git a/extras/ganesha/ocf/ganesha_nfsd b/extras/ganesha/ocf/ganesha_nfsd
index e064183daef..a9d3e4d860f 100644
--- a/extras/ganesha/ocf/ganesha_nfsd
+++ b/extras/ganesha/ocf/ganesha_nfsd
@@ -29,13 +29,16 @@
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
-if [ -n "$OCF_DEBUG_LIBRARY" ]; then
- . $OCF_DEBUG_LIBRARY
+if [ -n "${OCF_DEBUG_LIBRARY}" ]; then
+ . ${OCF_DEBUG_LIBRARY}
else
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
fi
+OCF_RESKEY_ha_vol_mnt_default="/var/run/gluster/shared_storage"
+: ${OCF_RESKEY_ha_vol_mnt=${OCF_RESKEY_ha_vol_mnt_default}}
+
ganesha_meta_data() {
cat <<END
<?xml version="1.0"?>
@@ -59,16 +62,16 @@ resource agent for nfs-ganesha.
</parameters>
<actions>
-<action name="start" timeout="40s" />
-<action name="stop" timeout="40s" />
-<action name="status" depth="0" timeout="20s" interval="1m" />
-<action name="monitor" depth="0" timeout="10s" interval="1m" />
+<action name="start" timeout="5s" />
+<action name="stop" timeout="5s" />
+<action name="status" depth="0" timeout="5s" interval="0" />
+<action name="monitor" depth="0" timeout="5s" interval="0" />
<action name="meta-data" timeout="20s" />
</actions>
</resource-agent>
END
-return $OCF_SUCCESS
+return ${OCF_SUCCESS}
}
ganesha_nfsd_usage() {
@@ -78,10 +81,10 @@ ganesha_nfsd_usage() {
# Make sure meta-data and usage always succeed
case $__OCF_ACTION in
meta-data) ganesha_meta_data
- exit $OCF_SUCCESS
+ exit ${OCF_SUCCESS}
;;
usage|help) ganesha_usage
- exit $OCF_SUCCESS
+ exit ${OCF_SUCCESS}
;;
*)
;;
@@ -89,58 +92,60 @@ esac
ganesha_nfsd_start()
{
- return $OCF_SUCCESS
+ local long_host=$(hostname)
+
+ if [[ -d /var/lib/nfs ]]; then
+ mv /var/lib/nfs /var/lib/nfs.backup
+ if [ $? -ne 0 ]; then
+ ocf_log notice "mv /var/lib/nfs /var/lib/nfs.backup failed"
+ fi
+ ln -s ${OCF_RESKEY_ha_vol_mnt}/nfs-ganesha/${long_host}/nfs /var/lib/nfs
+ if [ $? -ne 0 ]; then
+ ocf_log notice "ln -s ${OCF_RESKEY_ha_vol_mnt}/nfs-ganesha/${long_host}/nfs /var/lib/nfs failed"
+ fi
+ fi
+
+ return ${OCF_SUCCESS}
}
ganesha_nfsd_stop()
{
- local short_host=$(hostname -s)
- local long_host=""
-
- if [ "X${OCF_RESOURCE_INSTANCE:0:9}X" = "Xnfs_startX" ]; then
-
- # if this is any nfs_start, go ahead. worst case we
- # find the link already exists and do nothing
- long_host=$(hostname)
-
- if [ -d /var/lib/nfs ]; then
- mv /var/lib/nfs /var/lib/nfs.backup
- ln -s $OCF_RESKEY_ha_vol_mnt/nfs-ganesha/${long_host}/nfs /var/lib/nfs
- if [ $? -ne 0 ]; then
- logger "warning: ln -s $OCF_RESKEY_ha_vol_mnt/nfs-ganesha/${long_host}/nfs /var/lib/nfs failed"
- fi
+ if [ -L /var/lib/nfs -a -d /var/lib/nfs.backup ]; then
+ rm -f /var/lib/nfs
+ if [ $? -ne 0 ]; then
+ ocf_log notice "rm -f /var/lib/nfs failed"
fi
- else
-
- # if this is a clone resource or is specific to this node
- # remove the symlink and restore /var/lib/nfs
-
- if [ "X${OCF_RESOURCE_INSTANCE}X" = "Xnfs_stopX" ] ||
- [ "X${OCF_RESOURCE_INSTANCE}X" = "Xnfs_stop-${short_host}X" ]; then
- if [ -L /var/lib/nfs -a -d /var/lib/nfs.backup ]; then
- rm -f /var/lib/nfs
- mv /var/lib/nfs.backup /var/lib/nfs
- fi
+ mv /var/lib/nfs.backup /var/lib/nfs
+ if [ $? -ne 0 ]; then
+ ocf_log notice "mv /var/lib/nfs.backup /var/lib/nfs failed"
fi
fi
- return $OCF_SUCCESS
+ return ${OCF_SUCCESS}
}
ganesha_nfsd_monitor()
{
- return $OCF_SUCCESS
+ # pacemaker checks to see if RA is already running before starting it.
+ # if we return success, then it's presumed it's already running and
+ # doesn't need to be started, i.e. invoke the start action.
+ # return something other than success to make pacemaker invoke the
+ # start action
+ if [[ -L /var/lib/nfs ]]; then
+ return ${OCF_SUCCESS}
+ fi
+ return ${OCF_NOT_RUNNING}
}
ganesha_nfsd_validate()
{
- return $OCF_SUCCESS
+ return ${OCF_SUCCESS}
}
ganesha_nfsd_validate
-# logger "ganesha_nfsd ${OCF_RESOURCE_INSTANCE} $__OCF_ACTION"
+# ocf_log notice "ganesha_nfsd ${OCF_RESOURCE_INSTANCE} $__OCF_ACTION"
# Translate each action into the appropriate function call
case $__OCF_ACTION in
@@ -151,13 +156,13 @@ stop) ganesha_nfsd_stop
status|monitor) ganesha_nfsd_monitor
;;
*) ganesha_nfsd_usage
- exit $OCF_ERR_UNIMPLEMENTED
+ exit ${OCF_ERR_UNIMPLEMENTED}
;;
esac
rc=$?
# The resource agent may optionally log a debug message
-ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc"
+ocf_log debug "${OCF_RESOURCE_INSTANCE} ${__OCF_ACTION} returned $rc"
exit $rc