From e8121c4afb3680f532b450872b5a3ffcb3766a97 Mon Sep 17 00:00:00 2001 From: Kaleb S KEITHLEY Date: Mon, 14 Dec 2015 09:24:57 -0500 Subject: common-ha: reliable grace using pacemaker notify actions Using *-dead_ip-1 resources to track on which nodes the ganesha.nfsd had died was found to be unreliable. Running `pcs status` in the ganesha_grace monitor action was seen to time out during failover; the HA devs opined that it was, generally, not a good idea to run `pcs status` in a monitor action in any event. They suggested using the notify feature, where the resources on all the nodes are notified when a clone resource agent dies. This change adds a notify action to the ganesha_grace RA. The ganesha_mon RA monitors its ganesha.nfsd daemon. While the daemon is running, it creates two attributes: ganesha-active and grace-active. When the daemon stops for any reason, the attributes are deleted. Deleting the ganesha-active attribute triggers the failover of the virtual IP (the IPaddr RA) to another node where ganesha.nfsd is still running. The ganesha_grace RA monitors the grace-active attribute. When the grace-active attribute is deleted, the ganesha_grace RA stops, and will not restart. This triggers pacemaker to trigger the notify action in the ganesha_grace RAs on the other nodes in the cluster; which send a DBUS message to their ganesha.nfsd. (N.B. grace-active is a bit of a misnomer. While the grace-active attribute exists, everything is normal and healthy. Deleting the attribute triggers putting the surviving ganesha.nfsds into GRACE.) To ensure that the remaining/surviving ganesha.nfsds are put into NFS-GRACE before the IPaddr (virtual IP) fails over there is a short delay (sleep) between deleting the grace-active attribute and the ganesha-active attribute. To summarize: 1. on node 2 ganesha_mon:monitor notices that ganesha.nfsd has died 2. on node 2 ganesha_mon:monitor deletes its grace-active attribute 3. 
on node 2 ganesha_grace:monitor notices that grace-active is gone and returns OCF_ERR_GENERIC, a.k.a. new error. When pacemaker tries to (re)start ganesha_grace, its start action will return OCF_NOT_RUNNING, a.k.a. known error, don't attempt further restarts. 4. on nodes 1, 3, etc., ganesha_grace:notify receives a post-stop notification indicating that node 2 is gone, and sends a DBUS message to its ganesha.nfsd putting it into NFS-GRACE. 5. on node 2 ganesha_mon:monitor waits a short period, then deletes its ganesha-active attribute. This triggers the IPaddr (virt IP) failover according to constraint location rules. ganesha_nfsd modified to run for the duration, start action is invoked to setup the /var/lib/nfs symlink, stop action is invoked to restore it. ganesha-ha.sh modified accordingly to create it as a clone resource. Change-Id: Iad60b0c5222bbd55ef95c8b8f955e791caa3ffd0 BUG: 1290865 Signed-off-by: Kaleb S KEITHLEY Reviewed-on: http://review.gluster.org/12964 Smoke: Gluster Build System NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System --- extras/ganesha/scripts/ganesha-ha.sh | 207 +++++++---------------------------- 1 file changed, 42 insertions(+), 165 deletions(-) (limited to 'extras/ganesha/scripts') diff --git a/extras/ganesha/scripts/ganesha-ha.sh b/extras/ganesha/scripts/ganesha-ha.sh index 149733fcbd3..7bb7d0d81f6 100644 --- a/extras/ganesha/scripts/ganesha-ha.sh +++ b/extras/ganesha/scripts/ganesha-ha.sh @@ -170,8 +170,8 @@ setup_cluster() logger "setting up cluster ${name} with the following ${servers}" pcs cluster auth ${servers} -# fedora pcs cluster setup ${name} ${servers} -# rhel6 pcs cluster setup --name ${name} ${servers} + # fedora pcs cluster setup ${name} ${servers} + # rhel6 pcs cluster setup --name ${name} ${servers} pcs cluster setup ${RHEL6_PCS_CNAME_OPTION} ${name} ${servers} if [ $? 
-ne 0 ]; then logger "pcs cluster setup ${RHEL6_PCS_CNAME_OPTION} ${name} ${servers} failed" @@ -204,7 +204,7 @@ setup_cluster() } -setup_finalize() +setup_finalize_ha() { local cibfile=${1} local stopped="" @@ -215,7 +215,7 @@ setup_finalize() stopped=$(pcs status | grep -u "Stopped") done - pcs status | grep dead_ip-1 | sort > /var/run/ganesha/pcs_status + # pcs resource cleanup } @@ -293,7 +293,7 @@ string:\"EXPORT(Path=/$VOL)\" 2>&1") exit 1 fi -#Run the same command on the localhost, + # Run the same command on the localhost, output=$(dbus-send --print-reply --system --dest=org.ganesha.nfsd \ /org/ganesha/nfsd/ExportMgr org.ganesha.nfsd.exportmgr.RemoveExport \ uint16:$removed_id 2>&1) @@ -374,13 +374,13 @@ teardown_cluster() fi done -# BZ 1193433 - pcs doesn't reload cluster.conf after modification -# after teardown completes, a subsequent setup will appear to have -# 'remembered' the deleted node. You can work around this by -# issuing another `pcs cluster node remove $node`, -# `crm_node -f -R $server`, or -# `cibadmin --delete --xml-text '' + # BZ 1193433 - pcs doesn't reload cluster.conf after modification + # after teardown completes, a subsequent setup will appear to have + # 'remembered' the deleted node. You can work around this by + # issuing another `pcs cluster node remove $node`, + # `crm_node -f -R $server`, or + # `cibadmin --delete --xml-text '' pcs cluster stop --all if [ $? -ne 0 ]; then @@ -480,28 +480,26 @@ setup_create_resources() { local cibfile=$(mktemp -u) - # mount the HA-state volume and start ganesha.nfsd on all nodes - pcs resource create nfs_start ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone + # fixup /var/lib/nfs + logger "pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone" + pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone if [ $? 
-ne 0 ]; then - logger "warning: pcs resource create nfs_start ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" + logger "warning: pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" fi - sleep 1 - # cloned resources seem to never have their start() invoked when they - # are created, but stop() is invoked when they are destroyed. Why???. - # No matter, we don't want this resource agent hanging around anyway - pcs resource delete nfs_start-clone + + pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_start-clone failed" + logger "warning: pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone failed" fi - pcs resource create nfs-mon ganesha_mon --clone + pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone meta notify=true if [ $? -ne 0 ]; then - logger "warning: pcs resource create nfs-mon ganesha_mon --clone failed" + logger "warning: pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone failed" fi - pcs resource create nfs-grace ganesha_grace --clone + pcs constraint location nfs-grace-clone rule score=-INFINITY grace-active ne 1 if [ $? -ne 0 ]; then - logger "warning: pcs resource create nfs-grace ganesha_grace --clone failed" + logger "warning: pcs constraint location nfs-grace-clone rule score=-INFINITY grace-active ne 1" fi pcs cluster cib ${cibfile} @@ -531,21 +529,6 @@ setup_create_resources() logger "warning pcs resource create ${1}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${ipaddr} cidr_netmask=32 op monitor interval=15s failed" fi - pcs -f ${cibfile} resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy - if [ $? -ne 0 ]; then - logger "warning: pcs resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy failed" - fi - - pcs -f ${cibfile} constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 - if [ $? 
-ne 0 ]; then - logger "warning: pcs constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 failed" - fi - - pcs -f ${cibfile} constraint order ${1}-trigger_ip-1 then nfs-grace-clone - if [ $? -ne 0 ]; then - logger "warning: pcs constraint order ${1}-trigger_ip-1 then nfs-grace-clone failed" - fi - pcs -f ${cibfile} constraint order nfs-grace-clone then ${1}-cluster_ip-1 if [ $? -ne 0 ]; then logger "warning: pcs constraint order nfs-grace-clone then ${1}-cluster_ip-1 failed" @@ -568,6 +551,13 @@ teardown_resources() { # local mntpt=$(grep ha-vol-mnt ${HA_CONFIG_FILE} | cut -d = -f 2) + # restore /var/lib/nfs + logger "notice: pcs resource delete nfs_setup-clone" + pcs resource delete nfs_setup-clone + if [ $? -ne 0 ]; then + logger "warning: pcs resource delete nfs_setup-clone failed" + fi + # delete -clone resource agents # in particular delete the ganesha monitor so we don't try to # trigger anything when we shut down ganesha next. @@ -581,32 +571,11 @@ teardown_resources() logger "warning: pcs resource delete nfs-grace-clone failed" fi - # unmount the HA-state volume and terminate ganesha.nfsd on all nodes - pcs resource create nfs_stop ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone - if [ $? -ne 0 ]; then - logger "warning: pcs resource create nfs_stop ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" - fi - sleep 1 - # cloned resources seem to never have their start() invoked when they - # are created, but stop() is invoked when they are destroyed. Why???. - pcs resource delete nfs_stop-clone - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_stop-clone failed" - fi - while [[ ${1} ]]; do pcs resource delete ${1}-cluster_ip-1 if [ $? -ne 0 ]; then logger "warning: pcs resource delete ${1}-cluster_ip-1 failed" fi - pcs resource delete ${1}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete ${1}-trigger_ip-1 failed" - fi - pcs resource delete ${1}-dead_ip-1 - if [ $? 
-ne 0 ]; then - logger "info: pcs resource delete ${1}-dead_ip-1 failed" - fi shift done @@ -633,21 +602,6 @@ recreate_resources() logger "warning pcs resource create ${1}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${ipaddr} cidr_netmask=32 op monitor interval=10s failed" fi - pcs -f ${cibfile} resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy - if [ $? -ne 0 ]; then - logger "warning: pcs resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy failed" - fi - - pcs -f ${cibfile} constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 failed" - fi - - pcs -f ${cibfile} constraint order ${1}-trigger_ip-1 then nfs-grace-clone - if [ $? -ne 0 ]; then - logger "warning: pcs constraint order ${1}-trigger_ip-1 then nfs-grace-clone failed" - fi - pcs -f ${cibfile} constraint order nfs-grace-clone then ${1}-cluster_ip-1 if [ $? -ne 0 ]; then logger "warning: pcs constraint order nfs-grace-clone then ${1}-cluster_ip-1 failed" @@ -671,21 +625,6 @@ addnode_recreate_resources() logger "warning pcs resource create ${add_node}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${add_vip} cidr_netmask=32 op monitor interval=10s failed" fi - pcs -f ${cibfile} resource create ${add_node}-trigger_ip-1 ocf:heartbeat:Dummy - if [ $? -ne 0 ]; then - logger "warning: pcs resource create ${add_node}-trigger_ip-1 ocf:heartbeat:Dummy failed" - fi - - pcs -f ${cibfile} constraint colocation add ${add_node}-cluster_ip-1 with ${add_node}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs constraint colocation add ${add_node}-cluster_ip-1 with ${add_node}-trigger_ip-1 failed" - fi - - pcs -f ${cibfile} constraint order ${add_node}-trigger_ip-1 then nfs-grace-clone - if [ $? -ne 0 ]; then - logger "warning: pcs constraint order ${add_node}-trigger_ip-1 then nfs-grace-clone failed" - fi - pcs -f ${cibfile} constraint order nfs-grace-clone then ${add_node}-cluster_ip-1 if [ $? 
-ne 0 ]; then logger "warning: pcs constraint order nfs-grace-clone then ${add_node}-cluster_ip-1 failed" @@ -703,11 +642,6 @@ clear_resources() logger "warning: pcs -f ${cibfile} resource delete ${1}-cluster_ip-1" fi - pcs -f ${cibfile} resource delete ${1}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} resource delete ${1}-trigger_ip-1" - fi - shift done } @@ -719,52 +653,19 @@ addnode_create_resources() local add_vip=${1}; shift local cibfile=$(mktemp -u) - # mount the HA-state volume and start ganesha.nfsd on the new node - pcs cluster cib ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib ${cibfile} failed" - fi - - pcs -f ${cibfile} resource create nfs_start-${add_node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} resource create nfs_start-${add_node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} failed" - fi - - pcs -f ${cibfile} constraint location nfs_start-${add_node} prefers ${add_node}=INFINITY - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} constraint location nfs_start-${add_node} prefers ${add_node}=INFINITY failed" - fi - - pcs -f ${cibfile} constraint order nfs_start-${add_node} then nfs-mon-clone - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} constraint order nfs_start-${add_node} then nfs-mon-clone failed" - fi - - pcs cluster cib-push ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib-push ${cibfile} failed" - fi - rm -f ${cibfile} - # start HA on the new node pcs cluster start ${add_node} if [ $? -ne 0 ]; then logger "warning: pcs cluster start ${add_node} failed" fi - pcs resource delete nfs_start-${add_node} - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_start-${add_node} failed" - fi - - pcs cluster cib ${cibfile} if [ $? 
-ne 0 ]; then logger "warning: pcs cluster cib ${cibfile} failed" fi - # delete all the -cluster_ip-1 and -trigger_ip-1 resources, - # clearing their constraints, then create them again so we can + # delete all the -cluster_ip-1 resources, clearing + # their constraints, then create them again so we can # recompute their constraints clear_resources ${cibfile} ${HA_SERVERS} addnode_recreate_resources ${cibfile} ${add_node} ${add_vip} @@ -806,31 +707,6 @@ deletenode_delete_resources() fi rm -f ${cibfile} - pcs cluster cib ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib ${cibfile} failed" - fi - - pcs -f ${cibfile} resource create nfs_stop-${node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} resource create nfs_stop-${node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} failed" - fi - - pcs -f ${cibfile} constraint location nfs_stop-${node} prefers ${node}=INFINITY - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} constraint location nfs_stop-${node} prefers ${node}=INFINITY failed" - fi - - pcs cluster cib-push ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib-push ${cibfile} failed" - fi - rm -f ${cibfile} - - pcs resource delete nfs_stop-${node} - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_stop-${node} failed" - fi } @@ -974,11 +850,12 @@ main() setup_create_resources ${HA_SERVERS} + setup_finalize_ha + setup_state_volume ${HA_SERVERS} setup_copy_config ${HA_SERVERS} - setup_finalize else logger "insufficient servers for HA, aborting" @@ -1019,15 +896,15 @@ main() fi addnode_create_resources ${node} ${vip} - #Subsequent add-node recreates resources for all the nodes - #that already exist in the cluster. The nodes are picked up - #from the entries in the ganesha-ha.conf file. Adding the - #newly added node to the file so that the resources specfic - #to this node is correctly recreated in the future. 
+ # Subsequent add-node recreates resources for all the nodes + # that already exist in the cluster. The nodes are picked up + # from the entries in the ganesha-ha.conf file. Adding the + # newly added node to the file so that the resources specfic + # to this node is correctly recreated in the future. clean_node=${node//[-.]/_} echo "VIP_$clean_node=\"${vip}\"" >> ${HA_CONFDIR}/ganesha-ha.conf - NEW_NODES="$HA_CLUSTER_NODES,$node" + NEW_NODES="$HA_CLUSTER_NODES,${node}" sed -i s/HA_CLUSTER_NODES.*/"HA_CLUSTER_NODES=\"$NEW_NODES\""/ \ $HA_CONFDIR/ganesha-ha.conf @@ -1054,7 +931,7 @@ $HA_CONFDIR/ganesha-ha.conf setup_copy_config ${HA_SERVERS} - rm -rf ${HA_VOL_MNT}/nfs-ganesha/{node} + rm -rf ${HA_VOL_MNT}/nfs-ganesha/${node} determine_service_manager @@ -1075,7 +952,7 @@ $HA_CONFDIR/ganesha-ha.conf refresh_config ${VOL} ${HA_CONFDIR} ${HA_SERVERS} ;; - *) + *) # setup and teardown are not intended to be used by a # casual user usage -- cgit