From e8121c4afb3680f532b450872b5a3ffcb3766a97 Mon Sep 17 00:00:00 2001 From: Kaleb S KEITHLEY Date: Mon, 14 Dec 2015 09:24:57 -0500 Subject: common-ha: reliable grace using pacemaker notify actions Using *-dead_ip-1 resources to track on which nodes the ganesha.nfsd had died was found to be unreliable. Running `pcs status` in the ganesha_grace monitor action was seen to time out during failover; the HA devs opined that it was, generally, not a good idea to run `pcs status` in a monitor action in any event. They suggested using the notify feature, where the resources on all the nodes are notified when a clone resource agent dies. This change adds a notify action to the ganesha_grace RA. The ganesha_mon RA monitors its ganesha.nfsd daemon. While the daemon is running, it creates two attributes: ganesha-active and grace-active. When the daemon stops for any reason, the attributes are deleted. Deleting the ganesha-active attribute triggers the failover of the virtual IP (the IPaddr RA) to another node where ganesha.nfsd is still running. The ganesha_grace RA monitors the grace-active attribute. When the grace-active attribute is deleted, the ganesha_grace RA stops, and will not restart. This triggers pacemaker to trigger the notify action in the ganesha_grace RAs on the other nodes in the cluster; which send a DBUS message to their ganesha.nfsd. (N.B. grace-active is a bit of a misnomer. While the grace-active attribute exists, everything is normal and healthy. Deleting the attribute triggers putting the surviving ganesha.nfsds into GRACE.) To ensure that the remaining/surviving ganesha.nfsds are put into NFS-GRACE before the IPaddr (virtual IP) fails over there is a short delay (sleep) between deleting the grace-active attribute and the ganesha-active attribute. To summarize: 1. on node 2 ganesha_mon:monitor notices that ganesha.nfsd has died 2. on node 2 ganesha_mon:monitor deletes its grace-active attribute 3. 
on node 2 ganesha_grace:monitor notices that grace-active is gone and returns OCF_ERR_GENERIC, a.k.a. new error. When pacemaker tries to (re)start ganesha_grace, its start action will return OCF_NOT_RUNNING, a.k.a. known error, don't attempt further restarts. 4. on nodes 1, 3, etc., ganesha_grace:notify receives a post-stop notification indicating that node 2 is gone, and sends a DBUS message to its ganesha.nfsd putting it into NFS-GRACE. 5. on node 2 ganesha_mon:monitor waits a short period, then deletes its ganesha-active attribute. This triggers the IPaddr (virt IP) failover according to constraint location rules. ganesha_nfsd modified to run for the duration, start action is invoked to setup the /var/lib/nfs symlink, stop action is invoked to restore it. ganesha-ha.sh modified accordingly to create it as a clone resource. Change-Id: Iad60b0c5222bbd55ef95c8b8f955e791caa3ffd0 BUG: 1290865 Signed-off-by: Kaleb S KEITHLEY Reviewed-on: http://review.gluster.org/12964 Smoke: Gluster Build System NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System --- extras/ganesha/scripts/ganesha-ha.sh | 207 +++++++---------------------------- 1 file changed, 42 insertions(+), 165 deletions(-) (limited to 'extras/ganesha/scripts') diff --git a/extras/ganesha/scripts/ganesha-ha.sh b/extras/ganesha/scripts/ganesha-ha.sh index 149733fcbd3..7bb7d0d81f6 100644 --- a/extras/ganesha/scripts/ganesha-ha.sh +++ b/extras/ganesha/scripts/ganesha-ha.sh @@ -170,8 +170,8 @@ setup_cluster() logger "setting up cluster ${name} with the following ${servers}" pcs cluster auth ${servers} -# fedora pcs cluster setup ${name} ${servers} -# rhel6 pcs cluster setup --name ${name} ${servers} + # fedora pcs cluster setup ${name} ${servers} + # rhel6 pcs cluster setup --name ${name} ${servers} pcs cluster setup ${RHEL6_PCS_CNAME_OPTION} ${name} ${servers} if [ $? 
-ne 0 ]; then logger "pcs cluster setup ${RHEL6_PCS_CNAME_OPTION} ${name} ${servers} failed" @@ -204,7 +204,7 @@ setup_cluster() } -setup_finalize() +setup_finalize_ha() { local cibfile=${1} local stopped="" @@ -215,7 +215,7 @@ setup_finalize() stopped=$(pcs status | grep -u "Stopped") done - pcs status | grep dead_ip-1 | sort > /var/run/ganesha/pcs_status + # pcs resource cleanup } @@ -293,7 +293,7 @@ string:\"EXPORT(Path=/$VOL)\" 2>&1") exit 1 fi -#Run the same command on the localhost, + # Run the same command on the localhost, output=$(dbus-send --print-reply --system --dest=org.ganesha.nfsd \ /org/ganesha/nfsd/ExportMgr org.ganesha.nfsd.exportmgr.RemoveExport \ uint16:$removed_id 2>&1) @@ -374,13 +374,13 @@ teardown_cluster() fi done -# BZ 1193433 - pcs doesn't reload cluster.conf after modification -# after teardown completes, a subsequent setup will appear to have -# 'remembered' the deleted node. You can work around this by -# issuing another `pcs cluster node remove $node`, -# `crm_node -f -R $server`, or -# `cibadmin --delete --xml-text '' + # BZ 1193433 - pcs doesn't reload cluster.conf after modification + # after teardown completes, a subsequent setup will appear to have + # 'remembered' the deleted node. You can work around this by + # issuing another `pcs cluster node remove $node`, + # `crm_node -f -R $server`, or + # `cibadmin --delete --xml-text '' pcs cluster stop --all if [ $? -ne 0 ]; then @@ -480,28 +480,26 @@ setup_create_resources() { local cibfile=$(mktemp -u) - # mount the HA-state volume and start ganesha.nfsd on all nodes - pcs resource create nfs_start ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone + # fixup /var/lib/nfs + logger "pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone" + pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone if [ $? 
-ne 0 ]; then - logger "warning: pcs resource create nfs_start ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" + logger "warning: pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" fi - sleep 1 - # cloned resources seem to never have their start() invoked when they - # are created, but stop() is invoked when they are destroyed. Why???. - # No matter, we don't want this resource agent hanging around anyway - pcs resource delete nfs_start-clone + + pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_start-clone failed" + logger "warning: pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone failed" fi - pcs resource create nfs-mon ganesha_mon --clone + pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone meta notify=true if [ $? -ne 0 ]; then - logger "warning: pcs resource create nfs-mon ganesha_mon --clone failed" + logger "warning: pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone failed" fi - pcs resource create nfs-grace ganesha_grace --clone + pcs constraint location nfs-grace-clone rule score=-INFINITY grace-active ne 1 if [ $? -ne 0 ]; then - logger "warning: pcs resource create nfs-grace ganesha_grace --clone failed" + logger "warning: pcs constraint location nfs-grace-clone rule score=-INFINITY grace-active ne 1" fi pcs cluster cib ${cibfile} @@ -531,21 +529,6 @@ setup_create_resources() logger "warning pcs resource create ${1}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${ipaddr} cidr_netmask=32 op monitor interval=15s failed" fi - pcs -f ${cibfile} resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy - if [ $? -ne 0 ]; then - logger "warning: pcs resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy failed" - fi - - pcs -f ${cibfile} constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 - if [ $? 
-ne 0 ]; then - logger "warning: pcs constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 failed" - fi - - pcs -f ${cibfile} constraint order ${1}-trigger_ip-1 then nfs-grace-clone - if [ $? -ne 0 ]; then - logger "warning: pcs constraint order ${1}-trigger_ip-1 then nfs-grace-clone failed" - fi - pcs -f ${cibfile} constraint order nfs-grace-clone then ${1}-cluster_ip-1 if [ $? -ne 0 ]; then logger "warning: pcs constraint order nfs-grace-clone then ${1}-cluster_ip-1 failed" @@ -568,6 +551,13 @@ teardown_resources() { # local mntpt=$(grep ha-vol-mnt ${HA_CONFIG_FILE} | cut -d = -f 2) + # restore /var/lib/nfs + logger "notice: pcs resource delete nfs_setup-clone" + pcs resource delete nfs_setup-clone + if [ $? -ne 0 ]; then + logger "warning: pcs resource delete nfs_setup-clone failed" + fi + # delete -clone resource agents # in particular delete the ganesha monitor so we don't try to # trigger anything when we shut down ganesha next. @@ -581,32 +571,11 @@ teardown_resources() logger "warning: pcs resource delete nfs-grace-clone failed" fi - # unmount the HA-state volume and terminate ganesha.nfsd on all nodes - pcs resource create nfs_stop ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone - if [ $? -ne 0 ]; then - logger "warning: pcs resource create nfs_stop ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" - fi - sleep 1 - # cloned resources seem to never have their start() invoked when they - # are created, but stop() is invoked when they are destroyed. Why???. - pcs resource delete nfs_stop-clone - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_stop-clone failed" - fi - while [[ ${1} ]]; do pcs resource delete ${1}-cluster_ip-1 if [ $? -ne 0 ]; then logger "warning: pcs resource delete ${1}-cluster_ip-1 failed" fi - pcs resource delete ${1}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete ${1}-trigger_ip-1 failed" - fi - pcs resource delete ${1}-dead_ip-1 - if [ $? 
-ne 0 ]; then - logger "info: pcs resource delete ${1}-dead_ip-1 failed" - fi shift done @@ -633,21 +602,6 @@ recreate_resources() logger "warning pcs resource create ${1}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${ipaddr} cidr_netmask=32 op monitor interval=10s failed" fi - pcs -f ${cibfile} resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy - if [ $? -ne 0 ]; then - logger "warning: pcs resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy failed" - fi - - pcs -f ${cibfile} constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 failed" - fi - - pcs -f ${cibfile} constraint order ${1}-trigger_ip-1 then nfs-grace-clone - if [ $? -ne 0 ]; then - logger "warning: pcs constraint order ${1}-trigger_ip-1 then nfs-grace-clone failed" - fi - pcs -f ${cibfile} constraint order nfs-grace-clone then ${1}-cluster_ip-1 if [ $? -ne 0 ]; then logger "warning: pcs constraint order nfs-grace-clone then ${1}-cluster_ip-1 failed" @@ -671,21 +625,6 @@ addnode_recreate_resources() logger "warning pcs resource create ${add_node}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${add_vip} cidr_netmask=32 op monitor interval=10s failed" fi - pcs -f ${cibfile} resource create ${add_node}-trigger_ip-1 ocf:heartbeat:Dummy - if [ $? -ne 0 ]; then - logger "warning: pcs resource create ${add_node}-trigger_ip-1 ocf:heartbeat:Dummy failed" - fi - - pcs -f ${cibfile} constraint colocation add ${add_node}-cluster_ip-1 with ${add_node}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs constraint colocation add ${add_node}-cluster_ip-1 with ${add_node}-trigger_ip-1 failed" - fi - - pcs -f ${cibfile} constraint order ${add_node}-trigger_ip-1 then nfs-grace-clone - if [ $? -ne 0 ]; then - logger "warning: pcs constraint order ${add_node}-trigger_ip-1 then nfs-grace-clone failed" - fi - pcs -f ${cibfile} constraint order nfs-grace-clone then ${add_node}-cluster_ip-1 if [ $? 
-ne 0 ]; then logger "warning: pcs constraint order nfs-grace-clone then ${add_node}-cluster_ip-1 failed" @@ -703,11 +642,6 @@ clear_resources() logger "warning: pcs -f ${cibfile} resource delete ${1}-cluster_ip-1" fi - pcs -f ${cibfile} resource delete ${1}-trigger_ip-1 - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} resource delete ${1}-trigger_ip-1" - fi - shift done } @@ -719,52 +653,19 @@ addnode_create_resources() local add_vip=${1}; shift local cibfile=$(mktemp -u) - # mount the HA-state volume and start ganesha.nfsd on the new node - pcs cluster cib ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib ${cibfile} failed" - fi - - pcs -f ${cibfile} resource create nfs_start-${add_node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} resource create nfs_start-${add_node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} failed" - fi - - pcs -f ${cibfile} constraint location nfs_start-${add_node} prefers ${add_node}=INFINITY - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} constraint location nfs_start-${add_node} prefers ${add_node}=INFINITY failed" - fi - - pcs -f ${cibfile} constraint order nfs_start-${add_node} then nfs-mon-clone - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} constraint order nfs_start-${add_node} then nfs-mon-clone failed" - fi - - pcs cluster cib-push ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib-push ${cibfile} failed" - fi - rm -f ${cibfile} - # start HA on the new node pcs cluster start ${add_node} if [ $? -ne 0 ]; then logger "warning: pcs cluster start ${add_node} failed" fi - pcs resource delete nfs_start-${add_node} - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_start-${add_node} failed" - fi - - pcs cluster cib ${cibfile} if [ $? 
-ne 0 ]; then logger "warning: pcs cluster cib ${cibfile} failed" fi - # delete all the -cluster_ip-1 and -trigger_ip-1 resources, - # clearing their constraints, then create them again so we can + # delete all the -cluster_ip-1 resources, clearing + # their constraints, then create them again so we can # recompute their constraints clear_resources ${cibfile} ${HA_SERVERS} addnode_recreate_resources ${cibfile} ${add_node} ${add_vip} @@ -806,31 +707,6 @@ deletenode_delete_resources() fi rm -f ${cibfile} - pcs cluster cib ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib ${cibfile} failed" - fi - - pcs -f ${cibfile} resource create nfs_stop-${node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} resource create nfs_stop-${node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} failed" - fi - - pcs -f ${cibfile} constraint location nfs_stop-${node} prefers ${node}=INFINITY - if [ $? -ne 0 ]; then - logger "warning: pcs -f ${cibfile} constraint location nfs_stop-${node} prefers ${node}=INFINITY failed" - fi - - pcs cluster cib-push ${cibfile} - if [ $? -ne 0 ]; then - logger "warning: pcs cluster cib-push ${cibfile} failed" - fi - rm -f ${cibfile} - - pcs resource delete nfs_stop-${node} - if [ $? -ne 0 ]; then - logger "warning: pcs resource delete nfs_stop-${node} failed" - fi } @@ -974,11 +850,12 @@ main() setup_create_resources ${HA_SERVERS} + setup_finalize_ha + setup_state_volume ${HA_SERVERS} setup_copy_config ${HA_SERVERS} - setup_finalize else logger "insufficient servers for HA, aborting" @@ -1019,15 +896,15 @@ main() fi addnode_create_resources ${node} ${vip} - #Subsequent add-node recreates resources for all the nodes - #that already exist in the cluster. The nodes are picked up - #from the entries in the ganesha-ha.conf file. Adding the - #newly added node to the file so that the resources specfic - #to this node is correctly recreated in the future. 
+ # Subsequent add-node recreates resources for all the nodes + # that already exist in the cluster. The nodes are picked up + # from the entries in the ganesha-ha.conf file. Adding the + # newly added node to the file so that the resources specfic + # to this node is correctly recreated in the future. clean_node=${node//[-.]/_} echo "VIP_$clean_node=\"${vip}\"" >> ${HA_CONFDIR}/ganesha-ha.conf - NEW_NODES="$HA_CLUSTER_NODES,$node" + NEW_NODES="$HA_CLUSTER_NODES,${node}" sed -i s/HA_CLUSTER_NODES.*/"HA_CLUSTER_NODES=\"$NEW_NODES\""/ \ $HA_CONFDIR/ganesha-ha.conf @@ -1054,7 +931,7 @@ $HA_CONFDIR/ganesha-ha.conf setup_copy_config ${HA_SERVERS} - rm -rf ${HA_VOL_MNT}/nfs-ganesha/{node} + rm -rf ${HA_VOL_MNT}/nfs-ganesha/${node} determine_service_manager @@ -1075,7 +952,7 @@ $HA_CONFDIR/ganesha-ha.conf refresh_config ${VOL} ${HA_CONFDIR} ${HA_SERVERS} ;; - *) + *) # setup and teardown are not intended to be used by a # casual user usage -- cgit