# Include this file only on CentOS 6, it breaks things on CentOS 7
CENTOS_RELEASE=$(/usr/lib/rpm/redhat/dist.sh --distnum)
if [ "$CENTOS_RELEASE" == "6" ]; then
    . /etc/rc.d/init.d/functions
fi

PATH="$PATH:/usr/local/bin:/usr/bin:/bin:/usr/sbin"
GLUSTERD_DATA_DIR="/data/gluster_vols"
GLUSTERD_CONF_DIR="/var/lib/glusterd"
GLUSTERD_PORT=24007
# Can be overridden by the glusterfs.root_mount SMC service/tier property
GLUSTER_ROOT_MOUNT="/mnt/groot"
# Can be overridden by the glusterfs.root_volume SMC service/tier property
GLUSTER_ROOT_VOLUME="groot"
HOST_IP=$(host $HOSTNAME | awk '{print $NF}')
OOM_SCORE_ADJ="-1000"
NICENESS="-10"
NETCAT_OPTS="-w1 -6"

# Simple port probe; succeeds if the given host accepts a connection on the
# given port.
probe_glusterd() {
    nc $NETCAT_OPTS "$1" "$2" < /dev/null
}

#
# FUNCTION: wait_for_smc
#
# DESCRIPTION: As you might guess, this function just sits around and waits
# for the SMC proxy to start, or it times out, whichever comes first.
#
wait_for_smc() {
    CNT=0
    while ! (smcc tiers $HOSTNAME &> /dev/null) && (( $CNT < $SMC_TIMEOUT ))
    do
        echo "Gluster init waiting for SMC proxy..." && sleep 1
        CNT=$(($CNT+1))
    done
    if (( $CNT >= $SMC_TIMEOUT )); then
        echo_failure; echo "Timed out waiting on SMC"
        return 1
    else
        echo_success && echo "SMC proxy is alive!"
        return 0
    fi
}

#
# FUNCTION: set_smc_tier
#
# DESCRIPTION: Tries to find the storage.gluster SMC tier for this host.
# If it finds the tier, it sets a few environment variables to their SMC
# values.
#
set_smc_tier() {
    [ -n "$GLUSTER_SMC_TIER" ] && return 0
    wait_for_smc || return 1
    if GLUSTER_SMC_TIER=$(smcc tiers $HOSTNAME | \
        /bin/grep "storage.gluster" 2> /dev/null); then
        TIER_VOL_DIR=$(smcc getprop $GLUSTER_SMC_TIER \
            glusterfs.data_dir 2> /dev/null) &&
            GLUSTERD_DATA_DIR="$TIER_VOL_DIR"
        TIER_ROOT_VOLUME=$(smcc getprop $GLUSTER_SMC_TIER \
            glusterfs.root_volume 2> /dev/null) &&
            [ -n "$TIER_ROOT_VOLUME" ] &&
            GLUSTER_ROOT_VOLUME="$TIER_ROOT_VOLUME"
        SVC_ROOT_VOLUME=$(smcc getsvcprop $GLUSTER_SMC_TIER \
            $HOSTNAME glusterfs.root_volume 2> /dev/null) &&
            [ -n "$SVC_ROOT_VOLUME" ] &&
            GLUSTER_ROOT_VOLUME="$SVC_ROOT_VOLUME"
        TIER_ROOT_MOUNT=$(smcc getprop $GLUSTER_SMC_TIER glusterfs.root_mount \
            2> /dev/null) &&
            [ -n "$TIER_ROOT_MOUNT" ] &&
            GLUSTER_ROOT_MOUNT="$TIER_ROOT_MOUNT"
        SVC_ROOT_MOUNT=$(smcc getsvcprop $GLUSTER_SMC_TIER $HOSTNAME \
            glusterfs.root_mount 2> /dev/null) &&
            [ -n "$SVC_ROOT_MOUNT" ] &&
            GLUSTER_ROOT_MOUNT="$SVC_ROOT_MOUNT"
        SVC_UUID=$(smcc getsvcprop $GLUSTER_SMC_TIER $HOSTNAME glusterfs.uuid \
            2> /dev/null)
        NICE_DAEMON=$(smcc getprop $GLUSTER_SMC_TIER \
            glusterfs.nice_daemon 2> /dev/null)
        WARM_INODE_CACHE=$(smcc getprop $GLUSTER_SMC_TIER \
            glusterfs.warm_inode_cache 2> /dev/null)
        # Fatal if we don't find any services
        TIER_SERVICES=($(smcc ls $GLUSTER_SMC_TIER | /bin/cut -d: -f1)) || return 1
        return 0
    fi
    return 1
}

#
# FUNCTION: nice_daemon
#
# DESCRIPTION: Nice the glusterfsd (brick) and glusterd (management)
# daemons. Also, adjust their OOM scores to prevent the OOM killer
# from killing them in low-memory conditions.
#
# Also consider adjusting the vm.min_free_kbytes kernel property via
# /etc/sysctl.conf and disabling swap (swapoff -a).
#
nice_daemon() {
    set_smc_tier || return 1
    if [ "$NICE_DAEMON" == "1" ]; then
        sleep 2
        renice $NICENESS -g $(pgrep -x glusterfsd) &> /dev/null && \
            echo_success && echo "Nice'ing glusterfsd..."
        renice $NICENESS -g $(pgrep -x glusterd) &> /dev/null && \
            echo_success && echo "Nice'ing glusterd..."
        for p in $(pgrep -x glusterfsd); do
            echo $OOM_SCORE_ADJ > /proc/$p/oom_score_adj
        done
        for p in $(pgrep -x glusterd); do
            echo $OOM_SCORE_ADJ > /proc/$p/oom_score_adj
        done
        echo_success && echo "Adjusting OOM score..."
    fi
}
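# Spot-check sketch (not part of the original flow; assumes standard pgrep/ps
# behavior and hypothetical running daemons): after nice_daemon runs, the
# adjustments above can be verified with something like
#   cat /proc/$(pgrep -xo glusterfsd)/oom_score_adj   # expect -1000
#   ps -o ni= -p $(pgrep -xo glusterd)                # expect -10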
#
# FUNCTION: set_bricks
#
# DESCRIPTION: Populates $BRICKS with a list of hostnames which are
# found to be in the groot volume. Currently this won't work for clusters
# with more than one volume.
#
set_bricks() {
    if [ -z "$BRICKS" ]; then
        if ! BRICKS=($(gluster volume info all | grep -E "^Brick[0-9]+:" |
            awk '{print $NF}' | cut -d: -f1)); then
            echo "Unable to find any bricks."
            return 1
        else
            echo "Found ${#BRICKS[@]} bricks..."
        fi
    fi
    return 0
}

set_hosted_vols() {
    local ALL_VOLS=($(\ls $GLUSTERD_CONF_DIR/vols))
    for VOL in ${ALL_VOLS[@]}; do
        if grep ${HOSTNAME/.facebook.com/} \
            $GLUSTERD_CONF_DIR/vols/$VOL/info &> /dev/null; then
            HOSTED_VOLS+=($VOL)
        fi
    done
}

#
# FUNCTION: set_replica_cnt
#
# DESCRIPTION: Sets $REPLICA_CNT to the current replication factor for the
# cluster.
#
set_replica_cnt() {
    set_hosted_vols
    if [ -n "$REPLICA_CNT" ]; then
        return 0
    fi
    for VOL in ${HOSTED_VOLS[@]}; do
        REPLICA_CNT=$(grep disperse_count /var/lib/glusterd/vols/$VOL/info |
            cut -d= -f2)
        if (( ${REPLICA_CNT:-0} > 0 )); then
            return 0
        fi
    done
    if BRICK_NO_STR=$(gluster volume info all | grep -E \
        "Number of Bricks: [0-9]+ x [0-9] = [0-9]+"); then
        REPLICA_CNT=$(echo $BRICK_NO_STR | grep "Number of Bricks" |
            awk '{print $6}')
    elif BRICK_NO_STR=$(gluster volume info all | grep -E \
        "Number of Bricks: [0-9]+"); then
        REPLICA_CNT=$(echo $BRICK_NO_STR | grep "Number of Bricks" |
            awk '{print $NF}')
    else
        echo "Unable to determine number of bricks!"
        return 1
    fi
    return 0
}

#
# FUNCTION: set_node_index
#
# DESCRIPTION: Sets $NODE_INDEX to the position this node has in the
# brick list given by the "volume info all" command. We will use this
# for quorum calculations.
#
set_node_index() {
    set_bricks || return 1
    if [ -n "$NODE_INDEX" ]; then
        return 0
    fi
    local POS=0
    local BRICK=""
    for BRICK in ${BRICKS[@]}
    do
        if echo $BRICK | grep -E \
            "[[:digit:]]{1,3}\.[[:digit:]]{1,3}\.[[:digit:]]{1,3}\.[[:digit:]]{1,3}" \
            &> /dev/null; then
            BRICK=$(host $BRICK | awk '{print $NF}')
        fi
        BRICK_IP=$(host $BRICK | awk '{print $NF}')
        if [ "$BRICK_IP" = "$HOST_IP" ]; then
            NODE_INDEX=$POS
            return 0
        fi
        POS=$(($POS+1))
    done
    return 1
}

#
# FUNCTION: set_replicas
#
# DESCRIPTION: Sets $REPLICAS to a list of hosts which are replicas
# of this host.
#
set_replicas() {
    set_replica_cnt || return 1
    set_bricks || return 1
    if ! set_node_index; then
        echo "$HOSTNAME not a member of any replica group."
        return 2
    fi
    local MODULUS=$((($NODE_INDEX+1) % $REPLICA_CNT))
    local START_POS=0
    if (( $MODULUS == 0 )); then
        START_POS=$(($NODE_INDEX-$REPLICA_CNT+1))
    else
        START_POS=$(($NODE_INDEX-$MODULUS+1))
    fi
    local OFFSET=0
    while (( $OFFSET < $REPLICA_CNT ))
    do
        POS=$(($OFFSET+$START_POS))
        if (( $POS != $NODE_INDEX )); then
            REPLICAS+=(${BRICKS[$POS]})
        fi
        OFFSET=$(($OFFSET + 1))
    done
}

#
# FUNCTION: set_live_replica
#
# DESCRIPTION: Sets $LIVE_REPLICA to a host from the $REPLICAS list which is
# confirmed to be "alive" by way of a probe sent to the host's Gluster
# management port (we can't use the brick port since it is dynamic).
#
set_live_replica() {
    set_replicas || return 0
    local REPLICA=""
    for REPLICA in ${REPLICAS[@]}
    do
        echo -n "Checking host $REPLICA..."
        if probe_glusterd $REPLICA $GLUSTERD_PORT; then
            echo "ALIVE, setting as replica host."
            LIVE_REPLICA=$REPLICA
            return 0
        else
            echo "DEAD"
        fi
    done
    return 1
}
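# Worked example of the grouping math in set_replicas (hypothetical layout,
# not taken from a real cluster): with REPLICA_CNT=3 and a 6-brick volume,
# NODE_INDEX=4 gives MODULUS=(4+1)%3=2 and START_POS=4-2+1=3, so this node's
# replica group spans brick positions 3-5 and REPLICAS ends up holding the
# hosts at positions 3 and 5.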
#
# FUNCTION: probe_peer
#
# DESCRIPTION: This function will find a working host in the host's SMC tier
# to probe.
#
probe_peer() {
    for HOST in ${TIER_SERVICES[@]}; do
        if [ ! "$HOST" == "$HOSTNAME" ] &&
            probe_glusterd $HOST $GLUSTERD_PORT; then
            if gluster peer probe $HOST &> /dev/null; then
                echo_success && echo "Probed @ $HOST"
                return 0
            else
                echo_failure; echo "Failed to probe $HOST"
            fi
        fi
    done
    return 1
}

#
# FUNCTION: sync_uuid_smc
#
# DESCRIPTION: This function will copy the host's UUID into SMC for later
# use, i.e. re-adding a node to a cluster after re-imaging.
#
sync_uuid_smc() {
    if ! smcc setsvcprop $GLUSTER_SMC_TIER $HOSTNAME glusterfs.uuid $1 &> \
        /dev/null; then
        echo_failure; echo "Failed to save UUID to SMC"
        return 1
    fi
    echo_success && echo "Sync'd UUID to SMC"
    return 0
}

#
# FUNCTION: smartmount_vol
#
# DESCRIPTION: This function figures out how to mount a Gluster volume in
# an SMC tier by trying to find a host which has a working daemon. Once
# a working daemon is found it will attempt to mount against that node.
# After the initial mount is made and the cluster topology is
# downloaded to the client, this host is no longer required.
#
smartmount_vol() {
    set_smc_tier || return 1
    /sbin/modprobe fuse || { echo "Failed to load FUSE!"; return 1; }
    local VOLUME="$1"
    local MOUNT="$2"
    rpm -q nmap &> /dev/null || yum -y -q install nmap &> /dev/null
    for HOST in ${TIER_SERVICES[@]}; do
        if probe_glusterd $HOST $GLUSTERD_PORT; then
            echo_success && echo "Found GlusterFS host @ $HOST"
            if grep -E "^[[:graph:]]+ $MOUNT fuse.glusterfs" /proc/mounts \
                &> /dev/null; then
                echo_success && echo "$MOUNT already mounted"
                return 0
            elif mkdir -p "$MOUNT" &> /dev/null &&
                mount -t glusterfs $HOST:/"$VOLUME" "$MOUNT" &&
                sleep 1 &&
                grep "$MOUNT" /proc/mounts &> /dev/null; then
                echo_success && echo "Mounted GlusterFS $VOLUME @ $MOUNT"
                return 0
            else
                echo_failure; echo "Failed to mount from $HOST"
            fi
        fi
    done
}

#
# FUNCTION: patch_etc_services
#
# DESCRIPTION: Patch /etc/services from the get-go so we don't
# steal fbagent's port. cfengine can handle this as well, but
# it takes some time to run, and we don't want to take a chance
# given how vital it is.
#
patch_etc_services() {
    if ! grep "fbagent.*988" /etc/services &> /dev/null; then
        grep "fbagent.*988/tcp" /etc/services || \
            echo "fbagent 988/tcp" >> /etc/services
        grep "fbagent.*988/udp" /etc/services || \
            echo "fbagent 988/udp" >> /etc/services
        echo_success && echo "Added fbagent to /etc/services"
    fi
}

#
# FUNCTION: heal_volume
#
# DESCRIPTION: Heal volume will traverse a given volume, stat'ing each
# file in order to trigger a self-heal & ensure the file is re-mirrored
# to a host which has been re-imaged or has otherwise become out of sync.
#
heal_volume() {
    set_smc_tier || return 1
    local VOLUME="$(echo $1 | sed 's/\./_/g')"
    local CONCURRENT_HEALS="2"
    [ -n "$2" ] && CONCURRENT_HEALS="$2"
    local TMP_MOUNT="/tmp/$VOLUME.healer"
    [ -d "$TMP_MOUNT" ] || mkdir -p $TMP_MOUNT
    grep "$TMP_MOUNT" /proc/mounts &> /dev/null && umount "$TMP_MOUNT"
    if smartmount_vol "$VOLUME" "$TMP_MOUNT"; then
        umount "$TMP_MOUNT"
        smartmount_vol "$VOLUME" "$TMP_MOUNT"
        cd "$TMP_MOUNT"
        for ((CNT=1; CNT<=$CONCURRENT_HEALS; CNT++))
        do
            for ENTRY in $(ls | sed -n "$CNT~""$CONCURRENT_HEALS""p"); do
                echo "Healing $ENTRY..." &&
                    ( [ -d "$ENTRY" ] && \
                        ls "$ENTRY"/* | xargs -n50 -P1 stat > /dev/null ) ||
                    stat "$ENTRY" &> /dev/null
            done &
        done
        cd /
        wait
        # Don't umount here, as the actual heals are backgrounded by
        # the FUSE client. If we umount now they will go unfinished.
        # (Don't worry, this all goes away as of v3.3).
        echo_success && echo "Healed $VOLUME"
    else
        echo_failure; echo "Failed to heal $VOLUME"
        return 1
    fi
}
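# Usage sketch (the heal count of 4 is hypothetical, not from the source):
#   heal_volume groot 4
# would FUSE-mount the "groot" volume at /tmp/groot.healer and start 4
# background walkers; the sed -n "N~Mp" stride hands walker N every Mth
# top-level entry starting from the Nth, so the walkers never overlap.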
#
# FUNCTION: check_config
#
# DESCRIPTION: This function verifies the host's Gluster configuration and, if
# necessary, will restore the host's UUID & re-sync the configuration from a
# working node in the cluster. Afterwards it will re-create the volume
# directories and trigger a self-heal on all files.
#
# NOTE: This function will only run if the node is *not* Gluster MService
# managed, as the MService handles these functions and then some. It's
# here for cases where we are testing out new configs but still want to be
# resilient through re-imaging cycles. For long-term production use the
# MService should be used.
#
check_config() {
    # If the host isn't listed in a storage.gluster.* tier do nothing
    set_smc_tier || return 0
    # If the tier uses the Gluster MService don't do anything, the MService
    # will handle these functions
    smcc getprop $GLUSTER_SMC_TIER fbd_package 2>&1 |
        grep -E "gluster_mservice|antfarm" &> /dev/null && return 0
    LOCAL_UUID=$(cat $GLUSTERD_CONF_DIR/glusterd.info 2> /dev/null | cut -d= -f2)
    if [ -n "$SVC_UUID" ]; then
        # We have a stored UUID in SMC; two cases: either we have been
        # re-imaged, or we just need to sync it to SMC
        if ! grep "UUID=$SVC_UUID" $GLUSTERD_CONF_DIR/glusterd.info &> /dev/null; then
            # SMC UUID doesn't match, restore it!
            echo "UUID=$SVC_UUID" > $GLUSTERD_CONF_DIR/glusterd.info
            echo_success && echo "Restored UUID from SMC"
            start_daemon
            sleep 5
            probe_peer
            sleep 5
            stop
            sleep 5
            start_daemon
            sleep 5
            if VOL_DIRS=($(gluster volume info | grep -Eo \
                "(^Brick[0-9]+: $HOSTNAME)|(^Brick[0-9]+: $(echo $HOSTNAME | sed 's/.facebook.com//g')):$GLUSTERD_DATA_DIR.*" |
                cut -d: -f3)); then
                stop
                start_daemon
                for VOL_DIR in ${VOL_DIRS[@]}; do
                    mkdir -p "$VOL_DIR"
                    heal_volume "${VOL_DIR##*/}"
                done
                echo_success && echo "Created volume dirs"
            else
                echo_failure; echo "No volume dirs found"
            fi
        fi
    else
        # We don't have any UUID stored in SMC; either we need to record it
        # or this is a completely fresh install.
        if [ -z "$LOCAL_UUID" ]; then
            # Not even a local UUID, fresh install case
            start_daemon
            sleep 5
            if ! LOCAL_UUID=$(cat $GLUSTERD_CONF_DIR/glusterd.info | cut -d= -f2); then
                echo_failure; echo "UUID not generated"
                return 1
            fi
            stop
        fi
        sync_uuid_smc $LOCAL_UUID
    fi
    return 0
}

#
# FUNCTION: mount_root
#
# DESCRIPTION: Mount root will attempt to find a defined "root" volume which
# is assigned to this host and mount it.
#
mount_root() {
    if ! set_smc_tier; then
        echo_failure; echo "Mounting root not possible, no GFS SMC tier found"
        return 1
    fi
    if [ -z "$SVC_UUID" ]; then
        echo_failure; echo "Not mounting, no UUID in SMC, new node?"
        return 1
    fi
    if smartmount_vol $GLUSTER_ROOT_VOLUME $GLUSTER_ROOT_MOUNT; then
        return 0
    else
        echo_failure; echo \
            "WARNING: GlusterFS not mounted @ $GLUSTER_ROOT_MOUNT" && return 1
    fi
}
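# For reference (format assumed from the UUID handling above; the UUID value
# is a made-up example): $GLUSTERD_CONF_DIR/glusterd.info is expected to hold
# a line such as
#   UUID=3f1a9c2e-1234-5678-9abc-def012345678
# which check_config compares against the glusterfs.uuid SMC property. With
# the defaults at the top of this file, mount_root reduces to roughly
#   mount -t glusterfs <live-host>:/groot /mnt/groot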
#
# FUNCTION: warm_inode_cache
#
# DESCRIPTION: This function effectively "pre-warms" the inode cache of a
# Gluster host by simply doing an ls -lR on the data directory. This is
# very useful for hosts which run with only one spindle, as the flood of
# metadata requests hitting a host which participates in a cluster with
# large numbers of files creates head contention. The result of this
# contention can be a cluster which is unresponsive and/or laggy. Loading
# this metadata into memory ahead of time eliminates this problem.
#
warm_inode_cache() {
    # Don't fail here, attempt to run with defaults
    set_smc_tier
    if [ "$WARM_INODE_CACHE" == "1" ] && [ -n "$GLUSTERD_DATA_DIR" ] && \
        [ -d "$GLUSTERD_DATA_DIR" ]; then
        echo -n "Warming inode cache ($GLUSTERD_DATA_DIR)..."
        mkdir -p $GLUSTERD_DATA_DIR
        if CNT=$(ls -lR $GLUSTERD_DATA_DIR | wc -l); then
            echo -n "$CNT entries"
            echo_success && echo ""
        else
            echo_failure && echo ""
        fi
    fi
    return 0
}

#
# FUNCTION: check_quorum
#
# DESCRIPTION: Checks the quorum status of the local node. Returns non-zero if
# the node quorum margin is <= 0, where node margin is defined by how many
# nodes can be downed before we have a loss of quorum. This will principally
# be used by FBAR to easily figure out if it can remediate a Gluster node
# (it can call this via SSH).
#
check_quorum() {
    # Return 0 here so FBAR knows it's ok to take a spare or otherwise
    # dead node.
    if ! pgrep glusterd &> /dev/null; then
        echo "glusterd not running!"
        return 1
    fi
    set_replica_cnt || return 1
    set_replicas
    local REPLICAS_RET_CODE=$?
    if (( $REPLICAS_RET_CODE == 2 )); then
        return 0
    elif (( $REPLICAS_RET_CODE != 0 )); then
        return 1
    fi
    local REDUNDANCY_CNT=0
    for VOL in ${HOSTED_VOLS[@]}; do
        REDUNDANCY_CNT=$(grep redundancy_count \
            /var/lib/glusterd/vols/$VOL/info | cut -d= -f2)
        if (( ${REDUNDANCY_CNT:-0} > 0 )); then
            break
        fi
    done
    if ! (( ${REDUNDANCY_CNT:-0} > 0 )); then
        REDUNDANCY_CNT=${#REPLICAS[@]}
        QUORUM_THRESHOLD=$(((${REDUNDANCY_CNT}+1)/2+1))
        echo "Quorum threshold: $QUORUM_THRESHOLD"
    else
        QUORUM_THRESHOLD=$((${REDUNDANCY_CNT}/2))
        echo "Quorum threshold (EC @ 50% of ${REDUNDANCY_CNT} redundant bricks): $QUORUM_THRESHOLD"
    fi
    local LIVING_BRICKS=$REPLICA_CNT
    local CHECK_LIST=(${REPLICAS[@]})
    CHECK_LIST+=($HOSTNAME)
    local CHECK_HOST=""
    local DEAD_BRICKS=0
    for CHECK_HOST in ${CHECK_LIST[@]}
    do
        echo -n "Replica $CHECK_HOST: "
        if ! probe_glusterd $CHECK_HOST $GLUSTERD_PORT; then
            echo "DEAD"
            LIVING_BRICKS=$(($LIVING_BRICKS-1))
            DEAD_BRICKS=$(($DEAD_BRICKS+1))
        else
            echo "ALIVE"
        fi
    done
    QUORUM_MARGIN=$(($QUORUM_THRESHOLD-$DEAD_BRICKS))
    echo "Quorum margin: $QUORUM_MARGIN"
    if (( $QUORUM_MARGIN > 0 )); then
        return 0
    else
        return 1
    fi
}

#
# FUNCTION: fsdiff
#
# DESCRIPTION: Does a quick sanity check on the file sets between the local
# node and one of its partner nodes. This function will return a list of all
# files which differ in size. Keep in mind this will be approximate on live,
# running hosts, since the script can't get a perfect snapshot of each FS. On
# a node which is about to be re-integrated into the cluster, however, it will
# give a good view of how much data is out of sync.
#
fsdiff() {
    WORK_DIR="/tmp/gfsdiff"
    set_smc_tier
    if ! set_node_index; then
        echo "$HOSTNAME not a member of any replica group."
        exit 1
    fi
    set_replicas || { echo "No replicas found!"; return 1; }
    set_live_replica || { echo "No live replica found!"; return 1; }
    mkdir -p $WORK_DIR
    echo -n "Getting local file list for $HOSTNAME..."
    find $GLUSTERD_DATA_DIR -type f -printf '%s\t%p\n' | sort > \
        $WORK_DIR/$HOSTNAME.lst
    echo "DONE"
    echo -n "Getting file list for $LIVE_REPLICA..."
    ssh root@$LIVE_REPLICA "find $GLUSTERD_DATA_DIR -type f -printf '%s\t%p\n'" \
        | sort > $WORK_DIR/$LIVE_REPLICA.lst
    echo "DONE"
    echo "Finding differences..."
    comm -1 -3 $WORK_DIR/$LIVE_REPLICA.lst $WORK_DIR/$HOSTNAME.lst |
        awk '{print $NF}'
}
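# Worked example of the quorum math above (hypothetical replica-3 layout):
# REPLICAS holds 2 peers, so QUORUM_THRESHOLD=(2+1)/2+1=2; with one dead host
# the margin is 2-1=1 and check_quorum still returns 0, with two dead it drops
# to 0 and the node must not be remediated. For fsdiff, note that as written
# comm -1 -3 keeps the size-and-path entries unique to the local listing, and
# the awk prints one path per line.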