#! /bin/bash

# Copyright 2016 Cumulus Networks, inc  all rights reserved

# This script is run from systemd via OnFailure actions for
# cumulus services for which we want to run cl-support

# This script should be started with %i as its argument, to get the name of
# the restarted service.

# At the time of this writing (Jan 2016), that's switchd and clagd

# Unit name passed as the first argument (e.g. "switchd.service").
service=$1
# Program name: the unit name with a trailing ".service" removed.
prog=${service%.service}
# watchdog.prog so programs that do 'rm -f /run/prog.*' don't catch it
heartbeatmiss=/run/watchdog.${prog}
failure=/run/failure.${prog}

# Capture the state once, as soon as possible, flattened onto a single line.
# ActiveState is requested as well because the dispatch at the bottom of this
# script tests for it.
stat="$(systemctl show \
        -p Result,ExecMainCode,ExecMainStatus,ActiveState,UnitFileState \
        "$service" \
    | tr '\n' ' ')"
# We want neither the UnitFileState nor the ActiveState field in the log
# message; strip them one at a time so either order is handled.
statmsg=${stat% UnitFileState=*}
statmsg=${statmsg% ActiveState=*}

#######################################
# React to a heartbeat (watchdog) miss of $prog.
#
# The miss counter is kept in the file named by $heartbeatmiss as a single
# shell assignment ("miss=<n>"); it is sourced to restore the previous count
# and rewritten with the incremented value.  On the first miss a cl-support
# is taken; on every later miss only a log message is written.
#
# Globals:
#   prog           (read) name of the program that missed its heartbeat
#   heartbeatmiss  (read) path of the miss-counter file
# Arguments:
#   None
#######################################
handle_watchdog() {
    local -i miss=0
    # Local and explicitly empty so that nothing inherited from the
    # environment can end up in the cl-support module list, and so the
    # caller's scope is not polluted.
    local mods=

    if [[ -e "${heartbeatmiss}" ]]; then
        # The file contains "miss=<n>", which assigns to the local above.
        . "${heartbeatmiss}"
    fi
    (( miss += 1 ))
    printf 'miss=%d\n' "${miss}" > "${heartbeatmiss}"

    if (( miss != 1 )); then
        logger -p err -t heartbeat \
            "Restarting $prog after heartbeat miss #${miss} without cl-support"
        return
    fi

    # First miss: take a cl-support; switchd gets one extra module.
    if [[ "$prog" == switchd ]]; then
        mods=switchd.stack,
    fi
    logger -p err -t heartbeat \
        "$prog heartbeat miss #${miss} taking cl-support"
    /usr/cumulus/bin/cl-support -e "${mods}system,network.kernel" \
        "$prog first heartbeat miss"
}

#######################################
# Record a failure for which no cl-support is taken.
#
# The failure counter is kept in the file named by $failure as a single
# shell assignment ("fails=<n>"); it is sourced to restore the previous
# count and rewritten with the incremented value.  A message with the new
# count and the captured unit status is then sent to the system log.
#
# Globals:
#   failure  (read) path of the failure-counter file
#   statmsg  (read) status text appended to the log message
# Arguments:
#   $1 - name of the failed unit, used in the log message
#######################################
log_failure() {
    local unit=$1
    local -i fails=0

    if [[ -e "${failure}" ]]; then
        # The file contains "fails=<n>", which assigns to the local above.
        . "${failure}"
    fi
    (( fails += 1 ))
    printf 'fails=%d\n' "${fails}" > "${failure}"

    logger -p err -t Failure \
        "Not running cl-support for ${unit}, failure #${fails} status: ${statmsg}"
}


# Dispatch on the unit state captured in $stat: the first matching arm wins.
case "$stat" in
    *Result=watchdog*)
        handle_watchdog
        ;;
    *UnitFileState=disabled*)
        ;; # ignore disabled services
    *ActiveState=inactive*)
        # Ignore inactive (stopped, never started) services.  This arm can
        # only match when ActiveState is among the properties held in $stat.
        ;;
    *'Result=signal '* | *'Result=exit-code'*)
        # We need this arm for the case where the program missing the
        # heartbeat was so stuck that systemd tried SIGTERM, the program
        # still didn't exit, and systemd therefore used SIGKILL.  When that
        # happens, the Result is no longer "watchdog".  We can see other
        # statuses too, if systemd sends the SIGTERM and the process handles
        # it and exits, or just dies.
        #
        # So we always look at the journalctl output for the last 2 minutes,
        # and if we see a watchdog reported for the failing service, we
        # assume it did watchdog but was then killed with SIGKILL.
        # We don't want to do this for all SIGKILLs, in case somebody
        # does something like 'killall -KILL switchd'.
        #
        # The '.' in the pattern matches any single character between
        # "watchdog" and "timeout".
        if journalctl -l -o short --since='2 minutes ago' |
            grep -Eq -- "${service} watchdog.timeout"; then
            handle_watchdog
        else
            log_failure "$service"
        fi
        ;;
    *)
        log_failure "$service"
        ;;
esac
