#!/bin/bash
#
# Periodically check state of grid jobs in DGBridge, and put mark files
# for finished jobs.
#
# usage: scan_DGBridge_job control_dir ...

# Load arc.conf and set up environment
joboption_lrms=DGBridge

# ARC1 passes the config file first.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi

# Resolve the directory this script lives in.  Quoted so installation
# paths containing spaces work (the old unquoted `cd $basedir` broke).
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
# Local web-visible storage used by the 3G bridge for job data.
basewebdir="/var/www/3GBridge/"


pkgdatadir="$basedir"

# Pull in the DGBridge environment and the common scan helpers;
# abort immediately if either is missing.
. "${pkgdatadir}/configure-${joboption_lrms}-env.sh" || exit $?

. "${pkgdatadir}/scan_common.sh" || exit $?

# Prevent multiple instances of scan job to run concurrently
lockfile="${TMPDIR:-/tmp}/scan-DGBridge-job.lock"
# Try to create the lockfile atomically: with noclobber (set -C) the
# redirection fails if the file already exists.
if ! (set -C; : > "$lockfile") 2> /dev/null; then
    # Lockfile exists: check whether the PID stored in it is still alive.
    if ps -p "$(< "$lockfile")" > /dev/null 2>&1; then
	echo "lockfile exists and PID $(< "$lockfile") is running"
	exit 1
    fi
    echo "old lockfile found, was scan-DGBridge-job killed?"

    # sleep, and if no other have removed and recreated the lockfile we remove it.
    # there are still races possible, but this will have to do
    sleep $((RANDOM % 30 + 10))
    if ps -p "$(< "$lockfile")" > /dev/null 2>&1; then
        echo "lockfile exists and $(< "$lockfile") is running"
	exit 1
    else
	echo "still not running, removing lockfile"
	rm -f "$lockfile"
	exit 1
    fi
fi
echo "$$" > "$lockfile"
# Remove the lockfile on any exit.  SIGKILL cannot be trapped, so it is
# not listed; INT/TERM are converted into a normal exit so the EXIT trap
# fires (a plain TERM trap body would otherwise let the script continue).
trap 'rm -f "$lockfile"' EXIT
trap 'exit 1' INT TERM
#Default sleep-time is 30 seconds
sleep ${CONFIG_scan_wakeupperiod:-30}


## There is no shared file system possible in the DGBridge, instead we must copy output files from upload to session dir


# Validate control directories supplied on command-line: at least one
# argument is required and every one must be an existing directory.
[ -n "$1" ] || { echo "no control_dir specified" 1>&2; exit 1; }
for ctr_dir in "$@"; do
    [ -d "$ctr_dir" ] && continue
    echo "called with erronous control dir: $ctr_dir" 1>&2
    exit 1
done


# List of DGBridge jobids for grid-jobs with state INLRMS
declare -a localids
# Array with basenames of grid-job files in ctrl_dir, indexed by localid
# example /some/path/job.XXXXX /some/other/parh/job.YYYYY
declare -a basenames
# ARC grid ids (the <id> part of job.<id>), same index as localids
declare -a gridids
# 3G bridge web-service endpoint per job, same index as localids
declare -a endpoints
# Array with states of the jobs as reported by the DGBridge web service, indexed by localid
declare -a jobstates
# Array to store localids of jobs that are determined to have finished, which are sent to gm-kick
declare -a kicklist

# NOTE(review): "$@" is flattened into one whitespace-joined string here,
# so with more than one control_dir the "$option_ctrdir/..." paths built
# below are wrong — only the single-control_dir case appears to work.
# TODO confirm and either loop over "$@" or document the restriction.
option_ctrdir=$@

# Find list of grid jobs with status INLRMS, store localid and
# basename for those jobs.  Jobs under LRMS control have a
# job.<ID>.status entry in the processing/ subdir of the control dir,
# while the .local/.grami control files live in the control dir itself.
# Using a glob instead of parsing `ls | sed` avoids mangling names
# (the old `sed 's/.status//g'` stripped an unescaped-dot pattern
# anywhere in the name, not just the suffix).
for statusfile in "$option_ctrdir"/processing/job.*.status; do
  # The glob stays literal when nothing matches; skip that case.
  [ -e "$statusfile" ] || continue
  jobname=${statusfile##*/}       # job.<ID>.status
  jobname=${jobname%.status}      # job.<ID>
  basename="$option_ctrdir/$jobname"
  # DGBridge job id recorded at submission time
  localid=$(grep ^localid= "${basename}.local" | cut -d= -f2)
  ind=${#localids[@]}
  localids[$ind]="$localid"
  # Web-service endpoint this job was submitted to
  endpoints[$ind]=$(grep ^joboption_wsendpoint= "${basename}.grami" | cut -d= -f2)
  gridids[$ind]="${jobname#job.}"
  basenames[$ind]="$basename"
done

# Nothing more to do when no jobs are currently in state INLRMS.
if (( ${#localids[@]} == 0 )); then
    exit 0
fi


# Get JobStates from wsclient: query the 3G bridge web service for the
# current state of every job collected above.  wsclient prints
# "<jobid> <State>"; the state is stored with the jobid prefix removed.

#performance: change this to use the file - switch to read multiple jids from stdin
numids=$((${#localids[@]}-1))
for ((ind=0; ind<=numids; ind++)); do
   #get endpoint
   wsendpoint=${endpoints[$ind]}
   jobid=${localids[$ind]}
   jobstate=$(wsclient -e "$wsendpoint" -m status -j "$jobid" 2>&1)
   # BUGFIX: the =~ pattern must be unquoted — a quoted right-hand side
   # is matched as a literal string, which never matched and therefore
   # marked every job as WSError.  Anchored so only a leading state
   # keyword counts as valid output.
   if [[ $? -ne 0 || ! "${jobstate#$jobid }" =~ ^(Init|Running|Finished|Unknown|Error|TempFailed) ]]; then
      # Diagnostics go to stderr (was "2>&1", which sent them to stdout).
      echo "Failed to get job status from web service: $jobstate" 1>&2
      jobstate="$jobid WSError"
   fi
   jobstates[$ind]="${jobstate#$jobid }"
done

# Delete a job from the 3G bridge, retrying on failure.
#   $1 - web service endpoint
#   $2 - 3G bridge (DGBridge) job id
#   $3 - ARC grid job id (used for logging only)
# The delete loop runs in the background so the scan loop is not held up.
cleanbridge()
{
   local ep="$1"
   local lid="$2"
   local gridid="$3"

   echo "cleaning job: $gridid"
   # Cleanup of the local input storage is currently disabled:
   # if [ ! "$gridid" = "" ]; then
   #    rm -rf $basewebdir/$gridid/
   # fi

   # Up to EDGES_3G_RETRIES+1 attempts, pausing between failed attempts.
   local attempt
   for (( attempt=0; attempt<=EDGES_3G_RETRIES; attempt++ )); do
      if OUTPUT=$(wsclient -e "$ep" -m delete -j "$lid" 2>&1); then
         break
      fi
      (( attempt < EDGES_3G_RETRIES )) && sleep "$((EDGES_3G_TIMEOUT / EDGES_3G_RETRIES))"
   done &
}

# EDGI monitoring log: one file per day under <control_dir>/3gbridge_logs/.
edgilog="$option_ctrdir/3gbridge_logs/$(date +%Y-%m-%d)"
# Timestamp stamped onto every monitor record written below.
dato=$(date +%Y-%m-%d_%H:%M:%S)

# Look at the list of jobstates and determine which jobs that have
# finished. Write job.XXXX.lrms_done according to this.
# The .lrms_done file contains "<code> <message>"; the leading code
# presumably signals success (0) or failure (-1) to the grid manager —
# TODO confirm against the A-REX side.
numids=$((${#localids[@]}-1))
for ind in `seq 0 $numids` ; do
    wsendpoint=${endpoints[$ind]}
    #echo "${localids[$ind]}($wsendpoint ; ${gridids[$ind]}): ${jobstates[$ind]}" >> /tmp/chrulle.scan
    
    case "${jobstates[$ind]}" in
  	Init)
            # Job accepted by the bridge but not started yet: nothing to do.
            ;;
        Running)
  	#Job is running, nothing to do.
	#performance: delete input files in running state, only possible if remote 3gbridge does not use passthrough of data
	    msg="dt=$dato event=job_status job_id=${gridids[$ind]} status=Running"
            # Append the monitor record under an advisory lock (max 2s wait).
            flock -w 2 $edgilog -c "echo $msg >> $edgilog"
            if [ $? == 1 ]; then
              echo "Failed to log monitor data to: $edgilog" 1>&2
            fi
  	    ;;
  	Unknown)
	    #bridge doesn't know job, maybe cancelled
	    echo "-1 Job was cancelled" > "${basenames[$ind]}.lrms_done"
	    # Remember this job so gm-kick is notified at the end.
	    kicklist=(${kicklist[@]} $ind)
	    cleanbridge $wsendpoint ${localids[$ind]} ${gridids[$ind]}

	    msg="dt=$dato event=job_status job_id=${gridids[$ind]} status=Failed"
            flock -w 2 $edgilog -c "echo $msg >> $edgilog"
            if [ $? == 1 ]; then
              echo "Failed to log monitor data to: $edgilog" 1>&2
            fi

	    ;;
  	Finished)
            #fetch outputfiles. Maybe this will take too long.
            #first get list

            OUTPUT=$(wsclient -e "$wsendpoint" -m output -j "${localids[$ind]}" 2>&1)
            if [ $? -ne 0 ]; then
               # Could not obtain the output-file list: fail the job.
               #echo "cannot get ouput fail job" >> /tmp/scanlog.chrulle
                
               echo "-1 Job could not get output" > "${basenames[$ind]}.lrms_done"
               kicklist=(${kicklist[@]} $ind)
               #clean bridge?
   	       cleanbridge $wsendpoint ${localids[$ind]} ${gridids[$ind]}

               msg="dt=$dato event=job_status job_id=${gridids[$ind]} status=Failed"
               flock -w 2 $edgilog -c "echo $msg >> $edgilog"
               if [ $? == 1 ]; then
                 echo "Failed to log monitor data to: $edgilog" 1>&2
               fi

               continue
            fi
            #fetch list using wget? yes
            # parse output: wsclient lists one "<name> <url>" pair per line;
            # keep only the URLs.
            output=$(echo "$OUTPUT"|grep http|awk '{print $2}')
            for line in $output; do
              # FIX
   	      #echo "fetching $line" >> /tmp/chrulle.scan
              # Download each output file straight into the job's session dir.
              # NOTE(review): session dir path is hardcoded here — verify it
              # matches the configured ARC session directory.
              wget -P /var/spool/nordugrid/session/${gridids[$ind]}/ $line
            done
            # Hand the session dir back to the 3gbridge user.
            chown -R 3gbridge:3gbridge /var/spool/nordugrid/session/${gridids[$ind]}/  
	    #clean 3Gbridge
	    cleanbridge $wsendpoint ${localids[$ind]} ${gridids[$ind]}
            
            #trigger done            
	    echo "0 Job Finished" > "${basenames[$ind]}.lrms_done"
            
            #monitor
	    msg="dt=$dato event=job_status job_id=${gridids[$ind]} status=Finished"
            flock -w 2 $edgilog -c "echo $msg >> $edgilog"
            if [ $? == 1 ]; then
              echo "Failed to log monitor data to: $edgilog" 1>&2
            fi
            
  	    ;;
  	Error|TempFailed)
	    #job failed
	    echo "-1 Job Failed" > "${basenames[$ind]}.lrms_done"
	    kicklist=(${kicklist[@]} $ind)
	    #clean
	    cleanbridge $wsendpoint ${localids[$ind]} ${gridids[$ind]}
            #monitor
	    msg="dt=$dato event=job_status job_id=${gridids[$ind]} status=Failed"
            flock -w 2 $edgilog -c "echo $msg >> $edgilog"
            if [ $? == 1 ]; then
              echo "Failed to log monitor data to: $edgilog" 1>&2
            fi


  	    ;;
	WSError)
	    #webservice failed - perhaps have a count and then fail job?
 	    ;;
	*)
	    # Unrecognized state: leave the job untouched for the next scan.
	    ;;
    esac
done

# Kick the GM: notify the grid manager about every job we marked done.
if [ -n "${kicklist[*]}" ];then
    # Build the argument list as an array so status-file paths survive
    # intact (the old $(echo | xargs) pipeline word-split on whitespace).
    kickfiles=()
    for ind in "${kicklist[@]}"; do
	kickfiles+=("${basenames[$ind]}.status")
    done
    "${basedir}/gm-kick" "${kickfiles[@]}"
fi

exit 0
