#!/bin/bash
#PBS -A TG-DEB090011
#
# Wrapper that starts several PhyloBayes-MPI (pb_mpi) chains in parallel inside
# one PBS job and runs a background convergence checker (PBterminator.bash).
#
# Usage:
#   <this script> [wrapper options] -- [PhyloBayes options]
#   (run with -h for the full option list)
#
# Environment read:
#   PBS_NODEFILE   file with one line per available slot (set by PBS)
#   SEED, NUM_CHAINS, CHECKINTERVAL, BURNIN, ACCEPTDIFF, ACCEPTSIZE, GIVEUP
#                  optional overrides of the defaults below
#
# Files written in the current directory:
#   __localnodefile<i>   the slice of PBS_NODEFILE handed to chain <i>
#   __terminator.txt     stdout of the convergence checker

# Put the directory holding this script on PATH. "bash PBterminator.bash"
# further down relies on this: bash searches PATH for a script file that is not
# found in the current directory.
export PATH="$(dirname "$(readlink -f "$0")"):${PATH}"

#Try to load both so that we can run on either Trestles or Gordon.
#(One of the mvapich2 modules is expected to fail to load, so failures here
#are deliberately not fatal.)
module purge
module load gnu
module load mvapich2
module load mvapich2_ib
module load intel
module load phylobayes
export MV2_ENABLE_AFFINITY=0

#All of the following environment variables are expected by the run script.
THEPROGRAM=$(command -v pb_mpi)
export THEPROGRAM

#defaults for command line options
export SEED=${SEED-"$RANDOM"}
export NUM_CHAINS=${NUM_CHAINS-'2'}
export CHECKINTERVAL=${CHECKINTERVAL-'1800'} #Check for convergence every 30 minutes.
export BURNIN=${BURNIN-'500'}                #: : number of cycles to exclude from comparison
export ACCEPTDIFF=${ACCEPTDIFF-'0.3'}        #: : maxdiff must be less than this.
export ACCEPTSIZE=${ACCEPTSIZE-'50'}         #: : The effective number of samples must be greater than this.
export GIVEUP=${GIVEUP-'10000'}              #: : The number of cycles that must occur before we start to check if we are stuck.
export CHAINNAME="chain"
export RESTARTING=0

#Function to print help to the user (on stderr).
usage() {
  cat 1>&2 << EOF
Usage: $0 [Wrapper/Convergence Options] -- [PhyloBayes options EXCEPT <chainname> or -rnd <seed>]

$0 is a BASH script that starts and runs several chains of PhyloBayesMPI in
parallel. You submit this script as an MPI job to your PBS cluster, and it
divides the available hosts and slots evenly amongst the chains. On the head
node, it also starts a convergence checker that runs in the background,
terminating the chains if they fulfill the convergence parameters.

WRAPPER OPTIONS:
  -h --help         Print this help message.
  --RESTART         Restart the existing chains. Totally ignore options after
                    the '--'
  --seed --SEED     Set the base seed, each chain increments this by 1. (Which
                    is sufficiently random, 1 and 2 produce completely different
                    pseudorandom chains.) (This uses the undocumented '-rnd'
                    option of PhyloBayes)
  -N --N            The number of different chains to start/run. Each will get
                    (roughly) 1/N CPUs. (If there is a remainder, of course
                    those CPUS are used.)
  --CHECKINTERVAL   The interval, in seconds, between convergence checks.
                    Defaults to 1800 = 30 minutes.
  --BURNIN          The number of cycles at the beginning of the chain to be
                    ignored by the convergence checker. Defaults to 500.
  --ACCEPTDIFF      The 'maxdiff' between all chains must be less than this for
                    convergence. Defaults to 0.3, the PB manual says that 0.3 is
                    'acceptable' and 0.1 is 'good'.
  --ACCEPTSIZE      The effective number of samples must be greater than this
                    for convergence to be accepted. Defaults to 50, the PB
                    manual says that 50 is 'acceptable' and 300 is 'good'.
  --GIVEUP          The number of cycles to run before deciding that a set of
                    chains that still have 'maxdiff' = 1 have failed. Defaults
                    to 10000.

PHYLOBAYES OPTIONS:
  Please DO NOT provide the following options after the '--':
  -rnd <seed>       The wrapper chooses the seed for each chain, they must have
                    different seeds. PLEASE USE the wrapper '--SEED' option.
  <chainname>       The wrapper chooses the names of the chains, 'chain0',
                    'chain1', etc.
EOF
}

#Function to print to both stdout and stderr
echoerr() {
  echo "$@"
  echo "$@" 1>&2
}

#Report a fatal error on both streams and abort the job.
die() {
  echoerr "$@"
  exit 1
}

#Succeeds when $1 consists only of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#Called as: require_value "$@" while $1 is a value-taking option.
#Aborts when no value follows the option.
require_value() {
  [[ $# -ge 2 ]] || die "Option '${1}' requires a value, terminating."
}

#Parse the command line. Everything up to the first '--' belongs to the
#wrapper; whatever follows is handed to PhyloBayes untouched.
PARSINGARGS=1
while [[ "${PARSINGARGS}" == "1" && $# -gt 0 ]]; do
  case "$1" in
    -h|--help)       usage; exit 0 ;;
    --seed|--SEED)   require_value "$@"; shift; SEED=$1 ;;          #validated below
    -N|--N)          require_value "$@"; shift; NUM_CHAINS=$1 ;;    #validated below
    --CHECKINTERVAL) require_value "$@"; shift; CHECKINTERVAL=$1 ;; #TODO: check positive int
    --BURNIN)        require_value "$@"; shift; BURNIN=$1 ;;        #TODO: check int >= 0
    --ACCEPTDIFF)    require_value "$@"; shift; ACCEPTDIFF=$1 ;;    #TODO: check float (0,1)
    --ACCEPTSIZE)    require_value "$@"; shift; ACCEPTSIZE=$1 ;;    #TODO: check int >= 1
    --GIVEUP)        require_value "$@"; shift; GIVEUP=$1 ;;        #TODO: check int >= 1
    --RESTART)       RESTARTING=1 ;;
    --)              PARSINGARGS=0 ;;
    *)               echoerr "Unknown parameter: '${1}', terminating."; usage; exit 1 ;;
  esac
  shift
done

if [[ "${PARSINGARGS}" == "1" ]]; then
  echoerr "No '--' encountered while parsing arguments. Please verify the command line."
  exit 1
fi

#Keep the PhyloBayes options as an array so that quoting survives until the
#mpirun call; ALLARGS stays exported as a flat string as before.
pb_args=( "$@" )
export ALLARGS="$*"

if [[ "${RESTARTING}" == "1" ]]; then
  #Clear the arguments that we are going to ignore
  pb_args=()
  ALLARGS=""
  #Set the number of chains according to the files here.
  shopt -s nullglob
  chain_files=( "${CHAINNAME}"*.chain )
  shopt -u nullglob
  NUM_CHAINS=${#chain_files[@]}
  (( NUM_CHAINS > 0 )) \
    || die "--RESTART requested, but no '${CHAINNAME}*.chain' files were found in $(pwd)."
fi
#Done Parsing WRAPPER OPTIONS

#Do a basic sanity check on the PhyloBayes options.
#The leading blank makes the pattern match '-rnd' as the very first option too.
if [[ " ${ALLARGS}" == *" -rnd"* ]]; then
  #Found -rnd in options
  echoerr "Please do not provide the '-rnd' option to PhyloBayes. Use the -h option of this script for more details."
  exit 1
fi

#SEED and NUM_CHAINS are used in shell arithmetic below, so insist on plain
#decimal integers (10# keeps values such as '08' from being read as octal).
is_uint "${SEED}" || die "The seed must be a non-negative integer, got '${SEED}'."
SEED=$(( 10#${SEED} ))
is_uint "${NUM_CHAINS}" || die "The number of chains must be a positive integer, got '${NUM_CHAINS}'."
NUM_CHAINS=$(( 10#${NUM_CHAINS} ))
(( NUM_CHAINS > 0 )) || die "The number of chains must be a positive integer, got '${NUM_CHAINS}'."

[[ -n "${THEPROGRAM}" ]] \
  || die "Could not find 'pb_mpi' in PATH after loading the modules."
[[ -n "${PBS_NODEFILE:-}" && -r "${PBS_NODEFILE}" ]] \
  || die "PBS_NODEFILE is not set or not readable; this script must run inside a PBS job."

#build other variables that depend on those that may change when the command-line is parsed.
NCPUS=$(( $(wc -l < "${PBS_NODEFILE}") ))
export NCPUS
(( NCPUS >= NUM_CHAINS )) \
  || die "Only ${NCPUS} slot(s) available for ${NUM_CHAINS} chains; every chain needs at least one."
#NTHREADS is the guaranteed (rounded-down) number of slots per chain.
export NTHREADS=$(( NCPUS / NUM_CHAINS ))

#begin setup
#Load-Balance the subprocesses: cut PBS_NODEFILE into NUM_CHAINS contiguous
#slices __localnodefile0 .. __localnodefile<N-1>. The first (NCPUS % NUM_CHAINS)
#slices receive one extra line, so no slot is left unused and each chain stays
#on as few hosts as possible.
awk -v n="${NUM_CHAINS}" -v total="${NCPUS}" '
  BEGIN {
    base = int(total / n); extra = total % n
    chunk = 0; used = 0
    size = base + (extra > 0 ? 1 : 0)
  }
  {
    # Never open more than n files, even if the line count was off by one.
    if (used == size && chunk < n - 1) {
      chunk++; used = 0
      size = base + (chunk < extra ? 1 : 0)
    }
    print > ("__localnodefile" chunk)
    used++
  }
' "${PBS_NODEFILE}" || die "Could not split '${PBS_NODEFILE}' into per-chain node files."

#Job control on (every background job gets its own process group, and SIGCHLD
#traps fire for exiting children), with immediate job-status notification.
set -m
set -b
export STARTUPERROR=0

#From here until the end of the script NO external command or command
#substitution may run in the foreground while a SIGCHLD trap is armed: its
#termination would raise SIGCHLD and trigger the handler. Hence builtins only.
#BTW, chains are 0-indexed to match the node file names.
chain_names=()
for (( i = 0; i < NUM_CHAINS; i++ )); do
  chain_names+=( "${CHAINNAME}${i}" )
done

trap "pkill -INT -P $$" EXIT SIGINT
trap "STARTUPERROR=1; echo 'got sigchild during startup'; jobs; exit 1" SIGCHLD

run_pids=()
for (( i = 0; i < NUM_CHAINS; i++ )); do
  export LSEED=$(( SEED + i ))
  #Added the '-launcher ssh' because mpirun was not properly inheriting the umask. I guess this makes things less robust.
  chain_cmd=( mpirun -f "__localnodefile${i}" -launcher ssh "${THEPROGRAM}"
              -rnd "${LSEED}" "${pb_args[@]}" "${CHAINNAME}${i}" )
  echo "${chain_cmd[*]}"
  "${chain_cmd[@]}" &
  run_pids+=( "$!" )
  echo "started chain ${i} with PID : ${!}"
done

#maybe we should ensure that everything is running before calling "wait"?
#Can we do that without starting a child process?

#Setup the convergence tester
bash PBterminator.bash "${CHECKINTERVAL}" "${BURNIN}" "${ACCEPTDIFF}" \
  "${ACCEPTSIZE}" "${GIVEUP}" "${chain_names[@]}" > __terminator.txt &
TERMPID="$!"
echo "Terminator running as PID : ${TERMPID}"

#Stop the convergence checker (whole process group) and every chain: SIGTERM
#first, SIGKILL five seconds later.
terminate_children() {
  #Disarm SIGCHLD first: the children killed here and the 'sleep' below would
  #otherwise re-trigger this handler over and over.
  trap - SIGCHLD
  echo 'Got SIGCHLD or SIGTERM, attempting graceful return. '
  kill -s SIGTERM -- "-${TERMPID}"
  kill -s SIGTERM "${run_pids[@]}"
  sleep 5
  kill -9 -- "${run_pids[@]}" "-${TERMPID}"
}

#This is not best, because we are assuming that the chains continue apace,
#It is however, easy to implement, if any child process dies, terminate the other ones.
trap terminate_children SIGCHLD SIGTERM

if [[ "${STARTUPERROR}" == "0" ]]; then
  #Returns early (and runs the handler) as soon as a trapped signal arrives.
  wait "${run_pids[@]}"
fi

#Final cleanup. SIGCHLD is disarmed so that the terminator's exit and the
#'sleep' below do not start another shutdown round.
trap - SIGCHLD
kill -s SIGTERM "${TERMPID}"
sleep 2
kill -9 -- "-${TERMPID}"
exit 0