For longer simulations it is inconvenient to restart the individual jobs by hand. This task is done by a script originally from Hans-Günter Ludwig. Its basic function is sketched in Fig. 9.1.
Here is an example of the script (rhd1.job)
for a system without a dedicated batch system.
A job is submitted with the command nohup rhd1.job &:
#!/bin/sh
# --- PBS -N rhd1
# --- PBS -l walltime=72:00:00
# --- PBS -l nodes=1:ppn=4
#
# --- Job file for the execution of CO5BOLD ---
# --- Source: rhd1.job, original from HGL ---
# --- Last modification: 2014-06-23 ---
#
# Purpose: set up the directories and the run-time environment, then change
# into the work directory. The PBS lines above are kept as plain comments;
# this variant of the script is started with nohup instead of a batch system.
#
BASEDIR=/users/bf/dat/job/j1
STADIR="${BASEDIR}/sta"     # Start directory: contains *.par, rhd1.cmd, rhd1.job
WRKDIR="${BASEDIR}/wrk"     # Work directory
BAKDIR="${BASEDIR}/bak"     # Backup and output directory
RHDEXE="${STADIR}/rhd.exe"  # RHD program to be executed
#
# --- Remove the stack size limit for the shell and its children ---
ulimit -s unlimited
#
# --- OpenMP settings: number of threads and stack size per thread ---
export OMP_NUM_THREADS=4
export OMP_STACKSIZE=300M
#
# --- Jump into work directory ---
# The job later deletes every file in the current directory, so it must not
# continue if the work directory cannot be entered.
cd "${WRKDIR}" || {
  echo "Cannot change into work directory ${WRKDIR}: job aborted!" >&2
  exit 1
}
#
# --- Helper: write rhd1.cmd to stdout with the first pending entry flagged ---
# An entry is pending if its line contains none of '#', '*' and '+'.
# The pending line is padded to 77 characters and the flag is appended.
#   $1  flag character ('*' = processed, '+' = chain terminated by an error)
#   $2  optional file name; if non-empty, the unmodified pending line is
#       additionally saved in that file
mark_pending_entry() {
  awk -v flag="$1" -v save="$2" '
    /#/ || /\*/ || /\+/ { print; next }
    !marked {
      if (save != "") print > save
      printf "%-77s%s\n", $0, flag
      marked = 1
      next
    }
    { print }
  ' rhd1.cmd
}
#
# --- Loop: execute RHD code (possibly) several times in one job --------------
# Each pass handles one entry "INFILE OUTFILE PARFILE ACT" of rhd1.cmd.
# ACT = dum: nothing is executed for this entry.
# ACT = eoc: the chain ends after this entry.
for IRUN in 1 2
do
  echo
  echo 'CO5BOLD Run #' "${IRUN}"
  date
  #
  # --- Clear up work directory ---
  # Make sure the current directory really is the work directory before
  # deleting anything.
  cd "${WRKDIR:?}" || exit 1
  rm -f -- *
  #
  ##############################################################################
  #
  # --- Get old command file, select actual command line ---
  # --- and read variables from command line ---
  #
  cp -p "${STADIR}/rhd1.cmd" rhd1.cmd
  #
  mark_pending_entry '*' cmdline > dummy_nrhd.cmd
  #
  # Start from empty values so that nothing from a previous pass survives
  # when no pending entry is left.
  INFILE= OUTFILE= PARFILE= ACT=
  if [ -s cmdline ]
  then
    read -r INFILE OUTFILE PARFILE ACT < cmdline
  fi
  if [ -z "${ACT}" ]
  then
    # No pending entry, or fewer than four fields: stop without touching
    # the command file in the start directory.
    echo 'No complete pending entry found in rhd1.cmd!'
    echo 'Execution of job chain terminated!'
    ACT=eoc
    break
  fi
  mv dummy_nrhd.cmd "${OUTFILE}_nrhd.cmd"
  echo "${INFILE}"
  echo "${OUTFILE}"
  echo "${PARFILE}"
  echo "${ACT}"
  #
  ##############################################################################
  #
  if [ "${ACT}" != dum ]
  then
    ############################################################################
    #
    # --- RHD execution ---
    #
    # --- Copy parameter file ---
    cp -p "${STADIR}/${PARFILE}" rhd.par
    #
    # --- Copy start file ---
    # Prefer a non-empty copy in the backup directory; otherwise take the
    # file from the start directory and store a copy in the backup directory.
    if test -s "${BAKDIR}/${INFILE}"
    then
      cp -p "${BAKDIR}/${INFILE}" rhd.sta
    else
      cp -p "${STADIR}/${INFILE}" rhd.sta
      cp -p rhd.sta "${BAKDIR}/${INFILE}"
    fi
    # --- Copy executable ---
    cp -p "${RHDEXE}" rhd.exe
    #
    # --- Execute RHD ---
    ./rhd.exe > rhd.out
    RHD_EXIT=$?
    #
    # A run counts as successful only if the exit status is zero and a
    # non-empty file rhd.done exists afterwards.
    if [ "$RHD_EXIT" != 0 ] || [ ! -s rhd.done ]
    then
      # --- Exit status not zero, ---
      # --- error may have occurred during execution of rhd.exe ---
      if [ "$RHD_EXIT" != 0 ]
      then
        echo 'Non-zero exit status ' "$RHD_EXIT" ' occurred during execution of RHD!'
      else
        echo 'No rhd.done file found: assume error during execution of RHD!'
      fi
      echo 'Execution of job chain terminated!'
      # --- Modify nrhd.cmd, set termination character "+" ---
      mark_pending_entry '+' '' > nrhd.cmd
      # --- Terminate chain by simulating ${ACT} = eoc ... ---
      ACT=eoc
    else
      # --- Modify command file: add '*' in appropriate column to indicate ---
      # --- proper execution ---
      mark_pending_entry '*' '' > nrhd.cmd
    fi
    #
    ls -l
    #
    # --- Move data into backup directory and modify permissions ---
    gzip rhd.out
    mv rhd.out.gz "${BAKDIR}/${OUTFILE}.out.gz"
    cp -p rhd.par "${BAKDIR}/${OUTFILE}.par"
    for EXT in end fine full mcyl mean chu1 chu2 chu3
    do
      mv "rhd.${EXT}" "${BAKDIR}/${OUTFILE}.${EXT}"
    done
    # The trailing .* is left unquoted on purpose: it must expand to all
    # output files of this run.
    chmod go+r "${BAKDIR}/${PARFILE}" "${BAKDIR}/${OUTFILE}".*
    chmod go-w "${BAKDIR}/${PARFILE}" "${BAKDIR}/${OUTFILE}".*
    #
    ############################################################################
  else
    # --- Dummy entry: nothing is executed. Use the command file in which ---
    # --- this entry is already flagged with '*', so that the chain moves ---
    # --- on to the next entry instead of reading the same one again ---
    cp -p "${OUTFILE}_nrhd.cmd" nrhd.cmd
  fi
  #
  #
  ##############################################################################
  #
  # --- Dispose modified command file for next job ---
  cp -p nrhd.cmd "${STADIR}/rhd1.cmd"
  #
  # --- Exit loop if end of chain reached ---
  if test "${ACT}" = eoc
  then
    break
  fi
  #
  # --- End of loop ------------------------------------------------------------
done
#
# --- Submit next job if not end of chain reached---
# ACT holds the action of the last entry handled by the loop. An empty ACT
# (no entry was read) is treated like end of chain, so that an undefined
# state is never resubmitted.
if [ -n "${ACT}" ] && [ "${ACT}" != eoc ]
then
  cd "${STADIR}" || {
    echo "Cannot change into start directory ${STADIR}: job not resubmitted!" >&2
    exit 1
  }
  echo "Resubmit job"
  # /usr/local/bin/qsub rhd1.job
  # qsub rhd1.job
  # nice nohup rhd1.job &
  # Explicit ./ so that the script is found in the start directory even if
  # '.' is not part of PATH.
  nohup ./rhd1.job &
else
  echo "End of job chain"
fi
#
################################################################################