For longer simulations, it is inconvenient to restart the individual jobs by hand. This task is automated by a script originally written by Hans-Günter Ludwig. Its basic function is sketched in Fig. 9.1.
Here is an example of the script (rhd1.job) for a system without a dedicated batch system.
A job is submitted via `nohup rhd1.job &`:
#!/bin/sh
# --- PBS -N rhd1
# --- PBS -l walltime=72:00:00
# --- PBS -l nodes=1:ppn=4
#
# --- Job file for the execution of CO5BOLD ---
# --- Source: rhd1.job, original from HGL ---
# --- Last modification: 2014-06-23 ---
#
# Executes up to two RHD runs per job. For each run, the first line of
# ${STADIR}/rhd1.cmd that contains none of '#', '*' or '+' is taken as the
# command line "INFILE OUTFILE PARFILE ACT". After the run that line is
# flagged in column 78 with '*' (success) or '+' (failure) and the command
# file is written back to ${STADIR}. ACT = dum skips the RHD execution,
# ACT = eoc ends the chain; otherwise the job resubmits itself via nohup.
#
BASEDIR=/users/bf/dat/job/j1
STADIR=${BASEDIR}/sta     # Start directory: contains *.par, rhd1.cmd, rhd1.job
WRKDIR=${BASEDIR}/wrk     # Work directory
BAKDIR=${BASEDIR}/bak     # Backup and output directory
RHDEXE=${STADIR}/rhd.exe  # RHD program to be executed
#
ulimit -s unlimited
#
export OMP_NUM_THREADS=4
export OMP_STACKSIZE=300M
#
# --- Copy rhd1.cmd to stdout, flagging the first pending command line ---
# --- $1: flag character appended in column 78 ('*' or '+')            ---
# --- $2: optional file that receives the unflagged pending line       ---
flag_pending_line() {
  awk -v flag="$1" -v keep="$2" '
    /#/  { print; next }
    /\*/ { print; next }
    /\+/ { print; next }
    seen != "T" {
      if (keep != "") print > keep
      printf "%-77s%s\n", $0, flag
      seen = "T"
      next
    }
    { print }
  ' rhd1.cmd
}
#
# --- Jump into work directory ---
# --- Abort if this fails: the loop below deletes all files in the cwd ---
cd "${WRKDIR}" || {
  echo "Cannot change into work directory ${WRKDIR}!"
  exit 1
}
#
# --- Loop: execute RHD code (possibly) several times in one job --------------
for IRUN in 1 2
do
  echo
  echo 'CO5BOLD Run #' "$IRUN"
  date
  #
  # --- Clear up work directory ---
  rm -f ./*
  #
  ##############################################################################
  #
  # --- Get old command file, select actual command line ---
  # --- and read variables from command line ---
  #
  cp -p "${STADIR}/rhd1.cmd" rhd1.cmd
  #
  flag_pending_line '*' cmdline > dummy_nrhd.cmd
  #
  # --- Reset the fields first so that values from the previous pass ---
  # --- cannot be reused when no pending line is left ---
  INFILE= OUTFILE= PARFILE= ACT=
  if [ -s cmdline ]
  then
    read -r INFILE OUTFILE PARFILE ACT < cmdline
  fi
  if [ -z "${INFILE}" ] || [ -z "${OUTFILE}" ] ||
     [ -z "${PARFILE}" ] || [ -z "${ACT}" ]
  then
    echo 'No complete command line (INFILE OUTFILE PARFILE ACT) found in rhd1.cmd!'
    echo 'Execution of job chain terminated!'
    ACT=eoc
    break
  fi
  mv dummy_nrhd.cmd "${OUTFILE}_nrhd.cmd"
  echo "${INFILE}"
  echo "${OUTFILE}"
  echo "${PARFILE}"
  echo "${ACT}"
  #
  ##############################################################################
  #
  if [ "${ACT}" != dum ]
  then
    ############################################################################
    #
    # --- RHD execution ---
    #
    # --- Copy parameter file ---
    cp -p "${STADIR}/${PARFILE}" rhd.par
    #
    # --- Copy start file: prefer a non-empty copy in the backup directory, ---
    # --- otherwise take it from the start directory and back it up ---
    if [ -s "${BAKDIR}/${INFILE}" ]
    then
      cp -p "${BAKDIR}/${INFILE}" rhd.sta
    else
      cp -p "${STADIR}/${INFILE}" rhd.sta
      cp -p rhd.sta "${BAKDIR}/${INFILE}"
    fi
    # --- Copy executable ---
    cp -p "${RHDEXE}" rhd.exe
    #
    # --- Execute RHD ---
    ./rhd.exe > rhd.out
    RHD_EXIT=$?
    #
    if [ "$RHD_EXIT" != 0 ] || [ ! -s rhd.done ]
    then
      # --- Exit status not zero or no (non-empty) rhd.done file: ---
      # --- error may have occurred during execution of rhd.exe ---
      if [ "$RHD_EXIT" != 0 ]
      then
        echo 'Non-zero exit status ' "$RHD_EXIT" ' occurred during execution of RHD!'
      else
        echo 'No rhd.done file found: assume error during execution of RHD!'
      fi
      echo 'Execution of job chain terminated!'
      # --- Modify nrhd.cmd, set termination character "+" ---
      flag_pending_line '+' > nrhd.cmd
      # --- Terminate chain by simulating ${ACT} = eoc ... ---
      ACT=eoc
    else
      # --- Modify command file: add '*' in appropriate column to indicate ---
      # --- proper execution ---
      flag_pending_line '*' > nrhd.cmd
    fi
    #
    ls -l
    #
    # --- Move data into backup directory and modify permissions ---
    gzip rhd.out
    mv rhd.out.gz "${BAKDIR}/${OUTFILE}.out.gz"
    cp -p rhd.par "${BAKDIR}/${OUTFILE}.par"
    # --- Output files that were not written produce an mv error in the log ---
    for EXT in end fine full mcyl mean chu1 chu2 chu3
    do
      mv "rhd.${EXT}" "${BAKDIR}/${OUTFILE}.${EXT}"
    done
    chmod go+r "${BAKDIR}/${PARFILE}" "${BAKDIR}/${OUTFILE}".*
    chmod go-w "${BAKDIR}/${PARFILE}" "${BAKDIR}/${OUTFILE}".*
    #
    ############################################################################
  fi
  #
  ##############################################################################
  #
  # --- Dispose modified command file for next job ---
  # NOTE(review): for ACT = dum no nrhd.cmd is created in this pass (the
  # flagged copy was moved to ${OUTFILE}_nrhd.cmd), so this cp fails and the
  # dum line stays pending -- confirm the intended handling.
  cp -p nrhd.cmd "${STADIR}/rhd1.cmd"
  #
  # --- Exit loop if end of chain reached ---
  if [ "${ACT}" = eoc ]
  then
    break
  fi
  #
# --- End of loop ------------------------------------------------------------
done
#
# --- Submit next job if not end of chain reached---
if [ "${ACT}" != eoc ]
then
  cd "${STADIR}" || {
    echo "Cannot change into start directory ${STADIR}!"
    exit 1
  }
  echo "Resubmit job"
  # /usr/local/bin/qsub rhd1.job
  # qsub rhd1.job
  # nice nohup rhd1.job &
  nohup rhd1.job &
else
  echo "End of job chain"
fi
#
################################################################################