next up previous contents index
Next: 10. Data analysis with Up: 9. Running a simulation Previous: 9.1 Quickstart: How to   Contents   Index


9.2 Using a batch system

For longer simulations it is inconvenient to restart the individual jobs by hand. This task is automated by a job script originally written by Hans-Günter Ludwig. Its basic operation is sketched in Fig. 9.1.

Figure 9.1: Program scheme (Postscript version)
\includegraphics[width=0.95\textwidth]{co5bold/coboldscheme.eps}

The following is an example of the script (rhd1.job) for a system without a dedicated batch system. A job is submitted with nohup rhd1.job &:

#!/bin/sh
# --- PBS -N rhd1
# --- PBS -l walltime=72:00:00
# --- PBS -l nodes=1:ppn=4
#
# --- Job file for the execution of CO5BOLD ---
# --- Source: rhd1.job, original from HGL   ---
# --- Last modification: 2014-06-23         ---
#
# --- Directory layout: everything lives below BASEDIR ---
BASEDIR=/users/bf/dat/job/j1
STADIR=${BASEDIR}/sta      # Start directory: contains *.par, rhd1.cmd, rhd1.job
WRKDIR=${BASEDIR}/wrk      # Work directory
BAKDIR=${BASEDIR}/bak      # Backup and output directory
RHDEXE=${STADIR}/rhd.exe   # RHD program to be executed
#
# --- Lift the stack size limit for this shell and the processes it starts ---
ulimit -s unlimited
#
# --- OpenMP settings, exported so that the RHD executable inherits them ---
export OMP_NUM_THREADS=4
export OMP_STACKSIZE=300M
#
# --- Jump into work directory ---
# --- Abort if this fails: the job loop clears the current directory with ---
# --- "rm *", which must never run anywhere else than in WRKDIR           ---
cd "${WRKDIR}" || {
  echo "Cannot change into work directory ${WRKDIR}: job aborted!" >&2
  exit 1
}
#
# --- Helper: write rhd1.cmd to stdout with the first pending command line ---
# --- padded to 77 columns and terminated by the marker character $1       ---
# --- ('*' = line has been processed, '+' = chain terminated after error). ---
# --- A line is pending if it contains none of '#', '*' and '+'.           ---
mark_cmdline () {
  awk -v mark="$1" '
    /#/ || /\*/ || /\+/ { print; next }
    !marked             { printf "%-77s%s\n", $0, mark; marked = 1; next }
                        { print }' rhd1.cmd
}
#
# --- Loop: execute RHD code (possibly) several times in one job --------------
for IRUN in 1 2
do
  echo
  echo 'CO5BOLD Run #' $IRUN
  date
  #
  # --- Clear up work directory ---
  rm -f -- *
  #
  ##############################################################################
  #
  #   --- Get old command file, select actual command line ---
  #   --- and read variables from command line             ---
  #
  cp -p "${STADIR}/rhd1.cmd" rhd1.cmd
  #
  # --- Extract the first pending line; cmdline is empty if there is none ---
  awk '/#/ || /\*/ || /\+/ { next } { print; exit }' rhd1.cmd > cmdline
  read -r INFILE OUTFILE PARFILE ACT < cmdline
  #
  # --- Without a complete command line there is nothing to run: stop here ---
  # --- instead of re-running with the values of the previous iteration    ---
  if [ -z "${PARFILE}" ]
  then
    echo 'No complete pending command line found in rhd1.cmd!'
    echo 'Execution of job chain terminated!'
    ACT=eoc
    break
  fi
  #
  # --- Keep a copy of the command file with the actual line marked by '*' ---
  mark_cmdline '*' > "${OUTFILE}_nrhd.cmd"
  echo "${INFILE}"
  echo "${OUTFILE}"
  echo "${PARFILE}"
  echo "${ACT}"
  #
  ##############################################################################
  #
  if [ "${ACT}" != dum ]
  then
    ############################################################################
    #
    #   --- RHD execution ---
    #
    # --- Copy parameter file ---
    cp -p "${STADIR}/${PARFILE}" rhd.par
    #
    # --- Copy start file: prefer the one in the backup directory, ---
    # --- otherwise take it from the start directory and back it up ---
    if [ -s "${BAKDIR}/${INFILE}" ]
    then
      cp -p "${BAKDIR}/${INFILE}" rhd.sta
    else
      cp -p "${STADIR}/${INFILE}" rhd.sta
      cp -p rhd.sta "${BAKDIR}/${INFILE}"
    fi
    # --- Copy executable ---
    cp -p "${RHDEXE}" rhd.exe
    #
    # --- Execute RHD ---
    ./rhd.exe > rhd.out
    RHD_EXIT=$?
    #
    if [ "$RHD_EXIT" != 0 ] || [ ! -s rhd.done ]
    then
      # --- Exit status not zero or no (non-empty) rhd.done file, ---
      # --- error may have occurred during execution of rhd.exe   ---
      if [ "$RHD_EXIT" != 0 ]
      then
        echo 'Non-zero exit status ' $RHD_EXIT ' occurred during execution of RHD!'
      else
        echo 'No rhd.done file found: assume error during execution of RHD!'
      fi
      echo 'Execution of job chain terminated!'
      # --- Modify nrhd.cmd, set termination character "+" ---
      mark_cmdline '+' > nrhd.cmd
      # --- Terminate chain by simulating ${ACT} = eoc ... ---
      ACT=eoc
    else
      # --- Modify command file: add '*' in appropriate column to indicate ---
      # --- proper execution                                               ---
      mark_cmdline '*' > nrhd.cmd
    fi
    #
    ls -l
    #
    # --- Move data into backup directory and modify permissions ---
    gzip rhd.out
    mv rhd.out.gz "${BAKDIR}/${OUTFILE}.out.gz"
    cp -p rhd.par "${BAKDIR}/${OUTFILE}.par"
    for EXT in end fine full mcyl mean chu1 chu2 chu3
    do
      mv "rhd.${EXT}" "${BAKDIR}/${OUTFILE}.${EXT}"
    done
    # --- Readable, but not writable, for group and others ---
    chmod go+r,go-w "${BAKDIR}/${PARFILE}" "${BAKDIR}/${OUTFILE}".*
    #
    ############################################################################
  else
    # --- Dummy command line: no RHD execution, but the line still has to ---
    # --- be marked, otherwise there is no nrhd.cmd to copy back below    ---
    # --- and the chain would read the same line again and again          ---
    mark_cmdline '*' > nrhd.cmd
  fi
  #
  #
  ##############################################################################
  #
  # --- Dispose modified command file for next job ---
  cp -p nrhd.cmd "${STADIR}/rhd1.cmd"
  #
  # --- Exit loop if end of chain reached ---
  if [ "${ACT}" = eoc ]
  then
    break
  fi
  #
  # --- End of loop ------------------------------------------------------------
done
#
# --- Submit next job if not end of chain reached---
# --- An unset or empty ACT is treated as end of chain, so that a missing ---
# --- action can never trigger an endless series of resubmissions         ---
if [ "${ACT:-eoc}" != eoc ]
then
  # --- The job file is started from the start directory ---
  if cd "${STADIR}"
  then
    echo "Resubmit job"
#  /usr/local/bin/qsub rhd1.job
#  qsub rhd1.job
#  nice nohup rhd1.job &
    nohup rhd1.job &
  else
    echo "Cannot change into start directory ${STADIR}: job not resubmitted!" >&2
  fi
else
  echo "End of job chain"
fi
#
################################################################################