next up previous contents index
Next: 7 Data Analysis with Up: 6 Running a Simulation Previous: 6.1 Quickstart: How to   Contents   Index


6.2 Running CO5BOLD on a Machine with Batch System

For longer simulations it is inconvenient to restart the individual jobs by hand. This task is done by a script originally from Hans-Günter Ludwig. Its basic function is sketched in Fig. 2.

Figure 2: Program scheme, Postscript version
\begin{figure}\centering\includegraphics[width=16.2cm]{co5bold/coboldscheme.eps}\end{figure}

Here is an example of the script (rhd1.job) for a system without a dedicated batch system. A job is submitted with nohup rhd1.job:
#!/bin/sh
#--- PBS -N rhd1
#--- PBS -l walltime=72:00:00
#--- PBS -l nodes=1:ppn=4
#
# --- Job file for the execution of RHD on gunnar (Itanium 2, Uppsala) ---
# --- Source: rhd1.job, original from HGL                              ---
# --- Last modification: 2002-12-13                                    ---
#
# Job-chain driver for RHD runs.
#
# Each invocation processes the next open line of the command file
# ${STADIR}/rhd1.cmd. Lines containing '#', '*' or '+' are passed through
# unchanged; the first remaining line is the current command and must hold
# four whitespace-separated fields:
#
#     INFILE OUTFILE PARFILE ACT
#
#   INFILE   start model (taken from ${BAKDIR} if present there with
#            non-zero size, otherwise from ${STADIR})
#   OUTFILE  base name of the result files written to ${BAKDIR}
#   PARFILE  parameter file in ${STADIR}
#   ACT      "dum": skip the RHD execution for this line
#            "eoc": end of chain, do not resubmit after this line
#            anything else: run RHD and resubmit this job afterwards
#
# A processed line is marked in column 78 with '*' (success) or '+'
# (failure, chain terminated), and the updated command file is written back
# to ${STADIR}/rhd1.cmd before the next job is submitted.
#
#set -xv
#
BASEDIR=/users/bf/dat/job/j1
STADIR=${BASEDIR}/sta      # Start directory: contains *.par, rhd1.cmd, rhd1.job
WRKDIR=${BASEDIR}/wrk      # Work directory
BAKDIR=${BASEDIR}/bak      # Backup and output directory
RHDEXE=${STADIR}/rhd.exe   # RHD program to be executed
#
#export NCPUS=4
#echo $NCPUS
export OMP_NUM_THREADS=1
#
# --- Jump into work directory ---
# The work directory is wiped below; never continue if the cd failed,
# otherwise the wipe would hit whatever directory the job was started in.
cd "${WRKDIR}" || {
  echo "Cannot change into work directory ${WRKDIR}: job aborted!" >&2
  exit 1
}
#
# --- Loop: execute RHD code (possibly) several times in one job --------------
for IRUN in 1
do
  echo
  echo 'CO5BOLD Run #' "${IRUN}"
  date
  #
  # --- Clear up work directory ---
  # -f: no error on an empty directory, no prompt for read-only copies;
  # the ./ prefix protects against file names starting with '-'.
  rm -f -- ./*
  #
  ##############################################################################
  #
  #   --- Get old command file, select actual command line ---
  #   --- and read variables from command line             ---
  #
  cp -p "${STADIR}/rhd1.cmd" rhd1.cmd
  #
  # The first line without '#', '*' or '+' is the current command: it is
  # saved verbatim in "cmdline" and marked with '*' in the output copy.
  awk \
  '      /#/   { print; next}
        /\*/   { print; next}
        /\+/   { print; next}
   DONE!="T"   { print >"cmdline"; printf "%-77s*\n", $0; DONE="T"; next}
               { print} ' rhd1.cmd > dummy_nrhd.cmd
  #
  INFILE=
  OUTFILE=
  PARFILE=
  ACT=
  if [ -s cmdline ]
  then
    read -r INFILE OUTFILE PARFILE ACT < cmdline
  fi
  #
  # --- No (complete) command line left: stop instead of running RHD ---
  # --- with empty file names                                        ---
  if [ -z "${ACT}" ]
  then
    echo 'No complete command line (INFILE OUTFILE PARFILE ACT) found in rhd1.cmd!'
    echo 'Execution of job chain terminated!'
    ACT=eoc
    break
  fi
  #
  mv dummy_nrhd.cmd "${OUTFILE}_nrhd.cmd"
  echo "${INFILE}"
  echo "${OUTFILE}"
  echo "${PARFILE}"
  echo "${ACT}"
  #
  ##############################################################################
  #
  if [ "${ACT}" != dum ]
  then
    ############################################################################
    #
    #   --- RHD execution ---
    #
    # --- Copy parameter file ---
    cp -p "${STADIR}/${PARFILE}" rhd.par
    #
    # --- Copy start file ---
    # Prefer a start model already present in the backup directory (e.g. the
    # end model of a previous run); otherwise take it from the start
    # directory and keep a copy in the backup directory.
    if test -s "${BAKDIR}/${INFILE}"
    then
      cp -p "${BAKDIR}/${INFILE}" rhd.sta
    else
      cp -p "${STADIR}/${INFILE}" rhd.sta
      cp -p rhd.sta "${BAKDIR}/${INFILE}"
    fi
    # --- Copy executable ---
    cp -p "${RHDEXE}" rhd.exe
    #
    # --- Execute RHD ---
    ./rhd.exe > rhd.out
    RHD_EXIT=$?
    #
    # A run counts as successful only if the exit status is zero AND a
    # non-empty rhd.done file exists afterwards.
    if [ "${RHD_EXIT}" != 0 ] || [ ! -s rhd.done ]
    then
      # --- Exit status not zero, ---
      # --- error may have occurred during execution of rhd.exe ---
      if [ "${RHD_EXIT}" != 0 ]
      then
        echo 'Non-zero exit status ' "${RHD_EXIT}" ' occurred during execution of RHD!'
      else
        echo 'No rhd.done file found: assume error during execution of RHD!'
      fi
      echo 'Execution of job chain terminated!'
      # --- Modify nrhd.cmd, set termination character "+" ---
      awk \
      '      /#/   { print; next}
            /\*/   { print; next}
            /\+/   { print; next}
       DONE!="T"   { printf "%-77s+\n", $0; DONE="T"; next}
                   { print} ' rhd1.cmd > nrhd.cmd
      # --- Terminate chain by simulating ${ACT} = eoc ... ---
      ACT=eoc
    else
      # --- Modify command file: add '*' in appropriate column to indicate ---
      # --- proper execution                                               ---
      awk \
      '      /#/   { print; next}
            /\*/   { print; next}
            /\+/   { print; next}
       DONE!="T"   { print >"cmdline"; printf "%-77s*\n", $0; DONE="T"; next}
                   { print} ' rhd1.cmd > nrhd.cmd
    fi
    #
    ls -l
    #
    # --- Move data into backup directory ---
    # Best effort: after a failed run some of these files may not exist.
    cp -p rhd.par  "${BAKDIR}/${PARFILE}"
    mv rhd.out  "${BAKDIR}/${OUTFILE}.out"
    mv rhd.par  "${BAKDIR}/${OUTFILE}.par"
    #mv rhd.sta  ${BAKDIR}/${OUTFILE}.sta
    mv rhd.end  "${BAKDIR}/${OUTFILE}.end"
    mv rhd.full "${BAKDIR}/${OUTFILE}.full"
    mv rhd.mean "${BAKDIR}/${OUTFILE}.mean"
    chmod go+r "${BAKDIR}/${PARFILE}" "${BAKDIR}/${OUTFILE}".*
    chmod go-w "${BAKDIR}/${PARFILE}" "${BAKDIR}/${OUTFILE}".*
    #
    ############################################################################
  else
    # --- Dummy line: no RHD execution. Use the command file in which the ---
    # --- line was already marked with '*', so that the chain advances to ---
    # --- the next line instead of resubmitting the same one forever.     ---
    # NOTE(review): presumably the intended handling of "dum" lines -- confirm.
    cp -p "${OUTFILE}_nrhd.cmd" nrhd.cmd
  fi
  #
  #
  ##############################################################################
  #
  # --- Dispose modified command file for next job ---
  # If the command file cannot be updated, a resubmitted job would process
  # the very same line again: terminate the chain instead.
  if ! cp -p nrhd.cmd "${STADIR}/rhd1.cmd"
  then
    echo 'Could not update command file rhd1.cmd!'
    echo 'Execution of job chain terminated!'
    ACT=eoc
  fi
  #
  # --- Exit loop if end of chain reached ---
  if test "${ACT}" = eoc
  then
    break
  fi
  #
  # --- End of loop ------------------------------------------------------------
done
#
# --- Submit next job if not end of chain reached---
if test "${ACT}" != eoc
then
  cd "${STADIR}" || {
    echo "Cannot change into start directory ${STADIR}: job not resubmitted!" >&2
    exit 1
  }
  echo "Resubmit job"
#  /usr/local/bin/qsub rhd1.job
#  qsub rhd1.job
#  nice nohup rhd1.job &
  # Explicit ./ so the resubmission does not depend on "." being in PATH.
  nohup ./rhd1.job &
else
  echo "End of job chain"
fi
#
################################################################################

Basic information about the batch system of the Ångström Sun cluster is available.

A more detailed description of the script will come sometime in the future (e.g. from Sven's manual...).