Script that does housekeeping and submits batch jobs to the cluster.


#!/usr/bin/perl
#
# script to analyze E5 root files on the Richmond cluster.
# run this by executing 'submit_eod3d.pl' at the command line.
# the input files and such are in the /home/gilfoyle/eod/run
# area on pscm1.
#                                          - gpg
#
# outline: clean up after the previous pass, read the list of run
# numbers, then for each run find an idle slave node and submit one
# batch job that runs run_root_on_node3.pl on that node.
#
use strict;
use warnings;

#
# ask the Scyld beomap command for the next idle node and return its
# output with surrounding whitespace removed. 'beomap --no-over' returns
# -1 when the average load on the slaves is too high. anything that is
# not a plain integer (for example empty output because the command
# could not be run) is treated as fatal, because submitting a job with a
# bogus node would garble the command line of the batch job.
#
sub idle_node {
    my $output = `beomap --no-over`;
    $output = '' unless defined $output;
    $output =~ s/^\s+|\s+$//g;
    die "Unexpected output from 'beomap --no-over': '$output'\n"
        unless $output =~ /\A-?\d+\z/;
    return $output;
}

#
# do housekeeping first: remove the results of the previous pass and the
# eod* files and 24* directories left in the scratch area.
#
system("rm /home/gilfoyle/eod/run/results/*");
system("rm /scratch/gilfoyle/e5/eod*");
system("rm -r /scratch/gilfoyle/e5/24*");
#
# read the file with the run numbers and read those numbers into an array
# (one run number per line).
#
my $run_numbers_file = "E5_run_numbers.inp";
open(my $runs_fh, '<', $run_numbers_file)
    or die "Failed to open run numbers file '$run_numbers_file': $!\n";
my @runno = <$runs_fh>;
close($runs_fh);
#
# loop over each run number, get the slave node, create the input file
# for the batch command, and then issue the batch command.
#
my $sleep_time = 60;
foreach my $line (@runno) {
    # drop the newline and any stray whitespace; skip blank lines so that
    # we never submit a job without a run number.
    $line =~ s/^\s+|\s+$//g;
    next if $line eq '';
#
# use the Scyld beomap command to get the next idle node. if they're
# all busy then do a loop and wait for $sleep_time seconds until the next
# available node. the problem here is that 'beomap --no-over' returns
# -1 when the average load on the slaves is too high. if you run something
# on node -1, this will run on the master!
#
    my $node = idle_node();
    print "node = $node\n";
    while ($node < 0) {
        print "No CPUs available. Sleeping for $sleep_time seconds.\n\n";
        sleep $sleep_time;
        $node = idle_node();
        print "node = $node\n";
    }
#
# get the batch command file ready to submit the job and let 'er rip.
# clean up is done in run_root_on_node3.pl since we have to wait for
# root to get done on the slave. the sleep is done here to prevent
# collisions among the slaves in getting resources (data files,
# libraries, etc) which can cause pscm1 to 'hang'.
#
    my $command = "./run_root_on_node3.pl $node $line \n";
    open(my $job_fh, '>', 'run_job')
        or die "Failed to open run_job for writing: $!\n";
    print {$job_fh} $command;
    print $command;
    # buffered write errors only show up at close, so check it.
    close($job_fh) or die "Failed to write run_job: $!\n";

    my $status = system("batch -f run_job");
    if ($status != 0) {
        # nothing was queued, so do not wait and do not claim success.
        warn "batch failed (status $status) for run $line on node $node.\n";
        next;
    }
    sleep 30;
    print "Submit on node $node run $line for analysis.\n";
}


Script called by the one above to actually run jobs on a node.


#!/usr/bin/perl
#
# script for running on a slave node of the cluster.
# called from submit_eod3d.pl - gpg 12/03/02
#
# usage: run_root_on_node3.pl <node> <run number>
#
# outline: make the working directories, copy the data and the root
# macros for one run to the slave node, run root there, copy the
# histogram file back to the master and clean up the slave.
#
use strict;
use warnings;

#
# run one external command and report (but do not die on) a failure.
# a single string containing shell syntax such as '>>' goes through the
# shell; a list of words is executed directly without a shell.
# returns 1 when the command exited with status 0, otherwise 0.
#
sub run_command {
    my @command = @_;
    my $status = system(@command);
    return 1 if $status == 0;
    warn "command failed (status $status): @command\n";
    return 0;
}

# set up the environment including the node and run
# number assignments that come from the arguments of the
# script.

die "usage: $0 <node> <run number>\n" unless @ARGV >= 2;
my ($NODE, $RUNNO) = @ARGV;

# both values are pasted into shell command lines below, so refuse
# anything that contains shell metacharacters or whitespace.
foreach my $argument ($NODE, $RUNNO) {
    die "Bad argument '$argument': only letters, digits, '_', '.' and '-' are allowed.\n"
        unless $argument =~ /\A[\w.-]+\z/;
}

my $WORKDIR = "/scratch/gilfoyle/e5/$RUNNO";
my $LOGFILE = "/scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}";
my $RUNFILE = "/home/gilfoyle/eod/run/files_4.232/run${RUNNO}_files.dat";

# 'setenv' is a csh builtin and a variable set in a child shell never
# reaches this process, so set ROOTSYS in our own environment; every
# command started below inherits it.
$ENV{ROOTSYS} = "/usr/root/PRO";

# the following statement was commented out early on because the CLAS
# software was not ready and it seems we do not need these environment
# variables.

#system("source /home/clas/builds/PRODUCTION/packages/cms/rich.cshrc PRODUCTION");

# Make a working directory and go to it
# NOTE: Make this on the master as well as a place holder
# no actual data goes to the placeholder on the master.

run_command("bpsh $NODE mkdir -p $WORKDIR/ > $LOGFILE");
run_command("mkdir -p $WORKDIR/ >> $LOGFILE");
# a 'cd' run through system() only changes the directory of the child
# shell; chdir changes it for this script.
chdir("$WORKDIR/") or warn "Failed to chdir to $WORKDIR/: $!\n";

# get the data filenames (one per line).

print "\nrun files: $RUNFILE, run number: $RUNNO.\n";
open(my $files_fh, '<', $RUNFILE)
    or die "Failed to open filename file '$RUNFILE': $!\n";
my @filenames = <$files_fh>;
close($files_fh);

# copy the data to the slave directory made above. do NOT use bpcp because it
# apparently copies the data twice according to steven james.

foreach my $data_filename (@filenames) {
    # drop the newline and stray whitespace; ignore blank lines.
    $data_filename =~ s/^\s+|\s+$//g;
    next if $data_filename eq '';
    print "copy /data2/e5/root/4.232/$data_filename to $NODE:/scratch/gilfoyle/e5/$RUNNO/$data_filename.\n";
    run_command('bpsh', $NODE, 'cp',
                "/data2/e5/root/4.232/$data_filename",
                "$WORKDIR/$data_filename");
}

# copy other necessary files to slave directory. these files are small so
# bpcp is not a bottleneck (see previous comment).

run_command('bpcp', $RUNFILE, "${NODE}:$WORKDIR/data_filenames.dat");
foreach my $macro_file ('eod3.C', 'eod3.h', 'user_eod3.h', 'run_eod3.C') {
    run_command('bpcp', "/home/gilfoyle/eod/root/$macro_file", "${NODE}:$WORKDIR/");
}

# More housekeeping and monitoring information. the log is started over
# here, which discards the output of the mkdir commands above.

unlink($LOGFILE);
run_command("date >> $LOGFILE");
run_command("pwd  >>  $LOGFILE");

# run root.

run_command("echo 'run root.'>> $LOGFILE");
run_command("bpsh $NODE root -b -q $WORKDIR/run_eod3.C >> $LOGFILE");

run_command("date >> $LOGFILE");

# Copy histo file back to the master and clean up the slave directory.
# Do this here instead of in submit_eod3d.pl so that root is done before
# we start cleaning up.

run_command('bpcp', "${NODE}:$WORKDIR/eod3_hists.root",
            "/home/gilfoyle/eod/run/results/run${RUNNO}_results.root")
    or warn "Results for run $RUNNO were not copied back from node $NODE.\n";
chdir("/home/gilfoyle/eod/run/")
    or warn "Failed to chdir to /home/gilfoyle/eod/run/: $!\n";
run_command('bpsh', $NODE, 'rm', '-r', "$WORKDIR/");