Script that does housekeeping and submits batch jobs to the cluster.
#!/usr/bin/perl
#
# script to analyze E5 root files on the Richmond cluster.
# run this by executing 'submit_eod3d.pl' at the command line.
# the input files and such are in the /home/gilfoyle/eod/run
# area on pscm1.
# - gpg
#
# do housekeeping first.
#
system("rm /home/gilfoyle/eod/run/results/*");
system("rm /scratch/gilfoyle/e5/eod*");
system("rm -r /scratch/gilfoyle/e5/24*");
#
# read the file with the run numbers and read those numbers into an array.
#
$run_numbers_file = "E5_run_numbers.inp";
open(RUNS, "$run_numbers_file") || die "Failed to open run numbers file.\n";
@runno = <RUNS>;
close(RUNS);
#
# loop over each run number, get the slave node, create the input file
# for the batch command, and then issue the batch command.
#
foreach $line (@runno) {
    chomp($line);
    #
    # use the Scyld beomap command to get the next idle node. if they're
    # all busy, then loop and wait $sleep_time seconds at a time until a
    # node becomes available. the catch here is that 'beomap --no-over'
    # returns -1 when the average load on the slaves is too high, and if
    # you run something on node -1, it runs on the master!
    #
    $node = `beomap --no-over`;
    chomp($node);
    print "node = $node\n";
    $sleep_time = 60;
    while ($node < 0) {
        print "No CPUs available. Sleeping for $sleep_time seconds.\n\n";
        sleep $sleep_time;
        $node = `beomap --no-over`;
        chomp($node);
        print "node = $node\n";
    }
    #
    # get the batch command file ready to submit the job and let 'er rip.
    # clean-up is done in run_root_on_node3.pl since we have to wait for
    # root to finish on the slave. the sleep is done here to prevent
    # collisions among the slaves in getting resources (data files,
    # libraries, etc.) which can cause pscm1 to 'hang'.
    #
    open(OUT, ">run_job");
    print OUT "./run_root_on_node3.pl $node $line \n";
    print "./run_root_on_node3.pl $node $line \n";
    close(OUT);
    system("batch -f run_job");
    sleep 30;
    print "Submitted run $line to node $node for analysis.\n";
}
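The script reads E5_run_numbers.inp one line at a time and treats each chomped line as a run number, so the input file is presumably just one run number per line (the housekeeping line 'rm -r /scratch/gilfoyle/e5/24*' suggests the E5 run numbers begin with 24). A minimal sketch of such a file, using hypothetical run numbers rather than actual E5 runs:

24012
24013
24017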
Script called by the one above to run each job on a slave node.
#!/usr/bin/perl
#
# script for running on a slave node of the cluster.
# called from submit_eod3d.pl - gpg 12/03/02
#
# set up the environment including the node and run
# number assignments that come from the arguments of the
# script.
@option  = @ARGV;
$options = @option;
$NODE    = $option[0];
$RUNNO   = $option[1];
$RUNFILE = "/home/gilfoyle/eod/run/files_4.232/run$RUNNO\_files.dat";
# set ROOTSYS here so this script and the commands it launches inherit it
# (a csh-style 'setenv' run through system() would not persist).
$ENV{ROOTSYS} = "/usr/root/PRO";
# the following statement was commented out early on because the CLAS
# software was not ready and it seems we do not need these environment
# variables.
#system("source /home/clas/builds/PRODUCTION/packages/cms/rich.cshrc PRODUCTION");
# Make a working directory and go to it.
# NOTE: make this on the master as well as a place holder;
# no actual data goes to the place holder on the master.
system("bpsh $NODE mkdir -p /scratch/gilfoyle/e5/$RUNNO/ > /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
system("mkdir -p /scratch/gilfoyle/e5/$RUNNO/ >> /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
system("cd /scratch/gilfoyle/e5/$RUNNO/");
# get the data filenames.
print "\nrun files: $RUNFILE, run number: $RUNNO.\n";
open(FILES, "$RUNFILE") || die "Failed to open filename file.\n";
@filenames = <FILES>;
close(FILES);
# make the slave directory and copy the data to it. do NOT use bpcp because it
# apparently copies the data twice according to steven james.
system("bpsh $NODE mkdir -p /scratch/gilfoyle/e5/$RUNNO/");
foreach $data_filename (@filenames) {
    chomp($data_filename);
    print "copy /data2/e5/root/4.232/$data_filename to $NODE:/scratch/gilfoyle/e5/$RUNNO/$data_filename.\n";
    system("bpsh $NODE cp /data2/e5/root/4.232/$data_filename /scratch/gilfoyle/e5/$RUNNO/$data_filename");
}
# copy the other necessary files to the slave directory. these files are small
# so bpcp is not a bottleneck (see previous comment).
system("bpcp $RUNFILE $NODE:/scratch/gilfoyle/e5/$RUNNO/data_filenames.dat");
system("bpcp /home/gilfoyle/eod/root/eod3.C $NODE:/scratch/gilfoyle/e5/$RUNNO/");
system("bpcp /home/gilfoyle/eod/root/eod3.h $NODE:/scratch/gilfoyle/e5/$RUNNO/");
system("bpcp /home/gilfoyle/eod/root/user_eod3.h $NODE:/scratch/gilfoyle/e5/$RUNNO/");
system("bpcp /home/gilfoyle/eod/root/run_eod3.C $NODE:/scratch/gilfoyle/e5/$RUNNO/");
# More housekeeping and monitoring information.
system("cd /scratch/gilfoyle/e5/$RUNNO/");
chdir("/scratch/gilfoyle/e5/$RUNNO/");
system("rm /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
system("date >> /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
system("pwd >> /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
# run root.
system("echo 'run root.' >> /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
system("bpsh $NODE root -b -q /scratch/gilfoyle/e5/$RUNNO/run_eod3.C >> /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
system("date >> /scratch/gilfoyle/e5/eod3_log${NODE}_run${RUNNO}");
# Copy the histogram file back to the master and clean up the slave directory.
# Do this here instead of in submit_eod3d.pl so that root is done before
# we start cleaning up.
system("bpcp ${NODE}:/scratch/gilfoyle/e5/${RUNNO}/eod3_hists.root /home/gilfoyle/eod/run/results/run${RUNNO}_results.root");
chdir("/home/gilfoyle/eod/run/");
system("bpsh ${NODE} rm -r /scratch/gilfoyle/e5/$RUNNO/");
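For testing a single job, run_root_on_node3.pl can also be invoked by hand on the master with the slave node number and the run number as its two arguments, exactly as submit_eod3d.pl writes them into run_job. The node and run number below are hypothetical placeholders:

./run_root_on_node3.pl 5 24012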