Spaces:
Runtime error
Runtime error
#!/usr/bin/env perl | |
use warnings; #sed replacement for -w perl parameter | |
use Cwd; | |
use File::Basename; | |
# This program is like run.pl except rather than just running on a local
# machine, it can be configured to run on remote machines via ssh.
# It requires that you have set up passwordless access to those machines,
# and that Kaldi is running from a location that is accessible via the
# same path on those machines (presumably via an NFS mount).
#
# It looks for a file .queue/machines that should have, on each line, the name
# of a machine that you can ssh to (which may include this machine). It doesn't
# have to be a fully qualified name.
#
# Later we may extend this so that on each line of .queue/machines you
# can specify various resources that each machine has, such as how
# many slots and how much memory, and make it wait if machines are
# busy. But for now it simply ssh's to a machine from those in the list.
# The command-line interface of this program is the same as run.pl;
# see run.pl for more information about the usage.
# ---- Command-line parsing ----
# Accepted form:
#   ssh.pl [qsub-style options] [JOB=<start>:<end> | JOB=<n>] log-file command...
# Sets the script-wide globals $jobstart, $jobend, $qsub_opts and (only when a
# job spec is given) $jobname.  On exit @ARGV holds the log file followed by
# the command to run.
my $usage = "usage: ssh.pl log-file command-line arguments...";
@ARGV < 2 && die $usage;

$jobstart = 1;
$jobend = 1;
$qsub_opts = "";  # These will be ignored; collected only for the warning below.

# First discard any leading options that would normally be given to
# queue.pl (and passed on to qsub); here they have no effect.
while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
  my $switch = shift @ARGV;
  if ($switch eq "-V") {
    $qsub_opts .= "-V ";  # -V is handled as taking no argument.
  } else {
    my $option = shift @ARGV;
    if ($switch eq "-sync" && $option =~ m/^[yY]/) {
      $qsub_opts .= "-sync ";  # Note: in the corresponding code in queue.pl
                               # it says instead, just "$sync = 1;".
    }
    $qsub_opts .= "$switch $option ";
    if ($switch eq "-pe") {  # e.g. -pe smp 5: consumes a second argument.
      my $option2 = shift @ARGV;
      $qsub_opts .= "$option2 " if defined $option2;
    }
  }
}
# The options may have consumed everything; a log file and a command must
# remain, otherwise $ARGV[0] below would be undefined.
@ARGV < 2 && die $usage;

if ($ARGV[0] =~ m/^(\w+)=(\d+):(\d+)$/) {  # e.g. JOB=1:10
  ($jobname, $jobstart, $jobend) = ($1, $2, $3);
  # Remember the spec for the error messages: once it has been shifted off,
  # $ARGV[0] is the log file, not the job range.
  my $jobspec = shift @ARGV;
  if ($jobstart > $jobend) {
    die "ssh.pl: invalid job range $jobspec";
  }
  if ($jobstart <= 0) {
    die "ssh.pl: invalid job range $jobspec, start must be strictly positive (this is required for GridEngine compatibility)";
  }
} elsif ($ARGV[0] =~ m/^(\w+)=(\d+)$/) {  # e.g. JOB=1.
  ($jobname, $jobstart, $jobend) = ($1, $2, $2);
  shift @ARGV;
} elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
  print STDERR "Warning: suspicious first argument to ssh.pl: $ARGV[0]\n";
}
# A job spec, if present, must still be followed by a log file and a command.
@ARGV < 2 && die $usage;

if ($qsub_opts ne "") {
  print STDERR "Warning: ssh.pl ignoring options \"$qsub_opts\"\n";
}
{ # Read .queue/machines into the global @machines, one machine name per line.
  # Exits with status 1 if the file cannot be opened; dies on a malformed line
  # or if no machine is listed.
  my $fh;
  if (!open($fh, "<", ".queue/machines")) {
    print STDERR "ssh.pl: expected the file .queue/machines to exist.\n";
    exit(1);
  }
  @machines = ();
  while (my $line = <$fh>) {
    # chomp, not chop: chop would remove the last character of the final
    # machine name when the file does not end with a newline.
    chomp $line;
    my @fields = split(" ", $line);
    next if @fields == 0;  # skip empty and whitespace-only lines.
    if (@fields != 1) {
      die "ssh.pl: bad line '$line' in .queue/machines.";
    }
    # Note: this only checks how the name starts; the pattern is not
    # anchored at the end.
    if ($fields[0] !~ m/^[a-z0-9\.\-]+/) {
      die "ssh.pl: invalid machine name '$fields[0]'";
    }
    push @machines, $fields[0];
  }
  close($fh);
  if (@machines == 0) { die "ssh.pl: no machines listed in .queue/machines"; }
}
# Returns the sum of the purely numeric dot-separated fields of a log file's
# base name, e.g. 9 for "exp/log/foo.9.log".  Returns 0 when there are no such
# fields or when no name is given.  Directory components are ignored.
sub _logfile_offset {
  my ($path) = @_;
  return 0 if !defined $path || $path eq "";
  my $total = 0;
  # The separator has to be the regex /\./.  A string "." is compiled as the
  # pattern /./, which matches every character, so split would return no
  # fields at all and the offset would always be 0.
  foreach my $field (split(/\./, basename($path))) {
    if ($field =~ m/^\d+$/) { $total += $field; }
  }
  return $total;
}

# The first remaining argument is the log file; the rest is the command.
$logfile = shift @ARGV;

# With a job range of more than one job, the log file name must contain the
# job name so that each job gets its own log.
if (defined $jobname && $logfile !~ m/$jobname/ &&
    $jobend > $jobstart) {
  print STDERR "ssh.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

# $offset will be an offset added to any index from the job-id specified if
# the user does JOB=1:10.  The main point of this is that there are instances
# where a script will manually submit a number of jobs to the queue, e.g. with
# log files foo.1.log, foo.2.log and so on, and we don't want all of these to
# go to the first machine.  If $logfile looks like foo.9.log, $offset is 9.
$offset = _logfile_offset($logfile);

# Build the command string that will be handed to bash on the remote machine.
$cmd = "";
foreach my $arg (@ARGV) {
  if ($arg =~ m/^\S+$/) { $cmd .= $arg . " "; }   # no whitespace: pass through unquoted.
  elsif ($arg =~ m:\":) { $cmd .= "'$arg' "; }    # contains a double quote: use single quotes.
  else { $cmd .= "\"$arg\" "; }                   # otherwise wrap in double quotes.
}
# Fork one child process per job id.  Each child pipes a short bash script
# into "ssh <machine> bash" and exits with status 0 if the remote script
# succeeded, 1 otherwise.  The parent does not wait inside this loop.
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  $childpid = fork();
  if (!defined $childpid) { die "Error forking in ssh.pl (writing to $logfile)"; }
  if ($childpid == 0) {
    # We're in the child... this branch executes the job and returns (possibly
    # with an error status).
    if (defined $jobname) {
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    my $machine;
    { # work out the machine to ssh to.
      # subtract 1 since jobs never start from 0; we'd like the first job
      # to normally run on the first machine.
      my $local_offset = $offset + $jobid - 1;
      my $num_machines = scalar @machines;
      # the "+ $num_machines" is in case $local_offset is negative, to ensure
      # the modulus is calculated in the mathematical way, not in the C way
      # where (negative number % positive number) is negative.
      my $machines_index = ($local_offset + $num_machines) % $num_machines;
      $machine = $machines[$machines_index];
    }
    # List-form pipe open: the machine name is passed to ssh as a single
    # argument and is never interpreted by a local shell.
    my $ssh;
    if (!open($ssh, "|-", "ssh", $machine, "bash")) {
      print STDERR "ssh.pl failed to ssh to $machine\n";
      exit(1);  # exits from the forked process within ssh.pl.
    }
    my $cwd = getcwd();
    my $logdir = dirname($logfile);
    # Below, we're printing into ssh which has opened a bash session; these are
    # bash commands.
    print $ssh "set -e\n";  # if any of the later commands fails, we want it to exit.
    print $ssh "cd $cwd\n";
    print $ssh ". ./path.sh\n";
    print $ssh "mkdir -p $logdir\n";
    print $ssh "time1=\`date +\"%s\"\`\n";
    print $ssh "( echo '#' Running on \`hostname\`\n";
    print $ssh "  echo '#' Started at \`date\`\n";
    # The here-document delimiter is quoted so that bash copies the command
    # into the log literally; unquoted, bash would expand variables and run
    # any command substitutions in it while merely logging it.
    print $ssh "  echo -n '# '; cat <<'EOF'\n";
    print $ssh "$cmd\n";
    print $ssh "EOF\n";
    print $ssh ") >$logfile\n";
    print $ssh "set +e\n";  # we don't want bash to exit if the next line fails,
                            # since its status still has to be recorded.
    print $ssh "  ( $cmd ) 2>>$logfile >>$logfile\n";
    print $ssh "ret=\$?\n";
    print $ssh "set -e\n";  # back into mode where it will exit on error.
    print $ssh "time2=\`date +\"%s\"\`\n";
    print $ssh "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=1 >>$logfile\n";
    print $ssh "echo '#' Finished at \`date\` with status \$ret >>$logfile\n";
    print $ssh "exit \$ret";  # return with the status the command exited with.
    my $close_ok = close($ssh);
    my $ssh_return_status = $?;
    # see http://perldoc.perl.org/functions/close.html for explanation of return
    # status of close() and the variables it sets.
    if (!$close_ok && $! != 0) { die "ssh.pl: unexpected problem ssh'ing to machine $machine"; }
    if ($ssh_return_status != 0) { exit(1); }  # exit with error status from this forked process.
    else { exit(0); }  # else exit with non-error status.
  }
}
# Collect one child per job, count the failures, report where the logs are,
# and exit with status 0 if every job succeeded or 1 otherwise.
$ret = 0;
$numfail = 0;
for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  my $r = wait();
  if ($r == -1) { die "Error waiting for child process"; }  # should never happen.
  if ($? != 0) { $numfail++; $ret = 1; }  # The child process failed.
}
if ($ret != 0) {
  my $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) {
    if (defined $jobname) {
      # only one numbered job, so replace name with that job.
      $logfile =~ s/$jobname/$jobstart/;
    }
    print STDERR "ssh.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      # "JOB" is still in the log name, so no JOB=... spec replaced it.
      print STDERR "ssh.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  } else {
    # Several jobs: show the log names as a glob covering all of them.
    $logfile =~ s/$jobname/*/g;
    print STDERR "ssh.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}
exit ($ret);