|
Home | Switchboard | Unix Administration | Red Hat | TCP/IP Networks | Neoliberalism | Toxic Managers |
(slightly skeptical) Educational society promoting "Back to basics" movement against IT overcomplexity and bastardization of classic Unix |
|
The ql command parses a lot of SGE data and outputs an overall picture of the cluster load:
jbp@head1 [ 82 ] % ql
12 nodes are down
102 total load (rounded up: 141)
10 nodes have a load avg of 0.50 to 1.00
14 nodes have a load avg of 1.50 to 2.00
|
In this case, the system is about half-loaded – there are 232 CPUs available in the system and 102 of them are being used. Since the 'rounded up' number is slightly higher than the total load, this indicates that some of the jobs are running at higher load than they "should" be. If we look at the next block of information, we see that 30 nodes have a load of 2.00 to 2.50, but we know that each machine has only 2 CPUs, so some of those machines are actually over-loaded. However, looking at this information, we can see that 130 CPUs are unused and thus if we wanted to launch a large parallel job, it is likely that we would be able to do so without waiting in the queue for very long.
#!/usr/bin/perl
#
# (C) 2004-2009, John Pormann, Duke University
# [email protected]
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# RCSID: $Id: ql,v 1.15 2006/08/09 14:50:39 jbp1 Exp jbp1 $
#
# ql - parse SGE queue info into overall system load info
#
# Flow: parse options, let process_sge_info() fill %hostinfo/%jobinfo,
# aggregate per-host numbers into totals and histograms, print the totals,
# then hand off to one of the print_* report routines.
#
# All variables below are deliberately package globals (no 'my'): the
# subroutines later in this file read %hostinfo, %jobinfo, $round_l,
# $tot_up and the *_hist arrays directly.

use Getopt::Std;

# Getopt::Std stores each recognised switch in the package variable $opt_<x>.
getopts('vVr:hHjT:');

if ( defined($opt_h) ) {
    # The synopsis lists exactly the switches handed to getopts() above.
    print "usage: ql [-h] [-v] [-V] [-j] [-H] [-r #] [-T #]\n"
        . " -j show job-load\n"
        . " -H show histogram\n"
        . " -r # sets rounding parameter\n"
        . " -T # list hosts running at least # jobs\n"
        . " -v verbose output\n"
        . " -V really verbose output\n"
        . " -h show this help\n";
    exit( 1 );
}

# Width of one load-average histogram bucket (default 0.5).
if ( not defined($opt_r) ) {
    $round_l = 0.5;
}
else {
    $round_l = $opt_r + 0;
    # A zero, negative or non-numeric width would make 1.0/$round_l blow up
    # (or produce negative bucket indexes), so reject it up front.
    if ( $round_l <= 0 ) {
        die "ql: -r requires a positive number (got '$opt_r')\n";
    }
}
$inv_round_l = 1.0 / $round_l;

# host => "ARCH!NCPU!LOAD!...|job!prio|job!prio..."
%hostinfo = ();
# job  => "host!prio!field!field...|host!prio!..."
%jobinfo = ();

process_sge_info();

if ( defined($opt_V) ) {
    # Raw dump of everything that was parsed.
    foreach $k ( keys(%hostinfo) ) {
        $v     = $hostinfo{$k};
        @jlist = split( /\|/, $v );
        $x     = scalar(@jlist) - 1;    # first chunk is the host record itself
        print "host [$k] [$x][$v]\n";
    }
    foreach $k ( keys(%jobinfo) ) {
        $v = $jobinfo{$k};
        print "job [$k] [$v]\n";
    }
}

@loadavg_hist = ();    # index = load-average bucket, value = number of hosts
@loadpct_hist = ();    # declared but never filled by the code in this section
@jobload_hist = ();    # index = jobs on a host,     value = number of hosts

$tot_mach  = 0;
$tot_cpus  = 0;
$tot_load  = 0;        # sum of the loads on the machines
$tot_rload = 0;        # sum of the loads on the machines (rounded up)
$tot_jload = 0;        # sum of the job-loads on the machines
$tot_down  = 0;        # any 'down' machines?
$tot_up    = 0;        # how many are up?

foreach $host ( keys(%hostinfo) ) {
    $data = $hostinfo{$host};
    # After the hostname is stripped by the parser the fields are
    # [0]=ARCH, [1]=NCPU, [2]=LOAD (per the qhost header line it matches).
    @list = split( '!', $data );
    $tot_mach++;

    if ( ($list[0] eq '-') or ($list[2] eq '-') ) {
        # A '-' in place of a value marks a host that is not reporting.
        if ( defined($opt_V) ) {
            print "* host [$host] is down\n";
        }
        $tot_down++;
        next;
    }

    $tot_up++;
    $tot_cpus  += $list[1];
    $tot_load  += $list[2];
    $tot_rload += int( $list[2] + 0.99 );

    # Every "|job!prio" suffix appended by the parser is one job on this host.
    @jlist = split( /\|/, $data );
    $x     = scalar(@jlist) - 1;
    $tot_jload += $x;
    $jobload_hist[$x]++;

    # Load as a percentage of the CPU count, rounded to the bucket width.
    # NOTE(review): $lp is computed but not consumed anywhere in this section.
    if ( ($list[1] + 0) > 0 ) {
        $lp = 100 * $list[2] / $list[1];
        $lp = int( $lp / $round_l + 0.5 ) * $round_l;
    }
    else {
        $lp = 'inf';
    }

    # Bucket i is reported as the range [i*$round_l, (i+1)*$round_l), so the
    # index must be the floor of load/$round_l.  Rounding to nearest (the old
    # "+ 0.5") filed e.g. a load of 0.30 under "0.50 to 1.00".  The tiny
    # epsilon only absorbs binary floating-point error on exact boundaries.
    $la = int( $list[2] * $inv_round_l + 1e-9 );
    $loadavg_hist[$la]++;

    if ( defined($opt_T) and $x >= ($opt_T + 0) ) {
        print "host [$host] has [$x] jobs [$data]\n";
    }
}

printf "\t%4d total hosts\n", $tot_mach;
if ( $tot_down > 0 ) {
    printf "\t%4d hosts are down\n", $tot_down;
}
printf "\t%4d total number of cpus\n", $tot_cpus;
printf "\t%4d total load (rounded up: %d)\n", $tot_load, $tot_rload;
print "\n";

# Pick the report: -j selects job-load, -H selects histogram form.
if ( defined($opt_j) and defined($opt_H) ) {
    print_jobhist();
}
elsif ( defined($opt_j) ) {
    print_jobload();
}
elsif ( defined($opt_H) ) {
    print_loadhist();
}
else {
    print_loadavg();
}

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# process_sge_info - run "/usr/bin/qhost -q -j" and record what it prints.
#
# Fills two package-global hashes:
#   %hostinfo{host} = host fields joined with '!', followed by one
#                     "|job!prio" suffix per job line seen under that host
#   %jobinfo{job}   = "host!prio!fields..." records, '|'-separated when the
#                     same job id shows up more than once
# The pseudo-host 'global' is removed before returning.
# Takes no arguments, returns nothing; dies if qhost cannot be started.
sub process_sge_info {
    # $prio starts as '' so records built before any high/low line is seen
    # interpolate an empty priority instead of an undefined value.
    my ( $host, $job, $prio ) = ( '', -1, '' );
    my @list;

    # List-form pipe open: no shell involved, and a failure is reported
    # instead of silently reading nothing.
    open( my $fp, '-|', '/usr/bin/qhost', '-q', '-j' )
        or die "ql: cannot run /usr/bin/qhost: $!\n";

    # The loop has to read from the pipe; a bare "while()" never reads a
    # line and never terminates.
    while ( my $line = <$fp> ) {
        chomp( $line );

        if ( $line =~ m/HOSTNAME\s+ARCH\s+NCPU/ ) {
            # header line, skip it
        }
        elsif ( $line =~ m/job\-ID\s+prior\s+name/ ) {
            # header line, skip it
        }
        elsif ( $line =~ m/\-{20,}/ ) {
            # separator line, skip it
        }
        elsif ( $line =~ m/^\w/ ) {
            # first char is a letter, so it is a host
            @list = split( /\s+/, $line );
            $host = shift( @list );
            $hostinfo{$host} = join( '!', @list );
        }
        elsif ( $line =~ m/^\s+\d/ ) {
            # first chars are spaces, then digit, so it is a job num
            $line =~ s/^\s+//;
            @list = split( /\s+/, $line );
            $job  = shift( @list );
            my $record = "$host!$prio!" . join( '!', @list );
            if ( exists( $jobinfo{$job} ) ) {
                $jobinfo{$job} .= "|$record";
            }
            else {
                $jobinfo{$job} = $record;
            }
            $hostinfo{$host} .= "|$job!$prio";
        }
        elsif ( $line =~ m/^\s+high/ ) {
            # must be tested before the generic "spaces then word" case below
            $prio = 'hi';
        }
        elsif ( $line =~ m/^\s+low/ ) {
            $prio = 'lo';
        }
        elsif ( $line =~ m/^\s+\w/ ) {
            # first chars are spaces, then letter, so it is a continuation
            # of a job
            $line =~ s/^\s+//;
            @list = split( /\s+/, $line );
            $jobinfo{$job} .= "|$host!$prio!" . join( '!', @list );
            $hostinfo{$host} .= "|$job!$prio";
        }
        # anything else is ignored
    }

    # For a pipe, close() also reports a non-zero exit status of the child.
    close( $fp )
        or warn "ql: /usr/bin/qhost did not exit cleanly (status $?)\n";

    # get rid of 'global' item
    delete( $hostinfo{global} );

    return;
}

# print_jobload - one line per non-empty bucket of @jobload_hist
# (index = number of jobs on a host, value = number of such hosts).
sub print_jobload {
    for my $i ( 0 .. $#jobload_hist ) {
        my $nodes = $jobload_hist[$i] || 0;    # unused slots are undef
        next unless $nodes > 0;
        printf "\t%4d nodes have a job-load of %5.2f to %5.2f\n",
            $nodes, $i, $i + 0.99;
    }
    return;
}

# print_jobhist - @jobload_hist as a bar chart scaled to 50 columns
# (50 '#' == every host that is up, i.e. one '#' per 2%).
sub print_jobhist {
    # Nothing to scale against (and 50/0 would die) when no host is up.
    return if ( $tot_up || 0 ) <= 0;

    my $bars_per_node = 50 / $tot_up;
    my $segment       = ' ' x 9 . '|';

    # The format has a single placeholder: the number of nodes one '#'
    # stands for.  Passing an extra leading value printed the wrong number.
    printf " one '#' is 2%% or %.1f nodes\n", 1 / $bars_per_node;
    print " nds : job-load : |$segment$segment$segment$segment${segment}100%\n";

    # NOTE(review): the body of this loop was missing from the damaged
    # source; the row layout is reconstructed to line up with the header
    # above - confirm against the original script.
    for my $i ( 0 .. $#jobload_hist ) {
        my $nodes = $jobload_hist[$i] || 0;
        next unless $nodes > 0;
        my $width = int( $nodes * $bars_per_node + 0.5 );
        printf " %4d : %8d : |%s\n", $nodes, $i, '#' x $width;
    }
    return;
}

# print_loadhist - @loadavg_hist as a bar chart scaled to 50 columns.
#
# NOTE(review): the main script calls this routine for -H, but its
# definition was entirely missing from the damaged source.  This is a
# reconstruction modelled on print_jobhist; the header text and row layout
# are assumptions - confirm against the original script.
sub print_loadhist {
    return if ( $tot_up || 0 ) <= 0;

    my $bars_per_node = 50 / $tot_up;
    my $segment       = ' ' x 9 . '|';

    printf " one '#' is 2%% or %.1f nodes\n", 1 / $bars_per_node;
    print " nds : load-avg : |$segment$segment$segment$segment${segment}100%\n";

    for my $i ( 0 .. $#loadavg_hist ) {
        my $nodes = $loadavg_hist[$i] || 0;
        next unless $nodes > 0;
        my $width = int( $nodes * $bars_per_node + 0.5 );
        printf " %4d : %8.2f : |%s\n", $nodes, $i * $round_l, '#' x $width;
    }
    return;
}

# print_loadavg - one line per non-empty bucket of @loadavg_hist; bucket i
# is labelled as the load range i*$round_l .. (i+1)*$round_l.
sub print_loadavg {
    for my $i ( 0 .. $#loadavg_hist ) {
        my $nodes = $loadavg_hist[$i] || 0;
        next unless $nodes > 0;
        my $low  = $i * $round_l;
        my $high = $low + $round_l;
        printf "\t%4d nodes have a load avg of %5.2f to %5.2f\n",
            $nodes, $low, $high;
    }
    return;
}