#!/usr/bin/perl -w # # $Id: //websites/unixwiz/unixwiz.net/webroot/evo/evo-patch-Jamaica#4 $ # # written by : Stephen J. Friedl # Software Consultant # Tustin, California USA # steve@unixwiz.net / www.unixwiz.net # # This program applies a series of patches to the various Evolution # data files. It's designed to use multiple processors if available # (it scales well with more), and generally manage the whole process # well. # # NOTE: This has been customized for the Irasburg->Jamaica upgrade # # TASK MANAGER # ------------ # # We've written this program to be more generic than necessary: it's # pretty much a child subprocess ("thread") manager in perl, and it # was an offshoot of our evo-reload script. The idea is that we first # make a list of all the work to do, then spawn children to process # them. # # All the "work to do" is represented by a "work object", and this # is an anon hashref that contains all the important values that # show what has to be done. This includes the input parameters # (supplied by the caller), plus the runtime parameters used # while managing the task. # # The main two input parameters are: # # $ref->{dbfile} - the name of the Firebird ".gdb" file # to be patched. This file MUST exist when the # program starts or the work request will be # rejected. # # $ref->{patchfile} - the name of the ".sql" file used to # apply the patches. This likewise must exist. # # Plus at runtime there are a few others: # # $ref->{pid} - process ID of the child while running # # $ref->{status} - exit status from the process # # $ref->{starttime} - UNIX time that the process started # # $ref->{runtime} - total seconds for the process to run # # ESTIMATED TIME TO COMPLETION # ---------------------------- # # Patching can be a very slow process, and it's annoying to not have # any idea when it will finish: so the program tries to make a good # guess. # # Each task is timed, and when each one finishes, an average time per # task can be calculated. 
# This time is then applied to all the jobs
# left to process to get an estimated finish time.
#
# This is imprecise for a couple of reasons:
#
#  1) if a few non-CL files are patched at the start of the run, this
#     can throw off the calculations a lot. We've thought about
#     excluding these "extra" patch tasks from the calculations.
#
#  2) bigger CL_ files take longer to patch than others, and this
#     calculation doesn't take that into account. We've actually
#     tried a version that did, and it didn't help at all.
#
# It usually takes a while before the "estimated time" starts to
# converge on something accurate.
#
# TAILORING FOR A NEW PATCH SYSTEM
# --------------------------------
#
# There are basically two areas that have to be updated when modifying
# this for a new set of patches.
#
#  1) find the section "BUILD WORK LIST" which decides which files are
#     to be patched, and which .sql files are used to do it. There are
#     command-line parameters that can guide the program's selection of
#     work (such as --clients-only), but this must be tuned for each
#     set of patches.
#
#  2) the "runpatch()" subroutine does the actual work, applying the
#     given patchfile to the given DB file.
#
# This program anticipates applying multiple patch files to each DB
# file: they're all collectively considered one task.
#
# LOGGING
# -------
#
# The standard error shows an ongoing log of what's happening, but this
# is in an imprecise format. The results of the actual SQL patch are
# directed into a temporary logfile that is grepped for failure
# messages, and any failure of a patch abandons the whole task.
#
# Once each actual isql patch command is finished, the contents of this
# temp logfile are appended to the per-patch logfile that's named for
# the patch itself (e.g., "CL.sql" -> "CL.log"). A small header is
# prepended to the patch output that shows the client file being
# patched, along with the time.
#
# If any errors occurred during patching, these files should be
# consulted to find out why.
#
# COMMAND LINE
# ------------
#
# --nthreads=N		Use N worker threads to apply the patches. One
#			should be used for each CPU on the system; the
#			default is the number of CPUs detected (or 1 if
#			detection fails). Note that these aren't really
#			"threads" (they're child processes), but this seems
#			to convey the sentiment we have in mind.
#
# --limit=N		Patch no more than N files: this is mainly used
#			for testing, and the default is to process them all.
#
# --clients-only	Patch only the CL_### files, not anything else.
#
# --dbpath=DIR		Look in DIR for the database files to patch. This
#			is REQUIRED because we don't want any chance of
#			patching the wrong files.
#
# --addpath=DIR		Add DIR to the runtime path, which is mainly meant
#			for the Firebird binaries path.
#
# --verbose		Show a bit more runtime info. This can be repeated to
#			get even more debug output, but the extra debugging
#			info is only really useful for testing the process
#			dispatching.
#
# --noexec		Show what would be run, but don't actually run it.
#
# --skip-reload		After patching, skip the backup/restore step. This
# --no-sweep		really shouldn't be used for patching real data, but
#			for testing it can make the patch go much more quickly.
#
# --exiterror		Stop patching if any of the tasks fail
#
# --patch-dotclients	Patch the .CL_* files too. Some service bureaus make
#			it a practice to rename deleted databases from
#			CL_123.gdb to .CL_123.gdb so they remain around, but
#			are excluded from all processing. They need to be
#			patched eventually, but it's probably not necessary
#			for testing, or at least not at first.
#
# --no-chown		Normally we perform a change-owner operation on all
#			the DB files so they are owned/grouped by "firebird",
#			as the patching process normally leaves them owned by
#			root. The --no-chown flag suppresses this operation.
# # --sysfiles-only Don't patch any CL files; S_BUREAU & TMP_TBLS only # # # SAMPLE RUN # ----------- # # Unlike previous patch programs, this does NO prompting of data. But # because there could be several parameters, and each run of patching # should be done the same way, it's prudent to put the command # required in a small shell script that's run every time. This allows # for consistent # # # runpatch # # cd /db/Jamaica-patches # # rm *.log 2>/dev/null # # time ./evo-patch-Jamaica --dbpath=. "$@" # # When running this via a remote network connection that could be # interrupted, it's best to put it in the background with the "nohup" # command: # # # cd /db # # # rm nohup.out # # # nohup sh runpatch & # # # tail -f nohup.out # # Now, "nohup.out" will contain all the output of the program,a nd the # "tail -f" allows you to watch it run. It's OK to log off of the system # and come back, as the patch process is immune from being killed by a # hangup. # # HISTORY # ------- # # 3.0.0 - started rebuilding for Irasburg # 3.1.0 - 2006/04/20; first release for Iraburg # 3.1.1 - 2006/04/24; added chown-to-firebird-user option # 3.1.2 - 2006/04/24; moved around all the DB user access # 4.0.0 - 2006/11/27; updated for Jamaica # 4.0.1 - 2006/11/30; added start/end timings # 4.0.2 - 2006/12/15; refined the timing reporting # 4.0.3 - 2006/12/15; added --sysfiles-only # 4.0.4 - 2006/12/29; Fixed CPU detection for Red Hat 8 # use strict; use English; my $version = "4.0.4"; # 2.x = Garfield # 3.x = Irasburg # 4.x = Jamaica #-------------------------------------------------------------- # GLOBALS # my $dbpath = undef; my $addpath = "/opt/interbase/bin/"; # These define the credentials for the various operations: patch, backup, and restore. 
my $patchuserinfo = " -user SYSDBA -pass pps97"; my $backupuserinfo = " -user SYSDBA -pass pps97"; my $restoreuserinfo = " -user EUSER -pass pps97"; my $exit_on_error = 0; # bail on first error my $verbose = 0; # show individual commands my $clients_only = 0; my $patch_dotclients = 0; my $skip_reload = 0; my $sysfiles_only = 0; my $noexec = 0; my $limit = 0; my $nochown = 0; my $continue; $0 =~ s|..*/||; # tail pathname ( my $nthreads = `grep '^processor' /proc/cpuinfo | wc -l` ) =~ s/\s+//g; $nthreads = 1 if not $nthreads; print "$0 version $version; provided by www.unixwiz.net/evo/\n\n"; foreach ( @ARGV ) { if ( m/^--help/ ) { print STDERR < $addpath is already in \$PATH\n" if $verbose; } else { print "--> adding $addpath to \$PATH\n" if $verbose; $ENV{'PATH'} = "$addpath:$path"; } } # ------------------------------------------------------------------------ # Apply all the SQL patches to the various DB files. We typically process # a few of the "base" system files, then run all the client files. This # is normally highly customized for each release. # my @WORKLIST = (); sub addwork { my $dbfile = shift; my @patchfiles = @_; if ( not $dbfile ) { die "ERROR: cannot add $dbfile to worklist - not found\n"; } # make sure all patch files are found foreach my $patchfile ( @patchfiles ) { die "ERROR: can't find patch file $patchfile\n" if not -r $patchfile; } push @WORKLIST, { dbfile => $dbfile, patchfiles => [ @patchfiles ], pid => undef, status => 0, startime => 0, runtime => 0 }; } # # runwork() # # Given a ref to a work object, run everything that needs to be done. # This is the *generic* interface to the process-spawning system, # and its job is to call the *specific* function to do the real # work. In this case, it's "runpatch" called with the names of the # DB and patch files. # # NOTE: this is the generic wrapper function that's designed to be # called from the dispatcher, and it doesn't know much about the # actual job to be called. 
# sub runwork { my $wref = shift; my $rc = runpatch($wref->{dbfile}, @{ $wref->{patchfiles} }); sleep 1 if $rc == 0; # failure? print STDERR "runwork: returning $rc\n" if $verbose > 1; exit $rc; } my %CHILDLIST = (); my @DONELIST = (); # ------------------------------------------------------------------------ # BUILD WORK LIST # # Select all the files to patch. This can include the "central" files such # as S_BUREAU or TMP_TBLS, and probably all of the CL_xxx files too. For # each file, call "addwork()" with the DB file name and the name of the # patch file. # # ===TUNING: the name of the patch files can be applied here. # if ( not $clients_only ) { addwork($dbpath.'S_BUREAU.gdb', 'sb_8-0-0.sql'); addwork($dbpath.'TMP_TBLS.gdb', 'tmp_8-0-0.sql'); } my @CLIENTS = (); if ( not $sysfiles_only ) { push @CLIENTS, glob( $dbpath . 'CL_*.gdb' ); push @CLIENTS, glob( $dbpath . '.CL_*.gdb' ) if $patch_dotclients; } foreach my $clname ( @CLIENTS ) { # NOTE: at one point CL_BASE was not patched, but the current # Dorset arrangment does patch it. # next if $clname =~ m/CL_BASE/; # that's not patched addwork( $clname, 'cl_8-0-0.sql'); } # account for the limit, if any if ( $limit ) { print STDERR "--> limiting worklist to $limit items\n"; @WORKLIST = splice(@WORKLIST, 0, $limit); } # ------------------------------------------------------------------------ # WIERD STUFF # # In Red Hat 7.2 with perl 5.6, we needed to ignore SIGCHLD, but with # Red Hat 9 and perl 5.8, it causes the wait() syscall below to fail. # We don't know why this is the case, # # SO: to make this portable, we ignore the signal, but if the wait # call fails below, we un-set it. We really ought to figure out why # this is happening. 
# # my $SIGCHLD_save = $SIG{CHLD}; # $SIG{CHLD} = 'IGNORE'; my $totaltime = 0; my $totaljobs = 0; my @FAILLIST = (); while ( @WORKLIST or %CHILDLIST ) { my $did_stuff = 0; # print STDERR "(top of main work list)\n"; # ---------------------------------------------------------------- # SPAWN WORKER THREADS/PROCESSES # # As long as we have actual work to do, and the number of current # workers is less than #threads, spawn child processes to run # each patch. while ( (@WORKLIST > 0) and ( scalar(keys %CHILDLIST) < $nthreads ) ) { $did_stuff = 1; my $work = shift @WORKLIST; my $pid; if ( $pid = fork() ) { my $f = $work->{dbfile}; $work->{pid} = $pid; $work->{starttime} = time; print STDERR "(adding child $pid)\n" if $verbose > 1; $CHILDLIST{$pid} = $work; } elsif ( defined $pid ) { # in the child my $rc = runwork($work); print STDERR "child: exiting $rc\n" if $verbose > 1; exit $rc; } else { die "ERROR: cannot fork - bye\n"; } } # ---------------------------------------------------------------- # WAIT FOR CHILDREN # # If we got here, then the child table is full so we're waiting # for a child to show up. # # NOTE: different versions of perl/linux hav a problem with the # wait() call if ( scalar keys %CHILDLIST ) { my $pid = wait(); my $status = $?; # exit status from child if ( $pid <= 0 ) { print STDERR "NOTE: wait returned error $pid\n"; # undef $SIG{CHLD}; # maybe needed? 
sleep 1; next; } $did_stuff = 1; my $ref = $CHILDLIST{$pid}; if ( not defined $ref ) { print STDERR "INTERNAL ERROR: got unexpected pid $pid\n"; } else { $ref->{status} = $status; $ref->{runtime} = time - $ref->{starttime}; # seconds delete $CHILDLIST{$pid}; push @DONELIST, $ref; printf STDERR "Patch of %s finished, stat=%d (%ld secs)\n", $ref->{dbfile}, $ref->{status}, $ref->{runtime}; $totaltime += $ref->{runtime}; $totaljobs++; # compute estimated finish time my $jobs_remaining = scalar @WORKLIST + scalar keys %CHILDLIST; my $avg_per_job = ($totaltime / $totaljobs) / $nthreads; my $secs_remaining = int($jobs_remaining * $avg_per_job); my $endtime = time + $secs_remaining; my $hh = ( $secs_remaining / (60*60) ); my $mm = ( $secs_remaining / 60) % 60; my $ss = ( $secs_remaining % 60 ); printf STDERR "Remaining: %d jobs %02d:%02d:%02d (end at %s)\n", $jobs_remaining, $hh, $mm, $ss, scalar localtime($endtime); } # -------------------------------------------------------- # If the child failed, then we have to note this so we # can stop when all the current children exit. # if ( $status != 0 ) { push @FAILLIST, $ref->{dbfile}; @WORKLIST = () if $exit_on_error; } } # ---------------------------------------------------------------- # This is a sanity check: there should be no circumstance that # we burn through the whole loop without doing *anything*, but # if it happens we don't want to just loop like mad. So we report # the error and pause for a moment. This should not happen. # if ( not $did_stuff ) { print STDERR "(nothing to do in this loop)\n"; sleep 1; } } # ------------------------------------------------------------------------ # Now we're done patching, so we have to make sure that the files are all # owned by the firebird user and group (they default to root:root). Evo # won't run unless the files are owned by firebird. 
# runcmd("chown -R firebird:firebird $dbpath") unless $nochown; $endtime = time; my $npatched = @DONELIST; my $elapsed = $endtime - $starttime; my $patchrate = ($npatched > 0) ? ($elapsed / $npatched) : 0; printf "Ended at %s; elapsed = %d seconds; %d files; %.2f/file\n", scalar localtime $endtime, $elapsed, $npatched, $patchrate; if ( @FAILLIST ) { my $count = @FAILLIST; my $errors = ($count == 1) ? "error" : "errors"; print STDERR < $tmplog 2>&1") != 0 ) { $nerr = 1; } # -------------------------------------------------------- # If we're *actually* patching (and not --noexec), then we # have an output file to deal with. Open the file and suck # all the data from it, then unlink it. If the file contains # any words that suggest an error, bump up the error count. # # Then dump this per-file log to the per-patch log. # if ( not $noexec ) { local $/ = undef; open(LOG, $tmplog) || die "ERROR: cannot open temp patch file $tmplog\n"; my $logtext = ; close(LOG); unlink($tmplog); $nerr++ if $logtext =~ m/(fail|error)/i; # copy to the main log with a small header my $timestr = scalar localtime; $logtext = "\n" . "# patch $dbfile with $patchfile $timestr\n" . $logtext; open(LOG, ">>$logfile"); print LOG $logtext; close LOG; } if ( $nerr ) { print "\nERROR applying $patchfile to $dbfile\n"; return 1; # failure } } # ---------------------------------------------------------------- # BACKUP AND RESTORE # # Finally we must backup and restore the file to shrink it back # to normal size and then clean up. Note that --skip-reload on # the command line skips this whole step. 
# # IRASBURG/JAMAICA HACK: no backup + restore for TMP_TBLS # if ( not $skip_reload and not $dbfile =~ m/TMP_TBLS/ ) # IRASBURG HACK { ( my $gbkfile = $dbfile ) =~ s/\.gdb$/.gbk/; die "INTERNAL ERROR: gdb/gbk mismach error\n" if $gbkfile eq $dbfile; my $backup_cmd = "gbak"; $backup_cmd .= " -B"; # backup, not restore $backup_cmd .= " -t"; # transportable format $backup_cmd .= " -l"; # ignore transactions in limbo $backup_cmd .= " -g"; # no garbage collection # $backup_cmd .= " -ig"; # ignore bad checksums (huh?) $backup_cmd .= " $backupuserinfo"; $backup_cmd .= " $dbfile"; # SOURCE FILE $backup_cmd .= " $gbkfile"; # TARGET FILE my $restore_cmd = "gbak"; $restore_cmd .= " -r"; # restore # $restore_cmd .= " -c"; $restore_cmd .= " -p 8192"; # pagesize $restore_cmd .= " $restoreuserinfo"; $restore_cmd .= " $gbkfile"; # SOURCE FILE $restore_cmd .= " $dbfile"; # TARGET FILE if ( runcmd($backup_cmd) == 0 && runcmd($restore_cmd) == 0 ) { runcmd("rm $gbkfile"); } else { # ack! errors! $nerr++; } } return $nerr; } # # time_remaining # # Given a number of seconds, return the time in HH:MM:SS notation. # This is used to report the estimated completion of the client # loads. # sub time_remaining { my $nsecs = shift; my $ss = $nsecs % 60; $nsecs = int($nsecs / 60); my $mm = $nsecs % 60; $nsecs = int($nsecs / 60); my $hh = $nsecs; return sprintf("%02d:%02d:%02d", $hh, $mm, $ss ); } # # runcmd # # Given a command line, show it to the user and run it unless the user # has given the --noexec command-line parameter. The return value is # whatever the system() command runs, and if --noexec we presume success. # sub runcmd { my $cmd = join(" ", @_); my $rc = 0; print STDERR "--> $cmd\n" if $verbose; $rc = system($cmd) unless $noexec; if ( $rc != 0 ) { print STDERR "** Command {$cmd} returned $rc\n"; } return $rc; }