#!/usr/bin/env perl
# parsyncfp - wraps rsync with fpart-generated chunk files so that several
# rsyncs can run in parallel, suspending/resuming them to respect a load limit.
#
# NOTE(review): this file reached review with its line structure collapsed and
# with every '<...>' span stripped (heredoc openers, <STDIN>/<FH> reads, a URL,
# e-mail addresses).  Statement boundaries have been restored from the syntax.
# Every place where text was lost is marked "NOTE(review)" below and must be
# checked against the upstream parsyncfp source before release.

use strict;
use warnings;
use Getopt::Long;    # for std option handling: -h --yadda=badda, etc
use Socket;
use Env qw(HOME PATH);

# TODO
# - when catching up to a previous rsync, should cycle fast until it's all
#   caught up then drop back to regular periodicity.  Maybe, regardless of
#   requested cycle time, it cycles very fast for the 1st few to check that
#   it's caught up?
# - use this form for DEBUGs: if ($DEBUG){debug(__LINE__, "RPMLIST = [$rpmlist]\n")}
# - look into either writing a new script or expanding this one for another use;
#   first time or one-time data movement - ie no rsync capabilities needed.
#   uses fpart -L -> tar -> pigz -> nc/mbuffer -> network and then reverse.
#   the order to place data on the remote target.
# - refactor this code - what a mess.

use vars qw($allPIDs $ALL_SYS_RSYNC_PIDS $BAREFILES $ch $CHECKPERIOD $cmd $crr
  $CUR_FP_FLE $CUR_FPI $DATE $dcnt $DEBUG @DIRS @DIRS2SYNC $dirtmp $EMAIL
  $Filecnt %FILES $fl $fn $fnd2r $FOUT $FPART_LOGFILE $FPART_PID
  $FPART_RUNNING $FPARTSIZE $FPARTSIZE_N $FP_PIDFILE $FP_ROOT $FP_ROOT_DIR
  $FP_RUNNING $hdr_cnt $hdr_rpt $HELP $IF_SPEED $LOAD1mratio $loadavg
  $logfile $MAXBW $MAXLOAD $nbr_cur_fpc_fles $NBR_FP_FLES $NCPUs $NDIRS
  $NETIF $NOWAIT $NP $NP_chunk $parsync_dir $PARSYNCVER $PIDFILE $prev_cache
  $QUIET $rem_host $remote $rem_path $rem_user $REUSECACHE $rootdir $rPIDs
  $sPIDs $ROOTDIR $RSYNC_CMD $RSYNCOPTS $RSYNCS_GOING $STILLRSYNCS @SYSLOAD
  $TARGET $tmp $Totlsiz %UTILS $VERSION
);

# NOTE(review): the "||" on the author line is what is left of stripped
# '<...>' contact details; restore them from upstream.
$PARSYNCVER = <<"VERSION";
parsyncfp version 1.11 (beta) 10-09-2014
by Harry Mangalam ||

parsyncfp is a Perl script that wraps Andrew Tridgell's miraculous 'rsync' to
provide some load balancing and parallel operation across network connections
to increase the amount of bandwidth it can use.
The 'fp' variant uses 'fpart' to bypass the need for a full recursive descent
of the dir trees before the actual transfer starts.

VERSION

if ( !@ARGV ) { usage(); }    # in case someone doesn't know what to do.

# A mistyped option must stop the run: carrying on would start a long
# transfer with settings the user did not ask for.
GetOptions(
    "startdir=s"    => \$ROOTDIR,        # Have to be able to set rootdir -> SRC in rsync
    "rsyncopts=s"   => \$RSYNCOPTS,      # passthru to rsync as a string
    "NP=i"          => \$NP,             # number of rsync processes to start
    "chunksize=s"   => \$FPARTSIZE,      # the size that fpart chunks (allow PpTtGgMmKk)
    "reusecache!"   => \$REUSECACHE,     # dont re-read dirs, re-use existing ones.
    "checkperiod=i" => \$CHECKPERIOD,    # # of sec between system load checks
    "maxbw=i"       => \$MAXBW,          # max bw to use (--bwlimit=KBPS passthru to rsync)
    "maxload=f"     => \$MAXLOAD,        # max system load - if > this, sleep rsyncs
    "email=s"       => \$EMAIL,          # email to notify when finished
    "interface=s"   => \$NETIF,          # network interface to use if multiple ones
    "nowait!"       => \$NOWAIT,         # sleep a few s rather than wait for a user ack
    "help!"         => \$HELP,           # dump usage, tips
    "version!"      => \$VERSION,        # duh..
    "debug!"        => \$DEBUG,          # requests more developer-level info
) or die "\nFATAL: bad command line options. Try $0 --help for the built-in help.\n";

if ( !defined $QUIET ) { $QUIET = 0; }

## Set up run-permanent variables.
$DATE = `date +"%T_%F" | sed 's/:/./g' `;
chomp $DATE;
$parsync_dir = $HOME . "/.parsync";

# The options below are negatable ('!'), so test their truth, not definedness:
# '--noversion' / '--nohelp' / '--noreusecache' store a defined 0.
if ($VERSION) { print $PARSYNCVER; exit; }
if ( !defined $RSYNCOPTS ) { $RSYNCOPTS = ""; }
if ($HELP) { usage(); }
if ( !$REUSECACHE ) { undef $REUSECACHE; }

check_utils();    # check that the required utilities are on the system

### get the current system stats: #CPUs, load, bandwidth, etc
$NCPUs = `cat /proc/cpuinfo | grep processor | wc -l`;
chomp $NCPUs;
if ( !$NCPUs ) { $NCPUs = 1; }    # avoid dividing by zero below
$loadavg = `cat /proc/loadavg | tr -d '\n'`;
@SYSLOAD = split( /\s+/, $loadavg );    # 1st 3 fields are 1, 5, 15m loads

# so as long as the 1m load / NCPUs < 1, we're fine; if > 1, we may want to
# start throttling..
$LOAD1mratio = $SYSLOAD[0] / $NCPUs;

if ( !defined $NETIF ) { $NETIF = "eth0"; }

# $NETIF is interpolated into shell commands below; accept plain names only.
if ( $NETIF !~ /^[\w.:-]+$/ ) {
    die "\nFATAL: [$NETIF] doesn't look like a network interface name.\n";
}

# really 10s due to the 5s of network sampling
if    ( !defined $CHECKPERIOD || $CHECKPERIOD < 5 ) { $CHECKPERIOD = 5; }
elsif ( $CHECKPERIOD >= 5 )                          { $CHECKPERIOD -= 5; }

# round sqrt(NCPUs) (hyperthreaded if Intel) 8 -> 3
if ( !defined $NP ) { $NP = int( sqrt($NCPUs) + 0.5 ); }

if   ( !defined $MAXBW ) { $MAXBW = 1000000; }                    # essentially unlimited
else                     { $MAXBW = int( $MAXBW / $NP + 0.5 ); }  # users expect total maxbw; so have to divide by NP.
if ( !defined $MAXLOAD ) { $MAXLOAD = $NP + 2; }                  # + 1 for IO load
if ( !defined $ROOTDIR ) { $ROOTDIR = `pwd`; chomp $ROOTDIR; }    # where all dirs must be rooted.

# Default is 10G.  The byte count is always derived from the string so the two
# cannot disagree (the old hard-coded default was 100 MiB, not 10 GiB).
if ( !defined $FPARTSIZE ) { $FPARTSIZE = "10G"; }
if   ( $FPARTSIZE =~ /[PpTtGgMmKk]/ ) { $FPARTSIZE_N = ptgmk($FPARTSIZE); }
else                                  { $FPARTSIZE_N = $FPARTSIZE; }
if ($DEBUG) { debug( __LINE__, "FPARTSIZE = $FPARTSIZE\nFPARTSIZE_N = $FPARTSIZE_N" ); }

# get some network info - for the interface that was asked for, not a fixed one
$IF_SPEED = "";
if ( $NETIF =~ /eth/ ) {
    $IF_SPEED = `ethtool $NETIF 2> /dev/null | grep Speed | cut -f2 -d:`;
}
elsif ( $NETIF =~ /wlan/ ) {
    $IF_SPEED = `iwconfig $NETIF | grep -i quality`;
}
elsif ( $NETIF =~ /ib/ ) {
    $IF_SPEED = `ibstat | grep Rate | head -1 | sed -e 's/^[ \t]*//'`;
    $IF_SPEED = "IB:" . $IF_SPEED;
}
chomp $IF_SPEED;
if ($DEBUG) {
    print "\tEVAL: Using network interface [$NETIF] with connection quality [$IF_SPEED]\n\n";
}

if ( $SYSLOAD[0] < $MAXLOAD ) {
    if ($DEBUG) {
        print "\n\tEVAL: 1m load is [$SYSLOAD[0]] and the 1m Load:#CPU ratio is [$LOAD1mratio] ( [$NCPUs] CPU cores). OK to continue.\n ";
    }
}
else {
    print "\n!!WARN: 1m System load is > [$SYSLOAD[0]]. The 1m Load:#CPU ratio is [$LOAD1mratio].\n Continue? [Cntrl+C to interrupt; Enter to continue]\n ";
    pause();
}

# NOTE(review): everything from the heredoc that followed the 'ls -l' below up
# to the "$hdr_cnt" assignment was lost.  What follows is a minimal
# reconstruction derived from how the variables are used further down
# (the cache lives in "fpcache", chunk files are "<FP_ROOT>.<n>" and are
# globbed as "fpc.*").  The wording of the message and the exact file names of
# the PID/log files are NOT the originals - restore them from upstream.
if ( -d $parsync_dir ) {
    my $ls = `ls -l $parsync_dir`;
    print "\nWARN: the parsync dir [$parsync_dir] already exists and contains:\n$ls\n";
}
else {
    mkdir $parsync_dir or die "\nFATAL: Can't create [$parsync_dir]: $!\n";
}
$FP_ROOT_DIR = $parsync_dir . "/fpcache";
if ( !-d $FP_ROOT_DIR ) {
    mkdir $FP_ROOT_DIR or die "\nFATAL: Can't create [$FP_ROOT_DIR]: $!\n";
}
$FP_ROOT       = $FP_ROOT_DIR . "/fpc";                      # chunk files are fpc.0, fpc.1 ..
$PIDFILE       = $FP_ROOT_DIR . "/rsync-PIDs-" . $DATE;      # PIDs of the rsyncs we start
$FPART_LOGFILE = $FP_ROOT_DIR . "/fpart.log." . $DATE;
$FP_PIDFILE    = $FP_ROOT_DIR . "/FP_PIDFILE-" . $DATE;      # PID of the forked fpart
$hdr_rpt       = 20;                                         # reprint the header every N cycles
$hdr_cnt       = $hdr_rpt + 1;    # header count, > $hdr_rpt so it gets printed 1st time

# this takes care of the last ARGV so that all the rest of the words are
# target dirs&files
$TARGET = pop @ARGV;    # remote rsync target
if ( !defined $TARGET ) {
    die "\n\nXX FATAL XX: No target defined! Where you gonna put this stuff??!?\nTry $0 --help for the built-in help.\n";
}

# now process the dirs; only dirs/files should be left once getopt finishes.
$dcnt  = 0;
$fnd2r = "";    # zero the list of 'files 'n' dirs to rsync'
if (@ARGV) {
    # should work on explicitly named dirs as well as globs.
    for my $src (@ARGV) {
        $dirtmp = $ROOTDIR . '/' . $src;
        if ( !-r $dirtmp ) {    # quick check to see if its readable.
            print "WARN: [$dirtmp] isn't readable; either it's not where you think it\nis or you need to escalate your privs. Regardless, it won't be transferred in this run.\n";
        }
        else {    # otherwise, add the file to list to be chunked and transferred.
            $fnd2r .= $dirtmp . " ";
        }
    }
    @ARGV = ();
}
else {
    # If there are no files or dirs defined, take the start dir itself.
    # (The old code appended an un-chomped `pwd` to $ROOTDIR, which never
    # named a real path.)
    $fnd2r = $ROOTDIR . " ";
}

my @cachefiles = ();    # will populate with list of cachefiles to process together.

### TODO REUSECACHE
my $rsls = `ls -1 $FP_ROOT_DIR`;
my $glob = $FP_ROOT_DIR . "/fpc.*";
if ( $rsls =~ 'fpc' ) { $prev_cache = `ls -1 $glob`; }
elsif ( defined $REUSECACHE ) {
    print "!!WARN: You chose '--reusecache', but there's no files for it. Unsetting that option\n\n.";
    undef $REUSECACHE;
    sleep 1;
}

## This is the big REUSECACHE SECTION.  ONlY enter if want to REUSECACHE
if ( defined $REUSECACHE && -d $FP_ROOT_DIR ) {
    print <<"REUSE";
!!WARN: NOT GENERATING NEW CACHE; RE-USING ALL OF PREVIOUS CACHE.
This includes the following cache files from [$FP_ROOT_DIR]:
--------------------------------------------------------------------
$prev_cache
--------------------------------------------------------------------
If you want to ignore some of these cachefiles, delete them or move them
out of the way.
Hit [CTRL + C] to cancel or ..
REUSE
    if ($NOWAIT) {
        print " Actually... Not waiting. You have 5 sec to cancel.\n";
        sleep 5;
    }
    else { pause(); }

    # populate @cachefiles from the existing cachefiles; they're used as-is.
    @cachefiles = split( /\n/, $prev_cache );
}
else {    # forking the fpart partitioner
    # have to unlink all the old cache files first to prevent overlaps
    print "WARN: about to remove all the old chunkfiles you didn't want. You've got 5s to ^C to stop me!\nOtherwise, they're gone\n";
    sleep 5;
    unlink glob "$glob";
    print "WARN: OK, they're gone.. continuing\n";

    # make sure there's no other fpart's running.
    my $fparts_already_running = `ps aux | grep 'fpar[t]'`;
    chomp $fparts_already_running;
    if ( $fparts_already_running ne '' ) {
        print "WARN: one or more 'fpart's are already running:\n====\n[$fparts_already_running]\n====\nUnless you know that these fparts are valid and not left over from previous\nparsyncfp's, you should exit and kill them off before restarting this run.\nContinue? [Ny] ";
        my $ansr = <STDIN>;
        if ( defined $ansr && $ansr =~ /[yY]/ ) { print "\nOK, continuing\n"; }
        else { die "Fine - clear up the confusion and try again.\n" }
    }

    # The trailing echo captures the PID of the backgrounded fpart.
    my $cmd = "fpart -v -L -s $FPARTSIZE_N -o $FP_ROOT $fnd2r 2> $FPART_LOGFILE & echo \"\${!}\" > $FP_PIDFILE";
    if ($DEBUG) { print "DEBUG: fpart fork cmd:\n[$cmd]\n"; }

    my $child = fork;
    if ( !defined $child ) { die "\nFATAL: Can't fork fpart: $!\n"; }
    if ($child) {
        print "\nINFO: Forking fpart..\nCheck [$FPART_LOGFILE] for errors if it hangs.\n";
    }
    else {
        system "$cmd";
        exit(0);    # it's forked, now exit this stanza
    }

    # wait until the PID has actually been written, not just until the file
    # has been created.
    while ( !-s $FP_PIDFILE ) { sleep 1; }
    $FPART_PID = `cat $FP_PIDFILE`;
    chomp $FPART_PID;

    # fpart has been forked; wait for the first chunkfile before starting rsyncs
    my $waitcnt = 0;
    my $fp0     = $FP_ROOT . ".0";
    $NBR_FP_FLES = 0;
    while (1) {
        if ( -e $fp0 ) {
            print "INFO: [$fp0] visible.\n";
            $NBR_FP_FLES++;
            last;
        }
        # fpart gone and still nothing written: waiting any longer is futile.
        if ( !fpart_is_running() ) {
            die "\nFATAL: fpart exited without writing [$fp0]. Check [$FPART_LOGFILE].\n";
        }
        $waitcnt++;
        print "INFO: Waiting [$waitcnt] s for chunk files to be written\r";
        sleep 1;
    }
}    # forking the fpart partitioner

# start up NP rsyncs 1st, then cycle every CHECKPERIOD, checking # of rsyncs
# still going and starting new ones as needed until the chunkfiles are
# exhausted.
my $KEEPGOING = 1;
$NBR_FP_FLES  = count_chunkfiles();
$RSYNCS_GOING = $CUR_FPI = 0;    # $CUR_FPI = current FP index
$STILLRSYNCS  = 0;
print "\nINFO: Starting the 1st [$NP] rsyncs ..\n";
my $sc = 0;
while ( $RSYNCS_GOING < $NP && $KEEPGOING ) {
    $CUR_FP_FLE = $FP_ROOT . "." . $CUR_FPI;    # the current fp chunkfile
    if ( -e $CUR_FP_FLE ) {
        launch_rsync( $CUR_FP_FLE, $CUR_FPI );
        $CUR_FPI++;
        $RSYNCS_GOING++;
    }
    elsif ( !fpart_is_running() ) {
        # No more chunk files and fpart is done, so no more rsyncs to start.
        $KEEPGOING = 0;
    }
    else {
        # fpart is still going so wait for the next chunkfile to be finished.
        print "INFO: waiting [$sc]s for next chunkfile [$CUR_FP_FLE]\r";
        sleep 2;
        $sc += 2;
    }
}
print "\n\n";

# so at this point either we've loaded all the rsyncs up to NP or we've
# completely finished.  If the former, we keep launching rsyncs up to NP
# until we've used up all the fpart chunkfiles.
$sPIDs       = "";                    # suspended PIDs (space-separated string)
$rPIDs       = "";                    # running PIDs launched by parsync
$NBR_FP_FLES = count_chunkfiles();    # get current # of chunks

### FOLLOWING IS THE MAIN PARSYNC-FPART LOOP
$FP_RUNNING = fpart_is_running();
while ( $CUR_FPI < $NBR_FP_FLES || $FP_RUNNING || $STILLRSYNCS ) {

    # print the header
    if ( $hdr_cnt++ > $hdr_rpt ) {
        $hdr_cnt          = 0;
        $nbr_cur_fpc_fles = count_chunkfiles();
        print " Timestamp | 1m Load | BW [$NETIF] | Running PIDs || Suspended PIDs || Chunk [$CUR_FPI] of [$nbr_cur_fpc_fles]\n";
    }

    ( $rPIDs, $crr ) = get_rPIDs( $PIDFILE, $sPIDs );

    # now get load, bw, etc, and start rsyncs on new chunkfiles or suspend
    # them to load-balance
    $loadavg     = `cat /proc/loadavg | tr -d '\n'`;    # What's the system load?
    @SYSLOAD     = split( /\s+/, $loadavg );            # 1st 3 fields are 1, 5, 15m loads
    $LOAD1mratio = $SYSLOAD[0] / $NCPUs;

    # What's the 5s mean BW? (contributes 5s to periodicity of updates)
    # TODO: this can be improved by sampling in the background. (fork ifstat
    # to record continuously and just sample the last X samples.
    my $meanbw = `ifstat -i $NETIF 1 5 | tail -5 | cut -c9-19 | stats --quiet | grep Mean | cut -c 7-19`;
    chomp $meanbw;
    $meanbw =~ s/^\s+|\s+$//g;
    if ( $meanbw !~ /^-?\d*\.?\d+(?:[eE][-+]?\d+)?$/ ) { $meanbw = 0; }    # keep printf happy

    # print out current data with the date
    $rPIDs =~ s/^\s+|\s+$//g;
    $sPIDs =~ s/^\s+|\s+$//g;
    if ( $rPIDs eq "" ) {
        $rPIDs = "No current PIDs (used all chunk files); start more next cycle";
    }
    my $rDATE = `date +"%T_%F" | sed 's/:/./g' `;
    chomp $rDATE;
    printf "$rDATE %5.2f %12.2f [%s] || [%s]\n", $SYSLOAD[0], $meanbw, $rPIDs, $sPIDs;

    ### SUSPEND OR CONTINUE RSYNCS for LOADBALANCING
    # PIDs are handled as whole list elements throughout: removing them by
    # substring substitution would turn "1234" into "4" when suspending "123".
    my @suspended = pid_list($sPIDs);
    if ( $SYSLOAD[0] > $MAXLOAD ) {
        # suspend one PID per cycle; if the load stays high we keep
        # suspending until there's none left.
        if ($DEBUG) {
            print "\nDEBUG: System load [$SYSLOAD[0]] is > MAXLOAD [$MAXLOAD]. Will try to suspend a running rsync to shed load.\n";
        }
        my %is_suspended = map { $_ => 1 } @suspended;

        # signal 0 only tests that the process still exists
        my @running = grep { !$is_suspended{$_} && kill( 0, $_ ) } pid_list($rPIDs);
        if (@running) {
            my $victim = shift @running;
            if ($DEBUG) { print "\t\tDEBUG:got one: [$victim]; will now suspend it\n"; }
            kill 'STOP', $victim;
            $sPIDs = join( ' ', @suspended, $victim );    # transfer rPID to sPID.
            $rPIDs = join( ' ', @running );
        }
        else {    # there aren't any more PIDs left - all done or killed off.
            print "\tINFO: No more running rsync PIDs left [$rPIDs]. All rsyncs are suspended [$sPIDs].\n";
        }
    }
    elsif (@suspended) {    # if there are sPIDs, unsuspend them one by one
        my $pid = shift @suspended;
        if ($DEBUG) { print "\t\tDEBUG:got one: [$pid]; will now UNsuspend it\n"; }
        kill 'CONT', $pid;
        $rPIDs = join( ' ', pid_list($rPIDs), $pid );     # transfer sPID to rPID.
        $sPIDs = join( ' ', @suspended );
    }

    # and if neither of those conditions are met, then we can launch another rsync.
    elsif ( $crr < $NP ) {
        $CUR_FP_FLE = $FP_ROOT . "." . $CUR_FPI;

        # if fpart is still going, wait for the next chunkfile to show up.
        # fpart's state is re-read every pass so the wait ends when it exits.
        my $cfw = 0;
        while ( !-e $CUR_FP_FLE && fpart_is_running() ) {
            print "INFO: Waiting [$cfw] s for next chunkfile..\r";
            sleep 2;
            $cfw += 2;
        }

        ( $rPIDs, $crr ) = get_rPIDs( $PIDFILE, $sPIDs );
        my $R2SU = $NP - $crr;    # this is the number of rsyncs to start up
        for ( my $n = 0 ; $n < $R2SU ; $n++ ) {
            $nbr_cur_fpc_fles = count_chunkfiles();
            while ( $CUR_FPI >= $nbr_cur_fpc_fles && fpart_is_running() ) {
                print "INFO: Waiting for fpart to get ahead of the transfer\n";
                sleep 2;
                $nbr_cur_fpc_fles = count_chunkfiles();
            }

            # must be recomputed for every launch, or the same chunk is sent
            # twice and the following one is skipped.
            $CUR_FP_FLE = $FP_ROOT . "." . $CUR_FPI;
            last if !-e $CUR_FP_FLE;    # chunk files exhausted

            $nbr_cur_fpc_fles = count_chunkfiles();
            print "INFO: Starting another rsync with chunk file [", ( $CUR_FPI + 1 ), "] of [$nbr_cur_fpc_fles]\n";
            launch_rsync( $CUR_FP_FLE, $CUR_FPI );
            $CUR_FPI++;
        }
        ( $rPIDs, $crr ) = get_rPIDs( $PIDFILE, $sPIDs );
    }

    sleep $CHECKPERIOD;
    $NBR_FP_FLES = count_chunkfiles();    # get current # of chunks
    $FP_RUNNING  = fpart_is_running();    # refresh, or the loop never ends

    # suspended rsyncs still count: leaving now would abandon them stopped.
    $STILLRSYNCS = ( $rPIDs =~ /\d+/ || $sPIDs =~ /\d+/ ) ? 1 : 0;
}

my $host = `hostname`;
chomp $host;    # a newline here would split the mail command line
if ( defined $EMAIL ) {
    system("echo 'all rsyncs done' | mail -s 'parsyncfp on host [$host] completed' $EMAIL");
}

# remind user how much storage the cache takes and to clear the cache files
my $du_cache = `du -sh $parsync_dir`;
chomp $du_cache;
print "WARN: The parsync cache dir [$parsync_dir] takes up [$du_cache]\nDon't forget to delete it, but wait until you are sure that your job\ncompleted correctly, so that you can re-use it if necessary.\n";
exit;

# ================= subroutines =================

# fpart_is_running() -> 1 if the fpart forked by this run still exists, else 0.
# Always 0 when the cache is being re-used (no fpart was started).
sub fpart_is_running {
    return 0 if $REUSECACHE;
    return 0 if !defined $FPART_PID || $FPART_PID !~ /^\d+$/;
    return kill( 0, $FPART_PID ) ? 1 : 0;
}

# count_chunkfiles() -> number of files named "<stem>.<digits>" in the cache
# dir, where <stem> is the last path element of $FP_ROOT.
sub count_chunkfiles {
    my ($stem) = $FP_ROOT =~ m{([^/]+)$};
    opendir( my $dh, $FP_ROOT_DIR ) or return 0;
    my $count = grep { /^\Q$stem\E\.\d+$/ } readdir $dh;
    closedir $dh;
    return $count;
}

# pid_list($string) -> the whitespace-separated all-digit words in $string.
sub pid_list {
    my $str = shift;
    return () if !defined $str;
    return grep { /^\d+$/ } split( ' ', $str );
}

# launch_rsync($chunkfile, $index): clean up the chunk file, then start one
# background rsync on it and append its PID to $PIDFILE.
# Sets the globals $logfile and $RSYNC_CMD.
sub launch_rsync {
    my ( $chunkfile, $index ) = @_;
    fixfilenames( $chunkfile, $ROOTDIR );

    # there will be as many logfiles as fp chunkfiles, ie LOTS, but they can
    # be deleted after the run has been verified.
    $logfile   = $parsync_dir . '/' . "rsync-logfile-" . $DATE . "_" . $index;
    $RSYNC_CMD = "rsync --bwlimit=$MAXBW -a --log-file=$logfile $RSYNCOPTS --files-from=$chunkfile $ROOTDIR $TARGET & echo \"\${!}\" >> $PIDFILE";
    if ($DEBUG) { print "\nDEBUG: Starting [$RSYNC_CMD]\n"; }
    system("$RSYNC_CMD");    # launch rsync and capture the bg job PID to PIDfile
    return;
}

# Check that the utilities required to run this version of parsync are on the
# PATH; dies naming the first one that isn't.
sub check_utils {
    my %UTILS = (    # required utils to help this run correctly
        "ethtool"  => "",
        "iwconfig" => "",
        "ifstat"   => "",
        "stats"    => "",
        "fpart"    => "",
        "scut"     => "",
    );

    # and check that they can be found..
    foreach my $util ( keys %UTILS ) {
        my $utilpath = `which $util | tr -d '\n'`;
        if ( $utilpath !~ /$util/ ) {
            # NOTE(review): the download URL after "here:" was stripped from
            # this message; restore it from upstream.
            print "!!WARN: [$util] not found. you can find 'stats', scut, and 'fpart' here: and the rest via yum, apt-get, or google.\n";
            die "\n\nFATAL: [$util] isn't on your PATH [$PATH]; Please install it or correct your PATH variable to include it.\nTry ''module load perl'' or use cpan to install it.\n\n";
        }
        else {
            $UTILS{$util} = $utilpath;
            if ($DEBUG) { print "\tEVAL: Found [$util] at [$utilpath].\n" }
        }
    }
    return;
}

# usage: ($rPIDs, $crr) = get_rPIDs($PIDFILE, $sPIDs);
#   $PIDFILE - name of the file holding one PID per line for every rsync
#              this run has started
#   $sPIDs   - space-separated string of PIDs this run has suspended
# Returns a space-separated string of the PIDs that are in the PIDFILE, are
# listed by the system as rsync processes and are not suspended, plus the
# number of PIDs in that string.
# Dies if the PIDFILE cannot be opened.
sub get_rPIDs {
    my ( $pidfile, $spids ) = @_;

    # Other rsyncs may be running on the system; only those whose PIDs are in
    # the PIDFILE are ours.
    my $all_sys_rsync_pids = `ps aux | grep rsyn[c] | scut -f=1 | sort -g | tr '\n' ' '`;
    my @sys_pids           = pid_list($all_sys_rsync_pids);

    open( my $pfh, '<', $pidfile ) or die "\nFATAL: Can't open PIDFILE [$pidfile]'.\n";
    my %is_mine;
    while ( my $line = <$pfh> ) {
        chomp $line;
        $is_mine{$line} = 1;
    }
    close $pfh;

    my %is_suspended = map { $_ => 1 } pid_list($spids);
    my @running      = grep { $is_mine{$_} && !$is_suspended{$_} } @sys_pids;

    # The count is the true number of entries (it used to be one short).
    return ( join( " ", @running ), scalar @running );
}

# Block until the user presses ENTER.
sub pause {
    print "Press [ENTER] to continue.\n";
    my $tmp = <STDIN>;
    return;
}

# call as [debug(__LINE__, "string")] to print line # and debug string to
# STDERR, then wait for the user.
sub debug {
    my ( $line, $msg ) = @_;
    print STDERR "DEBUG[$line]: $msg\n";
    pause();
    return;
}

# fixfilenames($chunkfile, $startdir): rewrite $chunkfile in place so that
# every name in it is relative to $startdir, which is what rsync's
# --files-from needs given $startdir as the source.
# Dies if either file cannot be opened or the rewritten file cannot be moved
# into place.
sub fixfilenames {
    my $FN       = shift;
    my $startdir = shift;
    $startdir .= '/';
    my $fpnew = $FN . ".new";
    open( my $in,  '<', $FN )    or die "ERROR: Can't open fp file [$FN]\n.";
    open( my $out, '>', $fpnew ) or die "ERROR: Can't open replacement file [$fpnew]\n.";
    while ( my $name = <$in> ) {
        chomp $name;

        # Spaces are left alone: the old 's/ /\ /g' replaced a space with a
        # space, and --files-from reads names literally anyway.
        # Strip the startdir only as a literal leading prefix; the old
        # unanchored, unquoted match could eat text in the middle of a name.
        $name =~ s/^\Q$startdir\E//;
        print {$out} "$name\n";
    }
    close $in;
    close $out or die "ERROR: Can't finish writing [$fpnew]: $!\n";
    rename $fpnew, $FN or die "ERROR: Can't rename [$fpnew] to [$FN]: $!\n";
    return;
}

# ptgmk converts values suffixed with [PpTtGgMmKk] to bytes, using
# 1024 bytes/kb as oppo to 1000.  Dies if the last character of the trimmed
# input is not one of those letters.
sub ptgmk {
    my $instr = shift;
    $instr =~ s/^\s+|\s+$//g;    # trim spaces from back and front
    my $abbr = chop $instr;
    if ( $abbr !~ /[PpTtGgMmKk]/ ) {
        die "\n\nFATAL: tgmk() input doesn't contain [TtGgMmKk], so nothing to convert.\n\n";
    }

    # exact powers of 1024 (the T and P factors used to be rounded floats)
    my %power_of = ( K => 1, M => 2, G => 3, T => 4, P => 5 );
    return $instr * 1024**$power_of{ uc $abbr };
}

# Write the help text to a temp file, page it with less, then die with a
# parting message (so the script never continues past a usage() call).
sub usage {
    my $helpfile = "$HOME/.parsync/parsync-help.tmp";
    if ( !-d "$HOME/.parsync" ) { mkdir "$HOME/.parsync"; }
    open( my $hlp, '>', $helpfile ) or die "Can't open the temp help file [$helpfile]\n";

    # usage() can run before $parsync_dir has been set
    my $cache_dir = defined $parsync_dir ? $parsync_dir : "$HOME/.parsync";

    # NOTE(review): the opening of this help text was lost; only its tail from
    # "to create chunkfiles" survived.  The words "parsyncfp uses 'fpart'" are
    # a reconstruction - restore the original introduction from upstream.
    my $helptxt = <<"HELP";
parsyncfp uses 'fpart' to create chunkfiles for rsync to read, bypassing the
need to wait for a complete recursive scan.

It appropriates rsync's bandwidth throttle mechanism, using '--maxbw' as a
passthru to rsync's 'bwlimit' option, but divides it by NP so as to keep the
total bw the same as the stated limit.  It monitors and shows network
bandwidth, but can't change the bw allocation mid-job.  It can only suspend
rsyncs until the load decreases below the cutoff.  If you suspend parsync
(^Z), all rsync children will suspend as well, regardless of current state.

Unless changed by '--interface', it assumes and monitors eth0.  The transfer
will use whatever interface normal routing provides, normally set by the name
of the target.  It can also be used for non-host-based transfers (between
mounted filesystems) but the network bandwidth continues to be (pointlessly)
shown.

[[NB: Between mounted filesystems, parsync sometimes works very poorly for
reasons still mysterious.  In such cases (monitor with 'ifstat'), use 'cp'
for the initial data movement and a single rsync to finalize.  I believe the
multiple rsync chatter is interfering with the transfer.]]

It only works on dirs and files that originate from the current dir (or
specified via "--startdir").  You cannot include dirs and files from
discontinuous or higher-level dirs.

** the [$cache_dir] files **
The [$cache_dir] dir contains the cache dir (fpcache), and the time-stamped
log files.  The cache files can be re-used with '--reusecache' (which will
re-use ALL the chunk files.  The log files are datestamped and are not NOT
overwritten.

** Odd characters in names **
parsyncfp will refuse to transfer some oddly named files (tho it should copy
filenames with spaces fine.  Filenames with embedded newlines, DOS EOLs, and
some other odd chars will be recorded in the log files in the [$cache_dir]
dir.

OPTIONS
=======
[i] = integer number
[f] = floating point number
[s] = "quoted string"
( ) = the default if any

--NP [i] (sqrt(#CPUs)) ................ number of rsync processes to start
      optimal NP depends on many vars.  Try the default and incr as needed
--startdir [s] (`pwd`) .................... the directory it starts at(*)
--maxbw [i] (unlimited) .......... in KB/s max bandwidth to use (--bwlimit
      passthru to rsync).  maxbw is the total BW to be used, NOT per rsync.
--maxload [f] (NP+2) ............. max system load - if sysload > maxload,
      sleeps an rsync proc for 10s
--chunksize [s] (10G) ... aggregate size of the files allocated to one rsync
      process.  Can specify in 'human' terms [100M, 50K, 1T] as well as
      integer bytes.
--rsyncopts [s] .... options passed to rsync as a quoted string (CAREFUL!)
      this opt triggers a pause before executing to verify the command(+)
--interface [s] ........ network interface to monitor (not use; see above)
--reusecache .......... don't re-read the dirs; re-use the existing caches
--email [s] ..................... email address to send completion message
--nowait ............. for scripting, sleep for a few s instead of pausing
--version ................................. dumps version string and exits
--help ......................................................... this help

(*) you can use globs/regexes with --startdir, but only if you're at that
    point in the dir tree.  ie: if you're not in the dir where the globs can
    be expanded, then the glob will fail.  However, explicit dirs can be set
    from anywhere if given an existing startdir.

(+) the '--rsyncopts' string can pass any rsync option to all the rsyncs that
    wil be started.  This allows options like '--exclude-from' to filter out
    unwanted files.

Examples
========
(Good example)
% parsync --maxload=5.5 --NP=4 --startdir='/home/hjm' dir[123] \\
  hjm\@remotehost:~/backups

where
  = "--startdir='/home/hjm'" sets the working dir of this operation to
    '/home/hjm' and dir1 dir2 dir3 are subdirs from '/home/hjm'
  = the target "hjm\@remotehost:~/backups" is the same target rsync would use
  = "--NP=4" forks 4 instances of rsync
  = "--maxload=5.5" will start suspending rsync instances when the 1m system
    load gets to 5.5 and then unsuspending them when it goes below it.

It uses 4 instances to rsync dir1 dir2 dir3 to hjm\@remotehost:~/backups

(Good example)
% parsync --reusecache --NP=3 --barefiles *.txt /mount/backups/txt

where
  = "--reusecache" indicates that the filecache shouldn't be re-generated,
    uses the previous filecache in ~/.parsync
  = "--NP=3" for 3 copies of rsync (with no "--maxload", the default is 4)
  = "--barefiles" indicates that it's OK to transfer barefiles instead of
    recursing thru dirs.
  = "/mount/backups/txt" is the target - a local disk mount instead of a
    network host.

It uses 3 instances to rsync *.txt from the current dir to
"/mount/backups/txt".

(Good example)
parsyncfp --checkperiod 6 --NP 3 --interface eth0 --chunksize=87682352 \\
  --rsyncopts="--exclude='[abc]*'" nacs/fabio hjm\@moo:~/backups

The above command shows several options used correctly:

--chunksize=87682352 - shows that the chunksize option can be used with
    explicit integers as well as the human specifiers (TGMK).
--rsyncopts="--exclude='[abc]*'" - shows the correct form for excluding files
    based on regexes (note the quoting)
nacs/fabio - shows that you can specify subdirs as well as top-level dirs (as
    long as the shell is positioned in the dir above, or has been specified
    via '--startdir'

(Error Example)
% pwd
/home/hjm   # executing parsync from here

% parsync --NP4 /usr/local /media/backupdisk

why this is an error:
  = '--NP4' is not an option (parsync will say "Unknown option: np4"
    It should be '--NP=4' or '--NP 4'
  = if you were trying to rsync '/usr/local' to '/media/backupdisk', it will
    fail since there is no /home/hjm/usr/local dir to use as a source.  This
    will be shown in the log files in ~/.parsync/rsync-logfile-[date]_# as a
    spew of "No such file or directory (2)" errors

The correct version of the above command is:

% parsync --NP=4 --startdir=/usr local /media/backupdisk

HELP

    print {$hlp} $helptxt;
    close $hlp;
    system("less -S $helpfile");
    unlink $helpfile;
    die "Did that help?.\n";
}