#!/usr/bin/env perl
use strict;
use Getopt::Long;    # for std option handling: -h --yadda=badda, etc
use Socket;
use Env qw(HOME PATH);
use File::Path qw(remove_tree make_path);
use Term::ANSIColor; # for alarms

# after significant changes, update the tarball and cp to moo for distribution; update the github
# fn="/home/hjm/bin/parsyncfp"; cd ; cp $fn ~/parsyncfp/; tar -cvzf parsyncfp+utils.tar.gz parsyncfp; scp parsyncfp+utils.tar.gz moo:~/public_html/parsync ;
# copy to all the local hosts
# scp $fn moo:~/public_html; scp $fn moo:~/bin; scp $fn dabrick:~/bin; ssh -t moo 'scp bin/parsyncfp hmangala@hpcs:/data/hpc/bin';
# don't forget!! using git! Add changes to changelog in the README.md file
# cd ~/gits/parsyncfp; cp ~/bin/parsyncfp .; git add parsyncfp README.md; git commit -m 'commit message'; git push
# check github for bug reports.

# TODO
# - integrate pmj into pfp? use something like [xterm -e "cd /path/to/pmj/dir; pmj shell start file; wait"
#   - --pmj=/path/to/pmj dir
#   - starts an xterm and sends output there, opens the gnuplot window
# - [x] changed the calc for determining TCP network bandwidth to reference /proc/net/dev which should
#   be more reliable across distros and maybe even OSs. However, this won't detect RDMA data. For that,
#   need perfquery.
# - [x] added RDMA support (if the interface =~ ib, then it will try to use perfquery to measure the RDMA
#   bandwidth)
# - [x] check the sequencing for the use of the alt-cache option to make sure that things are being
#   deleted or not in the right sequence.
# - [x] write funcs to color different outputs different colors based on what they are -
#   blue for INFO, orange for WARNINGs, red for ERRORs.
# - check that fpart can generate at least the # of chunks that are > NP (as below)
# - port to MacOSX using hackintosh
# - [x] done: fix bandwidth calculation subroutine.
# - [x] done: check for fpart before running.
# - [x] done: test for '-d' or '--delete' in the rsyncopts line and refer to problem with this.
# - test for # of chunk files generated. emit warnings if it goes above 2000 (advise to choose
#   a larger chunksize); or if less than NP. Don't assume a large # or even the same # as the NP #.
# - [x] done: decouple the cycle time from the job start time. ie, keep monitoring the exit codes
#   and launch the next rsync immediately, don't wait for the checkperiod cycle, since that could be
#   quite long
# - handle immediate, top-level dirs

use vars qw(
$allPIDs $ALL_SYS_RSYNC_PIDS $ch $CHECKPERIOD $cmd $crr $CUR_FP_FLE $CUR_FPI $DATE
$dcnt $DEBUG @DIRS @DIRS2SYNC $dirtmp $EMAIL $Filecnt %FILES $fl $fn $fnd2r $FOUT
$FPART_LOGFILE $FPART_PID $FPART_RUNNING $FPARTSIZE $FPARTSIZE_N $FP_PIDFILE $FP_ROOT
$cyclecnt $FP_ROOT_DIR $FP_RUNNING $hdr_cnt $hdr_rpt $HELP $IF_SPEED $VERBOSE
$LOAD1mratio $loadavg $logfile $MAXBW $MAXLOAD $nbr_cur_fpc_fles $NBR_FP_FLES $NCPUs
$NDIRS $NETIF $NOWAIT $NP $NP_chunk $glob $ALTCACHE $parsync_dir $PARSYNCVER $PIDFILE
$prev_cache $lenPID $DISPOSE $rem_host $remote $rem_path $rem_user $rootdir $rPIDs
$sPIDs $ROOTDIR $RSYNC_CMD $RSYNCOPTS $RSYNCS_GOING $STILLRSYNCS $DFLT_RSYNCOPTS
@SYSLOAD $TARGET $tmp $Totlsiz %UTILS $VERSION $OS $Linux $MacOSX $NETFILE $myIP
$PERFQUERY $avgTCPrecv $avgTCPsend $avgRDMArecv $avgRDMAsend
);

$PARSYNCVER = << "VERSION";
parsyncfp version 1.56
Dec 12th, 2018
by Harry Mangalam

parsyncfp is a Perl script that wraps Andrew Tridgell's miraculous 'rsync' to
provide some load balancing and parallel operation across network connections
to increase the amount of bandwidth it can use.
The 'fp' variant uses 'fpart' to bypass the need for a full recursive descent of the dir trees before the actual transfer starts. Do NOT try to use rsync --delete options'. More help with '--help' VERSION if (! @ARGV) {usage();} # in case someone doesn't know what to do. &GetOptions( "startdir|sd=s" => \$ROOTDIR, # Have to be able to set rootdir -> SRC in rsync "altcache|ac=s" => \$ALTCACHE, # alternative cache instead of ~/.parsyncfp "rsyncopts|ro=s" => \$RSYNCOPTS, # passthru to rsync as a string "NP|np=i" => \$NP, # number of rsync processes to start "chunksize|cs=s" => \$FPARTSIZE, # the size that fpart chunks (allow PpTtGgMmKk) "checkperiod|cp=i" => \$CHECKPERIOD, # # of sec between system load checks "maxbw=i" => \$MAXBW, # max bw to use (--bwlimit=KBPS passthru to rsync) "maxload|ml=f" => \$MAXLOAD, # max system load - if > this, sleep rsyncs "email=s" => \$EMAIL, # email to notify when finished "interface|i=s" => \$NETIF, # network interface to use if multiple ones "verbose|v=i" => \$VERBOSE, # how chatty it should be. "nowait|nw!" => \$NOWAIT, # sleep a few s rather than wait for a user ack "help!" => \$HELP, # dump usage, tips "version!" => \$VERSION, # duh.. "dispose|d=s" => \$DISPOSE, # what to do with the cache (compress, delete, leave untouched) "debug|d!" => \$DEBUG, # developer-level info; (historical) alias for '-v 3' ); ## Set up run-permanent variables. $DATE=`date +"%T_%F" | sed 's/:/./g' `; chomp $DATE; if (! defined $ALTCACHE) {$parsync_dir = $HOME . "/.parsyncfp";} else {$parsync_dir = $ALTCACHE; } #mkdir $parsync_dir; !!! $NETFILE = "/proc/net/dev"; $OS = `uname -s`; chomp $OS; $Linux = $MacOSX = 0; if ($OS =~ /Linux/) {$Linux = 1;} else {$MacOSX = 1;} $DFLT_RSYNCOPTS = "-a -s"; # the default options to pass to rsync; blanked if define $RSYNCOPTS if (defined $VERSION) { print colored(['green'], $PARSYNCVER, "\n"); exit;} if (!defined $CHECKPERIOD) {$CHECKPERIOD = 3;} if (!defined $VERBOSE) {$VERBOSE = 2;} $PERFQUERY = 0; my $fpcheck = `which fpart`; if ($fpcheck eq "") {FATAL("There's no 'fpart' executable on your PATH. Did you install it? See: https://github.com/martymac/fpart/blob/master/README");} if (!defined $RSYNCOPTS) {$RSYNCOPTS = ""; $DFLT_RSYNCOPTS = "-a -s";} else { # if def $RSYNCOPTS, then user takes all responsibility $DFLT_RSYNCOPTS = ""; if ($RSYNCOPTS =~ / -d / || $RSYNCOPTS =~ / --del/){ # user tries to pass in a 'delete' option WARN("It looks like you're trying to pass in a '--delete' option in the '--rsyncopts' string. [$RSYNCOPTS] Because parallel rsyncs don't know what the other rsyncs are doing, 'delete' options don't work well. If this is what you want to do, omit that option here and follow the parsyncfp command with a regular 'rsync --delete' command. It will be slower than a parallel operation but since most of the action will be remote deletes, it should be fairly fast. If the operation is to be performed on locally mounted filesystems (not to remote nodes), I'd strongly recommend the 'fpsync' tool, which you should have already received as part of the 'fpart' package necessary to run parsyncfp. 'fpsync' DOES provide support for a parallel '--delete', and the author provides a good explanation as to how he does this here: . HOWEVER!! Anytime you use '--delete' in an rsync operation, MAKE SURE you know what you're doing. 
"); exit(0); } } #if (defined $HELP || @ARGV == 0) { usage(); } if (defined $HELP) {usage($parsync_dir);} if (!defined $DISPOSE) {$DISPOSE = 'l';} # for leave untouched # check_utils(); # check that the required utilities are on the system ### get the current system stats: #CPUs, load, bandwidth, etc if ($Linux) { $NCPUs = `cat /proc/cpuinfo | grep processor | wc -l`; chomp $NCPUs; $loadavg = `cat /proc/loadavg | tr -d '\n'`; my $pid_max = `cat /proc/sys/kernel/pid_max`; $lenPID = length $pid_max; # usually 5 but can go as high as 7 } elsif ($MacOSX) { $NCPUs = `sysctl -n hw.ncpu`; chomp $NCPUs; $loadavg = `sysctl -n vm.loadavg | cut -d" " -f2 -f3 -f4 | tr -d '\n'`; $lenPID = 5; # highest possible pid is 99998. } else { FATAL("parsyncfp only supports Linux and MacOSX at this point\n"); } @SYSLOAD = split (/\s+/, $loadavg); # 1st 3 fields are 1, 5, 15m loads # so as long as the 1m load / NCPUs < 1, we're fine; if > 1, we may want to start throttling.. $LOAD1mratio = $SYSLOAD[0] / $NCPUs; if (! defined $NETIF) { if ($MacOSX) { $NETIF = `netstat -nr | grep "^default" | head -n1 | awk '{print \$6}'`; chomp $NETIF; $myIP = `ifconfig $NETIF | grep 'inet ' | awk '{print \$2}'`; chomp $myIP; } else { #TODO This has to be checked for multi-homed systems and if the system is multihomed, # force a choice as to which one to use via --interface my $ifs = `/sbin/route -n | grep "^0.0.0.0" | awk '{print \$8}' | wc -l`; chomp $ifs; $NETIF = `/sbin/route -n | grep "^0.0.0.0" | awk '{print \$8}'`; chomp $NETIF; if ($ifs != '1'){ die "\nERROR: Your system is multi-homed - I've detected more than 1 active interface: $NETIF \nPlease specify the one you want to use via the '--interface' flag.\n"; } else { $NETIF = `/sbin/route -n | grep "^0.0.0.0" | awk '{print \$8}'`; chomp $NETIF; $myIP = `ifconfig $NETIF | grep 'inet ' | awk '{print \$2}' | cut -d: -f2`; chomp $myIP; } } } my $pqpath = `which perfquery`; if ($NETIF =~ /ib/){ INFO("You've specified what looks like an Infiniband interface [$NETIF]...\n"); if ($pqpath ne "") { $PERFQUERY = 1; INFO(".. and you have 'perfquery installed, so RDMA bytes will be reported as well.\n"); } else { $PERFQUERY = 0; INFO(".. but you don't have 'perfquery' installed, so only TCP bytes will be reported.\n"); } } if (defined $DEBUG) {$VERBOSE = 3;} # DEBUG = VERBOSE=3 if (defined $VERBOSE && ($VERBOSE < 0 || $VERBOSE > 3)) {die "ERROR: --verbose arg must be 0-3. Try again.\n";} if (! defined $NP){$NP = int(sqrt($NCPUs)+ 0.5);} # round sqrt(NCPUs) (hyperthreaded if Intel) 8 -> 3 if (! defined $MAXBW) {$MAXBW = 1000000;} # essentially unlimited else {$MAXBW = int($MAXBW / $NP + 0.5);} # users expect total maxbw; so have to divide by NP. if (! defined $MAXLOAD) {$MAXLOAD = $NP + 2 ;} # + 1 for IO load if (! defined $ROOTDIR) {$ROOTDIR = `pwd`; chomp $ROOTDIR;} # where all dirs must be rooted. if (! defined $FPARTSIZE) {$FPARTSIZE = "10G"; $FPARTSIZE_N = 104857600;} # default is 10Gish elsif ($FPARTSIZE =~ /[PpTtGgMmKk]/) {$FPARTSIZE_N = ptgmk($FPARTSIZE); } else {$FPARTSIZE_N = $FPARTSIZE;} if ($DEBUG) {&debug(__LINE__, "FPARTSIZE = $FPARTSIZE\nFPARTSIZE_N = $FPARTSIZE_N");} # fix .ssh/config file to eliminate wonky errors. fix_ssh_config(); # ?? Is this nec anymore? 
If so, need to bring it up to date with the new naming conventions # see: https://goo.gl/kDLr8b # get some network info if ($NETIF =~ /eth/) { $IF_SPEED = `ethtool eth0 2> /dev/null | grep Speed | cut -f2 -d:`;} elsif ($NETIF =~ /wlan/) { $IF_SPEED = `iwconfig wlan0 | grep -i quality`; } elsif ($NETIF =~ /ib/) { $IF_SPEED = `ibstat | grep Rate | head -1 | sed -e 's/^[ \t]*//'`; $IF_SPEED = "IB:" . $IF_SPEED; } chomp $IF_SPEED; if ($DEBUG){ print "\tDEBUG: Using network interface [$NETIF] with connection quality [$IF_SPEED]\n\n";} if ($SYSLOAD[0] < $MAXLOAD){ if ($DEBUG){ print "\n\tDEBUG: 1m load is [$SYSLOAD[0]] and the 1m Load:#CPU ratio is [$LOAD1mratio] ( [$NCPUs] CPU cores). OK to continue.\n " } } else { WARN("1m System load is > [$SYSLOAD[0]]. The 1m Load:#CPU ratio is [$LOAD1mratio]. Continue? [Cntrl+C to interrupt; Enter to continue]"); pause(); } $FP_ROOT_DIR = "${parsync_dir}/fpcache"; if (-d $parsync_dir) { if ($VERBOSE >= 1) {WARN("About to remove all the old cached chunkfiles from [$FP_ROOT_DIR]. Enter ^C to stop this. If you specified '--nowait', cache will be cleared in 3s regardless. Otherwise, hit [Enter] and I'll clear them."); } $glob = "${FP_ROOT_DIR}/f*"; if ($NOWAIT){ sleep 3;} elsif ($VERBOSE > 0) {pause();} system("rm -f $glob"); if ($VERBOSE >=2 ) { INFO("The fpart chunk files [$glob] are cleared .. continuing.\n") } } elsif (! -d $parsync_dir) { make_path $parsync_dir or FATAL("Can't create [ $parsync_dir ]"); } if (! -d $FP_ROOT_DIR) {mkdir $FP_ROOT_DIR or FATAL("Can't make 'FP_ROOT_DIR' [$FP_ROOT_DIR]");} $FP_ROOT = $FP_ROOT_DIR . "/f"; # the root name of the fpart chunk files f.1, etc $PIDFILE = $FP_ROOT_DIR . '/' . "rsync-PIDs" . '-' . $DATE; $FPART_LOGFILE = $FP_ROOT_DIR . '/' . "fpart.log." . $DATE; $FP_PIDFILE = $FP_ROOT_DIR . '/' . "FP_PIDFILE" . $DATE; $hdr_rpt = 20; # nbr of lines to repeat the header $hdr_cnt = 21; # header counter; > $hdr_rpt so it gets printed 1st time # this takes care of the last ARGV so that all the rest of the words are target dirs&files $TARGET = $ARGV[$#ARGV]; # remote rsync target if (!defined $TARGET ){ FATAL("No target defined! Where you gonna put this stuff??!?\nTry $0 --help for the built-in help."); } $#ARGV--; if ($TARGET =~ /~/) { FATAL("You defined the target dir with a '~': [$TARGET]. While this SHOULD work, it often doesn't so I'm going to force you to replace it with an explicit remote path. ie instead of using '~/dir', please use '/home//dir. Sorry."); } # now process the dirs $dcnt = 0; $fnd2r = ""; # zero the list of 'files 'n' dirs to rsync' $dirtmp = shift; # should only be dir/files left once getopt finishes (see above) # If there are no files or dirs defined, take the current dir if (!defined $dirtmp) { $dirtmp = `pwd`;} while (defined $dirtmp) { # should work on explicitly named dirs as well as globs. $dirtmp = $ROOTDIR . '/' . $dirtmp; if (! -r $dirtmp){ # quick check to see if its readable. WARN("[$dirtmp] isn't readable; either it's not where you think it is or you need to escalate your privs. Regardless, it won't be transferred in this run."); if ($NOWAIT){ sleep 3;} elsif ($VERBOSE > 0) {pause();} } else { # otherwise, add the file to list to be chunked and transferred. $fnd2r .= $dirtmp . " "; } $dirtmp = shift; } $#ARGV++; # now incr to allow the TARGET to be captured. my @cachefiles = (); # will populate with list of cachefiles to process together. 
my $fparts_already_running = `ps aux | grep 'fpar[t]'`; chomp $fparts_already_running; if ($fparts_already_running ne ''){ WARN("One or more 'fpart's are already running: ====== [$fparts_already_running] ====== Unless you know that these fparts are valid (ie you're running another parsyncfp in another shell on this machine) and not left over from previous parsyncfp's, you should ^C and kill them off before restarting this run. Pausing for 5s to allow you to read this and take action (or not). If you do nothing, I'll continue. "); sleep 5; } my $x = 0; $fnd2r =~ s/^\s+|\s+$//g ; # trim leading and trailing # Up to the user to escape internal spaces in the names of target dirs. # keeping the following line here as a reminder to think about how to better # address this problem. #$fnd2r =~ s/ /\\ /g; # subs internal spaces with escaped spaces my $cmd = "fpart -v -L -z -s $FPARTSIZE_N -o $FP_ROOT $fnd2r 2> $FPART_LOGFILE & echo \"\${!}\" > $FP_PIDFILE"; # captures the child PID! if ($DEBUG) {print "DEBUG: fpart fork cmd:\n[$cmd]\n";} sleep 5; if ($FPART_PID = fork) { # this actually takes a couple of seconds if ($VERBOSE >= 2) { INFO("Forking fpart. Check [$FPART_LOGFILE] for errors if it hangs.\n"); } } else { system "$cmd"; $FPART_PID = `cat $FP_PIDFILE`; chomp $FPART_PID; exit(0); # it's forked, now exit this stanza } # fpart has been forked; wait for enough chunkfiles to be written to start the rsyncs while (! -e $FP_PIDFILE) { sleep 1; if ($VERBOSE >= 3) {INFO("Waiting for fpart to be forked..\n");} } $FPART_PID = `cat $FP_PIDFILE`; chomp $FPART_PID; my $ready2start = my $waitcnt = $NBR_FP_FLES = 0; my $fp0 = $FP_ROOT . ".0"; my $fp1 = $FP_ROOT . ".1"; my $done = 0; while ($ready2start == 0) { if (-e $fp0) { if ($VERBOSE >= 3) {INFO("[$fp0] visible.\n");} $NBR_FP_FLES++; $ready2start=1; } $waitcnt++; if ($VERBOSE >= 3) {INFO("Waiting [$waitcnt]s for chunk files to be written\r");} sleep 1; } # start up NP rsyncs 1st, then cycle every CHECKPERIOD, checking # of rsyncs still going and # starting new ones as needed until the chunkfiles are exhausted. my $STILL_FP_CHUNKS = my $KEEPGOING = 1; my $FPCFS = "${FP_ROOT}."; # FP Chunk File Stem my $NBR_FP_FLES = `\\ls -U1 ${FPCFS}* | wc -l`; chomp $NBR_FP_FLES; $RSYNCS_GOING = $CUR_FPI = 0; # $CUR_FPI = current FP index if ($VERBOSE >= 2) {INFO("Starting the 1st [$NP] rsyncs ..\n");} my $sc = 0; while ($RSYNCS_GOING < $NP && $KEEPGOING) { # $CUR_FP_FLE = $FP_ROOT . "." . $CUR_FPI ; # the current fp chunkfile if (-e $CUR_FP_FLE) { # if the current chunkfile exists fixfilenames($CUR_FP_FLE, $ROOTDIR); # check & fix for spaces, bad chars. # entire rsync command and PID capture (used in total of 2 places) $logfile = $parsync_dir . '/' ."rsync-logfile-" . $DATE . "_" . $CUR_FPI; $RSYNC_CMD = "rsync --bwlimit=$MAXBW $RSYNCOPTS -a -s --log-file=$logfile --files-from=$CUR_FP_FLE $ROOTDIR $TARGET & echo \"\${!}\" >> $PIDFILE"; # there will be as many logfiles as fp chunkfiles. # ie LOTS. but they can be deleted after the run has been verified.. # TODO don't know if we need this logfile. if ($DEBUG) {&debug(__LINE__, "Complete rsync cmd = [$RSYNC_CMD]");} system("$RSYNC_CMD"); # launch rsync and capture the bg job PID to PIDfile $CUR_FPI++; $RSYNCS_GOING++; } else { # there aren't any more fp chunk files waiting, so check to see if it's finished. $FPART_RUNNING = `ps aux | grep fpar[t] | grep $FPART_PID | wc -l`; chomp $FPART_RUNNING; if ($FPART_RUNNING eq '0') { # so if it's done, then we're done. No more chunk files, so no more rsyncs to start. 
$KEEPGOING = 0; # signal the while loop to break.
      } else { # fpart is still going so wait for the next fpart chunkfile to be finished.
        if ($VERBOSE >= 2) {INFO("waiting [$sc]s for next chunkfile [$CUR_FP_FLE]\r");}
        sleep 2; $sc += 2;
      }
   }
} #while ($RSYNCS_GOING < $NP && $KEEPGOING)

# so at this point either we've loaded all the rsyncs up to NP or we've completely finished.
# If the latter, say good bye. If the former, then we have to keep launching
# rsyncs up to NP until we've used up all the fpart chunkfiles.

$sPIDs = ""; # running PIDs launched by parsync, suspended PIDs (strings)
$NBR_FP_FLES = `\\ls -U1 $FPCFS* | wc -l`; chomp $NBR_FP_FLES; # get current # of chunks
my @aprPIDs; # all recorded parsyncfp rsync PIDs ever started
my @crrPIDs; # currently RUNNING parsyncfp rsync PIDs.
my @csrPIDs; # currently SUSPENDED parsyncfp rsync PIDs.

### FOLLOWING IS THE MAIN PARSYNC-FPART LOOP
$FP_RUNNING = `ps aux | grep $FPART_PID | grep fpar[t] | wc -l`; chomp $FP_RUNNING;
$cyclecnt = 0;
my $IFN = sprintf("%7s",$NETIF);
my $day =`date +"%F"`; chomp $day;
                     # |    TCP / RDMA out    |
if ($VERBOSE == 1) { # ..............|---------- / ---------|
  print " | Elapsed | 1m | [$IFN] MB/s | Running || Susp'd | Chunks [$day]\n   Time   | time(m) | Load | TCP / RDMA out | PIDs || PIDs | [UpTo] of [ToDo]\n";
}
my $start_secs = `date +"%s"`;

while ($CUR_FPI < $NBR_FP_FLES || $FP_RUNNING || $STILLRSYNCS ) {
  $rPIDs = "";
  # print the header
  if ($hdr_cnt > $hdr_rpt) {
    my $glob = "${FP_ROOT}.*";
    $hdr_cnt = 0;
    $nbr_cur_fpc_fles = `\\ls -U1 $glob | wc -l`; chomp $nbr_cur_fpc_fles;
    $day =`date +"%F"`; chomp $day;
    if ($VERBOSE > 1) {print " | Elapsed | 1m | [$IFN] MB/s | Running || Susp'd | Chunks [$day]\n   Time   | time(m) | Load | TCP / RDMA out | PIDs || PIDs | [UpTo] of [ToDo]\n";}
  }
  ($rPIDs, $crr) = get_rPIDs($PIDFILE, $sPIDs);

  # now get load, bw, etc, and start rsyncs on new chunkfiles or suspend them to
  # load-balance
  $loadavg = `cat /proc/loadavg | tr -d '\n'`; # What's the system load?
  @SYSLOAD = split (/\s+/, $loadavg); # 1st 3 fields are 1, 5, 15m loads
  $LOAD1mratio = $SYSLOAD[0] / $NCPUs;

  # print out current data with the date
  $rPIDs =~ s/^\s+|\s+$//g ; $sPIDs =~ s/^\s+|\s+$//g ; # trim leading & trailing whitespace
  my $NrPIDs = my @Lr = split(/\s+/, $rPIDs);
  my $NsPIDs = my @Ls = split(/\s+/, $sPIDs);
  my $glob = "${FP_ROOT}.*";
  $nbr_cur_fpc_fles = `\\ls -U1 $glob | wc -l`; chomp $nbr_cur_fpc_fles;

  # if fpart is done ($FPART_RUNNING = "")
  #   $FPART_RUNNING = `ps aux | grep fpar[t] | grep $FPART_PID | wc -l`; chomp $FPART_RUNNING;
  # AND $CUR_FPI >= $nbr_cur_fpc_fles
  # AND there aren't any $rPIDs AND there aren't any $sPIDs
  # then I think we're done.

  # check fpart to see if it's still running..
  $FPART_RUNNING = `ps aux | grep fpar[t] | grep $FPART_PID | wc -l`; chomp $FPART_RUNNING;
  if ($rPIDs eq "" ){$rPIDs = "No running PIDs; will start more next cycle"}
  my $rDATE=`date +"%T" | sed 's/:/./g' `; chomp $rDATE;

  # check cycles; print if exceeded, then reset counter.
if ($cyclecnt++ > ($CHECKPERIOD - 4)) { my $avgTCPsend; if ($Linux) { ($avgTCPrecv, $avgTCPsend, $avgRDMArecv, $avgRDMAsend) = getavgnetbw($NETIF, $CHECKPERIOD, $PERFQUERY); chomp $avgTCPsend; $avgTCPsend = ($avgTCPsend / 1048576); # convert to MB chomp $avgRDMAsend; $avgRDMAsend = ($avgRDMAsend / 262144); # convert to MB; use same divisor as rdma-tct-stats } else { my $RDMA_T1 = my $RDMA_T2 = 0; # if ($DEBUG) {print "DEBUG: netstat lines next with myIP=[$myIP]\n";} my $o1_bytes = `netstat -bi | grep $myIP | awk '{print \$10}'`; sleep $CHECKPERIOD; my $o2_bytes = `netstat -bi | grep $myIP | awk '{print \$10}'`; $avgTCPsend = ($o2_bytes - $o1_bytes) / $CHECKPERIOD / 1048576; # (1024^2) } my $cur_secs = `date +"%s"`; my $el_min = ($cur_secs - $start_secs) / 60; # this should switch from scrolling to overwrite when VERBOSE < 2 # print out the line if ($VERBOSE > 0) { printf "%8s %5.2f %5.2f %9.2f / %-9.2f %2d <> %2d [%d] of [%d]", $rDATE, $el_min, $SYSLOAD[0], $avgTCPsend ,$avgRDMAsend, $NrPIDs, $NsPIDs, $CUR_FPI, $nbr_cur_fpc_fles; } # and then over-write it or add a newline for scrolling data. if ($VERBOSE == 1) { printf "\r";} elsif ($VERBOSE >= 2) {printf "\n";} if ($DEBUG) {print "\nDEBUG: rPIDs = $rPIDs; sPIDs = $sPIDs\n";} $cyclecnt = 0; $hdr_cnt++; } ### SUSPEND OR CONTINUE RSYNCS for LOADBALANCING if ($SYSLOAD[0] > $MAXLOAD) { # suspend a PID; then loop as normal. If still high, will continue to # suspend PIDs until there's none left. if ($DEBUG) {print "\nDEBUG: System load [$SYSLOAD[0]] is > MAXLOAD [$MAXLOAD]. Will try to suspend a running rsync to shed load.\n";} # reassign a new list from ONLY RUNNING PIDs to $rPIDs (refresh $rPIDs) # this cmd picks up both suspended and running PIDs- have to remove the suspended ones. # in an efficient way. if ($rPIDs =~ /\d+/) {$rPIDs = `ps -p $rPIDs | grep -v PID| cut -c 1-5 | tr '\n' ' '`;} $rPIDs =~ s/^\s+|\s+$//g ; # trim leading and trailing # turn it into an array - (-> sub?) my $rn = my @ra = split(/\s+/, $rPIDs); my $sn = my @sa = split(/\s+/, $sPIDs); for (my $r=0; $r< $rn; $r++) { for (my $s=0; $s< $sn; $s++) { if ($ra[$r] eq $sa[$s]) {$rPIDs =~ s/$ra[$r]//g;} # delete it from $rPIDs } } # picks up both suspended and running PIDs and the new result has to have something in it as well. if ($rPIDs =~ /\d+/){ # if any still left my $N = my @raPIDs = split(/\s+/, $rPIDs); my $e = 0; # @raPIDs = temp array to carry currently running PIDs while ($e <= $N && $raPIDs[$e] !~ /\d+/){$e++}; if ($DEBUG) {print "\t\tDEBUG:got one: [$raPIDs[$e]]; will now suspend it\n";} kill 'STOP', $raPIDs[$e]; if ($sPIDs !~ /$raPIDs[$e]/) { # If it's not there already $sPIDs = "$sPIDs" . ' ' . "$raPIDs[$e]"; # transfer rPID to sPID. $rPIDs =~ s/$raPIDs[$e]//g; # only then delete that PID fr the rPID string } } else { # there aren't any more PIDs left - all done or killed off.' if ($VERBOSE >= 2) {WARN("No more running rsync PIDs left [$rPIDs]. All rsyncs are suspended [$sPIDs].");} } } elsif ($sPIDs =~ /\d+/) { # if there are sPIDs, unsuspend them one by one # split em my $N = my @saPIDs = split(/\s+/, $sPIDs); my $e = 0; while ($e <= $N && $saPIDs[$e] !~ /\d+/){$e++}; if ($DEBUG) { print "\t\tDEBUG:got one: [$saPIDs[$e]]; will now UNsuspend it\n";} kill 'CONT', $saPIDs[$e]; $rPIDs = "$rPIDs" . ' ' . "$saPIDs[$e]"; # transfer sPID to rPID. $sPIDs =~ s/$saPIDs[$e]//g; # delete that PID fr the sPID string } # end of 'SUSPEND OR CONTINUE to LOADBALANCE.' test loop # and if neither of those conditions are met, then we can launch another rsync. 
elsif ($crr < $NP) { # then launch another rsync with the next fpart chunkfile $CUR_FP_FLE = "${FP_ROOT}.${CUR_FPI}" ; # generate the next fpart chunk file with $CUR_FPI # if fpart is still going, wait for the next chunkfile to show up my $cfw = 0; $FPART_RUNNING = `ps aux | grep fpar[t] | grep $FPART_PID | wc -l`; chomp $FPART_RUNNING; while (! -e $CUR_FP_FLE && $FPART_RUNNING eq '1'){ if ($VERBOSE >= 2) {INFO("Waiting [$cfw]s for next chunkfile..\r"); sleep 2; $cfw += 2;} } ($rPIDs, $crr) = get_rPIDs($PIDFILE, $sPIDs); my $n = my @a = split(/\s+/, $rPIDs); my $R2SU = $NP - $n; # this is the number of rsyncs to start up $glob = "${FP_ROOT}.*"; my $nbr_cur_fpc_fles = `\\ls -U1 $glob | wc -l`; chomp $nbr_cur_fpc_fles; # $fparts_already_running will be '' if it's finished running. my $fparts_already_running = `ps aux | grep 'fpar[t]'`; chomp $fparts_already_running; # Check this more carefully for exceptions - this is the drop-dead error point # in some situations for ($n=0; $n<$R2SU; $n++) { # make sure we haven't finished $FPART_RUNNING = `ps aux | grep fpar[t] | grep $FPART_PID | wc -l`; chomp $FPART_RUNNING; if ($rPIDs eq "" && $sPIDs eq "" && $CUR_FPI >= $nbr_cur_fpc_fles && $FPART_RUNNING == 0){ # then we're done - exit. if ($VERBOSE >= 2) {INFO("Done. Please check the target to make sure expected files are where they're supposed to be. \n");} # remind user how much storage the cache takes and to clear the cache files my $du_cache = `du -sh $parsync_dir`; chomp $du_cache; if ($VERBOSE >= 2) {INFO(" The parsyncfp cache dir takes up [$du_cache] Don't forget to delete it, but wait until you are sure that your job completed correctly, so you don't need the log files anymore. Reminder: check the parsyncfp log [$logfile] and the fpart log [$FPART_LOGFILE] if there were errors. Use '--verbose=1' for less output. Thanks for using parsyncfp. Tell me how to make it better. \n"); } exit; } while (($CUR_FPI >= $nbr_cur_fpc_fles) && $fparts_already_running ne '') { if ($DEBUG) {print "DEBUG: CUR_FPI=$CUR_FPI >= nbr_cur_fpc_fles=$nbr_cur_fpc_fles?\n";} if ($VERBOSE >= 2) {INFO("Waiting for fpart to get ahead of the transfer..\r"); } $nbr_cur_fpc_fles = `\\ls -U1 $glob | wc -l`; chomp $nbr_cur_fpc_fles; $fparts_already_running = `ps aux | grep 'fpar[t]'`; chomp $fparts_already_running; sleep 2; } $logfile = $parsync_dir . '/' ."rsync-logfile-" . $DATE . "_" . $CUR_FPI; $CUR_FP_FLE = "${FP_ROOT}.${CUR_FPI}" ; # generate the next fpart chunk file with $CUR_FPI $nbr_cur_fpc_fles = `\\ls -U1 $glob | wc -l`; chomp $nbr_cur_fpc_fles; $RSYNC_CMD = "rsync --bwlimit=$MAXBW -a -s --log-file=$logfile $RSYNCOPTS --files-from=$CUR_FP_FLE $ROOTDIR $TARGET & echo \"\${!}\" >> $PIDFILE"; if ($DEBUG) {print "\nDEBUG: Starting [$RSYNC_CMD]\n"; } # check status in a 1s loop checking to start extra rsyncs do we don't wait any more than 1s # OR keep cycling continuously on a 1s loop and ONLY print out info every X cycles. This is the easiest way. if (-e $CUR_FP_FLE) { fixfilenames($CUR_FP_FLE, $ROOTDIR); # check & fix for spaces, bad chars. 
if ($VERBOSE >= 3) {my $tt = $CUR_FPI+1; INFO("next chunk [$tt] of [$nbr_cur_fpc_fles]\n");}
          system("$RSYNC_CMD"); # capture the bg job PID to PIDfile
          $CUR_FPI++;
        }
    }
    ($rPIDs, $crr) = get_rPIDs($PIDFILE, $sPIDs);
  }
  # sleep 1;
  $NBR_FP_FLES = `\\ls -U1 ${FPCFS}* | wc -l`; chomp $NBR_FP_FLES; # get current # of chunks
  if ($rPIDs =~ /\d+/) { $STILLRSYNCS = 1; } else {$STILLRSYNCS = 0;}
} # while ($CUR_FPI < $NBR_FP_FLES )

my $host = `hostname`; chomp $host;
if (defined $EMAIL){system("echo 'all rsyncs done' | mail -s 'parsyncfp on host [$host] completed' $EMAIL");}

# and based on --dispose, (c)ompress, (d)elete, or (l)eave untouched all the chunk files.
if ($DISPOSE =~ /d/) {
  if ($VERBOSE >= 2) {print ".. and finally disposing of the cache...";}
  system("\\rm -rf ${FP_ROOT_DIR}/f*");
} elsif ($DISPOSE =~ /c/ ) { # can it just be put into background?
  if ($VERBOSE >= 2) {print ".. tarring up your cachefiles...";}
  $cmd="tar --remove-files -czf ${parsync_dir}/fpcache_${DATE}.tar.gz ${FP_ROOT_DIR} &";
  #print "final tar cmd: [$cmd] \n";
  system ("$cmd");
} elsif ($VERBOSE >=2 ) {
  INFO("Your cache files have been left intact in [${FP_ROOT_DIR}].
  Please dispose of them as you see fit.
  Reminder: check [$FPART_LOGFILE] for errors if there were errors.
  Thanks for using parsyncfp. Tell me how to make it better.
  \n");
}
exit;

# ================= subroutines =================

# Define utilities required to run this version of parsync
sub check_utils {
  my %UTILS = ( # required utils to help this run correctly
    "ethtool"  => "",
    "iwconfig" => "",
    "fpart"    => "",
  );
  # and check that they can be found..
  my $utilsz = keys %UTILS;
  foreach my $util (keys %UTILS){
    my $utilpath = `which $util | tr -d '\n'`;
    if ($utilpath !~ /$util/){
      FATAL("[$util] not found. you can get 'fpart' here: https://github.com/martymac/fpart
      and the rest via yum, apt-get, or google.
      Please install it or correct your PATH variable to include it.");
    } else {
      $UTILS{$util} = $utilpath;
      if ($DEBUG){print "\tEVAL: Found [$util] at [$utilpath].\n"}
    }
  }
}

# usage: ($rPIDs, $crr) = get_rPIDs($PIDFILE, $sPIDs);
sub get_rPIDs($$) {
  # Inputs
  my $pidfile = shift; # string name of PIDFILE
  my $spids = shift;   # suspended PIDs in a string.
  my @aprPIDs = ();
  my $NSusPIDs = 0;
  my @SusPIDs;
  my $rpids = "";      # to be generated and returned as a string
  my @crrPIDs = ();    # array that holds the currently running rsync PIDs
  my @ASRP;            # All System Rsync PIDs
  my $NASRP;
  my $crr = 0;         # currently running rsyncs counter
  my $apr = 0;         # all parsyncfp rsync PIDs

  # how many rsyncs are running? Check the PIDFILE against the rsync PIDs that are running
  # if there are other rsyncs running, their PIDs won't be in the PIDFILE.
  # so have to do a diff of the PIDFILE vs all PIDs of rsyncs running.
  my $ALL_SYS_RSYNC_PIDS = `ps aux | grep rsyn[c] | awk '{print \$2}' | sort -g | tr '\n' ' '`;
  chop $ALL_SYS_RSYNC_PIDS;
  $NASRP = @ASRP = split(/\s+/, $ALL_SYS_RSYNC_PIDS);
  open (PIDFILE, "<$pidfile") or FATAL("Can't open PIDFILE [$pidfile]");
  # PIDs from the PIDFILE to compare system rsyncs (could be multiple going)
  # with parsync-launched rsyncs
  while (<PIDFILE>) {chomp; $aprPIDs[$apr++] = $_; } # all parsyncfp rsync PIDs
  close PIDFILE;

  # if there are any PIDs in the $spids string, split into an array
  if ($spids =~ /\d+/) { $NSusPIDs = @SusPIDs = split(/\s+/, $spids); }
  $rpids =~ s/^\s+|\s+$//g ; $spids =~ s/^\s+|\s+$//g ; # strip leading/trailing spaces

  # suboptimal I know, but the arrays are so small it doesn't matter.
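  # Worked example with made-up PIDs: if the system shows rsync PIDs (12001 12002 12003),
  # the PIDFILE records (12001 12002 12044), and 12002 is currently suspended, then
  #   @crrPIDs -> (12001 12002)  # intersection of running system rsyncs and recorded PIDs
  #   $rpids   -> "12001"        # after the suspended 12002 is masked out below
  # 12003 belongs to some other rsync and 12044 has already exited, so both are ignored.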
  for (my $a=0; $a<$NASRP; $a++) {
    for (my $b=0; $b<$apr; $b++) {
      # if they match, they're MY rsyncs AND they're running
      if ($ASRP[$a] eq $aprPIDs[$b]) { $crrPIDs[$crr++] = $aprPIDs[$b]; }
    }
  }
  # dump @crrPIDs into $rpids
  $rpids = join(" ", @crrPIDs);
  $crr--; # trim off the extra incr

  # now mask out the sPIDs from the rPIDs list; works but ugly!
  $spids =~ s/^\s+|\s+$//g ;
  if ($spids =~ /\d+/) { # if there are any spids
    $NSusPIDs = @SusPIDs = split(/\s+/, $spids);
    for (my $r=0; $r<$NSusPIDs; $r++) {
      for (my $b=0; $b<$apr; $b++) {
        # if a sPID == rPID, delete the PID from the $rPIDs string
        if ( $SusPIDs[$r] eq $aprPIDs[$b]) { $rpids =~ s/$aprPIDs[$b]//g;}
      }
    }
  }
  return ($rpids, $crr);
}

sub getavgnetbw ($$$) {
  # call as (my $avgTCPrecv, $avgTCPsend, $avgRDMArecv, $avgRDMAsend) = getavgnetbw($NETIF, $CHECKPERIOD, $PERFQUERY);
  my ($avgrec,$avgtrans,$R1,$T1,$R2,$T2,$RDMA_T1,$RDMA_T2,$RDMA_R1,$RDMA_R2, $avgRDMAsend,$avgRDMArecv,$PQ);
  $avgRDMAsend = $avgRDMArecv = 0;
  my $NETIF = shift;
  my $CHECKPERIOD = shift;
  my $PQ = shift;
  $R1=`cat /sys/class/net/${NETIF}/statistics/rx_bytes`;
  $T1=`cat /sys/class/net/${NETIF}/statistics/tx_bytes`;
  if ($PQ) {
    $RDMA_T1 = `perfquery -x | grep XmitData | cut -f2 -d: | sed -e 's/\\.*//g'`; chomp $RDMA_T1;
    $RDMA_R1 = `perfquery -x | grep RcvData | cut -f2 -d: | sed -e 's/\\.*//g'`; chomp $RDMA_R1;
  }
  # now sleep
  sleep $CHECKPERIOD;
  $R2=`cat /sys/class/net/${NETIF}/statistics/rx_bytes`;
  $T2=`cat /sys/class/net/${NETIF}/statistics/tx_bytes`;
  if ($PQ) {
    $RDMA_T2 = `perfquery -x | grep XmitData | cut -f2 -d: | sed -e 's/\\.*//g'`; chomp $RDMA_T2;
    $RDMA_R2 = `perfquery -x | grep RcvData | cut -f2 -d: | sed -e 's/\\.*//g'`; chomp $RDMA_R2;
    # print "[$RDMA_T2] - [$RDMA_T1]\n";
    $avgRDMAsend = ( $RDMA_T2 - $RDMA_T1) / $CHECKPERIOD;
    $avgRDMArecv = ( $RDMA_R2 - $RDMA_R1) / $CHECKPERIOD;
  }
  $avgrec   = ($R2 - $R1) / $CHECKPERIOD;
  $avgtrans = ($T2 - $T1) / $CHECKPERIOD;
  # print "getavgnetbw(): avgRDMAsend = $avgRDMAsend\n";
  return ($avgrec, $avgtrans, $avgRDMArecv, $avgRDMAsend);
}

sub pause {
  print "Press [ENTER] to continue.\n";
  my $tmp = <STDIN>;
}

# color info string ($) blue
sub INFO($) {
  my $msg = shift;
  print color('bold blue'); print " INFO: $msg"; print color('reset');
}

# color warning string ($) orange
sub WARN($) {
  my $msg = shift;
  print color('bold magenta'); print " WARN: $msg \n"; print color('reset');
}

# color error string ($) red
sub ERROR($) {
  my $msg = shift;
  print color('bold red'); print " ERROR: $msg \n"; print color('reset');
}

sub FATAL($) {
  my $msg = shift;
  print color('bold red'); print "\n ** FATAL ERROR **: $msg \n\n"; print color('reset');
  exit(1);
}

# call as [debug(__LINE__, "string")] to print line # and debug string
sub debug($$) {
  my $line = shift;
  my $msg = shift;
  print STDERR "DEBUG[$line]: $msg\n";
  pause;
}

# fixfilenames reads in a file of filenames and iterates over them, fixing their
# names and emitting useful warning if something goes odd.
# called like: fixfilenames($CUR_FP_FLE, $ROOTDIR)
# where $CUR_FP_FLE = current fpart file (fqpn)
#       $ROOTDIR = pwd, or where all additional dirs are rooted.
sub fixfilenames {
  my $FN = shift;
  my $startdir = shift;
  $startdir .= '/'; # and suffixed with a '/'
  # print "\nstartdir = $startdir\n";
  my $fpnew = $FN . ".new";
".new"; open (FP, "< $FN") or die "ERROR: Can't open fp file [$FN]\n."; open (FPN, "> $fpnew") or die "ERROR: Can't open replacement file [$fpnew]\n."; my $lc = my $verified = my $failed = 0; while () { chomp; if ($_ =~ / /) { s/ /\ /g; } # subst all spaces with '\ ' s/^$startdir//g; # and also delete off the startdir (Thanks Ken Bass for the missing '^') print FPN "$_\n"; } close FP; close FPN; rename $fpnew, $FN; # and then rename the new one to the original } # ptgmk converts values suffixed with [PpTtGgMmKk] to bytes correctly # uses the 1024 bytes/kb as oppo to 1000 sub ptgmk { my $instr = shift; # trim spaces from back and front $instr =~ s/^\s+|\s+$//g; my $abbr = chop $instr; my $nbr = $instr; if ($abbr !~ /[PpTtGgMmKk]/) {FATAL("ptgmk() input doesn't contain [PpTtGgMmKk], so nothing to convert.");} if ($abbr =~ /[Kk]/) {$nbr *= 1024; return $nbr;} if ($abbr =~ /[Mm]/) {$nbr *= 1048576; return $nbr;} if ($abbr =~ /[Gg]/) {$nbr *= 1073741824; return $nbr;} if ($abbr =~ /[Tt]/) {$nbr *= 1.09951162778e+12; return $nbr;} if ($abbr =~ /[Pp]/) {$nbr *= 1.12589990684e+15; return $nbr;} } sub fix_ssh_config { $HOME = $ENV{"HOME"}; my $append_fxt = 0; if (-e "$HOME/.ssh/config") { # if it exists, fix it. open (CF, "<$HOME/.ssh/config") or FATAL("Can't open $HOME/.ssh/config, even tho it exists.. WTF??"); while () { if ($_ =~ /ForwardX11Trusted\s+yes/i) { $append_fxt = 0;} if ($_ =~ /ForwardX11Trusted\s+no/i) { $append_fxt = 1;} } close CF; } else { $append_fxt = 1;} if ($append_fxt) { INFO("parsyncfp would like to append 'ForwardX11Trusted yes' & 'ForwardX11 yes' to your ~/.ssh/config. Skipping this may result in a lot of odd ssh warnings being emitted during the run if you don't have ssh set correctly for the remote system, but the transfer should still work.) If this mod of your ~/.ssh/config file is OK, hit [Enter]. Otherwise hit [s] to skip.\n "); my $tmp = ; if ($tmp !~ /[sS]/) { system ("echo -n \"#Next 2 lines added by parsyncfp\nForwardX11Trusted yes\nForwardX11 yes\n\" >> $HOME/.ssh/config" ); system("chmod 600 $HOME/.ssh/config"); INFO("Your ~/.ssh/config file is set correctly.\n"); sleep 1; } else {INFO("Your ~/.ssh/config was not changed.\n"); sleep 1;} } } sub usage { #my $parsync_dir = shift; my $helpfile = "$HOME/.parsyncfp/parsyncfp-help.tmp"; if (! -d "$HOME/.parsyncfp") {mkdir "$HOME/.parsyncfp";} open HLP, ">$helpfile" or die "Can't open the temp help file [$helpfile]\n"; my $helptxt = < to create chunkfiles for rsync to read, bypassing the need to wait for a complete recursive scan. ie, it starts the transfer immediately. For large deep trees, this can be useful. It appropriates rsync's bandwidth throttle mechanism, using '--maxbw' as a passthru to rsync's 'bwlimit' option, but divides it by NP so as to keep the total bw the same as the stated limit. It monitors and shows network bandwidth, but can't change the bw allocation mid-job. It can only suspend rsyncs until the load decreases below the cutoff. If you suspend parsyncfp (^Z), all rsync children will suspend as well, regardless of current state. Unless changed by '--interface', it assumes and monitors the routable interface. The transfer will use whatever interface normal routing provides, normally set by the name of the target. It can also be used for non-host-based transfers (between mounted filesystems) but the network bandwidth continues to be (pointlessly) shown. [NB: Between mounted filesystems, parsyncfp sometimes works very poorly for reasons still mysterious. 
In such cases, I recommend the fpsync tool contained in the fpart package above].

It only works on dirs and files that originate from the current dir (or
specified via "--startdir"). You cannot include dirs and files from
discontinuous or higher-level dirs.

parsyncfp also does not use rsync's sophisticated/idiosyncratic treatment of
trailing '/'s to direct where files vs dirs are sent; dirs are treated as dirs
regardless of the trailing '/'.

** the [.parsyncfp] files **
The [.parsyncfp] dir contains the cache dir (fpcache), and the time-stamped
log files, which are NOT overwritten.

** Odd characters in names **
parsyncfp will refuse to transfer some oddly named files (tho it should copy
filenames with spaces fine). Filenames with embedded newlines, DOS EOLs, and
some other odd chars will be recorded in the log files in the [.parsyncfp] dir.

OPTIONS
=======
[i] = integer number
[s] = "quoted string"
[f] = floating point number
( ) = the default if any

--NP|np [i] (sqrt(#CPUs)) .............. number of rsync processes to start
      optimal NP depends on many vars. Try the default and incr as needed
--altcache|ac (~/.parsyncfp) ..... alternative cache dir for placing it on
      another FS or for running multiple parsyncfps simultaneously
--startdir|sd [s] (`pwd`) .................. the directory it starts at(*)
--maxbw [i] (unlimited) ........... in KB/s max bandwidth to use (--bwlimit
      passthru to rsync). maxbw is the total BW to be used, NOT per rsync.
--maxload|ml [f] (NP+2) .......... max system load - if loadavg > maxload,
      an rsync proc will sleep for 10s
--chunksize|cs [s] (10G) .... aggregate size of files allocated to one rsync
      process. Can specify in 'human' terms [100M, 50K, 1T] as well as
      integer bytes.
--rsyncopts|ro [s] ... options passed to rsync as quoted string (CAREFUL!)
      this opt triggers a pause before executing to verify the command(+)
--interface|i [s] ...... network interface to monitor (not use; see above)
      Only SENT bytes are displayed.
--checkperiod|cp [i] (3) ........ sets the period in seconds between updates
--verbose|v [0-3] (2) ....sets chattiness. 3=debug; 2=normal; 1=less; 0=none
      This only affects verbosity post-start; warning & error messages
      will still be printed.
--dispose|d [s] (l) .... what to do with the cache files. (l)eave untouched,
      (c)ompress to a tarball, (d)elete.
--email [s] ..................... email address to send completion message
--nowait ............. for scripting, sleep for a few s instead of pausing
--version ................................. dumps version string and exits
--help ......................................................... this help

(*) you can use globs/regexes with --startdir, but only if you're at that
point in the dir tree. ie: if you're not in the dir where the globs can be
expanded, then the glob will fail. However, explicit dirs can be set from
anywhere if given an existing startdir.

(+) the '--rsyncopts' string can pass any rsync option to all the rsyncs that
will be started. This allows options like '-z' (compression) or
'--exclude-from' to filter out unwanted files. Use any 'delete' options
carefully tho.

Hints & Workarounds
===================
IMPORTANT: rsync '--delete' options will not work with '--rsyncopts' bc the
multiple parallel rsyncs that parsyncfp launches are independent and therefore
don't know about each other (and so cannot exchange info about what should be
deleted or not). Use a final 'rsync --delete' to clean up the transfer if
that's your need.
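For example (the hosts and paths here are illustrative only), a common pattern
is to let parsyncfp move the bulk of the data in parallel and then let a single
trailing rsync reconcile deletions:
  % parsyncfp --NP=4 --startdir=/home/hjm dir1  hjm\@remote:/backups
  % rsync -a --delete /home/hjm/dir1/  hjm\@remote:/backups/dir1/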
If you see an error related to "sh: /usr/bin/ls: Argument list too long", it usually means that fpart has generated a huge list of chunkfiles (10s of 1000s) and 'ls' has trouble processing that many. This is usually due to pointing parsyncfp at a huge filesystem, with millions of files, with a chunksize that's too small (resulting in the above-noted too many chunkfiles). You can either increase the chunksize ('--chunksize=100G) which will result in a smaller number of chunk files to process, or split up the source dirs among multiple parsyncfps (which can be done using the '--altcache' option above). Examples ======== == Good example 1 == % parsyncfp --maxload=5.5 --NP=4 \ --chunksize=\$((1024 * 1024 * 4)) \ --startdir='/home/hjm' dir[123] \ hjm\@remotehost:~/backups where = -"-maxload=5.5" will start suspending rsync instances when the 1m system load gets to 5.5 and then unsuspending them when it goes below it. = "--NP=4" forks 4 instances of rsync = "--chunksize=\$((1024 * 1024 * 4))" sets the chunksize, by multiplication or by explicit size: 4194304 = "--startdir='/home/hjm'" sets the working dir of this operation to '/home/hjm' and dir1 dir2 dir3 are subdirs from '/home/hjm' = the target "hjm\@remotehost:~/backups" is the same target rsync would use It uses 4 instances to rsync dir1 dir2 dir3 to hjm\@remotehost:~/backups == Good example 2 == parsyncfp --checkperiod 6 --NP 3 --interface eth0 --chunksize=87682352 \ --rsyncopts="--exclude='[abc]*'" nacs/fabio hjm\@moo:~/backups The above command shows several options used correctly: --chunksize=87682352 - shows that the chunksize option can be used with explicit integers as well as the human specifiers (TGMK). --rsyncopts="--exclude='[abc]*'" - shows the correct form for excluding files based on regexes (note the quoting) nacs/fabio - shows that you can specify subdirs as well as top-level dirs (as long as the shell is positioned in the dir above, or has been specified via '--startdir' == Good example 3 == parsyncfp -v 1 --nowait --ac pfpcache1 --NP 4 --cp=5 --cs=50M --ro '-az' \ linux-4.8.4 moo:~/test The above command shows: - short version of several options (-v for --verbose, --cp for checkperiod, etc) - shows use of --altcache (--ac pfpcache1), writing to relative dir pfpcache1 - again shows use of --rsyncopts (--ro '-az') indicating 'archive' & compression'. - includes '--nowait' to allow unattended scripting of parsyncfp == Error example 1 == % pwd /home/hjm # executing parsyncfp from here % parsyncfp --NP4 /usr/local /media/backupdisk why this is an error: = '--NP4' is not an option (parsyncfp will say "Unknown option: np4" It should be '--NP=4' or '--NP 4' = if you were trying to rsync '/usr/local' to '/media/backupdisk', it will fail since there is no /home/hjm/usr/local dir to use as a source. This will be shown in the log files in ~/.parsync/rsync-logfile-_# as a spew of "No such file or directory (2)" errors The correct version of the above command is: % parsyncfp --NP=4 --startdir=/usr local /media/backupdisk HELP print HLP $helptxt; close HLP; system("less -S $helpfile"); unlink $helpfile; die "Did that help?. Send suggestions for improvement to \n"; }