#!/usr/bin/env perl
# by Harry Mangalam, hjmangalam@gmail.com

# after significant changes, update the tarballs that need it and cp to moo for distribution; update the scut github
# export filename="/home/hjm/bin/stats"; scp ${filename} moo:~/public_html; scp ${filename} moo:~/bin;  scp ${filename}   dabrick:~/bin; ssh moo 'scp bin/stats hmangala@hpcs:~/bin'

# cd ~/gits/scut; cp ~/bin/stats .; git add stats; git commit -m 'commit message'; git push
# TODO add QQ plot to test for normality?
use strict;
use Getopt::Long;

use vars qw( $wide $dist $Xsize $Ysize $ln $N $sum $Min $Max $XYDist @Data $pager
    $XRange $XBinSize $XMul @SData $NWH $Median $even $Mean $SumDiffs2 $SumDiffs3 
    $SumDiffs4 $ValCnt $Val $MaxSoFarValCnt $ModeInd @Dist $jmin $jmax $J $YMax 
    $YBinSize $YMul @XYDist $ModeNum $Mode $S2 $S $Kurtosis $SEM $Skew $StdSkew 
    $gfmt $VERSION $DATE $HELPFILE $HELP $ConfIntLow $ConfIntHi $QUIET $stdout 
    %xfhash $xf $id $od $raw $QUANTILE $cutindex $cutvalue @QUANT $QUANTSTR
    $ONLY $pSUM $pNUM $pMEAN $pMEDIAN $pMODE $pNMODES $pMIN $pMAX $pRANGE $pVAR
    $pSD $pSEM $p95C $pSKEW $pSTDSKEW $pKURTOSIS $pQUANTILE $pQindex $PopKurtosis
    $pPopKURTOSIS @pr %OUT $MINLIMIT $MAXLIMIT $CAPTURE %ConfMult $pCONF $CONFINT 
    $pConfindex
);
    
# ---- Program identity and option defaults ---------------------------------
$VERSION = "2.1.1 LonelyXmas";
$DATE = "Dec 23, 2020";
$ONLY = 0;            # number of individually requested statistics (--sum, --mean, ...)
# Pager used by usage() to show the help text: honour $PAGER when it is set,
# otherwise fall back to 'less'.  (Before, $pager stayed undefined whenever
# $PAGER *was* set, so the help could not be displayed.)
$pager = (defined $ENV{'PAGER'} && length $ENV{'PAGER'}) ? $ENV{'PAGER'} : "less";
$gfmt = 1;            # format numbers with %g unless --raw / --nogfmt is given
$NWH = 0;             # 'No Wide Headers' flag
$stdout = 0;          # --stdout: echo the (transformed) numbers instead of doing stats
$pQUANTILE = 0;
$pQindex = 100000;    # outside possible range.
$pConfindex = 100000; # likewise outside the possible range until --conf assigns a real slot
$MINLIMIT = -1e32;    # values must be greater than this to be used
$MAXLIMIT = 1e32;     # values must be smaller than this to be used
$CAPTURE = 0;
# Names accepted by --xf; the empty string is listed so "no transform" passes the check.
%xfhash = (
  'log10'  => 1,  'ln'     => 1,  'sqrt'   => 1,  'x^2'    => 1,  'x^3'    => 1,
  '1/x'    => 1,  'sin'    => 1,  'cos'    => 1,  'tan'    => 1,  'asin'   => 1,
  'acos'   => 1,  'atan'   => 1,  'round'  => 1,  'trunc'  => 1,  'frac'   => 1,
  'abs'    => 1,  'exp'    => 1,  'pass'   => 1,  ''       => 1
  );
# Multiplier applied to the standard error for each supported confidence level.
%ConfMult = ( # see: https://www.mathsisfun.com/data/confidence-interval.html
  '80' => 1.282,  '85' => 1.440,  '90' => 1.645,  '95' => 1.960,  '99' => 2.576,
  '99.5' => 2.807,  '99.9' => 3.291
);

# Parse the command line.  A trailing '!' marks a boolean flag (which also
# accepts the --no<name> form), '=i' takes an integer, '=f' a real number and
# '=s' a string.  Unknown options are reported by Getopt::Long on STDERR.
GetOptions(
   "wide!"   => \$wide,  # no args - just set to 1
   "dist=i"  => \$dist,  # 1 for 1-liner, 2 for xy plot with the following vars, 3 for both
   "x=i"     => \$Xsize, # the # of characters in the X axis
   "y=i"     => \$Ysize, # the # of lines in the Y axis
   "help!"   => \$HELP,  # ask for help
   "h!"      => \$HELP,
   "quiet!"  => \$QUIET, # shhhhh!
   "nwh!"    => \$NWH,   # No Wide Headers (if repeating wide mode, don't want headers)
   "stdout!" => \$stdout,   # just print the numbers; don't do stats on them
   "xf=s"    => \$xf,    # do a transform of the #s before doing anything.
   "gfmt!"   => \$gfmt,  # set to Perl's 'general' numeric notation; leave alone so interface doesn't change
   "raw!"    => \$raw,   # set to see raw Perl notation (not gfmt, now the default)
   "id=s"    => \$id,    # input delimiter
   "od=s"    => \$od,    # output delimiter
   # The limits are compared against real-valued input, so accept reals
   # (0.5, -3.2e4, ...) and not just integers.
   "minlimit=f" => \$MINLIMIT, # filter values <= this
   "maxlimit=f" => \$MAXLIMIT, # filter values >= this
   "quantile=i" => \$QUANTILE, # number of quantiles to calculate
   "conf=s"   => \$CONFINT, # confidence interval
   "capture!" => \$CAPTURE, # capture filtered values in './stats.exceptions', overwritten each time
   "sum!"     => \$pSUM,    # for printSUM, etc
   "num!"     => \$pNUM,    #
   "mean!"    => \$pMEAN,    #
   "median!"  => \$pMEDIAN,    #
   "mode!"    => \$pMODE,    #
   "nmodes!"  => \$pNMODES,    #
   "min!"     => \$pMIN,    #
   "max!"     => \$pMAX,    #
   "range!"   => \$pRANGE,    #
   "var!"     => \$pVAR,    #
   "sd!"      => \$pSD,    #
   "sem!"     => \$pSEM,    #
#   "conf!"    => \$pCONF,    #
   "skew!"    => \$pSKEW,    #
   "stdskew!" => \$pSTDSKEW,    #
   "kurt!"    => \$pKURTOSIS,    #
   "popkurt!" => \$pPopKURTOSIS, #
   "qq!"      => \$pQUANTILE,   # quantile
   );

# ---- Post-process the parsed options ----------------------------------------

# Confidence level: strip surrounding blanks and any '%' sign, then insist on
# one of the levels that has a multiplier in %ConfMult.  Defaults to 95.
if (defined $CONFINT) {
    $CONFINT = trim($CONFINT);
    $CONFINT =~ tr/%//d;
    $pCONF = 1; # if define it, want to print it.
    if (!defined $ConfMult{$CONFINT}) {
        die("The Confidence Interval you specified [$CONFINT] isn't supported.
        Supported values are 80, 85, 90, 95, 99, 99.5, 99.9");
    }
} else {
    $CONFINT = "95";
}

if (defined $QUANTILE) { $pQUANTILE = 1; } # if define, want to print it.

# Build @pr, the ordered list of individually requested statistics; $ONLY
# counts them.  The slots holding the confidence interval and the quantiles
# are remembered because those two are printed as text rather than as numbers.
my @stat_requests = (
    [ $pSUM,         "Sum"       ],
    [ $pNUM,         "Number"    ],
    [ $pMEAN,        "Mean"      ],
    [ $pMEDIAN,      "Median"    ],
    [ $pMODE,        "Mode"      ],
    [ $pNMODES,      "NModes"    ],
    [ $pMIN,         "Min"       ],
    [ $pMAX,         "Max"       ],
    [ $pRANGE,       "Range"     ],
    [ $pVAR,         "Variance"  ],
    [ $pSD,          "Std_Dev"   ],
    [ $pSEM,         "SEM"       ],
    [ $pCONF,        "Conf_Int"  ],
    [ $pSKEW,        "Skew"      ],
    [ $pSTDSKEW,     "Std_Skew"  ],
    [ $pKURTOSIS,    "Kurtosis"  ],
    [ $pPopKURTOSIS, "PopKurt"   ],
    [ $pQUANTILE,    "Quantiles" ],
);
for my $request (@stat_requests) {
    my ($wanted, $label) = @$request;
    next unless $wanted;
    if ($label eq "Conf_Int")  { $pConfindex = $ONLY; }
    if ($label eq "Quantiles") { $pQindex    = $ONLY; }
    $pr[$ONLY++] = $label;
}
# $ONLY is now pointing past the end of the array

# --capture: values rejected by --minlimit/--maxlimit are written here.
# The error message now names the file that is actually opened and reports
# the OS error.
if ($CAPTURE) {
    open(CAP, '>', './stats.exceptions')
        or die "Can't open capture file [./stats.exceptions]: $!\n";
}

# print indiv results by using $pr[x] as header and $OUT{"$pr[x]"} as the value.
if ($raw) { $gfmt = 0; } # set to perl native format

#  perltidy cmd to format uniformly: perltidy -ce -i=2 -l=100 stats 
# 11.16.2020 added variable quantiles, variable confidence intervals.
# 10.26.2020 added quantiles, variable printing.
# 11.28.2017 added xf=pass for simple text/data filtering.
# 11.10.2017 added transforms (--xf) & --stdout so stats can be used as an inline transform
# 06.15.2017  added 95% confidence intervals
# 03.12.2014 added '--quiet' to silence non-fatal warnings.
# 01.11.12 added 'general numeric format, replacing strict sci notation
# 05.01.08 added comma removal after embarrassing conversation with credit card company
#  7.14.06 add --sci to format for scientific notation output. ie:
#  --sci
#  2.05.01 added Distribution calc/graph
#  2.01.01 format change to ease integration (Mode, NMode# split onto separate lines)
#         Made Labels single words and unambiguous for easier grepping

# 11.10.00 added wide printing (--wide)
#  9.27.00 adding check for included non-numbers
#  4.21.00 adding check for FLAT mode, modecount =1
# 11.24.99 adding Mode, Mode count, Median to output.
# ---- Accumulators filled by the input loop -----------------------------------
$N = 0;
$sum = 0;
$Min = $Max = 0;

# --help / -h always wins, whether or not data is being piped in.
# (It used to be honoured only when STDIN was a terminal.)  usage() exits.
if ($HELP) { usage(); }

# handle input and offer help if none.
if (-t STDIN) {
    print "\n[$0] will emit descriptive statistics based on \n"
        . "all number-like input fed it on STDIN.  Use '-h' for more help.\n";
    exit 0;
}

# Reject unknown transforms; an absent --xf simply means "no transform".
if (defined $xf && !$xfhash{$xf}) {
    die "ERROR: I don't support that transform; try again or see the help (-h)\n";
}

# define undefined vars
if (!defined $Xsize)    { $Xsize = 60; }
if (!defined $Ysize)    { $Ysize = 25; }
if (!defined $id)       { $id = "\\s+"; }
if (!defined $od)       { $od = "\t"; }
if (!defined $QUANTILE) { $QUANTILE = 5; }

# The plot scaling divides by $Xsize and by ($Ysize - 1); refuse sizes that
# would make those divisions blow up later.
if ($Xsize < 1) { die "ERROR: --x must be at least 1 (got [$Xsize])\n"; }
if ($Ysize < 2) { die "ERROR: --y must be at least 2 (got [$Ysize])\n"; }

#Zero the DIST array
for my $col (0 .. $Xsize - 1) {
    for my $row (0 .. $Ysize - 1) {
        $XYDist[$col][$row] = ' ';
    }
}

# main loop to ingest data
# Every input line is trimmed and split on the input delimiter.  Tokens that
# look like numbers (after removing commas) and fall strictly between
# $MINLIMIT and $MAXLIMIT are optionally transformed (--xf), optionally echoed
# (--stdout), and accumulated in @Data / $sum / $Min / $Max / $N.
while (<>) {
    $_ = trim($_);
    my @tokens = split /$id/;
    my @emit;    # values of this line to be echoed when --stdout is on
    for my $tok (@tokens) {
        # make sure all the things we're including are number-like
        # remove commas to prevent rejection downstream
        $tok =~ s/,//g;
        # stolen from https://docstore.mik.ua/orelly/perl4/cook/ch02_02.htm, 2.1.3
        next unless $tok =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/;

        # range filter; rejected numbers go to the capture file when asked for
        if (!($tok > $MINLIMIT && $tok < $MAXLIMIT)) {
            if ($CAPTURE) { print CAP "$tok\n"; }
            next;
        }

        my $v = $tok;
        if (defined $xf) { # want to exec a transform; already checked that its supported.
            # Inputs outside a transform's domain yield "NA".
            if    ($xf eq "pass")  { } # simply a filtering and passthru.
            elsif ($xf eq "log10") { $v = ($v > 0)  ? log($v) / log(10) : "NA"; }
            elsif ($xf eq "ln")    { $v = ($v > 0)  ? log($v)           : "NA"; }
            elsif ($xf eq "sqrt")  { $v = ($v >= 0) ? sqrt($v)          : "NA"; }
            elsif ($xf eq "x^2")   { $v = $v * $v; }
            elsif ($xf eq "x^3")   { $v = $v * $v * $v; }
            elsif ($xf eq "1/x")   { $v = ($v != 0) ? 1 / $v            : "NA"; } # was a fatal divide by zero
            elsif ($xf eq "sin")   { $v = sin($v); }
            elsif ($xf eq "cos")   { $v = cos($v); }
            elsif ($xf eq "tan")   { my $c = cos($v); $v = ($c != 0) ? sin($v) / $c : "NA"; }
            # Inverse trig functions via atan2 (the relations given in perlfunc).
            # The old code computed the reciprocals 1/sin, 1/cos and 1/tan
            # instead, and 'atan' died because Perl has no builtin tan().
            elsif ($xf eq "asin")  { $v = (abs($v) <= 1) ? atan2($v, sqrt(1 - $v * $v)) : "NA"; }
            elsif ($xf eq "acos")  { $v = (abs($v) <= 1) ? atan2(sqrt(1 - $v * $v), $v) : "NA"; }
            elsif ($xf eq "atan")  { $v = atan2($v, 1); }
            # round half away from zero; int($v + 0.5) alone is wrong for negatives
            elsif ($xf eq "round") { $v = ($v >= 0) ? int($v + 0.5) : -int(-$v + 0.5); }
            elsif ($xf eq "trunc") { $v = int($v); }
            elsif ($xf eq "frac")  { $v = $v - int($v); }
            elsif ($xf eq "abs")   { $v = abs($v); }
            elsif ($xf eq "exp")   { $v = exp($v); }
        }

        # An undefined transform result is shown as NA on --stdout but must not
        # enter the statistics (it used to be counted as the number 0).
        if ($v eq "NA") {
            push @emit, "NA" if $stdout;
            next;
        }

        if ($stdout) { push @emit, ($gfmt ? sprintf("%g", $v) : "$v"); }

        $sum += $v; # sum the numbers as they come in
        if ($N == 0) { $Min = $Max = $v; }
        if ($v < $Min) { $Min = $v; }
        if ($v > $Max) { $Max = $v; }
        $Data[$N++] = $v;  # store them for calcing the SD, etc
    }
    # One output line per input line that produced values; always newline
    # terminated, even when the last token of the line was rejected.
    if ($stdout && @emit) { print join($od, @emit), "\n"; }
}

if (! $stdout) {
    # All the numbers are in; now calculate and print the values wanted.

    # With no usable input every statistic is undefined (the mean would divide
    # by zero), so stop with a readable message instead.
    if ($N == 0) {
        die "ERROR: no number-like values found in the input; nothing to calculate.\n";
    }

    # Core module, loaded here so the helpers below can tell real numbers from
    # placeholder text such as "FLAT".
    require Scalar::Util;

    # Render a value with %g (optionally padded to $width columns).  Text is
    # passed through unchanged and an undefined value is shown as "NA".
    my $as_g = sub {
        my ($val, $width) = @_;
        $width = '' unless defined $width;
        return sprintf("%${width}s", 'NA') unless defined $val;
        return Scalar::Util::looks_like_number($val)
            ? sprintf("%${width}g", $val)
            : sprintf("%${width}s", $val);
    };
    # Render one cell in the current numeric mode (%g by default, raw with --raw).
    my $cell = sub {
        my ($val) = @_;
        return 'NA' unless defined $val;
        return $gfmt ? $as_g->($val) : "$val";
    };

    # autoscale the X axis; a negative bin size / multiplier means "no distribution"
    $XRange = $Max - $Min;
    if ($XRange != 0 && $Xsize > 0) {
        $XBinSize = $XRange/$Xsize;
        $XMul = $Xsize/$XRange;
    } else {
        $XBinSize = -1;
        $XMul = -1;
    }

    # mode and median need the data sorted
    @SData = sort {$a <=> $b}  @Data;

    # Median: middle element for odd N, mean of the two middle elements for
    # even N (0-based indices; the odd case used to read one slot too far).
    my $mid = int($N / 2);
    if ($N % 2 == 0) {
        $Median = ($SData[$mid - 1] + $SData[$mid]) / 2;
        $even = 1;
    } else {
        $Median = $SData[$mid];
        $even = 0;
    }
    $Mean = $sum / $N;
    $SumDiffs2 = 0;
    $SumDiffs3 = 0;
    $SumDiffs4 = 0;

    #init Distribution array
    for (my $i=0; $i<$Xsize; $i++){ $Dist[$i] = 0; }
    $jmin = $jmax = 0;
    for my $v (@Data) {
        my $dev = $v - $Mean;
        $SumDiffs2 += $dev**2;
        $SumDiffs3 += $dev**3;
        $SumDiffs4 += $dev**4;

        # calc the distribution: bins are counted from $Min so that indices
        # stay within 0 .. $Xsize-1 (the maximum lands in the last bin)
        if ($XMul > 0) {
            $J = int(($v - $Min) * $XMul);
            if ($J > $Xsize - 1) { $J = $Xsize - 1; }
            if ($J < 0)          { $J = 0; }
            if ($J < $jmin) { $jmin = $J; }
            if ($J > $jmax) { $jmax = $J; }
            $Dist[$J]++;
        } #else {print "\nErr: All #s same, no range, no distribution\n";}
    }

    # Mode: find the longest run of equal values in the sorted data.  Every
    # run, including the last one, is measured the same way, so
    # $MaxSoFarValCnt is the true number of occurrences of the mode.
    $MaxSoFarValCnt = 0;
    $ModeInd = 0;
    my $run_start = 0;
    for (my $i = 1; $i <= $N; $i++) {
        next if ($i < $N && $SData[$i] == $SData[$run_start]);
        my $run_len = $i - $run_start;
        if ($run_len > $MaxSoFarValCnt) {
            $MaxSoFarValCnt = $run_len;
            $ModeInd = $run_start;
        }
        $run_start = $i;
    }

#vvvvvvvvv Don't include in golang version vvvvvvv
    #Scale the Y axis; 1st find out the range for Y (over every plotted bin)
    $YMax = 0;
    for (my $i=0; $i<$Xsize; $i++) {
        if ($Dist[$i] > $YMax) { $YMax = $Dist[$i]; }
    }
    if ($YMax == 0) { $YMax = 1;}
    my $yrows = ($Ysize > 1) ? $Ysize - 1 : 1;   # never divide by zero
    $YBinSize = $YMax/$yrows;
    $YMul = $yrows / $YMax;
    for (my $x=0; $x<$Xsize; $x++) {
        my $y = int($Dist[$x] * $YMul);
        $XYDist[$x][$y] = '*';
    }
#^^^^^^^^^^^^ Don't include in golang version ^^^^^^^^^^^^

    ## Calc Quantiles
    # not providing min/max values since they're already printed
    # fill the array, create a string; no printing here.
    $QUANTSTR = "";
    if (defined $QUANTILE){
        $cutindex = 0;
        for (my $q=1; $q<$QUANTILE; $q++) {
            my $inc = int(($N * ($q/$QUANTILE))+1);
            $cutindex = $inc - 1;
            my $cutvalue =  $SData[$cutindex];
            $QUANT[$q][0] = $cutindex; $QUANT[$q][1] = $cutvalue;
            $QUANTSTR .= "$q\t$cutindex\t$cutvalue\n";
        }
    }

    if ($MaxSoFarValCnt > 1) {
        $ModeNum = $MaxSoFarValCnt;
        $Mode = $SData[$ModeInd];
    } else {
        $ModeNum = "No # was represented more than once";
        $Mode = "FLAT";
    }

    # Basic statistics exist for any non-empty input, also when every value is
    # identical; only the shape statistics need a spread.
    $S2  = ($N > 1) ? $SumDiffs2 / ($N - 1) : 0;
    $S   = sqrt($S2);
    $SEM = $S / sqrt($N);
    $ConfIntHi  = $Mean + ($ConfMult{$CONFINT} * $SEM);
    $ConfIntLow = $Mean - ($ConfMult{$CONFINT} * $SEM);

    # set up the %OUT hash
    $OUT{"Sum"} = $sum;
    $OUT{"Number"} = $N;
    $OUT{"Mean"} = $Mean;
    $OUT{"Median"} = $Median;
    $OUT{"Mode"} = $Mode;
    $OUT{"NModes"} = $ModeNum;
    $OUT{"Min"} = $Min;
    $OUT{"Max"} = $Max;
    $OUT{"Range"} = $XRange;
    $OUT{"Variance"} = $S2;
    $OUT{"Std_Dev"} = $S;
    $OUT{"SEM"} = $SEM;
    $OUT{"Conf_Int"} = sprintf("%g:%g",$ConfIntLow,$ConfIntHi);
    $OUT{"Quantiles"} = $QUANTSTR;

    # Shape statistics.  Kurtosis needs a non-zero spread; skew and the
    # sample-size-corrected kurtosis additionally need N > 3 because their
    # denominators contain (N-2) and (N-3).
    my $has_spread = ($XRange > 0 && $S > 0);
    my $have_shape = ($has_spread && $N > 3);
    if ($has_spread) {
        $OUT{"Kurtosis"} = $Kurtosis = $SumDiffs4 / ($N * $S**4);
    }
    if ($have_shape) {
        $OUT{"Skew"} = $Skew = ($N * $SumDiffs3) / (($N-1) * ($N-2) * ($S ** 3));
        $OUT{"Std_Skew"} = $StdSkew = $Skew / sqrt(6/$N);
        # Sample excess kurtosis:
        #   n(n+1)/((n-1)(n-2)(n-3)) * sum((x-mean)^4)/s^4 - 3(n-1)^2/((n-2)(n-3))
        # ('^' in Perl is XOR, so the square has to be written with '**').
        $OUT{"PopKurt"} = $PopKurtosis =
              (($N * ($N+1)) / (($N-1) * ($N-2) * ($N-3))) * ($SumDiffs4 / $S**4)
            - ((3 * ($N-1)**2) / (($N-2) * ($N-3)));
    }

    if (!$ONLY && !$wide && $gfmt == 0) {  # don't include in golang version
    print  "Sum       $sum",
            "\nNumber    $N",
            "\nMean      $Mean",
            "\nMedian    $Median",
            "\nMode      $Mode  ",
            "\nNModes    $ModeNum",
            "\nMin       $Min",
            "\nMax       $Max",
            "\nRange     $XRange",
            "\nVariance  $S2",
            "\nStd_Dev   $S",
            "\nSEM       $SEM",
            "\n${CONFINT}% Conf  $ConfIntLow to $ConfIntHi",
            "\n          [for a normal distribution (ND) - see Skew & Kurtosis]",
            "\nQuantiles ($QUANTILE)\n\tIndex\tValue\n$QUANTSTR";
    } elsif (!$ONLY && !$wide && $gfmt > 0) {  # shorten this post verify
    printf   "Sum       %g",$sum;
    printf "\nNumber    %g",$N;
    printf "\nMean      %g",$Mean;
    printf "\nMedian    %g",$Median;
    # Mode / NModes may hold text ("FLAT"), which %g would print as 0
    printf "\nMode      %s",$as_g->($Mode);
    printf "\nNModes    %s",$as_g->($ModeNum);
    printf "\nMin       %g",$Min;
    printf "\nMax       %g",$Max;
    printf "\nRange     %g",$XRange;
    printf "\nVariance  %g",$S2;
    printf "\nStd_Dev   %g",$S;
    printf "\nSEM       %g",$SEM;
    # %s keeps levels such as 99.5 intact; %% prints the literal percent sign
    printf "\n%s%% Conf  %g to %g", $CONFINT, $ConfIntLow, $ConfIntHi;
    print  "\n          [for a normal distribution (ND) - see skew]";
    print  "\nQuantiles ($QUANTILE)\n\tIndex\tValue\n$QUANTSTR";
    }
    if (!$ONLY && $have_shape && !$wide) {
        if (!$gfmt) {
            print  "Skew      $Skew",
                "\n          [Skew=0 for a symmetric dist]",
                "\nStd_Skew  $StdSkew",
                "\nKurtosis  $Kurtosis",
                "\n          [Kurtosis=3 for a normal dist (ND)]",
                "\nPopKurt   $PopKurtosis",
                "\n          [Pop'n Kurtosis is normalized to sample size; PK=0 for a ND]\n"
        } else {
            printf   "Skew      %g", $Skew;
            print  "\n          [Skew=0 for a symmetric dist]";
            printf "\nStd_Skew  %g", $StdSkew;
            printf "\nKurtosis  %g", $Kurtosis;
            print  "\n          [Kurtosis=3 for a ND]";
            printf "\nPopKurt   %g", $PopKurtosis;
            print  "\n          [Pop'n Kurtosis is normalized to sample size; PK=0 for a ND]\n"
        }
    } elsif (!$ONLY && !$QUIET) {
        print STDERR "#Std Dev = 0 or N <=3 or printing wide; Skipping all Skewness & Kurtosis cal'ns.\n";
    }

    # Wide (one-line) output: 18 tab-separated columns, one header per column.
    # Values that could not be calculated are shown as NA in the row itself.
    if (!$ONLY && $wide) {
        if (!$NWH){
            print  "# (Quantiles not printed in wide mode.)\n",
                   "#Sum\tN\tMean\tMedian\tMode\tNModes\tMin\tMax\tRange\tVariance\tStd_Dev\tSEM\t${CONFINT}%L\t${CONFINT}%H\tSkew\tStd_Skew\tKurtosis\tPopKurt\n";
        }
        my @row = ($sum, $N, $Mean, $Median, $Mode, $ModeNum, $Min, $Max, $XRange,
                   $S2, $S, $SEM, $ConfIntLow, $ConfIntHi,
                   $Skew, $StdSkew, $Kurtosis, $PopKurtosis);
        print join("\t", map { $cell->($_) } @row), "\n";
        if (!$have_shape && !$QUIET) {
            print  STDERR "Std Dev = 0 or N <=3; Skipping all Skewness & Kurtosis cal'ns.\n";
        }
    }

    # print out the distribution
    # this way prints it out in 1 line
    my $dist_mode = defined $dist ? $dist : 0;
    if ($dist_mode == 1 || $dist_mode == 3) {
        for (my $r=0; $r<($Xsize); $r++) {
            if ($Dist[$r] < 10) { print $Dist[$r]; }
            else { print "($Dist[$r])"; }
        }
        print "\n";
    }

    # if used only 1 option, print out the single value requested
    if ($ONLY == 1) {
        if ($pQUANTILE) {
            printf "Quant\tIndex\tValue\n%s", $OUT{$pr[0]};
        } elsif ($pCONF) {
            printf "%s\n", $OUT{$pr[0]};
        } else {
            print $as_g->($OUT{$pr[0]}), "\n";
        }
    } elsif ($ONLY > 1) {
        # several selected statistics: a '#'-prefixed header line, then the values
        print "#";
        for (my $r=0;$r<=$#pr; $r++) {
            printf "%12s%s", $pr[$r],$od;
        } print "\n";
        for (my $r=0;$r<=$#pr; $r++) {
            # $pConfindex is only set when --conf was given; comparing against
            # an undefined index would wrongly match column 0.
            if (defined $pConfindex && $r == $pConfindex) {
                printf "%s%s",$OUT{$pr[$r]},$od;
            } elsif ($r != $pQindex) {
                print $as_g->($OUT{$pr[$r]}, 12), $od;
            } elsif ($pQUANTILE) {
                printf "(quantiles next lines)\nQuant\tIndex\tValue\n%s",$OUT{$pr[$r]};
            }
        } print "\n";
    }

    # omit from golang version
    # this way prints a little xy graph, if wanted
    my $spacer = " " x ($Xsize - 14);

    if ($XBinSize > 0) {
        if ($dist_mode == 2 || $dist_mode == 3) {
            print "\n\nDistribution\nX BinSize $XBinSize\nY BinSize  $YBinSize\n\nYMax:$YMax\n      |";
            for (my $y=($Ysize-1); $y>=0; $y--) {
                for (my $x=0; $x<$Xsize; $x++) { print "$XYDist[$x][$y]"; }
                print "\n      |";
            }
            for (my $x=0; $x<$Xsize; $x++) { print '-';}
            print "\n  X Min $spacer        X Max\n";
            printf "%7.2f %s %12.2f \n", $Min, $spacer, $Max;
            # only suggest the ln transform when it is not already in use
            if (!(defined $xf && $xf eq 'ln')) {
                print "\nIf points are jammed at one end, use '--xf=ln' to spread them.\n";
            }
        }
    } elsif ($XRange == 0 && !$QUIET) {
        print STDERR "\nINFO: Identical numbers in input; no range, no distribution\n";
    }
}
 
 
# trim(STRING): return a copy of STRING without leading or trailing whitespace.
# The caller's variable is left untouched.
sub trim($) {
    my ($text) = @_;
    for ($text) {
        s/^\s+//;    # drop leading blanks
        s/\s+$//;    # drop trailing blanks (including a final newline)
    }
    return $text;
}

# usage(): show the full help text through the user's pager, then exit 0.
#
# The text is written to a hidden temporary file in the current directory,
# shown with "$pager <file>", and the file is removed again.  If the file
# cannot be written, or the pager cannot be started, the help is printed
# straight to STDOUT instead, so -h always produces the help.
# Reads the globals $VERSION, $DATE and $pager; never returns.
sub usage {
my $LESSHELP = <<HELP;
 stats version: $VERSION, last mod: $DATE
 Harry Mangalam <hjmangalam\@gmail.com>
 Lives with 'scut' and 'cols' here: <https://github.com/hjmangalam/scut>

 'stats' is a utility that reads STDIN for all #s, whether in one line
 or in many (removes commas, checks for text contaminants; only have to 
 be separated by whitespace), calculates some basic stats, then spits 
 them to STDOUT.  Starting from this version, the default is to format the 
 test in Perl's General Numeric Format (gfmt), altho you can ask for the raw
 output with '--raw'.
 
 New in 2.0.8, you can ask for specific output with the options listed below.  
 If you ask for a single output '--sum', it will be emitted as a single, 
 naked number.  If you ask for multiple specific outputs, they will 
 have a #-prefixed line of headers above the numbers, roughly aligned and 
 separated by the output delimiter ('--od', by default a <tab>).
 If you don't ask for specific stats, all will be emitted as shown in the
 example stanza below. 
 
 Starting in V2.x, you can choose from a small set of transforms (see 
 below) to be applied to the data and then emitted with '--stdout' 
 (thus using stats as a transform filter) or  have the stats applied 
 to that transformed data.

 usage: stats < file.of.numbers
     or
 cmd1 | cmd2 |cmd3 | stats [options]

 where Options are:
       --help .......................................   dumps this help
       --nwh ...... 'No Wide Headers' (no headers on wide output output
                        good for repeating output as for logging stuff)
       --xf="fn" ..... to transform STDIN before doing the stats, where
                         "fn" is one of log10, ln, sqrt, x^2, x^3, 1/x, 
                       sin, cos, tan, asin, acos, atan, round, abs, exp, 
                 pass(thru), trunc (integer part), frac (decimal part). 
       --stdout ....... JUST print STDIN to STDOUT (don't do any stats)
                                          (--xf also applies to STDOUT)
       --id="token" .... Input delimiter; defaults to whitespace (\\s+)
       --od="token" ........... Output delimiter; defaults to tab (\\t)
       --minlimit=# .............. sets a MINIMUM limit to filter input
       --maxlimit=# .............. sets a MAXIMUM limit to filter input
                                      above 2 line also set the filters
                                   for the transforms (--xf, see above)
       --capture ...... captures the exceptions from the above 2 limits
                                  to file 'stats.exceptions' in the cwd
       --conf=# ............... the confidence interval of the set; one
               of 80, 85, 90, 95, 99, 99.5, or 99.9.  If the confidence 
                 level is higher, the range of values will of course be 
                    larger. Assumes normal distribution - see warnings.
       --gfmt ............. output in Perl's 'general numeric notation'
                                                          (now default)
       --raw ............................... unformatted numeric output
       --quiet ...................... decrease amount of error messages
       --wide . writes the stats in 1-line, useful for some spreadsheet
                apps. Omits Quantiles.  Use cols/column/columns to view
                aligned columns. See example below..
       --dist=# .................. plots a distribution function where:
                          # = 1 = 1-liner distribution
                              2 = the std xy plot of the data
                              3 = both 1 liner & longer version
         (hint: pipe '--stdout' into 'feedgnuplot' for better plotting)
       --x=# ..................where # = an integer indicating the # of
                                               characters in the X axis
       --y=# ..................................... ditto for the y axis

       ## the following options specifically request the related values
       --sum ...........................  the total of all input values
       --num ............................... the number of input values
       --mean ............................... the average of all values
       --median ...................the approx value above & below which 
                                   there are the same number of values.
       --mode .........................  the most frequently seen value
       --nmodes ................................... the number of modes
       --min ............................. the minimum value in the set
       --max ............................. the maximum value in the set
       --range .............................................. max - min
       --var .................................. the variance of the set
       --sd .............................. the std deviation of the set
       --sem ..................... the std error of the mean of the set
       --skew ............. an estimate of the non-normality of the set
                               a normal distribution will have skew = 0
       --stdskew ............... skew normalized to the size of the set
       --kurt ........ kurtosis is a measure of the 'tailedness' of the 
                          distribution. A normal distribution will have 
                          kurtosis = 3.
       --popkurt ............. kurtosis normalized to the sample size & 
                          adjusted to be zero for a normal distribution
       --quantile=#  ........ sets the number of quantiles to calculate
       --qq  ..... prints the quantiles requested by the '--quantile=#'
                     option above. Outputs 3 values: the # of quantile, 
                      the index of the sorted array holding the values, 
                              and the value of the array at that index.
                     The quantile output can't easily be put in 1 line,
                     so it spills over into multiple lines.

Performance isn't stellar - it's Perl.  A rough estimate is about 128K
integers / sec with about 180bytes per integer RAM usage on a Thinkpad
T530 (Intel i5-3210M cpu @ 2.50GHz).  Feel free to convert it to C.

Example 1:
=======================================================================
Get only the sum and mean of the input data:

\$ cat 'file-of-numbers.txt' | stats --sum --mean
#         Sum           Mean  
 1.12697e+11     2.10727e+07  

Example 2:
=======================================================================
 Get only the sum of the input data:

\$ cat 'file-of-numbers.txt' | stats --sum
 1.12697e+11
(note that ONLY the result is printed; no headers)
 
Example 3:
=======================================================================
Print output wide: (as it appears in 'less')
\$ cat 'file-of-numbers.txt' | stats --wide | cols | less
#Std Dev = 0 or N <=3 or printing wide; Skipping Skewness, Std Skewness cal'n.
      1      0           1        2        3      4       5       6     7      8 ...
      2      #  (Quantiles      not  printed     in    wide  mode.)     -      - ...
      3   #Sum           N     Mean   Median   Mode  NModes     Min   Max  Range ...
      4 134418        3200  42.0056    41.98  41.94      22   38.33  45.7   7.37 ...
 
Example 4:
=======================================================================
Calculate the file size distribution in the current directory:

\$ ls -l | awk '{print \$5}'  | stats  --dist=2 --x=20 --y=10
Sum       5.72512e+08
Number    172
Mean      3.32856e+06
Median    13918
Mode      4096
NModes    11
Min       0
Max       2.84115e+08
Range     2.84115e+08
Variance  7.56924e+14
Std_Dev   2.75123e+07
SEM       2.09779e+06
95% Conf  -783109 to 7.44023e+06 **
          (for a normal distribution - see skew)
Skew      9.3415
          (skew = 0 for a symmetric dist)
Std_Skew  50.0155
Kurtosis  84.6439 **
          (K=3 for a normal dist)

** This assumes normal distribution , but since this distribution is 
   extremely skewed (see above Kurtosis value, and the plot below), the 
   confidence limits will be incorrect.
   (For a web page that calculates more descriptive stats, including 
   estimation of normality, see: <http://www.xuru.org/st/DS.asp>
   For specific plots or analyses, see: <http://www.wessa.net/desc.wasp>

Distribution
X BinSize 14205747.2
Y BinSize  18.7777777777778

YMax:169
      |*                   
      |                    
      |                    
      |                    
      |                    
      |                    
      |                    
      |                    
      |                    
      | *******************
      |--------------------
  X Min               X Max
   0.00        284114944.00 

If points are jammed at one end, use '--xf=ln' to spread them.

** This assumes normal distribution, but since this distribution is 
   extremely skewed, the confidence limits will be inaccurate.
   (for a web page that calculates more descriptive stats, including 
   estimation of normality, see: http://www.xuru.org/st/DS.asp
   for specific plots or analyses, see: http://www.wessa.net/desc.wasp
=======================================================================

Example 5:
=======================================================================
Calculate the file size distribution in the current directory with the 
suggested ln transform.  
NB: the stats are calculated with the transformed data.

\$ ls -l | awk '{print \$5}'  | stats --xf='ln' --dist=2 --x=20 --y=10
Sum       1597.38
Number    172
Mean      9.28707
Median    9.54093
Mode      8.31777
NModes    11
Min       0
Max       19.4649
Range     19.4649
Variance  12.8159
Std_Dev   3.57994
SEM       0.272968
95% Conf  8.75205 to 9.82208
          (for a normal distribution - see skew)
Skew      -0.297273
          (skew = 0 for a symmetric dist)
Std_Skew  -1.59164
Kurtosis  0.47218
          (K=3 for a normal dist)


Distribution
X BinSize 0.973244472331889
Y BinSize  2.66666666666667

YMax:24
      |           *        
      |         *          
      |        *           
      |            *       
      |     *    *         
      |                    
      |      **     *      
      |    *          *    
      |**            *     
      |  **            ****
      |--------------------
  X Min               X Max
   0.00               19.46 


 = Hint: while the above plotting function is better than nothing, 
 consider using the excellent 'feedgnuplot' to plot columns of numbers. 
 ex:  scut/cut [options] | feedgnuplot --lines --points 

   
 Feel free to add whatever additional calculations you want,
 but if you do and you think they might be of general use,
 let me know so I can add them to the original.  
 
 Help me make it better; send bug reports, suggestions back to the author:
 <hjmangalam\@gmail.com>
 

HELP

    # Pick the pager: the script-wide choice first, then $PAGER, then 'less'.
    my $viewer = $pager;
    if (!defined $viewer || !length $viewer) { $viewer = $ENV{'PAGER'}; }
    if (!defined $viewer || !length $viewer) { $viewer = "less"; }

    $HELPFILE = ".statshelpfile" . $$; # write a hidden helpfile
    my $help_fh;
    if (!open($help_fh, '>', $HELPFILE)) {
        # current directory not writable: just print the help
        print $LESSHELP;
        exit 0;
    }
    print $help_fh $LESSHELP;
    close $help_fh;

    my $status = system("$viewer $HELPFILE");
    unlink $HELPFILE; # and get rid of it asap

    # -1: the command could not be run at all; exit code 127: the shell did
    # not find it.  Either way the user has seen nothing yet.
    if ($status == -1 || ($status >> 8) == 127) {
        print $LESSHELP;
    }
    exit 0;
}

# Final top-level statement of the script: exit with status 0.
exit 0;