#!/usr/bin/perl -w
#
# $Id: dirinfo.pl,v 1.8 2003/01/13 05:28:42 jmates Exp $
#
# Copyright (c) 2000-2002, Jeremy Mates.  This script is free
# software; you can redistribute it and/or modify it under the same
# terms as Perl itself.
#
# Run perldoc(1) on this file for additional documentation.
#
######################################################################
#
# REQUIREMENTS

require 5;

use strict;

######################################################################
#
# MODULES

use Carp;           # to die for
use Getopt::Std;    # command line option handling

use File::Basename; # for getting extensions
use File::Spec;     # for platform-independant file path ops

######################################################################
#
# VARIABLES

my $VERSION;
($VERSION = '$Revision: 1.8 $ ') =~ s/[^0-9.]//g;

# where everything is stored, options, array lookup table
my (%d_info, %opts, %di);

my $path;           # where we are, so to speak

# hash that holds array positions for various elements in the %d_info hash o' arrays
# probably should rearrange the array positions
$di{'cnt_d'} = 0;    # sub-dir counter
$di{'cnt_f'} = 1;    # sub-file counter
$di{'cnt_e'} = 2;    # sub-file-with-extensions counter
$di{'cnt_o'} =
  3;    # not directory or file counter (e.g. links, character devices)
$di{'ssize'} = 4;     # sum of file sizes in dir
$di{'size'}  = 5;     # size of largest file in dir
$di{'fsize'} = 6;     # name of largest file in dir
$di{'min'}   = 7;     # least recently modified file
$di{'fmin'}  = 8;     # ... and it's path
$di{'max'}   = 9;     # most recently modified file
$di{'fmax'}  = 10;    # ... and it's path
$di{'mean'}  = 11;    # mean modification date
$di{'stdev'} = 12;    # standard deviation of mod dates
$di{'wtref'} = 13;    # reference to array of wtimes for the directory
$di{'szref'} = 14;    # ref to array of file sizes in directory
$di{'szmn'}  = 15;    # mean file size
$di{'szstd'} = 16;    # ... std deviation of file sizes

# controls the default output format of the humanizing routine that
# makes file sizes readable by a human.
my %global_prefs = (

    # include decimals in output? (e.g. 25.8 K vs. 26 K)
    'decimal' => 1,

    # include .0 in decmail output?
    'decimal_zero' => 1,

    # what to divide file sizes down by
    'factor' => 1024,

    # percentage above which will be bumped up
    # (e.g. 999 bytes -> 1 K as within 5% of 1024)
    # set to undef to turn off
    'fudge' => 0.95,

    # lengths above which decimals will not be included
    # for better readability
    'max_human_length' => 2,

    # list of suffixes for human readable output
    'suffix' => ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'],
);

######################################################################
#
# MAIN

# parse command-line options
getopts('h?lmoas:p:z:', \%opts);

# help blab if they want it
if (exists $opts{'h'} or exists $opts{'?'}) {
    print <<"HELP";
Usage: $0 [options] dir1 dir2 .. dirN

A recursive directory information gleaner.

Options for version $VERSION:
  -h/-?    Display this message

  -m       Display results in ugly format (raw data using TSV)
  -o       Display results in a pretty, human readable format (default)
  -a       Include averages, std. dev info (requires more memory)

  -l       Limit search to current directory only (a la CVS :)
  -s n     Summarize at depth n (where n is 0 or higher) in file tree;
             default is 0, summarize at the level of each passed dir.

  -H       Do not humanize the output file sizes. (default is to)

  -p expr  Perl expression describing what directories to prune out
  -z expr  As -p except describes what files to exclude from stats

Run perldoc(1) on this script for more information.

HELP
    exit;
}

# Directories to examine either on the command line or riding
# in on an input stream
chomp(@ARGV = <STDIN>) unless @ARGV;

# at what depth of recursion should we start adding sub directory
# results to the "parent" directory instead of individually
my $depth = 0;
$depth = $opts{'s'} if exists $opts{'s'};

for $path (@ARGV) {
    parsedir($path, 0, $path);
}

# data we want should now be lumbering around in %d_info hash...

# do the machine format if called for
if (exists $opts{'m'}) {
    for (sort { $a cmp $b || length $a <=> length $b } keys %d_info) {
        print $_, "\t", "\n";

        for (@{$d_info{$_}}) {
            unless (ref $_) {
                print $_, "\t";
            } else {
                print join (",", @$_), "\t";
            }
        }
        print "\n";
    }
}

# do the human format?
unless (exists $opts{'m'}) {
    local $^W = 0;    # to hide 'Use of uninitialized value' errors
    for (sort { $a cmp $b || length $a <=> length $b } keys %d_info) {
        my $ext_cnt   = $d_info{$_}->[$di{'cnt_e'}];
        my $file_cnt  = $d_info{$_}->[$di{'cnt_f'}];
        my $dir_cnt   = $d_info{$_}->[$di{'cnt_d'}];
        my $other_cnt = $d_info{$_}->[$di{'cnt_o'}];

        my $items_cnt = $file_cnt + $dir_cnt + $other_cnt;

        my $ext_pcnt;

        if ($file_cnt != 0 || $dir_cnt != 0 || $other_cnt != 0) {

            print $_, ":\n";
            print '  Directories:  ', $dir_cnt, "\n" unless $dir_cnt == 0;
            print '  Files:        ', $file_cnt unless $file_cnt == 0;

            if ($file_cnt > 0) {
                $ext_pcnt =
                  sprintf("%.1f", (($ext_cnt / $file_cnt) * 100)) . "%";
            } else {
                $ext_pcnt = '0%';
            }
            $ext_cnt = 0 unless defined $ext_cnt;

            print ' (', $ext_cnt, ' have extensions - ', $ext_pcnt, ")\n"
              unless $file_cnt == 0;
            print '  Other:        ', $other_cnt, "\n" unless $other_cnt == 0;

            print '  Wasted:       ', humanize($d_info{$_}->[$di{'ssize'}]),
              "\n"
              unless $d_info{$_}->[$di{'ssize'}] == 0;
            print '  Mean Size:    ',
              humanize(sprintf("%.f", $d_info{$_}->[$di{'szmn'}])), "\n"
              unless $items_cnt < 2 || !defined $d_info{$_}->[$di{'szmn'}];
            print '  Std.Dev. Sz.: ',
              humanize(sprintf("%.f", $d_info{$_}->[$di{'szstd'}])), "\n"
              unless $d_info{$_}->[$di{'szstd'}] == 0;
            print '  Largest File: ', humanize($d_info{$_}->[$di{'size'}]), " ",
              $d_info{$_}->[$di{'fsize'}], "\n"
              unless $d_info{$_}->[$di{'size'}] == 0;

            print '  Oldest File:  ', get_date($d_info{$_}->[$di{'min'}]), "  ",
              $d_info{$_}->[$di{'fmin'}], "\n";
            print '  Newest File:  ', get_date($d_info{$_}->[$di{'max'}]), "  ",
              $d_info{$_}->[$di{'fmax'}], "\n"
              unless $d_info{$_}->[$di{'fmin'}] eq $d_info{$_}->[$di{'fmax'}];
            print '  Mean Date:    ', get_date($d_info{$_}->[$di{'mean'}]), "\n"
              unless $items_cnt < 2 || !defined $d_info{$_}->[$di{'mean'}];
            print '  Deviation:    ',
              get_time_diff($d_info{$_}->[$di{'stdev'}]), ' (',
              sprintf("%.f", $d_info{$_}->[$di{'stdev'}]), ")\n"
              unless $d_info{$_}->[$di{'stdev'}] == 0;

        } else {
            print $_, " (Empty Directory) \n";
        }
        print "\n";
    }
}

exit;

######################################################################
#
# SUBROUTINES

sub parsedir {
    my $dir      = shift;    # where we should be poking around
    my $level    = shift;    # to what depth we have recursed
    my $previous = shift;    # whence we have come

    #    my $item;   # current thingy dealing with (for loop later)
    my $target;              # d_info directory to target for new info

    # Statistical Stuff
#    my @wtimes; # temp array for time-last-modified of each file in a folder
    #    my($min_time, $min_time_file);
    #    my($max_time, $max_time_file);
    #    my($max_size, $max_size_file);

    #    warn ' ' x $level, $dir, "\n";

    # figure out what part of %d_info to shove our info into based on
    # depth limit
    if ($level > $depth) {
        $target = $previous;
    } else {
        $target = $dir;
    }

    #    warn "$dir -> $target (level: $level; depth: $depth)\n"; #DBG

    opendir DIR, $dir or warn 'Could not open ', $dir, ' (', $!, ")\n";
    DIRITEM: for (readdir DIR) {
        my $pti = File::Spec->catfile($dir, $_);

        next DIRITEM if m/^\.{1,2}$/;    # prune out . and .. "files"
             # (otherwise, script will happily "descend" into that '.' dir!)

        #	warn "Dealing with $pti\n"; #DBG

        if (-d $pti) {
            local $^W = 0;    # hide 'Use of uninitialized value' warnings

            # see whether this dir needs to be pruned from the search
            if (exists $opts{'p'}) {
                my $results = eval "return 1 if( " . $opts{'p'} . " );";

                if ($@) {
                    chomp($@);
                    die "Prune error: ", $@;    # croak on errors
                }

                if ($results) {

                    #		    warn "Pruned $pti\n";
                    next DIRITEM;
                }
            }

            # (optionally) recuse on downwards
            unless (-l $pti) {
                parsedir($pti, $level + 1, $target) unless exists $opts{'l'};
            } else {

                # directory links evil. (well, according to this script)
                warn
                  "Directory link $pti skipped to avoid possible infinite loop"
                  if exists $opts{'v'};
                $d_info{$target}->[$di{'cnt_o'}]++;
            }

            # collect some stats on this directory...
            my $dmtime = (stat(_))[9];

            if (($dmtime < $d_info{$target}->[$di{'min'}])
                || (!defined $d_info{$target}->[$di{'min'}])) {

#		warn "Oldest changed to $pti $dmtime vs. $d_info{$target}->[$di{'min'}]\n"; #DBG
                $d_info{$target}->[$di{'min'}]  = $dmtime;
                $d_info{$target}->[$di{'fmin'}] = $pti;
            }
            if ($dmtime > $d_info{$target}->[$di{'max'}]) {
                $d_info{$target}->[$di{'max'}]  = $dmtime;
                $d_info{$target}->[$di{'fmax'}] = $pti;
            }

            # add last write time to array
            push @{$d_info{$target}->[$di{'wtref'}]}, $dmtime
              if exists $opts{'a'};

            # increment dir counter
            $d_info{$target}->[$di{'cnt_d'}]++;

        } elsif (-f $pti) {
            local $^W = 0;    # hide 'Use of uninitialized value' warnings

            # see whether this file is to be skipped
            if (exists $opts{'z'}) {
                my $results = eval "return 1 if( " . $opts{'z'} . " );";

                if ($@) {
                    chomp($@);
                    die "Skip error: ", $@;    # croak on errors
                }

                if ($results) {

                    #		    warn "Pruned $pti\n";
                    next DIRITEM;
                }
            }

            my ($fsize, $fmtime) = (stat(_))[7, 9];

            # do stuff with the file size attribute
            if ($fsize > $d_info{$target}->[$di{'size'}]) {
                $d_info{$target}->[$di{'size'}]  = $fsize;
                $d_info{$target}->[$di{'fsize'}] = $pti;
            }
            push @{$d_info{$target}->[$di{'szref'}]}, $fsize
              if exists $opts{'a'};
            $d_info{$target}->[$di{'ssize'}] += $fsize;

            # do stuff with modification time attribute
            if (($fmtime < $d_info{$target}->[$di{'min'}])
                || (!defined $d_info{$target}->[$di{'min'}])) {

#		warn "Oldest changed to $pti $fmtime vs. $d_info{$target}->[$di{'min'}]\n"; #DBG
                $d_info{$target}->[$di{'min'}]  = $fmtime;
                $d_info{$target}->[$di{'fmin'}] = $pti;
            }
            if ($fmtime > $d_info{$target}->[$di{'max'}]) {
                $d_info{$target}->[$di{'max'}]  = $fmtime;
                $d_info{$target}->[$di{'fmax'}] = $pti;
            }
            push @{$d_info{$target}->[$di{'wtref'}]}, $fmtime
              if exists $opts{'a'};

            # and keep a count going...
            $d_info{$target}->[$di{'cnt_f'}]++;
            $d_info{$target}->[$di{'cnt_e'}]++ if get_ext($pti);

            #           print ' ' x ($level +1), $pti, " $fmtime\n";
        } else {
            local $^W = 0;    # hide 'Use of uninitialized value' warnings

            $d_info{$target}->[$di{'cnt_o'}]++;

            # links don't return mod times, so exclude them from stats!
        }
    }
    closedir DIR;

    # now, gain some statistical info from the wtimes array
    # but only if we're about to drop out of a directory
    # we've been targeting due to depth summarization
    if ($target eq $dir) {

        # any mod times worth mentioning?
        if ($d_info{$target}->[$di{'wtref'}]) {
            $d_info{$target}->[$di{'mean'}] =
              mean($d_info{$target}->[$di{'wtref'}]);

#	    warn "Got mean of $d_info{$target}->[$di{'mean'}] for $target\n"; # DBG
#    warn join("\n$.", sort { $a <=> $b } (@{$d_info{$target}->[$di{'wtref'}]})), "\n" if $target =~ m/p1$/; #DBG

            $d_info{$target}->[$di{'stdev'}] = standard_deviation(
                $d_info{$target}->[$di{'wtref'}],
                $d_info{$target}->[$di{'mean'}]
              )
              if $d_info{$target}->[$di{'mean'}];

#	    warn "Got deviation of  $d_info{$target}->[$di{'stdev'}]\n"; #DBG
        }

        # any file sizes?
        if ($d_info{$target}->[$di{'szref'}]) {
            $d_info{$target}->[$di{'szmn'}] =
              mean($d_info{$target}->[$di{'szref'}]);
            $d_info{$target}->[$di{'szstd'}] = standard_deviation(
                $d_info{$target}->[$di{'szref'}],
                $d_info{$target}->[$di{'szmn'}]
              )
              if $d_info{$target}->[$di{'szmn'}];
        }
    }
}

# Uses the standard File::Basename module to return the extension
# of a passed file path, if any.  Optionally, you can specify the
# regex to use to match the extension as a second argument.  Pass
# '\..*' to match stuff like ".tar.gz" (default is ".gz" only).
sub get_ext {
    my $ext_path = shift;

    # (optional) pass '\..*' to match stuff like '.tar.gz'
    # my regex here matches ".gz" (but not dot files without
    # additional dot's :)
    my $ext_pattern = shift || '.\.[^.]+';

    return (fileparse($ext_path, $ext_pattern))[2];
}

# $mean = mean(\@array) computes the mean of an array of numbers.
#
sub mean {
    my ($arrayref) = shift;
    my $result;
    for (@$arrayref) { $result += $_ }
    return $result / @$arrayref;
}

# $sd = standard_deviation_data(\@array) computes the standard
# deviation of an array of numbers.
#
sub standard_deviation {
    my $arrayref = shift;
    my $mean     = shift;

    # JAM there is a faster (6%) algorithm in Mastering Algorithms
    # with perl, but it chokes on certain directories where the
    # epoch times are all within a second or two of one another
    return sqrt(mean([map (($_ - $mean)**2, @$arrayref)]));
}

# simple routine that puts commas in passed numbers
sub commify {
    my $text = reverse $_[0];
    $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
    return scalar reverse $text;
}

# returns a custom date tag for the human-readable format
sub get_date {
    my $thingy = shift;
    my ($sec, $min, $hour, $mday, $mon, $year, $wday) = localtime($thingy);
    my $month     = (qw:Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec:)[$mon];
    my $dayofweek = (qw:Sun Mon Tue Wed Thu Fri Sat:)[$wday];
    $mon++;
    $year += 1900;

    return
      sprintf("%04d $month %02d %02d:%02d:%02d $dayofweek", $year, $mday, $hour,
        $min, $sec);
}

# retuns a delta time value (for the standard deviation)
sub get_time_diff {
    my $difference = shift;
    $difference = int($difference);

    my $seconds = $difference % 60;
    $difference = ($difference - $seconds) / 60;
    my $minutes = $difference % 60;
    $difference = ($difference - $minutes) / 60;
    my $hours = $difference % 24;
    $difference = ($difference - $hours) / 24;
    my $days  = $difference % 7;
    my $weeks = ($difference - $days) / 7;

    # probably a better way to do this!
    my $temp = ($weeks) ? "$weeks weeks, " : '';
    $temp .= ($days) ? "$days days, " : '';

    return $temp . sprintf("%02d:%02d:%02d", $hours, $minutes, $seconds);
}

# Inspired from GNU's df -h output, which fixes 133456345 bytes
# to be something human readable.
#
# takes a number, returns formatted string.  Also takes optional
# hash containing various defaults that affect output style.
sub humanize {
    my $num = shift;    # could also take a array ref or hash ref to parse thru?

    return $num if exists $opts{'H'};    # lazy hack. don't do this.

    my %prefs = @_;

    # inherit global prefs, but give preference to user supplied ones
    unless (keys %prefs) {
        %prefs = %global_prefs;
    } else {

        # benchmarking w/ 5.6.0 on Linux PPC & i386 showed this next
        # faster than direct merge method (p. 145 Perl Cookbook)
        while (my ($k, $v) = each(%global_prefs)) {
            $prefs{$k} = $v unless exists $prefs{$k};
        }
    }

    # some local working variables
    my $count  = 0;
    my $prefix = '';
    my $tmp    = '';

    # handle negatives
    if ($num < 0) {
        $num    = abs $num;
        $prefix = '-';
    }

    # reduce number to something readable by factor specified	
    while ($num > $prefs{'factor'}) {
        $num /= $prefs{'factor'};
        $count++;
    }

    # optionally fudge "near" values up to next higher level
    if (defined $prefs{'fudge'}) {
        if ($num > ($prefs{'fudge'} * $prefs{'factor'})) {
            $count++;
            $num /= $prefs{'factor'};
        }
    }

    # no .[1-9] decimal on longer numbers for easier reading
    # only show decimal if prefs say so
    if (length sprintf("%.f", $num) > $prefs{'max_human_length'}
        || !$prefs{'decimal'}) {

        $tmp = sprintf("%.0f", $num);

    } else {
        $tmp = sprintf("%.1f", $num);

        # optionally hack trailing .0 as is not needed
        $tmp =~ s/\.0$// unless $prefs{'decimal_zero'};
    }

    # return number with proper style applied
    return $prefix . $tmp . $prefs{'suffix'}->[$count];
}

######################################################################
#
# DOCUMENTATION

=head1 NAME

dirinfo.pl - summarizes directory information.

=head1 SYNOPSIS

A summary of file and directory information for your home directory:

  $ dirinfo.pl ~/

A one level-deep report for the root filesystem, skipping over nfs:

  # dirinfo.pl -p 'm!^/nfs!' -s 1 /

=head1 DESCRIPTION

This script summarizes and prints out various information about a
directory, including space consumed by the directory tree, largest
file, oldest file, etc.

It's a bit slow, and the code is horrible as I wrote it a while
back. :)

=head1 USAGE

  $ dirinfo.pl [options] dir1 [dir2 .. dirN]

Directories will be read from STDIN if they are ommited from the
command line.  See L<"OPTIONS"> for details on the command line
switches.

=head1 OPTIONS

The following command line options are available:

=over 4

=item B<-h/-?>

Display a brief little help blarb.

=item B<-v>

Become a bit more chatty about the whole process.  (Currently only
turns on annoying link-skipping warnings. :)

=item B<-m>

Display results in ugly format (raw data using TSV).

=item B<-o>

Display results in a quasi-human readable format (default).

=item B<-a>

Include averages, std. dev info.

=item B<-l>

Limit search to current directory only.

=item B<-s n>

Summarize at depth n (where n is 0 or higher) in file tree; default is
0, summarize at the level of each passed dir.

=item B<-H>

Do not humanize the output file sizes.  Default is to convert ungainly
numbers such as 406502769 to human readable numbers such as 388M.

=item B<-p expr>

Perl expression that will result in the current directory (stored in
$_) being pruned out of the tree.  Use this to skip "dot directories,"
for example:

  -p 'm/^\../'

=item B<-z expr>

As -p except describes what files to exclude from stats.

=back

=head1 EXAMPLES

None yet.

=head1 BUGS

=head2 Reporting Bugs

Newer versions of this script may be available from:

http://sial.org/code/perl/

If the bug is in the latest version, send a report to the author.
Patches that fix problems or add new features are welcome.

=head2 Known Issues

No known bugs.

=head1 TODO

Rewrite chicken scratch code in a better format. :)

=head1 SEE ALSO

perl(1)

=head1 AUTHOR

Jeremy Mates, http://sial.org/contact/

=head1 COPYRIGHT

Copyright (c) 2000-2002, Jeremy Mates.  This script is free
software; you can redistribute it and/or modify it under the same
terms as Perl itself.

=head1 VERSION

$Id: dirinfo.pl,v 1.8 2003/01/13 05:28:42 jmates Exp $

=cut

