#!/usr/bin/env perl
use strict;
use warnings;

use Getopt::Long;
use File::Find::Rule;
use Pod::Usage;
use Text::TFIDF::Ngram;

# With no arguments at all there is nothing to analyze: show the brief usage.
pod2usage(1) unless @ARGV;

# Option defaults. GetOptions() stores each parsed switch in this hash under
# the option's name, replacing the default given here.
my %opts = (
    dir    => undef,    # corpus directories to search (repeatable, required)
    size   => 2,        # passed to the analyzer as "size"
    top    => 0,        # how many ngrams to list per file; 0 disables the report
    stop   => 1,        # passed to the analyzer as "stopwords"
    phrase => '',       # single phrase to report on; '' disables the report
    type   => 'txt',    # file extension of the corpus documents
    punc   => '[-!"#$%&()*+,.\/\\:;<=>?@\[\]^_`{|}~]',    # regex source string
    lc     => 0,        # passed to the analyzer as "lowercase"
);
GetOptions( \%opts,
    'dir=s@',
    'size=i',
    'top=i',
    'stop=i',
    'phrase=s',
    'type=s',
    'punc=s',
    'lc',
    'help|?',
    'man',
) or pod2usage(2);

# Serve the documentation switches before validating anything else, so that
# "--help" and "--man" work without also having to supply "--dir".
pod2usage(1) if $opts{help};
pod2usage( -exitval => 0, -verbose => 2 ) if $opts{man};

# Every analysis needs at least one corpus directory.
pod2usage(1) unless $opts{dir};

# Collect the files File::Find::Rule finds with the requested extension in
# each corpus directory, keeping the order in which the directories were given.
my @files = map { File::Find::Rule->file()->name( '*.' . $opts{type} )->in($_) }
    @{ $opts{dir} };

# Without any documents the reports below would silently print nothing, which
# hides a mistyped --dir or --type. Fail with an explanation instead.
die sprintf( "No '*.%s' files found in: %s\n", $opts{type}, join( ', ', @{ $opts{dir} } ) )
    unless @files;

# Build the analyzer over the whole corpus. A false --punc ('' or 0) is passed
# through as an empty string rather than as a compiled regular expression.
my $t = Text::TFIDF::Ngram->new(
    files       => \@files,
    size        => $opts{size},
    stopwords   => $opts{stop},
    punctuation => $opts{punc} ? qr/$opts{punc}/ : '',
    lowercase   => $opts{lc},
);

# If the top phrases are requested...
if ( $opts{top} ) {
    # Two-level hash: file name => { ngram => score }.
    my $scores_by_file = $t->tfidf_by_file();

    my $file_no = 0;

    for my $file ( sort keys %$scores_by_file ) {
        my $score_of = $scores_by_file->{$file};

        printf "%d. %s\n", ++$file_no, $file;

        # Highest score first. Equal scores are ordered by the ngram text so
        # that the listing (and which ngrams make the cut) is the same on
        # every run instead of following hash order.
        my @ranked = sort { $score_of->{$b} <=> $score_of->{$a} or $a cmp $b }
            keys %$score_of;

        my $rank = 0;
        for my $ngram (@ranked) {
            last if ++$rank > $opts{top};

            printf "\t%d. %s = %.10f\n", $rank, $ngram, $score_of->{$ngram};
        }
    }
}

# If an analysis of a single phrase is requested...
# Test the length rather than the truth of the option so that a phrase such
# as "0" is still analyzed.
if ( defined $opts{phrase} && length $opts{phrase} ) {
    my $phrase = $opts{phrase};

    # A score of 0 is a valid number but is false in Perl, so check for
    # definedness: only an undefined result is treated as "nothing to print".
    # NOTE(review): presumably idf() returns undef when the phrase is absent
    # from the corpus - confirm against Text::TFIDF::Ngram.
    my $idf = $t->idf($phrase);
    printf "Phrase: '%s', IDF = %.10f\n", $phrase, $idf
        if defined $idf;

    my $match_no = 0;
    for my $file ( @{ $t->files } ) {
        # Only list files whose term frequency for the phrase is non-zero.
        my $tf = $t->tf( $file, $phrase );
        next unless $tf;

        printf "%d. %s\n", ++$match_no, $file;

        # Reuse the value computed above instead of calling tf() again.
        printf "\tTF = %.10f\n", $tf;

        my $tfidf = $t->tfidf( $file, $phrase );
        printf "\tTF-IDF = %.10f\n", $tfidf
            if defined $tfidf;
    }
}

__END__

=head1 NAME

analyze - Analyze a corpus with the TF-IDF measure

=head1 SYNOPSIS

  perl analyze --dir=/some/corpus [options]

  perl analyze --dir=/Users/you/Documents/lit/inaugural --top=5
  perl analyze --dir=/Users/you/Documents/lit/inaugural --phrase='public good'
  perl analyze --dir=/Users/you/Documents/lit/inaugural --dir=/Users/you/Documents/lit/SOTU --top=5
  perl analyze --dir=/Users/you/Documents/lit/Shakespeare --size=3 --top=5
  perl analyze --dir=/Users/you/perl5/lib/site_perl/Music --size=1 --type=pm --punc=''

=head1 DESCRIPTION

This program analyzes the given corpus with the TF-IDF measure for ngrams.

=head1 OPTIONS

=over 4

=item --help

Brief help message.

=item --man

Full documentation.

=item --dir

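Directory containing the corpus of text documents.  This option is required
and may be given more than once to analyze several directories together.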

=item --size

Ngram phrase size.

Default: C<2>

=item --top

Show the top B<N> ngrams, ranked by TF-IDF, for each file in the corpus.

Default: C<0>

=item --stop

Constrain the ngrams by excluding stopwords.

Default: C<1>

=item --phrase

Search the corpus for the B<phrase> and show its TF-IDF values.

Default: C<''>

=item --type

Read corpus files of this file extension.

Default: C<txt>

=item --punc

A string defining a regular expression of characters to exclude from results.
Giving a C<''> or C<0> as the value for this will not exclude any characters.

Default: [-!"#$%&()*+,.\/\\:;<=>?@\[\]^_`{|}~]

Note that the single quote is not excluded by default.

=item --lc

Lower-case the results.

Default: C<0>

=back

=cut
