#!/usr/bin/perl

use utf8;
use strictures 2;
use autodie;

use version;
use Getopt::Long::Descriptive;
my $COPYRIGHT;
use Pod::Constants
	-trim => 1,
	'COPYRIGHT AND LICENSE' =>
	sub { ($COPYRIGHT) = s/C<< (.*) >>/$1/gr; $COPYRIGHT =~ s/Â©/©/g };
use Path::Tiny;
use Try::Tiny;
use String::Escape qw(unbackslash);
use List::Util 1.45 qw(uniq uniqstr);
use Sort::Key::Multi 1.25 qw(rs_keysort rus_keysort);

use App::Licensecheck;

=head1 NAME

licensecheck - simple license checker for source files

=head1 VERSION

Version v3.0.36

=cut

our $VERSION = version->declare("v3.0.36");

binmode STDOUT, ':utf8';

my $progname = path($0)->basename;

=head1 SYNOPSIS

B<licensecheck> B<--help>|B<--version>

B<licensecheck> [B<options>...] F<path> [F<path>...]

=head1 DESCRIPTION

B<licensecheck> attempts to determine the license that applies to each file
passed to it, by searching the start of the file for text belonging to
various licenses.

If any of the arguments passed are directories, B<licensecheck> will add
the files contained within to the list of files to process.

=cut

my ( $opt, $usage ) = describe_options(
	'%c %o path [path...]',
	[],
	[   'check|c=s', 'regular expression of files to include',
		{ default => 'common source files' }
	],
	[   'ignore|i=s', 'regular expression of files to skip',
		{ default => 'some backup and VCS files' }
	],
	[ 'recursive|r', 'traverse directories recursively' ],
	[],
	[   'lines|l=i',
		'number of lines to parse from top of each file; implies optimistic search including only first cluster of detected copyrights/licenses; set to 0 to parse the whole file (and ignore --tail)',
		{ default => 60 }
	],
	[   'tail=i',
		'number of bytes to parse from bottom of each file; set to 0 to avoid parsing from end of file',
		{ default => 5000 }
	],
	[   'encoding|e=s',
		'try decode source files as specified codec, with latin1 as fallback; by default no decoding is done'
	],
	[],
	[ 'verbose!',  'add header of each file to license information' ],
	[ 'copyright', 'add copyright statements to license information' ],
	[   'skipped|s',
		'print skipped files to STDERR, i.e. files matching --check option but not --ignore option.'
	],
	[ 'deb-fmt!', 'use Debian license shortnames (based on SPDX keywords)' ],
	[   'machine|m',
		'print license information as TAB-separated fields, for processing with line-oriented tools like awk and sort (NB! the --verbose option will kill the readability)'
	],
	[   'deb-machine!',
		'print license information like a Debian copyright file; implies --copyright and --deb-fmt'
	],
	[   'list-delimiter=s',
		'printf-string used between multiple plain list items in Debian copyright file',
		{ default => '\n ' }
	],
	[   'rfc822-delimiter=s',
		'printf-string used between multiple RFC822-style items in Debian copyright file',
		{ default => '\n  ' }
	],
	[   'copyright-delimiter=s',
		'printf-string used between years and owners in Debian copyright file',
		{ default => ', ' }
	],
	[   'merge-licenses!',
		'merge same-licensed files in Debian copyright file'
	],
	[ 'text|t',         '', { hidden => 1 } ],
	[ 'noconf|no-conf', '', { hidden => 1 } ],
	[],
	[ 'help|h', 'print help message and exit', { shortcircuit => 1 } ],
	[   'version|v', 'print version and copyright information and exit',
		{ shortcircuit => 1 }
	],
	{   getopt_conf   => [qw(gnu_getopt)],
		show_defaults => 1,
	},
);

print( $usage->text ), exit if $opt->help;
if ( $opt->version ) { version(); exit 0; }

if ( $opt->text ) {
	warn "$0 warning: option -text is deprecated\n";   # remove -text end 2015
}
if ( $opt->noconf ) {
	warn "$0 warning: option --no-conf is deprecated\n";    # No-op
}

print( "$progname: No paths provided.\n", $usage->leader_text ), exit 2
	unless @ARGV;

my $app = App::Licensecheck->new(
	check_regex  => $opt->check,
	ignore_regex => $opt->ignore,
	recursive    => $opt->recursive,
	lines        => $opt->lines,
	tail         => $opt->tail,
	verbose      => $opt->verbose,
	skipped      => $opt->skipped,
	deb_fmt      => $opt->deb_fmt // $opt->deb_machine,
	deb_machine  => $opt->deb_machine,
);

my %patternfiles;
my %patternownerlines;
my %patternlicense;

foreach my $file ( $app->find(@ARGV) ) {
	my ( $license, $copyright );

	if ( $opt->encoding ) {
		try {
			$app->encoding( $opt->encoding );
			( $license, $copyright ) = $app->parse($file);
		}
		catch {
			if (/does not map to Unicode/) {
				print
					"file $file cannot be read with $opt->encoding; encoding, will try latin-1:\n$_"
					if $opt->verbose;
				try {
					$app->encoding('latin-1');
					( $license, $copyright ) = $app->parse($file);
				}
				catch {
					if (/does not map to Unicode/) {
						print
							"file $file cannot be read with latin-1; encoding, will try binary:\n$_"
							if $opt->verbose;
						( $license, $copyright ) = $app->parse($file);
					}
					else {
						die $_;
					}
				}
			}
			else {
				die $_;
			}
		}
	}
	else {
		( $license, $copyright ) = $app->parse($file);
	}

	# drop duplicates
	my @copyrights = uniq rs_keysort {$_} split /^/, $copyright;
	chomp @copyrights;

	if ( $opt->deb_machine ) {
		my @ownerlines_clean        = ();
		my %owneryears              = ();
		my $owneryears_seem_correct = 1;
		for my $ownerline (@copyrights) {
			my ( $owneryear, $owner )
				= $ownerline =~ /^(\d{4}(?:(?:-|, )\d{4})*)? ?(\S.*)?/;
			$owneryears_seem_correct = 0 unless ($owneryear);
			$owner =~ s/,?\s+All Rights Reserved\.?//gi if ($owner);
			push @ownerlines_clean,
				join unbackslash( $opt->copyright_delimiter, ),
				$owneryear || (), $owner || ();
			push @{ $owneryears{ $owner || '' } }, $owneryear;
		}
		my @owners = sort keys %owneryears;
		@owners = ()
			if ( $opt->merge_licenses and $owneryears_seem_correct );
		my $pattern = join( "\n", $license, @owners );
		push @{ $patternfiles{"$pattern"} },      $file;
		push @{ $patternownerlines{"$pattern"} }, @ownerlines_clean;
		$patternlicense{"$pattern"} = $license;
	}
	elsif ( $opt->machine ) {
		print "$file\t$license";
		print "\t" . ( join( " / ", @copyrights ) or "*No copyright*" )
			if $opt->copyright;
		print "\n";
	}
	else {
		print "$file: ";
		print "*No copyright* " unless @copyrights;
		print $license . "\n";
		print "  [Copyright: " . join( " / ", @copyrights ) . "]\n"
			if @copyrights and $opt->copyright;
		print "\n" if $opt->copyright;
	}
}

if ( $opt->deb_machine ) {
	print <<'HEADER';
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: FIXME
Upstream-Contact: FIXME
Source: FIXME
Disclaimer: Autogenerated by licensecheck

HEADER
	foreach my $pattern (
		rus_keysort { int @{ $patternfiles{$_} }, $_ }
		keys %patternfiles
		)
	{
		my @ownerlines_unique
			= uniqstr sort @{ $patternownerlines{$pattern} };
		@ownerlines_unique = ('NONE') unless (@ownerlines_unique);
		print "Files: ",
			join(
			unbackslash( $opt->list_delimiter, ),
			sort @{ $patternfiles{$pattern} }
			),
			"\n";
		print "Copyright: ",
			join(
			unbackslash( $opt->rfc822_delimiter, ),
			@ownerlines_unique
			),
			"\n";
		print "License: $patternlicense{$pattern}\n FIXME\n\n";
	}
}

sub version
{
	print <<"EOF";
This is $progname version $VERSION

$COPYRIGHT
EOF
}

=head1 SEE ALSO

Other similar tools exist.

Here is a list of known tools also command-line based and general-purpose:

=over 16

=item L<copyright-update|https://github.com/jaalto/project--copyright-update>

Written in Perl.

=item L<debmake|http://anonscm.debian.org/git/collab-maint/debmake.git>

Written in Python.

Specific to Debian packages.

=item L<decopy|https://anonscm.debian.org/git/collab-maint/decopy.git>

Written in Python.

)item L<Licensee|http://ben.balter.com/licensee/>

Written in Ruby.

=item L<LicenseFinder|https://github.com/pivotal/LicenseFinder>

Written in Ruby.

=item L<ninka|http://ninka.turingmachine.org/>

Written in C++.

Used in L<FOSSology|http://fossology.org/>
(along with Monk and Nomos apparently unavailable as standalone command-line tools).

=item L<ripper|https://github.com/odeke-em/ripper>

Written in Go.

=item L<scancode-toolkit|https://github.com/nexB/scancode-toolkit>

Written in Python.

=back

=encoding UTF-8

=head1 AUTHOR

Jonas Smedegaard C<< <dr@jones.dk> >>

=head1 COPYRIGHT AND LICENSE

This program is based on the script "licensecheck" from the KDE SDK,
originally introduced by Stefan Westerfeld C<< <stefan@space.twc.de> >>.

  Copyright © 2007, 2008 Adam D. Barratt

  Copyright © 2012 Francesco Poli

  Copyright © 2016 Jonas Smedegaard

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.

=cut
