use 5.008001;
use strict;
use warnings;
use utf8;
use re (qw/eval/);

package String::Copyright;

=encoding UTF-8

=head1 NAME

String::Copyright - Representation of text-based copyright statements

=head1 VERSION

Version 0.002001

=cut

our $VERSION = '0.002001';

# Dependencies
use parent 'Exporter::Tiny';
use Carp ();

our @EXPORT = qw/copyright/;

use constant {
	PLAINTEXT => 0,
	BLOCKS    => 1,
};

use overload (

#	q{@{}}   => sub { ${$_[0]->[BLOCKS]} },
#	q{@{}}   => sub { ( $_[0]->[BLOCKS] ) },
#	q{@{}}   => sub { $_[0]->[BLOCKS] },
#	q{0+}    => sub { 0+$_[0]->[BLOCKS] },
	q{""}    => sub { $_[0]->_compose },
	fallback => 1,
);

=head1 SYNOPSIS

    use String::Copyright;

    my $copyright = copyright(<<'END');
    copr. © 1999,2000 Foo Barbaz <fb@acme.corp> and Acme Corp.
    Copyright (c) 2001,2004 Foo (work address) <foo@zorg.corp>
    Copyright 2003, Foo B. and friends
    © 2000, 2002 Foo Barbaz <foo@bar.baz>
    END

    print $copyright;

    # Copyright 1999-2000 Foo Barbaz <fb@acme.com> and Acme Corp.
    # Copyright 2000, 2002 Foo Barbaz and Acme Corp.
    # Copyright 2001, 2004 Foo (work address) <foo@zorg.org>
    # Copyright 2003 Foo B. and friends

    print $copyright->normalize(
      alias  => {
        [ 'foo@bar.baz' => [ 'fb@acme.com', 'foo@zorg.org'] ] }
      mangle => {
        [ 's/Foo Barbaz\K(?= and .*)$/ <foo@bar.baz>/' ] }
    );

    # Copyright 1999-2000, 2002-2003 Acme Corp.
    # Copyright 1999-2004 Foo Barbaz <foo@bar.baz>
    # Copyright 2003 Foo B. and friends

=head1 DESCRIPTION

L<String::Copyright> Parses common styles of copyright statements
and serializes in normalized format.

=head1 OPTIONS

Options can be set as an argument to the 'use' statement.

    use String::Copyright { threshold_after => 5 };

=head2 threshold, threshold_before, threshold_after

Stop parsing after this many lines whithout copyright information,
before or after having found any copyright information at all.
C<threshold> sets both C<threshold_before> and C<threshold_after>.

By default unset: All lines are parsed.

=head1 FUNCTIONS

Exports one function: C<copyright>.
This module uses L<Exporter::Tiny> to export functions,
which allows for flexible import options;
see the L<Exporter::Tiny> documentation for details.

=cut

my $blank_re          = qr/[ \t]/;
my $blank_or_break_re = qr/$blank_re+|$blank_re*\n$blank_re*/;
my $sign_re = qr/copyright(?:-holders?)?|copr\.|[©⒞Ⓒⓒ🄒🄫🅒]/i;
my $pseudo_sign_re = qr/\(c\)/i;

my $year_re = qr/\b[0-9]{4}\b/;
my $comma_re
	= qr/$blank_re*,$blank_or_break_re|$blank_or_break_re,$blank_re*|$blank_re*,$blank_re*|$blank_or_break_re|$blank_or_break_re/;
my $dash_re
	= qr/$blank_re*[-˗‐‑‒–—―⁃−﹣－]$blank_or_break_re*/;
my $owner_initial_re = qr/[^\s!\"#$%&'()*+,.\/:;<=>?@[\\\]^_`{|}~]/;

# TODO: test if \K or non-backref beneficial on perl >= 5.10
#my $comma_tidy_re = qr/$year_re\K$comma_re(?=$year_re)/;
#my $comma_tidy_re = qr/($year_re)$comma_re(?=$year_re)/;
my $comma_tidy_re = qr/(?<=$year_re)$comma_re(?=$year_re)/;
my $comma_tidied  = ', ';

# TODO: test if \K beneficial on perl >= 5.10
#my $year_begin_re = qr/(?:\A|(?<!-))($year_re)\K/;
our $y;
my $y_save_re   = qr/(?:(?<=[^-]($year_re))|(?<=\A($year_re)))(?{$y=$^N})/;
my $y_next_re   = qr/(??{++$y})/;
my $y_future_re = qr/(??{if($y<=$^N){$y=$^N;'';}else{'XXXX';}})/;
my $years_tidy_re
	= qr/$y_save_re(?:(?:$dash_re|$comma_tidied)($y_next_re)\b|-($year_re)$y_future_re\b)+/;

# this should cause *no* false positives, and stop-chars therefore
# exclude e.g. email address building blocks; tested against the code
# corpus at https://codesearch.debian.net/ (tricky: its RE2 engine lacks
# support for negative groups) using searches like these:
# (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\w
# (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\b[-_@]
# (?im)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)[^ $]
my $chatter_re
	= qr/copyright$blank_or_break_re(?:(?:claim|holder|info|information|law|notice|owner|ownership|statement|string)s?|and|in|is|on|to)(?:$|@\W|[^a-zA-Z0-9@_-])/im;

my $signs_re
	= qr/(?:$sign_re|(?:^|$blank_re)$pseudo_sign_re)(?:$blank_re+(?:$sign_re|$pseudo_sign_re))*/m;
my $years_re = qr/((?:$year_re(?:(?:$dash_re|$comma_re)$year_re)*)?)/;
my $owners_re
	= qr/((?:\bby$blank_or_break_re)?((?:$owner_initial_re\S*(?:$blank_re*\S+)*)?))/;

my $sign_years_owners_re
	= qr/(?:$chatter_re.*|$signs_re(?::$blank_or_break_re|$comma_re)$years_re$comma_re?$owners_re|(?:\n|\z))/s;

sub _generate_copyright
{
	my ( $class, $name, $args, $globals ) = @_;

	return sub {
		my $copyright = shift;

		Carp::croak("String::Copyright strings require defined parts")
			unless 1 + @_ == grep {defined} $copyright, @_;

	   # String::Copyright objects are effectively immutable and can be reused
		if ( !@_ && ref($copyright) eq __PACKAGE__ ) {
			return $copyright;
		}

		# stringify objects
		$copyright = "$copyright";

		# TODO: also parse @_ - but each separately!
		my @block;
		my $skipped = 0;
		while ( $copyright =~ /$sign_years_owners_re/g ) {
			my $years        = $1;
			my $owners_dirty = $2;
			my $owners       = $3 || '';

			if ( $globals->{threshold_before} || $globals->{threshold} ) {
				last
					if (!@block
					and !length $years
					and !length $owners
					and ++$skipped >= ( $globals->{threshold_before}
							|| $globals->{threshold} ) );
			}
			if ( $globals->{threshold_after} || $globals->{threshold} ) {

				# "after" detects end of _current_ line so is skewed by one
				last
					if (@block
					and !length $years
					and !length $owners
					and ++$skipped >= 1
					+ ( $globals->{threshold_after} || $globals->{threshold} )
					);
			}
			next if ( !length $years and !length $owners );
			$skipped = 0;

			# normalize
			$years =~ s/$comma_tidy_re/$comma_tidied/g;
			$years =~ s/$years_tidy_re/-$^N/g;
			$owners =~ s/\s{2,}/ /g;

			my $copyright_match
				= ( length $years and length $owners )
				? $years . ' ' . $owners
				: $years . $owners;

# split owner into owner_id and owner

			push @block, $copyright_match;
		}

# TODO: save $skipped_lines to indicate how dirty parsing was

		bless [ $copyright, \@block ], __PACKAGE__;
		}
}

sub new
{
	my ( $self, @data ) = @_;
	Carp::croak("String::Copyright require defined, positive-length parts")
		unless 1 + @_ == grep { defined && length } @data;

	# String::Copyright objects are simply stripped of their string part
	if ( !@_ && ref($self) eq __PACKAGE__ ) {
		return bless [ undef, $data[1] ], __PACKAGE__;
	}

	# FIXME: properly validate data
	Carp::croak("String::Copyright blocks must be an array of strings")
		unless @_ == grep { ref eq 'ARRAY' } @data;

	bless [ undef, \@data ], __PACKAGE__;
}

sub blocks { $_[0]->[BLOCKS] }

sub normalize
{
	my ( $self, @opts ) = @_;
	Carp::confess("normalize options not yet implemented")
		if @opts;

	new($self);
}

sub _compose
{
	join "\n", map { '© ' . $_ } @{ $_[0]->[BLOCKS] };
}

sub is_normalized { !defined $_[0]->[PLAINTEXT] }

=head1 SEE ALSO

=over 4

=item *

L<Encode>

=item *

L<Exporter::Tiny>

=back

=head1 BUGS/CAVEATS/etc

L<String::Copyright> operates on strings, not bytes.
Data encoded as UTF-8, Latin1 or other formats
need to be decoded to strings before use.

Only ASCII characters and B<©> (copyright sign) are directly processed.

If copyright sign is mis-detected
or accents or multi-byte characters display wrong,
then most likely the data was not decoded into a string.

If ranges or lists of years are not tidied,
then maybe it contained non-ASCII whitespace or digits.

=head1 AUTHOR

Jonas Smedegaard C<< <dr@jones.dk> >>

=head1 COPYRIGHT AND LICENSE

Derived from L<App::Licensecheck> originally part of the KDE SDK,
originally introduced by Stefan Westerfeld C<< <stefan@space.twc.de> >>;
and on the script licensecheck2dep5 part of Debian CDBS tool,
written by Jonas Smedegaard.

  Copyright © 2007, 2008 Adam D. Barratt

  Copyright © 2005-2012, 2016 Jonas Smedegaard

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <https://www.gnu.org/licenses/>.

=cut

1;
