#!/usr/bin/perl -X
use strict;
use warnings;
use Bio::SeqIO;
use Getopt::Long;
use Bio::Phylo::Util::Logger ':levels';

# Process command line arguments.
#   --infile  <path>   FASTA file to read   (optional; STDIN is used if absent)
#   --outfile <path>   FASTA file to write  (optional; STDOUT is used if absent)
#   --verbose          may be repeated; each occurrence increments the
#                      log level, which starts at INFO
my $verbosity = INFO;
my $infile;
my $outfile;

# GetOptions returns false when it encounters an unknown or malformed
# option. Stop here instead of silently carrying on with defaults, which
# could e.g. send the output to STDOUT when a file was intended.
GetOptions(
	'infile=s'  => \$infile,
	'outfile=s' => \$outfile,
	'verbose+'  => \$verbosity,
) or die "Usage: $0 [--infile <fasta>] [--outfile <fasta>] [--verbose ...]\n";

# Configure logging for class 'main' at the requested level. The return
# value is discarded; presumably the constructor updates shared logger
# state that the exported INFO function consults -- confirm against the
# Bio::Phylo::Util::Logger documentation.
Bio::Phylo::Util::Logger->new(
	'-level' => $verbosity,
	'-class' => 'main'
);

# Open the FASTA input stream: the file named by --infile when one was
# given, STDIN otherwise.
my $in;

# Test for a defined, non-empty value rather than plain truth, so that a
# file literally named "0" is honoured instead of falling back to STDIN.
if ( defined $infile and length $infile ) {
	INFO "reading from infile $infile";
	$in = Bio::SeqIO->new(
		'-format' => 'fasta',
		'-file'   => $infile,
	);
}
else {
	INFO "reading from STDIN";
	$in = Bio::SeqIO->new(
		'-format' => 'fasta',
		'-fh'     => \*STDIN,
	);
}

# Open the FASTA output stream: the file named by --outfile when one was
# given, STDOUT otherwise.
my $out;

# Test for a defined, non-empty value rather than plain truth, so that a
# file literally named "0" is honoured instead of falling back to STDOUT.
if ( defined $outfile and length $outfile ) {
	INFO "writing to outfile $outfile";

	# the leading '>' asks Bio::SeqIO to open the file for writing
	$out = Bio::SeqIO->new(
		'-format' => 'fasta',
		'-file'   => ">$outfile",
	);
}
else {
	INFO "writing to STDOUT";
	$out = Bio::SeqIO->new(
		'-format' => 'fasta',
		'-fh'     => \*STDOUT,
	);
}

# Filter duplicates: each distinct sequence string is written once, for
# the first record that carries it; every later record with the same
# sequence string is skipped and reported through the logger.
#
# %seen maps a sequence string to the id of the first record that had it.
my %seen;
while ( my $seq = $in->next_seq ) {
	my $seqdata = $seq->seq;

	# An undefined sequence string would be stringified to '' when used
	# as a hash key anyway; do it explicitly so the key is unambiguous.
	$seqdata = '' if not defined $seqdata;

	# Use exists rather than the truth of the stored id: an id of "0" or
	# an empty id is false, and testing its truth would let every later
	# copy of that sequence through as if it had never been seen.
	if ( not exists $seen{$seqdata} ) {
		$out->write_seq( $seq );
		$seen{$seqdata} = $seq->id;
	}
	else {
		my $this_id = $seq->id;
		my $seen_id = $seen{$seqdata};
		INFO "already seen sequence $this_id in $seen_id";
	}
}

