#!/usr/bin/perl -w
use POSIX;
use Getopt::Long qw(:config bundling require_order auto_version);
use Pod::Usage;
use FAST;
use FAST::Bio::SeqIO;
use FAST::Bio::AlignIO;
use FAST::Bio::UnivAln;
use FAST::Bio::SimpleAlign;
use FAST::Bio::Location::NarrowestCoordPolicy;
use strict;

use vars qw($VERSION $DESC $NAME $COMMAND $DATE);
$VERSION = $FAST::VERSION; 
$DESC    = "Extract sites from alignments based on reg-ex matching on features in an annotated GenBank file.\n";
$NAME    = $0;
$NAME    =~ s/^.*\///;
$COMMAND = join " ",$NAME,@ARGV;
$DATE = POSIX::strftime("%c",localtime());

use constant { true => 1, false => 0 };

## DEFAULT OPTION VALUES
my $def_format  = $FAST::DEF_FORMAT;  #7/1/13 "fasta";
my $def_logname = $FAST::DEF_LOGNAME; #7/1/13 "FAST.log.txt";
my $def_qualifier = 'note';

## OPTION VARIABLES
my $man                  = undef;  # --man
my $help                 = undef;  # -h
my $moltype              = undef;  # -m, in case bioperl can't tell
my $format               = $def_format;  # --format
my $log                  = undef;        # -l
my $logname              = $def_logname; # -L
my $comment              = undef;        # -C
my $insensitive          = undef; # -i
my $negate               = undef; # -v
my $qualifier            = $def_qualifier; # -q
#my $intersect_match      = undef; # -x
my $key                  = undef; # -k
my $verbose              = undef;

GetOptions('help|h'         		 => \$help, 
	   'man'            		 => \$man,
	   'moltype|m=s'                 => sub{  my (undef,$val) = @_; 
						  die "$NAME: --moltype or -m option must be either \"dna\", \"rna\" or \"protein\"" 
						    unless $val =~ /dna|rna|protein/i; 
						  $moltype = $val;
						},
	   'format=s'                    => \$format,
	   'log|l'                       => \$log,
	   'logname|L=s'                 => \$logname,
	   'comment|C=s'                 => \$comment,
	   'insensitive|i'               => \$insensitive,
	   'negate|v'                    => \$negate,
	   'qualifier|q=s'               => \$qualifier,
#	   'intersect|x=s'               => \$intersect_match,
	   'key|k'                       => \$key,
	   'verbose'                     => \$verbose,
	  ) 
  or pod2usage(2);
 		  
pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;

my $fromSTDIN = ((-t STDIN) ? false : true);

pod2usage("$NAME: Expects at least three arguments: GenBank-file \'perl-reg-exp\' alignment-file. Try \"perldoc $NAME\"") if (!($fromSTDIN) && (@ARGV < 3));
pod2usage("$NAME: Expects two arguments with input on STDIN: GenBank-file \'perl-reg-exp\'. Try \"perldoc $NAME\"") if ($fromSTDIN && @ARGV != 2);


&FAST::log($logname, $DATE, $COMMAND, $comment) if ($log); 

my @colslice;   
# my @x_colslice;
my $subaln;      
my $rex;
my $value;

my $gb   = FAST::Bio::SeqIO->new(   -file =>  shift @ARGV , '-format' => 'genbank')->next_seq();
my $rearg = shift @ARGV;
my $re = ($insensitive ? qr/$rearg/i : qr/$rearg/);

my $IN;
unless (@ARGV) {   
  if ($moltype){
    $IN = FAST::Bio::AlignIO->new(-fh => *STDIN{IO}, '-format' => $def_format, '-alphabet' => $moltype);
  }
  else{
    $IN = FAST::Bio::AlignIO->new(-fh => *STDIN{IO}, '-format' => $def_format);
  }
}

## SET COORDINATE POLICY TO NARROW
my $narrow_policy = FAST::Bio::Location::NarrowestCoordPolicy->new();

while ($IN or @ARGV) {
  if (@ARGV) {
    my $file = shift (@ARGV);
    unless (-e $file) {
      warn "$NAME: Could not find file $file. Skipping.\n";
      next;
    }
    elsif ($moltype) {
      $IN = FAST::Bio::AlignIO->new(-file => $file, '-format' => $def_format, '-alphabet' => $moltype);
    }
    else {
      $IN = FAST::Bio::AlignIO->new(-file => $file, '-format' => $def_format);
    }
  }
  if ($IN) { 
    while (my $saln = $IN->next_aln){
      my @seqs = $saln->each_seq();
      my $aln  = FAST::Bio::UnivAln->new('-seqs' => \@seqs);
      my $timesseen = 0;
      my $gbseq;
      foreach my $sob ($saln->each_seq_with_id($gb->id())){
	$timesseen++;
	$gbseq = $sob;
      }
      die "$NAME: GenBank sequence ID not found in alignment\n" if ($timesseen == 0); 
      die "$NAME: GenBank sequence ID found more than once in alignment\n" if ($timesseen > 1); 
      my $gbseqseq = $gbseq->seq();
      my $gapchar = $saln->gap_char();
      $gbseqseq =~ s/$gapchar//g;
      die "$NAME: GenBank sequence length is different in alignment\n" unless ( length( $gbseqseq ) == $gb->length );       

      ## SEARCH FEATURES
      foreach my $feat ( $gb->top_SeqFeatures() ) {
	$feat->location->coordinate_policy($narrow_policy);
	if ($key) { 
	  if ($feat->primary_tag() =~ $re) {
	    print STDERR "Matched ",$feat->primary_tag()," at ",$feat->location->to_FTstring(),".\n" if ($verbose);
	    &push_sites($saln,$feat,\@colslice);
	  }
	  next;
	}
	if ($feat->has_tag($qualifier)) { # "qualifier" is NCBI nomenclature
	  foreach $value ( $feat->each_tag_value($qualifier) ) {
	    if ($value =~ $re) {
	      print STDERR "Matched ",$feat->primary_tag()," at ",$feat->location->to_FTstring()," with $qualifier $value.\n" if ($verbose);
	      &push_sites($saln,$feat,\@colslice);
	    }
	    # ## GET COLUMNS MATCHING INTERSECTION REGEX IF EXISTS
	    # if ($intersect_match and $value =~ $rex) {
	    # 	&push_sites($feat,\@x_colslice);
	    # }
	  }
	}
      }

      ## SORT AND COMPRESS THE COLUMN LIST
      my %seen = ();
      my @sc_colslice =  sort {$a <=> $b} grep { ! $seen{$_} ++ } @colslice;
      
      ## TAKE SET-COMPLEMENT OF SITES IF --negate
      if ($negate) {
	@sc_colslice = grep { ! $seen{$_} } (1..$saln->length());
      }
      
      
      # if ($intersect_match) {
      #   ## SORT AND COMPRESS THE INTERSECTION COL SLICE
      #   %seen = ();
      #   my @x_sc_colslice = sort {$a <=> $b} grep { ! $seen{$_} ++ } @x_colslice;
      
      #   ## FIND THE INTERSECTION OF THE TWO LISTS
      #   my %union = my %isect = ();
      #   foreach (@sc_colslice, @x_sc_colslice) { $union{$_}++ && $isect{$_}++ }
      
      #   @sc_colslice = keys %isect;
      # }

      ## SUBSET AND PRINT ALIGNMENT
      ## SUBSET SEQUENCES IN ALIGNMENT IF DESIRED
      if (scalar @sc_colslice) {
	$subaln = new FAST::Bio::UnivAln(-seqs=>scalar($aln->seqs([],\@sc_colslice)), -row_ids=>$aln->row_ids);
      }
      $subaln->ffmt($def_format); 
      my $layout = $subaln->layout();
      ## FIX FASTA PRINT BUG (IDs SHOULD GO RIGHT AFTER '>') 
      $layout =~ s/^>\s+/>/gm;
      print "$layout";
    }
  }
  undef $IN;
}

sub push_sites() {
  my ($saln,$feat,$slice_array_ref) = @_;
  if (ref($feat->location) eq 'FAST::Bio::Location::Simple') {
    my $start = $saln->column_from_residue_number($gb->id(),$feat->start);
    my $stop  = $saln->column_from_residue_number($gb->id(),$feat->end);
    push @$slice_array_ref, ($start < $stop ? ($start)..($stop) : ($stop)..($start)); 
  }
  elsif (ref($feat->location) eq 'FAST::Bio::Location::Split') {
    foreach my $subloc ($feat->location->sub_Location) {
      my $start = $saln->column_from_residue_number($gb->id(),$subloc->start);
      my $stop  = $saln->column_from_residue_number($gb->id(),$subloc->end);
      push @$slice_array_ref, ($start < $stop ? ($start)..($stop) : ($stop)..($start)); 
    }
  }
}


__END__

=head1 NAME

B<gbfalncut> - cut sites from alignments by regex-matching on features in an annotated GenBank file

=head1 SYNOPSIS

B<gbfalncut> [OPTIONS] GENBANK-FILE PERL-REGEX ALIGNMENT-FILE 

=head1 DESCRIPTION

B<gbfalncut> extracts sites from alignments that correspond to one or
more features annotated on a sequence in a GenBank file. The alignment
must contain exactly one sequence with the same identifier as the
sequence in the GenBank file. By default all features in the GenBank
file are searched for a qualifier "/note" whose value matches the perl
regex given as the third argument. By default all features are
searched, but the search may be restricted to features with a specific
key. Optionally, regex matching may occur directly on feature keys
(the labels for features at the left side of the feature table),
allowing matching to features without qualifiers.

Both simple and split features (specified with a "join" operator) are
supported. Reverse-strand features (specified with a "complement"
operator) are not reverse-complemented; check fasrc for that.

Options specific to B<gbfalncut>:
  B<-i>, B<--insensitive>          case-insensitive pattern matching
  B<-v>, B<--negate>               cut sites that don't belong to a feature
  B<-q>, B<--qualifier>=<string>   match on values of qualifier <string> 
                                     default qualifier is "note" (as in /note="intron")          
  B<-k>, B<--key>                  regex match directly on feature keys
  B<--verbose>                  produce verbose output              

Options general to FAST:
  B<-h>, B<--help>                 print a brief help message
  B<--man>             	           print full documentation
  B<--version>                     print version
  B<-l>, B<--log>                  create/append to logfile	
  B<-L>, B<--logname>=<string>     use logfile name <string>
  B<-C>, B<--comment>=<string>     save comment <string> to log
  B<--format>=<format>             use alternative format for input  
  B<--moltype>=<[dna|rna|protein]> specify input sequence type

=head1 INPUT AND OUTPUT

B<gbfalncut> is part of FAST, the FAST Analysis of Sequences Toolbox, based
on Bioperl. Most core FAST utilities expect input and return output in
multifasta format. Input can occur in one or more files or on
STDIN. Output occurs to STDOUT. The FAST utility B<fasconvert> can
reformat other formats to and from multifasta.

=head1 OPTIONS

=over 8

=item B<-i>,
      B<--insensitive>

Match data case-insensitively.

=item B<-n>,
      B<--negate>

Output sites that fall B<outside> the features matched by the regular
expression argument.

=item B<-q>=<string>,
      B<--qualifier>=<string>


=item B<-k>=<string>,
      B<--key>=<string>


=item B<-v>,
      B<--verbose>


=item B<-h>,
      B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--version>

Print version information and exit.

=item B<-l>,
      B<--log>

Creates, or appends to, a generic FAST logfile in the current working
directory. The logfile records date/time of execution, full command
with options and arguments, and an optional comment.

=item B<-L [string]>,
      B<--logname=[string]>

Use [string] as the name of the logfile. Default is "FAST.log.txt".

=item B<-C [string]>,
      B<--comment=[string]>

Include comment [string] in logfile. No comment is saved by default.

=item B<--format=[format]> 		  

Use alternative format for input. See man page for "fasconvert" for
allowed formats. This is for convenience; the FAST tools are designed
to exchange data in Fasta format, and "fasta" is the default format
for this tool.

=item B<-m [dna|rna|protein]>,
      B<--moltype=[dna|rna|protein]> 		  

Specify the type of sequence on input (should not be needed in most
cases, but sometimes Bioperl cannot guess and complains when
processing data).

=back

=head1 EXAMPLES

Print alignment of 5'UTRs

=over 8

B<gbfalncut> -k --verbose t/data/AF194338.1.gb 5.UTR t/data/ArdellEtAl03_ncbi_popset_32329588.fas

=back

Print alignment of coding regions (CDS), remove sequences, degap and translate them

=over 8

B<gbfalncut> -k  t/data/AF194338.1.gb CDS t/data/ArdellEtAl03_ncbi_popset_32329588.fas | fasgrep -v "(AF194|3496)" | fastr -snD "-" | fasxl -t 6

=back

=head1 SEE ALSO

=over 8

=item C<man perlre>

=item C<perldoc perlre>

Documentation on perl regular expressions.

=item C<man FAST>

=item C<perldoc FAST>

Introduction and cookbook for FAST

=item L<The FAST Home Page|http://compbio.ucmerced.edu/ardell/FAST>"

=back 

=head1 CITING

If you use FAST, please cite I<Lawrence et al. (2014). FAST: FAST Analysis of
Sequences Toolbox.> and Bioperl I<Stajich et al.>. 

=cut
