#!/usr/bin/env perl

# File: extractPslInfo
# Author: Terry Furey
# Date: 10/2001
# Description: Extracts essential placement info from a psl file

# $Id: extractPslInfo,v 1.2 2006/09/08 19:17:47 hiram Exp $

use strict;
use warnings;

# Usage message: at least one argument (the psl file) must be supplied.
unless (@ARGV) {
  print STDERR "USAGE extractPslInfo [-h -be -mrna -gb -cds <file>] <psl file>\n";
  exit(1);
}

# Read parameters.
#
# Every argument except the last one is treated as an option; the last
# argument is always taken to be the psl file.  Recognised options:
#   -h           skip the header lines at the top of the psl file
#   -be          add the strand column ("orien") to the output
#   -mrna        add the qsize/qstart/qend columns (ignored when -be is given)
#   -gb          reduce the target name to the last '|'-separated piece,
#                with anything after the first '.' removed
#   -cds <file>  tab-separated file of accession, cds start, cds end
# Unrecognised options are silently ignored.
my $header = 0;
my $bacends = 0;
my $mrna = 0;
my $gb = 0;
my $cds = 0;
my $cdsFile  = "";
while ($#ARGV > 0) {
  my $arg = shift(@ARGV);
  if ($arg eq "-h") {
    $header = 1;
  } elsif ($arg eq "-be") {
    $bacends = 1;
  } elsif ($arg eq "-mrna") {
    $mrna = 1;
  } elsif ($arg eq "-gb") {
    $gb = 1;
  } elsif ($arg eq "-cds") {
    $cds = 1;
    $cdsFile = shift(@ARGV);
    # Three-argument open so that characters in the file name (leading or
    # trailing whitespace, '|', '>', ...) are never interpreted as a mode.
    open(CDS, "<", $cdsFile) || die("Can not open -cds $cdsFile: $!");
  }
}
my $file = shift(@ARGV);
# "-cds <file>" given as the final two arguments consumes what should have
# been the psl file name; report that instead of trying to open nothing.
if (!defined($file)) {
  print STDERR "extractPslInfo: missing <psl file> argument\n";
  print STDERR "USAGE extractPslInfo [-h -be -mrna -gb -cds <file>] <psl file>\n";
  exit(1);
}
open(FILE, "<", $file) || die("Can not open $file: $!");

# CDS start and end positions, keyed by accession (first column of the
# -cds file).  Both hashes stay empty when -cds was not given.
my %cdsStart;
my %cdsEnd;

# If cds file provided, read it in.
# Each line is expected to hold three tab-separated values:
# accession, cds start, cds end.
if ($cds) {
    while (my $line = <CDS>) {
        chomp($line);
        my ($acc, $start, $end) = split("\t",$line);
        # Blank lines (or lines with an empty accession) carry no usable
        # record; skipping them avoids "uninitialized value" warnings and a
        # meaningless entry under the empty key.
        next if (!defined($acc) || $acc eq "");
        # Drop the first '>' from the start value.
        # NOTE(review): presumably a partial-CDS marker; only '>' and only
        # the start column are handled here -- confirm against the producer
        # of the cds file.
        $start =~ s/\>// if (defined($start));
        # Missing start/end values are stored as undef; the main loop
        # replaces false values with defaults.
        $cdsStart{$acc} = $start;
        $cdsEnd{$acc} = $end;
    }
    close(CDS);
}

# With -h, read past the first five lines of the psl file and throw them away
if ($header) {
  my $toSkip = 5;
  while ($toSkip-- > 0) {
    my $ignored = <FILE>;
  }
}

# Read in and print out info.
#
# Start the temporary table "<pid>.temp" with its two header rows: a row of
# column names followed by a row of column definitions ("10S", "10N", ...).
# NOTE(review): the definition row is presumably the column width/type line
# expected by the sorttbl/uniqtbl/headchg tools run later -- confirm.
#
# Optional columns follow the command-line options: -be adds "orien",
# otherwise -mrna adds "qsize", "qstart" and "qend"; -cds appends "cdsS" and
# "cdsE" at the end.
my @colNames = ("chr", "start", "end", "name");
my @colDefs  = ("10S", "10N", "10N", "10S");
if ($bacends) {
    push(@colNames, "orien");
    push(@colDefs,  "10");
} elsif ($mrna) {
    push(@colNames, "qsize", "qstart", "qend");
    push(@colDefs,  "10N", "10N", "10N");
}
push(@colNames, "cov", "id");
push(@colDefs,  "10N", "10N");
if ($cds) {
    push(@colNames, "cdsS", "cdsE");
    push(@colDefs,  "10N", "10N");
}

# Stop right away if the temporary file cannot be created; otherwise every
# later print would fail silently and the final table would be empty.
open(TEMP, ">", "$$.temp") || die("Can not open $$.temp for writing: $!");
print TEMP join("\t", @colNames), "\n";
print TEMP join("\t", @colDefs), "\n";
# Main pass: turn every remaining line of the psl file into one row of TEMP.
# Fields are addressed by position after splitting the line on tabs.
# NOTE(review): the field names used in the comments below (matches, qSize,
# tStart, blockSizes, ...) assume the standard 21-column PSL layout -- confirm.
while (my $line = <FILE>) {
  chomp($line);
  my @fields = split("\t",$line);
  # Coverage of the query by the alignment, as a fraction (not multiplied by
  # 100): (field 12 - field 11 - field 5) / field 10, i.e. presumably
  # (qEnd - qStart - qBaseInsert) / qSize.  Left at 0 when field 10 is false.
  my $cov = 0;
  if ($fields[10]) {
      $cov = ($fields[12] - $fields[11] - $fields[5])/$fields[10];
  }
  # Identity of the alignment, as a fraction: (field 0 + field 2) divided by
  # (field 12 - field 11 - field 5 - field 3), i.e. presumably
  # (matches + repMatches) / (aligned query bases - nCount).
  # Left at 0 when the denominator is zero.
  my $id = 0;
  if ($fields[12]-$fields[11]-$fields[5]-$fields[3]) {
      $id = ($fields[0] + $fields[2])/($fields[12] - $fields[11] - $fields[5] - $fields[3]);
  }
  # With -gb, replace field 13 by the last '|'-separated piece of itself,
  # cut at the first '.'; the part after the '.' ($vers) is not used.
  if ($gb) {
    my @names = split(/\|/,$fields[13]);
    ($fields[13],my $vers) = split(/\./,$names[$#names]);
  }
  # Fixed leading columns: chr, start, end, name (fields 13, 15, 16, 9).
  print TEMP "$fields[13]\t$fields[15]\t$fields[16]\t$fields[9]\t";
  # Optional columns, in the same order as the table header.
  if ($bacends) {
    print TEMP "$fields[8]\t";
  } elsif ($mrna) {
    print TEMP "$fields[10]\t$fields[11]\t$fields[12]\t";
  }
  printf (TEMP "%.4f\t%.4f",$cov,$id);
  # With -cds, translate the CDS start/end recorded for this query name
  # (field 9) into target positions, using the per-block sizes (field 18),
  # query starts (field 19) and target starts (field 20).
  if ($cds) {
      my $acc = $fields[9];
      my $blocks = $fields[17];
      my @sizes = split(",",$fields[18]);
      my @qStarts = split(",",$fields[19]);
      my @tStarts = split(",",$fields[20]);
      # "" means "not determined yet" for both values.
      my $start = "";
      my $end = "";
      # On the "-" strand, rewrite each block's query start as
      # field 10 - qStart - size before comparing it with the CDS positions.
      if ($fields[8] eq "-") {
	  for (my $i = 0; $i < $blocks; $i++) {
	      $qStarts[$i] = $fields[10] - $qStarts[$i] - $sizes[$i];
	  }
      }
      # A missing (or zero/empty) CDS start defaults to 1 and a missing CDS
      # end to field 10.  The defaults are stored back into the hashes, so
      # they also apply to any later line with the same name.
      if (!$cdsStart{$acc}) {
	  $cdsStart{$acc} = 1;
      }
      if (!$cdsEnd{$acc}) {
	  $cdsEnd{$acc} = $fields[10];
      }
      # Walk the blocks in order; later blocks may overwrite values set by
      # earlier ones, so the statement order below matters.
      for (my $i = 0; $i < $blocks; $i++) {
	  # CDS start lies within this block (both boundaries inclusive):
	  # offset it from the block's target start, or from the block's
	  # target end on the "-" strand.
	  if (($cdsStart{$acc} <= ($qStarts[$i] + $sizes[$i])) && 
	      ($cdsStart{$acc} >= $qStarts[$i])) {
	      $start = $tStarts[$i] + $cdsStart{$acc} - $qStarts[$i] + 1;
	      if ($fields[8] eq "-") {
		  $start = $tStarts[$i] + $sizes[$i] - $cdsStart{$acc} + $qStarts[$i] + 1;
	      }
	  }
	  # CDS start lies before this block and no start has been set yet:
	  # use the block's target start.
	  if (($cdsStart{$acc} < $qStarts[$i]) && ($start eq "")) {
	      $start = $tStarts[$i];
	  }
	  # CDS end lies within this block (both boundaries inclusive).
	  if (($cdsEnd{$acc} <= ($qStarts[$i] + $sizes[$i])) && 
	      ($cdsEnd{$acc} >= $qStarts[$i])) {
	      $end = $tStarts[$i] + $cdsEnd{$acc} - $qStarts[$i];
	      if ($fields[8] eq "-") {
		  $end = $tStarts[$i] + $sizes[$i] - $cdsEnd{$acc} + $qStarts[$i];
	      }
	  }
	  # CDS end lies before this block: use the target end of the previous
	  # block, unless an end at or below that value is already set.
	  # NOTE(review): when $i is 0, index $i-1 is -1, which in Perl selects
	  # the LAST element of @tStarts/@sizes -- confirm this is intended.
	  if (($cdsEnd{$acc} < $qStarts[$i]) && (($end eq "") || ($end > ($tStarts[$i-1] + $sizes[$i-1])))) {
	      $end = $tStarts[$i-1] + $sizes[$i-1];
	  }
      }
      # CDS start at or before field 11: override the start with a boundary
      # of the whole alignment (field 16 on "-", field 15 + 1 otherwise).
      if ($cdsStart{$acc} <= $fields[11]) {
	  if ($fields[8] eq "-") {
	      $start = $fields[16];
	  } else {
	      $start = $fields[15] + 1;
	  }
      }
      # CDS end at or after field 12: override the end likewise
      # (field 15 + 1 on "-", field 16 otherwise).
      if ($cdsEnd{$acc} >= $fields[12]) {
	  if ($fields[8] eq "-") {
	      $end = $fields[15] + 1;
	  } else {
	      $end = $fields[16];
	  }
      }
      # Report, on STDERR, alignments for which either value could not be
      # determined; the row is still written, with the empty value(s).
      if ((($start eq "") || ($end eq "")) && ($acc)) {
	  print STDERR "$acc ($fields[9]) Bad cds $start-$end $fields[8]\n";
      }
      # On the "-" strand the two values are written in swapped order.
      if ($fields[8] eq "-") {
	  print TEMP "\t$end\t$start";
      } else {
	  print TEMP "\t$start\t$end";
      }
      # Fields 18 and 20 are appended as two further columns.
      # NOTE(review): the table header names only cdsS and cdsE for -cds, so
      # these two columns have no header entry -- confirm this is intended.
      print TEMP "\t$fields[18]\t$fields[20]";
  }
  print TEMP "\n";
}
close(FILE);
# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so a failed close means the temporary table is incomplete.
close(TEMP) || die("Can not close $$.temp: $!");

# Post-process the temporary table with the external table tools and write
# the result to "<psl file>.initial":
#   sorttbl chr start name | uniqtbl chr start name
#   [ | sorttbl chr cdsS name ]      (only with -cds)
#   | headchg -del
my $tempFile = $$ . ".temp";

# The output name is passed through the shell, so wrap it in single quotes
# (escaping any embedded single quote) to keep spaces and shell
# metacharacters in the psl file name from being interpreted.
(my $outFile = "$file.initial") =~ s/'/'\\''/g;
$outFile = "'" . $outFile . "'";

my $pipeline = "sorttbl chr start name < $tempFile | uniqtbl chr start name | ";
if ($cds) {
    $pipeline .= "sorttbl chr cdsS name | ";
}
$pipeline .= "headchg -del > $outFile";

# The pipeline's standard output is redirected to the output file, so there
# is nothing to capture; system() is used so the exit status can be checked.
my $status = system($pipeline);

# Remove the temporary file whether or not the pipeline succeeded.
unlink($tempFile) || warn("Can not remove $tempFile: $!\n");

if ($status != 0) {
    print STDERR "extractPslInfo: table pipeline failed (status $status): $pipeline\n";
    exit(1);
}
