#!/usr/bin/perl -W

$usage  = "
parseDbSnpXML takes dbSnp XML files and writes condensed a file of information
     that can be lifted and loaded directly into snp table.  It also creates files of 
     flanking sequences, structural information, and locus info.\n
usage: parseDbSnpXML <infile> <outDir> <outRoot>
       parseDbSnpXML ds_chY.xml.gz ds_chY
where <infile>  is a gzipped XML file,
      <outDir>  is the directory for output files, and 
      <outRoot> specifies the root for the output files:
                    <outRoot>.det:  details for snp table
                    <outRoot>.seq:  flanking sequences
                    <outRoot>.str:  structural info
                    <outRoot>.loc:  locus link info\n\n";

if ($#ARGV != 2) {die $usage;}
$inputFile   = $ARGV[0];
$outDir      = $ARGV[1];
$outRoot     = $ARGV[2];
$tmpDetail   = "/tmp/$outRoot.det";
$tmpSeq      = "/tmp/$outRoot.seq";
$tmpStruct   = "/tmp/$outRoot.str";
$tmpLocus    = "/tmp/$outRoot.loc";
unlink( " /tmp/$outRoot.* $outDir/$outRoot.* " );
open( IN,  " zcat $inputFile | " ) or die "can't open INPUT  file: $inputFile; $!";
open( DET, " > $tmpDetail      " ) or die "can't open OUTPUT file: $tmpDetail; $!";
open( SEQ, " > $tmpSeq         " ) or die "can't open OUTPUT file: $tmpSeq;    $!";
open( STR, " > $tmpStruct      " ) or die "can't open OUTPUT file: $tmpStruct; $!";
open( LOC, " > $tmpLocus       " ) or die "can't open OUTPUT file: $tmpLocus;  $!";

$accession=$strand=$observed=$molType=$class=$status=$locType=$valid=".";
$protAcc=$frame=$allele=$residue=$aaPos=$func=".";
$version=$contigStart=$contigEnd=$rsId=$avHet=$avHetSE=0;
%mapLoc=();
%funcHash=();
%validHash=();

$printHeaders="FALSE";
if ($printHeaders eq "TRUE")
{
    print DET "chr/acc\tctgStrt\tctgEnd\trsId\tscore\tstrand\tmolType\tclass\tvalid\t";
    print DET "avHet\tavHetSE\tfunc\tlocType\thitQual\tmapWt\tchrHits\tctgHits\tseqHits\tsource\n";
    print STR "rsId\tprotAcc\tprotGi\tstructGi\tprotLoc\trsRes\tprotRes\tstructRes\tstructLoc\n";
    print LOC "rsId\tlocusId\tsymbol\tmrnaAcc\tprotAcc\tfunc\tframe\tallele\tresidue\taaPos\n";
    print SEQ "rsId\tflank5\tflank3\tobserved\n";
}

while (<IN>) 
{
    chomp;
    if    (/NSE-rsStruct/) 
    {
	if    (/<(NSE-rsStruct_prot-acc)>(\S+)\s?<\/\1>/)           { $protAcc       = $2; }
	elsif (/<(NSE-rsStruct_prot-gi)>(\S+)<\/\1>/)               { $protGi        = $2; }
	elsif (/<(NSE-rsStruct_prot-loc)>(\S+)<\/\1>/)              { $protLoc       = $2; }
	elsif (/<(NSE-rsStruct_prot-res)>(\S+)<\/\1>/)              { $protRes       = $2; }
	elsif (/<(NSE-rsStruct_rs-res)>(\S+)<\/\1>/)                { $rsRes         = $2; }
	elsif (/<(NSE-rsStruct_struct-gi)>(\S+)<\/\1>/)             { $structGi      = $2; }
	elsif (/<(NSE-rsStruct_struct-loc)>(\S+)<\/\1>/)            { $structLoc     = $2; }
	elsif (/<(NSE-rsStruct_struct-res)>(\S+)<\/\1>/)            { $structRes     = $2; }
	elsif (/<\/NSE-rsStruct>/)
	{
	    print STR "$rsId\t$protAcc\t$protGi\t$structGi\t$protLoc\t$rsRes\t$protRes\t$structRes\t$structLoc\n";
	    $protAcc=$protGi=$protLoc=$protRes=$rsRes=$structGi=$structLoc=$structRes=".";
	}
    }
    elsif (/NSE-FxnSet/) 
    { 
	if    (/<(NSE-FxnSet_fxn-class-contig value=\")(\S+)\"\/>/) { $func          = $2; }
	elsif (/<(NSE-FxnSet_locusid)>(\S+)<\/\1>/)                 { $locusId       = $2; }
	elsif (/<(NSE-FxnSet_symbol)>(\S+)<\/\1>/)                  { $symbol        = $2; }
	elsif (/<(NSE-FxnSet_mrna-acc)>(\S+)<\/\1>/)                { $mrnaAcc       = $2; }
	elsif (/<(NSE-FxnSet_prot-acc)>(\S+)<\/\1>/)                { $protAcc       = $2; }
	elsif (/<(NSE-FxnSet_reading-frame)>(\S+)<\/\1>/)           { $frame         = $2; }
	elsif (/<(NSE-FxnSet_allele)>(\S+)<\/\1>/)                  { $allele        = $2; }
	elsif (/<(NSE-FxnSet_residue)>(\S+)<\/\1>/)                 { $residue       = $2; }
	elsif (/<(NSE-FxnSet_aa-position)>(\S+)<\/\1>/)             { $aaPos         = $2; }
	elsif (/<\/NSE-FxnSet>/)
	{
	    print LOC "rs$rsId\t$locusId\t$symbol\t$mrnaAcc\t$protAcc\t$func\t$frame\t$allele\t$residue\t$aaPos\n";
	    if ($func ne "reference")
	    {
		$funcHash{$func}= 1;
	    }
	    $locusId=$symbol=$mrnaAcc=$protAcc=$frame=$allele=$residue=$aaPos=$func=".";
	}
    }
    elsif (/NSE-rs_seq/) 
    {
	if (/<(NSE-rs_seq-5_E)>(\S+)<\/\1>/)                        { $flank5       .= $2; }
	if (/<(NSE-rs_seq-3_E)>(\S+)<\/\1>/)                        { $flank3       .= $2; }
    }
    elsif (/NSE-rsMaploc/)
    {
	if    (/<(NSE-rsMaploc_asn-from)>(\S+)<\/\1>/)              { $contigStart   = $2; }
	elsif (/<(NSE-rsMaploc_asn-to)>(\S+)<\/\1>/)                { $contigEnd     = $2; }
	elsif (/<(NSE-rsMaploc_loc-type value=\")(\S+)\"\/>/)       { $locType       = $2; }
	elsif (/<(NSE-rsMaploc_orient value=\")(\S+)\"\/>/)         { $strand        = $2; }
	elsif (/<\/NSE-rsMaploc>/)
	{
	    $func="";
	    foreach $key (sort keys %funcHash)
	    {
		$func .= "$key,";
	    }
	    if ($func eq "")
	    {
		$func = "unknown,";
	    }
	    substr($func, -1) = "";
	    if    ($locType eq "between") # half open
	    { 
		$contigEnd   -= 1;
	    }
	    else
	    { 
		$contigStart  -= 1; 
	    }
	    if    ($strand eq "forward")                            { $strand        = "+"; }
	    elsif ($strand eq "reverse")                            { $strand        = "-"; }
	    $key = "$contigStart:$contigEnd:$locType:$strand:$func";
	    $mapLoc{$key}=1;
	    $contigStart=$contigEnd=$locType=$strand=$func=".";
	    %funcHash=();
	}
    }
    elsif (/<(NSE-rs_refsnp-id)>(\S+)<\/\1>/)                       { $rsId          = $2; }
    elsif (/<(NSE-rs_snp-class value=\")(.*)\"\/>/)                 { $class         = $2; }
    elsif (/<(NSE-rs_moltype value=\")(.*)\"\/>/)                   { $molType       = $2; }
    elsif (/<(NSE-rs_observed)>(.+)<\/\1>/)                         { $observed      = $2; }
    elsif (/<(NSE-rs_het)>(\S+)<\/\1>/)                             { $avHet         = $2; }
    elsif (/<(NSE-rs_het-SE)>(\S+)<\/\1>/ && ($2 ne "NaN"))         { $avHetSE       = $2; }
    elsif (/<(NSE-rs_validated-)(\S+) value=\"true\"\/>/)           { $validHash{$2} =  1; }
    elsif (/<(NSE-rs_)(genotype) value=\"true\"\/>/)                { $validHash{$2} =  1; }
    elsif (/<(NSE-rsContigHit_accession)>(\S+)<\/\1>/)              { $accession     = $2; }
    elsif (/<(NSE-rsContigHit_chromosome)>(.*)<\/\1>/)              { $chrom         = $2; }
    elsif (/<\/NSE-rsContigHit>/) # end of current contig
    {   
	if ($observed eq ".")   
	{
	    die "Observed: rs$rsId >[$_]<; $!";
	}
	$observed =~ s/ /_/g;
	if ($valid eq ".")
	{
	    $valid="";
	    foreach $key (sort keys %validHash)
	    {
		$valid .= "$key,";
	    }
	    if ($valid eq "")
	    { 
		$valid = "unknown,"; 
	    }
	    substr($valid, -1) = "";
	    %validHash=();
	}
	foreach $key (sort keys %mapLoc)
	{
	    ( $contigStart, $contigEnd, $locType, $strand, $func ) = split(":", $key);
	    print DET "$chrom/$accession\t$contigStart\t$contigEnd\trs$rsId\t0\t$strand\t$observed\t$molType\t$class\t$valid\t";
	    if ($avHet>0)
	    { 
		printf DET ("%.3f\t%.3f\t$func\t$locType\t2\t0\n",$avHet,$avHetSE);
	    }
	    else
	    {
		print  DET  "0\t0\t$func\t$locType\t2\t0\n";
	    }
	}
	%mapLoc=();
    }
    elsif (/<\/NSE-rs>/) # end of current SNP; reinitialize
    {
	print SEQ "rs$rsId\t$flank5\t$flank3\t$observed\n";
	$accession=$strand=$observed=$molType=$class=$valid=$status=$locType=".";
	$protAcc=$frame=$allele=$residue=$aaPos=$func=".";
	$flank3=$flank5="";
	$version=$contigStart=$contigEnd=$rsId=$avHet=$avHetSE=0;
	%mapLoc=();
	%funcHash=();
	%validHash=();
    }
}
close(DET);
close(SEQ);
close(STR);
close(LOC);
system("/bin/gzip $tmpDetail");
system("/bin/gzip $tmpSeq");
system("/bin/gzip $tmpStruct");
system("/bin/gzip $tmpLocus");
system("/bin/mv -f $tmpDetail.gz ./det/");
system("/bin/mv -f $tmpSeq.gz    ./seq/");
system("/bin/mv -f $tmpStruct.gz ./str/");
system("/bin/mv -f $tmpLocus.gz  ./loc/");
system("/bin/mv -f /tmp/$outRoot.* .");
