#!/usr/bin/env python2.7
# ucscAltAgp
"""Generate UCSC named agp and fasta files for alternate sequences"""
import os
import fnmatch
import sys
import argparse
import tempfile
import time
import gzip

def parseArgs(args):
    """
    Parse the command line arguments.

    args is the full argument vector with the program name first (the same
    layout as sys.argv); when it is None, sys.argv is used.  If nothing follows
    the program name the usage message is printed and the process exits with
    status 1.  Returns the argparse namespace, which carries the two
    positional values as options.inputFile and options.path.
    """
    if args is None:
        args = sys.argv

    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument ("inputFile",
        help = " Full meta data file for the alternate sequences, typically " + \
                "../genbank/*assembly_structure/all_alt_scaffold_placement.txt ",
        action = "store")
    parser.add_argument ("path",
        help = " The path to the alt agp files, typically ../genbank/*assembly_structure/",
        action = "store")

    # Look at the vector we were handed rather than the process-wide sys.argv,
    # so the function behaves the same no matter who calls it.
    if len(args) <= 1:
        parser.print_help()
        sys.exit(1)

    # args[0] is the program name; argparse only wants what follows it.
    return parser.parse_args(args[1:])

def _readAltPlacement(placementFileName):
    """
    Read the tab-separated alt scaffold placement file.

    Returns a pair of dicts: contig accession (column 4) -> UCSC name of the
    form chr<column 6>_<accession with '.' replaced by 'v'>_alt, and alt ID
    (column 1) -> list of contig accessions in file order.
    """
    ncbiContigToUcscName = dict()
    ncbiAltIdToContigs = dict()
    with open(placementFileName) as inputF:
        for line in inputF:
            line = line.strip("\n")
            # Blank lines and '#' header/comment lines carry no placement data;
            # parsing them would add bogus names or fail on the column lookups.
            if not line.strip() or line.startswith("#"):
                continue
            words = line.split("\t")
            altId = words[0]
            contigAcc = words[3]
            chrNum = words[5]
            dotToV = contigAcc.replace('.', 'v')
            ncbiContigToUcscName[contigAcc] = "chr" + chrNum + "_" + dotToV + "_alt"
            ncbiAltIdToContigs.setdefault(altId, []).append(contigAcc)
    return ncbiContigToUcscName, ncbiAltIdToContigs

def _findAltAgpFiles(path):
    """
    Return the paths of every alt.scaf.agp.gz file found anywhere under path.
    """
    altAgpFiles = []
    for dirName, subdirs, files in os.walk(path):
        for agpFile in fnmatch.filter(files, "alt.scaf.agp.gz"):
            altAgpFiles.append(os.path.join(dirName, agpFile))
    return altAgpFiles

def _altIdFromPath(agpPath, knownAltIds):
    """
    Return the directory component of agpPath that names a known alt ID.

    Every alt agp file has the same base name, so the ID has to come from one
    of its directories.  Matching against the IDs read from the placement file
    avoids depending on how many directories deep the search path happens to be.
    Raises ValueError when no directory component is a known alt ID.
    """
    for part in agpPath.split("/")[:-1]:
        if part in knownAltIds:
            return part
    raise ValueError("No directory in " + agpPath +
                     " matches an alt ID from the placement file")

def _runCommand(cmd):
    """
    Run cmd through the shell and raise RuntimeError on a nonzero status.
    """
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError("Command failed with status " + str(status) + ": " + cmd)

def _writeUcscAgp(theirAgpFile, ourAgpFileName, ncbiContigToUcscName):
    """
    Copy the gzipped agp file to ourAgpFileName, replacing column 1 with the UCSC name.
    """
    with gzip.open(theirAgpFile, "r") as theirAgp:
        with open(ourAgpFileName, "w") as ourAgpFile:
            for line in theirAgp:
                # Comment lines are passed through untouched.
                if line.startswith("#"):
                    ourAgpFile.write(line)
                    continue
                splitLine = line.strip("\n").split("\t")
                newChrom = ncbiContigToUcscName[splitLine[0]]
                ourAgpFile.write(newChrom + "\t" + "\t".join(splitLine[1:]) + "\n")

def _writeUcscFa(contigAccs, ourFaFileName, ncbiContigToUcscName):
    """
    Extract contigAccs from refseq.2bit, rename the records and gzip the result.

    refseq.2bit is looked up in the current directory.  The output ends up in
    ourFaFileName + ".gz".
    """
    # twoBitToFa reads the sequence list by file name, so the list has to
    # outlive its handle (delete=False); the finally clause removes it.
    altChrmList = tempfile.NamedTemporaryFile(mode="w+", delete=False)
    try:
        for contigAcc in contigAccs:
            altChrmList.write(contigAcc + "\n")
        altChrmList.close()

        # twoBitToFa writes to the temp file by name; the handle opened here is
        # still at offset 0, so iterating it reads what the tool produced.
        with tempfile.NamedTemporaryFile(mode="w+") as theirFaFile:
            _runCommand("twoBitToFa refseq.2bit " + theirFaFile.name +
                        " -seqList=" + altChrmList.name)
            with open(ourFaFileName, "w") as ourFaFile:
                for line in theirFaFile:
                    if line.startswith(">"):
                        theirChrom = line.strip("\n").strip(">")
                        ourFaFile.write(">" + ncbiContigToUcscName[theirChrom] + "\n")
                    else:
                        ourFaFile.write(line)
    finally:
        altChrmList.close()
        os.remove(altChrmList.name)

    _runCommand("gzip -f " + ourFaFileName)

def main(args):
    """
    Initialize options and call the other functions.

    For every alt.scaf.agp.gz found under options.path this writes, in the
    current directory, <altId>.agp and <altId>.fa.gz with NCBI contig
    accessions replaced by UCSC chr*_*_alt names.  Needs refseq.2bit in the
    current directory and the twoBitToFa and gzip programs on the PATH.
    """
    options = parseArgs(args)

    # Associate NW_*/NT_* contig accessions with our chr*_..._alt names and
    # map NCBI ALT_* IDs to lists of contigs
    ncbiContigToUcscName, ncbiAltIdToContigs = _readAltPlacement(options.inputFile)

    # Make new .agp and .fa files with ucsc names.
    for theirAgpFile in _findAltAgpFiles(options.path):
        altId = _altIdFromPath(theirAgpFile, ncbiAltIdToContigs)
        ourAgpFileName = altId + ".agp"
        ourFaFileName = altId + ".fa"

        _writeUcscAgp(theirAgpFile, ourAgpFileName, ncbiContigToUcscName)
        print ("Finished creating " + ourAgpFileName)

        _writeUcscFa(ncbiAltIdToContigs[altId], ourFaFileName, ncbiContigToUcscName)
        print ("Finished creating " + ourFaFileName + ".gz")

# Script entry point: hand the raw argument vector to main() and use whatever
# it returns as the process exit status.
if __name__ == "__main__":
    exitStatus = main(sys.argv)
    sys.exit(exitStatus)
