#!/usr/bin/env /hive/data/outside/genbank/bin/gbPython
#
# $Id: gbMkOrganismList,v 1.3 2006/04/11 16:33:44 markd Exp $
#

# Usage text passed to OptionParser in main(); optparse prints it for
# --help and when parser.error() reports a bad command line.
usage="""gbMkOrganismList outFile
Create a tab file with a list of all of the organisms and a count of
the mRNAs, ESTs, and RefSeqs for each organism.
"""

import sys,os
# require 2.4 or newer; compare as a tuple so that every older release
# (including a 1.x interpreter with a minor version >= 4) is rejected
if sys.version_info < (2, 4):
    raise Exception("python 2.4 or newer required, using " + sys.version)

# Directory holding this script (normalized); the project's python library
# is expected in ../lib/py relative to it, so make that importable.
myBinDir = os.path.dirname(sys.argv[0])
myBinDir = os.path.normpath(myBinDir)
sys.path.append(myBinDir + "/../lib/py")
from optparse import OptionParser
from genbank import procOps, fileOps
from genbank.Pipeline import Pipeline
from genbank.GbIndex import GbIndex, GenBank, RefSeq, MRNA, EST

class Organism(object):
    "Per-organism record: the organism name plus mRNA, EST and RefSeq counts"

    def __init__(self, name):
        "create a record for organism `name' with all counts at zero"
        self.name = name
        # every counter starts empty; callers add to them by attribute name
        self.mrnaCnt = self.estCnt = self.refSeqCnt = 0

class Organisms(dict):
    """Table of organisms and counts of cDNAs.

    Maps organism name to an Organism object.  The counts are accumulated
    from the gbidx files of the latest RefSeq and GenBank releases when the
    table is constructed.
    """

    def __init__(self):
        "load from genbank indices"
        dict.__init__(self)
        gbi = GbIndex()
        self._countRelease(gbi[RefSeq].getLatestRel())
        self._countRelease(gbi[GenBank].getLatestRel())

    def _getOrg(self, name):
        "get an Organism object, creating if not found"
        org = self.get(name)
        if org is None:
            org = Organism(name)
            self[name] = org
        return org

    def _countRelease(self, rel):
        """count organism cDNAs in a release.  RefSeq releases add to
        refSeqCnt; other releases add mRNAs to mrnaCnt and ESTs (for every
        accession prefix) to estCnt."""
        gbIdxTbl = self._getRelGbIndexes(rel)
        if rel.srcDb == RefSeq:
            self._countGbIdxs(RefSeq, "refSeqCnt", gbIdxTbl[MRNA][None])
        else:
            self._countGbIdxs(GenBank, "mrnaCnt", gbIdxTbl[MRNA][None])
            # each accession prefix is counted with its own pipeline
            for gbIdxs in gbIdxTbl[EST].values():
                self._countGbIdxs(GenBank, "estCnt", gbIdxs)

    def _getRelGbIndexes(self, rel):
        """Get table of all gbidx (covering all updates) in release,
        partitioned by cdnaType, then accPrefix.  accPrefix is None when
        none is available.  Returns a dict of cdnaType to dict of accPrefix
        to list of gbidx, in the order the parts are visited."""
        gbIdxTbl = {}
        for cdt in rel.cdnaTypes:
            gbIdxTbl[cdt] = {}
        for up in rel.updates:
            for part in up.procParts:
                # setdefault guarantees that the list appended to is the one
                # stored under this part's own prefix, whether or not the
                # prefix has been seen before
                accPrefGbIdxs = gbIdxTbl[part.cdnaType].setdefault(part.accPrefix, [])
                accPrefGbIdxs.append(part.getGbIdx())
        return gbIdxTbl

    def _countGbIdxs(self, srcDb, fld, gbIdxs):
        """count the unique accessions in a set of gbidx files, adding the
        per-organism totals to attribute `fld' of each Organism.  Only lines
        matching NM_ are counted when srcDb is RefSeq."""
        if len(gbIdxs) == 0:
            # with no file arguments cut would read stdin instead
            return
        # gbIdx columns: acc version moddate organism
        # used cut to get accession and organism, sort to make unique
        # then awk to count by organism.  Only count NM_ for RefSeqs.
        cutCmd = ["cut", "-f", "1,4"]
        cutCmd.extend(gbIdxs)
        sortCmd = ["sort", "-u"]
        awkProg = """BEGIN{FS="\t"; OFS="\t"}"""
        if srcDb == RefSeq:
            # NOTE(review): /NM_/ is matched against the whole line, not just
            # the accession column
            awkProg += """ !/^#/&&/NM_/{cnts[$2]++}"""
        else:
            awkProg += """ !/^#/{cnts[$2]++}"""
        awkProg += """ END{for (o in cnts) {print o,cnts[o]}}"""
        awkCmd = ["awk", awkProg]
        pl = Pipeline([cutCmd, sortCmd, awkCmd])

        # each output line is: organism<tab>count
        while True:
            line = pl.fh.readline()
            if len(line) == 0:
                break
            row = line.rstrip("\n").split("\t")
            org = self._getOrg(row[0])
            setattr(org, fld, getattr(org, fld) + int(row[1]))
        pl.close()

    def outputList(self, outFile):
        """write the table to outFile as tab-separated rows, one organism
        per row sorted by name, preceded by a #-prefixed header row"""
        ofh = open(outFile, "w")
        try:
            fileOps.prRowv(ofh, "#organism", "mrnaCnt", "estCnt", "refSeqCnt")
            for on in sorted(self):
                o = self[on]
                fileOps.prRowv(ofh, o.name, o.mrnaCnt, o.estCnt, o.refSeqCnt)
        finally:
            # close the file even if writing a row fails
            ofh.close()

def mkOrganismList(outFile):
    "build the organism count table and write it to outFile"
    Organisms().outputList(outFile)

def main():
    """entry point: parse the command line, which must have exactly one
    positional argument (the output file), and write the organism list.
    The parsed options are saved in the global cmdOpts."""
    global cmdOpts
    parser = OptionParser(usage=usage)
    (cmdOpts, args) = parser.parse_args()
    if len(args) != 1:
        # parser.error prints the usage and message, then exits
        parser.error("wrong number of arguments")
    mkOrganismList(args[0])

# only run when executed as a script, so that importing this file does not
# parse the command line or write any output
if __name__ == "__main__":
    main()

# Local Variables:
# mode: python
# End:
