#!/usr/bin/awk -f
# ensGeneToGenePred [options] [exons] > genePred
#
# Converts a mysql dump of ensembl exons to an extended genePred file.  If
# the gene has a single exon, the frame is set to 0 to override bad data
# from ENSEMBL.
#
# options:
#  -genePredExt - create extended genePred columns, including frames.
#  -ignore filterFile - drop ENSEMBL transcript ids that are in this file.
#   Empty lines and those starting with # are ignored.  NOTE: -ignore was used
#   rather than -f, because awk wanted to interpret this as an option to awk
#   instead of putting it in ARGV
#
BEGIN {
    # Parse command-line options and load the optional ignore list.
    #
    # Leading arguments that start with "-" are treated as options and are
    # blanked out of ARGV so awk does not try to open them as input files;
    # the first argument not starting with "-" ends option parsing.
    # Globals set here: genePredExt (0/1), filterFile, ignore[] (keyed by
    # transcript id) and exonCnt.
    FS = OFS = "\t"
    filterFile = ""
    genePredExt = 0
    iArg = 1
    while ((iArg < ARGC) && (ARGV[iArg] ~ "^-.*")) {
        if (ARGV[iArg] == "-ignore") {
            if (iArg+1 >= ARGC) {
                print "Error: -ignore requires an argument" >"/dev/stderr"
                exit(1)
            }
            filterFile = ARGV[iArg+1]
            # blank the option itself; its value is blanked at the loop end
            ARGV[iArg++] = ""
        } else if (ARGV[iArg] == "-genePredExt") {
            genePredExt = 1
        } else {
            print "Error: invalid option:",ARGV[iArg] >"/dev/stderr"
            exit(1)
        }
        ARGV[iArg++] = ""
    }
    # Compare against "" instead of testing truthiness: a command-line value
    # that looks numeric (e.g. a file named "0") would otherwise be false.
    if (filterFile != "") {
        while ((readStatus = (getline < filterFile)) > 0) {
            # ids are stored with all whitespace removed
            $0 = gensub("[[:space:]]+", "", "g", $0)
            # skip blank lines and comment lines
            if (!($0 ~ "^(#|$)")) {
                ignore[$0] = 1
            }
        }
        if (readStatus < 0) {
            # getline yields -1 when the file cannot be opened or read; fail
            # loudly rather than silently filtering nothing
            print ("Error: can't read ignore file: " filterFile) >"/dev/stderr"
            exit(1)
        }
        close(filterFile)
    }
    exonCnt = 0
}

# Writes the gene accumulated in the global variables as one genePred row.
# With -genePredExt the extended columns (score 0, name2, both completeness
# flags as "cmpl", and the frame list) are appended.
function prGene() {
    # fix bogus frame on single-exon genes that have a non-empty CDS
    if ((cdsStart < cdsEnd) && (exonCnt == 1)) {
        frames = "0,"
    }
    if (!genePredExt) {
        print name, chr, strand, txStart, txEnd, cdsStart, cdsEnd, exonCnt, eStarts, eEnds
        return
    }
    print name, chr, strand, txStart, txEnd, cdsStart, cdsEnd, exonCnt, eStarts, eEnds, 0, name2, "cmpl", "cmpl", frames
}

# Drop rows whose transcript id (column 11) is in the ignore list.  The "in"
# operator is used rather than testing ignore[$11], because merely referencing
# a missing element creates it, which would add an array entry for every
# transcript id seen in the input.
($11 in ignore) {
    next  # skip ignored gene
}

# A change of transcript id means the gene being assembled is complete, so
# output it.  Nothing is pending while name is still empty (before the first
# gene).
(name != "") && ($11 != name) {
    prGene()
}

# First row of a new transcript: reset the exon accumulators and record the
# per-gene values taken from this row.
(name != $11) {
    exonCnt = 0
    eStarts = ""
    eEnds = ""
    frames = ""
    chr = $1
    cdsStart = $3
    cdsEnd = $4
    txStart = $5
    strand = $8
    name2 = $10
    name = $11
}

# Every row that reaches this rule adds one exon to the gene in progress.
{
    exonCnt++
    eStarts = eStarts $5 ","
    eEnds = eEnds $6 ","
    # a "." in the frame column is recorded as -1
    frames = frames (($9 != ".") ? $9 : "-1") ","
    # the end of the most recent exon becomes the transcript end
    txEnd = $6
}
END {
    # flush the last gene; name stays empty when no row was accumulated
    if (name != "")
        prGene()
}
