#!/usr/bin/awk -f
#
# summarize gbSanity output
#
# Zero the total-error counter and every per-category error counter.
# Takes no arguments and returns nothing; it only assigns global variables.
# The `db` variable is deliberately left alone here.
function reset() {
    totalErr = 0;
    refLinkName = invalidPsl = pslQSizeMismatch = exonOverlap = 0;
    faNameMatch = gbStatCdnaVer = gbStatCdnaModDate = 0;
    qNameNotInIndex = noGbIndexProc = noGbIndexAligned = 0;
    typeNotMatchExpected = cdsEndNotInExon = numAlignsNotMatchGbStatus = 0;
    noPepRefSeq = noPepRefLink = emptyPepRefLink = 0;
    gbSeqNoExtFile = pepNotInExtFile = pepOffNotFaRec = 0;
    nonNmRefLink = nonNmGbSeq = nonNmNoRefLink = nonNmNoPeptide = 0;
}
# Print one summary line "<db> <desc> <cnt>" for an error category.
#   desc - label to print for the category
#   cnt  - number of occurrences; nothing is printed unless cnt > 0
# Reads the global `db` for the first output field.
function prErrInfo(desc, cnt) {
    if (!(cnt > 0))
        return;
    print db, desc, cnt;
}
# Initialise all counters before the first input line is read.
BEGIN { reset(); }

# Start-of-report marker: remember which database the following errors
# belong to.  Example input line:
#   ->gbSanity: begin: ce2: tod=2006-03-11T05:47:35
/^->gbSanity: begin: / {
    # With default field splitting $3 is the database name followed by a
    # colon (e.g. "ce2:"); remove every colon from it.  gsub() on a copy is
    # used instead of gensub(), which is a gawk-only extension, so the
    # script also runs under other awk implementations.
    db = $3;
    gsub(/:/, "", db);
}

# Lines reporting a missing psl table are dropped before anything is
# counted.  This rule must stay ahead of the totalErr rule below, because
# `next` is what keeps such lines out of the total.  Note the pattern is not
# anchored with ^, so "Error: " may appear anywhere in the line.
/Error: .* no psl table/ {
    next;  # not a real error
}

# Count every remaining line that starts with "Error: ".  There is no
# `next` here on purpose: the same line continues on to the later rules,
# which assign it to a specific category.
/^Error: / {
    totalErr++;
}

# Per-category rules.  Each rule matches one kind of "Error: " message,
# bumps the matching counter and then uses `next` to stop processing the
# line, so a line is counted in at most one category and never reaches the
# catch-all "Unknown" rule further down.  Rules are tried in file order, so
# where two patterns could match the same line the earlier one wins.

/^Error: .*(refFlat|xenoRefFlat).* geneName.*does not match refLink name/ {
    refLinkName++;
    next;
}

/^Error: invalid PSL:/ {
    invalidPsl++;
    next;
}

/^Error: .*psl.*qSize .* != .*size/ {
    pslQSizeMismatch++;
    next;
}

/^Error: .*overlaps previous exon/ {
    exonOverlap++;
    next;
}

/^Error: .* name in fasta header/ {
    faNameMatch++;
    next;
}

/^Error: .* gbStatus.version.* not same gbCdnaInfo.version/ {
    gbStatCdnaVer++;
    next;
}

/^Error: .* gbStatus.modDate .* not same gbCdnaInfo.moddate/ {
    gbStatCdnaModDate++;
    next;
}

/^Error: .* qName not in gbIndex as type/ {
    qNameNotInIndex++;
    next;
}

/^Error: .* no gbIndex processed entry for version/ {
    noGbIndexProc++;
    next;
}

# The `.` in "doesn.t" matches any single character, so the apostrophe does
# not have to be written literally in the pattern.
/^Error: .*type mrna,refseq doesn.t match expected mrna,native,refseq/ {
    typeNotMatchExpected++;  # bogus??
    next;
}
/^Error: .* cdsEnd .* not in an exon/ {
    cdsEndNotInExon++;
    next;
}

/^Error: .* no genbank gbIndex aligned object for gbStatus/ {
    noGbIndexAligned++;
    next;
}

/^Error: .*number of alignments found .*does not match expected.* from gbStatus/ {
    numAlignsNotMatchGbStatus++;
    next;
}

/^Error: .*non-NM_ mrnaAcc in refLink/ {
    nonNmRefLink++;
    next;
}

# The three rules below handle messages mentioning a YP_, NC_ or NR_
# accession.  They come before the NM_/NP_ rules that share some of the same
# message text, so a line containing both kinds of prefix is counted here.
/^Error: .* (YP_|NC_|NR_).* peptide in gbSeq not found in refLink/ {
    nonNmGbSeq++;
    next;
}
# NOTE(review): this pattern requires a space after "refLink", so a message
# that ends exactly at "refLink" would not match -- confirm against the
# actual gbSanity message text.
/^Error: .* (YP_|NC_|NR_).* not in refLink / {
    nonNmNoRefLink++;
    next; 
}

/^Error: .* (YP_|NC_|NR_).* no peptide for RefSeq/ {
    nonNmNoPeptide++;
    next; 
}

# NOTE(review): "no peptide for RefSeq" is counted as noPepRefLink while
# "not found in refLink" is counted as noPepRefSeq; the counter names look
# swapped relative to the message text -- confirm this is intended.
/^Error: .* NM_.* no peptide for RefSeq/ {
    noPepRefLink++;
    next;
}
/^Error: .* NP_.* peptide in gbSeq not found in refLink/ {
    noPepRefSeq++;
    next;
}

/^Error: .* NM_.* empty protein acc in refLink/ {
    emptyPepRefLink++;
    next;
}

/^Error: .*gbSeq.gbExtFile .* not in gbExtFile table/ {
    gbSeqNoExtFile++;
    next;
}

/^Error: .*NM_.*RefSeq peptide .* not in gbExtFile table/ {
    pepNotInExtFile++;
    next;
}

/^Error: .* gbExtFile offset .* start a fasta record/ {
    pepOffNotFaRec++;
    next;
}

# Catch-all: an "Error: " line that gets this far was not claimed by any of
# the category rules above (each of those ends with `next`).  Echo it with
# an "Unknown: " prefix so new message types are noticed.  The line has
# already been added to totalErr by the earlier /^Error: / rule.  Because
# print is given two comma-separated arguments, the output field separator
# is inserted between the prefix and the line.
/^Error: / {
    print "Unknown: ",$0;
    next;
}

# End-of-report marker: emit the summary for the current database, then
# zero the counters ready for the next report in the same input.
#
# Output is one "<db> totalErr <n>" line (always printed), followed by one
# "<db> <category> <n>" line for each category whose count is above zero
# (prErrInfo suppresses the rest), followed by an empty line.
/^<-gbSanity: completed:/ {
    print db, "totalErr", totalErr;
    prErrInfo("refLinkName", refLinkName);
    prErrInfo("invalidPsl", invalidPsl);
    prErrInfo("pslQSizeMismatch", pslQSizeMismatch);
    prErrInfo("exonOverlap", exonOverlap);
    prErrInfo("faNameMatch", faNameMatch);
    prErrInfo("gbStatCdnaVer", gbStatCdnaVer);
    prErrInfo("gbStatCdnaModDate", gbStatCdnaModDate);
    prErrInfo("qNameNotInIndex", qNameNotInIndex);
    prErrInfo("noGbIndexProc", noGbIndexProc);
    prErrInfo("typeNotMatchExpected", typeNotMatchExpected);
    prErrInfo("cdsEndNotInExon", cdsEndNotInExon);
    prErrInfo("noGbIndexAligned", noGbIndexAligned);
    prErrInfo("numAlignsNotMatchGbStatus", numAlignsNotMatchGbStatus);
    # The comma between label and count matters: without it awk
    # concatenates the two into a single argument, cnt is left unset and
    # the category is never printed.
    prErrInfo("noPepRefSeq", noPepRefSeq);
    prErrInfo("noPepRefLink", noPepRefLink);
    prErrInfo("emptyPepRefLink", emptyPepRefLink);
    prErrInfo("gbSeqNoExtFile", gbSeqNoExtFile);
    prErrInfo("pepNotInExtFile", pepNotInExtFile);
    prErrInfo("nonNmRefLink", nonNmRefLink);
    prErrInfo("nonNmGbSeq", nonNmGbSeq);
    prErrInfo("nonNmNoRefLink", nonNmNoRefLink);
    prErrInfo("nonNmNoPeptide", nonNmNoPeptide);
    prErrInfo("pepOffNotFaRec", pepOffNotFaRec);
    print "";
    reset();
}
