#!/bin/bash
usage='gencodeStats db ver

Genrate statstics on a gencode for a release to include in ENCODE reports.
ver is in the form "V7"
'
set -beEu -o pipefail

if [ $# != 2 ] ; then
    echo "wrong number of args: ${usage}" >&2
    exit 1
fi
db="$1"
ver="$2"

# generate transcript counts for a given class
reportTransCounts() {
    local transcriptClass="$1"
    local baseSel="select count(distinct attrs.transcriptId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTranscriptSource${ver} src where attrs.transcriptClass = \"${transcriptClass}\" and src.transcriptId = attrs.transcriptId"
    local baseSelPar="select count(distinct attrs.transcriptId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTag${ver} tag, wgEncodeGencodeTranscriptSource${ver} src where attrs.transcriptClass = \"${transcriptClass}\" and tag.transcriptId = attrs.transcriptId and src.transcriptId = attrs.transcriptId and tag.tag = \"PAR\""
    local cnt
    local par
    cnt=$(hgsql -Ne "${baseSel} and src.source like \"ensembl_havana%\"" ${db})
    par=$(hgsql -Ne "${baseSelPar} and src.source like \"ensembl_havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "${transcriptClass}	both manual and automatic	$cnt"
    cnt=$(hgsql -Ne "${baseSel} and src.source like \"havana%\"" ${db})
    par=$(hgsql -Ne "${baseSelPar} and src.source like \"havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "${transcriptClass}	manual only	$cnt"
    cnt=$(hgsql -Ne "${baseSel} and src.source not like \"%havana%\"" ${db})
    par=$(hgsql -Ne "${baseSelPar} and src.source not like \"%havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "${transcriptClass}	automatic only	$cnt"
}
echo "Transcript report"
echo "transcriptClass	source	count"
for transcriptClass in $(hgsql -Ne "select distinct(transcriptClass) from wgEncodeGencodeAttrs${ver}" ${db}) ; do
   reportTransCounts ${transcriptClass}
done
cnt=$(hgsql -Ne "select count(distinct attrs.transcriptId) from wgEncodeGencodeAttrs${ver} attrs" ${db})
par=$(hgsql -Ne "select count(distinct transcriptId) from wgEncodeGencodeTag${ver} where tag = \"PAR\"" ${db})
cnt=$(( $cnt+$par ))
echo "All	All	$cnt"

# generate gene stats
reportGeneCounts() {
    local baseSel="select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeGeneSource${ver} src where src.geneId = attrs.geneId"
    local baseSelPar="select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTag${ver} tag, wgEncodeGencodeGeneSource${ver} src where tag.transcriptId = attrs.transcriptId and src.geneId = attrs.geneId and tag.tag = \"PAR\""
    local cnt
    local par
    cnt=$(hgsql -Ne "${baseSel} and src.source like \"ensembl_havana%\"" ${db})
    par=$(hgsql -Ne "${baseSelPar} and src.source like \"ensembl_havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "both manual and automatic	$cnt"
    cnt=$(hgsql -Ne "${baseSel} and src.source like \"havana%\"" ${db})
    par=$(hgsql -Ne "${baseSelPar} and src.source like \"havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "manual only	$cnt"
    cnt=$(hgsql -Ne "${baseSel} and src.source not like \"%havana%\"" ${db})
    par=$(hgsql -Ne "${baseSelPar} and src.source not like \"%havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "automatic only	$cnt"
    type="PseudoGene"
    cnt=$(hgsql -Ne "${baseSel} and attrs.geneType like \"%pseudogene\" and src.source like \"ensembl_havana%\"" ${db}) 
    par=$(hgsql -Ne "${baseSelPar} and attrs.geneType like \"%pseudogene\"and src.source like \"ensembl_havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "pseudo	both manual and automatic	$cnt"
    cnt=$(hgsql -Ne "${baseSel} and geneType like \"%pseudogene\" and src.source like \"havana%\"" ${db}) 
    par=$(hgsql -Ne "${baseSelPar} and attrs.geneType like \"%pseudogene\" and src.source like \"havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "pseudo	manual only	$cnt"
    cnt=$(hgsql -Ne "${baseSel} and geneType like \"%pseudogene\" and src.source not like \"%havana%\"" ${db}) 
    par=$(hgsql -Ne "${baseSelPar} and attrs.geneType like \"%pseudogene\" and src.source not like \"%havana%\"" ${db})
    cnt=$(( $cnt+$par ))
    echo "pseudo	automatic only	$cnt"
}
echo ""
echo "Gene report"
echo "source	count"
reportGeneCounts
cnt=$(hgsql -Ne "select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs" ${db})
par=$(hgsql -Ne "select count(distinct attrs.geneId) from wgEncodeGencodeAttrs${ver} attrs, wgEncodeGencodeTag${ver} tag where attrs.transcriptId = tag.transcriptId and tag.tag = \"PAR\"" ${db})
cnt=$(( $cnt+$par ))
echo "All	$cnt"
cnt=$(hgsql -Ne "select count(distinct name) from wgEncodeGencode2wayConsPseudo${ver};" ${db})
echo "2-way pseudogenes	$cnt"
