
# download and load ncbiGene track

# Assembly database name; reused by the hgsql / hgLoad* commands further down.
db=danRer10
# Create the working directory for this track and switch into it.
# -p makes a re-run succeed when the directory already exists; the cd guard
# stops the script rather than letting later steps write into the wrong place.
mkdir -p "/cluster/data/genomes/$db/bed/ncbiGene"
cd "/cluster/data/genomes/$db/bed/ncbiGene" || exit 1

# Remote GFF3 annotation file and its local name after download.
ftpFile=ftp://ftp.ncbi.nlm.nih.gov/genomes/D_rerio/GFF/ref_GRCz10_top_level.gff3.gz
gff3File=$(basename "$ftpFile")

# Write the lift spec consumed by liftUp below: dump ucscToRefSeq, skip the
# first output line, and emit "0 col4 col3 col1 col3" per row.
# NOTE(review): presumably col1 = UCSC name, col3 = sequence size,
# col4 = RefSeq name -- confirm against the ucscToRefSeq schema.
echo "select * from ucscToRefSeq" \
  | hgsql "$db" \
  | tail -n +2 \
  | awk '{print 0, $4, $3, $1, $3}' > refSeqToUcsc.lft

# Download the annotation into the current directory.
wget "$ftpFile"

# Convert the GFF3 to genePred on stdout and pipe it straight into liftUp,
# which applies refSeqToUcsc.lft and writes lift.gp.
# Side outputs: attrs (from -attrsOut), bad.gp (from -bad), and the two stderr
# logs convertErr.txt / liftErr.txt, which are inspected further down.
/cluster/home/braney/bin/x86_64/gff3ToGenePred \
    -useName -warnAndContinue -attrsOut=attrs -bad=bad.gp \
    "$gff3File" stdout 2> convertErr.txt \
  | liftUp -type=.gp -extGenePred lift.gp refSeqToUcsc.lft warn stdin \
    2> liftErr.txt

# Recorded line counts from the original run.
wc -l lift.gp
# 54816 lift.gp
wc -l bad.gp
# 0 bad.gp

# Seed meta with the sorted, de-duplicated first column of attrs.
# NOTE(review): tawk is presumably awk with tab field separators -- confirm.
tawk '{print $1}' attrs | sort | uniq > meta
wc -l meta
# 63880 meta

# Add one attribute per pass. Each pass selects (col1, col3) pairs from attrs
# whose second column equals the attribute name and joins them onto meta by the
# first column. join keeps only keys present in both inputs, so an id that
# lacks any of the four attributes drops out of meta. The new value is placed
# right after the key, so the final column order is:
#   id, gbkey, gene, Dbxref, product
for attrName in product Dbxref gene gbkey; do
    echo "$attrName"
    tawk -v attr="$attrName" '$2==attr {print $1,$3}' attrs \
      | sort \
      | uniq \
      | join -t $'\t' - meta > out
    mv out meta
done
wc -l meta
# 63745 meta

# Split the lifted gene predictions by accession prefix. This has to happen
# before the coverage check below, which reads both output files.
#   NM / NR / NP -> curated.gp
#   XM / XR      -> predicted.gp
grep -E "^N(M|R|P)" lift.gp > curated.gp
grep -E "^X(M|R)" lift.gp > predicted.gp

wc -l curated.gp predicted.gp
#       15143 curated.gp
#       39686 predicted.gp
#       54829 total
# NOTE(review): the recorded total (54829) exceeds the recorded lift.gp line
# count (54816); the recorded numbers may come from different runs -- confirm.

# Coverage check: list ids present in the gene predictions but absent from
# meta. The expected count is 0, i.e. every row to be loaded has metadata.
awk '{print $1}' curated.gp predicted.gp | sort -u > tmp1
awk '{print $1}' meta | sort -u > tmp2
join -v 1 tmp1 tmp2 | wc -l
# 0

# Number of lines in the converter's stderr log that mention "dropping".
grep -c dropping convertErr.txt
#    0

# Distinct first fields of liftUp stderr lines that contain "isn".
awk '/isn/ { print $1 }' liftErr.txt \
  | sort -u
# nothing

# $kent is not assigned anywhere in this script, so it must come from the
# environment. Fail before loading anything if it is missing; otherwise the
# .sql path below would silently collapse to /src/hg/lib/ncbiRefLink.sql.
: "${kent:?kent must be set (used to locate src/hg/lib/ncbiRefLink.sql)}"

# Load the two genePred sets and the per-transcript metadata into $db.
hgLoadGenePred -genePredExt "$db" ncbiRefCurated curated.gp
hgLoadGenePred -genePredExt "$db" ncbiRefPredicted predicted.gp
hgLoadSqlTab "$db" ncbiRefLink "$kent/src/hg/lib/ncbiRefLink.sql" meta

# Record this track build in the trackVersion table of the hgFixed database.
# Values map to columns in order: db, name, who, version, updateTime (now()),
# comment (annotation release URL), source (FTP directory), dateReference.
# The statement is single-quoted, so the shell expands nothing inside it:
# the assembly is spelled out as "danRer10" instead of using $db, and the
# trailing backslashes reach hgsql literally.
# NOTE(review): version/date are specific to this release -- update on re-run.
hgsql -e 'INSERT INTO trackVersion \
    (db, name, who, version, updateTime, comment, source, dateReference) \
    VALUES("danRer10", "ncbiRefSeq", "braney", "104", now(), \
    "http://www.ncbi.nlm.nih.gov/genome/annotation_euk/Danio_rerio/104/", \
    "ftp://ftp.ncbi.nlm.nih.gov/genomes/D_rerio", \
    "24 September 2014" );' hgFixed
