# downloaded from ftp://ftp.sanger.ac.uk/pub/gencode/gencode.4.DCC.tgz
# on 2010-05-07
cd /hive/groups/encode/dcc/data/gencodeV4

# Create empty V4 tables modelled on the corresponding V3 tables.
# "create table ... like" copies the indexes along with the column
# definitions, whereas "create table ... select ... limit 0" copies only the
# columns.  This matters most for wgEncodeGencodePolyaV4, which is only ever
# loaded with -oldTable further down and so keeps the schema it is given here.
# The SQL is piped into hgsql (same pattern as the LOAD DATA commands that
# follow) so the step can be pasted as-is rather than typed into an
# interactive hgsql session.
for tableKind in Auto Classes Manual Polya
do
    echo "create table wgEncodeGencode${tableKind}V4 like wgEncodeGencode${tableKind}V3;" | hgsql hg19
done

# Build the classes table by appending each .classes file in turn.
# Note that the 2-way consensus pseudogene file name still says v3.
for classesFile in \
    gencode_v4.annotation.GRCh37.level_1_2.classes \
    gencode_v4.annotation.GRCh37.level_3.classes \
    gencode.v4.tRNAs.GRCh37.classes \
    gencode.v4.polyAs.GRCh37.classes \
    gencode.v3.2wayconspseudos.GRCh37.classes
do
    echo "LOAD DATA LOCAL INFILE '$classesFile' into table wgEncodeGencodeClassesV4" | hgsql hg19
done

# Manual annotation (levels 1 and 2) -> wgEncodeGencodeManualV4.
# Loader messages are captured in reload_manual.out.
ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeManualV4 \
    gencode_v4.annotation.GRCh37.level_1_2.gtf \
    > reload_manual.out 2>&1
#   Reading gencode_v4.annotation.GRCh37.level_1_2.gtf
#   Read 101466 transcripts in 1322945 lines in 1 files
#     101466 groups 24 seqs 1 sources 6 feature types
#     101466 gene predictions

# Automatic annotation (level 3) -> wgEncodeGencodeAutoV4.
# Loader messages are captured in reload_auto_level3.out.
ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeAutoV4 \
    gencode_v4.annotation.GRCh37.level_3.gtf \
    > reload_auto_level3.out 2>&1
#   Read 41171 transcripts in 811166 lines in 1 files
#     41171 groups 25 seqs 1 sources 6 feature types
#     41171 gene predictions

# tRNAs go into the same Auto table.  The features in this GTF are labelled
# "tRNAscan", hence -exon=tRNAscan; -oldTable loads into the existing table.
# Loader messages are captured in reload_auto_level_tRNAs.out.
ldHgGene -exon=tRNAscan -genePredExt -oldTable hg19 wgEncodeGencodeAutoV4 \
    gencode.v4.tRNAs.GRCh37.gtf \
    > reload_auto_level_tRNAs.out 2>&1
#   Reading gencode.v4.tRNAs.GRCh37.gtf
#   Read 622 transcripts in 622 lines in 1 files
#     622 groups 24 seqs 1 sources 1 feature types
#     622 gene predictions

# Split the polyA GTF into one file per feature type.  Every per-type file
# starts with the lines that mention none of the three feature names (the
# "header"), followed by the lines matching that feature name.
polyaGtf=gencode.v4.polyAs.GRCh37.gtf
polyaHeader=gencode.v4.polyAs.GRCh37.header.gtf
# Each entry is <output file suffix>:<feature name to grep for>.
polyaSets="signal:polyA_signal pseudo:pseudo_polyA site:polyA_site"

grep -v -e polyA_signal -e pseudo_polyA -e polyA_site $polyaGtf > $polyaHeader

for polyaSet in $polyaSets
do
    suffix=${polyaSet%%:*}
    feature=${polyaSet#*:}
    cp $polyaHeader gencode.v4.polyAs.GRCh37.$suffix.gtf
    grep $feature $polyaGtf >> gencode.v4.polyAs.GRCh37.$suffix.gtf
done

# Load the three per-type files into the one PolyA table, in the order
# signal, pseudo, site.  All loader messages end up in load_polyA.out.
for polyaSet in $polyaSets
do
    suffix=${polyaSet%%:*}
    feature=${polyaSet#*:}
    ldHgGene -exon=$feature -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 gencode.v4.polyAs.GRCh37.$suffix.gtf
done > load_polyA.out 2>&1



###  2010-07-02 Download updated V4 files from FelixK (braney)
# Fetch the corrected ("v4fix") V4 annotation tarball into the V4 data
# directory.
cd /hive/groups/encode/dcc/data/gencodeV4
wget "ftp://ftp.sanger.ac.uk/pub/gencode/other_data/gencode_v4fixed.tgz"
# Recorded listing of the two GTF files in the tarball:
# -rw-r--r-- fsk/ensembl 247107660 2010-07-02 08:26:21 gencode_v4fix.annotation.GRCh37.level_3.gtf
# -rw-r--r-- fsk/ensembl 486686715 2010-07-02 08:26:39 gencode_v4fix.annotation.GRCh37.level_1_2.gtf

#### 2010-07-29 trying to get this sorted (braney)
# Start over from a clean directory: park everything that is currently here
# (the original V4 files and the tarball fetched on 2010-07-02) under old/,
# then download and unpack the fixed tarball again.
mkdir old
# NOTE(review): the glob also matches old/ itself, so mv is expected to
# complain about that one entry while still moving everything else.
mv * old
wget "ftp://ftp.sanger.ac.uk/pub/gencode/other_data/gencode_v4fixed.tgz"
tar xvfz gencode_v4fixed.tgz
# Files extracted (tar xv output):
# gencode_v4fix.annotation.GRCh37.level_3.gtf
# gencode_v4fix.annotation.GRCh37.level_1_2.gtf

# Reload the Manual and Auto tables from the fixed (v4fix) GTF files.

# Manual annotation (levels 1 and 2).
ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeManualV4 \
    gencode_v4fix.annotation.GRCh37.level_1_2.gtf
#   Reading gencode_v4fix.annotation.GRCh37.level_1_2.gtf
#   Read 101466 transcripts in 1322945 lines in 1 files
#     101466 groups 24 seqs 1 sources 6 feature types
#     101466 gene predictions

# Automatic annotation (level 3).
ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeAutoV4 \
    gencode_v4fix.annotation.GRCh37.level_3.gtf
#   Reading gencode_v4fix.annotation.GRCh37.level_3.gtf
#   Read 41171 transcripts in 811166 lines in 1 files
#     41171 groups 25 seqs 1 sources 6 feature types
#     41171 gene predictions

# The tRNA GTF is not part of the fixed tarball; bring the original copy back
# from old/ and load it into the Auto table (-oldTable: use the existing table).
cp old/gencode.v4.tRNAs.GRCh37.gtf .
ldHgGene -exon=tRNAscan -genePredExt -oldTable hg19 wgEncodeGencodeAutoV4 \
    gencode.v4.tRNAs.GRCh37.gtf
#   Reading gencode.v4.tRNAs.GRCh37.gtf
#   Read 622 transcripts in 622 lines in 1 files
#     622 groups 24 seqs 1 sources 1 feature types
#     622 gene predictions

# Refresh the download files.  The Auto download is the tRNA GTF followed by
# the level 3 GTF; the Manual download is the level 1/2 GTF on its own.
gencodeDownloads=/hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeGencode
cat gencode.v4.tRNAs.GRCh37.gtf gencode_v4fix.annotation.GRCh37.level_3.gtf \
    | gzip -c > $gencodeDownloads/wgEncodeGencodeAutoV4.gtf.gz
cat gencode_v4fix.annotation.GRCh37.level_1_2.gtf \
    | gzip -c > $gencodeDownloads/wgEncodeGencodeManualV4.gtf.gz

# Fetch the Yale pseudogene mappings, load them as a table and publish the
# same file as a download.
yaleTsv=wgEncodeGencodePgenes.v4.Human58.tsv
wget "http://homes.gersteinlab.org/people/yhl3/pgenes/gencode/$yaleTsv"

# The following entries were added to the trackDb for wgEncodeGencode:
#   yalePseudoAssoc wgEncodeGencodeYalePseudoV4
#   yaleUrl http://tables.pseudogene.org/

hgLoadSqlTab hg19 wgEncodeGencodeYalePseudoV4 ~/kent/src/hg/lib/yaleGencodeAssoc.sql $yaleTsv
cat $yaleTsv \
    | gzip -c > /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeGencode/wgEncodeGencodeYalePseudoV4.tsv.gz

# Load the 2-way consensus pseudogenes.  The GTF is the v3 file, moved back
# out of old/; its features are labelled "transcript", hence -exon=transcript.
pseudoGtf=gencode.v3.2wayconspseudos.GRCh37.gtf
mv old/$pseudoGtf .
ldHgGene -exon=transcript -noncoding -genePredExt hg19 wgEncodeGencode2wayConsPseudo $pseudoGtf

# The table was loaded without a version suffix; rename it to the V4 name.
echo "rename table wgEncodeGencode2wayConsPseudo to wgEncodeGencode2wayConsPseudoV4;" | hgsql hg19

# Publish the same GTF as the V4 download.
cat $pseudoGtf \
    | gzip -c > /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeGencode/wgEncodeGencode2wayConsPseudoV4.gtf.gz

# Build metadata: print one metaObject stanza per V4 table to stdout, each
# followed by a blank line.
# The list uses "Polya" (lower-case a) so that the generated name matches the
# table that was actually created and loaded, wgEncodeGencodePolyaV4.
for i in YalePseudo 2wayConsPseudo Manual Auto Classes Polya
do
    table="wgEncodeGencode${i}V4"

    # The Yale pseudogene download was written as .tsv.gz; everything else
    # keeps the .gtf.gz file name.
    case $i in
        YalePseudo) ext=tsv ;;
        *)          ext=gtf ;;
    esac

    echo "metaObject $table"
    echo "objType table"
    echo "annotation Automatic"
    echo "composite wgEncodeGencode"
    echo "dataType Gencode"
    echo "dataVersion ENCODE May 2010 Freeze"
    echo "dateSubmitted 2010-05-01"
    echo "dateUnrestricted 2011-02-01"
    echo "fileName $table.$ext.gz"
    echo "grant Hubbard"
    echo "lab Sanger"
    echo "level 3"
    echo "project wgEncode"
    echo "submittedDataVersion V4"
    echo "tableName $table"
    echo
done

#  Reload the "right" way, so that the gene_name in the GTF ends up in the
#  name2 column of the genePred.  Each step converts the GTF to a .gp file
#  with gtfToGenePred and loads that with hgLoadGenePred; the ldHgGene
#  command it replaces is kept as a comment.

# Manual table.  Replaces:
# ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeManualV4 gencode_v4fix.annotation.GRCh37.level_1_2.gtf
# The sed rewrites the first "ENSTR" on each output line to "ENST0".
gtfToGenePred -genePredExt -geneNameAsName2 \
    gencode_v4fix.annotation.GRCh37.level_1_2.gtf stdout \
    | sed 's/ENSTR/ENST0/' \
    > gencode_v4fix.annotation.GRCh37.level_1_2.gp
hgLoadGenePred -genePredExt hg19 wgEncodeGencodeManualV4 \
    gencode_v4fix.annotation.GRCh37.level_1_2.gp

# 2-way consensus pseudogene table.  Replaces:
# ldHgGene -exon=transcript -noncoding -genePredExt hg19 wgEncodeGencode2wayConsPseudo gencode.v3.2wayconspseudos.GRCh37.gtf
# tawk relabels "transcript" features as "exon" before the conversion.
tawk '$3=="transcript"{$3="exon"}{print $0}' gencode.v3.2wayconspseudos.GRCh37.gtf \
    | gtfToGenePred -genePredExt stdin gencode.v3.2wayconspseudos.GRCh37.gp
hgLoadGenePred -genePredExt hg19 wgEncodeGencode2wayConsPseudoV4 \
    gencode.v3.2wayconspseudos.GRCh37.gp

# Auto table (level 3 annotation plus tRNAs).  Replaces:
# ldHgGene -gtf -genePredExt hg19 wgEncodeGencodeAutoV4 gencode_v4fix.annotation.GRCh37.level_3.gtf
# ldHgGene  -exon=tRNAscan  -genePredExt -oldTable hg19 wgEncodeGencodeAutoV4 gencode.v4.tRNAs.GRCh37.gtf
gtfToGenePred -genePredExt -geneNameAsName2 \
    gencode_v4fix.annotation.GRCh37.level_3.gtf stdout \
    | sed 's/ENSTR/ENST0/' \
    > gencode_v4fix.annotation.GRCh37.level_3.gp
# tawk relabels "tRNAscan" features as "exon" before the conversion.
tawk '$3=="tRNAscan"{$3="exon"}{print $0}' gencode.v4.tRNAs.GRCh37.gtf \
    | gtfToGenePred -genePredExt stdin gencode.v4.tRNAs.GRCh37.gp
# Both .gp files are loaded together in a single hgLoadGenePred run.
cat gencode_v4fix.annotation.GRCh37.level_3.gp gencode.v4.tRNAs.GRCh37.gp \
    | hgLoadGenePred -genePredExt hg19 wgEncodeGencodeAutoV4 stdin




# ldHgGene -exon=polyA_signal -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 gencode.v4.polyAs.GRCh37.signal.gtf > load_polyA.out 2>&1
# ldHgGene -exon=pseudo_polyA -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 gencode.v4.polyAs.GRCh37.pseudo.gtf >> load_polyA.out 2>&1
# ldHgGene -exon=polyA_site -genePredExt -noncoding -oldTable hg19 wgEncodeGencodePolyaV4 gencode.v4.polyAs.GRCh37.site.gtf >> load_polyA.out 2>&1 

 #tawk '$3=="polyA_signal"{$3="exon"}{print $0}' gencode.v4.polyAs.GRCh37.signal.gtf | gtfToGenePred -genePredExt stdin  gencode.v4.polyAs.GRCh37.signal.gp


# Did not reload the PolyA table: its names are garbage, and in V3 name and name2 were identical in every row.

# Rebuild the classes table in one pass with hgLoadSqlTab.  The earlier
# per-file LOAD DATA commands are kept here for reference:
# echo "LOAD DATA LOCAL INFILE 'gencode_v4.annotation.GRCh37.level_1_2.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode_v4.annotation.GRCh37.level_3.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode.v4.tRNAs.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode.v4.polyAs.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19
# echo "LOAD DATA LOCAL INFILE 'gencode.v3.2wayconspseudos.GRCh37.classes' into table wgEncodeGencodeClassesV4" | hgsql hg19

# NOTE(review): these .classes files were moved into old/ by the earlier
# "mv * old" and no step here brings them back -- confirm which directory
# this was run from.
cat \
    gencode_v4.annotation.GRCh37.level_1_2.classes \
    gencode_v4.annotation.GRCh37.level_3.classes \
    gencode.v4.tRNAs.GRCh37.classes \
    gencode.v4.polyAs.GRCh37.classes \
    gencode.v3.2wayconspseudos.GRCh37.classes \
    | hgLoadSqlTab hg19 wgEncodeGencodeClassesV4 ~/kent/src/hg/lib/gencodeGeneClass.sql stdin

