# for emacs: -*- mode: sh; -*-

# rock hyrax ( Procavia capensis )

#	http://www.ncbi.nlm.nih.gov/genome/2488
#	http://www.ncbi.nlm.nih.gov/bioproject/13972
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRQ00

#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    ssh kkstore05
    mkdir /cluster/store12/proCap1
    ln -s /cluster/store12/proCap1 /cluster/data
    mkdir /cluster/data/proCap1/broad
    cd /cluster/data/proCap1/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/Procap1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/Procap1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/Procap1.0/assembly.quals.gz 
    md5sum ass* > assembly.md5sum

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin proCap1.qual.qac

   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 295006

#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07)
    ssh kolossus
    cd /cluster/data/proCap1
cat << _EOF_ >proCap1.config.ra
# Config parameters for makeGenomeDb.pl:
db proCap1
clade mammal
genomeCladePriority 35
scientificName  Procavia capensis 
commonName Rock hyrax
assemblyDate Jul. 2008
assemblyLabel Broad Institute proCap1 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/proCap1/broad/assembly.bases.gz
agpFiles /cluster/data/proCap1/broad/assembly.agp
qualFiles /cluster/data/proCap1/broad/proCap1.qual.qac
dbDbSpeciesDir rockHyrax
_EOF_

    makeGenomeDb.pl -workhorse kolossus proCap1.config.ra > makeGenomeDb.out 2>&1 &

    cut -f 2 chrom.sizes | ave stdin
# Q1 1549.000000
# median 4249.000000
# Q3 11283.000000
# average 10119.316214
# min 315.000000
# max 390036.000000
# count 295006
# total 2985258999.000000
# standard deviation 16728.518926

#########################################################################
## Repeat Masker (DONE - 2008-10-16 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/proCap1/bed/repeatMasker
    cd /hive/data/genomes/proCap1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` proCap1 > do.log 2>&1 &
    #	real    867m49.502s
    twoBitToFa proCap1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 2985258999 bases (577411318 N's 2407847681 real 1731549681 upper 676298000
# lower) in 295006 sequences in 1 files
# %22.65 masked total, %28.09 masked rea

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-16 - Hiram)
    screen # use a screen to manage this job
    mkdir /cluster/data/proCap1/bed/simpleRepeat
    cd /cluster/data/proCap1/bed/simpleRepeat
    # 
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/proCap1/bed/simpleRepeat proCap1 > do.log 2>&1 &
    #	real    67m53.007s
    #	batch failed on pk, finish it manually
    #	this was an unusual error, when TrfRun.csh was splitting up the .lft
    #	file, it ended up with a single file in the last split, and that
    #	single file had no results for it, so the lift of that non-existent
    #	fail failed.  Temporarily changed, in TrfRun.csh:
    #	split -l 500 $inLft SplitLft.
    #	to be:
    #	split -l 400 $inLft SplitLft.
    #	and ran TrfRun.csh manually:
    time ./TrfRun.csh /san/sanvol1/scratch/proCap1/TrfPart/039/039.lst.bed
    #	now, continuing:
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-continue=filter \
	-buildDir=/cluster/data/proCap1/bed/simpleRepeat proCap1 \
	> filter.log 2>&1 &

    cat fb.simpleRepeat
    #	24599087 bases of 2407847681 (1.022%) in intersection

    #	after RM run is done, add this mask:
    cd /hive/data/genomes/proCap1
    rm proCap1.2bit
    twoBitMask proCap1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed proCap1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa proCap1.2bit stdout | faSize stdin > proCap1.2bit.faSize.txt
# 2985258999 bases (577411318 N's 2407847681 real 1731119427 upper 676728254
# lower) in 295006 sequences in 1 files
# %22.67 masked total, %28.11 masked rea

    #	link to gbdb
    ln -s `pwd`/proCap1.2bit /gbdb/proCap1

###########################################################################
# prepare for kluster runs (DONE 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	proCap1: 2407847681
    # thus: 1024 * 2407847681/2881421696 = 855
    #	rounding up to 900 to be conservative
    cd /hive/data/genomes/proCap1
    time blat proCap1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=proCap1.11.ooc -repMatch=900
    #	Wrote 30662 overused 11-mers to proCap1.11.ooc
    #	real    2m12.136s


    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/proCap1
    cp -p proCap1.2bit chrom.sizes proCap1.11.ooc \
	/hive/data/staging/data/proCap1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute proCap1 (NCBI project 13972, ABRQ01000000)" where name="proCap1";' hgcentraltest

############################################################################
#  proCap1 - Rock hyrax - Ensembl Genes version 51  (DONE - 2008-12-02 -
#  hiram)
    ssh hgwdev
    cd /hive/data/genomes/proCap1
    cat << '_EOF_' > proCap1.ensGene.ra
# required db variable
db proCap1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#       names that are not in the UCSC assembly
skipInvalid yes
# ignore the two genes that have invalid structures from Ensembl:
# 4595: ENSPCAT00000007286 no exonFrame on CDS exon 1
# 28894: ENSPCAT00000000699 no exonFrame on CDS exon 4
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 proCap1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/proCap1/bed/ensGene.51
    featureBits proCap1 ensGene
    # 25217399 bases of 2407847681 (1.047%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/proCap1/bed/ensGene.51

###########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/proCap1/bed/cpgIslands
    cd /hive/data/genomes/proCap1/bed/cpgIslands
    time doCpgIslands.pl proCap1 > do.log 2>&1
    #   real    884m46.543s
    # fixing broken command in script:
    time ./doLoadCpg.csh
    #   real    3m6.649s
    time doCpgIslands.pl -continue=cleanup proCap1 > cleanup.log 2>&1
    #   real    105m41.557s

    cat fb.proCap1.cpgIslandExt.txt
    #   13143269 bases of 2407847681 (0.546%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/proCap1/bed/genscan
    cd /hive/data/genomes/proCap1/bed/genscan
    time doGenscan.pl proCap1 > do.log 2>&1
    # recovering from power failure, kluster run had just finished
    # and it had just started on makeBed:
    time ./doMakeBed.csh
    #   real    483m14.978s
    time doGenscan.pl -continue=load proCap1 > load.log 2>&1
    #   real    39m19.361s

    cat fb.proCap1.genscan.txt
    #   66421419 bases of 2407847681 (2.759%) in intersection
    cat fb.proCap1.genscanSubopt.txt
    #   74279950 bases of 2407847681 (3.085%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    screen -S proCap1
    mkdir /hive/data/genomes/proCap1/bed/windowMasker
    cd /hive/data/genomes/proCap1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev proCap1 > do.log 2>&1
    #   real    1091m9.829s
    # fixing a broken command in the doLoad step:
    time ./lastLoad.csh
    #   real    11m17.751s
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \
        -dbHost=hgwdev proCap1 > cleanup.log 2>&1
    #   real    2m21.973s

    sed -e 's/^/    #\t/' fb.proCap1.windowmaskerSdust.beforeClean.txt
    #   1550951798 bases of 2985258999 (51.954%) in intersection
    sed -e 's/^/    #\t/' fb.proCap1.windowmaskerSdust.clean.txt
    #   973540480 bases of 2985258999 (32.612%) in intersection
    sed -e 's/^/    #\t/' fb.proCap1.rmsk.windowmaskerSdust.txt
    #   389997236 bases of 2985258999 (13.064%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2407847681 / 2897316137 \) \* 1024
    #	( 2407847681 / 2897316137 ) * 1024 = 851.006900

    # round up to 900

    cd /hive/data/genomes/proCap1
    time blat proCap1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/proCap1.11.ooc -repMatch=900
    #   Wrote 30662 overused 11-mers to jkStuff/proCap1.11.ooc
    #   real    0m50.748s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" proCap1 | sort | uniq -c
    #   552392 yes
#    cd /hive/data/genomes/proCap1/jkStuff
#    gapToLift proCap1 proCap1.nonBridged.lift -bedFile=proCap1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' proCap1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Procavia capensis not found on the list
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add proCap1 just after ce2
# proCap1 (Rock hyrax)
proCap1.serverGenome = /hive/data/genomes/proCap1/proCap1.2bit
proCap1.clusterGenome = /hive/data/genomes/proCap1/proCap1.2bit
proCap1.ooc = /hive/data/genomes/proCap1/jkStuff/proCap1.11.ooc
proCap1.lift = no
proCap1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
proCap1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
proCap1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
proCap1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
proCap1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
proCap1.refseq.mrna.native.load = no
proCap1.refseq.mrna.xeno.load = yes
proCap1.genbank.mrna.xeno.load = yes
proCap1.genbank.est.native.load = no
proCap1.downloadDir = proCap1
proCap1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding proCap1 Rock hyrax" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for proCapNames" \
        src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S proCap1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial proCap1 &
    #   var/build/logs/2012.06.08-09:53:54.proCap1.initalign.log
    #   real    2549m58.923s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad proCap1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.11-13:27:43.dbload.log
    #   real    8m41.825s

    # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add proCap1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added proCap1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_15515:19471-24894" where name="proCap1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-24 - Hiram)
    mkdir /hive/data/genomes/proCap1/pushQ
    cd /hive/data/genomes/proCap1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl proCap1 2> stderr.txt | grep -v transMap > proCap1.sql
    #   real    3m42.061s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/proCap1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/proCap1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/proCap1/bbi/quality.bw
# WARNING: proCap1 does not have seq
# WARNING: proCap1 does not have extFile
# WARNING: proCap1 does not have estOrientInfo

    scp -p proCap1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/proCap1.sql"

############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
