# for emacs: -*- mode: sh; -*-

# This file describes browser build for the speTri2
#	Spermophilus tridecemlineatus - Squirrel genome: November 2011
#	new name: Ictidomys tridecemlineatus (thirteen-lined ground squirrel)

#	http://www.ncbi.nlm.nih.gov/genome/assembly/317808/
#	http://www.ncbi.nlm.nih.gov/bioproject/61725
#	http://www.ncbi.nlm.nih.gov/genome/472
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AGTP01
#	Genome Coverage : 495.1x

#############################################################################
# Fetch sequence from genbank (DONE - 2012-01-06 - Hiram)

    mkdir -p /hive/data/genomes/speTri2/genbank
    cd /hive/data/genomes/speTri2/genbank

    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Spermophilus_tridecemlineatus/SpeTri2.0/ ./

    # measure total sequence in this assembly
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz

# 2478393770 bases (167333470 N's 2311060300 real 2311060300 upper 0 lower) in
# 12483 sequences in 1 files
# Total size: mean 198541.5 sd 1542819.5
#	min 1000 (gi|358076641|gb|AGTP01153276.1|)
#	max 58279134 (gi|358364222|gb|JH393279.1|) median 3276

#############################################################################
# process into UCSC naming scheme (DONE - 2012-03-14 - Hiram)
    cd /hive/data/genomes/speTri2/genbank
    # watch out for the pattern match below: s/.*gb\|//;
    #	depends upon what string is in the header of the fasta file
    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;    
        printf UC "%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    #	real    0m32.848s

    # compress these files
    time gzip *.fa *.agp
    #	real    11m17.120s

    # verify nothing has changed in the sequence, should be the same as above
    #	with just the names changed:
    faSize unplaced.fa.gz
# 2478393770 bases (167333470 N's 2311060300 real 2311060300 upper 0 lower)
#	in 12483 sequences in 1 files
# Total size: mean 198541.5 sd 1542819.5 min 1000 (AGTP01153276)
#	max 58279134 (JH393279) median 3276


speTri1 Feb. 2008 (Broad/speTri1)       /gbdb/speTri1   Squirrel        scaffold_113960:498395-508394   1       1910    Squirrel        Spermophilus tridecemlineatus   /gbdb/speTri1/html/description.html     0       0       Broad Institute speTri1 43179

#############################################################################
#  Initial database build (DONE - 2012-03-14 - Hiram)
    cd /hive/data/genomes/speTri2
    cat << '_EOF_' > speTri2.config.ra
# Config parameters for makeGenomeDb.pl:
db speTri2
clade mammal
genomeCladePriority 35
scientificName Spermophilus tridecemlineatus
commonName Squirrel
assemblyDate Nov. 2011
assemblyLabel Broad Institute (GCA_000236235.1)
assemblyShortLabel Broad SpeTri2
orderKey 1909
mitoAcc none
fastaFiles /hive/data/genomes/speTri2/genbank/unplaced.fa.gz
agpFiles /hive/data/genomes/speTri2/genbank/unplaced.agp.gz
dbDbSpeciesDir squirrel
taxId   43179
ncbiAssemblyId 317808
ncbiAssemblyName SpeTri2.0
'_EOF_'
    # << happy emacs

    # first verify the sequence and AGP files are OK
    time makeGenomeDb.pl -stop=agp -workhorse=hgwdev speTri2.config.ra \
	> agp.log 2>&1
    #	real    1m55.580s
    # verify that was OK, look at the agp.log file
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev speTri2.config.ra \
	> db.log 2>&1
    #	real    17m23.012s

    # verify that was OK, look at the do.log file
    # copy the trackDb business to the source tree, check it in and add
    #	to the trackDb/makefile

#############################################################################
# running repeat masker (DONE - 2012-03-14 - Hiram)
    # construct a screen to keep this process under
    screen -S speTri2RM
    mkdir /hive/data/genomes/speTri2/bed/repeatMasker
    cd /hive/data/genomes/speTri2/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek speTri2 > do.log 2>&1 &
    #	real    444m26.325s
    cat faSize.rmsk.txt
    #	2478393770 bases (167333470 N's 2311060300 real 1473623875 upper
    #	837436425 lower) in 12483 sequences in 1 files
    #	Total size: mean 198541.5 sd 1542819.5 min 1000 (AGTP01153276)
    #	max 58279134 (JH393279) median 3276
    #	%33.79 masked total, %36.24 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    time featureBits -countGaps speTri2 rmsk
    #	839787873 bases of 2478393770 (33.884%) in intersection
    #	real    0m28.253s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-03-14 - Hiram)
    mkdir /hive/data/genomes/speTri2/bed/simpleRepeat
    cd /hive/data/genomes/speTri2/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	speTri2 > do.log 2>&1 &
    #	real    27m3.355s
    cat fb.simpleRepeat 
    #	40145040 bases of 2311060300 (1.737%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/speTri2
    twoBitMask speTri2.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed speTri2.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa speTri2.2bit stdout | faSize stdin > faSize.speTri2.2bit.txt
    cat faSize.speTri2.2bit.txt
# 2478393770 bases (167333470 N's 2311060300 real 1472783000 upper
#	838277300 lower) in 12483 sequences in 1 files
# Total size: mean 198541.5 sd 1542819.5 min 1000 (AGTP01153276)
#	max 58279134 (JH393279) median 3276
# %33.82 masked total, %36.27 masked real

    rm /gbdb/speTri2/speTri2.2bit
    ln -s `pwd`/speTri2.2bit /gbdb/speTri2/speTri2.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-03-14 - Hiram)
    mkdir /hive/data/genomes/speTri2/bed/gap
    cd /hive/data/genomes/speTri2/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../speTri2.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m20.648s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits speTri2 -not gap -bed=notGap.bed
    #	real    0m14.265s
    #	2311060300 bases of 2311060300 (100.000%) in intersection
    time featureBits speTri2 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	0 bases of 2311060300 (0.000%) in intersection

    # no new gaps here, nothing to do for this.

    hgsql -N -e "select bridge from gap;" speTri2 | sort | uniq -c
    #	141005 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-03-14 - Hiram)
    mkdir /hive/data/genomes/speTri2/bed/windowMasker
    cd /hive/data/genomes/speTri2/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev speTri2 > do.log 2>&1 &
    #	real    166m6.383s

    # Masking statistics
    twoBitToFa speTri2.wmsk.2bit stdout | faSize stdin
    #	2478393770 bases (167333470 N's 2311060300 real 1631445412 upper
    #	679614888 lower) in 12483 sequences in 1 files
    #	Total size: mean 198541.5 sd 1542819.5 min 1000 (AGTP01153276)
    #	max 58279134 (JH393279) median 3276
    #	%27.42 masked total, %29.41 masked real

    twoBitToFa speTri2.wmsk.sdust.2bit stdout | faSize stdin
# 2478393770 bases (167333470 N's 2311060300 real 1619347498 upper
#	691712802 lower) in 12483 sequences in 1 files
# Total size: mean 198541.5 sd 1542819.5 min 1000 (AGTP01153276)
#	max 58279134 (JH393279) median 3276
# %27.91 masked total, %29.93 masked real

    hgLoadBed speTri2 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 14098318 elements of size 3

    time featureBits -countGaps speTri2 windowmaskerSdust
    #	859046272 bases of 2478393770 (34.661%) in intersection
    #	real    1m16.823s

    #	eliminate the gaps from the masking
    time featureBits speTri2 -not gap -bed=notGap.bed
    #	2311060300 bases of 2311060300 (100.000%) in intersection
    #	real    0m17.884s

    time nice -n +19 featureBits speTri2 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	691712802 bases of 2311060300 (29.931%) in intersection
    #	real    6m49.209s

    hgLoadBed speTri2 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 14116066 elements of size 4

    featureBits -countGaps speTri2 windowmaskerSdust
    #	691712802 bases of 2478393770 (27.910%) in intersection

    #	DO NOT NEED TO mask the sequence with this clean mask
    #	The RepeatMasker did a good job, using that masked sequence.
#    zcat cleanWMask.bed.gz \
#	| twoBitMask ../../speTri2.unmasked.2bit stdin \
#	    -type=.bed speTri2.cleanWMSdust.2bit
#    twoBitToFa speTri2.cleanWMSdust.2bit stdout | faSize stdin \
#        > speTri2.cleanWMSdust.faSize.txt
#    cat speTri2.cleanWMSdust.faSize.txt
    #	824327835 bases (216289280 N's 608038555 real 418637635 upper
    #	189400920 lower) in 427428 sequences in 1 files
    #	%22.98 masked total, %31.15 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps speTri2 rmsk windowmaskerSdust
    #	404169457 bases of 2478393770 (16.308%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-03-14 - Hiram)
    # Do not need to do this since Repeat Masker was used

########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/speTri2/bed/cpgIslands
    cd /hive/data/genomes/speTri2/bed/cpgIslands
    time doCpgIslands.pl speTri2 > do.log 2>&1
    #   real    61m47.841s

    cat fb.speTri2.cpgIslandExt.txt
    #   14580847 bases of 2311060300 (0.631%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/speTri2/bed/genscan
    cd /hive/data/genomes/speTri2/bed/genscan
    time doGenscan.pl speTri2 > do.log 2>&1
    #   real    62m2.334s

    cat fb.speTri2.genscan.txt
    #   55266527 bases of 2311060300 (2.391%) in intersection
    cat fb.speTri2.genscanSubopt.txt
    #   50135714 bases of 2311060300 (2.169%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2311060300 / 2897316137 \) \* 1024
    #	( 2311060300 / 2897316137 ) * 1024 = 816.799284

    # round up to 850

    cd /hive/data/genomes/speTri2
    time blat speTri2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/speTri2.11.ooc -repMatch=850
    #   Wrote 23341 overused 11-mers to jkStuff/speTri2.11.ooc
    #   real    0m49.211s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" speTri2 | sort | uniq -c
    #   141005 yes
#    cd /hive/data/genomes/speTri2/jkStuff
#    gapToLift speTri2 speTri2.nonBridged.lift -bedFile=speTri2.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' speTri2.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Ictidomys tridecemlineatus	43	5258	0
# Spermophilus tridecemlineatus - not in the list, named is as above
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add speTri2
# speTri2 (13-lined ground squirrel)
speTri2.serverGenome = /hive/data/genomes/speTri2/speTri2.2bit
speTri2.clusterGenome = /hive/data/genomes/speTri2/speTri2.2bit
speTri2.ooc = /hive/data/genomes/speTri2/jkStuff/speTri2.11.ooc
speTri2.lift = no
speTri2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
speTri2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
speTri2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
speTri2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
speTri2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
speTri2.refseq.mrna.native.load = no
speTri2.refseq.mrna.xeno.load = yes
speTri2.genbank.mrna.xeno.load = yes
speTri2.genbank.est.native.load = yes
speTri2.downloadDir = speTri2
speTri2.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding speTri2 13-lined ground squirrel" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for speTriNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S speTri2           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial speTri2 &
    #   var/build/logs/2012.06.08-09:59:03.speTri2.initalign.log
    #   real    1706m23.669s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad speTri2 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.11-11:53:53.dbload.log
    #   real    36m2.148s

    # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add speTri2 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added speTri2." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-26 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="JH393412:896020-902549" where name="speTri2";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/speTri2/pushQ
    cd /hive/data/genomes/speTri2/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl speTri2 2> stderr.txt | grep -v transMap > speTri2.sql
    #   real    3m56.966s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/speTri2/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/speTri2/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/speTri2/bbi/quality.bw
# WARNING: speTri2 does not have seq
# WARNING: speTri2 does not have extFile
# WARNING: speTri2 does not have estOrientInfo

    scp -p speTri2.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/speTri2.sql"

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/speTri2/bed/ucscToINSDC
    cd /hive/data/genomes/speTri2/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | hgLoadSqlTab speTri2 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords speTri2 ucscToINSDC
    featureBits -countGaps speTri2 ucscToINSDC
    #  2478393770 bases of 2478393770 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
