# for emacs: -*- mode: sh; -*-

# Echinops telfairi 2.0 sequence:
# ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/
#	Anolis_carolinensis/

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAWZ00

##########################################################################
# Download sequence (DONE - 2012-08-27 - Hiram)
    mkdir -p /hive/data/genomes/echTel2/genbank
    cd /hive/data/genomes/echTel2/genbank

    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Echinops_telfairi/EchTel2.0/
./ > fetch.log 2>&1

###########################################################################
# fixup to UCSC names (DONE - 2013-03-25 - Hiram)
    cd /hive/data/genomes/echTel2

    $HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl
    # constructs the directory: /hive/data/genomes/echTel2/ucsc
    # with files:
    cd /hive/data/genomes/echTel2/ucsc
    ls -ogrt
# -rwxrwxr-x 1       355 May 29 15:21 fetchChrM.sh
# -rw-rw-r-- 1     16866 May 29 15:21 NC_002631.fa
# -rw-rw-r-- 1     16793 May 29 15:21 chrM.fa
# -rw-rw-r-- 1        37 May 29 15:21 chrM.agp
# -rw-rw-r-- 1  32452071 May 29 15:24 echTel2.ucsc.agp
# -rw-rw-r-- 1 810337259 May 29 15:38 echTel2.ucsc.fa.gz

    # NOTE: the chrM sequence was manually added to the fa.gz and .agp file
    # see the fetchChrM.sh script there

    # this script also constructs the echTel2.unmasked.2bit file, but
    # this is not needed with the makeGenomeDb.pl script:
    rm -f /hive/data/genomes/echTel2/echTel2.unmasked.2bit

###########################################################################
# Initial genome build (DONE - 2013-05-28 - Hiram)
    cd /hive/data/genomes/echTel2

    cat << '_EOF_' > echTel2.config.ra
# Config parameters for makeGenomeDb.pl:
db echTel2
clade mammal
scientificName Echinops telfairi
commonName Tenrec
assemblyDate Nov. 2012
assemblyLabel Broad/EchTel2.0
assemblyShortLabel Broad/EchTel2.0
orderKey 3369
mitoAcc none
fastaFiles /hive/data/genomes/echTel2/ucsc/echTel2.ucsc.fa.gz
agpFiles /hive/data/genomes/echTel2/ucsc/echTel2.ucsc.agp
# qualFiles /hive/data/genomes/echTel2/broad/scaffolds.qac
dbDbSpeciesDir tenrec
photoCreditURL http://www.hobbygarten.com/
photoCreditName Kathrin Holzer, all rights reserved
ncbiGenomeId 234
ncbiAssemblyId 500848
ncbiAssemblyName EchTel2.0
ncbiBioProject 12590
genBankAccessionID GCA_000313985.1
taxId 9371
'_EOF_'

    # run step wise to confirm sequence and AGP files match each other
    time makeGenomeDb.pl -stop=agp echTel2.config.ra > genomeDb.agp.log 2>&1
    # real    4m15.127s

    # verify it is OK:
    tail -1 genomeDb.agp.log
    # *** All done!  (through the 'agp' step)

    time nice -n +19 makeGenomeDb.pl -fileServer=hgwdev \
	-workhorse=hgwdev -continue=db echTel2.config.ra \
        > genomeDb.db.log 2>&1

    # add the trackDb business to the source tree

##########################################################################
# running repeat masker (DONE - 2013-05-29 - Hiram)
    mkdir /hive/data/genomes/echTel2/bed/repeatMasker
    cd /hive/data/genomes/echTel2/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek echTel2 > do.log 2>&1 &
    #   real    390m14.722s
    cat faSize.rmsk.txt
# 2947024286 bases (341827925 N's 2605196361 real 1869498320 upper
#       735698041 lower) in 8402 sequences in 1 files
# Total size: mean 350752.7 sd 4055776.1 min 1000 (AAIY02277568) max
#       111700974 (JH980293) median 2033
# %24.96 masked total, %28.24 masked real

    grep -i versi do.log
# RepeatMasker version open-4.0.0
#    January 10 2013 (open-4-0-0) version of RepeatMasker

##########################################################################
# running simple repeat (DONE - 2013-05-29 - Hiram)
    mkdir /hive/data/genomes/echTel2/bed/simpleRepeat
    cd /hive/data/genomes/echTel2/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	echTel2 > do.log 2>&1 &
    # real    17m17.690s

    #	real    198m33.953s
    cat fb.simpleRepeat
    #	25511877 bases of 2605196361 (0.979%) in intersection

    # using RMSK and TRF since RMSK is enough masking even though
    # WM is pretty good
    cd /hive/data/genomes/echTel2
    twoBitMask echTel2.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed echTel2.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa echTel2.2bit stdout | faSize stdin > faSize.echTel2.2bit.txt
    cat faSize.echTel2.2bit.txt
# 2947024286 bases (341827925 N's 2605196361 real 1868909316 upper
#    736287045 lower) in 8402 sequences in 1 files
# Total size: mean 350752.7 sd 4055776.1 min 1000 (AAIY02277568)
#    max 111700974 (JH980293) median 2033
# %24.98 masked total, %28.26 masked real

    rm /gbdb/echTel2/echTel2.2bit
    ln -s `pwd`/echTel2.2bit /gbdb/echTel2/echTel2.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2013-05-29 - Hiram)
    mkdir /hive/data/genomes/echTel2/bed/gap
    cd /hive/data/genomes/echTel2/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../echTel2.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m40.949s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits echTel2 -not gap -bed=notGap.bed
    # 2605196361 bases of 2605196361 (100.000%) in intersection
    # real    0m24.729s

    time featureBits echTel2 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 0 bases of 2605196361 (0.000%) in intersection
    # real    13m21.473s

#########################################################################
# cytoBandIdeo - (DONE - 2013-06-12 - Hiram)
    mkdir /hive/data/genomes/echTel2/bed/cytoBand
    cd /hive/data/genomes/echTel2/bed/cytoBand
    makeCytoBandIdeo.csh echTel2

##########################################################################
## WINDOWMASKER (DONE - 2013-05-30 - Hiram)
    mkdir /hive/data/genomes/echTel2/bed/windowMasker
    cd /hive/data/genomes/echTel2/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev echTel2 > do.log 2>&1 &
    #   real    301m9.805s

    cat faSize.echTel2.cleanWMSdust.txt
# 2947024286 bases (341827925 N's 2605196361 real 1655356294 upper
#    949840067 lower) in 8402 sequences in 1 files
# Total size: mean 350752.7 sd 4055776.1 min 1000 (AAIY02277568) max
#    111700974 (JH980293) median 2033
# %32.23 masked total, %36.46 masked real

    #  This is pretty good for WM, but RMSK isn't that bad either,
    #  so, using the RMSK result to mask the genome

    featureBits -countGaps echTel2 rmsk windowmaskerSdust  > fb.echTel2.rmsk.windowmaskerSdust.txt 2>&1
    cat fb.echTel2.rmsk.windowmaskerSdust.txt
#  416158057 bases of 2947024286 (14.121%) in intersection

########################################################################
# cpgIslands - (DONE - 2013-06-12 - Hiram)
    mkdir /hive/data/genomes/echTel2/bed/cpgIslands
    cd /hive/data/genomes/echTel2/bed/cpgIslands
    time doCpgIslands.pl echTel2 > do.log 2>&1
    #  real    8m6.460s

    cat fb.echTel2.cpgIslandExt.txt
    #   18449467 bases of 2605196361 (0.708%) in intersection

#########################################################################
# genscan - (DONE - 2013-06-12 - Hiram)
    mkdir /hive/data/genomes/echTel2/bed/genscan
    cd /hive/data/genomes/echTel2/bed/genscan
    time doGenscan.pl echTel2 > do.log 2>&1
    # real    14m37.851s
    # four jobs failed, ran on hgwdev with -window=2000000 and they finshed
    # real    27m4.758s

    # continuing:
    time doGenscan.pl -continue=makeBed echTel2 > makeBed.log 2>&1
    #  real    4m2.687s

    cat fb.echTel2.genscan.txt
    #   74304215 bases of 2605196361 (2.852%) in intersection
    cat fb.echTel2.genscanSubopt.txt
    #   67950430 bases of 2605196361 (2.608%) in intersection

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-06 - Hiram)
    # Use -repMatch=950, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2605196361 / 2897316137 \) \* 1024
    #	( 2605196361 / 2897316137 ) * 1024 = 920.755951
    # round up to 950

    # echTel1 was: -repMatch=1024
    #  Wrote 9721 overused 11-mers to /cluster/bluearc/echTel1/11.ooc

    cd /hive/data/genomes/echTel2
    blat echTel2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/echTel2.11.ooc -repMatch=950
    #	Wrote 21134 overused 11-mers to jkStuff/echTel2.11.ooc

    # there are *only* bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" echTel2 | sort | uniq -c
    #   269444 yes

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2013-06-07 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Echinops telfairi       5       0       0
    # to decide which "native" mrna or ests you want to specify in genbank.conf
    # this appears that echTel2 has plenty of native est's

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add echTel2 before echTel1 and commit to GIT
# echTel2 (tenrec)
echTel2.serverGenome = /hive/data/genomes/echTel2/echTel2.2bit
echTel2.clusterGenome = /hive/data/genomes/echTel2/echTel2.2bit
echTel2.ooc = /hive/data/genomes/echTel2/jkStuff/echTel2.11.ooc
echTel2.lift = no
echTel2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
echTel2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
echTel2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
echTel2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
echTel2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
echTel2.refseq.mrna.native.load = no
echTel2.refseq.mrna.xeno.load = yes
echTel2.genbank.mrna.xeno.load = no
echTel2.genbank.est.native.load = no
echTel2.genbank.mrna.native.load = no
echTel2.genbank.mrna.native.loadDesc = no
echTel2.downloadDir = echTel2
echTel2.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding echTel2 tenrec refs #9693" etc/genbank.conf
    git push
    make etc-update

    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # anoCar genome information, if this is a new species, need to add stuff
    # there

    ssh hgwdev			# used to do this on "genbank" machine
    screen			# long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial echTel2 &
    #	var/build/logs/2013.06.06-14:36:22.echTel2.initalign.log
    #	real    217m40.528s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad echTel2 &
    #	logFile: var/dbload/hgwdev/logs/2013.06.06-23:23:31.dbload.log
    #	real    15m12.665s

    # enable daily alignment and update of hgwdev (TBD - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add echTel2 to: etc/align.dbs etc/hgwdev.dbs
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added echTel2 to daily hgwdev build refs #9693" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
# construct liftOver from echTel1 to echTel2 (DONE - 2013-06-10 - Hiram)
    # documentation for this step is in echTel1 - remember to do this

###########################################################################
# construct downloads files (DONE - 2013-06-28 - Hiram)
#   before starting downloads, the joinerCheck should be clean
#   after echTel2 is added to all.joiner:
    joinerCheck -keys -database=echTel2 all.joiner

    cd /hive/data/genomes/echTel2
    time makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev echTel2 \
	> downloads.log 2>&1
    # real    27m35.022s
    # examine the README.txt files to verify the text

###########################################################################
# ready for first pushQ entry (DONE - 2013-06-28 - Hiram)
    mkdir /hive/data/genomes/echTel2/pushQ
    cd /hive/data/genomes/echTel2/pushQ
    time makePushQSql.pl echTel2 > echTel2.sql 2> stderr.out
    #  real    3m58.285s

    # some errors are legitimate and OK:
     head stderr.out
# WARNING: hgwdev does not have /gbdb/echTel2/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/echTel2/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/echTel2/bbi/quality.bw
# WARNING: echTel2 does not have seq
# WARNING: echTel2 does not have extFile
    scp -p echTel2.sql hgwbeta:/tmp
    ssh hgwbeta 'hgsql qapushq < /tmp/echTel2.sql'
    # look at the pushQ and verify all the files can be seen

###########################################################################
# lastz alignment with Human/hg19 (DONE - 2013-06-26 - Hiram)
    #	the original alignment
    cd /hive/data/genomes/hg19/bed/lastzEchTel2.2013-06-12
    cat fb.hg19.chainEchTel2Link.txt
    #	873117393 bases of 2897316137 (30.135%) in intersection

    #	running this swap
    mkdir /hive/data/genomes/echTel2/bed/blastz.hg19.swap
    cd /hive/data/genomes/echTel2/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzEchTel2.2011-04-19/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet -swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    20m45.683s
    cat fb.echTel2.chainHg19Link.txt
    #	852830619 bases of 2605196361 (32.736%) in intersection

############################################################################
# blat servers (DONE - 2013-06-10 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("echTel2", "blat4a", "17846", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("echTel2", "blat4a", "17847", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
## Default position set at RHO protein found with blat
##    (DONE - 2013-06-10 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="JH980304:37472293-37479623"
	where name="echTel2";' hgcentraltest

#########################################################################
#  lastz swap mouse/mm10 (DONE - 2013-06-12 - Hiram)
    # original alignment to human:
    cd /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12
    cat fb.mm10.chainEchTel2Link.txt
    #   384570981 bases of 2652783500 (14.497%) in intersection

    # and, for this swap
    mkdir /hive/data/genomes/echTel2/bed/blastz.mm10.swap
    cd /hive/data/genomes/echTel2/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12/DEF \
        -swap -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    43m0.194s
    cat fb.echTel2.chainMm10Link.txt
    #   380872172 bases of 2605196361 (14.620%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/echTel2/bed
    ln -s blastz.mm10.swap lastz.mm10

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2013-08-06 - Hiram)
    hgsql -N -e "select frag from gold;" echTel2 | sort | head -1
AAIY02000001.1
    hgsql -N -e "select frag from gold;" echTel2 | sort | tail -2
AAIY02277845.1
NC_002631

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" echTel2 | wc -l
    # 277846

    hgsql -N -e "select frag from gold;" echTel2 | egrep -e '[AN][AC][I_][Y0]0[0-9]+(\.1)?' | wc -l
    # 277846

    hgsql -N -e "select frag from gold;" echTel2 | egrep -v -e '[AN][AC][I_][Y0]0[0-9]+(\.1)?' | wc -l
    # 0

    # hence, add to trackDb/tenrec/echTel2/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AN][AC][I_][Y0]0[0-9]+(\.1)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

#########################################################################
# CPG Islands Unmasked track

    mkdir /hive/data/genomes/echTel2/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/echTel2/bed/cpgIslandsUnmasked
    time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
     -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \
        -workhorse=hgwdev \
          -maskedSeq=/hive/data/genomes/echTel2/echTel2.unmasked.2bit \
            echTel2 > do.log 2>&1
    #  Elapsed time: 5m38s

    cat fb.echTel2.cpgIslandExtUnmasked.txt
    # 20726777 bases of 2605196361 (0.796%) in intersection

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
