# for emacs: -*- mode: sh; -*-

# This file describes how the browser for C. angaria WS245 version is built

##############################################################################
# download sequence, create UCSC sequence (DONE - 2015-06-24 - Hiram)
    mkdir -p /hive/data/genomes/priPac3/ws245
    cd /hive/data/genomes/priPac3/ws245

    wget --no-parent --timestamping -m -nH --cut-dirs=6 \
ftp://ftp.sanger.ac.uk/pub/wormbase/releases/WS245/species/PRJNA12644

    mkdir /hive/data/genomes/priPac3/ucsc
    cd /hive/data/genomes/priPac3/ucsc
    # WormBase contig names are of the pattern:
# >Ppa_Contig0
# >Ppa_Contig1
# >Ppa_Contig2
#   ...

    # no conversion of names to maintain equivalence with WormBase:
    zcat ../ws245/PRJNA12644/p_pacificus.PRJNA12644.WS245.genomic.fa.gz \
        | gzip -c > priPac3.fa.gz

    hgFakeAgp priPac3.fa.gz priPac3.agp

#############################################################################
#  Initial database build (DONE - 2015-06-26 - Hiram)

    cd /hive/data/genomes/priPac3
    cat << '_EOF_' > priPac3.config.ra
# Config parameters for makeGenomeDb.pl:
db priPac3
clade worm
# genomeCladePriority 70
scientificName Pristionchus pacificus
commonName P. pacificus
assemblyDate Aug. 2014
assemblyShortLabel P_pacificus-v2
assemblyLabel Max Planck Institute for Developmental Biology P. pacificus genome project
orderKey 16013
mitoAcc JF414117.1
fastaFiles /hive/data/genomes/priPac3/ucsc/priPac3.fa.gz
agpFiles /hive/data/genomes/priPac3/ucsc/priPac3.agp
# qualFiles none
dbDbSpeciesDir worm
photoCreditURL http://www.eb.tuebingen.mpg.de/departments/4-evolutionary-biology/department-4-evolutionary-biology
photoCreditName Scanning electron micrograph courtesy of Jrgen Berger, and Ralf J. Sommer, Max Planck Institute for Developmental Biology, All Rights Reserved
ncbiGenomeId 246
ncbiAssemblyId 320251
ncbiAssemblyName WS221
ncbiBioProject 12644
genBankAccessionID GCA_000180635.2
taxId 54126
'_EOF_'
    # << happy emacs

    # verify sequence and AGP are OK:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp priPac3.config.ra) > agp.log 2>&1
    # *** All done!  (through the 'agp' step)
    # real    0m33.651s

    # then finish it off:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db priPac3.config.ra) > db.log 2>&1
    # real    1m46.091s

    # check in the trackDb files created and add to trackDb/makefile

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2015-07-01 - Hiram)
    mkdir /hive/data/genomes/priPac3/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/priPac3/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/priPac3/priPac3.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku priPac3) > do.log 2>&1
    # real    12m23.178s

    cat fb.priPac3.cpgIslandExtUnmasked.txt
    # 9210988 bases of 153238914 (6.011%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2015-07-01 - Hiram)
    mkdir /hive/data/genomes/priPac3/bed/cytoBand
    cd /hive/data/genomes/priPac3/bed/cytoBand
    makeCytoBandIdeo.csh priPac3

#########################################################################
# ucscToINSDC table/track (TBD - 2015-03-20 - Hiram)
    mkdir /hive/data/genomes/priPac3/bed/ucscToINSDC
    cd /hive/data/genomes/priPac3/bed/ucscToINSDC

    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
       ../../genbank/GCA_*assembly_structure/Primary_Assembly

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed
    # verify all names are coming through, should be same line count:
    wc -l *
    # 25187 name.coordinate.tab
    # 25187 ucscToINSDC.bed
    # 25187 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 14
    # use the 14 in this sed
    sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab priPac3 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords priPac3
    # should cover %100 entirely:
    featureBits -countGaps priPac3 ucscToINSDC
    # 2053849526 bases of 2053849526 (100.000%) in intersection

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2015-06-01 - Hiram)
    hgsql -N -e "select frag from gold;" priPac3 | sort | head -3
JF414117.1
Ppa_Contig0_1
Ppa_Contig0_10
[JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)?

    hgsql -N -e "select frag from gold;" priPac3 | sort | tail -2
Ppa_Contig9_98
Ppa_Contig9_99

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" priPac3 | wc -l
    # 28759

    hgsql -N -e "select frag from gold;" priPac3 \
      | egrep -e '[JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)?' \
        | wc -l
    # 28759

  hgsql -N -e "select frag from gold;" priPac3 \
    | egrep -v -e '[JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)?' \
       | wc -l
    # 0

    # hence, add to trackDb/worm/priPac3/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [JP][Fp][a4][1_][C4][o1][n1][t7][i\.](g[0-9]*)?(\_[0-9]*)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

##########################################################################
# running repeat masker (DONE - 2015-07-01 - Hiram)
    mkdir /hive/data/genomes/priPac3/bed/repeatMasker
    cd /hive/data/genomes/priPac3/bed/repeatMasker
    time (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku priPac3) > do.log 2>&1
    # real    189m11.418s

    cat faSize.rmsk.txt
    # 172510819 bases (19302620 N's 153208199 real 145490438 upper
    #    7717761 lower) in 18084 sequences in 1 files
    # Total size: mean 9539.4 sd 122670.4 min 47 (Ppa_Contig13657)
    #    max 5268024 (Ppa_Contig0) median 685
    # %4.47 masked total, %5.04 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    # CC   RELEASE 20140131;

    time featureBits -countGaps priPac3 rmsk
    # 7720275 bases of 172510819 (4.475%) in intersection
    #  real    0m6.103s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2015-07-01 - Hiram)

    mkdir /hive/data/genomes/priPac3/bed/simpleRepeat
    cd /hive/data/genomes/priPac3/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        priPac3) > do.log 2>&1
    # real    10m16.188s

    cat fb.simpleRepeat
    # 3902648 bases of 153238914 (2.547%) in intersection

    # using the Window Masker result as indicated below

##########################################################################
# CREATE MICROSAT TRACK (DONE - 2015-07-01 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/priPac3/bed/microsat
     cd /cluster/data/priPac3/bed/microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
	../simpleRepeat/simpleRepeat.bed > microsat.bed
    hgLoadBed priPac3 microsat microsat.bed
    #	Read 765 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2015-07-01 - Hiram)

    mkdir /hive/data/genomes/priPac3/bed/windowMasker
    cd /hive/data/genomes/priPac3/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev priPac3) > do.log 2>&1
    # real    13m19.730s

    # Masking statistics
    cat faSize.priPac3.cleanWMSdust.txt
    # 172510819 bases (19302620 N's 153208199 real 116592215 upper
    #    36615984 lower) in 18084 sequences in 1 files
    # Total size: mean 9539.4 sd 122670.4 min 47 (Ppa_Contig13657)
    #    max 5268024 (Ppa_Contig0) median 685
    # %21.23 masked total, %23.90 masked real

    cat fb.priPac3.rmsk.windowmaskerSdust.txt
    # 5980404 bases of 172510819 (3.467%) in intersection

    # using this Window Masker result for final masking::
    cd /hive/data/genomes/priPac3
    #   you can safely ignore the warning about fields >= 13
    twoBitMask bed/windowMasker/priPac3.cleanWMSdust.2bit \
       -add bed/simpleRepeat/trfMask.bed  priPac3.2bit

    # measure the final masking:
    twoBitToFa priPac3.2bit stdout | faSize stdin > faSize.priPac3.2bit.txt
    cat faSize.priPac3.2bit.txt
    # 172510819 bases (19302620 N's 153208199 real 116540993 upper
    #    36667206 lower) in 18084 sequences in 1 files
    # Total size: mean 9539.4 sd 122670.4 min 47 (Ppa_Contig13657)
    #    max 5268024 (Ppa_Contig0) median 685
    # %21.26 masked total, %23.93 masked real

    # and reset the symlink
    rm /gbdb/priPac3/priPac3.2bit
    ln -s /hive/data/genomes/priPac3/priPac3.2bit /gbdb/priPac3/priPac3.2bit

##########################################################################
# cpgIslands - (DONE - 2015-07-01 - Hiram)
    mkdir /hive/data/genomes/priPac3/bed/cpgIslands
    cd /hive/data/genomes/priPac3/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku priPac3) > do.log 2>&1 &
    # real    21m15.732s

    cat fb.priPac3.cpgIslandExt.txt
    # 2971735 bases of 153238914 (1.939%) in intersection

#########################################################################
# augustus - (DONE - 2015-07-01 - Hiram)
    mkdir /hive/data/genomes/priPac3/bed/augustus
    cd /hive/data/genomes/priPac3/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
      -species=caenorhabditis -dbHost=hgwdev \
         -workhorse=hgwdev priPac3) > do.log 2>&1
    # real    91m20.674s

    cat fb.priPac3.augustusGene.txt
    # 30647862 bases of 153238914 (20.000%) in intersection

#########################################################################
# genscan - (DONE - 2015-07-01 - Hiram)
    mkdir /hive/data/genomes/priPac3/bed/genscan
    cd /hive/data/genomes/priPac3/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku priPac3) > do.log 2>&1
    # real    15m3.155s

    cat fb.priPac3.genscan.txt
    #   10119755 bases of 153238914 (6.604%) in intersection

    cat fb.priPac3.genscanSubopt.txt
    #   6198002 bases of 153238914 (4.045%) in intersection

########################################################################
# Create kluster run files (TBD - 2015-03-24 - Hiram)

    cd /hive/data/genomes/priPac3
    # numerator is priPac3 gapless bases "real" as reported by:
    head -1 faSize.priPac3.2bit.txt
# 172510819 bases (19302620 N's 153208199 real 116540993 upper 36667206 lower)
# in 18084 sequences in 1 files
    # numerator is 'real' base count
    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 153208199 / 2861349177 \) \* 1024
    # ( 153208199 / 2861349177 ) * 1024 = 54.829098

    # ==> use -repMatch=50 according to size scaled down from 1024 for human.
    #   and rounded down to nearest 50
    cd /hive/data/genomes/priPac3
    time blat priPac3.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/priPac3.11.ooc \
        -repMatch=50
    # Wrote 23520 overused 11-mers to jkStuff/priPac3.11.ooc
    #  real    0m2.869s

    # there are a few non-bridged gaps
    #   check non-bridged gaps to see what the typical size is:
    hgsql -N -e 'select * from gap where bridge="no" order by size;' priPac3 \
       | ave -tableOut -col=7 stdin
# min Q1 median Q3 max mean N sum stddev
# 61704 80172.5 105503 114920 116040 93010.2 8 744082 22910.2

    # note the minimum non-bridged gap size is 61,704

    gapToLift -verbose=2 -minGap=50000 priPac3 jkStuff/priPac3.nonBridged.lft \
         -bedFile=jkStuff/priPac3.nonBridged.bed

    # survey sizes:
    n50.pl chrom.sizes
#       reading: chrom.sizes
#       contig count: 18084, total size: 172510819, one half size: 86255409
# cumulative    N50 count       contig  contig size
# 85581688        38      Ppa_Contig38    1290309
# 86255409 one half size
# 86826222        39      Ppa_Contig47    1244534

#############################################################################
# GENBANK AUTO UPDATE (TBD - 2015-06-09 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism               mrnaCnt estCnt  refSeqCnt
    #  Pristionchus pacificus  97      37470   0

    # edit etc/genbank.conf to add priPac3 just before priPac1

# priPac3 (P. pacificus)
priPac3.serverGenome = /hive/data/genomes/priPac3/priPac3.2bit
priPac3.clusterGenome = /hive/data/genomes/priPac3/priPac3.2bit
priPac3.ooc = /hive/data/genomes/priPac3/jkStuff/priPac3.11.ooc
priPac3.lift = /hive/data/genomes/priPac3/jkStuff/priPac3.nonBridged.lft
priPac3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
priPac3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
priPac3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
priPac3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
priPac3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
priPac3.refseq.mrna.native.load = yes
priPac3.refseq.mrna.xeno.load  = yes
priPac3.refseq.mrna.xeno.loadDesc = yes
# DO NOT NEED genbank.mrna.xeno except for human, mouse
priPac3.genbank.mrna.xeno.load = no
priPac3.genbank.est.native.load = yes
priPac3.genbank.est.native.loadDesc = no
priPac3.downloadDir = priPac3
priPac3.perChromTables = no

    git commit -m "Added priPac3 - P. pacificus refs #15209" etc/genbank.conf
    git push
    # update /cluster/data/genbank/etc/:
    make etc-update

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial priPac3
    # logFile: var/build/logs/2015.07.02-11:39:01.priPac3.initalign.log
    # real    91m20.362s

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.priPac3

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad priPac3
    # logFile: var/dbload/hgwdev/logs/2015.07.06-09:51:31.priPac3.dbload.log
    # real    18m34.687s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add priPac3 to:
    #   vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added priPac3 - Pristionchus pacificus refs #15209" \
      etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# all.joiner update, downloads and in pushQ - (TBD - 2015-06-22 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=priPac3 -tableCoverage all.joiner
    joinerCheck -database=priPac3 -times all.joiner
    joinerCheck -database=priPac3 -keys all.joiner

    cd /hive/data/genomes/priPac3
    time makeDownloads.pl priPac3 > downloads.log 2>&1
    # real    13m42.027s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/priPac3/pushQ
    cd /hive/data/genomes/priPac3/pushQ
    makePushQSql.pl priPac3 > priPac3.pushQ.sql 2> stderr.out
    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/priPac3/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/priPac3/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/priPac3/bbi/qualityBw/quality.bw
    # WARNING: priPac3 does not have seq
    # WARNING: priPac3 does not have extFile
    # WARNING: priPac3 does not have estOrientInfo
    # WARNING: priPac3 does not have mrnaOrientInfo

    #   copy it to hgwbeta
    scp -p priPac3.pushQ.sql qateam@hgwbeta:/tmp
    ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/priPac3.pushQ.sql"

    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
# LIFTOVER TO priPac1 (DONE - 2015-07-07 - Hiram )
    mkdir /hive/data/genomes/priPac3/bed/blat.priPac1.2015-07-07
    cd /hive/data/genomes/priPac3/bed/blat.priPac1.2015-07-07
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl \
	-buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
  -ooc=/hive/data/genomes/priPac3/jkStuff/priPac3.11.ooc -debug priPac3 priPac1
    # Real run:
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
       -ooc=/hive/data/genomes/priPac3/jkStuff/priPac3.11.ooc priPac3 priPac1) \
		> do.log 2>&1
    #	real    14m59.119s
    # verify it works on genome-test

#############################################################################
# LIFTOVER TO priPac2 (DONE - 2015-07-07 - Hiram )
    mkdir /hive/data/genomes/priPac3/bed/blat.priPac2.2015-07-07
    cd /hive/data/genomes/priPac3/bed/blat.priPac2.2015-07-07
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl \
	-buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
  -ooc=/hive/data/genomes/priPac3/jkStuff/priPac3.11.ooc -debug priPac3 priPac2
    # Real run:
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
       -ooc=/hive/data/genomes/priPac3/jkStuff/priPac3.11.ooc priPac3 priPac2) \
		> do.log 2>&1
    #	real    6m45.278s
    # verify it works on genome-test

#############################################################################
