# for emacs: -*- mode: sh; -*-

# This file describes browser build for the danRer10

# Assembly Name:  GRCz10
# Description:    Genome Reference Consortium Zebrafish Build 10
# Organism name:  Danio rerio
# Taxid:          7955
# Submitter:      Genome Reference Consortium
# Date:           2014-9-2
# BioSample:      SAMN03020626
# Assembly type:  haploid
# Release type:   major
# Assembly level: Chromosome
# Genome representation: full
# GenBank Assembly Accession: GCA_000002035.3 (latest)
# RefSeq Assembly Accession: GCF_000002035.5 (species-representative latest)
# RefSeq Assembly and GenBank Assemblies Identical: no
#
## Assembly-Units:
## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
## GCA_000000175.3      GCF_000000175.4 Primary Assembly
##      GCF_000002055.1 non-nuclear

#############################################################################
# fetch sequence from new style download directory (DONE - 2015-01-22 - Hiram)
    # now using the new NCBI naming scheme hierarchy
    mkdir -p /hive/data/genomes/danRer10/genbank
    cd /hive/data/genomes/danRer10/genbank

    time rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_other/Danio_rerio/all_assembly_versions/GCA_000002035.3_GRCz10/ ./
    # real    1m18.054s

    # measure what we have here:
    faSize GCA_000002035.3_GRCz10_genomic.fna.gz

    # 1371702787 bases (2087465 N's 1369615322 real 696150664 upper
    #   673464658 lower) in 1060 sequences in 1 files
    # Total size: mean 1294059.2 sd 8275856.7 min 650 (KN150525.1)
    #   max 76625712 (CM002888.1) median 18168
    # %49.10 masked total, %49.17 masked real

    # note that these totals do not include chrM since the GenBank ftp directory
    # did not include a non-nuclear directory

    # same measurement on the individual files:

    faSize GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz \
      GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/*.fna.gz
    # 1371702787 bases (2087465 N's 1369615322 real 1369615322 upper
    #    0 lower) in 1060 sequences in 26 files
    # Total size: mean 1294059.2 sd 8275856.7 min 650 (KN150525.1)
    #    max 76625712 (CM002888.1) median 18168
    # %0.00 masked total, %0.00 masked real

#############################################################################
# fixup to UCSC naming scheme (DONE - 2015-01-22 - Hiram)
    mkdir /hive/data/genomes/danRer10/ucsc
    cd /hive/data/genomes/danRer10/ucsc

    time ~/kent/src/hg/makeDb/doc/danRer10/ucscCompositeAgp.pl \
        ../genbank/GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly

    # this unplaced.pl script is generic enough at this point, it should
    # go into utils/automation/
    # It is given the same Primary_Assembly directory of the danRer10
    # GenBank download as the ucscCompositeAgp.pl step.
    time ~/kent/src/hg/makeDb/doc/danRer10/unplaced.pl \
        ../genbank/GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly

    # verify we haven't lost anything:
    faSize chr*.fa
    # 1371702787 bases (2087465 N's 1369615322 real 1369615322 upper 0 lower)
    #    in 1060 sequences in 26 files
    # Total size: mean 1294059.2 sd 8275856.7 min 650 (chrUn_KN150525v1)
    #    max 76625712 (chr4) median 18168
    # %0.00 masked total, %0.00 masked real

    # same numbers as above.

    time gzip chr*.fa chr*.agp
    #  real    7m27.555s

#############################################################################
#  Initial database build (DONE - 2014-11-20 - Hiram)

    cd /hive/data/genomes/danRer10
    # Write the makeGenomeDb.pl configuration file.  The here-doc delimiter
    # is quoted, so nothing inside the body is expanded by the shell.
    # assemblyShortLabel uses the assembly's own name, GRCz10, to agree with
    # ncbiAssemblyName and the assembly description at the top of this file.
    cat << '_EOF_' > danRer10.config.ra
# Config parameters for makeGenomeDb.pl:
db danRer10
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Sep. 2014
assemblyLabel Genome Reference Consortium Zebrafish Build 10
assemblyShortLabel GRCz10
orderKey 26173
fastaFiles /hive/data/genomes/danRer10/ucsc/chr*.fa.gz
agpFiles /hive/data/genomes/danRer10/ucsc/chr*.agp.gz
# qualFiles none
dbDbSpeciesDir zebrafish
mitoAcc NC_002333.2
photoCreditURL http://www.genome.gov/dmd/img.cfm?node=Photos/Animals/Zebrafish&id=79130
photoCreditName NHGRI Press Photos
ncbiGenomeId 50
ncbiAssemblyId 210611
ncbiAssemblyName GRCz10
ncbiBioProject 13922
genBankAccessionID GCA_000002035.3
taxId 7955
'_EOF_'
    # << happy emacs

    # verify sequence and AGP are OK:
    time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp danRer10.config.ra > agp.log 2>&1
    # real    1m21.490s

    # then finish it off:
    time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db danRer10.config.ra > db.log 2>&1
    # real    11m8.084s

    # check in the trackDb files created here

##########################################################################
# running repeat masker (DONE - 2015-01-22 - Hiram)
    mkdir /hive/data/genomes/danRer10/bed/repeatMasker
    cd /hive/data/genomes/danRer10/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku danRer10) > do.log 2>&1
    # real    904m9.491s
    # had to patch ProcessRepeats.pl to get this to finish in the RM step
    # then continuing
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -continue=cat -smallClusterHub=ku danRer10) > cat.log 2>&1
    #  real    20m54.276s


    cat faSize.rmsk.txt
    # 1371719383 bases (2087465 N's 1369631918 real 654384100 upper
    #   715247818 lower) in 1061 sequences in 1 files
    # Total size: mean 1292855.2 sd 8272045.0 min 650 (chrUn_KN150525v1)
    #    max 76625712 (chr4) median 18160
    # %52.14 masked total, %52.22 masked real

    egrep -i "versi|relea" do.log
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    #  CC   RELEASE 20140131;

    time featureBits -countGaps danRer10 rmsk
    # 715370858 bases of 1371719383 (52.151%) in intersection
    # real    0m25.047s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE 2015-01-22 - Hiram)

    mkdir /hive/data/genomes/danRer10/bed/simpleRepeat
    cd /hive/data/genomes/danRer10/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        danRer10) > do.log 2>&1
    # real    48m11.853s

    cat fb.simpleRepeat
    # 96161783 bases of 1369683683 (7.021%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/danRer10
    twoBitMask danRer10.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed danRer10.2bit
    #   you can safely ignore the warning about fields >= 13

    twoBitToFa danRer10.2bit stdout | faSize stdin > faSize.danRer10.2bit.txt
    cat faSize.danRer10.2bit.txt
    # 1371719383 bases (2087465 N's 1369631918 real 653073010 upper
    #     716558908 lower) in 1061 sequences in 1 files
    # Total size: mean 1292855.2 sd 8272045.0 min 650 (chrUn_KN150525v1)
    #    max 76625712 (chr4) median 18160
    # %52.24 masked total, %52.32 masked real

    rm /gbdb/danRer10/danRer10.2bit
    ln -s `pwd`/danRer10.2bit /gbdb/danRer10/danRer10.2bit

##########################################################################
## WINDOWMASKER (DONE - 2014-11-21 - Hiram)

    mkdir /hive/data/genomes/danRer10/bed/windowMasker
    cd /hive/data/genomes/danRer10/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev danRer10) > do.log 2>&1
    # real    81m14.166s

    # Masking statistics
    cat faSize.danRer10.cleanWMSdust.txt
    # 1371719383 bases (2087465 N's 1369631918 real 682741767 upper
    #    686890151 lower) in 1061 sequences in 1 files
    # Total size: mean 1292855.2 sd 8272045.0 min 650 (chrUn_KN150525v1)
    #    max 76625712 (chr4) median 18160
    # %50.08 masked total, %50.15 masked real

    # run this after rmsk was done
    featureBits -countGaps danRer10 rmsk windowmaskerSdust \
       > fb.danRer10.rmsk.windowmaskerSdust.txt 2>&1
    cat fb.danRer10.rmsk.windowmaskerSdust.txt
    # 530239617 bases of 1371719383 (38.655%) in intersection

    # then continuing
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -continue=cleanup -dbHost=hgwdev danRer10) > cleanup.log 2>&1

##########################################################################
# cpgIslands - (DONE - 2015-01-26 - Hiram)
    mkdir /hive/data/genomes/danRer10/bed/cpgIslands
    cd /hive/data/genomes/danRer10/bed/cpgIslands
    time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku danRer10 > do.log 2>&1
    #  real    24m18.369s

    cat fb.danRer10.cpgIslandExt.txt
    # 4485867 bases of 1369683683 (0.328%) in intersection

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2015-01-22 - Hiram)
    mkdir /hive/data/genomes/danRer10/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/danRer10/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/danRer10/danRer10.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku danRer10) > do.log 2>&1
    # real    42m5.784s

    cat fb.danRer10.cpgIslandExtUnmasked.txt
    # 28557104 bases of 1369683683 (2.085%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2015-01-22 - Hiram)
    mkdir /hive/data/genomes/danRer10/bed/cytoBand
    cd /hive/data/genomes/danRer10/bed/cytoBand
    makeCytoBandIdeo.csh danRer10

#########################################################################
# genscan - (DONE - 2015-01-26 - Hiram)
    mkdir /hive/data/genomes/danRer10/bed/genscan
    cd /hive/data/genomes/danRer10/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku danRer10) > do.log 2>&1
    # real    55m31.303s

    cat fb.danRer10.genscan.txt
    # 51655591 bases of 1369683683 (3.771%) in intersection

    cat fb.danRer10.genscanSubopt.txt
    # 33381870 bases of 1369683683 (2.437%) in intersection

########################################################################
# Create kluster run files (DONE - 2015-01-26 - Hiram)

     head -1 faSize.danRer10.2bit.txt 
    # 1371719383 bases (2087465 N's 1369631918 real 653073010 upper
    #   716558908 lower) in 1061 sequences in 1 files

    # numerator is danRer10 gapless bases "real"
    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 1369631918 / 2861349177 \) \* 1024
    # ( 1369631918 / 2861349177 ) * 1024 = 490.154468

    # ==> use -repMatch=450 according to size scaled down from 1024 for human.
    #   and rounded down to nearest 50
    cd /hive/data/genomes/danRer10
    time blat danRer10.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/danRer10.11.ooc \
        -repMatch=450
    # Wrote 47978 overused 11-mers to jkStuff/danRer10.11.ooc
    # real    0m24.503s

    #   all the non-bridged gaps here are size 100
    #   check non-bridged gaps to see what the typical size is:
    hgsql -N -e 'select * from gap where bridge="no" order by size;' \
       danRer10 | ave -col=7 stdin
# Q1 100.000000
# median 100.000000
# Q3 100.000000
# average 100.000000
# min 100.000000
# max 100.000000
# count 2338
# total 233800.000000
# standard deviation 0.000000

    # for use with the genbank runs
    # The database argument is this assembly (danRer10); -minGap=100 matches
    # the uniform size of the non-bridged gaps measured above.
    gapToLift -verbose=2 -minGap=100 danRer10 jkStuff/nonBridged.lft \
        -bedFile=jkStuff/nonBridged.bed

########################################################################
# GENBANK AUTO UPDATE (DONE - 2015-01-29 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
    # Danio rerio     30176   1488365 14746

    # edit etc/genbank.conf to add danRer10 just after danRer7

# danRer10 (zebrafish)
danRer10.serverGenome = /hive/data/genomes/danRer10/danRer10.2bit
danRer10.clusterGenome = /hive/data/genomes/danRer10/danRer10.2bit
danRer10.ooc = /hive/data/genomes/danRer10/jkStuff/danRer10.11.ooc
danRer10.lift = no
danRer10.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
danRer10.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
danRer10.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
danRer10.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
danRer10.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
danRer10.genbank.mrna.xeno.load  = yes
danRer10.downloadDir = danRer10
danRer10.perChromTables = no
danRer10.refseq.mrna.xeno.load  = yes
danRer10.mgc = yes
danRer10.orfeome = yes
# danRer10.upstreamGeneTbl = ensGene
# danRer10.upstreamMaf = multiz8way
# /hive/data/genomes/danRer10/bed/multiz8way/species.lst

    git commit -m "Added danRer10; refs #14017" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

# Edit src/lib/gbGenome.c to add new species.  Skipped

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial danRer10
    # logFile: var/build/logs/2015.01.27-11:27:05.danRer10.initalign.log
    #   real    770m4.239s

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.danRer10

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad danRer10
    # logFile: var/dbload/hgwdev/logs/2015.01.28-11:07:50.danRer10.dbload.log
    # real    97m40.551s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add danRer10 to:
    #   vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added danRer10 - Zebrafish refs #14017" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram)
    hgsql -N -e "select frag from gold;" danRer10 | sort -u \
        > /tmp/danRer10.frag.gold.txt


    # length of the longest fragment name; this bounds the column scan below
    export maxLen=$(awk '{print length($0)}' /tmp/danRer10.frag.gold.txt | sort -rn | head -1)
    echo "scan to column: $maxLen"

# For every character column 1..maxLen, print the column number followed by
# the distinct characters seen in that column across all fragment names,
# squeezed onto a single line.  These sets are used to write the termRegex.
export C=1
while [ $C -le $maxLen ]
do
  printf ' %s: ' "$C"
  awk -v col="$C" '{ print substr($0, col, 1) }' /tmp/danRer10.frag.gold.txt \
    | sort -u | xargs echo | sed -e 's/ //g'
  C=$((C + 1))
done
 1: ABCFLN
 2: ACKLOPQRTUX
 3: 0123456789B_
 4: 0123456789Z
 5: 0123456789
 6: 0123456789
 7: 0123456789
 8: 0123456789
 9: .0123456789
 10: .0123456789
 11: 0123456789
 12: 0123456789
 13: .
 14: 1

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" danRer10 | wc -l
    # 34028

    hgsql -N -e "select frag from gold;" danRer10 \
       | egrep -e '[ABCFLN][ACKLOPQRTUX][B0-9_][Z0-9][0-9]+(\.[0-9]+)?' | wc -l
    # 34028

    hgsql -N -e "select frag from gold;" danRer10 \
       | egrep -v -e '[ABCFLN][ACKLOPQRTUX][B0-9_][Z0-9][0-9]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/zebrafish/danRer10/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [ABCFLN][ACKLOPQRTUX][B0-9_][Z0-9][0-9]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

#########################################################################
# ucscToINSDC table/track (DONE - 2015-01-22 - Hiram)

    mkdir /hive/data/genomes/danRer10/bed/ucscToINSDC
    cd /hive/data/genomes/danRer10/bed/ucscToINSDC
    # check for chrM in assembly:
    grep chrM ../../danRer10.agp
# chrM    1       16596   4       F       NC_002333.2     1       16596   +

    # use the accession name from there in this command (blank if none)
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
        ../../genbank/GCA_*assembly_structure/Primary_Assembly NC_002333.2

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed
    # should all be the same line count:
    wc -l *
#  1061 name.coordinate.tab
#  1061 ucscToINSDC.bed
#  1061 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 16
    # use the 16 in this sed
    sed -e "s/21/16/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab danRer10 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords danRer10
    # should cover %100 entirely:
    featureBits -countGaps danRer10 ucscToINSDC
    # 1371719383 bases of 1371719383 (100.000%) in intersection

############################################################################
#  BLATSERVERS ENTRY (DONE - 2015-03-20 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev

     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("danRer10", "blat4c", "17862", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("danRer10", "blat4c", "17863", "0", "1");' \
		hgcentraltest
     #	test it with some sequence

############################################################################
# LIFTOVER TO danRer11 (DONE - 2017-10-20 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer10/bed/blat.danRer11.2017-10-20
    cd /hive/data/genomes/danRer10/bed/blat.danRer11.2017-10-20
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/danRer10/jkStuff/danRer10.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         danRer10 danRer11) > do.log 2>&1
    # real    194m46.839s

    # verify the convert link on the test browser is now active from danRer10 to
    # danRer11

############################################################################
# LIFTOVER TO danRer7 (DONE - 2015-01-29 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer10/bed/blat.danRer7.2015-01-29
    cd /hive/data/genomes/danRer10/bed/blat.danRer7.2015-01-29
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/danRer10/jkStuff/danRer10.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         danRer10 danRer7) > do.log 2>&1
    # real    253m49.880s

    # verify the convert link on the test browser is now active from danRer10 to
    # danRer7

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE 2015-02-04 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=danRer10 -keys all.joiner
    # about 50 minutes
    joinerCheck -database=danRer10 -tableCoverage all.joiner
    joinerCheck -database=danRer10 -times all.joiner

    cd /hive/data/genomes/danRer10
    time makeDownloads.pl danRer10 > downloads.log 2>&1
    # real    10m5.488s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/danRer10/pushQ
    cd /hive/data/genomes/danRer10/pushQ
    time makePushQSql.pl danRer10 > danRer10.pushQ.sql 2> stderr.out
    # real    5m47.086s

    #   check for errors in stderr.out, some are OK, e.g.:
# WARNING: hgwdev does not have /gbdb/danRer10/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/danRer10/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/danRer10/bbi/qualityBw/quality.bw
# WARNING: danRer10 does not have seq
# WARNING: danRer10 does not have extFile

    #   copy it to hgwbeta
    scp -p danRer10.pushQ.sql qateam@hgwbeta:/tmp
    # ssh takes the destination once; everything after it is the command run
    # on the remote host, so the command is the single quoted string.
    ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/danRer10.pushQ.sql"

    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
# UCSC to RefSeq name correspondence (DONE - 2015-04-15 - Hiram)

    mkdir /hive/data/genomes/danRer10/bed/ucscToRefSeq
    cd /hive/data/genomes/danRer10/bed/ucscToRefSeq

    rsync -avPL \
  rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_other/Danio_rerio/all_assembly_versions/GCA_000002035.3_GRCz10/GCA_000002035.3_GRCz10_assembly_report.txt ./

    # this assembly_report has "UCSC-style-name" in column 10
    # but it does not name everything

    # columns 5 and 7 are the INSDC and RefSeq names
    # chrMT fixup in the sed
    grep -v "^#" GCA_000002035.3_GRCz10_assembly_report.txt \
      | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' |
        sed -e 's/^na/NC_002333.2/;' | sort > insdc.refSeq.tab

    hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' \
      danRer10 | sort > insdc.ucsc.tab

    join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \
       | cut -f2- > ucsc.refSeq.tab

    # when working perfectly, all these tab files have the same line count:
    wc -l *.tab
# 1061 insdc.refSeq.tab
# 1061 insdc.ucsc.tab
# 1061 ucsc.refSeq.tab

    export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1`
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab danRer10 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab

    checkTableCoords  danRer10 -table=ucscToRefSeq
    # should cover %100 all bases:
    featureBits -countGaps danRer10 ucscToRefSeq
    # 1371719383 bases of 1371719383 (100.000%) in intersection

#########################################################################
# add chromAlias table (DONE - 2017-06-22 - Hiram)

    mkdir /hive/data/genomes/danRer10/bed/chromAlias
    cd /hive/data/genomes/danRer10/bed/chromAlias

    # Compute idKeys for the GenBank release of this assembly.
    # The 2bit copied here is the GCA_ (GenBank) file, so the doIdKeys.pl
    # name and the -twoBit path use that same GCA_ prefix; the join that
    # follows reads genbank/GCA_000002035.3_GRCz10.idKeys.txt.
    mkdir genbank
    cd genbank
    cp -p /hive/data/inside/ncbi/genomes/genbank/vertebrate_other/Danio_rerio/latest_assembly_versions/GCA_000002035.3_GRCz10/GCA_000002035.3_GRCz10.ncbi.2bit .
    time (doIdKeys.pl GCA_000002035.3_GRCz10 \
      -twoBit=`pwd`/GCA_000002035.3_GRCz10.ncbi.2bit \
         -buildDir=`pwd`) > do.log 2>&1
    # real    0m52.374s

    cd /hive/data/genomes/danRer10/bed/chromAlias
    join ../idKeys/danRer10.idKeys.txt \
 genbank/GCA_000002035.3_GRCz10.idKeys.txt \
    | awk '{printf "%s\t%s\n", $2,$3}' | sort > ucsc.genbank.tab

    # Compute idKeys for the RefSeq release of this assembly.
    cd /hive/data/genomes/danRer10/bed/chromAlias
    mkdir refseq
    cd refseq
    cp -p /hive/data/inside/ncbi/genomes/refseq/vertebrate_other/Danio_rerio/latest_assembly_versions/GCF_000002035.5_GRCz10/GCF_000002035.5_GRCz10.ncbi.2bit .
    time (doIdKeys.pl GCF_000002035.5_GRCz10 \
      -twoBit=`pwd`/GCF_000002035.5_GRCz10.ncbi.2bit \
         -buildDir=`pwd`) > do.log 2>&1
    # real    0m47.365s

    cd /hive/data/genomes/danRer10/bed/chromAlias
    # Pair UCSC names with RefSeq names through their shared idKeys, in the
    # same way as the GenBank and Ensembl joins in this section; without this
    # step the refseq idKeys computed above are never used.
    # NOTE(review): this join was not recorded in the original log - confirm
    # that refseq/GCF_000002035.5_GRCz10.idKeys.txt is the file doIdKeys.pl
    # wrote and that ucsc.refseq.tab is the name chromAlias.pl expects.
    join ../idKeys/danRer10.idKeys.txt \
 refseq/GCF_000002035.5_GRCz10.idKeys.txt \
    | awk '{printf "%s\t%s\n", $2,$3}' | sort > ucsc.refseq.tab

    join ../idKeys/danRer10.idKeys.txt \
  /hive/users/hiram/idKeys/ensembl/release-85/danio_rerio/Danio_rerio.GRCz10.idKeys.txt \
    | awk '{printf "%s\t%s\n", $2,$3}' | sort > ucsc.ensembl1.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl

    hgLoadSqlTab danRer10 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        danRer10.chromAlias.tab

##############################################################################
# crispr whole genome (DONE - 2023-11-04 - Hiram)
    # redmine issue 21863: https://redmine.soe.ucsc.edu/issues/21863

    mkdir /hive/data/genomes/danRer10/bed/crisprAll
    cd /hive/data/genomes/danRer10/bed/crisprAll

    # make sure it can get started
    time (~/kent/src/hg/utils/automation/doCrispr.pl \
      -stop=guides -buildDir=`pwd` -smallClusterHub=hgwdev danRer10) \
           > guides.log 2>&1
    # real    30m39.710s
    sed -e 's/^/# /;' guides/run.time
# Completed: 99 of 99 jobs
# CPU time in finished jobs:       5510s      91.83m     1.53h    0.06d  0.000 y
# IO & Wait Time:                   273s       4.55m     0.08h    0.00d  0.000 y
# Average job time:                  58s       0.97m     0.02h    0.00d
# Longest finished job:              96s       1.60m     0.03h    0.00d
# Submission to last job:           105s       1.75m     0.03h    0.00d

    # looks good, let it run through the load:
    # The parentheses group the whole run so that its stdout and stderr both
    # go to load.log, the same form as the guides step.
    time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScoreJobList \
        -stop=load -buildDir=`pwd` -smallClusterHub=hgwdev danRer10) \
           > load.log 2>&1
    # real    2523m24.976s

    sed -e 's/^/# /;' specScores/run.time
# Completed: 840309 of 840309 jobs
# CPU time in finished jobs:   46880766s  781346.10m 13022.43h  542.60d  1.487 y
# IO & Wait Time:               1164411s   19406.85m   323.45h   13.48d  0.037 y
# Average job time:                  57s       0.95m     0.02h    0.00d
# Longest finished job:             145s       2.42m     0.04h    0.00d
# Submission to last job:        116299s    1938.32m    32.31h    1.35d

    sed -e 's/^/# /;'  effScores/run.time
# Completed: 9536 of 9536 jobs
# CPU time in finished jobs:    4820886s   80348.09m  1339.13h   55.80d  0.153 y
# IO & Wait Time:                 41241s     687.36m    11.46h    0.48d  0.001 y
# Average job time:                 510s       8.50m     0.14h    0.01d
# Longest finished job:            2057s      34.28m     0.57h    0.02d
# Submission to last job:          9265s     154.42m     2.57h    0.11d

    sed -e 's/^/# /;'  offTargets/run.time
# Completed: 42016 of 42016 jobs
# CPU time in finished jobs:     558233s    9303.88m   155.06h    6.46d  0.018 y
# IO & Wait Time:                309956s    5165.94m    86.10h    3.59d  0.010 y
# Average job time:                  21s       0.34m     0.01h    0.00d
# Longest finished job:              34s       0.57m     0.01h    0.00d
# Submission to last job:          1102s      18.37m     0.31h    0.01d


    # that made the table crispr10K and symlinks in /gbdb/danRer10/crispr10K/
    # when it should have been instead crisprAll, reset the links and reload
    # the correct table:
mkdir -p /gbdb/danRer10/crisprAll/
rm -f /gbdb/danRer10/crisprAll/crispr.bb
rm -f /gbdb/danRer10/crisprAll/crisprDetails.tab
ln -sf `pwd`/crispr.bb /gbdb/danRer10/crisprAll/crispr.bb
ln -sf `pwd`/crisprDetails.tab /gbdb/danRer10/crisprAll/crisprDetails.tab
hgBbiDbLink danRer10 crisprAllTargets /gbdb/danRer10/crisprAll/crispr.bb

    hgsql -e 'drop table crispr10K;' danRer10

    grep -c . effScores.tab
    # 95378380
    grep -c . specScores.tab
    # 61805075

    time (cut -f1-3 crispr.bed | bedSingleCover.pl stdin \
       | awk '{sum+=$3-$2}END{printf "%d bases\n", sum}') \
            > coverage.crispr.bed.txt 2>&1
    936176533 bases
    real    4m42.959s
    ave -col=2 ../../*.sizes | grep total
    total 1371719383.000000
    # 'featureBits' result:
    echo "scale+=3; 100.0 * 936176533 / 1371719383" | bc
    68.248

##############################################################################
