# for emacs: -*- mode: sh; -*-

# This file describes browser build for the panPan2
# Bonobo -- Pan paniscus

# this is an update to panPan1 sequence to fix big errors in their chrom
#    structures

# chrMT listed in assembly panpan1.1 == NC_001644.1

#############################################################################
# fetch sequence from new style download directory (DONE - 2016-01-08 - Hiram)
    mkdir -p /hive/data/genomes/panPan2/refseq
    cd /hive/data/genomes/panPan2/refseq

    time rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Pan_paniscus/all_assembly_versions/GCF_000258655.2_panpan1.1/ ./
    # real    6m9.041s
    # sent 2810 bytes  received 3928893248 bytes  10633006.92 bytes/sec
    # total size is 3928403393  speedup is 1.00

    # measure what we have here:
    faSize  GCF_000258655.2_panpan1.1_genomic.fna.gz
# 3286643896 bases (560753890 N's 2725890006 real 1745980756 upper 979909250
# lower) in 10274 sequences in 1 files
# Total size: mean 319899.2 sd 6899023.6 min 217 (NW_014024393.1) max 247869975 (NC_027870.1) median 1230
# %29.81 masked total, %35.95 masked real

    time faCount  GCF_000258655.2_panpan1.1_genomic.fna.gz | less
# #seq    len             A       C          G       T         N       cpg
# total 3286643896  806971547 555159394 555452392 808306673 560753890 25946869
# real    1m27.422s

#############################################################################
# fixup to UCSC naming scheme (DONE - 2016-04-18 - Hiram)
    mkdir /hive/data/genomes/panPan2/ucsc
    cd /hive/data/genomes/panPan2/ucsc

    time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
     ../refseq/*_genomic.fna.gz ../refseq/*_assembly_structure/Primary_Assembly

NC_027868.1 chr1
NC_027869.1 chr2A
NC_027870.1 chr2B
NC_027871.1 chr3
NC_027872.1 chr4
NC_027873.1 chr5
NC_027874.1 chr6
NC_027875.1 chr7
NC_027876.1 chr8
NC_027877.1 chr9
NC_027878.1 chr10
NC_027879.1 chr11
NC_027880.1 chr12
NC_027881.1 chr13
NC_027882.1 chr14
NC_027883.1 chr15
NC_027884.1 chr16
NC_027885.1 chr17
NC_027886.1 chr18
NC_027887.1 chr19
NC_027888.1 chr20
NC_027889.1 chr21
NC_027890.1 chr22
NC_027891.1 chrX

real    17m57.979s

    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
    # processed 10249 sequences into chrUn.fa.gz
    # real    4m35.540s

    # bash syntax here
    mitoAcc=`grep "^# mitoAcc" ../panPan2.config.ra | awk '{print $NF}'`
    printf "# mitoAcc %s\n" "$mitoAcc"
# mitoAcc NC_001644.1

    zcat \
  ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \
     | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp

    printf ">chrM\n" > chrM.fa
    twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa
    gzip chrM.fa

    # verify fasta and AGP match:
    time faToTwoBit chr*.fa.gz test.2bit
    # real    1m45.015s
    cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail
    #  All AGP and FASTA entries agree - both files are valid


    # verify nothing lost compared to the refseq download:
    faSize *.fa.gz
# 3286643896 bases (560753890 N's 2725890006 real 2725890006 upper 0 lower) in 10274 sequences in 26 files
# Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1) max 247869975 (chr2B) median 1230

    # same totals as above:
# 3286643896 bases (560753890 N's 2725890006 real 1745980756 upper 979909250
# lower) in 10274 sequences in 1 files
# Total size: mean 319899.2 sd 6899023.6 min 217 (NW_014024393.1) max 247869975 (NC_027870.1) median 1230

#############################################################################
#  Initial database build (DONE - 2016-04-18 - Hiram)

# construct the required photoReference.txt
    cd /hive/data/genomes/panPan2
    printf "photoCreditURL http://a-z-animals.com/animals/bonobo/pictures/2955/
photoCreditName Photo courtesy of Kabir Bakie\n" > photoReference.txt

    # this almost functioned OK.  It couldn't find a commonName or the
    # bioProject
    ~/kent/src/hg/utils/automation/prepConfig.pl panPan2 mammal bonobo \
       refseq/*_assembly_report.txt > panPan2.config.ra
    
    cat panPan2.config.ra
# config parameters for makeGenomeDb.pl:
db panPan2
clade mammal
genomeCladePriority 15
scientificName Pan paniscus
commonName Bonobo
assemblyDate Aug. 2015
assemblyLabel Max-Planck Institute for Evolutionary Anthropology
assemblyShortLabel MPI-EVA panpan1.1
orderKey 2624
# mitochondrial sequence included in refseq release
# mitoAcc NC_001644.1
# http://www.ncbi.nlm.nih.gov/bioproject/PRJNA11815
mitoAcc none
fastaFiles /hive/data/genomes/panPan2/ucsc/*.fa.gz
agpFiles /hive/data/genomes/panPan2/ucsc/*.agp
# qualFiles none
dbDbSpeciesDir bonobo
photoCreditURL http://a-z-animals.com/animals/bonobo/pictures/2955/
photoCreditName Photo courtesy of Kabir Bakie
ncbiGenomeId 10729
ncbiAssemblyId 474211
ncbiAssemblyName panpan1.1
ncbiBioProject 169343
ncbiBioSample SAMEA1029457
genBankAccessionID GCF_000258655.2
taxId 9597

    # verify sequence and AGP are OK:
    time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp panPan2.config.ra > agp.log 2>&1
    # real    3m58.459s

    # then finish it off:
    time (~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse=hgwdev \
       -dbHost=hgwdev -fileServer=hgwdev -continue=db \
           panPan2.config.ra ) > db.log 2>&1
    # real    29m36.731s

    # check in the trackDb files created and add to trackDb/makefile
    # temporary symlink until after masking
    ln -s `pwd`/panPan2.unmasked.2bit /gbdb/panPan2/panPan2.2bit

#############################################################################
# cytoBandIdeo - (DONE - 2016-04-19 - Hiram)
    mkdir /hive/data/genomes/panPan2/bed/cytoBand
    cd /hive/data/genomes/panPan2/bed/cytoBand
    makeCytoBandIdeo.csh panPan2

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2016-04-19 - Hiram)
    mkdir /hive/data/genomes/panPan2/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/panPan2/bed/cpgIslandsUnmasked

    # run stepwise so the loading can be done in a different table
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/panPan2/panPan2.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku panPan2) > do.log 2>&1
    # real    22m23.619s

    cat fb.panPan2.cpgIslandExtUnmasked.txt
    # 23203460 bases of 2725937399 (0.851%) in intersection

#############################################################################
# running repeat masker (DONE - 2016-04-19,05-04 - Hiram)
    mkdir /hive/data/genomes/panPan2/bed/repeatMasker
    cd /hive/data/genomes/panPan2/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku panPan2) > do.log 2>&1 &
    # real    805m20.249s
    # had one missing ID failure:
#RepeatMasker bug?: Undefined id, line 3391703 of input:
#  2118  12.4  3.8  2.4  chr4      152298481 152298818 (43505445) +  MER61C         LTR/ERV1                86  428    (0)
# At least one ID was missing (see warnings above) -- please report to Robert Hubley.  -continue at your disgression.

    # cleaning out the single bad record
    grep -v "152298481 152298818" panPan2.sorted.fa.out > panPan2.cleaned.fa.out
    mv panPan2.sorted.fa.out panPan2.sorted.fa.out.badRecord
    mv panPan2.cleaned.fa.out panPan2.sorted.fa.out

    # continuing:
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -continue=mask -smallClusterHub=ku panPan2) > mask.log 2>&1

    cat faSize.rmsk.txt
# 3286643896 bases (560753890 N's 2725890006 real 1348347850 upper
#    1377542156 lower) in 10274 sequences in 1 files
# Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1)
#    max 247869975 (chr2B) median 1230
# %41.91 masked total, %50.54 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    # CC   RELEASE 20140131;

    time featureBits -countGaps panPan2 rmsk
    # 1378308706 bases of 3286643896 (41.937%) in intersection
    # real    0m56.056s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

    # faster way to get the same result:
    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' panPan2 \
        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
    # total 1378308706.000000
    # real    0m36.998s

##########################################################################
# running simple repeat (DONE - 2016-04-19 - Hiram)

    mkdir /hive/data/genomes/panPan2/bed/simpleRepeat
    cd /hive/data/genomes/panPan2/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        panPan2) > do.log 2>&1 &
    # real    20m23.012s

    cat fb.simpleRepeat
    # 53570869 bases of 2725937399 (1.965%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/panPan2
    twoBitMask panPan2.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed panPan2.2bit
    #   you can safely ignore the warning about fields >= 13
    twoBitToFa panPan2.2bit stdout | faSize stdin > faSize.panPan2.2bit.txt
    cat faSize.panPan2.2bit.txt
    # 3286643896 bases (560753890 N's 2725890006 real 1346906367 upper
    #    1378983639 lower) in 10274 sequences in 1 files
    # Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1)
    #    max 247869975 (chr2B) median 1230
    # %41.96 masked total, %50.59 masked real

    rm /gbdb/panPan2/panPan2.2bit
    ln -s `pwd`/panPan2.2bit /gbdb/panPan2/panPan2.2bit

#############################################################################
# CREATE MICROSAT TRACK (DONE - 2016-04-19 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/panPan2/bed/microsat
    cd /cluster/data/panPan2/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed panPan2 microsat microsat.bed
    # Read 26743 elements of size 4 from microsat.bed

#############################################################################
# ucscToINSDC table/track (DONE - 2016-04-19 - Hiram)
    # the sequence here is working for a 'refseq' assembly with a chrM
    # situation may be specific depending upon what is available in the assembly

    mkdir /hive/data/genomes/panPan2/bed/ucscToINSDC
    cd /hive/data/genomes/panPan2/bed/ucscToINSDC

    # find accession for chrM
    grep chrM ../../panPan2.agp
# chrM    1       16563   1       O       NC_001644.1     1       16563   +

    # use that accession here:
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
        ../../refseq/GCF_*structure/Primary_Assembly NC_001644.1
    awk '{printf "%s\t%s\n", $2, $1}' ucscToINSDC.txt | sort > insdcToUcsc.txt

    # there is no INSDC name for the chrM/NC_001644.1 sequence, there is
    #  no such sequence with an INSDC name
    grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \
      | sed -e 's/na\b/notAvailable/;' | awk '{printf "%s\t%s\n", $2, $1}' \
         | sort > insdc.refseq.txt
    # the \b in the sed expression matches a word boundary, so 'na' is
    # only replaced where it ends a word

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    # the tr commands turn the space separated output of join back into
    # tabs; this avoids the -t argument to the join command, which does not
    # accept -t'\t' and instead needs a literal tab typed as ctrl-v i,
    # which is invisible and can not be copied
    join insdc.refseq.txt insdcToUcsc.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \
           > ucscToINSDC.bed

    # should be same line counts throughout:
    wc -l *
    # 2490 insdc.refseq.txt
    # 2490 insdcToUcsc.txt
    # 2490 name.coordinate.tab
    # 2490 ucscToINSDC.bed
    # 2490 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 20
    # use the 20 in this sed
    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab panPan2 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords panPan2
    # should cover %100 entirely:
    featureBits -countGaps panPan2 ucscToINSDC
    # 3286643896 bases of 3286643896 (100.000%) in intersection

    join -1 2 <(sort -k2 ucscToINSDC.txt) insdc.refseq.txt | tr '[ ]' '[\t]' \
      | sort -k2 | join -2 2 name.coordinate.tab - |  tr '[ ]' '[\t]' \
        | cut -f1-4 > ucscToRefSeq.bed
    cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1
    # 20
    # use the 20 in this sed
    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab panPan2 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    checkTableCoords  panPan2 -table=ucscToRefSeq
    # should cover %100 all bases:
    featureBits -countGaps panPan2 ucscToRefSeq
    # 3286643896 bases of 3286643896 (100.000%) in intersection

#########################################################################
# add chromAlias table (DONE - 2017-02-27 - Hiram)

    mkdir /hive/data/genomes/panPan2/bed/chromAlias
    cd /hive/data/genomes/panPan2/bed/chromAlias

    hgsql -N -e 'select chrom,name,"refseq" from ucscToRefSeq;' panPan2 \
        > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' panPan2 \
        > ucsc.genbank.tab

    awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \
        | sort > panPan2.chromAlias.tab

    hgLoadSqlTab panPan2 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        panPan2.chromAlias.tab

    cd /hive/data/genomes/panPan2/bed/chromAlias
    # add ensembl names 2017-12-14
    mkdir previous
    mv *.tab previous
    join -t$'\t' ../idKeys/panPan2.idKeys.txt \
	../../ensembl/ensemblPanPan2.idKeys.txt \
	| cut -f2,3 | sort > ucsc.ensembl.tab

    cut -f1,2 previous/ucsc.refseq.tab > ucsc.refseq.tab
    cut -f1,2 previous/ucsc.genbank.tab > ucsc.genbank.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl
    sort -o panPan2.chromAlias.tab panPan2.chromAlias.tab

# Sanity check of the chromAlias table before loading it: for each alias
# source, the number of rows in ucsc.<source>.tab must equal the number of
# lines in panPan2.chromAlias.tab that mention that source.
# Prints one "# checking <source>: <c0> =? <c1> OK|ERROR" line per source.
# A zero count (missing or empty input) is reported as ERROR so the check
# can not pass vacuously as "0 =? 0 OK".
for t in refseq genbank ensembl
do
  # redirect into wc so only the bare number is captured, not the file name
  c0=$(wc -l < "ucsc.$t.tab")
  # grep -c counts matching lines directly, no need for a wc pipe
  c1=$(grep -c -- "$t" panPan2.chromAlias.tab)
  # an unreadable input leaves the count empty; treat that as zero
  c0=${c0:-0}
  c1=${c1:-0}
  ok="OK"
  if [ "$c0" -eq 0 ] || [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  # values are passed as arguments, never as part of the format string
  printf "# checking %s: %s =? %s %s\n" "$t" "$c0" "$c1" "$ok"
done
# checking refseq: 10274 =? 10274 OK
# checking genbank: 10274 =? 10274 OK
# checking ensembl: 10274 =? 10274 OK

    hgLoadSqlTab panPan2 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        panPan2.chromAlias.tab

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2016-04-19 - Hiram)

    cd ~/kent/src/hg/makeDb/trackDb/bonobo/panPan2
    # preview prefixes and suffixes:
    hgsql -N -e "select frag from gold;" panPan2 \
      | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c 
# 121336 AJFE.1
#      1 NC_.1

    # implies a search rule of: '[ACEFJN_]+[0-9]+(\.[0-9]+)?'

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" panPan2 | wc -l
    # 121337

    hgsql -N -e "select frag from gold;" panPan2 \
       | egrep -e '[ACEFJN_]+[0-9]+(\.[0-9]+)?' | wc -l
    # 121337

    hgsql -N -e "select frag from gold;" panPan2 \
       | egrep -v -e '[ACEFJN_]+[0-9]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/bonobo/panPan2/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [ACEFJN_]+[0-9]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

##########################################################################
## WINDOWMASKER (DONE - 2016-05-04 - Hiram)

    mkdir /hive/data/genomes/panPan2/bed/windowMasker
    cd /hive/data/genomes/panPan2/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev panPan2) > do.log 2>&1
    # real    379m56.649s

    # Masking statistics
    cat faSize.panPan2.cleanWMSdust.txt
# 3286643896 bases (560753890 N's 2725890006 real 1730579207 upper
#    995310799 lower) in 10274 sequences in 1 files
# Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1)
#    max 247869975 (chr2B) median 1230
# %30.28 masked total, %36.51 masked real

    cat fb.panPan2.rmsk.windowmaskerSdust.txt
    # 768236951 bases of 3286643896 (23.375%) in intersection

##########################################################################
# cpgIslands - (DONE - 2016-05-05 - Hiram)
    mkdir /hive/data/genomes/panPan2/bed/cpgIslands
    cd /hive/data/genomes/panPan2/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku panPan2) > do.log 2>&1
    # real    6m54.203s

    cat fb.panPan2.cpgIslandExt.txt
    # 17035990 bases of 2725937399 (0.625%) in intersection

##############################################################################
# ncbiRefSeq gene track (DONE - 2016-05-05 - Hiram)
    mkdir /hive/data/genomes/panPan2/bed/ncbiRefSeq
    cd /hive/data/genomes/panPan2/bed/ncbiRefSeq

    # working on this script, running step by step:
    time (/cluster/home/hiram/kent/src/hg/utils/automation/doNcbiRefSeq.pl \
  -stop=download -buildDir=`pwd` -bigClusterHub=ku \
  -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \
      refseq vertebrate_mammalian Pan_paniscus \
         GCF_000258655.2_panpan1.1 panPan2) > download.log 2>&1
    # real    12m36.320s

    time (/cluster/home/hiram/kent/src/hg/utils/automation/doNcbiRefSeq.pl \
  -continue=process -stop=process -buildDir=`pwd` -bigClusterHub=ku \
  -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \
      refseq vertebrate_mammalian Pan_paniscus \
         GCF_000258655.2_panpan1.1 panPan2) > process.log 2>&1
    # real    4m22.621s

    time (/cluster/home/hiram/kent/src/hg/utils/automation/doNcbiRefSeq.pl \
  -continue=load -stop=load -buildDir=`pwd` -bigClusterHub=ku \
  -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \
      refseq vertebrate_mammalian Pan_paniscus \
         GCF_000258655.2_panpan1.1 panPan2) > load.log 2>&1
    # real    0m21.690s

    cat fb.ncbiRefSeq.panPan2.txt
    # 74646536 bases of 2725937399 (2.738%) in intersection

##############################################################################
# genscan - (DONE - 2016-05-05 - Hiram)
    mkdir /hive/data/genomes/panPan2/bed/genscan
    cd /hive/data/genomes/panPan2/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku panPan2) > do.log 2>&1
# Completed: 10270 of 10274 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:     450524s    7508.73m   125.15h    5.21d  0.014 y
# IO & Wait Time:                 27120s     452.00m     7.53h    0.31d  0.001 y
# Time in running jobs:          167787s    2796.45m    46.61h    1.94d  0.005 y
# Average job time:                  47s       0.78m     0.01h    0.00d
# Longest running job:            83913s    1398.55m    23.31h    0.97d
# Longest finished job:           64597s    1076.62m    17.94h    0.75d
# Submission to last job:         64787s    1079.78m    18.00h    0.75d

    # two jobs failed due to almost all N's in the hard mask sequence and
    # they sneaked through the check for that, continuing:
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -continue=makeBed -bigClusterHub=ku panPan2) > makeBed.log 2>&1
    # real    2m23.103s

    cat fb.panPan2.genscan.txt
    #   45997720 bases of 2725937399 (1.687%) in intersection

    cat fb.panPan2.genscanSubopt.txt
    #   45930774 bases of 2725937399 (1.685%) in intersection

#############################################################################
# augustus gene track (DONE - 2016-05-11 - Hiram)

    mkdir /hive/data/genomes/panPan2/bed/augustus
    cd /hive/data/genomes/panPan2/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
        -species=human -dbHost=hgwdev \
           -workhorse=hgwdev panPan2) > do.log 2>&1
    # real    95m5.905s

    cat fb.panPan2.augustusGene.txt
    # 49244982 bases of 3142093174 (1.567%) in intersection

#########################################################################
# Create kluster run files (DONE - 2016-05-11 - Hiram)

    # numerator is panPan2 gapless bases "real" as reported by:
    featureBits -noRandom -noHap panPan2 gap
    # 469457882 bases of 2682465908 (17.501%) in intersection

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2682465908 / 2861349177 \) \* 1024
    # ( 2682465908 / 2861349177 ) * 1024 = 959.982484

    # ==> use -repMatch=900 according to size scaled down from 1024 for human.
    #   and rounded down to nearest 100
    cd /hive/data/genomes/panPan2
    time blat panPan2.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/panPan2.11.ooc \
        -repMatch=900
    #   Wrote 34618 overused 11-mers to jkStuff/panPan2.11.ooc
    #    real    0m56.353s

    #   check non-bridged gaps to see what the typical size is:
    hgsql -N \
        -e 'select * from gap where bridge="no" order by size;' panPan2 \
        | sort -k7,7nr
    #   minimum size is 1000
    #   decide on a minimum gap for this break: using either 100 or 5000
    #   generates 13387 liftOver rows, but using 6000 yields only 11703 rows,
    #   so use 1000 (the minimum gap size) here to get more liftOver rows.
    gapToLift -verbose=2 -minGap=1000 panPan2 jkStuff/panPan2.nonBridged.lft \
        -bedFile=jkStuff/panPan2.nonBridged.bed

########################################################################
# GENBANK AUTO UPDATE (DONE - 2016-05-17 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
    # Pan paniscus    440     1       46


    # edit etc/genbank.conf to add panPan2 just before panPan1

# panPan2 (bonobo - Pan paniscus)
panPan2.serverGenome = /hive/data/genomes/panPan2/panPan2.2bit
panPan2.clusterGenome = /hive/data/genomes/panPan2/panPan2.2bit
panPan2.ooc = /hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc
panPan2.lift = /hive/data/genomes/panPan2/jkStuff/panPan2.nonBridged.lft
panPan2.perChromTables = no
panPan2.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
panPan2.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
panPan2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
panPan2.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
panPan2.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
panPan2.genbank.est.xeno.pslCDnaFilter    = ${finished.genbank.est.xeno.pslCDnaFilter}
panPan2.downloadDir = panPan2
# defaults yes genbank.mrna.native, genbank.est.native.load,
#              refseq.mrna.native, refseq.mrna.xeno
# DO NOT NEED genbank.mrna.xeno except for human, mouse
panPan2.genbank.est.native.load = no

    git commit -m "Added panPan2; refs #16036" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

# Edit src/lib/gbGenome.c to add new species.  Skipped

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial panPan2
    #  logFile: var/build/logs/2016.05.11-14:44:15.panPan2.initalign.log
    #   real    196m8.278s
    tail -2 var/build/logs/2016.05.11-14:44:15.panPan2.initalign.log
# hgwdev 2016.05.11-17:58:29 panPan2.initalign: Succeeded: panPan2
# hgwdev 2016.05.11-18:00:23 panPan2.initalign: finish

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.panPan2

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad panPan2
    # logFile: var/dbload/hgwdev/logs/2016.05.13-16:11:17.panPan2.dbload.log
    # real    11m44.667s
    tail -1 var/dbload/hgwdev/logs/2016.05.13-16:11:17.panPan2.dbload.log
    # hgwdev 2016.05.13-16:23:02 panPan2.dbload: finish

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add panPan2 to:
    #   etc/align.dbs
    #   etc/hgwdev.dbs
    git add etc/align.dbs
    git add etc/hgwdev.dbs
    git commit -m "Added panPan2 - Bonobo refs #16036" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

##############################################################################
# LIFTOVER TO panPan1 (DONE - 2016-05-17 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/panPan2/bed/blat.panPan1.2016-05-17
    cd /hive/data/genomes/panPan2/bed/blat.panPan1.2016-05-17

    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         panPan2 panPan1

    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         panPan2 panPan1) > doLiftOverToPanPan1.log 2>&1
    # real    156m48.953s

    # verify this functions in the genome browser from panPan2 to panPan1

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2017-02-06 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("panPan2", "blat1c", "17884", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("panPan2", "blat1c", "17885", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################

##############################################################################
# set default position to FOXP2 gene displays  (DONE - 2017-02-06 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chr7:119267067-119359255" where name="panPan2";' \
	hgcentraltest

##############################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2017-02-06 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=panPan2 -tableCoverage all.joiner
    joinerCheck -database=panPan2 -times all.joiner
    joinerCheck -database=panPan2 -keys all.joiner

    cd /hive/data/genomes/panPan2
    time (makeDownloads.pl panPan2) > downloads.log 2>&1
    # real    20m55.262s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/panPan2/pushQ
    cd /hive/data/genomes/panPan2/pushQ
    time (makePushQSql.pl panPan2) > panPan2.pushQ.sql 2> stderr.out
    # real    3m46.855s

    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/panPan2/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/panPan2/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/panPan2/bbi/quality.bw
    # WARNING: panPan2 does not have seq
    # WARNING: panPan2 does not have extFile

    #   copy it to hgwbeta
    scp -p panPan2.pushQ.sql qateam@hgwbeta:/tmp/
    ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/panPan2.pushQ.sql"
    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

##############################################################################
# LIFTOVER TO panPan3 (DONE - 2020-06-15 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/panPan2/bed/blat.panPan3.2020-06-15
    cd /hive/data/genomes/panPan2/bed/blat.panPan3.2020-06-15

    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         panPan2 panPan3

    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         panPan2 panPan3) > doLiftOverToPanPan3.log 2>&1
    # real    312m42.976s

    # verify this functions in the genome browser from panPan2 to panPan3

#########################################################################
