# for emacs: -*- mode: sh; -*-

# This file describes browser build for the gorGor4
# gorilla/Gorilla gorilla gorilla -- Wellcome Trust Sanger Institute

#############################################################################
# fetch sequence from new style download directory (DONE - 2015-10-16 - Jonathan)
    # NCBI has redesigned their FTP download site, new type of address
    #      and naming scheme
    mkdir -p /hive/data/genomes/gorGor4/genbank
    cd /hive/data/genomes/gorGor4/genbank

    rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Gorilla_gorilla/latest_assembly_versions/GCA_000151905.3_gorGor4/ ./

    # measure what we have here:
    faSize \
*_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/*.fna.gz \
*_assembly_structure/Primary_Assembly/unlocalized_scaffolds/FASTA/chr*.unlocalized.scaf.fna.gz \
*_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz
    # 3063346382 bases (146001290 N's 2917345092 real 2917345092 upper 0
    #     lower) in 40710 sequences in 49 files
    # Total size: mean 75248.0 sd 3218522.4 min 100 (CABD030167151.1) max
    #     228908639 (FR853080.2) median 2005
    # N count: mean 3586.4 sd 159236.0
    # U count: mean 71661.6 sd 3065255.5
    # L count: mean 0.0 sd 0.0
    # %0.00 masked total, %0.00 masked real

    # note that these totals do not include chrM since the GenBank ftp directory
    # did not include a non-nuclear directory

#############################################################################
# fixup to UCSC naming scheme (DONE - 2015-10-16 - Jonathan)
    mkdir /hive/data/genomes/gorGor4/ucsc
    cd /hive/data/genomes/gorGor4/ucsc

    time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
../genbank/*_assembly_structure/Primary_Assembly

# FR853101.2 8
# FR853102.2 9
# FR853080.2 1
# FR853091.2 20
# FR853089.2 18
# FR853088.3 17
# FR853086.2 15
# FR853083.2 12
# FR853096.2 3
# FR853099.2 6
# FR853082.2 11
# FR853093.2 22
# FR853098.3 5
# FR853090.2 19
# FR853097.2 4
# FR853085.2 14
# FR853103.2 X
# FR853092.2 21
# FR853087.2 16
# FR853100.2 7
# FR853081.2 10
# FR853095.2 2B
# FR853094.2 2A
# FR853084.2 13

# real    0m42.532s

    time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \
../genbank/*_assembly_structure/Primary_Assembly

# chr11
# chr21
# chr7
# chr17
# chr22
# chr1
# chr18
# chr16
# chr13
# chr6
# chrX
# chr3
# chr9
# chr2B
# chr12
# chr20
# chr14
# chr15
# chr8
# chr4
# chr2A
# chr19
# chr10
# chr5

    # real    0m1.427s

    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
../genbank/*_assembly_structure/Primary_Assembly
    # real    0m2.393s

    # verify nothing lost compared to genbank:
    faSize *.fa
    # 3063346382 bases (146001290 N's 2917345092 real 2917345092 upper 0
    #     lower) in 40710 sequences in 49 files
    # Total size: mean 75248.0 sd 3218522.4 min 100 (chrUn_CABD030167151v1)
    #     max 228908639 (chr1) median 2005
    # %0.00 masked total, %0.00 masked real

    # same numbers as above.

    cd /hive/data/genomes/gorGor4/ucsc
    zcat ../genbank/*_assembly_structure/non-nuclear/assembled_chromosomes/AGP/*.agp.gz | \
      sed 's/^LN611626.1/chrM/' > chrM.agp
    zcat ../genbank/*_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/*.fna.gz | \
      sed 's/^>LN611626.1.*/>chrM/' > chrM.fa

    # Checking to make sure fa and AGP files match up
    faToTwoBit *.fa junk.2bit
    cat *.agp | checkAgpAndFa stdin junk.2bit 2>&1 | tail

    # Should really run twoBitDup here to save time later - look
    # for duplicate chromosome records.


    # All AGP and FASTA entries agree - both files are valid
    rm junk.2bit

#############################################################################
#  Initial database build (DONE - 2015-10-19 - Jonathan)

    cd /hive/data/genomes/gorGor4
    cat << '_EOF_' > gorGor4.config.ra
# Config parameters for makeGenomeDb.pl:
db gorGor4
clade mammal
genomeCladePriority 12
scientificName  Gorilla gorilla gorilla
commonName Gorilla
assemblyDate Dec 2014
assemblyLabel Wellcome Trust Sanger Institute Dec 2014 (NCBI project 31265, GCA_000151905.1)
assemblyShortLabel gorGor4.1
orderKey 7682
# no mito accession - already part of assembly: LN611626.1
mitoAcc none
fastaFiles /hive/data/genomes/gorGor4/ucsc/*fa
agpFiles /hive/data/genomes/gorGor4/ucsc/*agp
# qualFiles none
dbDbSpeciesDir gorilla
photoCreditURL https://commons.wikimedia.org/wiki/User:Arpingstone
photoCreditName Wikimedia Commons/Adrian Pingstone
ncbiGenomeId 2156
ncbiAssemblyId 503571
ncbiAssemblyName gorGor4
ncbiBioProject 31265
genBankAccessionID GCA_000151905.3
taxId 9595
'_EOF_'
    # << happy emacs

    # verify sequence and AGP are OK:
    time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp gorGor4.config.ra > agp.log 2>&1

    # $ twoBitDup gorGor4.unmasked.2bit 
    # chrUn_CABD030132546v1 and chr2A_CABD030129486v1_random are identical
    # chrUn_CABD030131115v1 and chrUn_CABD030131928v1 are identical
    # chrUn_CABD030131001v1 and chrUn_CABD030131928v1 are identical
    # chrUn_CABD030131720v1 and chrUn_CABD030130937v1 are identical
    # chrUn_CABD030131327v1 and chrUn_CABD030130937v1 are identical
    # chrUn_CABD030132118v1 and chrUn_CABD030132114v1 are identical
    # chrUn_CABD030132319v1 and chr9_CABD030129872v1_random are identical
    # chrUn_CABD030131156v1 and chrUn_CABD030131166v1 are identical
    # chrUn_CABD030132550v1 and chr2A_CABD030129492v1_random are identical
    # chrUn_CABD030132215v1 and chrUn_CABD030132180v1 are identical
    # chrUn_CABD030167305v1 and chrUn_CABD030167306v1 are identical
    # chrUn_CABD030132026v1 and chrUn_CABD030131028v1 are identical
    # chrUn_CABD030131473v1 and chrUn_CABD030131472v1 are identical
    # chrUn_CABD030130414v1 and chrUn_CABD030130361v1 are identical
    # chrUn_CABD030131340v1 and chrUn_CABD030131033v1 are identical
    # chrUn_CABD030131344v1 and chrUn_CABD030132223v1 are identical
    # chrUn_CABD030132027v1 and chrUn_CABD030131029v1 are identical
    # chrUn_CABD030132028v1 and chrUn_CABD030131030v1 are identical
    # chrUn_CABD030131168v1 and chrUn_CABD030131158v1 are identical

    # Exclude duplicates (first name in list)

    faSomeRecords -exclude ucsc/chrUn.fa excludeList dupFree.fa
    grep -v -f excludeList ucsc/chrUn.agp > dupFree.agp

    mkdir ucsc/aside
    mv ucsc/chrUn.* ucsc/aside
    mv dupFree.* ucsc/

    # re-run previous makeGenomeDb.pl command
    #
    # real  2m44.282s

    # then finish it off:
    time nice -n +19 makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db gorGor4.config.ra > db.log 2>&1
    # real    real    24m37.067s

    # check in the trackDb files created and add to trackDb/makefile

##########################################################################
# running repeat masker (DONE - 2015-10-19 - Jonathan)
    mkdir /hive/data/genomes/gorGor4/bed/repeatMasker
    cd /hive/data/genomes/gorGor4/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku gorGor4) > do.log 2>&1
    # real    587m53.835s

    cat faSize.rmsk.txt
    # 3063310485 bases (146001289 N's 2917309196 real 1460079025 upper
    # 1457230171 lower) in 40692 sequences in 1 files
    # Total size: mean 75280.4 sd 3219233.8 min 100 (chrUn_CABD030167151v1)
    # max 228908639 (chr1) median 2006
    # %47.57 masked total, %49.95 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    # CC   RELEASE 20140131;

    time featureBits -countGaps gorGor4 rmsk
    # 1459613759 bases of 3063310485 (47.648%) in intersection
    # real  1m0.933s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

#########################################################################
# UCSC to RefSeq name correspondence (DONE - 2017-10-20 - Hiram)

    mkdir /hive/data/genomes/gorGor4/bed/ucscToRefSeq
    cd /hive/data/genomes/gorGor4/bed/ucscToRefSeq

    # after constructing idKeys in ../chromAlias/refseq/ directory:

    join -t$'\t' ../idKeys/*.idKeys.txt  ../chromAlias/refseq/*.idKeys.txt \
	| cut -f2- | sort > ucsc.refseq.tab
    # the duplicates in the refseq assembly are managed correctly, e.g.:
    grep  chrUn_CABD030131928v1 *.tab
# chrUn_CABD030131928v1   NW_017495415.1
# chrUn_CABD030131928v1   NW_017498946.1
# chrUn_CABD030131928v1   NW_017499000.1

    # refSeq chose a different chrM: UCSC has: LN611626.1 = CABD030129419.1
    # printf "chrM\tNC_011120.1\n" >> ucsc.refseq.tab

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    # the -u on the sort eliminates the duplicates:
    join ucsc.refseq.tab name.coordinate.tab \
       | awk '{printf "%s\t%d\t%d\t%s\n", $1,$3,$4,$2}' \
          | sort -k1,1 -k2,2n -u > ucscToRefSeq.bed

    # when working perfectly, all these tab files have the same line count:
    wc -l *.tab *.bed
#   40692 name.coordinate.tab
#   40710 ucsc.refseq.tab
#   40691 ucscToRefSeq.bed

    # the extra counts in the ucsc.refseq.tab are due to the duplicates
    # in the refseq assembly.  The missing item in the bed file is chrM

    export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    #  28
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab gorGor4 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    checkTableCoords  gorGor4 -table=ucscToRefSeq
    # would normally cover %100 all bases, this case missing chrM:
    featureBits -countGaps gorGor4 ucscToRefSeq
    # 3063294113 bases of 3063310485 (99.999%) in intersection

    calc 3063310485 - 3063294113
    # 3063310485 - 3063294113 = 16372.000000
    twoBitToFa ../../gorGor4.2bit:chrM stdout | faSize stdin
    # 16372 bases (0 N's 16372 real 15975 upper 397 lower) in 1 sequences

#########################################################################
# add chromAlias table (DONE - 2017-12-14 - Hiram)

    mkdir /hive/data/genomes/gorGor4/bed/chromAlias
    cd /hive/data/genomes/gorGor4/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' gorGor4 \
        > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' gorGor4 \
        > ucsc.genbank.tab

    join -t$'\t' ../idKeys/gorGor4.idKeys.txt \
	../../ensembl/ensemblGorGor4.idKeys.txt \
	| cut -f2,3 | sort > ucsc.ensembl.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl
    sort -o gorGor4.chromAlias.tab gorGor4.chromAlias.tab

for t in refseq genbank ensembl
do
  c0=`cat ucsc.$t.tab | wc -l`
  c1=`grep $t gorGor4.chromAlias.tab | wc -l`
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  printf "# checking $t: $c0 =? $c1 $ok\n"
done
# checking refseq: 40691 =? 40691 OK
# checking genbank: 40692 =? 40692 OK
# checking ensembl: 40710 =? 40704 ERROR

# the duplicates from ensembl are in the table, this is OK

    hgLoadSqlTab gorGor4 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        gorGor4.chromAlias.tab

##########################################################################
# running simple repeat (DONE 2015-11-11 - Jonathan)
    # This is running into the same kind of problem that we had with
    # hg38 - ceontromere sequence causes trf to run indefinitely.
    # Following Hiram's method of repeatedly splitting the run down
    # until everything finishes.

    # this didn't work
    mkdir /hive/data/genomes/gorGor4/bed/simpleRepeat
    cd /hive/data/genomes/gorGor4/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        gorGor4) > do.log 2>&1

    # moving that run aside, starting again
    cd /hive/data/genomes/gorGor4/bed
    mv simpleRepeat simpleRepeat.2015-10-19

    mkdir -p /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/splitGap
    cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/splitGap
    mkdir -p noGap

    twoBitToFa ../../../gorGor4.unmasked.2bit stdout \
      | faSplit -lift=noGap.lift gap stdin 5000000 noGap/gorGor4_
    # make sure nothing's missing:
    faCount noGap/*.fa > faCount.txt
    tail -1 faCount.txt
# total 3035682197  862124294 596217273 593636862 865330767 118373001 29582190
    # compared to full genome - should have same ACGT numbers
    # (numeric colums 2-5 - first column is size)
    twoBitToFa ../../../gorGor4.unmasked.2bit stdout | faCount stdin
# total 3063310485  862124294 596217273 593636862 865330767 146001289 29582190
    faToTwoBit noGap/*.fa gorGor4.nogap.2bit
    twoBitInfo gorGor4.nogap.2bit stdout | sort -k2,2nr > gorGor4.nogap.sizes


    mkdir /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M
    cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M
    /cluster/bin/scripts/simplePartition.pl \
/hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/splitGap/gorGor4.nogap.2bit \
    20000000 /hive/data/genomes/gorGor4/TrfPart20M
    ln -s /hive/data/genomes/gorGor4/TrfPart20M \
      /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/TrfPart20M
    ssh ku
    cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M

    printf "%s" '#LOOP
./TrfRun.csh /hive/data/genomes/gorGor4/TrfPart20M/$(path1).bed
#ENDLOOP
' > gsub
    gensub2 /hive/data/genomes/gorGor4/TrfPart20M/partitions.lst single gsub jobList
    para create jobList
    para push
    # 1 job doesn't want to finish
# Completed: 165 of 166 jobs
# Jobs currently running: 1
# CPU time in finished jobs:      89775s    1496.25m    24.94h    1.04d  0.003 y
# IO & Wait Time:                   499s       8.31m     0.14h    0.01d  0.000 y
# Time in running jobs:           64864s    1081.07m    18.02h    0.75d  0.002 y
# Average job time:                 547s       9.12m     0.15h    0.01d
# Longest running job:            64864s    1081.07m    18.02h    0.75d
# Longest finished job:           46144s     769.07m    12.82h    0.53d
# Submission to last job:         46151s     769.18m    12.82h    0.53d

    # determine which are the last jobs as individual bits:
    para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \
      > not.done.list
    awk '{print $NF}' not.done.list | sed -e 's/.bed//' | while read F
do
    cat $F
done > seq.specs.not.done

    mkdir /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/lastJobs
    cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/lastJobs
    mkdir fasta
    for seqSpec in `cat ../seq.specs.not.done`
do
  fName=`echo $seqSpec | sed -e 's/.*://'`
  echo $fName
  twoBitToFa $seqSpec fasta/$fName.fa
done
  ls -1S `pwd`/fasta > part.list
  printf "%s" '#LOOP
./runTrf {check in line+ fasta/$(path1)} {check out line bed/$(root1).bed}
#ENDLOOP
' > template

  cat << '_EOF_' > runTrf
#!/bin/bash
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/lastJobs
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
  # << happy emacs
  chmod +x runTrf
  mkdir bed

  # used parasol remove job to kill the old unfinished job
  # now to start the new batch that replaces it
  gensub2 part.list single template jobList
  para create jobList
  para push

  # Once again, not everything finished:
  # Completed: 42 of 44 jobs
  # Jobs currently running: 2
  # CPU time in finished jobs:      56289s     938.15m    15.64h    0.65d 0.002 y
  # IO & Wait Time:                   191s       3.18m     0.05h    0.00d 0.000 y
  # Time in running jobs:          253691s    4228.18m    70.47h    2.94d 0.008 y
  # Average job time:                1345s      22.41m     0.37h    0.02d
  # Longest running job:           126853s    2114.22m    35.24h    1.47d
  # Longest finished job:           56355s     939.25m    15.65h    0.65d
  # Submission to last job:         56366s     939.43m    15.66h    0.65d

  # Assembling the data that did finish:
  liftUp result.bed ../../splitGap/noGap.lift error bed/*.bed
  # find jobs not done
  para status | grep -v done | grep runTrf | awk '{print $(NF-2),$(NF-1),$NF}' \
    > not.done.list

  # splitting up those last jobs - note, this 1M_10K split was abandoned.
  # Jump down to the 100K_10K split for the next step.
  mkdir /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/splitBits
  cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/splitBits
  mkdir noGap
  awk '{print $2}' ../lastJobs/not.done.list | while read F
do
  cp -p ../lastJobs/$F ./noGap/
done

  # split into 1,000,000 chunks with 10,000 overlap:
  mkdir -p 1M_10K

  for F in noGap/*.fa
  do
    B=`basename $F .fa`
    echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/$B_"
    faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_
  done

  ls -1S `pwd`/1M_10K/*.fa > part.list

  cat << '_EOF_' > runTrf
#!/bin/bash
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/splitBits
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
 # << happy emacs

  chmod +x runTrf
  mkdir -p bed

  cat << '_EOF_' > template
#LOOP
./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
#ENDLOOP
'_EOF_'
 # << happy emacs

  gensub2 part.list single template jobList
  para create jobList
  para push
  # Not all of those jobs finished either.  Actually, none of them did.
  # Completed: 0 of 10 jobs
  # Jobs currently running: 10
  # CPU time in finished jobs:          0s       0.00m     0.00h    0.00d 0.000 y
  # IO & Wait Time:                     0s       0.00m     0.00h    0.00d 0.000 y
  # Time in running jobs:          743780s   12396.33m   206.61h    8.61d 0.024 y

  # Remove that batch, start again with a finer split
  para stop

  mkdir /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/fineSplitBits
  cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/fineSplitBits
  mkdir noGap

  awk '{print $2}' ../lastJobs/not.done.list | while read F
do
  cp -p ../lastJobs/$F ./noGap/
done

  # split into 100K chunks with 10K overlap:
  mkdir -p 100K_10K

  for F in noGap/*.fa
  do
    B=`basename $F .fa`
    echo "faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/$B_"
    faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_
  done

  ls -1S `pwd`/100K_10K/*.fa > part.list

  cat << '_EOF_' > runTrf
#!/bin/bash
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M/fineSplitBits
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
 # << happy emacs

  chmod +x runTrf
  mkdir -p bed

  cat << '_EOF_' > template
#LOOP
./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
#ENDLOOP
'_EOF_'
 # << happy emacs

  gensub2 part.list single template jobList
  para create jobList
  para push
  # This time, they all finished.
  # Completed: 100 of 100 jobs
  # CPU time in finished jobs:      95589s    1593.14m    26.55h    1.11d  0.003 y
  # IO & Wait Time:                   263s       4.39m     0.07h    0.00d  0.000 y
  # Average job time:                 959s      15.98m     0.27h    0.01d
  # Longest finished job:            2523s      42.05m     0.70h    0.03d
  # Submission to last job:          2526s      42.10m     0.70h    0.03d

  cat *.lift | liftUp 100Kparts.bed stdin error bed/*.bed
  liftUp -type=.bed stdout ../../splitGap/noGap.lift error 100Kparts.bed | sort -u \
    | sort -k1,1 -k2,2n > gorGor4.result.bed

  # Putting it all together:
  cd /hive/data/genomes/gorGor4/bed/simpleRepeat.2015-11-5/run20M
  cat /hive/data/genomes/gorGor4/TrfPart20M/???/*.bed lastJobs/bed/*.bed \
    fineSplitBits/100Kparts.bed > beforeLift.simpleRepeat.bed
  liftUp -type=.bed stdout ../splitGap/noGap.lift error \
    beforeLift.simpleRepeat.bed | sort -u \
    | sort -k1,1 -k2,2n > simpleRepeat.bed

  awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

  hgLoadBed gorGor4 simpleRepeat simpleRepeat.bed \
    -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
  featureBits gorGor4 simpleRepeat > fb.simpleRepeat 2>&1
  cat fb.simpleRepeat
  # 166266267 bases of 2917333143 (5.699%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/gorGor4
    twoBitMask gorGor4.rmsk.2bit \
        -add bed/simpleRepeat.2015-11-5/run20M/trfMask.bed gorGor4.2bit
    #   you can safely ignore the warning about fields >= 13
    twoBitToFa gorGor4.2bit stdout | faSize stdin > faSize.gorGor4.2bit.txt
    cat faSize.gorGor4.2bit.txt
    # 3063310485 bases (146001289 N's 2917309196 real 1458434938 upper
    # 1458874258 lower) in 40692 sequences in 1 files
    # Total size: mean 75280.4 sd 3219233.8 min 100 (chrUn_CABD030167151v1)
    #    max 228908639 (chr1) median 2006
    # %47.62 masked total, %50.01 masked real

    rm /gbdb/gorGor4/gorGor4.2bit
    ln -s `pwd`/gorGor4.2bit /gbdb/gorGor4/gorGor4.2bit

##########################################################################
# CREATE MICROSAT TRACK (DONE - 2015-11-11 - Jonathan)
     ssh hgwdev
     mkdir /cluster/data/gorGor4/bed/microsat
     cd /cluster/data/gorGor4/bed/microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
	../simpleRepeat/simpleRepeat.bed > microsat.bed
    hgLoadBed gorGor4 microsat microsat.bed
    #	Read 27963 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2015-11-11 - Jonathan)

    mkdir /hive/data/genomes/gorGor4/bed/windowMasker
    cd /hive/data/genomes/gorGor4/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev gorGor4 > do.log 2>&1
    # real    245m41.678s

    # Masking statistics
    cat faSize.gorGor4.cleanWMSdust.txt
    # 3063310485 bases (146001289 N's 2917309196 real 1797939035 upper
    # 1119370161 lower) in 40692 sequences in 1 files
    # Total size: mean 75280.4 sd 3219233.8 min 100 (chrUn_CABD030167151v1)
    # max 228908639 (chr1) median 2006
    # %36.54 masked total, %38.37 masked real

    cat fb.gorGor4.rmsk.windowmaskerSdust.txt
    # 824968454 bases of 3063310485 (26.931%) in intersection

##########################################################################
# cpgIslands - (DONE - 2015-11-11 - Jonathan)
    mkdir /hive/data/genomes/gorGor4/bed/cpgIslands
    cd /hive/data/genomes/gorGor4/bed/cpgIslands
    time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku gorGor4 > do.log 2>&1
    # real    32m36.495s

    cat fb.gorGor4.cpgIslandExt.txt
    # 19071327 bases of 2917333143 (0.654%) in intersection

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2015-11-11 - Jonathan)
    mkdir /hive/data/genomes/gorGor4/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/gorGor4/bed/cpgIslandsUnmasked

    # run stepwise so the loading can be done in a different table
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/gorGor4/gorGor4.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku gorGor4) > do.log 2>&1
    # real    71m51.112s

    cat fb.gorGor4.cpgIslandExtUnmasked.txt
    # 25964207 bases of 2917333143 (0.890%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2015-10-19 - Jonathan)
    mkdir /hive/data/genomes/gorGor4/bed/cytoBand
    cd /hive/data/genomes/gorGor4/bed/cytoBand
    makeCytoBandIdeo.csh gorGor4

#########################################################################
# genscan - (DONE - 2015-11-18 - Jonathan)
    mkdir /hive/data/genomes/gorGor4/bed/genscan
    cd /hive/data/genomes/gorGor4/bed/genscan
    time doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku gorGor4 > do.log 2>&1
    # real    2252m4.489s
    # two jobs failed, running with reduced window size 2,000,000
# ./runGsBigLastTwo.csh chr19 000 gtf/000/chr19.gtf pep/000/chr19.pep subopt/000/chr19.bed &
# ./runGsBigLastTwo.csh chr22 000 gtf/000/chr22.gtf pep/000/chr22.pep subopt/000/chr22.bed
# wait

# chr22 finished, but chr19 didn't.  Dropping window size to 1.8M
# ./runGsBigLastOne.csh chr19 000 gtf/000/chr19.gtf pep/000/chr19.pep subopt/000/chr19.bed
# wait

    # continuing
    time doGenscan.pl -continue=makeBed  -buildDir=`pwd` -workhorse=hgwdev \
      -dbHost=hgwdev -bigClusterHub=ku gorGor4 > makeBed.log 2>&1
    # real    3m18.308s

    cat fb.gorGor4.genscan.txt
    # 42850913 bases of 2917333143 (1.469%) in intersection

    cat fb.gorGor4.genscanSubopt.txt
    # 42201199 bases of 2917333143 (1.447%) in intersection


#########################################################################
# genscan redo - (DONE - 2016-03-18 - Jonathan)
# Turns out some chromosomes somehow lacked annotation (notably, chr4).
# Re-doing from scratch with 1.8M windows
    mkdir /hive/data/genomes/gorGor4/bed/genscanTakeTwo
    cd /hive/data/genomes/gorGor4/bed/genscanTakeTwo
    cp `which doGenscan.pl` .
    # edit doGenscan.pl to change window size
    time ./doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku gorGor4 > do.log 2>&1

    # Everything finished except chr22 (out of memory error, suggesting
    # 8 GB is not enough?).  Re-running that one on dev.
    ./runGsBig.csh chr22 000 gtf/000/chr22.gtf pep/000/chr22.pep subopt/000/chr22.bed
    # wait
    # Crashed again.  Resized window to 2M from 1.8M.
    ./runGsBigLastOne.csh chr22 000 gtf/000/chr22.gtf pep/000/chr22.pep subopt/000/chr22.bed
    # Success!

    # continuing
    time doGenscan.pl -continue=makeBed  -buildDir=`pwd` -workhorse=hgwdev \
    -dbHost=hgwdev -bigClusterHub=ku gorGor4 > makeBed.log 2>&1

    # real    11m41.464s

    cat fb.gorGor4.genscan.txt
    # 51964982 bases of 2917333143 (1.781%) in intersection
    cat fb.gorGor4.genscanSubopt.txt
    # 53599149 bases of 2917333143 (1.837%) in intersection

########################################################################
# Create kluster run files (DONE - 2015-10-19 - Hiram/Jonathan)

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2917345092 / 2861349177 \) \* 1024
    # ( 2917345092 / 2861349177 ) * 1024 = 1044.039434

    # ==> use repMatch=1000 - it's nice and round
    cd /hive/data/genomes/gorGor4
    time blat gorGor4.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/gorGor4.11.ooc \
        -repMatch=1000
    # Wrote 32028 overused 11-mers to jkStuff/gorGor4.11.ooc
    # Done making jkStuff/gorGor4.11.ooc
    #
    # real  0m54.563s

    #   check non-bridged gaps to see what the typical size is:
    hgsql -N -e 'select * from gap where bridge="no" order by size;' \
       gorGor4 | ave -col=7 stdin
# Q1 2380000.000000
# median 2992365.000000
# Q3 3000000.000000
# average 2554669.052632
# min 99581.000000
# max 3000000.000000
# count 19
# total 48538712.000000
# standard deviation 787329.083452

    # they are all 100 bases
    gapToLift -verbose=2 -minGap=100 gorGor4 jkStuff/gorGor4.nonBridged.lft \
        -bedFile=jkStuff/gorGor4.nonBridged.bed

    cat jkStuff/gorGor4.nonBridged.bed | awk '{printf "%s\t%d\n", $4,$3-$2}' \
        | sort -k2nr > jkStuff/nonBridged.sizes
    n50.pl jkStuff/nonBridged.sizes
# reading: jkStuff/nonBridged.sizes
# contig count: 40711, total size: 3014771773, one half size: 1507385886
# cumulative  N50 count contig  contig size
1475129793  14  chr12 82873846
1507385886 one half size
1556356822  15  chr11 81227029

########################################################################
# GENBANK AUTO UPDATE (DONE - 2015-11-24 - Hiram/Jonathan)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
    # Gorilla gorilla 589 1 87
    # Gorilla gorilla gorilla 6 0 0

    # edit etc/genbank.conf to add gorGor4 just before gorGor3
gorGor4.serverGenome = /hive/data/genomes/gorGor4/gorGor4.2bit
gorGor4.clusterGenome = /hive/data/genomes/gorGor4/gorGor4.2bit
gorGor4.ooc = /hive/data/genomes/gorGor4/jkStuff/gorGor4.11.ooc
gorGor4.lift = /hive/data/genomes/gorGor4/jkStuff/gorGor4.nonBridged.lft
gorGor4.perChromTables = no
gorGor4.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
gorGor4.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
gorGor4.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
gorGor4.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
gorGor4.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
gorGor4.genbank.est.xeno.pslCDnaFilter    = ${finished.genbank.est.xeno.pslCDnaFilter}
gorGor4.downloadDir = gorGor4
# refseq.mrna native and xeno are default yes
# # genbank.mrna and genbank.est native are default yes, the xeno is default
# no
# # reset upstream* if and when a multiple alignment is constructed
# # gorGor4.upstreamGeneTbl = ensGene
# # gorGor4.upstreamMaf = multiz11way
# /hive/data/genomes/gorGor4/bed/multiz11way/species.list


    git commit -m "adding gorGor4 definitions refs #16220" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

# Edit src/lib/gbGenome.c to add new species.  Skipped

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial gorGor4
    # logFile: var/build/logs/2015.11.20-11:13:18.gorGor4.initalign.log
    #
    # real    1414m49.353s
    #
    #   If necessary to restart from beginning, rm the dir first:
    #     /cluster/data/genbank/work/initial.gorGor4

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad gorGor4
    # logFile: var/dbload/hgwdev/logs/2015.11.24-10:08:58.gorGor4.dbload.log
    # real    41m18.271s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add gorGor4 to:
    #   vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added gorGor4 - gorilla refs #16220" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
# construct liftOver to gorGor3 (DONE - 2015-11-16 - Jonathan)
    screen -S gorGor3	# manage this longish running job in a screen
    mkdir /hive/data/genomes/gorGor4/bed/blat.gorGor3.2015-11-16
    cd /hive/data/genomes/gorGor4/bed/blat.gorGor3.2015-11-16
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/gorGor4/jkStuff/gorGor4.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev gorGor4 gorGor3
    # real    0m1.373s
    # if that is OK, then run it:
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/gorGor4/jkStuff/gorGor4.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev gorGor4 gorGor3) > do.log 2>&1
    #	real    16m38.789s

# log crashed after connection to ku failed; resumed by hand after the
# jobs completed.
    para time > run.time
    time (doSameSpeciesLiftOver.pl -continue chain -buildDir=`pwd` \
  -bigClusterHub=ku -ooc=/hive/data/genomes/gorGor4/jkStuff/gorGor4.11.ooc \
  -dbHost=hgwdev -workhorse=hgwdev gorGor4 gorGor3) >> do.log 2>&1

    # verify this file exists:
    #	/gbdb/gorGor4/liftOver/gorGor4ToGorGor3.over.chain.gz
    # and try out the conversion on genome-test from gorGor4 to gorGor3


############################################################################
# construct liftOver from gorGor3 (DONE - 2015-11-20 - Jonathan)

    screen -S gorGor3To4	# manage this longish running job in a screen
    mkdir /hive/data/genomes/gorGor3/bed/blat.gorGor4.2015-11-19
    cd /hive/data/genomes/gorGor3/bed/blat.gorGor4.2015-11-19
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/gorGor3/jkStuff/gorGor3.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev gorGor3 gorGor4
    # real    0m1.613s
    # if that is OK, then run it:
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/gorGor3/jkStuff/gorGor3.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev gorGor3 gorGor4) > do.log 2>&1
    #	real    274m58.405s

    # verify this file exists:
    #	/gbdb/gorGor3/liftOver/gorGor3ToGorGor4.over.chain.gz
    # and try out the conversion on genome-test from gorGor4 to gorGor3

#########################################################################
# ucscToINSDC table/track (DONE - 2015-10-19 - Jonathan)

    mkdir /hive/data/genomes/gorGor4/bed/ucscToINSDC
    cd /hive/data/genomes/gorGor4/bed/ucscToINSDC
    # check for chrM in assembly:
    grep chrM ../../gorGor4.agp
# chrM    1       16372   1       W       CABD030129419.1 1       16372   +

    # that chrM accession name is from the non-nuclear AGP file, however
    # a better genbank/INSDC name would be: LN611626.1
    # use the accession name from there in this command (blank if none)
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
        ../../genbank/GCA_*assembly_structure/Primary_Assembly LN611626.1

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed
    # should all be the same line count:
    wc -l *
    # 40692 ucscToINSDC.bed
    # 40711 ucscToINSDC.txt
    # extra counts in ucscToINSDC.txt due to duplicate sequences in genbank
    # assembly

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 28
    # use the 28 in this sed
    sed -e "s/21/28/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab gorGor4 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords gorGor4
    # should cover %100 entirely:
    featureBits -countGaps gorGor4 ucscToINSDC
    # 3063310485 bases of 3063310485 (100.000%) in intersection

############################################################################
#  BLATSERVERS ENTRY (DONE - 2015-11-20 - Jonathan)
#	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev

     # verify doesn't exist yet:
     hgsql -e 'select * from blatServers;' hgcentraltest | grep -i gorgor
     # only shows gorGor3

     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("gorGor4", "blat1c", "17870", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("gorGor4", "blat1c", "17871", "0", "1");' \
		hgcentraltest

     # verify entry:
     hgsql -e 'select * from blatServers;' hgcentraltest | grep -i gorgor
     #    gorGor4 blat1c  17870 1 0
     #    gorGor4 blat1c  17871 0 1

     #	test it with some sequence

#########################################################################
# set default position (DONE - 2015-11-20 - Jonathan)

    # Maybe something around chr5:6,996,520-7,026,281, roughly the location
    # for EVPL - one of the most accelerated gorilla genes according to
    # the assembly paper (PMID 22398555).
    # gorGor3 didn't have a default - just jumped to the middle of chr1.
    hgsql -e \
'update dbDb set defaultPos="chr5:6996937-7026323" where name="gorGor4";' \
	hgcentraltest

#########################################################################
# Augustus genes - (DONE 2016-01-04 - Jonathan)

    mkdir /hive/data/genomes/gorGor4/bed/augustus
    cd /hive/data/genomes/gorGor4/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
        -species=human -dbHost=hgwdev \
           -workhorse=hgwdev gorGor4) > do.log 2>&1

    cat fb.gorGor4.augustusGene.txt
    # 64344606 bases of 2917333143 (2.206%) in intersection

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE 2016-01-08 - Jonathan)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=gorGor4 -keys all.joiner
    joinerCheck -database=gorGor4 -tableCoverage all.joiner
    joinerCheck -database=gorGor4 -times all.joiner

    cd /hive/data/genomes/gorGor4
    time makeDownloads.pl gorGor4 > downloads.log 2>&1
    #  real    19m56.404s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/gorGor4/pushQ
    cd /hive/data/genomes/gorGor4/pushQ
    # do not want transMap in the output:
    makePushQSql.pl gorGor4 2> stderr.out | grep -v transMap > gorGor4.pushQ.sql

    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/gorGor4/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/gorGor4/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/gorGor4/bbi/quality.bw
    # WARNING: gorGor4 does not have seq
    # WARNING: gorGor4 does not have extFile

    #   copy it to hgwbeta
    scp -p gorGor4.pushQ.sql qateam@hgwbeta:/tmp
    ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/gorGor4.pushQ.sql'
    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
# LIFTOVER TO gorGor6 (DONE - 2019-11-20 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/gorGor4/bed/blat.gorGor6.2019-11-20
    cd /hive/data/genomes/gorGor4/bed/blat.gorGor6.2019-11-20
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/gorGor4/jkStuff/gorGor4.11.ooc \
         gorGor4 gorGor6
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/gorGor4/jkStuff/gorGor4.11.ooc \
         gorGor4 gorGor6) > doLiftOverToGorGor6.log 2>&1
    # real    5784m52.206s
    # had some kind of trouble at the end of the alignment step which took
    # a really long time

    # continue with the chain step
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev  \
	-ooc=/hive/data/genomes/gorGor4/jkStuff/gorGor4.11.ooc \
	-continue=chain gorGor4 gorGor6) > chain.log 2>&1 &
    # real    8m38.022s

    # see if the liftOver menus function in the browser from gorGor4 to gorGor6

#########################################################################
