# for emacs: -*- mode: sh; -*-

# This file describes browser build for the felCat9

#########################################################################
# reuse photograph from felCat8 previous versions
#    (DONE - 2018-02-27 - Hiram)

mkdir /hive/data/genomes/felCat9
cd /hive/data/genomes/felCat9
cp -p ../felCat8/photoReference.txt .

cat photoReference.txt

photoCreditURL  http://www.genome.gov/pressDisplay.cfm?photoID=42
photoCreditName Dr. Kristina Narfstrom

#########################################################################
#  Initial steps (DONE - 2018-02-27 - Hiram)

# To start this initialBuild.txt document, from a previous assembly document:

mkdir ~/kent/src/hg/makeDb/doc/felCat9
cd ~/kent/src/hg/makeDb/doc/felCat9

# best to use a most recent document since it has the latest features and
# procedures:
sed -e 's/papAnu3/felCat9/g; s/PapAnu3/FelCat9/g; s/DONE/TBD/g;' ../papAnu3/initialBuild.txt > initialBuild.txt

mkdir /hive/data/genomes/felCat9/refseq
cd /hive/data/genomes/felCat9/refseq

time rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Felis_catus/all_assembly_versions/GCF_000181335.3_Felis_catus_9.0/ ./

# sent 3201 bytes  received 3658828880 bytes  19307821.01 bytes/sec
# total size is 3658370114  speedup is 1.00

# real    3m8.429s

# check assembly size for later reference:

faSize G*_9.0_genomic.fna.gz
# 2521863845 bases (45410675 N's 2476453170 real 1649339622 upper
#	827113548 lower) in 4508 sequences in 1 files
# Total size: mean 559419.7 sd 9108161.9 min 1969 (NW_019369707.1)
#	max 242100913 (NC_018723.3) median 9988
# %32.80 masked total, %33.40 masked real


# this information is from the top of
#    felCat9/refseq/GCF_000181335.3_Felis_catus_9.0_assembly_report.txt

# Assembly name:  Felis_catus_9.0
# Organism name:  Felis catus (domestic cat)
# Infraspecific name:  breed=Abyssinian
# Isolate:  Cinnamon
# Sex:  female
# Taxid:          9685
# BioSample:      SAMN02953640
# BioProject:     PRJNA16726
# Submitter:      Genome Sequencing Center (GSC) at Washington University (WashU) School of Medicine
# Date:           2017-11-20
# Assembly type:  haploid
# Release type:   major
# Assembly level: Chromosome
# Genome representation: full
# WGS project:    AANG04
# Assembly method: WTDBG v. 1.2, Chromonomer v. 1.0.7
# Genome coverage: 72x
# Sequencing technology: PacBio; 454 Titanium; Illumina; Sanger dideoxy sequencing
# RefSeq category: Representative Genome
# GenBank assembly accession: GCA_000181335.4
# RefSeq assembly accession: GCF_000181335.3
# RefSeq assembly and GenBank assemblies identical: no
#
## Assembly-Units:
## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
## GCA_000181345.4      GCF_000181345.3 Primary Assembly
##      GCF_000029335.1 non-nuclear

#############################################################################
# establish config.ra file (DONE - Hiram - 2018-02-27)
    # arguments here are: <db> <clade> <trackDbDir> <assembly_report.txt>
    cd /hive/data/genomes/felCat9
    $HOME/kent/src/hg/utils/automation/prepConfig.pl felCat9 mammal \
        cat ./refseq/*_assembly_report.txt > felCat9.config.ra

    # fixup common name to remain compatible with previous panAnu2 version
    # and therfore orderKey wasn't correct.  Also fixup shortLabel
    # to check orderKey:
    hgsql -e 'select name,orderKey,organism from dbDb order by orderKey;' \
        hgcentraltest | less

    # verify it looks sane
    # reset commonName Domestic cat to: commonName Cat
    # reset orderKey 4677 to: orderKey 3239
    cat felCat9.config.ra
# config parameters for makeGenomeDb.pl:
db felCat9
clade mammal
genomeCladePriority 35
scientificName Felis catus
commonName Cat
assemblyDate Nov. 2017
assemblyLabel Genome Sequencing Center (GSC) at Washington University (WashU) School of Medicine
assemblyShortLabel Felis_catus_9.0
orderKey 3239
# mitochondrial sequence included in refseq release
# mitoAcc NC_001700.1
mitoAcc none
fastaFiles /hive/data/genomes/felCat9/ucsc/*.fa.gz
agpFiles /hive/data/genomes/felCat9/ucsc/*.agp
# qualFiles none
dbDbSpeciesDir cat
photoCreditURL  http://www.genome.gov/pressDisplay.cfm?photoID=42
photoCreditName Dr. Kristina Narfstrom
ncbiGenomeId 78
ncbiAssemblyId 1448961
ncbiAssemblyName Felis_catus_9.0
ncbiBioProject 16726
ncbiBioSample SAMN02953640
genBankAccessionID GCF_000181335.3
taxId 9685

#############################################################################
# setup UCSC named files (DONE - 2018-02-27 - Hiram)

    mkdir /hive/data/genomes/felCat9/ucsc
    cd /hive/data/genomes/felCat9/ucsc

    # check for duplicate sequences:
    time faToTwoBit -noMask ../refseq/G*_9.0_genomic.fna.gz refseq.2bit
    #  real    1m3.113s

    twoBitDup refseq.2bit
    # no output is a good result, otherwise, would have to eliminate duplicates
    # the scripts creating the fasta here will be using this refseq.2bit file

    ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
       ../refseq/G*_9.0_genomic.fna.gz \
          ../refseq/G*_9.0_assembly_structure/Primary_Assembly
# NC_018723.3 chrA1
# NC_018724.3 chrA2
# NC_018725.3 chrA3
# NC_018726.3 chrB1
# NC_018727.3 chrB2
# NC_018728.3 chrB3
# NC_018729.3 chrB4
# NC_018730.3 chrC1
# NC_018731.3 chrC2
# NC_018732.3 chrD1
# NC_018733.3 chrD2
# NC_018734.3 chrD3
# NC_018735.3 chrD4
# NC_018736.3 chrE1
# NC_018737.3 chrE2
# NC_018738.3 chrE3
# NC_018739.3 chrF1
# NC_018740.3 chrF2
# NC_018741.3 chrX


    # unplaced sequences
    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
# processed 4142 sequences into chrUn.fa.gz
# real    0m30.235s

    # unlocalized sequences
    time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
# D2
# D3
# A1
# B4
# A2
# E1
# C1
# F2
# B2
# E3
# C2
# B1
# X
# D1
# D4
# A3
# B3
# E2
# F1
# processed 346 sequences into chr*_random.gz 19 files
# real    0m5.891s

    # bash syntax here
    mitoAcc=`grep "^# mitoAcc" ../felCat9.config.ra | awk '{print $NF}'`
    printf "# mitoAcc %s\n" "$mitoAcc"
# mitoAcc NC_001700.1

    zcat \
  ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \
     | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp

    printf ">chrM\n" > chrM.fa
    twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa
    gzip chrM.fa

    # verify fasta and AGPs agree
    time faToTwoBit *.fa.gz test.2bit
    # real    1m10.270s

    time cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4
    # All AGP and FASTA entries agree - both files are valid
    # real    0m9.332s

    # and no sequence lost from orginal:
    twoBitToFa test.2bit stdout | faSize stdin
# 2521863845 bases (45410675 N's 2476453170 real 2476453170 upper 0 lower)
#	in 4508 sequences in 1 files
# Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1)
#	max 242100913 (chrA1) median 9988

    # same numbers as above
# 2521863845 bases (45410675 N's 2476453170 real 1649339622 upper
#	827113548 lower) in 4508 sequences in 1 files
# Total size: mean 559419.7 sd 9108161.9 min 1969 (NW_019369707.1)
#	max 242100913 (NC_018723.3) median 9988

    # no longer need these temporary 2bit files
    rm refseq.2bit test.2bit

#############################################################################
#  Initial database build (DONE - 2018-02-27 - Hiram)

    cd /hive/data/genomes/felCat9
    # verify sequence and AGP are OK:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp felCat9.config.ra) > agp.log 2>&1
    # real    2m18.763s
    # verify there was no error in that step:
    tail agp.log
    #  *** All done!  (through the 'agp' step)

    # then finish it off:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db felCat9.config.ra) > db.log 2>&1
    # real    20m30.661s

    # check in the trackDb files created in TemporaryTrackDbCheckout/
    #    and add felCat9 to trackDb/makefile

    # temporary symlink until masked sequence is available
    cd /hive/data/genomes/felCat9
    ln -s `pwd`/felCat9.unmasked.2bit /gbdb/felCat9/felCat9.2bit

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2018-02-27 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/felCat9/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/felCat9/felCat9.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku felCat9) > do.log 2>&1
    # real    10m10.843s

    cat fb.felCat9.cpgIslandExtUnmasked.txt
    # 39762268 bases of 2893270787 (1.374%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2018-02-27 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/cytoBand
    cd /hive/data/genomes/felCat9/bed/cytoBand
    makeCytoBandIdeo.csh felCat9

#############################################################################
# gapOverlap (DONE - 2018-02-27 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/gapOverlap
    cd /hive/data/genomes/felCat9/bed/gapOverlap
    time (doGapOverlap.pl \
	-twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit felCat9 ) \
	> do.log 2>&1
    # real    0m54.051s

    # actually failed because there were none of these items found:
    # -rw-rw-r-- 1    0 Feb 27 15:25 felCat9.gapOverlap.bed

    time (doGapOverlap.pl -continue=cleanup \
	-twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit felCat9 ) \
	> cleanup.log 2>&1

#############################################################################
# tandemDups (DONE - 2018-02-27 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/tandemDups
    cd /hive/data/genomes/felCat9/bed/tandemDups
    time (~/kent/src/hg/utils/automation/doTandemDup.pl \
  -twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit felCat9) \
	> do.log 2>&1 &
    # real    122m33.710s
    # 53784656 bases of 2521863845 (2.133%) in intersection

    bigBedInfo felCat9.tandemDups.bb | sed -e 's/^/#  /;'
#  version: 4
#  fieldCount: 13
#  hasHeaderExtension: yes
#  isCompressed: yes
#  isSwapped: 0
#  extraIndexCount: 0
#  itemCount: 785,057
#  primaryDataSize: 19,206,976
#  primaryIndexSize: 96,344
#  zoomLevels: 9
#  chromCount: 1399
#  basesCovered: 1,394,080,174
#  meanDepth (of bases covered): 3.940832
#  minDepth: 1.000000
#  maxDepth: 5370.000000
#  std of depth: 17.389426

#############################################################################
# ucscToINSDC and ucscToRefSeq table/track (DONE - 2018-02-27 - Hiram)
    # the sequence here is working for a 'refseq' assembly
    # beware of a chrM situation may be specific depending upon what is
    # available in the assembly

    mkdir /hive/data/genomes/felCat9/bed/ucscToINSDC
    cd /hive/data/genomes/felCat9/bed/ucscToINSDC

    # find accession for chrM
    grep chrM ../../felCat9.agp
# chrM    1       17009   1       O       NC_001700.1     1       17009   +

    # find the genbank accession for NC_020006.2 at Entrez nucleotide
    # The NC_020006.2 name is the RefSeq name, the genbank name is: JX946196.2
    # the assembly_report does not have this AY name since the chrM sequence
    # is not in the genbank assembly:
    grep NC_001700.1 ../../refseq/GCF*_9.0_assembly_report.txt
# MT      assembled-molecule      MT      Mitochondrion   na      <>      NC_001700.1     non-nuclear     17009   chrM

    # lookup the genbank/INSDC accession name for chrM at:
    # https://www.ncbi.nlm.nih.gov/assembly/GCF_000181335.3

    # if there is a chrM, use its INSDC name as a second argument:
    # this is a RefSeq assembly, use the chrM refSeq name:
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
      ../../refseq/GCF_*structure/Primary_Assembly NC_001700.1

    # this is actually ucscToRefSeq since this is a RefSeq assembly
    sort -k2 ucscToINSDC.txt > ucscToRefSeq.txt
    rm -f ucscToINSDC.txt
    awk '{printf "%s\t%s\n", $2, $1}' ucscToRefSeq.txt \
       | sort > refSeqToUcsc.txt

    # chrM processing needs special help, fixup with the sed
    # extract the refseq vs. genbank names from the assembly_report
    # columns 5 and 7 are the INSDC and RefSeq names
    grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \
      | awk '{printf "%s\t%s\n", $2, $1}' | sed -e 's/na/U20753.1/' \
          | sort > refseq.insdc.txt

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > ucsc.coordinate.tab
    join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-4 \
           > ucscToRefSeq.bed

    join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 ucsc.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \
           > ucscToINSDC.bed

    # verify chrM is correct:
    grep chrM *.bed
    # ucscToINSDC.bed:chrM    0       17009   U20753.1
    # ucscToRefSeq.bed:chrM   0       17009   NC_001700.1

    # should be same line counts throughout:
    # in this case one is missing in the final result due to the duplicate
    # contig being removed
    wc -l *
    #	4508 refSeqToUcsc.txt
    #	4508 refseq.insdc.txt
    #	4508 ucsc.coordinate.tab
    #	4508 ucscToINSDC.bed
    #	4508 ucscToRefSeq.bed
    #	4508 ucscToRefSeq.txt

    export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    # 27
    # use the $chrSize in this sed
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab felCat9 ucscToINSDC stdin ucscToINSDC.bed
    # should be the same for ucscToRefSeq:
    export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    #  27
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab felCat9 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    # checkTableCoords should be silent
    checkTableCoords felCat9
    # each should cover %100 entirely:
    featureBits -countGaps felCat9 ucscToINSDC
    # 2521863845 bases of 2521863845 (100.000%) in intersection

    featureBits -countGaps felCat9 ucscToRefSeq
    # 2521863845 bases of 2521863845 (100.000%) in intersection

#########################################################################
# add chromAlias table (DONE - 2018-02-27 - Hiram)

    mkdir /hive/data/genomes/felCat9/bed/chromAlias
    cd /hive/data/genomes/felCat9/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' felCat9 \
        | sort -k1,1 > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' felCat9 \
        | sort -k1,1 > ucsc.genbank.tab

    ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16
    join -t$'\t' ../idKeys/felCat9.idKeys.txt \
        ../../ens95/ensFelCat9.idKeys.txt | cut -f2- \
          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
               | sort -k1,1 -k2,2n > ucscToEns.bed
    cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab
    wc -l *.tab
#   4508 ucsc.ensembl.tab
#   4508 ucsc.genbank.tab
#   4508 ucsc.refseq.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
	> felCat9.chromAlias.tab

for t in refseq genbank ensembl
do
  c0=`cat ucsc.$t.tab | wc -l`
  c1=`grep $t felCat9.chromAlias.tab | wc -l`
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  printf "# checking $t: $c0 =? $c1 $ok\n"
done
# checking refseq: 4508 =? 4508 OK
# checking genbank: 4508 =? 4508 OK
# checking ensembl: 4508 =? 4508 OK

    hgLoadSqlTab felCat9 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        felCat9.chromAlias.tab

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2018-02-27 - Hiram)
    cd ~/kent/src/hg/makeDb/trackDb/cat/felCat9

    # preview prefixes and suffixes:
    hgsql -N -e "select frag from gold;" felCat9 \
      | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c
#   4989 AANG.1
#     80 AC.1
#      1 AC.2
#      8 AY.1
#      1 NC_.1

    # implies a rule: '[AN][ACY][NG0-9_]+(\.[0-9]+)?'

    # verify this rule will find them all and eliminate them all:
    hgsql -N -e "select frag from gold;" felCat9 | wc -l
    # 5079

    hgsql -N -e "select frag from gold;" felCat9 \
       | egrep -e '[AN][ACY][NG0-9_]+(\.[0-9]+)?' | wc -l
    # 5079

    hgsql -N -e "select frag from gold;" felCat9 \
       | egrep -v -e '[AN][ACY][NG0-9_]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/chicken/felCat9/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AN][ACY][NG0-9_]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

##########################################################################
# running repeat masker (DONE - 2018-02-27 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/repeatMasker
    cd /hive/data/genomes/felCat9/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku felCat9) > do.log 2>&1 &
    # real    1035m8.587s
    # cluster difficulties during the cluster run, continuing:

    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -continue=cat -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku felCat9) > cat.log 2>&1 &
    # bad record in the felCat9.fa.out, remove it with grep:
# RepeatMasker bug?: Undefined id, line 3670031 of input:
#  631  27.0  1.1  1.1  chrD2     87783169 87783348 (2403312) +  L1ME2          LINE/L1               5991 6170    (9)
    mv felCat9.fa.out felCat9.fa.out.broken
    grep -v "631  27.0  1.1  1.1  chrD2     87783169 87783348" \
	felCat9.fa.out.broken > felCat9.fa.out
    # and reconstruct the sorted file:
    mv felCat9.sorted.fa.out felCat9.sorted.fa.out.broken
    head -3 felCat9.fa.out > felCat9.sorted.fa.out
    tail -n +4 felCat9.fa.out | sort -k5,5 -k6,6n >> felCat9.sorted.fa.out
    # and the nested repeats remains the same without the error:
    /cluster/bin/scripts/extractNestedRepeats.pl felCat9.fa.out \
	| sort -k1,1 -k2,2n > felCat9.nestedRepeats.bed
    # continuing:
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -continue=mask -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku felCat9) > mask.log 2>&1 &
    # real    22m39.249s

    cat faSize.rmsk.txt
# 2521863845 bases (45410675 N's 2476453170 real 1408738694 upper 1067714476
#	lower) in 4508 sequences in 1 files
# Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1)
#	max 242100913 (chrA1) median 9988
# %42.34 masked total, %43.11 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    # CC   RELEASE 20140131;                                            *

    time featureBits -countGaps felCat9 rmsk
    # 1067715717 bases of 2521863845 (42.338%) in intersection
    # real    0m36.348s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the faSize count above
    #   separates out the N's from the bases, it doesn't show lower case N's

    # faster way to get the same result on high contig count assemblies:
    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' felCat9 \
        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
    #	total 1067715717.000000
    #   real    0m36.899s

##########################################################################
# running simple repeat (DONE - 2018-02-27 - Hiram)

    mkdir /hive/data/genomes/felCat9/bed/simpleRepeat
    cd /hive/data/genomes/felCat9/bed/simpleRepeat
    # using trf409 5 here guessing smaller genome (human == 6)
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        -trf409 5 felCat9) > do.log 2>&1 &
    # real    43m56.043s

    cat fb.simpleRepeat
    # 163817450 bases of 2893270787 (5.662%) in intersection

    # adding this trfMask to the other masking
    cd /hive/data/genomes/felCat9

    # when using the Window Masker result:
#    twoBitMask bed/windowMasker/felCat9.cleanWMSdust.2bit \
#       -add bed/simpleRepeat/trfMask.bed  felCat9.2bit
    #   you can safely ignore the warning about fields >= 13

    # when using Rmsk results, add to rmsk after it is done:
    twoBitMask felCat9.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed felCat9.2bit
    #   you can safely ignore the warning about fields >= 13

    twoBitToFa felCat9.2bit stdout | faSize stdin > faSize.felCat9.2bit.txt
    cat faSize.felCat9.2bit.txt
# 2521863845 bases (45410675 N's 2476453170 real 1407138455 upper 1069314715
#	lower) in 4508 sequences in 1 files
# Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1)
#	max 242100913 (chrA1) median 9988
# %42.40 masked total, %43.18 masked real

    # reset the symlink
    rm /gbdb/felCat9/felCat9.2bit
    ln -s `pwd`/felCat9.2bit /gbdb/felCat9/felCat9.2bit

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2018-02-27 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/felCat9/bed/microsat
    cd /cluster/data/felCat9/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed felCat9 microsat microsat.bed
    # Read 105236 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2018-02-28 - Hiram)

    mkdir /hive/data/genomes/felCat9/bed/windowMasker
    cd /hive/data/genomes/felCat9/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev felCat9) > do.log 2>&1
    # real    237m57.966s
    # continuing after rmsk done
    featureBits -countGaps felCat9 rmsk windowmaskerSdust \
	> fb.felCat9.rmsk.windowmaskerSdust.txt 2>&1
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -continue=cleanup -dbHost=hgwdev felCat9) > cleanup.log 2>&1
    # real    1m52.250s

    # Masking statistics
    cat faSize.felCat9.cleanWMSdust.txt
# 2521863845 bases (45410675 N's 2476453170 real 1636004228 upper 840448942
#	lower) in 4508 sequences in 1 files
# Total size: mean 559419.7 sd 9108161.9 min 1969 (chrUn_NW_019369707v1)
#	max 242100913 (chrA1) median 9988
# %33.33 masked total, %33.94 masked real

    cat fb.felCat9.rmsk.windowmaskerSdust.txt
    # 588428058 bases of 2521863845 (23.333%) in intersection

##########################################################################
# run up idKeys files for ncbiRefSeq (DONE - 2018-02-27 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/idKeys
    cd /hive/data/genomes/felCat9/bed/idKeys

    time (doIdKeys.pl -twoBit=/hive/data/genomes/felCat9/felCat9.unmasked.2bit -buildDir=`pwd` felCat9) > do.log 2>&1 &
    # real    3m8.497s

    cat felCat9.keySignature.txt
    #  a563ebb4989e5d9dd8e1b57698664160

##########################################################################
# cpgIslands - (DONE - 2018-02-28 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/cpgIslands
    cd /hive/data/genomes/felCat9/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku felCat9) > do.log 2>&1 &
    # real    5m8.572s

    cat fb.felCat9.cpgIslandExt.txt
    # 49418190 bases of 2476453204 (1.996%) in intersection

##############################################################################
# genscan - (DONE - 2018-02-28 - Hiram)
    mkdir /hive/data/genomes/felCat9/bed/genscan
    cd /hive/data/genomes/felCat9/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku felCat9) > do.log 2>&1 &
    # real    867m26.572s

    # one failed job, run with 2,000,000 window::

time ./runGsBig2M.csh chrE1 000 gtf/000/chrE1.gtf pep/000/chrE1.pep subopt/000/chrE1.bed
    # real    290m18.460s

    # continuing:
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -continue=makeBed -bigClusterHub=ku felCat9) > makeBed.log 2>&1 &
    # real    1m55.321s

    cat fb.felCat9.genscan.txt
    # 60893812 bases of 2476453204 (2.459%) in intersection

    cat fb.felCat9.genscanSubopt.txt
    # 51934230 bases of 2476453204 (2.097%) in intersection

#############################################################################
# augustus gene track (DONE - 2018-02-28 - Hiram)

    mkdir /hive/data/genomes/felCat9/bed/augustus
    cd /hive/data/genomes/felCat9/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
     -species=human -dbHost=hgwdev -workhorse=hgwdev felCat9) > do.log 2>&1 &
    # real    187m51.379s

    cat fb.felCat9.augustusGene.txt
    # 51863692 bases of 2476453204 (2.094%) in intersection

#############################################################################
# lastz/chain/net swap human/hg38 (DONE - 2018-03-14 - Hiram)
    # original alignment
    cd /hive/data/genomes/hg38/bed/lastzFelCat9.2018-03-14

    cat fb.hg38.chainFelCat9Link.txt
    # 1579231929 bases of 3049335806 (51.789%) in intersection
    cat fb.hg38.chainSynFelCat9Link.txt
    # 1516804589 bases of 3049335806 (49.742%) in intersection
    cat fb.hg38.chainRBestFelCat9Link.txt
    # 1449222744 bases of 3049335806 (47.526%) in intersection

    # and for the swap:
    mkdir /hive/data/genomes/felCat9/bed/blastz.hg38.swap
    cd /hive/data/genomes/felCat9/bed/blastz.hg38.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/hg38/bed/lastzFelCat9.2018-03-14/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    142m37.100s

    cat fb.felCat9.chainHg38Link.txt
    # 1486134443 bases of 2476453204 (60.011%) in intersection
    cat fb.felCat9.chainSynHg38Link.txt
    # 1452577988 bases of 2476453204 (58.656%) in intersection

    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
	felCat9 hg38) > rbest.log 2>&1 &
    # real    623m28.676s

    cat fb.felCat9.chainRBestHg38Link.txt
    # 1449521349 bases of 2476453204 (58.532%) in intersection

#########################################################################
# lastz/chain/net swap mouse/mm10 (DONE - 2018-03-14 - Hiram)

    # alignment to mouse/mm10:
    cd /hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14

    cat fb.mm10.chainFelCat9Link.txt
    #   801023018 bases of 2652783500 (30.196%) in intersection
    cat fb.mm10.chainRBestFelCat9Link.txt
    # 761411281 bases of 2652783500 (28.702%) in intersection

    # and for the swap

    mkdir /hive/data/genomes/felCat9/bed/blastz.mm10.swap
    cd /hive/data/genomes/felCat9/bed/blastz.mm10.swap
    time (doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 &
    #	 real    70m51.860s

    cat fb.felCat9.chainMm10Link.txt
    #   779862191 bases of 2476453204 (31.491%) in intersection
    cat fb.felCat9.chainSynMm10Link.txt
    # 754481540 bases of 2476453204 (30.466%) in intersection

    time (doRecipBest.pl -load  felCat9 mm10 -buildDir=`pwd` \
	-workhorse=hgwdev) > rbest.log 2>&1 &
    # real    375m4.937s

    cat fb.felCat9.chainRBestMm10Link.txt
    # 760753851 bases of 2476453204 (30.719%) in intersection

##############################################################################
# Create kluster run files (DONE - 2018-02-28 - Hiram)

    cd /hive/data/genomes/felCat9
    # numerator is felCat9 gapless bases "real" as reported by:
    featureBits -noRandom -noHap felCat9 gap
    # 45323781 bases of 2414945138 (1.877%) in intersection
    #                   ^^^

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2414945138 / 2861349177 \) \* 1024
    #  ( 2414945138 / 2861349177 ) * 1024 = 864.243987

    # ==> use -repMatch=800 same as felCat8
    cd /hive/data/genomes/felCat9
    blat felCat9.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/felCat9.11.ooc \
        -repMatch=800
    #  Wrote 30315 overused 11-mers to jkStuff/felCat9.11.ooc
    # felCat8 at repMatch=800 was:
    #  Wrote 36446 overused 11-mers to jkStuff/felCat8.11.ooc

    #   check non-bridged gaps to see what the typical size is:
    hgsql -N \
        -e 'select * from gap where bridge="no" order by size;' felCat9 \
        | sort -k7,7nr | ave -col=7 stdin
    # these are all centromere's at 2 and 3 million:
# Q1 2000000.000000
# median 2000000.000000
# Q3 2000000.000000
# average 2052631.578947
# min 2000000.000000
# max 3000000.000000
# count 19
# total 39000000.000000
# standard deviation 229415.733871
# +----+-------+-----------+-----------+----+---+---------+------------+----+
# | 19 | chrA1 |  88880063 |  90880063 |  6 | N | 2000000 | centromere | no |
# |  9 | chrF2 |         0 |   2000000 |  1 | N | 2000000 | centromere | no |
# |  9 | chrF1 |         0 |   2000000 |  1 | N | 2000000 | centromere | no |
# |  1 | chrE2 |  23929417 |  25929417 | 22 | N | 2000000 | centromere | no |
# |  1 | chrE1 |  24862965 |  26862965 | 10 | N | 2000000 | centromere | no |
# | 12 | chrD4 |  31525997 |  33525997 | 18 | N | 2000000 | centromere | no |
# | 12 | chrD3 |  30780723 |  32780723 | 32 | N | 2000000 | centromere | no |
# | 11 | chrD2 |  19812854 |  21812854 | 14 | N | 2000000 | centromere | no |
# |  1 | chrD1 |  32184683 |  34184683 | 20 | N | 2000000 | centromere | no |
# |  2 | chrC2 |  75225443 |  77225443 | 10 | N | 2000000 | centromere | no |
# | 16 | chrA2 |  60080733 |  62080733 | 32 | N | 2000000 | centromere | no |
# | 15 | chrA3 |  50856976 |  52856976 | 18 | N | 2000000 | centromere | no |
# | 13 | chrB1 |  38294479 |  40294479 | 10 | N | 2000000 | centromere | no |
# | 12 | chrB2 |  29165855 |  31165855 |  8 | N | 2000000 | centromere | no |
# | 12 | chrB3 |  28663498 |  30663498 |  6 | N | 2000000 | centromere | no |
# | 14 | chrB4 |  43808484 |  45808484 | 12 | N | 2000000 | centromere | no |
# |  2 | chrC1 | 107907036 | 109907036 | 24 | N | 2000000 | centromere | no |
# | 15 |  chrX |  50521356 |  52521356 | 48 | N | 2000000 | centromere | no |
# |  1 | chrE3 |  16733304 |  19733304 |  8 | N | 3000000 | centromere | no |
# +----+-------+-----------+-----------+----+---+---------+------------+----+

    # therefore, minimum gap size of 2000000
  gapToLift -verbose=2 -minGap=2000000 felCat9 jkStuff/felCat9.nonBridged.lft \
       -bedFile=jkStuff/felCat9.nonBridged.bed

##############################################################################
# LIFTOVER TO felCat8 (DONE - 2018-02-28 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/felCat9/bed/blat.felCat8.2018-02-28
    cd /hive/data/genomes/felCat9/bed/blat.felCat8.2018-02-28
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         felCat9 felCat8) > do.log 2>&1
    # real    605m27.304s

    # verify the convert link on the test browser is now active from felCat9 to
    # felCat8

##############################################################################
# GENBANK AUTO UPDATE (DONE - 2018-02-28 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism      mrnaCnt   estCnt  refSeqCnt
    # Felis catus	2446	921	419

    # edit etc/genbank.conf to add felCat9 just before felCat8
# Nov 2017 (GSC) at Washington University (WashU) School of Medicine
#	Felis_catus_9.0
# felCat9 (Cat)
felCat9.serverGenome = /hive/data/genomes/felCat9/felCat9.2bit
felCat9.clusterGenome = /hive/data/genomes/felCat9/felCat9.2bit
felCat9.ooc = /hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc
felCat9.lift = /hive/data/genomes/felCat9/jkStuff/felCat9.nonBridged.lft
felCat9.perChromTables = no
felCat9.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
felCat9.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
felCat9.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
felCat9.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
felCat9.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
felCat9.genbank.est.xeno.pslCDnaFilter    = ${finished.genbank.est.xeno.pslCDnaFilter}
felCat9.genbank.mrna.xeno.load  = yes
felCat9.downloadDir = felCat9
felCat9.refseq.mrna.native.load  = yes
felCat9.refseq.mrna.xeno.load = yes
felCat9.refseq.mrna.xeno.loadDesc  = yes
# felCat9.upstreamGeneTbl = refGene
# felCat9.upstreamMaf = multiz6way
# /hive/data/genomes/felCat9/bed/multiz6way/species.list

    # add felCat9 to:
    #   etc/align.dbs etc/hgwdev.dbs
    git commit -m 'adding felCat9/cat refs #21043' \
	etc/genbank.conf etc/align.dbs etc/hgwdev.dbs
    git push
    # update /cluster/data/genbank/:
    make etc-update

    # a few days later the genbank tables will be in the database

#############################################################################
# ncbiRefSeq (DONE - 2018-03-13 - Hiram)

    mkdir /hive/data/genomes/felCat9/bed/ncbiRefSeq
    cd /hive/data/genomes/felCat9/bed/ncbiRefSeq

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Felis_catus \
      GCF_000181335.3_Felis_catus_9.0 felCat9) > do.log 2>&1
    # real    8m52.047s

    cat fb.ncbiRefSeq.felCat9.txt
    #  81659062 bases of 2476453204 (3.297%) in intersection

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2018-08-13 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("felCat9", "blat1c", "17900", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("felCat9", "blat1c", "17901", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## reset default position to same as what felCat8 has
##  (DONE - 2018-03-13 - Hiram)

    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrA2:53333898-53447862"
	where name="felCat9";' hgcentraltest

############################################################################
# LIFTOVER TO felCat8 (DONE - 2018-03-19 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/felCat9/bed/blat.felCat8.2018-03-19
    cd /hive/data/genomes/felCat9/bed/blat.felCat8.2018-03-19
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         felCat9 felCat8) > do.log 2>&1 &
    # real    192m10.105s

    # verify the convert link on the test browser is now active from felCat9 to
    # felCat8

############################################################################
# LIFTOVER TO felCat5 (DONE - 2018-03-19 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/felCat9/bed/blat.felCat5.2018-03-19
    cd /hive/data/genomes/felCat9/bed/blat.felCat5.2018-03-19
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/felCat9/jkStuff/felCat9.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         felCat9 felCat5) > do.log 2>&1 &
    # real    138m37.419s

    # verify the convert link on the test browser is now active from felCat9 to
    # felCat5

############################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2018-03-19 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    ~/kent/src/hg/utils/automation/verifyBrowser.pl felCat9
# 65 tables in database felCat9 - Cat, Felis catus
# verified 65 tables, 0 extra tables, 10 optional tables
# 13 genbank tables found
# verified 42 tables, 1 missing tables
# 1     gapOverlap
# liftOver to previous versions: 2, from previous versions: 2

    # there were no gapOverlap items found

    # fixup all.joiner until this is a clean output
    joinerCheck -database=felCat9 -tableCoverage all.joiner
    joinerCheck -database=felCat9 -times all.joiner
    joinerCheck -database=felCat9 -keys all.joiner

    cd /hive/data/genomes/felCat9
    rm -fr TemporaryTrackDbCheckout

    time (makeDownloads.pl -workhorse=hgwdev felCat9) > downloads.log 2>&1
    #  real    26m28.250s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/felCat9/pushQ
    cd /hive/data/genomes/felCat9/pushQ
  time (makePushQSql.pl -redmineList felCat9) > felCat9.pushQ.sql 2> stderr.out
    # remove the tandemDups from the file list:
    sed -i -e "/tandemDups/d" redmine.felCat9.table.list
    sed -i -e "/Tandem Dups/d" redmine.felCat9.releaseLog.txt

    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: felCat9 does not have seq
    # WARNING: felCat9 does not have extFile

    # add the path names to the listing files in the redmine issue
    # in the three appropriate entry boxes:

/hive/data/genomes/felCat9/pushQ/redmine.felCat9.file.list
/hive/data/genomes/felCat9/pushQ/redmine.felCat9.releaseLog.txt
/hive/data/genomes/felCat9/pushQ/redmine.felCat9.table.list

#########################################################################
