# for emacs: -*- mode: sh; -*-

#	http://www.ncbi.nlm.nih.gov/genome/82
#	http://www.ncbi.nlm.nih.gov/bioproject/12555
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAFC01
#   http://www.ncbi.nlm.nih.gov/genome/assembly/313728/

# file template copied from bosTau6.txt
# Assembly Info:
# DATE:	25-Oct-2011
# ORGANISM:	Bos taurus
# TAXID:	9913
# ASSEMBLY LONG NAME:	Btau_4.6.1
# ASSEMBLY SHORT NAME:	Btau_4.6.1
# ASSEMBLY SUBMITTER:	Bovine Genome Sequencing Consortium
# ASSEMBLY TYPE:	Haploid
# NUMBER OF ASSEMBLY-UNITS:	1
# ASSEMBLY ACCESSION:	GCA_000003205.4
#
# FTP-RELEASE DATE: 03-Nov-2011


# Bos taurus (NCBI Project ID: 12555, Accession: GCA_000003205.4)
#   by Bovine Genome Sequencing and Analysis Consortiumn
#    assembly] sequence:
# ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.6.1/
# Project 12555
# url:
# http://www.ncbi.nlm.nih.gov/nuccore/AAFC00000000
# chrM: AY526085
# http://www.ncbi.nlm.nih.gov/nuccore/AY526085.1

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAFC00
#  7X WGS coverage

##########################################################################
# Download sequence (DONE - 2011-05-02 Chin)
    mkdir /hive/data/genomes/bosTau7
    cd /hive/data/genomes/bosTau7
    mkdir genbank
    cd genbank
    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.6.1/*"
    # FINISHED --2011-12-16 15:36:00--
    # Downloaded: 244 files, 2.1G in 30m 24s (1.17 MB/s)
    # Read ASSEMBLY_INFO

    # measure total sequence in this assembly
    faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa \
        Primary_Assembly/unplaced_scaffolds/FASTA/un*.fa.gz
# 2981103241 bases (176446405 N's 2804656836 real 2804656836 upper
# 0 lower) in 11691 sequences in 32 files
# Total size: mean 254991.3 sd 4719241.6 min 152 (gi|346076751|gb|JH125284.1|)
# max 161428367 (gi|346683426|gb|CM000177.5|) median 8572
# N count: mean 15092.5 sd 280692.5
# U count: mean 239898.8 sd 4441763.7
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real


    mkdir ucscChr
    # stay at genbank directory
    #   fixup the accession names to become UCSC chrom names

export S=Primary_Assembly/assembled_chromosomes
cat  ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC
do
    C=`grep "${ACC}" ${S}/chr2acc | cut -f1`
    echo "${ACC} -> chr${C}"
    zcat ${S}/AGP/chr${C}.comp.agp.gz \
        | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz
done


export S=Primary_Assembly/assembled_chromosomes
cut -f2 ${S}/chr2acc  | while read ACC
cat  ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC
do
    C=`grep "${ACC}" ${S}/chr2acc | cut -f1`
    echo "${ACC} -> chr${C}"
    echo ">chr${C}" > ucscChr/chr${C}.fa
    zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa
    gzip ucscChr/chr${C}.fa &
done
   # Check them with faSize
    faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz
    # 2673141463 bases (159233025 N's 2513908438 real 2513908438 upper
    # 0 lower) in 31 sequences in 31 files

    faSize ucscChr/chr*.fa.gz
    #  2660906405 bases (20740230 N's 2640166175 real 2640166175 upper
    #  0 lower) in 30 sequences in 30 files

    # For unplaced scalfolds, named them as chrUn_xxxxxxxx
    #   where xxxxxx is the original access id as: chrUn_JH121286
    #   The ".1" at the end (if any) need to be filter out since
    #   MySQL does not allow "." as part of the table name and
    #   will casue problems in genbank task step later

export S=Primary_Assembly/unplaced_scaffolds
zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp
   # append the gap records
zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \
        | sed -e "s/^/chrUn_/" -e "s/\.1//"  >> ucscChr/chrUn.agp
gzip ucscChr/chrUn.agp &

zcat ${S}/FASTA/unplaced.scaf.fa.gz \
        | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##"  -e "s/\.1//"  \
        | gzip > ucscChr/chrUn.fa.gz
   # about 11660 sequences in the unplaced
zcat ucscChr/chrUn.fa.gz | grep "^>" | wc
   #     11660   11660   46640

   # Check them with faSize
   faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
   # 307961778 bases (17213380 N's 290748398 real 290748398 upper
   # 0 lower) in 11660 sequences in 1 files

   faSize ucscChr/chrUn.fa.gz
   # 307961778 bases (17213380 N's 290748398 real 290748398 upper
   # 0 lower) in 11660 sequences in 1 files

    # N50
    cd /hive/data/genomes/bosTau7/
    n50.pl chrom.sizes
    #   reading: chrom.sizes
    #   contig count: 11692, total size: 2981119579, one half size: 1490559789
    # cumulative    N50 count   contig  contig size
    # 1444820354  12  chrX    88654062
    # 1490559789 one half size
    # 1529939826  13  chr12   85119472

    cat << '_EOF_' > bosTau7.config.ra
# Config parameters for makeGenomeDb.pl:
db bosTau7
clade mammal
genomeCladePriority 31
scientificName Bos taurus
commonName Cow
assemblyDate OCt. 2011
assemblyLabel Baylor Btau_4.6.1
assemblyShortLabel Baylor Btau_4.6.1
orderKey 236
mitoAcc AY526085
fastaFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.fa.gz
agpFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.agp.gz
# qualFiles none
dbDbSpeciesDir cow
photoCreditURL http://www.genome.gov/10005141
photoCreditName NHGRI Press Photos
ncbiGenomeId 82
ncbiAssemblyId 313728
ncbiAssemblyName Baylor Btau_4.6.1
ncbiBioProject 12555
genBankAccessionID GCA_000003205.4
taxId 9913
'_EOF_'

#########################################################################
# Initial  database build (DONE - 2012-01-03 - Chin)
    cd /hive/data/genomes/bosTau7
    cat << '_EOF_' > bosTau7.config.ra
# Config parameters for makeGenomeDb.pl:
db bosTau7
clade mammal
genomeCladePriority 31
scientificName Bos taurus
commonName Cow
assemblyDate OCt. 2011
assemblyLabel Baylor Btau_4.6.1
assemblyShortLabel Btau_4.6.1
orderKey 236
mitoAcc AY526085
fastaFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.fa.gz
agpFiles /hive/data/genomes/bosTau7/genbank/ucscChr/chr*.agp.gz
# qualFiles none
dbDbSpeciesDir cow
photoCreditURL http://www.genome.gov/10005141
photoCreditName NHGRI Press Photos
ncbiGenomeId 82
ncbiAssemblyId 313728
ncbiAssemblyName Baylor Btau_4.6.1
ncbiBioProject 12555
genBankAccessionID GCA_000003205.4
taxId 9913
'_EOF_'
    # << happy emacs
    time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev bosTau7.config.ra \
	> makeGenomeDb.log 2>&1 &
    # real    23m34.828s

    #	add the trackDb entries to the source tree, and the 2bit link:
    ln -s `pwd`/bosTau7.unmasked.2bit /gbdb/bosTau7/bosTau7.2bit

    #  Per instructions in makeGenomeDb.log:
# - replace *** use actual url for project etc.. in all html files
#Search for '***' notes in each file in and make corrections (sometimes the
#files used for a previous assembly might make a better template):
#  description.html /cluster/data/bosTau7/html/{trackDb.ra,gap.html,gold.html}
#
#Then copy these files to your ~/kent/src/hg/makeDb/trackDb/cow/bosTau7
# - cd ~/kent/src/hg/makeDb/trackDb
# - edit makefile to add bosTau7 to DBS.
# - git add cow/bosTau7/*.{ra,html}
# - git commit -m "Added bosTau7 to DBS." makefile
# - git commit -m "Initial descriptions for bosTau7." cow/bosTau7/*.{ra,html}
# - git pull; git push
# - Run make update DBS=bosTau7 and make alpha when done.
# - (optional) Clean up /cluster/data/bosTau7/TemporaryTrackDbCheckout

    # re-generated description html using update makeGenomeDb.pl and config.ra
    cd /hive/data/genomes/bosTau7
    makeGenomeDb.pl -forceDescription -continue=trackDb bosTau7.config.ra

#########################################################################
# RepeatMasker (DONE 2012-01-04 - Chin)
    mkdir /hive/data/genomes/bosTau7/bed/repeatMasker
    cd /hive/data/genomes/bosTau7/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \
	-workhorse=hgwdev -bigClusterHub=swarm -noSplit bosTau7 > do.log 2>&1 &
    #   real    472m4.476s
    cat faSize.rmsk.txt
    # 2981119579 bases (176446405 N's 2804673174 real 1415188620 upper
    # 1389484554 lower) in 11692 sequences in 1 files
    # %46.61 masked total, %49.54 masked real

#########################################################################
# simpleRepeats (DONE - 2012-01-05 - Chin)
    mkdir /hive/data/genomes/bosTau7/bed/simpleRepeat
    cd /hive/data/genomes/bosTau7/bed/simpleRepeat

    time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev -bigClusterHub=swarm -smallClusterHub=memk \
        bosTau7 > do.log 2>&1 &
    #   real    39m2.413s

    cat fb.simpleRepeat
    # 40432561 bases of 2804801325 (1.442%) in intersection

    #	add to the repeatMasker
    cd /hive/data/genomes/bosTau7
    twoBitMask bosTau7.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau7.2bit
    #	safe to ignore warnings about >=13 fields
    twoBitToFa bosTau7.2bit stdout | faSize stdin > bosTau7.2bit.faSize.txt
    cat bosTau7.2bit.faSize.txt
    # 2981119579 bases (176446405 N's 2804673174 real 1414674956 upper
    # 1389998218 lower) in 11692 sequences in 1 files
    # %46.63 masked total, %49.56 masked real

    # double check with featureBits
    featureBits -countGaps bosTau7 gap
    # 176318254 bases of 2981119579 (5.914%) in intersection

    rm /gbdb/bosTau7/bosTau7.2bit
    ln -s `pwd`/bosTau7.2bit /gbdb/bosTau7/bosTau7.2bit

#########################################################################
# Marking *all* gaps - they are not all in the AGP file
#	(DONE - 2012-01-05 - Chin)
    mkdir /hive/data/genomes/bosTau7/bed/allGaps
    cd /hive/data/genomes/bosTau7/bed/allGaps

    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../bosTau7.unmasked.2bit > findMotif.txt 2>&1
    #   real    0m42.321s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits bosTau7 -not gap -bed=notGap.bed
    # 2804801325 bases of 2804801325 (100.000%) in intersection
    featureBits bosTau7 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 128151 bases of 2804801325 (0.005%) in intersection
    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" bosTau7 | sort -n | tail -1
    #	10242

    # use tcsh and ctrl-c to create the here doc
    cat << '_EOF_' > mkGap.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" bosTau7 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits bosTau7 other.bed
    # 128151 bases of 2804801325 (0.005%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad bosTau7 otherGap other.bed
    #  Loaded 12612  elements of size 8
    #	adding this many:
    wc -l bed.tab
    #   12612 bed.tab
    #	starting with this many
    hgsql -e "select count(*) from gap;" bosTau7
    #	69725
    hgsql bosTau7 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" bosTau7
    #	82337
    # == 69725 + 12612


##########################################################################
## WINDOWMASKER (DONE 2012-01-05 Chin)
    mkdir /hive/data/genomes/bosTau7/bed/windowMasker
    cd /hive/data/genomes/bosTau7/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev bosTau7 > do.log 2>&1 &
    #   real    191m59.940s

    # Masking statistics
    twoBitToFa bosTau7.wmsk.2bit stdout | faSize stdin
    # 2981119579 bases (176446405 N's 2804673174 real 1660551615 upper
    # 1144121559 lower) in 11692 sequences in 1 files
    # %38.38 masked total, %40.79 masked real

    twoBitToFa bosTau7.wmsk.sdust.2bit stdout | faSize stdin
    # 2981119579 bases (176446405 N's 2804673174 real 1645487915 upper
    # 1159185259 lower) in 11692 sequences in 1 files
    # %38.88 masked total, %41.33 masked real

    hgLoadBed bosTau7 windowmaskerSdust windowmasker.sdust.bed.gz
    # Reading windowmasker.sdust.bed.gz
    # Read 13173670 elements of size 3 from windowmasker.sdust.bed.gz
    # Sorted
    # Creating table definition for windowmaskerSdust
    # Saving bed.tab
    # Loading bosTau7

    featureBits -countGaps bosTau7 windowmaskerSdust
    # 1335621689 bases of 2981119579 (44.803%) in intersection

    #   eliminate the gaps from the masking
    featureBits bosTau7 -not gap -bed=notGap.bed
    # 2804673174 bases of 2804673174 (100.000%) in intersection
    time nice -n +19 featureBits bosTau7 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    # real    4m52.720s
    # 1159185259 bases of 2804673174 (41.330%) in intersection

    #   reload track to get it clean
    hgLoadBed bosTau7 windowmaskerSdust cleanWMask.bed.gz
    # Read 13181547 elements of size 4 from cleanWMask.bed.gz
    # Loading bosTau7

    time featureBits -countGaps bosTau7 windowmaskerSdust
    #   real    1m11.488s
    # 1159185259 bases of 2981119579 (38.884%) in intersection

    zcat cleanWMask.bed.gz \
        | twoBitMask ../../bosTau7.unmasked.2bit stdin \
            -type=.bed bosTau7.cleanWMSdust.2bit
    twoBitToFa bosTau7.cleanWMSdust.2bit stdout | faSize stdin \
        > bosTau7.cleanWMSdust.faSize.txt
    cat bosTau7.cleanWMSdust.faSize.txt
    # 2981119579 bases (176446405 N's 2804673174 real 1645487915 upper
    # 1159185259 lower) in 11692 sequences in 1 files
    # %38.88 masked total, %41.33 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps bosTau7 rmsk windowmaskerSdust
    # 934428494 bases of 2981119579 (31.345%) in intersection

########################################################################
# Create kluster run files (DONE - 2012-01-05 - Chin)
    # numerator is bosTau7 gapless bases "real" as reported by:
    featureBits -noRandom -noHap bosTau7 gap
    # 159233025 bases of 2513924776 (6.334%) in intersection

    # denominator is hg19 gapless bases as reported by:
    #	featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2513924776 / 2861349177 \) \* 1024
    ( 2513924776 / 2861349177 ) * 1024 = 899.666140

    # ==> use -repMatch=900 according to size scaled down from 1024 for human.
    #	and rounded down to nearest 50 (runup 899.666140 to 900 in this case)
    cd /hive/data/genomes/bosTau7
    blat bosTau7.2bit \
	 /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/bosTau7.11.ooc \
	-repMatch=900 &
    #	Wrote 37068 overused 11-mers to jkStuff/bosTau7.11.ooc
    mkdir /hive/data/staging/data/bosTau7
    cp -p bosTau7.2bit jkStuff/bosTau7.11.ooc /hive/data/staging/data/bosTau7
    cp -p chrom.sizes /hive/data/staging/data/bosTau7
    #	check non-bridged gaps to see what the typical size is:
    hgsql -N \
	-e 'select * from gap where bridge="no" order by size;' bosTau7 \
	| sort -k7,7nr
    #   most non-bridges gaps have size = 5000, follow pig's example (most
    #   100, but use 5000)
    #	decide on a minimum gap for this break, use either 100 or 5000 will
    #	generate 13387 liftOver rows, but if use 6000, only got 11703 rows.
    #   so use 100 here to get more liftOver row.
    gapToLift -verbose=2 -minGap=100 bosTau7 jkStuff/nonBridged.lft \
	-bedFile=jkStuff/nonBridged.bed
    cp -p jkStuff/nonBridged.lft \
	/hive/data/staging/data/bosTau7/bosTau7.nonBridged.lft
    # ask cluster-admin to copy (evry time if any file changed)
    #    /hive/data/staging/data/bosTau7 directory to
    #    /scratch/data/bosTau7 on cluster nodes
    # before porceed to genbank step

############################################################################
# set this as defaultDb (DONE - 2012-01-06 - Chin)
#   and make this the default genome for Cow

    hgsql -e 'update defaultDb set name="bosTau7" where name="bosTau6";' \
        hgcentraltest

########################################################################
# GENBANK AUTO UPDATE (DONE - 2012-01-09 - Chin)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
    # Bos taurus      18945   1559562 13153

    # edit etc/genbank.conf to add bosTau7 just before susScr2

# bosTau7 (Cow)
bosTau7.serverGenome = /hive/data/genomes/bosTau7/bosTau7.2bit
bosTau7.clusterGenome = /scratch/data/bosTau7/bosTau7.2bit
bosTau7.ooc = /scratch/data/bosTau7/bosTau7.11.ooc
bosTau7.lift = no
bosTau7.perChromTables = no
bosTau7.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
bosTau7.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
bosTau7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
bosTau7.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
bosTau7.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
bosTau7.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
bosTau7.downloadDir = bosTau7
bosTau7.refseq.mrna.native.load  = yes
bosTau7.refseq.mrna.xeno.load = yes
bosTau7.refseq.mrna.xeno.loadDesc  = yes
bosTau7.upstreamGeneTbl = refGene




    git add etc/genbank.conf
    git commit -m "Added bosTau7" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update


# Edit src/lib/gbGenome.c to add new species.  Skipped

    screen	#  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time nice -n +19 ./bin/gbAlignStep -initial bosTau7 &
    # logFile: var/build/logs/2012.01.11-09:14:36.bosTau7.initalign.log
    #   real     1258m43.818s
    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/data/aligned/genbank.187.0/bosTau7

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau7 &
    # logFile: var/dbload/hgwdev/logs/2012.01.12-10:33:25.dbload.log
    #   real    58m49.817s


    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add bosTau7 to:
    #   etc/align.dbs
    #   etc/hgwdev.dbs
    git add etc/align.dbs
    git add etc/hgwdev.dbs
    git commit -m "Added bosTau7 - Cow" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

##########################################################################
#  BLATSERVERS ENTRY (DONE 2012-01-13 - Chin)
#       After getting a blat server assigned by the Blat Server Gods,
#
#  bosTau7 (trans) on blatx port 17842
#  bosTau7 (untrans) on blatx port 17843

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("bosTau7", "blatx", "17842", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("bosTau7", "blatx", "17843", "0", "1");' \
            hgcentraltest
    #   test it with some sequence


#########################################################################
# reset default position (DONE - 2012-01-13  - Chin)
#       Reset default position to an area contains RHO and a number of
#       other genes
    hgsql -e \
'update dbDb set defaultPos="chr22:56,647,062-58,290,728" where name="bosTau7";' \
        hgcentraltest

############################################################################
# ctgPos2 track - showing clone sequence locations on chromosomes
#	(DONE 2012-01-17 - Chin)
# NOTE - create bosTau7 entry in all.joiner since this is a new species
    mkdir /hive/data/genomes/bosTau7/bed/ctgPos2
    cd /hive/data/genomes/bosTau7/bed/ctgPos2
    cat << '_EOF_' > agpToCtgPos2.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

if ($argc != 1) {
    printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n";
    exit 255;
}

my $agpFile = shift;

open (FH, "<$agpFile") or die "can not read $agpFile";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my @a = split('\s+', $line);
    next if ($a[4] =~ m/^U$/);
    next if ($a[4] =~ m/^N$/);
    my $chrSize = $a[2]-$a[1]+1;
    my $ctgSize = $a[7]-$a[6]+1;
    die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize);
    printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4];
}
close (FH);
'_EOF_'
    # << happy emacs

chmod 775 agpToCtgPos2.pl

export S=../../genbank/Primary_Assembly/assembled_chromosomes
cat  ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC
do
    C=`grep "${ACC}" ${S}/chr2acc | cut -f1`
    zcat ${S}/AGP/chr${C}.agp.gz \
        | sed -e "s/^${ACC}/chr${C}/"
done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab

    hgLoadSqlTab bosTau7 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
    # add the track ctgPos2 to src/hg/makeDb/trackDb/cow/bosTau7/trackDb.ra
    # at src/makeDb/trackdb do "make update DBS=bosTau7" or/and "make alpha"
    # based on result of
    # hgsql -N -e "select type from ctgPos2;" bosTau7 | sort | uniq
    # W
    # prepare the src/hg/makeDb/trackDb/cow/bosTau7/ctgPos2.html


############################################################################
# running cpgIsland business (DONE -2012-01-17 - Chin)
    mkdir /hive/data/genomes/bosTau7/bed/cpgIsland
    cd /hive/data/genomes/bosTau7/bed/cpgIsland
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    #   needed to fixup this source, adding include to readseq.c:
#include "string.h"
    #   and to cpg_lh.c:
#include "unistd.h"
#include "stdlib.h"
    # and fixing a declaration in cpg_lh.c
    sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
    mv tmp.c cpg_lh.c
    make
    cd ../../
    ln -s hg3rdParty/cpgIslands/cpglh.exe
    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../bosTau7.2bit:$C stdout \
        | maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    ssh swarm
    cd /hive/data/genomes/bosTau7/bed/cpgIsland
    mkdir results
    cut -f1 ../../chrom.sizes > chr.list
    cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out exists results/$(root1).cpg}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #   the faCount business is to make sure there is enough sequence to
    #   work with in the fasta.  cpglh.exe does not like files with too many
    #   N's - it gets stuck
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print  $2 - $7 }'`
if ( $C > 200 ) then
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
else
    touch $2
endif
'_EOF_'
    # << happy emacs


    chmod 775 runOne
    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check ... etc
    para time
    para problems
    para status
    # then, kick it with para push
    # check it with plb
    # when all are done, para time shows:
    # Checking finished jobs
    # Completed: 11692 of 11692 jobs
    # CPU time in finished jobs:    201s       3.35m     0.06h    0.00d  0.000 y
    # IO & Wait Time:             31839s     530.65m     8.84h    0.37d  0.001 y
    # Average job time:                   3s       0.05m     0.00h    0.00d
    # Longest finished job:              17s       0.28m     0.00h    0.00d
    # Submission to last job:           111s       1.85m     0.03h    0.00d


    # Transform cpglh output to bed +
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    ssh hgwdev
    cd /hive/data/genomes/bosTau7/bed/cpgIsland
    hgLoadBed bosTau7 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    # Reading cpgIsland.bed
    # Read 38217 elements of size 10 from cpgIsland.bed
    # Sorted
    # Creating table definition for cpgIslandExt
    # Saving bed.tab
    # Loading bosTau7
    #   cleanup
    rm -fr hardMaskedFa


###########################################################################
# ornAna1 Platypus LASTZ/CHAIN/NET (DONE - 2012-01-18 - Chin)
    screen # use screen to control this job
    mkdir /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18
    cd /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18

    cat << '_EOF_' > DEF
# Cow vs. Platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Cow bosTau7
SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ1_LEN=/cluster/data/bosTau7/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=0

# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
        -syntenicNet -noDbNameCheck \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -verbose=2 -chainLinearGap=loose > do.log 2>&1 &
    #   real    1780m16.857s

 *** All done !  Elapsed time: 1780m17s
 *** Make sure that goldenPath/bosTau7/vsOrnAna1/README.txt is accurate.
 *** Add {chain,net}OrnAna1 tracks to trackDb.ra if necessary.

    # filter with doRecipBest.pl
    time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        bosTau7 ornAna1 > rbest.log 2>&1
    # real    12m55.429s
    cat fb.bosTau7.chainOrnAna1Link.txt
    #   204121295 bases of 2804673174 (7.278%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastzOrnAna1.2012-01-18 lastz.ornAna1


    mkdir /cluster/data/ornAna1/bed/blastz.bosTau7.swap
    cd /cluster/data/ornAna1/bed/blastz.bosTau7.swap

    time nice -n +19 doBlastzChainNet.pl \
        /cluster/data/bosTau7/bed/blastzOrnAna1.2012-01-18/DEF \
        -chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
        -swap -syntenicNet \
          -chainLinearGap=loose -bigClusterHub=swarm > swap.log 2>&1 &
    #   real       58m6.038s
    cat fb.ornAna1.chainBosTau7Link.txt
    # 190094136 bases of 1842236818 (10.319%) in intersection

 *** All done !  Elapsed time: 58m6s
 *** Make sure that goldenPath/ornAna1/vsBosTau7/README.txt is accurate.
 *** Add {chain,net}BosTau7 tracks to trackDb.ra if necessary.

    cd /cluster/data/ornAna1/bed
    ln -s blastz.bosTau7.swap lastz.bosTau7

#####################################################################
# susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2012-01-20 - Chin)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20
    cd /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20

    cat << '_EOF_' > DEF
# Pig vs. Cow
BLASTZ_M=50

# TARGET: Cow BosTau7
SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Pig SusScr2
SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit
SEQ2_LEN=/scratch/data/susScr2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    1624m48.439s
    cat fb.bosTau7.chainSusScr2Link.txt
    # 1342940418 bases of 2804673174 (47.882%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s lastzSusScr2.2012-01-20 lastz.susScr2

    # then swap
    mkdir /hive/data/genomes/susScr2/bed/blastz.bosTau7.swap
    cd /hive/data/genomes/susScr2/bed/blastz.bosTau7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/bosTau7/bed/lastzSusScr2.2012-01-20/DEF \
        -swap -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #  real    209m45.776s

    cat fb.susScr2.chainBosTau7Link.txt
    # 1443560932 bases of 2231298548 (64.696%) in intersection


#########################################################################
#  SWAP mm9 LASTZ (DONE - 2012-01-22 - Chin)
    #	original alignment done at mm9.txt
    cd /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22

    cat fb.mm9.chainBosTau7Link.txt
    # 695010371 bases of 2620346127 (26.524%) in intersection
    # Create link
    cd /hive/data/genomes/mm9/bed
    ln -s  lastzBosTau7.2012-01-22 lastz.bosTau7

    #   and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.mm9.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm9/bed/lastzBosTau7.2012-01-22/DEF \
        -swap -syntenicNet  \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real      51m44.505s
    cat fb.bosTau7.chainMm9Link.txt
    # 711305079 bases of 2804673174 (25.361%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.mm9.swap lastz.mm9

############################################################################
#  SWAP hg19 LASTZ (DONE 2012-01-23 - Chin)
    #	original alignment
    cd /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23
    cat fb.hg19.chainBosTau7Link.txt
    # 1360887008 bases of 2897316137 (46.971%) in intersection

    #   and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.hg19.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzBosTau7.2012-01-23/DEF \
        -swap  -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real     95m9.611s
    cat fb.bosTau7.chainHg19Link.txt
    #   1388551419 bases of 2804673174 (49.508%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.hg19.swap lastz.hg19

############################################################################
#  SWAP canFam2 LASTZ (DONE 2012-01-24 - Chin)
    #   original alignment
    cd /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-2
    cat fb.canFam2.chainBosTau7Link.txt
    # 1381869475 bases of 2384996543 (57.940%) in intersection

    #   and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.canFam2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam2/bed/lastzBosTau7.2012-01-24/DEF \
        -swap -syntenicNet  \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real      86m11.258s
    cat fb.bosTau7.chainCanFam2Link.txt
    # 1458512906 bases of 2804673174 (52.003%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.canFam2.swap lastz.canFam2

############################################################################
#  SWAP rn4 LASTZ (DONE 2012-01-24 - Chin)
    cd /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24
    cat fb.rn4.chainBosTau7Link.txt
    # 656136973 bases of 2571531505 (25.515%) in intersection

    #   and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.rn4.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.rn4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/rn4/bed/blastzBosTau7.2012-01-24/DEF \
        -swap -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real     48m51.145

    cat fb.bosTau7.chainRn4Link.txt
    # 668440773 bases of 2804673174 (23.833%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.rn4.swap lastz.rn4

############################################################################
#  SWAP monDom5 LASTZ (DONE 2012-01-26 - Chin)
    cd /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26
    cat fb.monDom5.chainBosTau7Link.txt
    # 206928725 bases of 3501660299 (5.909%) in intersection

    # and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.monDom5.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/monDom5/bed/lastzBosTau7.2012-01-26/DEF \
        -swap -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #   real    59m33.272s

    cat fb.bosTau7.chainMonDom5Link.txt
    # 207518037 bases of 2804673174 (7.399%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.monDom5.swap lastz.monDom5

#########################################################################
# GENSCAN GENE PREDICTIONS (DONE  2012-01-31 - Chin)
    mkdir /hive/data/genomes/bosTau7/bed/genscan
    cd /hive/data/genomes/bosTau7/bed/genscan

    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/genscanlinux

    # create hard masked .fa files
    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../bosTau7.2bit:$C stdout \
        | maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    # Generate a list file, genome.list, of all the hard-masked contig
    # chunks:
    find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list

    wc -l genome.list
    #   11692 genome.list

    # Run on small cluster (more mem than big cluster).
    ssh swarm
    cd /hive/data/genomes/bosTau7/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Create template file, template, for gensub2.  For example (3-line
    # file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 genome.list single template jobList
    para create jobList
    para try
    para check ... etc...
    para time
    # Completed: 11690 of 11692 jobs
    # Crashed: 2 jobs
    # CPU time in finished jobs:  66494s    1108.23m    18.47h    0.77d  0.002 y
    # IO & Wait Time:             33892s     564.87m     9.41h    0.39d  0.001 y
    # Average job time:               9s       0.14m     0.00h    0.00d
    # Longest finished job:        3664s      61.07m     1.02h    0.04d
    # Submission to last job:      4007s      66.78m     1.11h    0.05d
    # Two jobs (chr19 and chr26) failed, re-run them:
    /cluster/bin/x86_64/gsBig hardMaskedFa/chr26.fa gtf/chr26.gtf -trans=pep/chr26.pep -subopt=subopt/chr26.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=200000

   /cluster/bin/x86_64/gsBig hardMaskedFa/chr19.fa gtf/chr19.gtf -trans=pep/chr19.pep -subopt=subopt/chr19.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2200000

    # this did not work, runs out of file handles ?
    find ./gtf -type f | xargs -n 256 endsInLf -zeroOk

    # Concatenate results:
    cd /hive/data/genomes/bosTau7/bed/genscan
    find ./gtf -type f | xargs cat > genscan.gtf
    find ./pep -type f | xargs cat > genscan.pep
    find ./subopt -type f | xargs cat > genscanSubopt.bed

    # Load into the database (without -genePredExt because no frame
    # info):
    # Don't load the Pep anymore -- redundant since it's from genomic.
    ssh hgwdev
    cd /hive/data/genomes/bosTau7/bed/genscan
    # to construct a local file with the genePred business:
    gtfToGenePred genscan.gtf genscan.gp
    # this produces exactly the same thing and loads the table:
    ldHgGene -gtf bosTau7 genscan genscan.gtf
    # Reading genscan.gtf
    # Read 47871 transcripts in 344386 lines in 1 files
    #  47871 groups 4673 seqs 1 sources 1 feature types
    # 47871 gene predictions
    hgLoadBed bosTau7 genscanSubopt genscanSubopt.bed
    # Read 518687 elements of size 6 from genscanSubopt.bed
    featureBits bosTau7 genscan
    # 58056267 bases of 2804673174 (2.070%) in intersection	
    # previously:
    featureBits bosTau4 genscan
    #   58224594 bases of 2731830700 (2.131%) in intersection
    featureBits bosTau3 genscan
    #   59251085 bases of 2731807384 (2.169%) in intersection

#############################################################################
# LIFTOVER TO bosTau4 (DONE - 2012-05-11 - Chin)
    ssh hgwdev
    cd /hive/data/genomes/bosTau7
    ln -s jkStuff/bosTau7.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         bosTau7 bosTau4 > doLiftOverToBosTau4.log 2>&1 &
    # real    1567m22.050s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/
    md5sum bosTau7ToBosTau4.over.chain.gz >> md5sum.txt
    popd

#############################################################################
# LIFTOVER TO bosTau5 (DONE - 2012-05-11 - Chin)
    ssh hgwdev
    cd /hive/data/genomes/bosTau7
    ln -s jkStuff/bosTau7.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         bosTau7 bosTau5 > doLiftOverToBosTau5.log 2>&1 &
    # real    1570m4.968s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/
    md5sum bosTau7ToBosTau5.over.chain.gz >> md5sum.txt
    popd

#############################################################################
# LIFTOVER TO bosTau6 (DONE - 2012-05-11 - Chin)
    ssh hgwdev
    cd /hive/data/genomes/bosTau7
    ln -s jkStuff/bosTau7.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         bosTau7 bosTau6 > doLiftOverToBosTau6.log 2>&1 &
    # real    1570m6.579s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/
    md5sum bosTau7ToBosTau6.over.chain.gz >> md5sum.txt
    popd

#############################################################################
# LIFTOVER TO bosTau8 (DONE - 2014-10-17 - Steve)
    ssh hgwdev
    cd /hive/data/genomes/bosTau7
    ln -s jkStuff/bosTau7.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau7 bosTau8 > doLiftOverToBosTau8.log 2>&1
    # real    85m39.000s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau7/liftOver/
    md5sum bosTau7ToBosTau8.over.chain.gz >> md5sum.txt
    popd

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE 2012-05-16 - Chin)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=bosTau7 -all all.joiner

    mkdir /hive/data/genomes/bosTau7/goldenPath
    cd /hive/data/genomes/bosTau7/goldenPath
    makeDownloads.pl bosTau7 > do.log 2>&1

    #	now ready for pushQ entry
    mkdir /hive/data/genomes/bosTau7/pushQ
    cd /hive/data/genomes/bosTau7/pushQ
    makePushQSql.pl bosTau7 > bosTau7.pushQ.sql 2> stderr.out
    #	check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/bosTau7/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/bosTau7/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/bosTau7/bbi/quality.bw
    # WARNING: bosTau7 does not have seq
    # WARNING: bosTau7 does not have extFile
    #
    # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
    # supporting and genbank tables) which tracks to assign these tables to:
    #   tableList

    #	copy it to hgwbeta
    scp -p bosTau7.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < bosTau7.pushQ.sql
    #	in that pushQ entry walk through each entry and see if the
    #	sizes will set properly

#########################################################################
#  SWAP mm10 LASTZ (DONE - 2012-06-19 - Chin)
    #   original alignment done at mm10.txt
    cd /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21
    cat fb.mm10.chainBosTau7Link.txt
    # 696498363 bases of 2652783500 (26.255%) in intersection

    # Create link
    cd /hive/data/genomes/mm10/bed
    ln -s lastzBosTau7.2012-03-21 lastz.bosTau7

    #   and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.mm10.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    77m58.759s
    cat fb.bosTau7.chainMm10Link.txt
    #   711923052 bases of 2804673174 (25.383%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# LASTZ alpaca vicPac1 (DONE - 2012-06-19 - Chin)
    # establish a screen to control this job with a name to indicate
    # what it is
    screen -S bosTau7VicPac1
    mkdir /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19
    cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #   number of jobs, 50,000 to something under 100,000
    cat << '_EOF_' > DEF
# alpaca vs cow
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: cow BosTau7
SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: alpaca VicPac1
SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit
SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=4000

BASE=/hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    3089m45.519s
    cat fb.bosTau7.chainVicPac1Link.txt
    #   1259182233 bases of 2804673174 (44.896%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/bosTau7/bed
    ln -s lastzVicPac1.2012-06-19 lastz.vicPac1

    # better to have reciprocal best for this one since it is low
    # coverage:
    cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19
    time doRecipBest.pl bosTau7 vicPac1 -buildDir=`pwd` -workhorse=hgwdev \
    > best.log 2>&1 &
    #   real    110m1.192s

    # swap
    mkdir /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap
    cd /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    717m22.295s
    cat fb.vicPac1.chainBosTau7Link.txt
    #  1309266730 bases of 1922910435 (68.088%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/vicPac1/bed
    ln -s blastz.bosTau7.swap lastz.bosTau7

#########################################################################
#  SWAP canFam3 LASTZ (DONE - 2012-06-23 - Chin)
    #   original alignment done at canFam3.txt
    cd /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23

    cat fb.canFam3.chainBosTau7Link.txt
    # 1381966556 bases of 2392715236 (57.757%) in intersection
    # Create link
    cd /hive/data/genomes/canFam3/bed
    ln -s  lastzBosTau7.2012-06-23 lastz.bosTau7

    #   and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23/DEF \
        -swap -syntenicNet  \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real      121m44.022s
    cat fb.bosTau7.chainCanFam3Link.txt
    # 1456104306 bases of 2804673174 (51.917%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.canFam3.swap lastz.canFam3

##############################################################################
# LASTZ dolphin turTru2 (DONE 2012-06-19 - Chin)
    # establish a screen to control this job with a name to indicate
    # what it is
    screen -S bosTau7TurTru2
    mkdir /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19
    cd /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #   number of jobs, 50,000 to something under 100,000
    cat << '_EOF_' > DEF
# dolphin vs cow
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Cow BosTau7
SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: dolphin TurTru2
SEQ2_DIR=/hive/data/genomes/turTru2/turTru2.2bit
SEQ2_LEN=/hive/data/genomes/turTru2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=5000

BASE=/hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    7776m13.787s
    cat fb.bosTau7.chainTurTru2Link.txt
    #   1696154184 bases of 2804673174 (60.476%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/bosTau7/bed
    ln -s lastzTurTru2.2012-06-19 lastz.turTru2

    # better to have reciprocal best for this one since it is low
    # coverage:
    cd /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19
    time doRecipBest.pl bosTau7 turTru2 -buildDir=`pwd` -workhorse=hgwdev \
    > best.log 2>&1 &
    #   65m6.795s

    # then swap
    mkdir /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap
    cd /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    532m53.448s
    cat fb.turTru2.chainBosTau7Link.txt
    #   1625706583 bases of 2332402443 (69.701%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/turTru2/bed
    ln -s blastz.bosTau7.swap lastz.bosTau7

############################################################################
# lastz White Rhino cerSim1 (DONE - 2012-10-24 - Hiram)
    screen -S bosTau7CerSim1
    mkdir /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24
    cd /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24

    cat << '_EOF_' > DEF
# Cow vs White Rhino
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_M=50

# TARGET: Cow BosTau7
SEQ1_DIR=/scratch/data/bosTau7/bosTau7.2bit
SEQ1_LEN=/scratch/data/bosTau7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LIMIT=50
SEQ1_LAP=10000

# QUERY: White Rhino CerSim1
SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit
SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl \
	`pwd`/DEF \
	-workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
	-qRepeats=windowmaskerSdust \
	-bigClusterHub=swarm -verbose=2 > do.log 2>&1 &
    #   real    7139m16.881s

    cat fb.bosTau7.chainCerSim1Link.txt
    #   1628320914 bases of 2804673174 (58.057%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/bosTau7/bed
    ln -s lastzCerSim1.2012-10-24 lastz.cerSim1

    # and the swap
    mkdir /hive/data/genomes/cerSim1/bed/blastz.bosTau7.swap
    cd /hive/data/genomes/cerSim1/bed/blastz.bosTau7.swap
    time nice -n +19 doBlastzChainNet.pl \
        /hive/data/genomes/bosTau7/bed/lastzCerSim1.2012-10-24/DEF \
	-workhorse=hgwdev -chainMinScore=3000 -chainLinearGap=medium \
	-swap -qRepeats=windowmaskerSdust \
	-bigClusterHub=swarm -verbose=2 > swap.log 2>&1 &
    #   real    120m35.871s

    cat fb.cerSim1.chainBosTau7Link.txt
    #   1592093865 bases of 2366858012 (67.266%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/cerSim1/bed
    ln -s blastz.bosTau7.swap lastz.bosTau7

##############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-16 - Hiram)
    mkdir /hive/data/genomes/bosTau7/bed/ucscToINSDC
    cd /hive/data/genomes/bosTau7/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    # verify the track link to INSDC functions

##############################################################################
# Farm Animal QTL track (IN PROGRESS - 2014-01-07 - Pauline)
    mkdir /hive/data/genomes/bosTau7/bed/animalQtl
    cd /hive/data/genomes/bosTau7/bed/animalQtl

    wget http://www.animalgenome.org/QTLdb/tmp/QTL_Btau_4.6.ubed.txt.gz

    zcat QTL_Btau_4.6.ubed.txt.gz >> bosTau7.animalQtl.bed
    cut -f1-4 bosTau7.animalQtl.bed > bosTau7.animalQtl.bed4
    hgLoadBed bosTau7 animalQtl bosTau7.animalQtl.bed4

    checkTableCoords bosTau7 animalQtl
    # no results - yay!
    #added entries in trackDb for track and search 

    #script to determine what to include in search rule:

cat > findNames <<EOF 
#!/bin/sh
#hgsql -N -e 'select name from animalQtl;' bosTau7 | sort >
#/tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' bosTau7  > /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' galGal4  >> /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' susScr3  >> /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' oviAri3  >> /tmp/bosTau7.animalQtl.name.txt
export maxLen=`awk '{print length($0)}' /tmp/bosTau7.animalQtl.name.txt | sort -rn | head -1`
echo "scan to column: $maxLen"

export C=1
while [ $C -le $maxLen ];
do
echo -n " $C: "
sort -u /tmp/bosTau7.animalQtl.name.txt | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g'
C=`echo $C | awk '{print $1+1}'`
done

#rm -f /tmp/bosTau7.animalQtl.name.txt
mv /tmp/bosTau7.animalQtl.name.txt .
EOF

#script output:
 1: 12346ABCDEFGHIJKLMNOPQRSTUVWYZcdfmpu
 2: -/0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYacdeimort
 3: %-01234568:ABCDEFGHIKLMNOPRSTUVWXYZ_abcdefnost
 4: %-/0123456789:ABCDEFGHIKLMNOPRSTUVWYZ_abdehilnqrt
 5: -0123456789:ABCDEFGHIJKLMNOPQRSTUVWYZ_abcdiklnorstw
 6: %-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_bcdhimnoprstw
 7: -0123456789:ABCDEFGHIKLMNOPRSTVW_abcent
 8: 0123456789:ABCDEFGHILMNPRSTV_abeflo
 9: 0123456789:BCEFMOPTW_adeklnt
 10: 0123456789:CHLcdfirt
 11: -0123456789:aln
 12: 0123456789git
 13: 0123456789:
 14: 0123456789
 15: 0123456789
 16: 0123456789
 17: 012346
 18: 023456

#rule constructed based on script output - set termregex to this in trackDb
#search stanza
^[a-zA-Z0-9]+[a-zA-Z0-9:/-]+[a-zA-Z0-9:/-_%]+

                                                                                                                      614,23        Bot
##############################################################################
# Farm Animal Cytoband track (IN PROGRESS - 2014-02-04 - Pauline)
    mkdir /hive/data/genomes/bosTau7/bed/cytoBand
    cd /hive/data/genomes/bosTau7/bed/cytoBand

    wget http://www.animalgenome.org/QTLdb/tmp/QTL_Btau_4.6.ubed.txt.gz
    zcat QTL_Btau_4.6.ubed.txt.gz >> bosTau7.animalQtl.bed
############################################################################
# DBSNP B138 / SNP138 (DONE 1/31/14 angie)
    # RedMine #12490
    screen
    mkdir -p /hive/data/outside/dbSNP/138/cow
    cd /hive/data/outside/dbSNP/138/cow
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (cow_9913 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_
    # -- use the first num for build setting in config.ra
    # The buildAssembly setting in config.ra is empty because dbSNP stopped including
    # that in file names.
    cat > config.ra <<EOF
db bosTau7
build 138
buildAssembly 
orgDir cow_9913
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
    # Some trial and error is always required to get the config.ra just right.
    # First stop:
# *** This release contains more than one assembly label.
# *** Please examine this list in case we need to exclude any of these:
#
#Bos_taurus_UMD_3.1
#Btau_4.6.1
# *** Add refAssemblyLabel to config.ra.  If keeping all labels, it will
# *** look like this:
#
#refAssemblyLabel Bos_taurus_UMD_3.1,Btau_4.6.1
#
# *** Edit out any of those that are not included in bosTau7 (e.g. Celera).
# *** Then restart this script with -continue=loadDbSnp .
    # bosTau7 is Btau_4.6.1 (UMD_3.1 is bosTau6):
    cat >> config.ra <<EOF
refAssemblyLabel Btau_4.6.1
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
    tail -f do.log
    # Next stop:
#*** 1 lines of b138_ContigInfo.bcp.gz contained contig names that
#*** could not be mapped to chrom.size via their GenBank contig mappings; see
#*** /hive/data/outside/dbSNP/138/cow/cantLiftUpSeqNames.txt .
#
#*** You must account for all 13387 contig_acc values in config.ra,
#*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
#*** Check the auto-generated suggested.lft to see if it covers all
#*** 13387 contigs; if it does, add 'liftUp suggested.lft' to config.ra.
#*** Then run again with -continue=loadDbSnp .
    cat cantLiftUpSeqNames.txt
#unlocalized-scaffold (no chr23_GL629717_random in chrom.sizes)  NT_182078       chr23

    # Let's look at the assembly we got from GenBank:
    pushd /hive/data/genomes/bosTau7/genbank/Primary_Assembly/
    head unlocalized_scaffolds/unlocalized.chr2scaf
##Chromosome     Unlocalized Scaffold (Accession.version)
#23      GL629717.2
    # -- and that's it!  Apparently GL629717 is the only unlocalized contig.
    # Now check the first section in this file... seqeuences were picked up from
    # assembled_chromosomes and unplaced_scaffolds -- but not from unlocalized_scaffolds.
    # So I believe we have an orphan contig, and need to ignore dbSNP's mappings to it.
    popd
    cut -f 2 cant* > ignoreContig.txt
    cat >> config.ra <<EOF
liftUp suggested.lft
ignoreDbSnpContigsFile ignoreContig.txt
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
    tail -f do.log
    # The script halted because of a malformed fasta header (note the use of ""...""
    # instead of "..." in the alleles keyword):
#Parse error line 18636997:
#>gnl|dbSNP|rs137508789 rs=137508789|pos=51|len=101|taxid=9913|mol="genomic"|class=2|alleles=""N-,C/T, B-, R-,J-,C/T""|build=133|suspect=?
    # Tweak doDbSnp.pl to handle that, re-run with -debug to regenerate addToDbSnp.csh:
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=addToDbSnp -stop=addToDbSnp -debug
    # Now edit addToDbSnp.csh to wrap "if(0)" around the parts that completed,
    # and run the rest of the commands:
    ./addToDbSnp.csh >>& do.log & tail -f do.log
    # Now continue with the next step after addToDbSnp.csh, bigJoin:
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin >>& do.log &
    tail -f do.log
    # Next stop: the same strangely formatted observed alleles above caused an
    # abort in snpNcbiToUcsc:
#Error line 18001467 of ucscNcbiSnp.bed: Encountered something that doesn't fit observedIndelFormat: N-,C/T, B-, R-,J-,C/T
#Command failed:
#ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev nice /hive/data/outside/dbSNP/138/cow/translate.csh
    # Changed snpNcbiToUcsc to assign an ObservedWrongFormat exception instead of aborting.
    # Try again:
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=translate >>& do.log &
    tail -f do.log
#Skipped 172559 snp mappings due to errors -- see snp138Errors.bed
    # That's a lot of errors... take a look and ask dbSNP?  All are 'Missing observed'.
    # Spot-checking... rs210944221 turns up in rs_chNotOn.fas.gz which we exclude...
    # I think it's a bug for it to be there.  It's on chr1 in Btau_4.6.1 (bosTau6) according to
    # http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs210944221
    # Could it be in rs_chNotOn because it's not placed in UMD/bosTau7??
    # Maybe excluding rs_chNotOn is not the right thing to do.
    # Yep, all spot-checks imply we should keep rs_chNotOn.
    # OK, fixed doDbSnp.pl; re-run from addToDbSnp on...
    mkdir -p `cat workingDir`
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=addToDbSnp >>& do.log &
    tail -f do.log
    # Here's the next place that strangely formatted observed alleles gum things up:
#hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd bosTau7 snp138 -sqlTable=snp138.sql snp138.bed.gz
#Reading snp138.bed.gz
#Read 22370480 elements of size 25 from snp138.bed.gz
#Sorted
#Creating table definition for snp138
#Saving /data/tmp/loadBed_hgwdev_8e4_c23ae0.tab
#ERROR: expecting r,g,b specification, found: 'N-,C/T, B-, R-,J-,C/T'
    # The observed column does not contain rgb values.  Up to this point it didn't contain
    # commas either I guess... better add -type=bed6+ to the hgLoadBed command.
    # And better make hgLoadBed not assume itemRgb if bedN < 9!
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=load >>& do.log &
    tail -f do.log
# *** All done!


##############################################################################
# FILTER SNP138 (DONE 2/3/14 angie)
    cd /hive/data/outside/dbSNP/138/cow
    zcat snp138.bed.gz \
    | ~/kent/src/hg/utils/automation/categorizeSnps.pl
#Mult:     2460543
#Common:   10223
#Flagged:  0
#leftover: 19899714
    # 10,461 SNPs out of > 22M that meet the threshold for Common... enough to warrant a
    # Common SNPs track?  Maybe; easy enough to drop it later if we decide it's too sparse.
    foreach f ({Mult,Common}.bed.gz)
        mv $f snp138$f
    end
    # Load tables
    foreach subset (Mult Common)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          bosTau7 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz
    end


##############################################################################
# DBSNP CODING ANNOTATIONS (138) (DONE 2/27/14 angie)
    # Originally done 2/3/14, but Brian L found hgc crash caused by script bug...
    cd /hive/data/outside/dbSNP/138/cow
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    zcat ncbiFuncAnnotations.txt.gz \
    | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiFuncAnnotationsFixed.txt
    wc -l ncbiFuncAnnotationsFixed.txt
#243649 ncbiFuncAnnotationsFixed.txt
   # How many & what kinds of function types?
    cut -f 6 ncbiFuncAnnotationsFixed.txt \
    | sort -n | uniq -c
#  55145 3   (coding-synon) 
# 119906 8   (cds-reference)
#   1037 41  (nonsense)     
#  63668 42  (missense)     
#    146 43  (stop-loss)    
#    572 44  (frameshift)   
#   3175 45  (cds-indel)    

    # 2/27/14: Re-ran from here after fixing bug in script:
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \
    | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin
    hgLoadBed bosTau7 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp138CodingDbSnp.bed
#Read 123098 elements of size 11 from snp138CodingDbSnp.bed


##############################################################################
# LIFTOVER TO bosTau9 (DONE - 2018-11-07 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau7/bed/blat.bosTau9.2018-11-07
    cd /hive/data/genomes/bosTau7/bed/blat.bosTau9.2018-11-07
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/bosTau7/jkStuff/bosTau7.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau7 bosTau9) > do.log 2>&1 &
    # real    562m47.638s

    # verify the convert link on the test browser is now
    # active from bosTau7 to bosTau9

##############################################################################
