# for emacs: -*- mode: sh; -*-

#       DATE:   24-Aug-2011
#       ORGANISM:       Sus scrofa
#       TAXID:  9823
#       ASSEMBLY LONG NAME:     Sscrofa10.2
#       ASSEMBLY SHORT NAME:    Sscrofa10.2
#       ASSEMBLY SUBMITTER:     Swine Genome Sequencing Consortium
#       ASSEMBLY TYPE:  Haploid
#       NUMBER OF ASSEMBLY-UNITS:       1
#       Assembly Accession:     GCA_000003025.4

#       FTP-RELEASE DATE: 08-Sep-2011

#       http://www.ncbi.nlm.nih.gov/genome/84
#       http://www.ncbi.nlm.nih.gov/genome/assembly/304498/
#       http://www.ncbi.nlm.nih.gov/bioproject/13421
#       http://www.ncbi.nlm.nih.gov/bioproject/28993 - chrMt NC_000845.1
#                                              included in Sscrofa10

#       http://www.ncbi.nlm.nih.gov/nuccore/326195957
#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AEMK01
#       Genome Coverage : 24x

#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9823

#       A different chrMt:
#       http://www.ncbi.nlm.nih.gov/bioproject/34563 - chrMt - NC_012095.1

# rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sus_scrofa/Sscrofa10.2/ ./

##########################################################################
# Download sequence (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/susScr3
    cd /hive/data/genomes/susScr3
    mkdir genbank
    cd genbank
    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sus_scrofa/Sscrofa10.2/ ./

    # verify the size of the sequence here:
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
        Primary_Assembly/placed_scaffolds/FASTA/*.fa.gz \
        Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz
    #   5138998834 bases (296506722 N's 4842492112 real 4842492112 upper
    #   0 lower) in 9925 sequences in 41 files
    #   Total size: mean 517783.3 sd 6498082.1
    #   min 100 (gi|333451020|gb|GL895433.1|)
    #   max 315321322 (gi|345632377|gb|CM000812.4|) median 146507

##########################################################################
# fixup names for UCSC standards (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/susScr3/ucsc
    cd /hive/data/genomes/susScr3/ucsc

    ########################  Assembled Chromosomes
    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;

open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close (FH);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz";
    open (UC, "|gzip -c >chr${chrN}.agp.gz") or die "can not write to chr${chrN}.agp.gz";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            $line =~ s/^$acc/chr${chrN}/;
            print UC $line;
        }
    }
    close (FH);
    close (UC);
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
    open (UC, "|gzip -c >chr${chrN}.fa.gz") or die "can not write to chr${chrN}.fa.gz";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            printf UC ">chr${chrN}\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl
    time ./toUcsc.pl
    #   real    11m13.537s

    ########################  Unplaced scaffolds
    # verify we don't have any .acc numbers different from .1
    zcat \
    ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| cut -f1 | egrep "^GL|^AEMK" \
	| sed -e 's/^GL[0-9][0-9]*//; s/^AEMK[0-9][0-9]*//' | sort | uniq -c
    #   111316 .1
    #   130799 .2

    # in that case, since they are not all .1, we need to use the full name
    #   modified to remove the .
    # this is like the unplaced.pl script in other assemblies except it
    #	does not add chrUn_ to the names since they are all just scaffolds

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, "|gzip -c > unplaced.agp.gz") or die "can not write to unplaced.agp.gz";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\./-/;
        printf UC "%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, "|gzip -c > unplaced.fa.gz") or die "can not write to unplaced.fa.gz";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\|.*//;
        $line =~ s/\./-/;
        printf UC ">$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    #   real    1m2.948s

    # verify nothing lost from original:
    time faSize *.fa.gz
    #   2808509378 bases (289538800 N's 2518970578 real 2518970578 upper
    #   0 lower) in 4582 sequences in 21 files
    #   Total size: mean 612944.0 sd 9554154.6 min 100 (GL895433-1)
    #   max 315321322 (chr1) median 25294
    #   real    1m3.821s

    # make sure none of the names got to be over 31 characers long:
    zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
        | sort -rn | uniq -c | head
    #    332154 10
    #   3879 5
    #   6787 4

##########################################################################
# Initial makeGenomeDb.pl (DONE - 2012-04-11 - Hiram)
    cd /hive/data/genomes/susScr3
    cat << '_EOF_' > susScr3.config.ra
# Config parameters for makeGenomeDb.pl:
db susScr3
clade mammal
genomeCladePriority 35
scientificName Sus scrofa
commonName Pig
assemblyDate Aug. 2011
assemblyLabel SGSC Sscrofa10.2 (NCBI project 13421, GCA_000003025.4, WGS AEMK01)
assemblyShortLabel SGSC Sscrofa10.2
ncbiAssemblyName SGSC Sscrofa10.2
ncbiAssemblyId 304498
orderKey 2349
mitoAcc NC_000845
fastaFiles /hive/data/genomes/susScr3/ucsc/*.fa.gz
agpFiles /hive/data/genomes/susScr3/ucsc/*.agp.gz
# qualFiles none
dbDbSpeciesDir pig
taxId 9823
'_EOF_'
    # << happy emacs

    # verify sequence and agp are OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp susScr3.config.ra > agp.log 2>&1
    #   real    8m23.506s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    # finish it off
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev susScr3.config.ra > db.log 2>&1
    #   real    24m50.869s
    #	add the trackDb entries to the source tree, and the 2bit link:
    ln -s `pwd`/susScr3.unmasked.2bit /gbdb/susScr3/susScr3.2bit
    #	browser should function now

#########################################################################
# running repeat masker (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/repeatMasker
    cd /hive/data/genomes/susScr3/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek susScr3 > do.log 2>&1 &
    #   real    990m24.340s

    cat faSize.rmsk.txt
    #   2808525991 bases (289538800 N's 2518987191 real 1404190109 upper
    #   1114797082 lower) in 4583 sequences in 1 files
    #   Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1)
    #   max 315321322 (chr1) median 25278
    #   %39.69 masked total, %44.26 masked real

    egrep -i "versi|relea" do.log
#    April 26 2011 (open-3-3-0) version of RepeatMasker
# CC   RELEASE 20110920;
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    featureBits -countGaps susScr3 rmsk
    #   29525418 bases of 2525996391 (1.169%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/simpleRepeat
    cd /hive/data/genomes/susScr3/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	susScr3 > do.log 2>&1 &
    #   real    9m41.939s
    cat fb.simpleRepeat
    #   29525418 bases of 2525996391 (1.169%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/susScr3
    twoBitMask susScr3.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed susScr3.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa susScr3.2bit stdout | faSize stdin > faSize.susScr3.2bit.txt
    cat faSize.susScr3.2bit.txt
    #   2808525991 bases (289538800 N's 2518987191 real 1403003553 upper
    #   1115983638 lower) in 4583 sequences in 1 files
    #   Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1)
    #   max 315321322 (chr1) median 25278
    #   %39.74 masked total, %44.30 masked real

    rm /gbdb/susScr3/susScr3.2bit
    ln -s `pwd`/susScr3.2bit /gbdb/susScr3/susScr3.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/gap
    cd /hive/data/genomes/susScr3/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../susScr3.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m4.977s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits susScr3 -not gap -bed=notGap.bed
    #   2525996391 bases of 2525996391 (100.000%) in intersection

    time featureBits susScr3 allGaps.bed notGap.bed -bed=new.gaps.bed
    wc -l new.gaps.bed
    #   14502 new.gaps.bed
    #   39382069 bases of 2403678276 (1.638%) in intersection
    #   real    2m51.724s

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" susScr3 | sort -n | tail -1
    #	1382
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" susScr3 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps susScr3 other.bed
    #   702334 bases of 2808525991 (0.025%) in intersection
    wc -l other.bed
    #	14502
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad susScr3 otherGap other.bed
    #   Read 14502 elements of size 8 from other.bed
    #	starting with this many
    hgsql -e "select count(*) from gap;" susScr3
    #	 169119
    hgsql susScr3 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" susScr3
    #	183621
    # == 14502 + 169119
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 susScr3 nonBridged.lift -bedFile=nonBridged.bed
    #   one of these new ones is adjacent to an existing gap:
# WARNING: overlapping gap at chr1:16262949-16312949(fragment) and chr1:16312949-16312957(other)

    # see example in danRer7.txt
    # are there non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" susScr3 | sort | uniq -c
    #  169119 no
    #   14502 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/windowMasker
    cd /hive/data/genomes/susScr3/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev susScr3 > do.log 2>&1 &
    #   real    224m8.502s

    # Masking statistics
    twoBitToFa susScr3.wmsk.2bit stdout | faSize stdin
    #   2808525991 bases (289538800 N's 2518987191 real 1636028761 upper
    #   882958430 lower) in 4583 sequences in 1 files
    #   Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1)
    #   max 315321322 (chr1) median 25278
    #   %31.44 masked total, %35.05 masked real

    twoBitToFa susScr3.wmsk.sdust.2bit stdout | faSize stdin
    #   2808525991 bases (289538800 N's 2518987191 real 1622625959 upper
    #   896361232 lower) in 4583 sequences in 1 files
    #   Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1)
    #   max 315321322 (chr1) median 25278
    #   %31.92 masked total, %35.58 masked real

    hgLoadBed susScr3 windowmaskerSdust windowmasker.sdust.bed.gz
    #   Read 13770155 elements of size 3 from windowmasker.sdust.bed.gz

    featureBits -countGaps susScr3 windowmaskerSdust
    #   1185785055 bases of 2808525991 (42.221%) in intersection

    #	eliminate the gaps from the masking
    featureBits susScr3 -not gap -bed=notGap.bed
    #   2525294057 bases of 2525294057 (100.000%) in intersection
    time nice -n +19 featureBits susScr3 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #   902560289 bases of 2525294057 (35.741%) in intersection
    #   real    3m46.190s

    #	reload track to get it clean
    hgLoadBed susScr3 windowmaskerSdust cleanWMask.bed.gz
    #   Read 13751189 elements of size 4 from cleanWMask.bed.gz
    featureBits -countGaps susScr3 windowmaskerSdust
    #   902560289 bases of 2808525991 (32.136%) in intersection
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../susScr3.unmasked.2bit stdin \
	    -type=.bed susScr3.cleanWMSdust.2bit
    twoBitToFa susScr3.cleanWMSdust.2bit stdout | faSize stdin \
        > susScr3.cleanWMSdust.faSize.txt
    cat susScr3.cleanWMSdust.faSize.txt
    #   2808525991 bases (289538800 N's 2518987191 real 1622625959 upper
    #   896361232 lower) in 4583 sequences in 1 files
    #   Total size: mean 612813.9 sd 9553116.1 min 100 (GL895433-1)
    #   max 315321322 (chr1) median 25278
    #   %31.92 masked total, %35.58 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps susScr3 rmsk windowmaskerSdust
    #   660954442 bases of 2808525991 (23.534%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-11 - Hiram)
    #	since rmsk masks so very little of the genome, use the clean WM mask
    #	on the genome sequence
#    cd /hive/data/genomes/susScr3
#    twoBitMask -add bed/windowMasker/susScr3.cleanWMSdust.2bit \
#	bed/simpleRepeat/trfMask.bed susScr3.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
#    twoBitToFa susScr3.2bit stdout | faSize stdin > faSize.susScr3.txt
#    cat faSize.susScr3.txt
    #   927696114 bases (111611440 N's 816084674 real 607935500 upper
    #   208149174 lower) in 5678 sequences in 1 files
    #   Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1)
    #   max 51042256 (chrLG7) median 2009
    #   %22.44 masked total, %25.51 masked real

    #	create symlink to gbdb
#    rm /gbdb/susScr3/susScr3.2bit
#    ln -s `pwd`/susScr3.2bit /gbdb/susScr3/susScr3.2bit

########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/cpgIslands
    cd /hive/data/genomes/susScr3/bed/cpgIslands
    time doCpgIslands.pl susScr3 > do.log 2>&1
    #   real    16m36.325s

    cat fb.susScr3.cpgIslandExt.txt
    #   28059061 bases of 2525294057 (1.111%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/genscan
    cd /hive/data/genomes/susScr3/bed/genscan
    time doGenscan.pl susScr3 > do.log 2>&1
    #   real    526m12.532s
    # one failing job:  (as found via 'para problems' on swarm)
./runGsBig.csh chr1 000 gtf/000/chr1.gtf pep/000/chr1.pep subopt/000/chr1.bed
    # rerun at a window size of 2000000 worked out OK
    #   real    116m26.816s
    time doGenscan.pl -continue=makeBed susScr3 > makeBed.log 2>&1
    #   real    22m49.789s

    cat fb.susScr3.genscan.txt
    #   54701861 bases of 2525294057 (2.166%) in intersection
    cat fb.susScr3.genscanSubopt.txt
    #   53271634 bases of 2525294057 (2.110%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2518987191 / 2897316137 \) \* 1024
    #	( 2518987191 / 2897316137 ) * 1024 = 890.286997

    # round up to 900  (susScr2 was 750)

    cd /hive/data/genomes/susScr3
    time blat susScr3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/susScr3.11.ooc -repMatch=900
    #   Wrote 27633 overused 11-mers to jkStuff/susScr3.11.ooc
    #   susScr2 had: Wrote 31605 overused 11-mers to jkStuff/susScr2.11.ooc
    #	real     1m33.117s

    # there are non-bridged gaps, create lift file needed for genbank
    hgsql -N -e "select bridge from gap;" susScr3 | sort | uniq -c
    #    169119 no
    #   14502 yes

    cd /hive/data/genomes/susScr3/jkStuff
    gapToLift susScr3 susScr3.nonBridged.lift -bedFile=susScr3.nonBridged.bed
    # this assembly has gaps abutting each other which produces warnings
    #   from this gapToLift program.
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' susScr3.nonBridged.bed | sort -nr | head
    #   3862550 chr13   35251702        39114252        chr13.72

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Sus scrofa	39994	1669350	3943
    # to decide which "native" mrna or ests you want to specify in genbank.conf
#       ORGANISM:       Sus scrofa

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add:
# susScr3 (pig)
susScr3.serverGenome = /hive/data/genomes/susScr3/susScr3.2bit
susScr3.clusterGenome = /hive/data/genomes/susScr3/susScr3.2bit
susScr3.ooc = /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc
susScr3.lift = /hive/data/genomes/susScr3/jkStuff/susScr3.nonBridged.lift
susScr3.perChromTables = no
susScr3.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
susScr3.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
susScr3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
susScr3.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
susScr3.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
susScr3.genbank.est.xeno.pslCDnaFilter    = ${lowCover.genbank.est.xeno.pslCDnaFilter}
susScr3.downloadDir = susScr3
susScr3.refseq.mrna.native.load  = yes
susScr3.refseq.mrna.xeno.load  = yes
susScr3.genbank.mrna.xeno.load = yes
susScr3.genbank.mrna.xeno.loadDesc = yes
susScr3.genbank.est.native.load = yes
# susScr3.upstreamGeneTbl = ensGene
# susScr3.upstreamMaf = multiz12way
# /hive/data/genomes/susScr3/bed/multiz12way/species.list

    # end of section added to etc/genbank.conf
    git commit -m "adding susScr3 pig" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S susScr3           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial susScr3 &
    #	  var/build/logs/2012.05.04-17:32:17.susScr3.initalign.log
    #   real    1612m51.055s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad susScr3 &
    #	logFile: var/dbload/hgwdev/logs/2012.05.08-11:06:15.dbload.log
    #   real    100m45.660s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add susScr3 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added susScr3." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-26 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chr18:20905375-20909537" where name="susScr3";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/susScr3/pushQ
    cd /hive/data/genomes/susScr3/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl susScr3 2> stderr.txt | grep -v transMap > susScr3.sql
    #   real    3m45.401s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/susScr3/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/susScr3/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/susScr3/bbi/quality.bw
# WARNING: susScr3 does not have seq
# WARNING: susScr3 does not have extFile
# WARNING: susScr3 does not have estOrientInfo

    scp -p susScr3.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/susScr3.sql"

############################################################################
# LIFTOVER TO susScr2 (DONE - 2012-08-28 - Hiram )
    mkdir /hive/data/genomes/susScr3/bed/blat.susScr2.2012-08-28
    cd /hive/data/genomes/susScr3/bed/blat.susScr2.2012-08-28
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug \
        -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc susScr3 susScr2
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	 susScr3 susScr2 > do.log 2>&1
    #   real    78m46.791s

#############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/ucscToINSDC
    cd /hive/data/genomes/susScr3/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_000845.1
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/10/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | hgLoadSqlTab susScr3 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords susScr3 ucscToINSDC
    featureBits -countGaps susScr3 ucscToINSDC
    # 2808525991 bases of 2808525991 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# Farm Animal QTL track (IN PROGRESS - 2014-01-06 - Pauline)
    mkdir /hive/data/genomes/susScr3/bed/animalQtl
    cd /hive/data/genomes/susScr3/bed/animalQtl

    wget http://www.animalgenome.org/hu/43874/QTL_SS_10.2.ubed.txt

    zcat QTL_SS_10.2.ubed.txt >> susScr3.animalQtl.bed
    cut -f1-4 susScr3.animalQtl.bed > susScr3.animalQtl.bed4
    hgLoadBed susScr3 animalQtl susScr3.animalQtl.bed4

    checkTableCoords susScr3 animalQtl
    # no results - yay!

    #had to reload to pickup new identifier syntax
    cut -f1-4  QTL_OAR_3.1.ubed.txt > susScr3.animalQtlTEST.bed4
    hgLoadBed susScr3 animalQtl susScr3.animalQtlTEST.bed4

    #added entries in trackDb for track and search


    #script to determine what to include in search rule:

cat > findNames <<EOF
#!/bin/sh
hgsql -N -e 'select name from animalQtl;' bosTau7  > /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' galGal4  >> /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' susScr3  >> /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' oviAri3  >> /tmp/bosTau7.animalQtl.name.txt
export maxLen=`awk '{print length($0)}' /tmp/bosTau7.animalQtl.name.txt | sort -rn | head -1`
echo "scan to column: $maxLen"

export C=1
while [ $C -le $maxLen ];
do
echo -n " $C: "
sort -u /tmp/bosTau7.animalQtl.name.txt | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g'
C=`echo $C | awk '{print $1+1}'`
done

#rm -f /tmp/bosTau7.animalQtl.name.txt
mv /tmp/bosTau7.animalQtl.name.txt .
EOF

#script output:
 1: 12346ABCDEFGHIJKLMNOPQRSTUVWYZcdfmpu
 2: -/0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYacdeimort
 3: %-01234568:ABCDEFGHIKLMNOPRSTUVWXYZ_abcdefnost
 4: %-/0123456789:ABCDEFGHIKLMNOPRSTUVWYZ_abdehilnqrt
 5: -0123456789:ABCDEFGHIJKLMNOPQRSTUVWYZ_abcdiklnorstw
 6: %-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_bcdhimnoprstw
 7: -0123456789:ABCDEFGHIKLMNOPRSTVW_abcent
 8: 0123456789:ABCDEFGHILMNPRSTV_abeflo
 9: 0123456789:BCEFMOPTW_adeklnt
 10: 0123456789:CHLcdfirt
 11: -0123456789:aln
 12: 0123456789git
 13: 0123456789:
 14: 0123456789
 15: 0123456789
 16: 0123456789
 17: 012346
 18: 023456

#rule constructed based on script output - set termregex to this in trackDb
#search stanza
^[a-zA-Z0-9]+[a-zA-Z0-9:/-]+[a-zA-Z0-9:/-_%]+

############################################################################
# DBSNP B138 / SNP138 (DONE 1/29/14 angie)
    # RedMine #12490
    screen
    mkdir -p /hive/data/outside/dbSNP/138/pig
    cd /hive/data/outside/dbSNP/138/pig
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (pig_9823 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_
    # -- use the first num for build setting in config.ra
    # The buildAssembly setting in config.ra is empty because dbSNP stopped including
    # that in file names.
    cat > config.ra <<EOF
db susScr3
build 138
buildAssembly
orgDir pig_9823
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
    # Some trial and error is always required to get the config.ra just right.
    # First stop:
#*** 4562 lines of b138_ContigInfo.bcp.gz contained contig names that
#*** could not be mapped to chrom.size via their GenBank contig mappings; see
#*** /hive/data/outside/dbSNP/138/pig/cantLiftUpSeqNames.txt .
#
#*** You must account for all 9906 contig_acc values in config.ra,
#*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
#*** Check the auto-generated suggested.lft to see if it covers all
#*** 9906 contigs; if it does, add 'liftUp suggested.lft' to config.ra.
#*** Then run again with -continue=loadDbSnp .
    head  -5 cantLiftUpSeqNames.txt
#unplaced-scaffold (no chrUn_GL892100 in chrom.sizes)    NW_003536876    na
#unplaced-scaffold (no chrUn_GL892101 in chrom.sizes)    NW_003536877    na
#unplaced-scaffold (no chrUn_GL892102 in chrom.sizes)    NW_003536878    na
#unplaced-scaffold (no chrUn_GL892103 in chrom.sizes)    NW_003536879    na
#unplaced-scaffold (no chrUn_GL892104 in chrom.sizes)    NW_003536880    na
    grep GL892100 /hive/data/genomes/susScr3/chrom.sizes  | head
#GL892100-1      100
    # Uh-oh -- a naming convention unfamiliar to doDbSnpContig's automatic
    # mapping of RefSeq contig IDs to GenBank contig IDs to chrom.sizes IDs.
    # Adding a special case for susScr3 to doDbSnp.pl, trying again:
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
    tail -f do.log
    # Stopped at same point -- but now it has generated a complete suggested.lft,
    # so add that to config.ra and proceed:
    cat >> config.ra <<EOF
liftUp suggested.lft
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
    tail -f do.log
    # Lost mysql connection during a big load -- modified doDbSnp.pl to be more
    # efficient in the way it removes duplicates from snp138Seq.tab.
    # Regenerate load.csh using -debug:
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=load -debug
    # Now edit load.csh to wrap "if(0)" around the parts that already completed,
    # and run the remaining steps:
    ./load.csh >>& do.log & tail -f do.log
    # All done!


##############################################################################
# FILTER SNP138 (DONE 1/31/14 angie)
    cd /hive/data/outside/dbSNP/138/pig
    zcat snp138.bed.gz \
    | ~/kent/src/hg/utils/automation/categorizeSnps.pl
#Mult:     129889
#Common:   187
#Flagged:  0
#leftover: 28592315
    # 187 SNPs out of > 28M that meet the threshold for Common... not enough to warrant a
    # Common SNPs track.
    foreach f ({Mult}.bed.gz)
        mv $f snp138$f
    end
    # Load tables
    foreach subset (Mult)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          susScr3 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz
    end


##############################################################################
# DBSNP CODING ANNOTATIONS (138) (DONE 2/27/14 angie)
    # Originally done 1/31/14, but Brian L found hgc crash caused by script bug...
    cd /hive/data/outside/dbSNP/138/pig
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    zcat ncbiFuncAnnotations.txt.gz \
    | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiFuncAnnotationsFixed.txt
    wc -l ncbiFuncAnnotationsFixed.txt
#415603 ncbiFuncAnnotationsFixed.txt
   # How many & what kinds of function types?
    cut -f 6 ncbiFuncAnnotationsFixed.txt \
    | sort -n | uniq -c
# 129680 3   (coding-synon)
# 207072 8   (cds-reference)
#   1041 41  (nonsense)
#  77449 42  (missense)
#    161 43  (stop-loss)
#    107 44  (frameshift)
#     93 45  (cds-indel)

    # 2/27/14: Re-ran from here after fixing bug in script:
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \
    | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin
    hgLoadBed susScr3 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp138CodingDbSnp.bed
#Read 207193 elements of size 11 from snp138CodingDbSnp.bed


##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
# LASTZ pig/susScr3 sheep/oviAri3 - (DONE - 2014-05-19 - Hiram)
    mkdir /hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19
    cd /hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19

    cp -p \
/hive/users/hiram/multiz/100way/susScr3.oviAri3/susScr3.oviAri3.tuning.top400.txt \
     ./susScr3.oviAri3.tuning.Q.txt

    cat << '_EOF_' > DEF
# Pig vs. Sheep
# parameters obtained from a tuning run of lastz_D
# /hive/users/hiram/multiz/100way/susScr3.oviAri3/susScr3.oviAri3.tuning.top400.txt

BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
BLASTZ_O=400
BLASTZ_E=30
BLASTZ_M=50

BLASTZ_X=800
BLASTZ_Y=3400
BLASTZ_K=3000
BLASTZ_L=3000
BLASTZ_Q=/hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19/susScr3.oviAri3.tuning.Q.txt
#       A     C     G     T
# A    80  -115   -29  -130
# C  -115   100   -93   -29
# G   -29   -93   100  -115
# T  -130   -29  -115    80

# TARGET: Pig susScr3
SEQ1_DIR=/hive/data/genomes/susScr3/susScr3.2bit
SEQ1_LEN=/hive/data/genomes/susScr3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Sheep oviAri3
SEQ2_DIR=/hive/data/genomes/oviAri3/oviAri3.2bit
SEQ2_LEN=/hive/data/genomes/oviAri3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
      -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
          -syntenicNet) > do.log 2>&1

    # real    1170m6.492s

    cat fb.susScr3.chainOviAri3Link.txt
    # 1453348100 bases of 2525294057 (57.552%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` susScr3 oviAri3) > rbest.log 2>&1 &
    # real    109m38.929s

    # and for the swap:
    mkdir /hive/data/genomes/oviAri3/bed/blastz.susScr3.swap
    cd /hive/data/genomes/oviAri3/bed/blastz.susScr3.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/susScr3/bed/lastzOviAri3.2014-05-19/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    155m31.793s

    cat fb.oviAri3.chainSusScr3Link.txt
    # 1311099603 bases of 2534335866 (51.733%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` oviAri3 susScr3) > rbest.log 2>&1
    # real    42m24.943s

#########################################################################
# LIFTOVER TO susScr11 (DONE - 2017-07027 - Hiram )
    mkdir /hive/data/genomes/susScr3/bed/blat.susScr11.2017-07-27
    cd /hive/data/genomes/susScr3/bed/blat.susScr11.2017-07-27
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug \
        -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc susScr3 susScr11
    # Real run:
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -ooc /hive/data/genomes/susScr3/jkStuff/susScr3.11.ooc \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
	 susScr3 susScr11) > do.log 2>&1 &
    #   real    273m49.891s

    # verify browser can run a view in other genomes to susScr11 from susScr3

#############################################################################
