
# for emacs: -*- mode: sh; -*-

# This file describes browser build for the galGal4
#	Chicken Gallus Gallus genome: Nov. 2011

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AADN00
#	12X coverage via a variety of methods

#	http://www.ncbi.nlm.nih.gov/genome/111
#       http://www.ncbi.nlm.nih.gov/genome/assembly/317958/
#	http://www.ncbi.nlm.nih.gov/bioproject/13342 - WashU
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AADN00
#	http://www.ncbi.nlm.nih.gov/bioproject/10808 - Montreal - chrMt

#############################################################################
# Fetch sequence from genbank (DONE - 2011-12-22 - Hiram)

    mkdir -p /hive/data/genomes/galGal4/genbank
    cd /hive/data/genomes/galGal4/genbank

    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Gallus_gallus/Gallus_gallus-4.0/*"
    #	real    11m57.658s

    # measure sequence to be used here
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
	Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \
	Primary_Assembly/unlocalized_scaffolds/FASTA//*.fa.gz
    #	1046915324 bases (14077289 N's 1032838035 real 1032838035 upper 0 lower) in 15931 sequences in 65 files
    #	Total size: mean 65715.6 sd 2473072.9 min 253 (gi|354532582|gb|AADN03009880.1|) max 195276750 (gi|357973830|gb|CM000093.3|) median 1206

#############################################################################
# process into UCSC naming scheme (DONE - 2011-02-17 - Hiram)
    mkdir /hive/data/genomes/galGal4/ucsc
    cd /hive/data/genomes/galGal4/ucsc

    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;

open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close (FH);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz";
    open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            $line =~ s/^$acc/chr${chrN}/;
            print UC $line;
        }
    }
    close (FH);
    close (UC);
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
    open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            printf UC ">chr${chrN}\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    cat << '_EOF_' > unlocalized.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;
my %chrNames;

open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
        die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close (FH);

foreach my $chrN (keys %chrNames) {
    my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
    open (FH, "zcat $agpFile|") or die "can not read $agpFile";
    open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            chomp $line;
            my (@a) = split('\t', $line);
            my $acc = $a[0];
            my $accNo1 = $acc;
            $accNo1 =~ s/.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
	    # these names became too long, limit is 31 in browser:
	    if ($chrN =~ m/LGE22C19W28_E50C23/) {
my $before = $ucscName;
		$ucscName =~ s/_E50C23//;
		$ucscName =~ s/AAD//;
printf STDERR "shorter: $before -> $ucscName\n";
	    }
            printf UC "%s", $ucscName;
            for (my $i = 1; $i < scalar(@a); ++$i) {
                printf UC "\t%s", $a[$i];
            }
            printf UC "\n";
        }
    }
    close (FH);
    close (UC);
    printf "chr%s\n", $chrN;
    open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
    open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            $acc =~ s/.*gb\|//;
            $acc =~ s/\|.*//;
            my $accNo1 = $acc;
            $accNo1 =~ s/.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
	    # these names became too long, limit is 31 in browser:
	    if ($chrN =~ m/LGE22C19W28_E50C23/) {
		$ucscName =~ s/_E50C23//;
		$ucscName =~ s/AAD//;
	    }
            printf UC ">$ucscName\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl

    time ./toUcsc.pl
    #	real    0m30.708s
    ./unlocalized.pl
    #	real    0m1.397s
    ./unplaced.pl
    #	real    0m1.611s

    gzip *.fa *.agp

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa
# 1046915324 bases (14077289 N's 1032838035 real 1032838035 upper 0 lower) in 15931 sequences in 65 files
# Total size: mean 65715.6 sd 2473072.9 min 253 (chr2_AADN03009880_random) max 195276750 (chr1) median 1206

#############################################################################
#   Initial browser build (DONE - 2012-01-03 - Hiram)
    cd /hive/data/genomes/galGal4
    cat << '_EOF_' > galGal4.config.ra
# Config parameters for makeGenomeDb.pl:
db galGal4
clade vertebrate
genomeCladePriority 50
scientificName Gallus gallus
commonName Chicken
assemblyDate Nov. 2011
assemblyLabel ICGSC Gallus_gallus-4.0 (GCA_000002315.2)
assemblyShortLabel ICGSC Gallus_gallus-4.0
orderKey 434
mitoAcc NC_001323
fastaFiles /hive/data/genomes/galGal4/ucsc/*.fa.gz
agpFiles /hive/data/genomes/galGal4/ucsc/*.agp.gz
dbDbSpeciesDir chicken
taxId   9031
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp galGal4.config.ra > agp.log 2>&1
    #	real    1m37.730s
    # check the end of agp.log to verify it is OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \
	-continue=db galGal4.config.ra > db.log 2>&1
    #	real    8m20.996s
    # the first attempted failed in the:
    #	featureBits -or -countGaps galGal4 gold gap
    # due to the long chrom names, e.g.:
    # chrLGE22C19W28_E50C23_AADN03011267_random
# buffer overflow, size 32, format: %s, buffer: 'chrLGE22C19W28_E50C23_JH375242_'
    # the unlocalized.pl script was reworked a bit to shorten that one
    #	chr random name.

#############################################################################
# running repeat masker (DONE - 2012-01-04 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/repeatMasker
    cd /hive/data/genomes/galGal4/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk galGal4 > do.log 2>&1 &
    #	real    115m21.264s
    cat faSize.rmsk.txt
    #	1046932099 bases (14077289 N's 1032854810 real 921370201 upper
    #	111484609 lower) in 15932 sequences in 1 files
    #	%10.65 masked total, %10.79 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps galGal4 rmsk
    #	111540299 bases of 1046932099 (10.654%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-01-04 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/simpleRepeat
    cd /hive/data/genomes/galGal4/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	galGal4 > do.log 2>&1 &
    #	about 1 hour

    cat fb.simpleRepeat
    #	10508414 bases of 1032854810 (1.017%) in intersection

XXX - ready for this when rmsk is done
    # not going to add to rmsk here, using the window masker instead since
    #	it masks more sequence
    cd /hive/data/genomes/galGal4
    twoBitMask galGal4.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed galGal4.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa galGal4.2bit stdout | faSize stdin > faSize.galGal4.2bit.txt
    cat faSize.galGal4.2bit.txt
    #	1511735326 bases (153400444 N's 1358334882 real 1024824487 upper
    #	333510395 lower) in 19550 sequences in 1 files
    #	%22.06 masked total, %24.55 masked real

    rm /gbdb/galGal4/galGal4.2bit
    ln -s `pwd`/galGal4.2bit /gbdb/galGal4/galGal4.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-01-04 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/gap
    cd /hive/data/genomes/galGal4/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../galGal4.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m17.839s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits galGal4 -not gap -bed=notGap.bed
    #	1043113087 bases of 1043113087 (100.000%) in intersection
    #	real    0m8.581s

    time featureBits galGal4 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	10258277 bases of 1043113087 (0.983%) in intersection
    #	real    1m29.109s

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" galGal4 | sort -n | tail -1
    #	446
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" galGal4 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    wc -l other.bed
    #	11035
    featureBits -countGaps galGal4 other.bed
    #	10258277 bases of 1046932099 (0.980%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad galGal4 otherGap other.bed
    # verify no overlap:
    time featureBits -countGaps galGal4 gap other.bed
    #	0 bases of 1046932099 (0.000%) in intersection
    #	real    0m29.669s

    # verify no errors before adding to the table:
    time gapToLift -minGap=1 galGal4 nonBridged.before.lift \
	-bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 &
    #	real    0m7.205s
    # check for warnings in before.gapToLift.txt, should be empty:
    #	-rw-rw-r-- 1       0 Jan  4 15:13 before.gapToLift.txt
    #	starting with this many:
    hgsql -e "select count(*) from gap;" galGal4
    #	2863
    hgsql galGal4 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" galGal4
    #	13898
    # == 2863 + 11035
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 galGal4 nonBridged.lift -bedFile=nonBridged.bed
    # there should be no errors or other output, checked bridged gaps:
    hgsql -N -e "select bridge from gap;" galGal4 | sort | uniq -c
    #	915 no
    #	12983 yes

##########################################################################
## WINDOWMASKER (DONE - 2011-09-08 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/windowMasker
    cd /hive/data/genomes/galGal4/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev galGal4 > do.log 2>&1 &
    #	about 45 minutes

    # Masking statistics
    twoBitToFa galGal4.wmsk.2bit stdout | faSize stdin
    #	1046932099 bases (14077289 N's 1032854810 real 828377808 upper
    #	204477002 lower) in 15932 sequences in 1 files
    #	%19.53 masked total, %19.80 masked real

    twoBitToFa galGal4.wmsk.sdust.2bit stdout | faSize stdin
    #	1046932099 bases (14077289 N's 1032854810 real 822407863 upper
    #	210446947 lower) in 15932 sequences in 1 files
    #	%20.10 masked total, %20.38 masked real

    hgLoadBed galGal4 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 5996190 elements of size 3

    featureBits -countGaps galGal4 windowmaskerSdust
    #	224522674 bases of 1046932099 (21.446%) in intersection

    #	eliminate the gaps from the masking
    featureBits galGal4 -not gap -bed=notGap.bed
    #	1032854810 bases of 1032854810 (100.000%) in intersection
    time nice -n +19 featureBits galGal4 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	210446947 bases of 1032854810 (20.375%) in intersection
    #	real    2m3.610s
    #	reload track to get it clean
    hgLoadBed galGal4 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 5994965 elements of size 4
    time featureBits -countGaps galGal4 windowmaskerSdust
    #	real    0m40.505s
    #	210446947 bases of 1046932099 (20.101%) in intersection

    #	mask with this clean result
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../galGal4.unmasked.2bit stdin \
	    -type=.bed galGal4.cleanWMSdust.2bit
    twoBitToFa galGal4.cleanWMSdust.2bit stdout | faSize stdin \
        > galGal4.cleanWMSdust.faSize.txt
    cat galGal4.cleanWMSdust.faSize.txt
    #	1046932099 bases (14077289 N's 1032854810 real 822407863 upper
    #	210446947 lower) in 15932 sequences in 1 files
    #	%20.10 masked total, %20.38 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps galGal4 rmsk windowmaskerSdust
    #	71066487 bases of 1046932099 (6.788%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-05 - Hiram)
    cd /hive/data/genomes/galGal4
    twoBitMask -add bed/windowMasker/galGal4.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed galGal4.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa galGal4.2bit stdout | faSize stdin > faSize.galGal4.txt
    cat faSize.galGal4.txt
    #	1046932099 bases (14077289 N's 1032854810 real 822061190 upper
    #	210793620 lower) in 15932 sequences in 1 files
    #	%20.13 masked total, %20.41 masked real

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/galGal4/galGal4.2bit
    ln -s `pwd`/galGal4.2bit /gbdb/galGal4/galGal4.2bit

    #	what happens with all masks:
    twoBitMask -add galGal4.2bit bed/repeatMasker/galGal4.sorted.fa.out \
	galGal4.wm.trf.rmsk.2bit
    twoBitToFa galGal4.wm.trf.rmsk.2bit stdout | faSize stdin
    #	1046932099 bases (14077289 N's 1032854810 real 781762152 upper
    #	251092658 lower) in 15932 sequences in 1 files
    #	%23.98 masked total, %24.31 masked real
    # rmsk covers another 40 million bases 251092658-210793620 = 40299038

########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/cpgIslands
    cd /hive/data/genomes/galGal4/bed/cpgIslands
    time doCpgIslands.pl galGal4 > do.log 2>&1
    #   Elapsed time: 58m41s

    cat fb.galGal4.cpgIslandExt.txt
    #   14619450 bases of 1032854810 (1.415%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/genscan
    cd /hive/data/genomes/galGal4/bed/genscan
    time doGenscan.pl galGal4 > do.log 2>&1
    #   real    29m37.528s
    # one broken job:
./runGsBig.csh chr8 000 gtf/000/chr8.gtf pep/000/chr8.pep subopt/000/chr8.bed
    # rerunning with window size of 2000000
    #   about 10 minutes
    #   continuing:
    time doGenscan.pl -continue=makeBed galGal4 > makeBed.log 2>&1
    #   real    12m15.506s

    cat fb.galGal4.genscan.txt
    #   22904400 bases of 1032854810 (2.218%) in intersection
    cat fb.galGal4.genscanSubopt.txt
    #   24186038 bases of 1032854810 (2.342%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1032838035 / 2897316137 \) \* 1024
    #	( 1032838035 / 2897316137 ) * 1024 = 365.036502

    # round up to 380  (galGal3 was 380)

    cd /hive/data/genomes/galGal4
    time blat galGal4.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/galGal4.11.ooc -repMatch=380
    #   Wrote 12593 overused 11-mers to jkStuff/galGal4.11.ooc
    #   galGal3 had: Wrote 13061 overused 11-mers to /cluster/bluearc/galGal3/11.ooc
    #	real     0m25.395s

    # there are non-bridged gaps, create lift file needed for genbank
    hgsql -N -e "select bridge from gap;" galGal4 | sort | uniq -c
    #   915 no
    #   12983 yes

    cd /hive/data/genomes/galGal4/jkStuff
    gapToLift galGal4 galGal4.nonBridged.lift -bedFile=galGal4.nonBridged.bed
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' galGal4.nonBridged.bed | sort -nr | head
    #   44546952 chr3   11581655        56128607        chr3.03

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Felis catus	1081	919	354
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add:
# galGal4 (chicken)
galGal4.serverGenome = /hive/data/genomes/galGal4/galGal4.2bit
galGal4.clusterGenome = /hive/data/genomes/galGal4/galGal4.2bit
galGal4.ooc = /hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc
galGal4.lift = /hive/data/genomes/galGal4/jkStuff/galGal4.nonBridged.lift
galGal4.perChromTables = no
galGal4.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
galGal4.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
galGal4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
galGal4.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
galGal4.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
galGal4.refseq.mrna.native.load = yes
galGal4.refseq.mrna.xeno.load = yes
galGal4.genbank.mrna.xeno.load = yes
galGal4.downloadDir = galGal4
# galGal4.upstreamGeneTbl = refGene
# galGal4.upstreamMaf = multiz7way
# /hive/data/genomes/galGal3/bed/multiz7way/species.lst

    # end of section added to etc/genbank.conf
    git commit -m "adding galGal4 chicken" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S galGal4           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial galGal4 &
    #	  var/build/logs/2012.05.04-15:58:41.galGal4.initalign.log
    #   real    1604m15.676s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad galGal4 &
    #	logFile: var/dbload/hgwdev/logs/2012.05.08-08:48:35.dbload.log
    #   real    48m18.467s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add galGal4 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added galGal4." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# create downloads and pushQ entry (DONE - 2012-05-24 - Hiram)
    # first make sure all.joiner is up to date and has this new organism
    # a keys check should be clean:
    cd ~/kent/src/hg/makeDb/schema
    time joinerCheck -database=galGal4 -keys all.joiner
    #   real    2m30.542s

    cd /hive/data/genomes/galGal4
    time makeDownloads.pl -workhorse hgwdev galGal4 > download.log 2>&1
    #   real    17m56.594s

    mkdir /hive/data/genomes/galGal4/pushQ
    cd /hive/data/genomes/galGal4/pushQ
    time makePushQSql.pl galGal4 > galGal4.sql 2> stderr.out
    #   real    3m28.689s
    # check stderr.out for no significant problems, it is common to see:
WARNING: hgwdev does not have /gbdb/galGal4/wib/gc5Base.wib
WARNING: hgwdev does not have /gbdb/galGal4/wib/quality.wib
WARNING: hgwdev does not have /gbdb/galGal4/bbi/quality.bw
WARNING: galGal4 does not have seq
WARNING: galGal4 does not have extFile

    scp -p galGal4.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < galGal4.sql

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2012-06-07 - Hiram)
    hgsql -N -e "select frag from gold;" galGal4 | sort | head -1
AADN03009256.1
    hgsql -N -e "select frag from gold;" galGal4 | sort | tail -2
JH375140.1
JH393278.1
NC_001323

    # to test the rule:
    hgsql -N -e "select frag from gold;" galGal4 | wc -l
    #   18795
    # should find all:
    hgsql -N -e "select frag from gold;" galGal4 \
        | egrep -e '[AJN][AHC][D3_][N079][0-9]+(\.1)?' | wc -l
    #   18795
    # or exclude all:
    hgsql -N -e "select frag from gold;" galGal4 \
        | egrep -v -e '[AJN][AHC][D3_][N079][0-9]+(\.1)?' | wc -l
    #   0

    # hence, add to trackDb/chicken/galGal4/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AJN][AHC][D3_][N079][0-9]+(\.1)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

#########################################################################
# LASTZ Medium Ground Finch geoFor1 (DONE - 2012-07-27 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S geoFor1
    mkdir /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27
    cd /hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #	number of jobs, 50,000 to something under 100,000
    cat << '_EOF_' > DEF
# Chicken vs. Medium Ground Finch
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz

# TARGET: chicken galGal4
SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Medium Ground Finch GeoFor1
SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit
SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        > do.log 2>&1
    # the chaining step ran for several days, the longest chain job:
# Longest finished job:          213567s    3559.45m    59.32h    2.47d
    #   Elapsed time: 4572m44s
    cat fb.galGal4.chainGeoFor1Link.txt
    #   744451026 bases of 1032854810 (72.077%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/galGal4/bed
    ln -s lastzGeoFor1.2012-07-27 lastz.geoFor1

    mkdir /hive/data/genomes/geoFor1/bed/blastz.galGal4.swap
    cd /hive/data/genomes/geoFor1/bed/blastz.galGal4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/galGal4/bed/lastzGeoFor1.2012-07-27/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #   real    81m0.123s
    cat fb.geoFor1.chainHg19Link.txt
    #   736380222 bases of 1041286029 (70.718%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/geoFor1/bed
    ln -s blastz.galGal4.swap lastz.galGal4

##############################################################################
#  LASTZ petMar2 Lamprey (DONE - 2012-10-23 - Hiram)
    screen -S galGal4PetMar2	# use a screen to control this multi-day job
    mkdir /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23
    cd /cluster/data/galGal4/bed/lastzPetMar2.2012-10-23

    cat << '_EOF_' > DEF
# Chicken vs Lamprey
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal4
SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Lamprey petMar2
SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit
SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/galGal4/bed/lastzPetMar2.2012-10-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # why is memk used here for big hub ?  Should have been swarm
    time nice -n +19 doBlastzChainNet.pl \
	`pwd`/DEF \
	-workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=memk -verbose=2 > do.log 2>&1 &
    # shut this down on memk after awhile and finished it on swarm
    # real 4136m52.553s
    #	continuing after finishing on swarm:
    time nice -n +19 doBlastzChainNet.pl \
	-continue=cat `pwd`/DEF \
	-workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=swarm -verbose=2 > cat.log 2>&1 &
    #   Elapsed time: 49m2s

    cat fb.galGal4.chainPetMar2Link.txt
    #	18440677 bases of 1032854810 (1.785%) in intersection

    #	and the swap
    mkdir /cluster/data/petMar2/bed/blastz.galGal4.swap
    cd /cluster/data/petMar2/bed/blastz.galGal4.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal4/bed/lastzPetMar2.2012-10-23/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=swarm -verbose=2 > swap.log 2>&1 &
    #	real    6m41.091s
    cat fb.petMar2.chainGalGal4Link.txt
    #	18576173 bases of 647368134 (2.869%) in intersection

############################################################################
# construct liftOver to galGal3 (DONE - 2013-04-30 - Hiram)
    screen -S galGal4	# manage this longish running job in a screen
    mkdir /hive/data/genomes/galGal4/bed/blat.galGal3.2013-04-30
    cd /hive/data/genomes/galGal4/bed/blat.galGal3.2013-04-30
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev galGal4 galGal3 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev galGal4 galGal3 > do.log 2>&1
    #   real    24m2.551s

    # verify this file exists:
    #	/gbdb/galGal4/liftOver/galGal4ToGalGal3.over.chain.gz
    # and try out the conversion on genome-test from galGal4 to galGal3

############################################################################
# construct ucscToEnsembl name translation table (DONE - 2013-05-01 - Hiram)
    mkdir /hive/data/genomes/galGal4/ensembl
    cd /hive/data/genomes/galGal4/ensembl
    rsync -a -P  rsync://ftp.ensembl.org/ensembl/pub/mnt2/release-71/fasta/gallus_gallus/dna/Gallus_gallus.Galgal4.71.dna.toplevel.fa.gz ./
    faCount Gallus_gallus.Galgal4.71.dna.toplevel.fa.gz > ens71.faCount.txt
    cut -f1,2 ens71.faCount.txt \
       | egrep -v "^total|^#seq" | sort -k2nr > ens71.chrom.sizes
    faToTwoBit Gallus_gallus.Galgal4.71.dna.toplevel.fa.gz ens71.galGal4.2bit
    twoBitDup ens71.galGal4.2bit

    cat > ucscToEns.pl << '_EOF_'
#!/usr/bin/env perl

use strict;
use warnings;

my %ucscChromSizes;
my %contigToChr;

open (FH, "<../chrom.sizes") or die "can not read ../chrom.sizes";
while (my $line = <FH>) {
  chomp $line;
  my ($chr, $size) = split('\s+', $line);
  my $ucscChr = $chr;
  $ucscChromSizes{$chr} = $size;
  if ($chr =~ m/chrLGE22C19W28_N03011.*_random/) {
      $chr =~ s/chrLGE22C19W28_N03011/chrLGE22C19W28_AADN03011/;
  }
  if ($chr =~ m/_random/) {
     $chr =~ s/_random//;
     $chr =~ s/.*_//;
  }
  $chr =~ s/^chr[0-9]+_//;
  $chr =~ s/^chrUn_//;
  $chr =~ s/^chr//;
  $contigToChr{$chr} = $ucscChr;
}
close (FH);

open (NF, ">not.found.txt") or die "can not write to not.found.txt";
open (FH, "<ens71.chrom.sizes") or die "can not read ens71.chrom.sizes";
while (my $line = <FH>) {
  chomp $line;
  my ($chr, $size) = split('\s+', $line);
  $chr = "M" if ($chr =~ m/MT/);
  my $ucscContig = $chr;
  $chr = "MT" if ($chr =~ m/M/);
  $ucscContig =~ s/\.1$//;
  my $ucscChr = "unknown";
  my $ucscSize = 0;
  if (exists($contigToChr{$ucscContig})) {
     $ucscChr = $contigToChr{$ucscContig};
     $ucscSize = $ucscChromSizes{$ucscChr};
     die "sizes not equal ? $chr=$size $ucscChr=$ucscSize" if ($size != $ucscSize);
     printf "%s\t%s\n", $ucscChr, $chr;
  } else {
     $ucscContig = "chr${ucscContig}";
     if (exists($contigToChr{$ucscContig})) {
        $ucscChr = $contigToChr{$ucscContig};
        $ucscSize = $ucscChromSizes{$ucscChr};
        die "sizes not equal ? $chr=$size $ucscChr=$ucscSize" if ($size != $ucscSize);
        printf "%s\t%s\n", $ucscChr, $chr;
     } else {
       printf STDERR "can not find contig '$chr'\n";
       printf NF "%s\t%d\n", $chr, $size;
     }
  }
}
close (FH);
close (NF);
'_EOF_'
    # << happy emacs

    chmod +x ucscToEns.pl
    ./ucscToEns.pl > ucscToEnsembl.tab

    mkdir /hive/data/genomes/galGal4/bed/ucscToEnsembl
    cd /hive/data/genomes/galGal4/bed/ucscToEnsembl
    ln -s ../../ensembl/ucscToEnsembl.tab

    cat > ucscToEnsembl.sql << '_EOF_'
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(30))
);
'_EOF_'
   # << happy emacs

   hgsql galGal4 -e "drop table ucscToEnsembl;"
   hgsql galGal4 < ucscToEnsembl.sql
   hgsql galGal4 \
     -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'

   # verify blue navigation bar will now go to Ensembl

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/ucscToINSDC
    cd /hive/data/genomes/galGal4/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    # this one needed some editing to ucscToINSDC.txt and sorting of that
    # to truncate some of the names as were done in the original build
    # since the names went beyond 31 characters
    ./verifyAll.sh
    ./join.sh

    # this didn't work for some odd reason.  Did a manual load of the table:
    # sed -e "s/21/31/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
    #   | hgLoadSqlTab galGal4 ucscToINSDC stdin ucscToINSDC.tab
    sed -e "s/21/31/" $HOME/kent/src/hg/lib/ucscToINSDC.sql > ucscToINSDC.sql
    hgsql galGal4 < ucscToINSDC.sql
    hgsql -e \
'load data local infile "ucscToINSDC.tab" into TABLE ucscToINSDC;' galGal4

    checkTableCoords galGal4 ucscToINSDC
    featureBits -countGaps galGal4 ucscToINSDC
    # 1046932099 bases of 1046932099 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
#  BLASTZ/CHAIN/NET Turkey vs Chicken (DONE - 2013-10-18,25 - Hiram)
#   use a screen to manage this multi-day job
    screen -S melGal1

    mkdir /hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18
    cd /hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18

    cat << '_EOF_' > DEF
# Chicken vs. Turkey
BLASTZ_M=50

# TARGET: Chicken GalGal4
SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Turkey MelGal1
SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18
TMPDIR=/dev/shm
'_EOF_'
    # << this line keeps emacs coloring happy
    # screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real    3091m50.056s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -continue=chainMerge -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1
    #  real    103m43.211s


    cat fb.galGal4.chainMelGal1Link.txt
    #  925109523 bases of 1032854810 (89.568%) in intersection

    cd /hive/data/genomes/galGal4/bed
    ln -s lastzMelGal1.2013-10-18 lastz.melGal1

    #   and then swap
    mkdir /hive/data/genomes/melGal1/bed/blastz.galGal4.swap
    cd /hive/data/genomes/melGal1/bed/blastz.galGal4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/galGal4/bed/lastzMelGal1.2013-10-18/DEF \
        -swap -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    # real    106m45.413s

    cat fb.melGal1.chainGalGal4Link.txt
    # 889739529 bases of 935922386 (95.066%) in intersection

    cd /hive/data/genomes/melGal1/bed
    ln -s blastz.galGal4.swap lastz.galGal4

############################################################################
#  BLASTZ/CHAIN/NET Zebra finch vs Chicken (DONE - 2013-10-18,25 - Hiram)
#   use a screen to manage this multi-day job
    screen -S taeGut2

    mkdir /hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18
    cd /hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18

    cat << '_EOF_' > DEF
# Chicken vs. Zebra finch
BLASTZ_M=50

# TARGET: Chicken GalGal4
SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Zebra finch TaeGut2
SEQ2_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
SEQ2_LEN=/hive/data/genomes/taeGut2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18
TMPDIR=/dev/shm
'_EOF_'
    # << this line keeps emacs coloring happy
    # screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real    3418m20.002s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -continue=chainMerge `pwd`/DEF \
        -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > merge.log 2>&1
    # real    50m6.651s

    cat fb.galGal4.chainTaeGut2Link.txt
    # 705616239 bases of 1032854810 (68.317%) in intersection

    cd /hive/data/genomes/galGal4/bed
    ln -s lastzTaeGut2.2013-10-18 lastz.taeGut2

    #   and then swap
    mkdir /hive/data/genomes/taeGut2/bed/blastz.galGal4.swap
    cd /hive/data/genomes/taeGut2/bed/blastz.galGal4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/galGal4/bed/lastzTaeGut2.2013-10-18/DEF \
        -swap -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    # real    93m43.072s
    cat fb.taeGut2.chainGalGal4Link.txt
    #  814628366 bases of 1222864691 (66.616%) in intersection
    cd /hive/data/genomes/taeGut2/bed
    ln -s blastz.galGal4.swap lastz.galGal4

############################################################################
#  BLASTZ/CHAIN/NET Ostrich vs Chicken (DONE - 2013-11-06 - Hiram)
#   use a screen to manage this multi-day job
    screen -S strCam0

    mkdir /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06
    cd /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06

    cat << '_EOF_' > DEF
# Chicken vs. Ostrich
BLASTZ_M=50

# TARGET: Chicken GalGal4
SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Ostrich StrCam0
SEQ2_DIR=/hive/data/genomes/strCam0/strCam0.2bit
SEQ2_LEN=/hive/data/genomes/strCam0/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06
TMPDIR=/dev/shm
'_EOF_'
    # << this line keeps emacs coloring happy
    # screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real    4093m32.252s

    cat fb.galGal4.chainStrCam0Link.txt
    # 807682594 bases of 1032854810 (78.199%) in intersection

    cd /hive/data/genomes/galGal4/bed
    ln -s lastzStrCam0.2013-11-06 lastz.strCam0

    cd /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06
    time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        galGal4 strCam0 > rbest.log 2>&1 &
    # real    36m13.961s

    #   and then swap, can be done while Rbest is working:
    mkdir /hive/data/genomes/strCam0/bed/blastz.galGal4.swap
    cd /hive/data/genomes/strCam0/bed/blastz.galGal4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/galGal4/bed/lastzStrCam0.2013-11-06/DEF \
        -swap \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    # real    58m45.359s
    cat fb.strCam0.chainGalGal4Link.txt
    #  798315880 bases of 1187883366 (67.205%) in intersection

    cd /hive/data/genomes/strCam0/bed
    ln -s blastz.galGal4.swap lastz.galGal4

    time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        strCam0 galGal4 > rbest.log 2>&1
    # real    40m28.417s

############################################################################
#  BLASTZ/CHAIN/NET Falco peregrinus vs Chicken (DONE - 2013-11-06 - Hiram)
#   use a screen to manage this multi-day job
    screen -S falPer1

    mkdir /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06
    cd /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06

    cat << '_EOF_' > DEF
# Chicken vs. Falco peregrinus
BLASTZ_M=50

# TARGET: Chicken GalGal4
SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: Falco peregrinus FalPer1
SEQ2_DIR=/hive/data/genomes/falPer1/falPer1.2bit
SEQ2_LEN=/hive/data/genomes/falPer1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06
TMPDIR=/dev/shm
'_EOF_'
    # << this line keeps emacs coloring happy
    # screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real    4115m9.767s

    cat fb.galGal4.chainFalPer1Link.txt
    # 848910764 bases of 1032854810 (82.191%) in intersection

    cd /hive/data/genomes/galGal4/bed
    ln -s lastzFalPer1.2013-11-06 lastz.falPer1

    cd /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06
    time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        galGal4 falPer1 > rbest.log 2>&1 &
    # real    37m52.361s


    #   and then swap
    mkdir /hive/data/genomes/falPer1/bed/blastz.galGal4.swap
    cd /hive/data/genomes/falPer1/bed/blastz.galGal4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/galGal4/bed/lastzFalPer1.2013-11-06/DEF \
        -swap \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1

    # real    69m9.773s
    cat fb.falPer1.chainGalGal4Link.txt
    #  831664485 bases of 1153404357 (72.105%) in intersection

    cd /hive/data/genomes/falPer1/bed
    ln -s blastz.galGal4.swap lastz.galGal4

    time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        falPer1 galGal4 > rbest.log 2>&1
    #  real    45m55.767s

############################################################################
# (IN PROGRESS - 2014-01-07 - Pauline)
    mkdir /hive/data/genomes/galGal4/bed/animalQtl
    cd /hive/data/genomes/galGal4/bed/animalQtl

    wget http://www.animalgenome.org/QTLdb/tmp/QTL_GG_4.0.ubed.txt.gz
    zcat QTL_GG_4.0.ubed.txt.gz >> galGal4.animalQtl.bed

    cut -f1-4 galGal4.animalQtl.bed > galGal4.animalQtl.bed4
    hgLoadBed galGal4 animalQtl galGal4.animalQtl.bed4

    checkTableCoords galGal4 animalQtl
    #no results - yay!

    # originally used commands for intial version of file - ids were
    # name:idNumber, contributor reloaded these as numbers so skipped sed step
    # and processed as above.
    # trim redundant columns to the bare minimum, alter name to be
    # QTLname:idNumber to allow use of $p in external URL for idNumber
    cut -f1-4 galGal4.animalQtl.bed | sed -e 's/-/:/;' > galGal4.animalQtl.bed4
    hgLoadBed galGal4 animalQtl galGal4.animalQtl.bed4


    #script to determine what to include in search rule:

cat > findNames <<EOF 
#!/bin/sh
#hgsql -N -e 'select name from animalQtl;' bosTau7 | sort >
#/tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' bosTau7  > /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' galGal4  >> /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' susScr3  >> /tmp/bosTau7.animalQtl.name.txt
hgsql -N -e 'select name from animalQtl;' oviAri3  >> /tmp/bosTau7.animalQtl.name.txt
export maxLen=`awk '{print length($0)}' /tmp/bosTau7.animalQtl.name.txt | sort -rn | head -1`
echo "scan to column: $maxLen"

export C=1
while [ $C -le $maxLen ];
do
echo -n " $C: "
sort -u /tmp/bosTau7.animalQtl.name.txt | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g'
C=`echo $C | awk '{print $1+1}'`
done

#rm -f /tmp/bosTau7.animalQtl.name.txt
mv /tmp/bosTau7.animalQtl.name.txt .
EOF

#script output:
 1: 12346ABCDEFGHIJKLMNOPQRSTUVWYZcdfmpu
 2: -/0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYacdeimort
 3: %-01234568:ABCDEFGHIKLMNOPRSTUVWXYZ_abcdefnost
 4: %-/0123456789:ABCDEFGHIKLMNOPRSTUVWYZ_abdehilnqrt
 5: -0123456789:ABCDEFGHIJKLMNOPQRSTUVWYZ_abcdiklnorstw
 6: %-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_bcdhimnoprstw
 7: -0123456789:ABCDEFGHIKLMNOPRSTVW_abcent
 8: 0123456789:ABCDEFGHILMNPRSTV_abeflo
 9: 0123456789:BCEFMOPTW_adeklnt
 10: 0123456789:CHLcdfirt
 11: -0123456789:aln
 12: 0123456789git
 13: 0123456789:
 14: 0123456789
 15: 0123456789
 16: 0123456789
 17: 012346
 18: 023456

#rule constructed based on script output - set termregex to this in trackDb
#search stanza
^[a-zA-Z0-9]+[a-zA-Z0-9:/-]+[a-zA-Z0-9:/-_%]+

############################################################################
# DBSNP B138 / SNP138 (DONE 1/24/14 angie)
    # RedMine #12490
    screen
    mkdir -p /hive/data/outside/dbSNP/138/chicken
    cd /hive/data/outside/dbSNP/138/chicken
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (chicken_9031 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_
    # -- use the first num for build setting in config.ra
    # The buildAssembly setting in config.ra is empty because dbSNP stopped including
    # that in file names.
    cat > config.ra <<EOF
db galGal4
build 138
buildAssembly 
orgDir chicken_9031
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
    # Some trial and error is always required to get the config.ra just right.
    # First stop:
#*** 34 lines of b138_ContigInfo.bcp.gz contained contig names that
#*** could not be mapped to chrom.size via their GenBank contig mappings; see
#*** /hive/data/outside/dbSNP/138/chicken/cantLiftUpSeqNames.txt .
#
#*** You must account for all 16847 contig_acc values in config.ra,
#*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
#*** Check the auto-generated suggested.lft to see if it covers all
#*** 16847 contigs; if it does, add 'liftUp suggested.lft' to config.ra.
#*** Then run again with -continue=loadDbSnp .

    head -5 cantLiftUpSeqNames.txt
#unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011211_random in chrom.sizes)      NW_003766145    chrLGE22C19W28_E50C23
#unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011213_random in chrom.sizes)      NW_003766138    chrLGE22C19W28_E50C23
#unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011215_random in chrom.sizes)      NW_003766135    chrLGE22C19W28_E50C23
#unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011216_random in chrom.sizes)      NW_003766131    chrLGE22C19W28_E50C23
#unlocalized-scaffold (no chrLGE22C19W28_E50C23_AADN03011217_random in chrom.sizes)      NW_003766128    chrLGE22C19W28_E50C23

    grep chrLGE22C19W28_E50C23 /hive/data/genomes/galGal4/chrom.sizes
#chrLGE22C19W28_E50C23   965146
    # In our assembly, chrLGE22C19W28_E50C23 is an assembled unit, but dbSNP has mappings
    # to individual NW_* contigs.
    # GCF_000002315.3.assembly.txt, downloaded by doDbSnp.pl, has mappings of those NW_* contigs
    # to AADN* GenBank contig IDs like this:
    grep NW_003766145 GCF_000002315.3.assembly.txt
#ChrE22C19W28_E50C23_random_7180000979879        unlocalized-scaffold    LGE22C19W28_E50C23      Linkage Group   AADN03011211.1  =       NW_003766145.1  Primary Assembly

    # Now, see if galGal4/genbank/ has any info about the mapping of those AADN*'s to the
    # assembled LGE22C19W28_E50C23:
    pushd /hive/data/genomes/galGal4/genbank/Primary_Assembly
    find . -type f | xargs grep AADN03011213
#./unlocalized_scaffolds/unlocalized.chr2scaf:LGE22C19W28_E50C23 AADN03011213.1
#./component_localID2acc:ctg7180000923959        AADN03011213.1
#./scaffold_localID2acc:ChrE22C19W28_E50C23_random_7180000977649 AADN03011213.1
    # OK, not as hopeful as I thought (no coord offsets).
    # But, since AADN03011213 is associated with LGE22C19W28_E50C23 in unlocalized_scaffolds,
    # it was in the genbank assembly.  Why not in chrom.sizes?
    # -- because Hiram found that sequence names like chrLGE22C19W28_E50C23_AADN03011213_random
    # would be too long (search above for "these names became too long, limit is 31 in browser").
    # So Hiram shortened them.  Now that we know what they look like in chrom.sizes, we can
    # generate liftUp mappings to add to suggested.lft.
    popd

    # Map too-big sequence name to sequence size, sort on too-big name:
    egrep 'chrLGE22C19W28_(N0|JH)' /hive/data/genomes/galGal4/chrom.sizes \
    | sed -e 's/chrLGE22C19W28_/chrLGE22C19W28_E50C23_/; s/_N0/_AADN0/;' \
    | sort > bigNameToSize
    # Map too-big sequence name to RefSeq NW_* accession, sort on too-big name:
    awk '{print $3 "\t" $6;}' cantLiftUpSeqNames.txt | sort > bigNameToNW
    # Use join and sed to make some liftUp entries (and re-shorten the big names):
    join -o 2.2,1.2,1.1,1.2 bigNameToSize bigNameToNW \
    | sed -re 's/^/0\t/; s/ /\t/g; s/_E50C23//; s/AAD//;' \
    > bigName.lft
    wc -l bigName.lft
#34 bigName.lft

    # Now add our tweaked-name mappings to the auto-generated suggested.lft:
    cat suggested.lft bigName.lft > suggestedPlus.lft
    cat >> config.ra <<EOF
liftUp suggestedPlus.lft
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
    tail -f do.log
# *** All done!


##############################################################################
# FILTER SNP138 (DONE 1/27/14 angie)
    cd /hive/data/outside/dbSNP/138/chicken
    zcat snp138.bed.gz \
    | ~/kent/src/hg/utils/automation/categorizeSnps.pl
#Mult:     189438
#Common:   20461
#Flagged:  0
#leftover: 9173531
    # 20,461 SNPs out of > 9M that meet the threshold for Common... enough to warrant a
    # Common SNPs track?  Maybe; easy enough to drop it later if we decide it's too sparse.
    foreach f ({Mult,Common}.bed.gz)
        mv $f snp138$f
    end
    # Load tables
    foreach subset (Mult Common)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          galGal4 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz
    end


##############################################################################
# DBSNP CODING ANNOTATIONS (138) (DONE 2/27/14 angie)
    # Originally done 1/27/14, but Brian L found hgc crash caused by script bug...
    cd /hive/data/outside/dbSNP/138/chicken
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    zcat ncbiFuncAnnotations.txt.gz \
    | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiFuncAnnotationsFixed.txt
    wc -l ncbiFuncAnnotationsFixed.txt
#281194 ncbiFuncAnnotationsFixed.txt
   # How many & what kinds of function types?
    cut -f 6 ncbiFuncAnnotationsFixed.txt \
    | sort -n | uniq -c
#  84191 3   (coding-synon)
# 125292 8   (cds-reference)
#   1230 41  (nonsense)
#  46722 42  (missense)
#     67 43  (stop-loss)
#   4017 44  (frameshift)
#  19675 45  (cds-indel)

    # 2/27/14: Re-ran from here after fixing bug in script:
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \
    | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin
    hgLoadBed galGal4 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp138CodingDbSnp.bed
#Read 145135 elements of size 11 from snp138CodingDbSnp.bed


##############################################################################
# Farm Animal Cytoband track (IN PROGRESS - 2014-02-04 - Pauline)
    mkdir /hive/data/genomes/galGal4/bed/cytoBand
    cd /hive/data/genomes/galGal4/bed/cytoBand

    wget http://www.animalgenome.org/hu/share/GbandCoord.tar.gz

##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# LIFTOVER TO galGal5 (DONE - 2016-04-15 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/galGal4/bed/blat.galGal5.2016-04-15
    cd /hive/data/genomes/galGal4/bed/blat.galGal5.2016-04-15
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \
         galGal4 galGal5
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \
         galGal4 galGal5) > doLiftOverToGalGal5.log 2>&1
    # real    155m34.324s

    # see if the liftOver menus function in the browser from galGal5 to galGal4

##############################################################################
# LASTZ chicken/galGal4 vs. Chinese softshell turtle - (DONE - 2016-05-17 - Hiram)
    mkdir /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17
    cd /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17

    printf '# chicken vs. Chinese softshell turtle
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_M=254

# TARGET: chicken galGal4
SEQ1_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ1_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LIMIT=100
SEQ1_LAP=10000

# QUERY: Chinese softshell turtle pelSin1
SEQ2_DIR=/hive/data/genomes/pelSin1/pelSin1.2bit
SEQ2_LEN=/hive/data/genomes/pelSin1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE= /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # real    85m6.637s

    cat fb.galGal4.chainPelSin1Link.txt
    #  253365632 bases of 1032854810 (24.531%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` galGal4 pelSin1) \
          > rbest.log 2>&1 &
    #  real    110m52.863s

    # and for the swap:
    mkdir /hive/data/genomes/pelSin1/bed/blastz.galGal4.swap
    cd /hive/data/genomes/pelSin1/bed/blastz.galGal4.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/galGal4/bed/lastzPelSin1.2016-05-17/DEF \
        -swap -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    50m44.030s

    cat fb.pelSin1.chainGalGal4Link.txt
    # 269390769 bases of 2106639384 (12.788%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` pelSin1 galGal4) \
       > rbest.log 2>&1
    # real    168m45.457s

#########################################################################
# LIFTOVER TO galGal6 (DONE - 2016-10-12 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/galGal4/bed/blat.galGal6.2018-10-12
    cd /hive/data/genomes/galGal4/bed/blat.galGal6.2018-10-12
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \
         galGal4 galGal6
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/galGal4/jkStuff/galGal4.11.ooc \
         galGal4 galGal6) > doLiftOverToGalGal6.log 2>&1
    # real    47m3.714s

    # see if the liftOver menus function in the browser from galGal4 to galGal6

#########################################################################
