# for emacs: -*- mode: sh; -*-

# Danio rerio Zv9 sequence:
# ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/
#	Danio_rerio/Zv9/

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CABZ00

##########################################################################
# Download sequence (DONE - 2010-11-30 - Hiram)
    mkdir /hive/data/genomes/danRer7
    cd /hive/data/genomes/danRer7
    mkdir genbank
    cd genbank
    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
	--no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Danio_rerio/Zv9/*"

###########################################################################
# fixup to UCSC names (DONE - 2010-11-30 - Hiram)
    cd /hive/data/genomes/danRer7/genbank

    cat << '_EOF_' > ucscNames.pl
#!/bin/env perl

use strict;
use warnings;

my %cuToChr;

open (FH, "<Primary_Assembly/assembled_chromosomes/chr2acc") or
	die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    chomp $line;
    my ($chrN, $cuName) = split('\s+', $line);
    $cuToChr{$cuName} = $chrN;
    printf "%d\t%s\n", $chrN, $cuName;
    open (AG, "zcat Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or
	die "can not read Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz";
    open (UC, "|gzip -c > ucsc/chr${chrN}.agp.gz") or
	die "can not write to ucsc/chr${chrN}.agp.gz";
    while (my $agpLine = <AG>) {
	if ($agpLine =~ m/^#/) {
	    print UC $agpLine;
	} else {
	    chomp $agpLine;
	    my ($cuId, $rest) = split('\s+', $agpLine, 2);
	    printf UC "chr%d\t%s\n", $cuToChr{$cuId}, $rest;
	}
    }
    close (AG);
    close (UC);
    print `zcat Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz | sed -e "s#>.*#>chr${chrN}#" | gzip -c > ucsc/chr${chrN}.fa.gz`
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ucscNames.pl

    mkdir ucsc
    ./ucscNames.pl

    zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| sed -e "s#^FR\([0-9]*\).1#FR\1#" | gzip -c > ucsc/chr.unplaced.agp.gz

    zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
	| sed -e "s#gi.*emb|##; s#.1| .*##" | gzip -c > ucsc/chr.unplaced.fa.gz

###########################################################################
# Fixup names to match Ensembl naming scheme (DONE - 2010-02-12 - Hiram)
    cd /hive/data/genomes/danRer7/genbank
    cat << '_EOF_' > scafNames.pl
#!/bin/env perl

use strict;
use warnings;

my %accToScaf;

open (FH, "<Primary_Assembly/localID2acc") or
        die "can not read Primary_Assembly/localID2acc";
while (my $line = <FH>) {
    chomp $line;
    my ($scafName, $acc) = split('\s+', $line);
    $accToScaf{$acc} = $scafName;
}
close (FH);

open (FH, "zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz|")
    or die "can not read Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
open (UC, "| gzip -c > ucsc/chr.unplaced.agp.gz") or die "can not write to ucsc/chr.unplaced.agp.gz";

while (my $agpLine = <FH>) {
    if ($agpLine =~ m/^#/) {
        print UC $agpLine;
    } else {
        chomp $agpLine;
        my ($acc, $rest) = split('\s+', $agpLine, 2);
        printf UC "%s\t%s\n", $accToScaf{$acc}, $rest;
    }
}
close (FH);
close (UC);

open (FH, "zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz|")
    or die "can not read Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (UC, "| gzip -c > ucsc/chr.unplaced.fa.gz") or die "can not write to ucsc/chr.unplaced.fa.gz";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/, whole genome shotgun sequence//;
        $line =~ s/^>.*assembly, />/;
        printf UC "%s\n", $line;
    } else {
        printf UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    ./scafNames.pl

###########################################################################
# initial database build (DONE - 2010-12-02 - Hiram)
    cd /hive/data/genomes/danRer7
    cat << '_EOF_' > danRer7.config.ra
# Config parameters for makeGenomeDb.pl:
db danRer7
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Jul. 2010
assemblyLabel Wellcome Trust Sanger Institute Zv9 (GCA_000002035.2)
assemblyShortLabel Zv9
orderKey 447
fastaFiles /hive/data/genomes/danRer7/genbank/ucsc/chr*.fa.gz
agpFiles /hive/data/genomes/danRer7/genbank/ucsc/chr*.agp.gz
# qualFiles none
dbDbSpeciesDir zebrafish
mitoAcc NC_002333.2
taxId 7955
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl danRer7.config.ra
    ln -s `pwd`/danRer7.unmasked.2bit /gbdb/danRer7/danRer7.2bit

###########################################################################
# Ensembl v60 genes (DONE - 2010-12-02 - Hiram)
    cd /hive/data/genomes/danRer7
    cat << '_EOF_' > danRer7.ensGene.ra
# required db variable
db danRer7
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
'_EOF_'
    # << happy emacs

    doEnsGeneUpdate.pl -ensVersion=60 \
	-bigClusterHub=swarm -smallClusterHub=swarm -workhorse=hgwdev \
	    danRer7.ensGene.ra > ensGene.v60.log 2>&1

###########################################################################
# RepeatMasker (DONE - 2010-12-02 - Hiram)
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev danRer7 > do.log 2>&1 &
    #	real    1283m15.593s
    #	these jobs run about twice as slow as other genomes:
# Completed: 2861 of 2861 jobs
# CPU time in finished jobs:   28800301s  480005.01m  8000.08h  333.34d  0.913 y
# IO & Wait Time:               1812390s   30206.50m   503.44h   20.98d  0.057 y
# Average job time:               10700s     178.33m     2.97h    0.12d
# Longest finished job:           11428s     190.47m     3.17h    0.13d
# Submission to last job:        105184s    1753.07m    29.22h    1.22d

    # after resolving various swarm problems, continuing:
    time doRepeatMasker.pl -continue=cat -buildDir=`pwd` \
	-noSplit -bigClusterHub=swarm -dbHost=hgwdev \
	-workhorse=hgwdev danRer7 > cat.log 2>&1 &
    #	real    25m56.591s
    cat faSize.rmsk.txt
# 1412464843 bases (2694734 N's 1409770109 real 676920525 upper
#	732849584 lower) in 1133 sequences in 1 files
# %51.88 masked total, %51.98 masked real

###########################################################################
# TRF/SimpleRepeats (DONE - 2010-12-02 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev danRer7 \
    #  1.5 hours later
    # cluster run failed due to finding nothing on chrM
    time doSimpleRepeat.pl -continue=filter -buildDir=`pwd` \
	-smallClusterHub=swarm -bigClusterHub=swarm -dbHost=hgwdev \
	-workhorse=hgwdev danRer7 > filter.log 2>&1
    #	real    0m32.422s
    cat fb.simpleRepeat
# 101845718 bases of 1409772743 (7.224%) in intersection

    cd /hive/data/genomes/danRer7
    twoBitMask danRer7.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed danRer7.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa danRer7.2bit stdout | faSize stdin > faSize.danRer7.2bit.txt
    cat faSize.danRer7.2bit.txt
# 1412464843 bases (2694734 N's 1409770109 real 675828305 upper
#	733941804 lower) in 1133 sequences in 1 files
# %51.96 masked total, %52.06 masked real

    rm /gbdb/danRer7/danRer7.2bit
    ln -s `pwd`/danRer7.2bit /gbdb/danRer7/danRer7.2bit
#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-12-09 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/gap
    cd /hive/data/genomes/danRer7/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../danRer7.2bit > findMotif.txt 2>&1
    #	real    1m12.153s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits danRer7 -not gap -bed=notGap.bed
    #	1409772743 bases of 1409772743 (100.000%) in intersection
    featureBits danRer7 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	2634 bases of 1409772743 (0.000%) in intersection
    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" danRer7 | sort -n | tail -1
    #	3063
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" danRer7 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps danRer7 other.bed
    #	2634 bases of 1412464843 (0.000%) in intersection
    wc -l other.bed
    #	1975
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad danRer7 otherGap other.bed
    #	starting with this many
    hgsql -e "select count(*) from gap;" danRer7
    #	26921
    hgsql danRer7 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" danRer7
    #	28896
    # == 26921 + 1975
    # there is an odd one marked as bridged when it is actually a continuation
    #	of a non-bridged gap.
    #	Found when running gapToLift below, without the -minGap=100.
    #	chr22:42178985-42178985
    hgsql -e 'update gap set bridge="no" where chromStart=42178984 AND chrom="chr22";' danRer7

##########################################################################
## WINDOWMASKER (DONE- 2013-06-21 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/windowMasker
    cd /hive/data/genomes/danRer7/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev danRer7 > do.log 2>&1 &
    # real     94m57.692s
    # XXX - failed in doLoad due to bugs in safef/hdb.c ...

    cd /hive/data/genomes/danRer7/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -continue=cleanup \
      -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev danRer7 \
      > cleanup.log 2>&1 &


    # Masking statistics
    faSize.danRer7.cleanWMSdust.txt
# 1412464843 bases (2694734 N's 1409770109 real 701628147
#    upper 708141962 lower) in 1133 sequences in 1 files
# Total size: mean 1246659.2 sd 8088447.2 min 650 (Zv9_NA997)
#    max 77276063 (chr7) median 23078
# %50.14 masked total, %50.23 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps danRer7 rmsk windowmaskerSdust \
       > fb.danRer7.rmsk.windowmaskerSdust.txt 2>&1
    #   547071692 bases of 1412464843 (38.732%) in intersection

###########################################################################
# MAKE 11.OOC FILES FOR BLAT (DONE - 2010-12-09 - Hiram)
    ssh kolossus
    # numerator is danRer7 gapless bases "real" as reported by faSize
    # denominator is hg19 gapless bases as reported by featureBits,
    # 1024 is threshold used for human -repMatch:
    calc \( 1409770109 / 2897316137 \) \* 1024
    #	( 1409770109 / 2897316137 ) * 1024 = 498.255808
    # ==> use -repMatch=400 according to size scaled down from 1024 for human.
    #	and rounded down to nearest 100
    cd /hive/data/genomes/danRer7
    blat danRer7.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/danRer7.11.ooc -repMatch=400
    #	Wrote 63970 overused 11-mers to jkStuff/danRer7.11.ooc

    mkdir /hive/data/staging/data/danRer7
    cp -p danRer7.2bit chrom.sizes jkStuff/danRer7.11.ooc \
	/hive/data/staging/data/danRer7

    # just to see what is going on, since all gaps here are almost all
    #	exactly 100 bases, they aren't really "non-bridged", not really.
    gapToLift -minGap=100 -bedFile=jkStuff/nonBridgedGaps.bed danRer7 \
	jkStuff/danRer7.nonBridged.lft

###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
# (DONE - 2010-12-09 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add danRer7 and commit to GIT

# danRer7 (zebrafish)
danRer7.serverGenome = /hive/data/genomes/danRer7/danRer7.2bit
danRer7.clusterGenome = /scratch/data/danRer7/danRer7.2bit
danRer7.ooc = /scratch/data/danRer7/11.ooc
danRer7.lift = no
danRer7.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer7.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer7.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer7.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
danRer7.downloadDir = danRer7
danRer7.perChromTables = no
danRer7.refseq.mrna.xeno.load  = yes
danRer7.upstreamGeneTbl = refGene
danRer7.upstreamMaf = multiz8way /hive/data/genomes/danRer7/bed/multiz8way/species.lst

    # end of section added to etc/genbank.conf
    git commit -m "adding danRer7 definition" genbank.conf
    git push
    make etc-update

    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # danRer genome information

    ssh genbank
    screen                        # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial danRer7 &
    #	logFile: var/build/logs/2010.12.10-10:36:31.danRer7.initalign.log
    # 5h33m

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad danRer7 &
    #	logFile: var/dbload/hgwdev/logs/2010.12.10-16:27:54.dbload.log
    # about 30 minutes

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add danRer7 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added danRer7." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# update default position to be similar to danRer6 (DONE - 2010-12-14 - Hiram)
    hgsql \
 	-e 'update dbDb set defaultPos = "chr13:25004825-25082273" where name = "danRer7";' \
	hgcentraltest

#########################################################################
# construct download files (DONE - 2010-12-14 - Hiram)
    cd /hive/data/genomes/danRer7
    time makeDownloads.pl  danRer7 -workhorse=hgwdev > downloads.log 2>&1
    #	real    12m48.251s
    # edit the README files in goldenPath/*/README

#########################################################################
# Construct pushQ entry (DONE - 2010-12-14 - Hiram)
    mkdir /hive/data/genomes/danRer7/pushQ
    cd /hive/data/genomes/danRer7/pushQ
    makePushQSql.pl danRer7 > danRer7.pushQ.sql
WARNING: hgwdev does not have /gbdb/danRer7/wib/gc5Base.wib
WARNING: hgwdev does not have /gbdb/danRer7/wib/quality.wib
WARNING: hgwdev does not have /gbdb/danRer7/bbi/quality.bw
WARNING: danRer7 does not have seq
WARNING: danRer7 does not have extFile
WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
supporting and genbank tables) which tracks to assign these tables to:
  ensGtp
  ensPep
    scp -p danRer7.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta
    hgsql qapushq < /tmp/danRer7.pushQ.sql

#########################################################################
# LASTZ Xenopus xenTro2 (DONE - 2010-12-17 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17
    cd /hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17

    cat << '_EOF_' > DEF
# zebrafish vs. X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer7
SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ1_LEN=/scratch/data/danRer7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: X. tropicalis xenTro2
SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &

    cat fb.danRer7.chainXenTro2Link.txt
    #	90625809 bases of 1409770109 (6.428%) in intersection

    mkdir /hive/data/genomes/xenTro2/bed/blastz.danRer7.swap
    cd /hive/data/genomes/xenTro2/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzXenTro2.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    32m57.901s
    cat fb.xenTro2.chainDanRer7Link.txt
    #	89862892 bases of 1359412157 (6.610%) in intersection

#########################################################################
# LASTZ Tetraodon TetNig2 (DONE - 2010-12-17 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17
    cd /hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17

    cat << '_EOF_' > DEF
# zebrafish vs tetraodon
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer7
SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ1_LEN=/scratch/data/danRer7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	 Elapsed time: 144m12s

    cat fb.danRer7.chainTetNig2Link.txt
    #	88435047 bases of 1409770109 (6.273%) in intersection

    mkdir /hive/data/genomes/tetNig2/bed/blastz.danRer7.swap
    cd /hive/data/genomes/tetNig2/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzTetNig2.2010-12-17/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    8m43.275s

    cat fb.tetNig2.chainDanRer7Link.txt
    #	71259923 bases of 302314788 (23.571%) in intersection

##############################################################################
# LASTZ Stickleback gasAcu1 (DONE - 2010-12-17 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17
    cd /hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17

    cat << '_EOF_' > DEF
# zebrafish vs. stickleback

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer7
SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ1_LEN=/scratch/data/danRer7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# TARGET: Stickleback gasAcu1, chunk large enough to run largest piece
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes
SEQ2_CTGDIR=/scratch/data/gasAcu1/gasAcu1.contigs.2bit
SEQ2_CTGLEN=/scratch/data/gasAcu1/gasAcu1.contigs.sizes
SEQ2_LIFT=/scratch/data/gasAcu1/gasAcu1.contigs.lift
SEQ2_CHUNK=22000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    247m45.584s
    cat fb.danRer7.chainGasAcu1Link.txt
    #	144782335 bases of 1409770109 (10.270%) in intersection

    mkdir /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap
    cd /hive/data/genomes/gasAcu1/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzGasAcu1.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    25m35.024s
    cat fb.gasAcu1.chainDanRer7Link.txt
    #	122918766 bases of 446627861 (27.522%) in intersection

#########################################################################
# lastz Medaka oryLat2 (DONE - 2010-12-17 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17
    cd /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17

    cat << '_EOF_' > DEF
# Zebrafish vs. Medaka

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer7
SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ1_LEN=/scratch/data/danRer7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# Query: Medaka oryLat2
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-noLoadChainSplit -qRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    258m58.096s
    cat fb.danRer7.chainOryLat2Link.txt
    #	141428502 bases of 1409770109 (10.032%) in intersection

    mkdir /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap
    cd /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-noLoadChainSplit -qRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    20m56.926s
    cat fb.oryLat2.chainDanRer7Link.txt
    #	124527579 bases of 700386597 (17.780%) in intersection

############################################################################
# lastz swap mm9 (DONE - 2010-12-17 - Hiram
    #	original alignment
    cd /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17
    cat fb.danRer7.chainMm9Link.txt
    #	68190354 bases of 2620346127 (2.602%) in intersection

    #	and the swap
    mkdir /hive/data/genomes/danRer7/bed/blastz.mm9.swap
    cd /hive/data/genomes/danRer7/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    16m8.672s

    cat fb.danRer7.chainMm9Link.txt
    #	71960602 bases of 1409770109 (5.104%) in intersection

#########################################################################
# lastz swap from hg19 (DONE - 2010-12-20 - Hiram)
    #	original alignment
    cd /hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17
    cat fb.hg19.chainDanRer7Link.txt
    #	80849592 bases of 2897316137 (2.790%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/danRer7/bed/blastz.hg19.swap
    cd /hive/data/genomes/danRer7/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzDanRer7.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    42m52.920s
    cat fb.danRer7.chainHg19Link.txt
    #	86716552 bases of 1409770109 (6.151%) in intersection

#########################################################################
# lastz Fugu fr2 (DONE - 2010-12-21 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21
    cd /hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21

    cat << '_EOF_' > DEF
# Zebrafish vs. Fugu

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer7
SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ1_LEN=/scratch/data/danRer7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: Fugu fr2
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/scratch/data/fr2/fr2.2bit
SEQ2_LEN=/hive/data/genomes/fr2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/fr2/noUn/fr2.scaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/fr2/noUn/fr2.scaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/fr2/jkStuff/liftAll.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-noLoadChainSplit -qRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    144m0.459s

    cat fb.danRer7.chainFr2Link.txt
    #	103798841 bases of 1409770109 (7.363%) in intersection

    mkdir /hive/data/genomes/fr2/bed/blastz.danRer7.swap
    cd /hive/data/genomes/fr2/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzFr2.2010-12-21/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-noLoadChainSplit -qRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    10m26.244s
    cat fb.fr2.chainDanRer7Link.txt
    #	80078057 bases of 393312790 (20.360%) in intersection

############################################################################
## 8-Way Multiz (DONE - 2011-01-18 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer7/bed/multiz8way
    cd /hive/data/genomes/danRer7/bed/multiz8way

    wget -O 46way.nh \
http://hgdownload.soe.ucsc.edu/goldenPath/hg19/multiz46way/46way.corrected.nh

# ((Zebrafish,
# (Stickleback,
# Tetraodon)),
# (X_tropicalis,
# (Mouse,
# Human)))


    #	All distances remain as specified in the 46way.nh
    /cluster/bin/phast/tree_doctor --prune-all-but danRer6,fr2,oryLat2,gasAcu1,tetNig2,xenTro2,mm9,hg19 46way.nh | sed -e "s/danRer6/danRer7/" > 8way.nh

# (((hg19:0.131183,mm9:0.352605):0.525173,xenTro2:0.852482):0.300396,(((tetNig2:0.224774,fr2:0.205294):0.191836,(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,danRer7:0.731166):0.155214);

    # Mark reports three new reports for fish phylogeny:
    #	(zebrafish, (medaka, (stickleback, (fugu, tetraodon))))
# http://www.ncbi.nlm.nih.gov/pubmed/18771739 analyzed mitochondrial data.
# http://www.ncbi.nlm.nih.gov/pubmed/20546903 analyzed data from 11 nuclear genes.
# http://www.ncbi.nlm.nih.gov/pubmed/17709262 analyzed mitochrondrial data.

    # So, rearrange to get Zebrafish on top and Medaka out on its own:
    cat << '_EOF_' > zFishOnTop.nh
((danRer7:0.731166,(oryLat2:0.478451,(gasAcu1:0.313967,(tetNig2:0.224774,fr2:0.205294):0.191836):0.058404):0.322824):0.155214,(xenTro2:0.852482,(mm9:0.352605,hg19:0.131183):0.525173):0.300396);
'_EOF_'
    # << happy emacs

    # convert to species names
    /cluster/bin/phast/tree_doctor --rename \
"danRer7->Zebrafish; gasAcu1->Stickleback; oryLat2->Medaka; tetNig2->Tetraodon; fr2->Fugu; xenTro2->X_Tropicalis; mm9->Mouse; hg19->Human" \
zFishOnTop.nh

((Zebrafish:0.731166,(Medaka:0.478451,(Stickleback:0.313967,(Tetraodon:0.224774,Fugu:0.205294):0.191836):0.058404):0.322824):0.155214,(X_Tropicalis:0.852482,(Mouse:0.352605,Human:0.131183):0.525173):0.300396);

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a png image for /usr/local/apache/htdocs/images/phylo/danRer7_8way.gif

    /cluster/bin/phast/all_dists zFishOnTop.nh > 8way.distances.txt
    #	Use this output to create the table below
    grep -i danRer7 8way.distances.txt | sort -k3,3n
# danRer7 gasAcu1 1.426361
# danRer7 fr2     1.509524
# danRer7 tetNig2 1.529004
# danRer7 oryLat2 1.532441
# danRer7 hg19    1.843132
# danRer7 xenTro2 2.039258
# danRer7 mm9     2.064554

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#

#                         featureBits chainLink measures
#                                           chainDanRer6Link  chain linearGap
#    distance                    on danRer7  on other     minScore
#  1  1.43 - stickleback gasAcu1   (% 10.27) (% 27.52)       2000     medium
#  2  1.51 - Fugu fr2              (%  7.36) (% 20.36)       2000     medium
#  3  1.53 - tetraodon tetNig2     (%  6.27) (% 23.57)       2000     medium
#  4  1.53 - medaka oryLat2        (% 10.03) (% 17.78)       2000     medium
#  5  1.84 - human hg19            (%  2.79) (%  6.15)       5000     loose
#  6  2.04 - xenopus xenTro2       (%  6.43) (%  6.61)       5000     loose
#  7  2.06 - mouse mm9             (%  2.60) (%  5.10)       5000     loose

# None of this concern for distances matters in building the first step, the
# maf files.

    #	bash shell syntax here ...
    export H=/hive/data/genomes/danRer7/bed
    mkdir mafLinks
    for G in gasAcu1 fr2 tetNig2 oryLat2 hg19 xenTro2 mm9
    do
	mkdir mafLinks/$G
	if [ ! -d ${H}/lastz.${G}/mafNet ]; then
	echo "missing directory lastz.${G}/mafNet"
		exit 255
	fi
	ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	zFishOnTop.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    # location for cluster run
    #/hive/data/genomes/danRer7/bed/multiz8way/mafLinks/

    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn


    # this one is simple enough that it doesn't need a cluster run.

    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = danRer7
set binDir = /hive/data/genomes/danRer7/bed/multiz8way/penn
rm -fr tmp
mkdir -p tmp
cd tmp
foreach s (`cat ../species.lst`)
    if ($s != $db) then
        zcat ../mafLinks/$s/$db.$s.net.maf > $db.$s.sing.maf
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=. E=$db "`cat ../tree.nh`" $db.*.sing.maf ../multiz8way.maf
cd ..
rm -fr tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

    screen
    time ./autoMultiz > autoMultiz.log 2>&1 &
    #	real    242m15.543s
    #	log shows no errors.
    #	-rw-rw-r-- 1 1785003021 Jan 18 18:49 multiz8way.maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/danRer7/bed/multiz8way
    mkdir /gbdb/danRer7/multiz8way
    ln -s `pwd`/multiz8way.maf /gbdb/danRer7/multiz8way
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf danRer7 multiz8way
    #	Indexing and tabulating /gbdb/danRer7/multiz8way/multiz8way.maf
    #	Loaded 2997216 mafs in 1 files from /gbdb/danRer7/multiz8way
    #	real    0m32.612s

    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 danRer7 multiz8waySummary \
	/gbdb/danRer7/multiz8way/multiz8way.maf
    #	Created 1160923 summary blocks from 9245726 components
    #	and 2997216 mafs from /gbdb/danRer7/multiz8way/multiz8way.maf
    #	real    0m34.656s

    wc -l multiz8way*.tab
    #	2997216 multiz8way.tab
    #	1160923 multiz8waySummary.tab
    #	4158139 total

    rm multiz8way*.tab

#########################################################################
# GAP ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2011-01-19 - Hiram)
    # redone properly 2011-05-12
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	are in a single file.
    ssh hgwdev
    mkdir -p /hive/data/genomes/danRer7/bed/multiz8way/anno/mafSplit
    cd /hive/data/genomes/danRer7/bed/multiz8way/anno/mafSplit
    time mafSplit -byTarget -useFullSequenceName \
	/dev/null . ../../multiz8way.maf
    #	real    1m0.519s

    mkdir /hive/data/genomes/danRer7/bed/multiz8way/anno
    cd /hive/data/genomes/danRer7/bed/multiz8way/anno
for DB in `sed -e "s/ danRer7//" ../species.lst`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    mkdir annoSplit
    for F in mafSplit/*.maf
do
    B=`basename ${F}`
    mafAddIRows -nBeds=nBeds ${F} /hive/data/genomes/danRer7/danRer7.2bit \
	annoSplit/${B}
    echo $B
done
    # about 10 minutes
    # combine into one file
    head -q -n 1 annoSplit/chr1.maf > danRer7.8way.maf
    for F in annoSplit/*.maf
do
    grep -h -v "^#" ${F}
done >> danRer7.8way.maf
    #	about 2 minutes
    #	these maf files do not have the end marker, this does nothing:
    #	tail -q -n 1 mafSplit/chr1.maf >> panTro3.12way.maf
    # How about an official end marker:
    echo "##eof maf" >> danRer7.8way.maf

    # Load into database
    rm /gbdb/danRer7/multiz8way/multiz8way.maf   # remove old symlink
    ln -s `pwd`/danRer7.8way.maf /gbdb/danRer7/multiz8way/multiz8way.maf
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf danRer7 multiz8way
    #	Loaded 3365771 mafs in 1 files from /gbdb/danRer7/multiz8way
    #	real    1m22.783s

    time nice hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 danRer7 multiz8waySummary \
	/gbdb/danRer7/multiz8way/multiz8way.maf
    #	Created 1160923 summary blocks from 9245726 components
    #	and 3365771 mafs from /gbdb/danRer7/multiz8way/multiz8way.maf
    #	real    1m37.885s

    wc -l multiz8way*.tab
    #	3365771 multiz8way.tab
    #	1160923 multiz8waySummary.tab
    #	4526694 total

    rm multiz8way*.tab

############################################################################
# Adding automatic generation of upstream files (DONE - 2010-12-23 - Hiram)
    # edit src/hg/makeDb/genbank/etc/genbank.conf to add:
danRer7.upstreamGeneTbl = refGene
danRer7.upstreamMaf = multiz8way /hive/data/genomes/danRer7/bed/multiz8way/species.lst

#########################################################################
# MULTIZ8WAY MAF FRAMES (DONE - 2011-01-19 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer7/bed/multiz8way/frames
    cd /hive/data/genomes/danRer7/bed/multiz8way/frames
    mkdir genes

#danRer7 ensGene
#gasAcu1 ensGene
#tetNig2 ensGene
#fr2	ensGene
#oryLat2 ensGene
#xenTro2 ensGene
#hg19    knownGene
#mm9     knownGene
    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using ensGene for danRer7 gasAcu1 tetNig2 fr2 oryLat2 xenTro2
for qDB in danRer7 gasAcu1 tetNig2 fr2 oryLat2 xenTro2
do
  echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB}
      hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \
      | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
done
    # verify counts of genes are reasonable:
for T in genes/*.gz
do
    echo -n "# $T: "
    zcat $T | cut -f1 | sort | uniq -c | wc -l
done

# genes/danRer7.gp.gz:24068
# genes/fr2.gp.gz:18336
# genes/gasAcu1.gp.gz:20631
# genes/oryLat2.gp.gz:19576
# genes/tetNig2.gp.gz:19539
# genes/xenTro2.gp.gz:17888

    # using knownGene for mm9 hg19
    # genePreds; (must keep only the first 10 columns for knownGene)
    for qDB in mm9 hg19
    do
      echo hgsql -N -e \"'select * from 'knownGene\;\" ${qDB}
      hgsql -N -e "select * from knownGene" ${qDB} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
    done
for T in genes/*.gz
do
    echo -n "# $T: "
    zcat $T | cut -f1 | sort | uniq -c | wc -l
done

# genes/danRer7.gp.gz: 24068
# genes/fr2.gp.gz: 18336
# genes/gasAcu1.gp.gz: 20631
# genes/hg19.gp.gz: 20597
# genes/mm9.gp.gz: 20630
# genes/oryLat2.gp.gz: 19576
# genes/tetNig2.gp.gz: 19539
# genes/xenTro2.gp.gz: 17888

    # reloaded corrected maf 2011-05-12
    ssh hgwdev
    cd /hive/data/genomes/danRer7/bed/multiz8way/frames
    cat ../anno/danRer7.8way.maf \
	| nice -n +19 genePredToMafFrames danRer7 stdin stdout \
	    `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.lst` \
		| gzip > multiz8way.mafFrames.gz
    # verify there are frames on everything:
    zcat multiz8way.mafFrames.gz | awk '{print $4}' | sort | uniq -c
    #	234538 danRer7
    #	418660 fr2
    #	437610 gasAcu1
    #	436329 hg19
    #	422074 mm9
    #	409432 oryLat2
    #	392409 tetNig2
    #	346443 xenTro2

    ssh hgwdev
    cd /hive/data/genomes/danRer7/bed/multiz8way/frames
    nice hgLoadMafFrames danRer7 multiz8wayFrames multiz8way.mafFrames.gz

############################################################################
# creating upstream mafs (DONE - 2011-10-04 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer7/goldenPath/multiz8way
    cd /hive/data/genomes/danRer7/goldenPath/multiz8way
    # run this bash script
    cat << '_EOF_' > mkUpstream.sh
DB=danRer7
GENE=ensGene
NWAY=multiz8way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.${GENE}.maf"
    echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout"
    featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | mafFrags ${DB} ${NWAY} stdin stdout \
                -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.lst \
        | gzip -c > upstream${S}.${GENE}.maf.gz
    echo "done upstream${S}.${GENE}.maf.gz"
done
'_EOF_'
    # << happy emacs
    chmod +x mkUpstream.sh
    time ./mkUpstream.sh
    #	real    61m16.030s

    # FIXUP/VERIFY the genbank.conf file to indicate this gene choice for the
    #	upstream automatic generation

#######################################################################
# Phylogenetic tree from 8-way (DONE - 2011-01-19 - Hiram)
# 	We need one tree for all chroms

    cd /hive/data/genomes/danRer7/bed/multiz8way/
    mkdir mafSplit
    cd mafSplit
    mafSplit -byTarget -useFullSequenceName /dev/null . ../anno/multiz8way.maf
    # got 1072 mafs named after their chrom/scaff .maf
    # although there are over 1133 chroms and scaffolds,
    # some are too small or have nothing aligning.

    mkdir /hive/data/genomes/danRer7/bed/multiz8way/4d
    cd /hive/data/genomes/danRer7/bed/multiz8way/4d

    # danRer7 cannot be too picky, dropping this clause:
    #   refSeqStatus.status='Reviewed'
    hgsql danRer7 -Ne \
    "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and mol='mRNA'" \
	| cut -f 2-20 | egrep -E -v "chrM" \
	> refSeqReviewed.gp
    wc -l refSeqReviewed.gp
    #	14431 refSeqReviewed.gp

    genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
    wc -l refSeqReviewedNR.gp
    #	13835 refSeqReviewedNR.gp

    ssh memk
    mkdir /hive/data/genomes/danRer7/bed/multiz8way/4d/run
    cd /hive/data/genomes/danRer7/bed/multiz8way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set r = "/hive/data/genomes/danRer7/bed/multiz8way"
set c = $1
set infile = $r/mafSplit/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp | sed -e "s/\t$c\t/\tdanRer7.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
else
    echo "" > $r/4d/run/$outfile
endif
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    ls -1S /hive/data/genomes/danRer7/bed/multiz8way/mafSplit/*.maf | \
        egrep -E -v "chrM" \
	| sed -e "s#.*multiz8way/mafSplit/##" \
	> maf.list

    cat << '_EOF_' > template
#LOOP
4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 maf.list single template stdout | tac > jobList
    para create jobList
    para try ... check
    para -maxJob=30 push
    para time
    # three of them have no result:
    #	-rw-rw-r-- 1      0 Jan 19 10:13 Zv9_NA684.mfa
    #	-rw-rw-r-- 1      0 Jan 19 10:13 Zv9_NA269.mfa
    #	-rw-rw-r-- 1      0 Jan 19 10:13 Zv9_NA975.mfa

# Completed: 1068 of 1071 jobs
# Crashed: 3 jobs
# CPU time in finished jobs:        793s      13.21m     0.22h    0.01d  0.000 y
# IO & Wait Time:                  2779s      46.32m     0.77h    0.03d  0.000 y
# Average job time:                   3s       0.06m     0.00h    0.00d
# Longest finished job:              44s       0.73m     0.01h    0.00d
# Submission to last job:           343s       5.72m     0.10h    0.00d

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/danRer7/bed/multiz8way/4d
    # but first clean out junk 1-byte leftovers from above process.
    cd mfa
    find -type f -size 1c | xargs rm
    find -type f -size 0c | xargs rm
    cd ..
    #want comma-less species.lst
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.lst`" mfa/*.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # check they are all in there:
    grep "^>" 4d.all.mfa
    #	>danRer7
    #	>oryLat2
    #	>gasAcu1
    #	>tetNig2
    #	>fr2
    #	>xenTro2
    #	>mm9
    #	>hg19

    # tree-commas.nh:
    # ((danRer7,(oryLat2,(gasAcu1,(tetNig2,fr2)))),(xenTro2,(mm9,hg19)))

    # use phyloFit to create tree model (output is phyloFit.mod)
    time nice -n +19 \
	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree ../tree-commas.nh 4d.all.mfa
    #	real    1m21.248s
    mv phyloFit.mod all.mod

    grep TREE all.mod
    #	TREE: ((danRer7:0.580406,(oryLat2:0.364485,(gasAcu1:0.30842,(tetNig2:0.201742,fr2:0.182553):0.217051):0.106429):0.308334):0.15191,(xenTro2:0.747875,(mm9:0.228088,hg19:0.2107):0.397886):0.15191);

    # create subset of fish-only
    echo "tetNig2 fr2 gasAcu1 oryLat2 danRer7" > fish.lst
    # use only the mfa files that have all five fish sequences:
    mkdir fishMfa
for F in mfa/*.mfa
do
  C=`egrep "^> tetNig2|^> fr2|^> gasAcu1|^> oryLat2|^> danRer7" "${F}" | wc -l`
  if [ "${C}" -eq "5" ]; then
    echo ${F}
  fi
done | while read OK
do
    B=`basename ${OK}`
    awk '
BEGIN { fish = 0 }
{
    if (match($0, "^>")) {
	fish = 1
	if (match($0, "^> mm9")) { fish = 0 }
	if (match($0, "^> hg19")) { fish = 0 }
	if (match($0, "^> xenTro2")) { fish = 0 }
    }
    if (fish) {print}
}
' "${OK}" > fishMfa/${B}
done
    # verify only fish are present
    grep -h "^>" fishMfa/*.mfa | sort | uniq -c
    #	168 > danRer7
    #	168 > fr2
    #	168 > gasAcu1
    #	168 > oryLat2
    #	168 > tetNig2

    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat fish.lst`" fishMfa/*.mfa | sed s/"> "/">"/ \
	> 4d.fish.mfa

    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/tree_doctor \
        --no-branchlen --prune-all-but="`cat fish.lst`" ../tree-commas.nh \
                > tree-commas.fish.nh
    # use phyloFit to create tree model (output is phyloFit.mod)
    time nice -n +19 \
	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree ./tree-commas.fish.nh 4d.fish.mfa
    mv phyloFit.mod fish.mod
    #	real    0m1.495s
    grep TREE fish.mod | sed 's/TREE\:\ //' > fish.8way.nh

    # (danRer7:0.441576,(oryLat2:0.358316,(gasAcu1:0.302322,(tetNig2:0.201705,fr2:0.179742):0.221704):0.106667):0.441576);

#########################################################################
# phastCons 6-way (DONE - 2011-01-19 - Hiram)

    # split 8way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh swarm
    mkdir -p /hive/data/genomes/danRer7/bed/multiz8way/cons/SS
    cd /hive/data/genomes/danRer7/bed/multiz8way/cons/SS

    cat << '_EOF_' > mkSS.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/danRer7/bed/multiz8way/mafSplit/$c.maf
set WINDOWS = /hive/data/genomes/danRer7/bed/multiz8way/cons/SS/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x mkSS.csh

    cat << '_EOF_' > template
#LOOP
mkSS.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	do the easy ones first to see some immediate results
    ls -1S -r ../../mafSplit | sed -e "s/.maf//" > maf.list

    gensub2 maf.list single template jobList
    para -ram=8g create jobList
    para try ... check ... etc

# Completed: 1072 of 1072 jobs
# CPU time in finished jobs:       1029s      17.14m     0.29h    0.01d  0.000 y
# IO & Wait Time:                  3083s      51.39m     0.86h    0.04d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest finished job:              63s       1.05m     0.02h    0.00d
# Submission to last job:           227s       3.78m     0.06h    0.00d

    find . -type f | grep ".ss$" | wc
    #	1082    1082   36982

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh swarm
    mkdir -p /hive/data/genomes/danRer7/bed/multiz8way/cons/run.cons
    cd /hive/data/genomes/danRer7/bed/multiz8way/cons/run.cons

    #	there are going to be only one phastCons run using
    #	this same script.  It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is:
    #	all

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set cX = $1:r
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/danRer7/bed/multiz8way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
set useGrp = "$grp.mod"
ln -s $ssSrc/SS/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../SS -type f | grep ".ss$" | sed -e 's/.ss$//' > ss.list
    wc -l ss.list
    #	1082 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/danRer7/bed/multiz8way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ...
    para -maxJob=64 push
# Completed: 1082 of 1082 jobs
# CPU time in finished jobs:       3539s      58.99m     0.98h    0.04d  0.000 y
# IO & Wait Time:                  7255s     120.91m     2.02h    0.08d  0.000 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest finished job:              35s       0.58m     0.01h    0.00d
# Submission to last job:           432s       7.20m     0.12h    0.01d

    cd /hive/data/genomes/danRer7/bed/multiz8way/cons/all

    find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	    > tmpMostConserved.bed

    /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    # load into database
    time nice -n +19 hgLoadBed danRer7 phastConsElements8way mostConserved.bed
    #	Loaded 1110857 elements of size 5
    #	real    0m7.114s

    # not sure what the targets should be for Zebrafish, on human we often
    # try for 5% overall cov, and 70% CDS cov
    featureBits danRer7 -enrichment refGene:cds phastConsElements8way
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
# refGene:cds 1.320%, phastConsElements8way 13.722%, both 1.101%
# cover 83.38%, enrich 6.08x
    featureBits danRer7 -enrichment ensGene:cds phastConsElements8way
# ensGene:cds 2.783%, phastConsElements8way 13.722%, both 2.170%
# cover 77.98%, enrich 5.68x

    # hg19 for comparison
    #	refGene:cds 1.196%, phastConsElements46way 5.065%,
    #	both 0.888%, cover 74.22%, enrich 14.65x
    #	ensGene:cds 1.278%, phastConsElements46way 5.065%,
    #	both 0.910%, cover 71.23%, enrich 14.06x
    #	knownGene:cds 1.252%, phastConsElements46way 5.065%,
    #	both 0.905%, cover 72.29%, enrich 14.27x

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/danRer7/bed/multiz8way/cons/all
    mkdir downloads

    find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phastCons8way.wigFix.gz

    # check integrity of data with wigToBigWig
    zcat downloads/phastCons8way.wigFix.gz \
	| wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes \
	    phastCons8way.bw

    bigWigInfo phastCons8way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 631,292,485
# primaryIndexSize: 16,520,056
# zoomLevels: 10
# chromCount: 959
# basesCovered: 264,308,757
# mean: 0.680379
# min: 0.000000
# max: 1.000000
# std: 0.360578


    #	encode those files into wiggle data
    zcat downloads/phastCons8way.wigFix.gz \
	| wigEncode stdin phastCons8way.wig phastCons8way.wib
    #   Converted stdin, upper limit 1.00, lower limit 0.00
    du -hsc *.wi?
    #	253M    phastCons8way.wib
    #	49M     phastCons8way.wig
    #	301M    total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons8way.wib /gbdb/danRer7/multiz8way/phastCons8way.wib
    nice hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \
	phastCons8way phastCons8way.wig

  # use to set trackDb.ra entries for wiggle min and max

    wigTableStats.sh danRer7 phastCons8way
# db.table      min max mean count sumData stdDev viewLimits
#danRer7.phastCons8way   0 1 0.680379 264308757 1.7983e+08 0.360578 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram -db=danRer7 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons8way > histogram.data 2>&1
    #	real    0m22.444s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer7 Histogram phastCons8way track"
set xlabel " phastCons8way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    # run for fish only species
    mkdir /hive/data/genomes/danRer7/bed/multiz8way/cons/fish
    cd /hive/data/genomes/danRer7/bed/multiz8way/cons/fish
    #	Using the fish.mod tree
    cp -p ../../4d/fish.mod ./fish.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ...
    para -maxJob=32 push

# Completed: 1082 of 1082 jobs
# CPU time in finished jobs:       3479s      57.99m     0.97h    0.04d  0.000 y
# IO & Wait Time:                  7257s     120.95m     2.02h    0.08d  0.000 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest finished job:              35s       0.58m     0.01h    0.00d
# Submission to last job:           423s       7.05m     0.12h    0.00d

    cd /hive/data/genomes/danRer7/bed/multiz8way/cons/fish

    find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	    > tmpMostConserved.bed

    /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    # load into database
    time nice -n +19 hgLoadBed danRer7 \
	phastConsElements8wayFish mostConserved.bed
    #	Loaded 842567 elements of size 5
    #	real    0m5.941s

    # not sure what the targets should be for Zebrafish, on human we often
    # try for 5% overall cov, and 70% CDS cov
    featureBits danRer7 -enrichment refGene:cds phastConsElements8wayFish
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
# refGene:cds 1.320%, phastConsElements8wayFish 10.576%, both 1.093%
# cover 82.82%, enrich 7.83x

    featureBits danRer7 -enrichment ensGene:cds phastConsElements8wayFish
# ensGene:cds 2.783%, phastConsElements8wayFish 10.576%, both 2.141%
# cover 76.93%, enrich 7.27x

    # hg19 for comparison
    #	refGene:cds 1.196%, phastConsElements46wayPrimates 3.636%,
    #	both 0.738%, cover 61.69%, enrich 16.97x
    #	ensGene:cds 1.278%, phastConsElements46wayPrimates 3.636%,
    #	both 0.749%, cover 58.62%, enrich 16.12x
    #	knownGene:cds 1.252%, phastConsElements46wayPrimates 3.636%,
    #	both 0.747%, cover 59.66%, enrich 16.41x

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/danRer7/bed/multiz8way/cons/fish
    mkdir downloads

    find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phastCons8wayFish.wigFix.gz

    # check integrity of data with wigToBigWig
    zcat downloads/phastCons8wayFish.wigFix.gz \
	| wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes \
	    phastCons8wayFish.bw

    bigWigInfo phastCons8wayFish.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 589,598,860
# primaryIndexSize: 16,520,056
# zoomLevels: 10
# chromCount: 959
# basesCovered: 264,308,757
# mean: 0.584684
# min: 0.000000
# max: 1.000000
# std: 0.363150


    #	encode that ascii data into wiggle data
    zcat downloads/phastCons8wayFish.wigFix.gz \
	| wigEncode stdin phastCons8wayFish.wig phastCons8wayFish.wib
    #   Converted stdin, upper limit 1.00, lower limit 0.00
    du -hsc *.wi?
    #	253M    phastCons8wayFish.wib
    #	51M     phastCons8wayFish.wig
    #	303M    total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons8wayFish.wib \
	/gbdb/danRer7/multiz8way/phastCons8wayFish.wib
    nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \
	phastCons8wayFish phastCons8wayFish.wig

  # use to set trackDb.ra entries for wiggle min and max

    wigTableStats.sh danRer7 phastCons8wayFish
# db.table      min max mean count sumData stdDev viewLimits
#danRer7.phastCons8wayFish 0 1 0.584684 264308757 1.54537e+08 0.36315 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram -db=danRer7 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons8wayFish > histogram.data 2>&1
    #	real    0m22.444s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer7 Histogram phastCons8wayFish track"
set xlabel " phastCons8wayFish score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# phyloP for 8-way (DONE - 2011-01-19 - Hiram)
    # run phyloP with score=LRT
    ssh swarm
    mkdir /cluster/data/danRer7/bed/multiz8way/consPhyloP
    cd /cluster/data/danRer7/bed/multiz8way/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.583
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../cons/all/all.mod 0.583 > all.mod

    #	following the pattern from hg19 with only one grp: "all"
    cat << '_EOF_' > doPhyloP.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
set file1 = $f:t
set out = $2
set cName = $f:t:r
set n = $f:r:e
set grp = $cwd:t
set cons = /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP
set tmp = $cons/tmp/$grp/$f
rm -fr $tmp
mkdir -p $tmp
set ssSrc = "/hive/data/genomes/danRer7/bed/multiz8way/cons/SS/$f"
set useGrp = "$grp.mod"
ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $file1.wigFix
popd > /dev/null
mkdir -p $out:h
sleep 4
mv $tmp/$file1.wigFix $out
rm -fr $tmp
rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$cName
rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
rmdir --ignore-fail-on-non-empty $cons/tmp
'_EOF_'
    # << happy emacs

    # Create list of chunks
    find ../../cons/SS -type f | grep ".ss$" \
	| sed -e "s/.ss$//; s#^../../cons/SS/##" > ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all
    cd /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para -ram=8g create jobList
    para try ... check ... etc ...
    para -maxJob=64 push

    para time > run.time
# Completed: 1082 of 1082 jobs
# CPU time in finished jobs:       4804s      80.06m     1.33h    0.06d  0.000 y
# IO & Wait Time:                  7456s     124.27m     2.07h    0.09d  0.000 y
# Average job time:                  11s       0.19m     0.00h    0.00d
# Longest finished job:              61s       1.02m     0.02h    0.00d
# Submission to last job:           211s       3.52m     0.06h    0.00d

    # make downloads
    mkdir downloads

    find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phyloP8way.wigFix.gz

    # check integrity of data with wigToBigWig
    zcat downloads/phyloP8way.wigFix.gz \
	| wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes phyloP8way.bw
    bigWigInfo phyloP8way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 463,742,565
# primaryIndexSize: 16,520,056
# zoomLevels: 10
# chromCount: 959
# basesCovered: 264,308,757
# mean: 0.475521
# min: -2.143000
# max: 2.572000
# std: 0.752223

    #	encode those files into wiggle data
    zcat downloads/phyloP8way.wigFix.gz \
	| wigEncode stdin phyloP8way.wig phyloP8way.wib
    #	Converted stdin, upper limit 2.57, lower limit -2.14
    du -hsc *.wi?
    #	253M    phyloP8way.wib
    #	49M     phyloP8way.wig
    #	301M    total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP8way.wib /gbdb/danRer7/multiz8way/phyloP8way.wib
    nice hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \
	phyloP8way phyloP8way.wig

  # use to set trackDb.ra entries for wiggle min and max

    wigTableStats.sh danRer7 phyloP8way
# db.table      min max mean count sumData stdDev viewLimits
# danRer7.phyloP8way      -2.143 2.572 0.475521 264308757 1.25684e+08 0.752223
# viewLimits=-2.143:2.572

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram -db=danRer7 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phyloP8way > histogram.data 2>&1
    #	real    0m22.461s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer7 Histogram phyloP8way track"
set xlabel " phyloP8way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.002]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ######################   Running Fish species subset  #####################
    cd /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../cons/fish/fish.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.613
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../cons/fish/fish.mod 0.613 > fish.mod

    # setup run fish species subset
    mkdir /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish
    cd /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para -ram=8g create jobList
    para try ... check ... etc ...
    para -maxJob=32 push

    para time > run.time
# Completed: 1082 of 1082 jobs
# CPU time in finished jobs:       2724s      45.40m     0.76h    0.03d  0.000 y
# IO & Wait Time:                  7369s     122.81m     2.05h    0.09d  0.000 y
# Average job time:                   9s       0.16m     0.00h    0.00d
# Longest finished job:              40s       0.67m     0.01h    0.00d
# Submission to last job:           356s       5.93m     0.10h    0.00d

    # make downloads
    mkdir downloads

    find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phyloP8wayFish.wigFix.gz

    # check integrity of data with wigToBigWig
    zcat downloads/phyloP8wayFish.wigFix.gz \
	| wigToBigWig stdin /hive/data/genomes/danRer7/chrom.sizes \
	    phyloP8wayFish.bw
    bigWigInfo phyloP8wayFish.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 351,513,155
# primaryIndexSize: 16,520,056
# zoomLevels: 10
# chromCount: 959
# basesCovered: 264,308,757
# mean: 0.340395
# min: -1.499000
# max: 1.587000
# std: 0.617638

    #	encode those files into wiggle data
    zcat downloads/phyloP8wayFish.wigFix.gz \
	| wigEncode stdin phyloP8wayFish.wig phyloP8wayFish.wib
    #	Converted stdin, upper limit 1.59, lower limit -1.50
    du -hsc *.wi?
    #	253M    phyloP8wayFish.wib
    #	50M     phyloP8wayFish.wig
    #	302M    total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP8wayFish.wib /gbdb/danRer7/multiz8way/phyloP8wayFish.wib
    nice hgLoadWiggle -pathPrefix=/gbdb/danRer7/multiz8way danRer7 \
	phyloP8wayFish phyloP8wayFish.wig

  # use to set trackDb.ra entries for wiggle min and max

    wigTableStats.sh danRer7 phyloP8wayFish
# db.table      min max mean count sumData stdDev viewLimits
# danRer7.phyloP8wayFish      -1.499 1.587 0.340395 264308757 8.99693e+07 0.617638 viewLimits=-1.499:1.587


    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram -db=danRer7 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phyloP8wayFish > histogram.data 2>&1
    #	real    0m18.184s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer7 Histogram phyloP8wayFish track"
set xlabel " phyloP8wayFish score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.002]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# download data for 8-way (DONE - 2011-01-19 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phastCons8way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phastCons8way
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/cons/all/downloads/phastCons8way.wigFix.gz \
	./vertebrate.phastCons8way.wigFix.gz
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/cons/fish/downloads/phastCons8wayFish.wigFix.gz \
	./fish.phastCons8way.wigFix.gz
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/all/phastCons8way.bw \
	./vertebrate.phastCons8way.bw
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/cons/fish/phastCons8wayFish.bw \
	./fish.phastCons8way.bw

    ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/fish/fish.mod \
	./fish.mod
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/all/all.mod \
	./vertebrate.mod
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/README.txt .
    md5sum * > /hive/data/genomes/danRer7/bed/multiz8way/cons/md5sum.txt
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/cons/md5sum.txt .

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phyloP8way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/phyloP8way
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all/downloads/phyloP8way.wigFix.gz \
	./vertebrate.phyloP8way.wigFix.gz
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish/downloads/phyloP8wayFish.wigFix.gz \
	./fish.phastCons8way.wigFix.gz
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/fish/phyloP8wayFish.bw
	./fish.phastCons8way.bw
    ln -s \
	/hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/all/phyloP8way.bw
	./vertebrate.phyloP8way.bw
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/run.phyloP/fish.mod \
	./fish.mod
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/run.phyloP/all.mod \
	./vertebrate.mod
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/README.txt .
    md5sum * > /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/md5sum.txt
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/consPhyloP/md5sum.txt .

    cd /hive/data/genomes/danRer7/bed/multiz8way
    /cluster/bin/phast/tree_doctor --rename \
"danRer7->Zebrafish; gasAcu1->Stickleback; oryLat2->Medaka; tetNig2->Tetraodon; fr2->Fugu; xenTro2->X_Tropicalis; mm9->Mouse; hg19->Human" \
	zFishOnTop.nh > commonNames.8way.nh

    mkdir /hive/data/genomes/danRer7/bed/multiz8way/downloads
    cd /hive/data/genomes/danRer7/bed/multiz8way/downloads
    cp -p ../anno/danRer7.8way.maf multiz8way.maf
    gzip multiz8way.maf
    #	real    8m35.334s
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/multiz8way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/danRer7/multiz8way
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/8way.nh .
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/commonNames.8way.nh .
    ln -s \
/hive/data/genomes/danRer7/bed/multiz8way/downloads/multiz8way.maf.gz \
	.
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/downloads/README.txt .

    md5sum * > /hive/data/genomes/danRer7/bed/multiz8way/downloads/md5sum.txt
    ln -s /hive/data/genomes/danRer7/bed/multiz8way/downloads/md5sum.txt .

#############################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2010-12-29)
    # bash  if not using bash shell already
)
    cd /cluster/data/danRer7
    mkdir /cluster/data/danRer7/blastDb

    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  danRer7.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
#  1371 pieces of 1371 written
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  danRer7.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y
    rm temp.fa less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 1421

    mkdir -p /cluster/data/danRer7/bed/tblastn.hg18KG
    cd /cluster/data/danRer7/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 1421 query.lst

   # we want around 250000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)

# 36727/(250000/1421) = 208.756268

   mkdir -p kgfa
   split -l 209 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
   done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   wc kg.lst
#  176  176 2288 kg.lst

   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/danRer7/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer7/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    exit

    ssh swarm
    cd /cluster/data/danRer7/bed/tblastn.hg18KG
    gensub2 query.lst kg.lst blastGsub blastSpec
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 250096 of 250096 jobs
# CPU time in finished jobs:    7759615s  129326.92m  2155.45h   89.81d  0.246 y
# IO & Wait Time:               1198357s   19972.62m   332.88h   13.87d  0.038 y
# Average job time:                  36s       0.60m     0.01h    0.00d
# Longest finished job:            1267s      21.12m     0.35h    0.01d
# Submission to last job:         18191s     303.18m     5.05h    0.21d

    ssh swarm
    cd /cluster/data/danRer7/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
    para try, check, push, check etc.

# Completed: 176 of 176 jobs
# CPU time in finished jobs:      70348s    1172.47m    19.54h    0.81d  0.002 y
# IO & Wait Time:                 36253s     604.21m    10.07h    0.42d  0.001 y
# Average job time:                 606s      10.09m     0.17h    0.01d
# Longest finished job:           15700s     261.67m     4.36h    0.18d
# Submission to last job:         15708s     261.80m     4.36h    0.18d

    cd /cluster/data/danRer7/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
# checked: 45598 failed: 0 errors: 0

    # load table
    ssh hgwdev
    cd /cluster/data/danRer7/bed/tblastn.hg18KG
    hgLoadPsl danRer7 blastHg18KG.psl

    # check coverage
    featureBits danRer7 blastHg18KG
# 21889813 bases of 1409770109 (1.553%) in intersection

    featureBits danRer7 blastHg18KG refGene  -enrichment
# blastHg18KG 1.553%, refGene 1.960%, both 0.816%, cover 52.58%, enrich 26.83x

    featureBits danRer7 blastHg18KG ensGene  -enrichment
# blastHg18KG 1.553%, ensGene 3.961%, both 1.392%, cover 89.68%, enrich 22.64x

    rm -rf blastOut
#end tblastn

#############################################################################
# GRC Incident database (DONE - 2011-02-10 - Hiram)
    # used to be NCBI Incident - changed to GRC Incident 2012-04-12
    # this procedure is run as a cron job in Hiram's account:

    #	43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo

    # using the two scrips there: runUpdate.sh and update.sh
    # which are checked into the source tree as files:
    #	src/hg/utils/automation/grcIncidentUpdate.sh
    #	src/hg/utils/automation/grcRunIncidentUpdate.sh

    # they fetch the XML files from NCBI, convert them to SQL text
    # files, construct a bigBed file, and pushes it to genomewiki if
    # it is an update from previous

    # the table in the dataBase is: grcIncidentDb
    # which is the URL to the bb file, a single row:
    # http://genomewiki.ucsc.edu/images/6/64/DanRer7.grcIncidentDb.bb

#########################################################################
# LASTZ Turkey MelGal1 ( DONE - 2011-04-01 - Chin)
    mkdir /hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01
    cd /hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01

    cat << '_EOF_' > DEF
# Turkey vs Zebrafish

# TARGET: Zebrafish danRer7
SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ1_LEN=/scratch/data/danRer7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Turkey melGal1 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # note: No -syntenicNet option below.
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
    # real    179m52.632s
    cat fb.danRer7.chainMelGal1Link.txt
    #   35772051 bases of 1409770109 (2.537%) in intersection
    cd /hive/data/genomes/danRer7/bed
    ln -s lastzMelGal1.2011-04-01 lastz.melGal1

    #   running the swap
    mkdir /hive/data/genomes/melGal1/bed/blastz.danRer7.swap
    cd /hive/data/genomes/melGal1/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/danRer7/bed/lastzMelGal1.2011-04-01/DEF \
        -swap \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
    #   real    3m28.672s
    cat fb.melGal1.chainDanRer7Link.txt
    #   25141503 bases of 935922386 (2.686%) in intersection
    cd /hive/data/genomes/melGal1/bed
    ln -s blastz.danRer7.swap lastz.danRer7
#############################################################################
# BUILD U MASS ChiP-Seq TRACKS (DONE 4/19/11, Fan)

ssh hgwdev
mkdir -p /hive/data/genomes/danRer7/bed/H3K4me
cd /hive/data/genomes/danRer7/bed/H3K4me

# Download the following files from http://lawsonlab.umassmed.edu/ChIP-SeqSupp.html:

wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_input_signal_Zv9.WIG"
wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me1_signal_Zv9.WIG"
wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me3_signal_Zv9.WIG"
wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me1_peaks_Zv9.WIG"
wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me1_hotspots_Zv9.bed"
wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me3_peaks_Zv9.WIG"
wget --timestamping "http://lawsonlab.umassmed.edu/UCSC/Zv9_tracks/24h_me3_hotspots_Zv9.bed"

# build UMassInput track

cat '24h_input_signal_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassInput.wig.input
wigEncode UMassInput.wig.input UMassInput.wig UMassInput.wib
# Converted UMassInput.wig.input, upper limit 537.00, lower limit 10.00

mkdir /gbdb/danRer7/wib/
ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassInput.wib /gbdb/danRer7/wib/UMassInput.wib

hgLoadWiggle danRer7 UMassInput UMassInput.wig
#WARNING: Exceeded Zv9_NA205 size 14440 > 14336. dropping 11 data point(s)
#WARNING: Exceeded Zv9_NA648 size 16200 > 16134. dropping 7 data point(s)
#WARNING: Exceeded chr3 size 63268980 > 63268876. dropping 11 data point(s)
#WARNING: Exceeded chrM size 16730 > 16596. dropping 14 data point(s)

# Build UMassME1 track

cat '24h_me1_signal_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME1.wig.input
wigEncode UMassME1.wig.input UMassME1.wig UMassME1.wib
# Converted UMassME1.wig.input, upper limit 406.00, lower limit 10.00

ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME1.wib /gbdb/danRer7/wib/UMassME1.wib

hgLoadWiggle danRer7 UMassME1 UMassME1.wig
#WARNING: Exceeded Zv9_NA205 size 14400 > 14336. dropping 7 data point(s)
#WARNING: Exceeded Zv9_NA648 size 16220 > 16134. dropping 9 data point(s)
#WARNING: Exceeded Zv9_NA796 size 3350 > 3298. dropping 6 data point(s)
#WARNING: Exceeded Zv9_NA933 size 3040 > 3012. dropping 3 data point(s)
#WARNING: Exceeded Zv9_NA949 size 3090 > 3033. dropping 6 data point(s)
#WARNING: Exceeded Zv9_NA957 size 2920 > 2866. dropping 6 data point(s)
#WARNING: Exceeded Zv9_NA967 size 6710 > 6697. dropping 2 data point(s)
#WARNING: Exceeded chr3 size 63268980 > 63268876. dropping 11 data point(s)

# Build UMassME1Peak track

cat '24h_me1_peaks_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME1Peak.wig.input
wigEncode UMassME1Peak.wig.input UMassME1Peak.wig UMassME1Peak.wib
# Converted UMassME1Peak.wig.input, upper limit 1599.00, lower limit 11.00

ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME1Peak.wib /gbdb/danRer7/wib/UMassME1Peak.wib

hgLoadWiggle danRer7 UMassME1Peak UMassME1Peak.wig

# Build UMassME3 track

cat '24h_me3_signal_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME3.wig.input
wigEncode UMassME3.wig.input UMassME3.wig UMassME3.wib
# Converted UMassME3.wig.input, upper limit 409.00, lower limit 10.00

ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME3.wib /gbdb/danRer7/wib/UMassME3.wib

hgLoadWiggle danRer7 UMassME3 UMassME3.wig
#WARNING: Exceeded Zv9_NA205 size 14440 > 14336. dropping 11 data point(s)
#WARNING: Exceeded Zv9_NA648 size 16230 > 16134. dropping 10 data point(s)
#WARNING: Exceeded Zv9_NA845 size 60950 > 60897. dropping 6 data point(s)
#WARNING: Exceeded Zv9_NA850 size 26100 > 25985. dropping 12 data point(s)
#WARNING: Exceeded Zv9_NA957 size 2870 > 2866. dropping 1 data point(s)
#WARNING: Exceeded Zv9_NA959 size 1190 > 1142. dropping 5 data point(s)
#WARNING: Exceeded chr3 size 63268980 > 63268876. dropping 11 data point(s)

# Build UMassME3Peak track

cat '24h_me3_peaks_Zv9.WIG'|sed -e 's/chrMT/chrM/g' >UMassME3Peak.wig.input
wigEncode UMassME3Peak.wig.input UMassME3Peak.wig UMassME3Peak.wib
# Converted UMassME3Peak.wig.input, upper limit 3749.00, lower limit 14.00

ln -s /hive/data/genomes/danRer7/bed/H3K4me/UMassME3Peak.wib /gbdb/danRer7/wib/UMassME3Peak.wib

hgLoadWiggle danRer7 UMassME3Peak UMassME3Peak.wig

# Build UMassME1Hotspot track

cat '24h_me1_hotspots_Zv9.bed' |grep -v 'hotspot' > UMassME1Hotspot.bed
hgLoadBed danRer7 UMassME1Hotspot UMassME1Hotspot.bed
# Loaded 65585 elements of size 5

# Build UMassME3Hotspot track

cat '24h_me3_hotspots_Zv9.bed' |grep -v 'hotspot' > UMassME3Hotspot.bed
hgLoadBed danRer7 UMassME3Hotspot UMassME3Hotspot.bed
# Loaded 27546 elements of size 5

############################################################################
# running cpgIsland business (DONE - 2011-07-22 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/cpgIsland
    cd /hive/data/genomes/danRer7/bed/cpgIsland
    ln -s ../../../danRer6/bed/cpgIsland/cpglh.exe .

    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../danRer7.2bit:$C stdout \
	| maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    ssh swarm
    cd /hive/data/genomes/danRer7/bed/cpgIsland
    mkdir results
    cut -f1 ../../chrom.sizes > chr.list
    cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out exists results/$(root1).cpg}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	the faCount business is to make sure there is enough sequence to
    #	work with in the fasta.  cpglh.exe does not like files with too many
    #	N's - it gets stuck
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = `faCount hardMaskedFa/$1.fa | egrep "^chr|^Zv" | awk '{print  $2 - $7 }'`
if ( $C > 200 ) then
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
else
    touch $2
endif
'_EOF_'
    # << happy emacs
    chmod +x runOne

    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check ... etc
    para time
# Completed: 1133 of 1133 jobs
# CPU time in finished jobs:        102s       1.70m     0.03h    0.00d  0.000 y
# IO & Wait Time:                  4469s      74.49m     1.24h    0.05d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              10s       0.17m     0.00h    0.00d
# Submission to last job:           485s       8.08m     0.13h    0.01d

    # Transform cpglh output to bed +
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    # verify longest unique chrom name:
    cut -f1 cpgIsland.bed | sed -e 's/_random//' \
	| awk '{print length($0)}' | sort -rn | head -1
    #	16
    # update the length 14 in the template to be 16:
    sed -e "s/14/16/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql

    cd /hive/data/genomes/danRer7/bed/cpgIsland
    hgLoadBed danRer7 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed
# Loaded 12683 elements of size 10

    featureBits danRer7 cpgIslandExt
    #	5207568 bases of 1409770109 (0.369%) in intersection
    # there should be no output from checkTableCoords:
    checkTableCoords -verboseBlocks -table=cpgIslandExt danRer7

    #	cleanup
    rm -fr hardMaskedFa

############################################################################
# CREATE LIFTOVER CHAINS FROM danRer7 TO danRer6 (DONE - 2011-09-09 - Hiram)
    doSameSpeciesLiftOver.pl -debug \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-ooc=/hive/data/genomes/danRer7/jkStuff/danRer7.11.ooc \
	danRer7 danRer6
    cd /hive/data/genomes/danRer7/bed/blat.danRer6.2011-09-09
    time doSameSpeciesLiftOver.pl \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-ooc=/hive/data/genomes/danRer7/jkStuff/danRer7.11.ooc \
	danRer7 danRer6 > do.log 2>&1 &
    #	real    166m45.904s
    # test the procedure in the genome-test browser

#########################################################################
# LASTZ Xenopus xenTro3 (DONE - 2010-12-17 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20
    cd /hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20

    cat << '_EOF_' > DEF
# zebrafish vs. X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer7
SEQ1_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ1_LEN=/scratch/data/danRer7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: X. tropicalis xenTro3
SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit
SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    1021m38.611s

    cat fb.danRer7.chainXenTro3Link.txt
    #	91771881 bases of 1409770109 (6.510%) in intersection
    cd /hive/data/genomes/danRer7/bed
    ln -s lastzXenTro3.2011-09-20 lastz.xenTro3

    mkdir /hive/data/genomes/xenTro3/bed/blastz.danRer7.swap
    cd /hive/data/genomes/xenTro3/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzXenTro3.2011-09-20/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    67m46.506s
    cat fb.xenTro3.chainDanRer7Link.txt
    #	96539932 bases of 1358334882 (7.107%) in intersection

########################################################################
# lastz rn5 rat (DONE - 2012-11-14 - Hiram)
    # the original alignment:
    cd /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13
    cat fb.rn5.chainDanRer7Link.txt
    #	66573209 bases of 2572853723 (2.588%) in intersection

    #	and this swap:
    mkdir /hive/data/genomes/danRer7/bed/blastz.rn5.swap
    cd /hive/data/genomes/danRer7/bed/blastz.rn5.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #	real    15m5.664s
    cat  fb.danRer7.chainRn5Link.txt
    #	68007020 bases of 1409770109 (4.824%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/danRer7/bed
    ln -s blastz.rn5.swap lastz.rn5

#########################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/ucscToINSDC
    cd /hive/data/genomes/danRer7/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_004299.1
    # this one need much extra work in the join since UCSC names
    # were an odd lot based on some other file in genbank
    #  ./verifyAll.sh - not valid here
    # join.sh much abused here
    ./join.sh

    sed -e "s/21/16/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab danRer7 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords danRer7 ucscToINSDC
    featureBits -countGaps danRer7 ucscToINSDC
    # 1412464843 bases of 1412464843 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# genscan - (DONE - 2013-11-12 - Hiram)
    mkdir /hive/data/genomes/danRer7/bed/genscan
    cd /hive/data/genomes/danRer7/bed/genscan
    time doGenscan.pl danRer7 > do.log 2>&1
    #  real    31m10.607s
    # two broken jobs:
./lastRunGsBig.csh chr1 000 gtf/000/chr1.gtf pep/000/chr1.pep subopt/000/chr1.bed &
./lastRunGsBig.csh chr4 000 gtf/000/chr4.gtf pep/000/chr4.pep subopt/000/chr4.bed
    #  real    26m20.610s
    # run 'para time > run.time' on ku, then continuing:
    time doGenscan.pl -continue=makeBed danRer7 > makeBed.log 2>&1
    # real    3m9.048s

    cat fb.danRer7.genscan.txt
    #   54953492 bases of 1409770109 (3.898%) in intersection

    cat fb.danRer7.genscanSubopt.txt
    #   35758035 bases of 1409770109 (2.536%) in intersection

#########################################################################
# CPG Islands Unmasked track (DONE - 2014-04-24 - Hiram)

    mkdir /hive/data/genomes/danRer7/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/danRer7/bed/cpgIslandsUnmasked
    time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
     -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \
        -workhorse=hgwdev \
          -maskedSeq=/hive/data/genomes/danRer7/danRer7.unmasked.2bit \
            danRer7 > do.log 2>&1

    #  Elapsed time: 3m3s

    cat fb.danRer7.cpgIslandExtUnmasked.txt
    # 29436058 bases of 1409770109 (2.088%) in intersection

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
#########################################################################
# LIFTOVER TO danRer10 (DONE - 2015-01-29 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer7/bed/blat.danRer10.2015-01-29
    cd /hive/data/genomes/danRer7/bed/blat.danRer10.2015-01-29
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/danRer7/jkStuff/danRer7.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         danRer7 danRer10) > do.log 2>&1
    # real    182m36.018s

    # verify the convert link on the test browser is now active from danRer7 to
    # danRer10

#########################################################################
# Crispr track. See ../crisprTrack/README.txt (2016-09-15 max)
# Command: doCrispr.sh danRer7 ensGene
##############################################################################
