# for emacs: -*- mode: sh; -*-


# Drosophila Melanogaster -- 
# 
# euchromatin (2L, 2R, 3L, 3R, 4, X):
# Berkeley Drosophila Genome Project (fruitfly.org) release 4 (Apr. 2004)
# http://www.fruitfly.org/annot/release4.html
#
# heterochromatin (2h, 3h, 4h, Xh, Yh, U):
# Drosophila Heterochromatin Genome Project (dhgp.org) release 3.2 
# (submitted to GenBank June 2004) 
# http://www.dhgp.org/index_release_notes.html
#
# Gene annotations:
# FlyBase (http://flybase.bio.indiana.edu/) last updated ???
#

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+-------------------------+
#  | tableName   | type                    |
#  +-------------+-------------------------+
#  | flyBaseGene | genePred flyBasePep     |
#  | refGene     | genePred refPep refMrna |
#  | nscanGene   | genePred                |
#  | geneid      | genePred geneidPep      |
#  | genscan     | genePred genscanPep     |
#  | augustus    | genePred                |
#  +-------------+-------------------------+


# DOWNLOAD SEQUENCE (DONE 9/8/04 angie)
    ssh kksilo
    mkdir /cluster/store8/dm2
    ln -s /cluster/store8/dm2 /cluster/data/dm2
    cd /cluster/data/dm2
    wget ftp://ftp.fruitfly.org/pub/download/compressed/na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz
    zcat na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz \
    | faSplit byname stdin dummyArg
    # Follow FlyBase's lead on the chromosome names, but still use our 
    # "chr" prefix:
    foreach c (2L 2R 3L 3R 4 X)
      mkdir $c
      sed -e 's/^>arm_/>chr/' arm_$c.fa > $c/chr$c.fa
      echo arm_$c.fa size:
      faSize arm_$c.fa
      echo $c/chr$c.fa size:
      faSize $c/chr$c.fa
      echo comparison:
      faCmp arm_$c.fa $c/chr$c.fa
      echo ""
    end
    # heterochromatin sold separately... ftp://ftp.dhgp.org/pub/DHGP still 
    # has release 3.2 as its latest.  But that's June 2004... wtf, grab it:
    wget ftp://ftp.dhgp.org/pub/DHGP/Release3.2/FASTA/super-scaffolds/heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz
    tar xvzf heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz
    foreach c (2h 3h 4h Xh Yh U)
      mkdir $c
      perl -wpe 's/^>(\w+).*/>chr$1/' $c.FASTA > $c/chr$c.fa
      echo $c.FASTA size:
      faSize $c.FASTA
      echo $c/chr$c.fa size:
      faSize $c/chr$c.fa
      echo comparison:
      faCmp $c.FASTA $c/chr$c.fa
      echo ""
    end
    # Carefully review output of those commands, then:
    rm *.fa *.FASTA

    # put away the big download files
    mkdir downloads
    mv *.gz downloads/


# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 9/9/04 angie)
    mkdir /cluster/data/dm2/M
    cd /cluster/data/dm2/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for 
    # "drosophila melanogaster mitochondrion genome".  That shows the gi number:
    # 5835233
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5835233&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the 
    # Drosophila melanogaster mitochondrion complete genome, and then replace the 
    # header line with just ">chrM".


# SPLIT CHROM FA INTO SMALLER CHUNKS BY GAPS (DONE 9/9/04 angie)
    ssh kksilo
    cd /cluster/data/dm2
    foreach c (?{,?})
      faSplit -minGapSize=100 -lift=$c/chr$c.lft \
        gap $c/chr$c.fa 2000000 $c/chr${c}_
    end
    foreach ctgFa (?{,?}/chr*_*.fa)
      set ctg = $ctgFa:r
      mkdir $ctg
      mv $ctgFa $ctg
    end


# CREATING DATABASE (DONE 9/9/04 angie)
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  535G  1.1T  33% /var/lib/mysql
    # Create the database.
    hgsql '' -e 'create database dm2'


# EXTRACT GAP INFORMATION FROM FASTA, LOAD GAP TRACK (DONE 9/9/04 angie)
    ssh kksilo
    cd /cluster/data/dm2
    # Do as Jim suggested back when we got dm1 sequence:
    # I think that we can probably just show all gaps as bridged
    # in the non-h chromosomes, and as unbridged in the h chromosomes
    # and leave it at that.

    # Extract gaps using scaffoldFaToAgp.  It's really meant for a different 
    # purpose, so clean up its output: remove the .lft and .agp, and remove 
    # the last line of .gap (extra gap added at end).  Also substitute in 
    # the correct chrom name in .gap.  
    foreach c (?{,?})
      set chr = chr$c
      pushd $c
      mv $chr.lft $chr.lft.bak
      scaffoldFaToAgp -minGapSize=100 $chr.fa
      rm $chr.{lft,agp}
      mv $chr.lft.bak $chr.lft
      set chrSize = `faSize $chr.fa | awk '{print $1;}'`
      set origLines = `cat $chr.gap | wc -l`
      awk '($2 != '$chrSize'+1) {print;}' $chr.gap \
      | sed -e "s/chrUn/$chr/" > $chr.gap2
      set newLines = `cat $chr.gap2 | wc -l`
      if ($newLines == ($origLines - 1)) then
        mv $chr.gap2 $chr.gap
      else
        echo "Error: $chr/$chr.gap2 has wrong number of lines."
      endif
      popd
    end
    # Call the gaps unbridged in chrU and chr*h:
    foreach c (U ?h)
      set chr = chr$c
      sed -e 's/yes/no/' $c/$chr.gap > $c/$chr.gap2
      mv $c/$chr.gap2 $c/$chr.gap
    end
    ssh hgwdev
    hgLoadGap dm2 /cluster/data/dm2


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 9/9/04 angie)
    # This used to hold scripts -- better to keep them inline in the .doc 
    # so they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/dm2/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/dm2/bed


# MAKE LIFTALL.LFT (DONE 9/9/04 angie)
    ssh kksilo
    cd /cluster/data/dm2
    cat ?{,?}/chr*.lft > jkStuff/liftAll.lft


# RUN REPEAT MASKER (DONE 9/9/04 angie)
    # Note: drosophila library ("drosophila.lib") is dated May 27 '03.
    # Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
    # RepeatMasker runs manageable on the cluster ==> results need lifting.

    # Split contigs into 500kb chunks:
    ssh kksilo
    cd /cluster/data/dm2
    foreach d ( */chr*_?{,?} )
      cd $d
      set contig = $d:t
      faSplit -minGapSize=100 -lift=$contig.lft -maxN=500000 \
        gap $contig.fa 500000 ${contig}_
      cd ../..
    end

    #- Make the run directory and job list:
    cd /cluster/data/dm2
    cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/dm2/$2
/bin/cp $2 /tmp/dm2/$2/
cd /tmp/dm2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/dm2/$2/$2.out ./
if (-e /tmp/dm2/$2/$2.tbl) /bin/cp /tmp/dm2/$2/$2.tbl ./
if (-e /tmp/dm2/$2/$2.cat) /bin/cp /tmp/dm2/$2/$2.cat ./
/bin/rm -fr /tmp/dm2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/dm2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/dm2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDrosophila
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach d ( ?{,?}/chr*_?{,?} )
      set ctg = $d:t
      foreach f ( $d/${ctg}_?{,?}.fa )
        set f = $f:t
        echo /cluster/data/dm2/jkStuff/RMDrosophila \
             /cluster/data/dm2/$d $f /cluster/data/dm2/$d \
           '{'check out line+ /cluster/data/dm2/$d/$f.out'}' \
        >> RMRun/RMJobs
      end
    end

    # do the run
    ssh kk9
    cd /cluster/data/dm2/RMRun
    para create RMJobs
    para try, check, push, check,...
#Completed: 288 of 288 jobs
#Average job time:                3169s      52.82m     0.88h    0.04d
#Longest job:                     4752s      79.20m     1.32h    0.06d
#Submission to last job:         13769s     229.48m     3.82h    0.16d

    # Lift up the split-contig .out's to contig-level .out's
    ssh kksilo
    cd /cluster/data/dm2
    foreach d ( ?{,?}/chr*_?{,?} )
      cd $d
      set contig = $d:t
      liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
      cd ../..
    end

    # Lift up the contig-level .out's to chr-level
    foreach c (?{,?})
      cd $c
      if (-e chr$c.lft && ! -z chr$c.lft) then
        echo lifting $c
        /cluster/bin/i386/liftUp chr$c.fa.out chr$c.lft warn \
          `awk '{print $2"/"$2".fa.out";}' chr$c.lft` > /dev/null
      else
        echo Can\'t find $c/chr$c.lft \!
      endif
      cd ..
    end

    # Load the .out files into the database with:
    ssh hgwdev
    hgLoadOut dm2 /cluster/data/dm2/?{,?}/*.fa.out


# SIMPLE REPEATS (TRF) (DONE 9/9/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm2/bed/simpleRepeat
    cd /cluster/data/dm2/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach f (/cluster/data/dm2/?{,?}/chr*_*/chr?{,?}_?{,?}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    # When job is done do:
    liftUp simpleRepeat.bed /cluster/data/dm2/jkStuff/liftAll.lft warn \
      trf/*.bed

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed dm2 simpleRepeat \
      /cluster/data/dm2/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 9/9/04 angie)
    # make a filtered version # of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/dm2/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
        echo "filtering $f"
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/dm2
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.lft warn \
        `awk '{print "bed/simpleRepeat/trfMask/"$2".bed";}' $c/chr$c.lft`
    end


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 9/9/04 angie)
    ssh kksilo
    cd /cluster/data/dm2
    foreach c (?{,?})
      echo repeat- and trf-masking chr$c.fa
      /cluster/home/kent/bin/i386/maskOutFa -soft \
        $c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa
      /cluster/home/kent/bin/i386/maskOutFa -softAdd \
        $c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.fa
    end
    foreach c (?{,?})
      echo repeat- and trf-masking contigs of chr$c
      foreach ctgFa ($c/chr*/chr${c}_?{,?}.fa)
        set trfMask=bed/simpleRepeat/trfMask/$ctgFa:t:r.bed
        /cluster/home/kent/bin/i386/maskOutFa -soft $ctgFa $ctgFa.out $ctgFa
        /cluster/home/kent/bin/i386/maskOutFa -softAdd $ctgFa $trfMask $ctgFa
      end
    end


# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 9/9/04 angie)
    # Translate to nib
    ssh kksilo
    cd /cluster/data/dm2
    mkdir nib
    foreach c (?{,?})
      faToNib -softMask $c/chr$c.fa nib/chr$c.nib
    end

    # Make symbolic links from /gbdb/dm2/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/dm2/nib
    foreach f (/cluster/data/dm2/nib/chr*.nib)
      ln -s $f /gbdb/dm2/nib
    end

    # Load /gbdb/dm2/nib paths into database and save size info.
    hgsql dm2  < ~/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib dm2 /gbdb/dm2/nib /cluster/data/dm2/?{,?}/chr?{,?}.fa
    echo "select chrom,size from chromInfo" | hgsql -N dm2 \
      > /cluster/data/dm2/chrom.sizes


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 9/9/04 angie)
    # Copy all the data from the table "grp" 
    # in the existing database dm1 to the new database
    ssh hgwdev
    hgsql dm2 -e "create table grp (PRIMARY KEY(NAME)) select * from dm1.grp"


# MAKE GCPERCENT (DONE 2/2/04 angie)
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/gcPercent
     cd /cluster/data/dm2/bed/gcPercent
     # create and load gcPercent table
     hgsql dm2  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent dm2 ../../nib


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DROSOPHILA (DONE 9/9/04 angie)
    # Warning: must genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("dm2", "Apr. 2004", "/gbdb/dm2/nib", "D. melanogaster", \
               "chr2L:825964-851061", 1, 55, "D. melanogaster", \
                "Drosophila melanogaster", "/gbdb/dm2/html/description.html", \
                0, 0, "BDGP v. 4 / DHGP v. 3.2");' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'update defaultDb set name = "dm2" where genome = "D. melanogaster"' \
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add dm2 to DBS and do
    make update

    # go public on genome-test
    cvs commit makefile
    make alpha

    # Add trackDb directories and description.html
    mkdir drosophila/dm2
    cvs add drosophila/dm2
    # Write ~/kent/src/hg/makeDb/trackDb/drosophila/dm2/description.html 
    # with a description of the assembly and some sample position queries.  
    chmod a+r drosophila/dm2/description.html
    # Check it in and copy (via "make alpha" in trackDb/) to 
    # /cluster/data/dm2/html/.  
    cvs add drosophila/dm2/description.html
    cvs commit drosophila/dm2
    mkdir -p /gbdb/dm2/html
    make alpha


# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DROSOPHILA (DONE 2/?/05 galt/angie)
    ssh hgwdev
    # Get appropriate hostname and port numbers from cluster admins:
    echo 'insert into blatServers values("dm2", "blat14", 17794, 1, 0); \
          insert into blatServers values("dm2", "blat14", 17795, 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest


# PUT NIBS ON ISCRATCH (DONE 9/9/04 angie)
    ssh kkr1u00
    mkdir /iscratch/i/dm2
    cd /iscratch/i/dm2
    cp -pR /cluster/data/dm2/nib .
    iSync
    # Added "contigs" (chunks) 9/16/04
    mkdir maskedContigs
    cp -p /cluster/data/dm2/*/chr*_*/chr?{,?}_?{,?}.fa maskedContigs
    iSync


# AUTO UPDATE GENBANK MRNA RUN  (DONE 2/1/05 angie)
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf and add these lines:
# dm2 (D. melanogaster)
dm2.genome = /iscratch/i/dm2/nib/chr*.nib
dm2.lift = /cluster/data/dm2/jkStuff/liftAll.lft
dm2.genbank.mrna.xeno.load = yes
dm2.genbank.est.xeno.load = no
dm2.downloadDir = dm2

    cvs ci etc/genbank.conf

    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, refseq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dm2 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad dm2
    featureBits dm2 refGene
#28230571 bases of 131698467 (21.436%) in intersection
    # Clean up:
    rm -rf work/initial.dm2

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dm2 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad dm2
    featureBits dm2 mrna
#23752517 bases of 131698467 (18.036%) in intersection
    featureBits dm2 xenoMrna
#5943805 bases of 131698467 (4.513%) in intersection
    # Clean up:
    rm -rf work/initial.dm2

    ssh eieio
    # -initial for ESTs:
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial dm2 &
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep dm2
    featureBits dm2 intronEst
#12644283 bases of 131698467 (9.601%) in intersection
    featureBits dm2 est
#30993609 bases of 131698467 (23.534%) in intersection
    # Clean up:
    rm -rf work/initial.dm2


# PRODUCING GENSCAN PREDICTIONS (DONE 9/10/04 angie)
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/genscan
    cd /cluster/data/dm2/bed/genscan
    cvs co hg3rdParty/genscanlinux
    ssh kksilo
    cd /cluster/data/dm2/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Make hard-masked contigs
    foreach f (/cluster/data/dm2/?{,?}/chr*/chr?{,?}_?{,?}.fa)
      maskOutFa $f hard $f.masked
    end
    # Generate a list file, contigs.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f contigs.list
    touch contigs.list
    foreach f ( `ls -1S /cluster/data/dm2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> contigs.list
    end
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 contigs.list single gsub jobList
    # Run on small cluster -- genscan needs big mem.
    ssh kki
    cd /cluster/data/dm2/bed/genscan
    para create jobList
    para try, check, push, check, ...
#Completed: 81 of 81 jobs
#Average job time:                  48s       0.80m     0.01h    0.00d
#Longest job:                      212s       3.53m     0.06h    0.00d
#Submission to last job:           589s       9.82m     0.16h    0.01d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    
    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/dm2/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/dm2/bed/genscan
    ldHgGene -gtf dm2 genscan genscan.gtf
    hgPepPred dm2 generic genscanPep genscan.pep
    hgLoadBed dm2 genscanSubopt genscanSubopt.bed
    featureBits dm2 genscan
#24711981 bases of 131698467 (18.764%) in intersection

# MYTOUCH FIX - Jen - 2006-01-24
  sudo mytouch dm2 genscanPep 0412141800.00

# MAKE DOWNLOADABLE FILES (DONE 1/26/05 angie)
    ssh kksilo
    cd /cluster/data/dm2
    mkdir zips
    zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out
    zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa
    foreach f (?{,?}/chr?{,?}.fa)
      maskOutFa $f hard $f.masked
    end
    zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked
    cd bed/simpleRepeat
    zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed
    cd ../..
    # Make a starter mrna.zip -- it will get updated regularly on the RR. 
    /cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank \
      -db=dm2 -native genbank mrna zips/mrna.fa
    gzip zips/mrna.fa
    foreach f (zips/*.zip)
      echo $f
      unzip -t $f | tail -1
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2
    cd /usr/local/apache/htdocs/goldenPath/dm2
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cp -p /cluster/data/dm2/zips/*.{zip,gz} bigZips
    cd bigZips
    md5sum *.{zip,gz} > md5sum.txt


# BLASTZ D.PSEUDOOBSCURA (DONE 9/10/04 angie)
# CHAIN PSEUDOOBSCURA BLASTZ (DONE 9/10/04 angie)
# NET PSEUDOOBSCURA BLASTZ (DONE 9/11/04 angie)
# GENERATE DP2 MAF FOR MULTIZ FROM NET (DONE 9/11/04 angie)
# MAKE VSDP2 DOWNLOADABLES (DONE 2/15/05 angie)
# -- dp2; Originally run with default blastz params; obsoleted by dp3 which 
# was later run with better params, below.

# BLASTZ ANOPHELES (DONE 9/13/04 angie)
# CHAIN ANOPHELES BLASTZ (DONE 9/13/04 angie)
# NET ANOPHELES BLASTZ (DONE 9/13/04 angie)
# MAKE VSANOGAM1 DOWNLOADABLES (DONE 1/28/05 angie)
# GENERATE ANOGAM1 MAF FOR MULTIZ FROM NET (DONE 9/13/04 angie)
# -- Originally run with "human-fugu" blastz params; replaced by a run with 
# better params, below.


# MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie)
    # put the MAFs on bluearc
    ssh kksilo
    mkdir -p /cluster/bluearc/multiz.flymo/my
    cp /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \
      /cluster/bluearc/multiz.flymo/my
    mkdir -p /cluster/bluearc/multiz.flymo/mp
    cp /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \
      /cluster/bluearc/multiz.flymo/mp
    mkdir -p /cluster/bluearc/multiz.flymo/ma
    cp /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \
      /cluster/bluearc/multiz.flymo/ma

    ssh kki
    mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1
    mkdir mypa
    # Use PSU's new var_multiz:
    cat << '_EOF_' > doMultiz
#!/bin/csh -ef
set path = (/cluster/bin/penn/var_multiz.2004.08.12 $path)
set chr = $1:t:r
if (-s $1 && -s $2) then
  set tmp = /scratch/$chr.tmp.maf
  var_multiz $1 $2 0 0 > $tmp
  maf_project $tmp dm2.$chr > /scratch/$chr.myp.maf
  rm $tmp
else if (-s $1) then
  cp $1 /scratch/$chr.myp.maf
else if (-s $2) then
  cp $2 /scratch/$chr.myp.maf
endif
if (-s /scratch/$chr.myp.maf && -s $3) then
  set tmp = /scratch/$chr.tmp.maf
  var_multiz /scratch/$chr.myp.maf $3 1 0 > $tmp
  maf_project $tmp dm2.$chr > $4
  rm $tmp
else if (-s /scratch/$chr.myp.maf) then
  cp /scratch/$chr.myp.maf $4
else if (-s $3) then
  cp $3 $4
endif
rm /scratch/$chr.myp.maf
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    cp /dev/null jobList
    foreach chr (`awk '{print $1;}' /cluster/data/dm2/chrom.sizes`)
      set f1 = /cluster/bluearc/multiz.flymo/my/$chr.maf
      set f2 = /cluster/bluearc/multiz.flymo/mp/$chr.maf
      set f3 = /cluster/bluearc/multiz.flymo/ma/$chr.maf
      echo "doMultiz $f1 $f2 $f3 mypa/$chr.maf" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 13 of 13 jobs
#Average job time:                 246s       4.09m     0.07h    0.00d
#Longest job:                      811s      13.52m     0.23h    0.01d
#Submission to last job:           811s      13.52m     0.23h    0.01d
    du -sh mypa
#417M    mypa

    # clean up bluearc
    rm -r /cluster/bluearc/multiz.flymo

    # setup external files for database reference
    ssh hgwdev
    mkdir /gbdb/dm2/mzDy1Dp2Ag1_phast
    ln -s /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/chr*.maf \
      /gbdb/dm2/mzDy1Dp2Ag1_phast/
    # load into database
    cd /tmp
    hgLoadMaf -warn dm2 mzDy1Dp2Ag1_phast
    cd /gbdb/dm2
    mkdir d_pseudoobscura_mypa d_yakuba_mypa a_gambiae_mypa
    cd /tmp
    ln -s /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \
      /gbdb/dm2/d_pseudoobscura_mypa
    hgLoadMaf -WARN dm2 d_pseudoobscura_mypa
    ln -s /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \
      /gbdb/dm2/d_yakuba_mypa
    hgLoadMaf -WARN dm2 d_yakuba_mypa
    ln -s /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \
      /gbdb/dm2/a_gambiae_mypa
    hgLoadMaf -WARN dm2 a_gambiae_mypa


# PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie)
    ssh kksilo
    # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
    mkdir -p /cluster/bluearc/dm2/chrom
    cp -p /cluster/data/dm2/?{,?}/chr*.fa /cluster/bluearc/dm2/chrom/
    ssh kki
    mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
    mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split
    set WINDOWS = /cluster/bluearc/dm2/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/dm2/chrom
set WINDOWS=/cluster/bluearc/dm2/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O dm2,droYak1,dp2,anoGam1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in line+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 13 of 13 jobs
#Average job time:                  23s       0.38m     0.01h    0.00d
#Longest job:                       53s       0.88m     0.01h    0.00d
#Submission to last job:            53s       0.88m     0.01h    0.00d
    cd ..

    # use the model previously estimated (see makeDm1.doc) as a starting model
    sed -e 's/dm1/dm2/g' /cluster/data/dm1/bed/phastCons4way/rev-dg.mod \
      > starting-tree.mod
    # -- Because of the very long branch length to anoGam1 being pretty 
    # much impossible to estimate from alignment data, edit that file to 
    # reduce the anoGam1 branch length from 2.66 to 0.5.  Otherwise 
    # estimation process blows up.  So our starting tree becomes
#TREE: (((dm2:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:0.5);

    # Get genome-wide average GC content (for all species together,
    # not just the reference genome).  If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file.  E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.276938 0.223190 0.223142 0.276730
    # add up the C and G:
    awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.446
    # Great, use 0.446 as the --gc parameter in phastCons below:.

    # Now set up cluster job to estimate model parameters.  
    # Parameters will be estimated separately for each alignment fragment 
    # then will be combined across fragments.
    # Use --gc from above, and --target-coverage computed as follows:
    # 1. Jim would like phastConsElements coverage to be 25% for fly.
    # 2. 83% of dm2 is covered by chainDroYak1Link, so use .25 / .83 = .30 
    #    as an initial --target-coverage.
    # 3. If actual coverage is different from our target, come back to this 
    #    step, adjust --target-coverage and rerun up through phastConsElements.
    # -- Actually, Adam suggests starting with what proved to work for worm:
    #    --target-coverage 0.4 and --expected-lengths 25 (not 12 as for mammal)
    # -- OK, that led to too-high coverage:
    # 51511266 bases of 131698467 (39.113%) in intersection
    #    so next try 0.3, 30:
    # 49886398 bases of 131698467 (37.879%) in intersection
    #   how about 0.3, 20?
    # 42067218 bases of 131698467 (31.942%) in intersection
    #   0.25, 20?
    # 38397215 bases of 131698467 (29.155%) in intersection
    #   0.25, 15?
    # 35039841 bases of 131698467 (26.606%) in intersection
    # -- that's close enough to the target.  If the "bumpiness" of the 
    #    wiggle looks aesthetically pleasing then we're done.
    # -- OK, Adam would like to see more smoothing so that coding exons 
    #    stand out better, so try 0.20, 25:
    # 37574714 bases of 131698467 (28.531%) in intersection
    # More smooting would be nice for coding exons, so tried 0.20, 50, 
    # but Adam didn't like that as much overall so stick with 0.20, 25.  

    mkdir run.estimate
    cd run.estimate
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.435 --nrates 1,1 \
    --no-post-probs --ignore-missing --expected-lengths 25 \
    --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    ssh kk9
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate
    para create jobList
    para try, check, push, check, ...
#Completed: 139 of 139 jobs
#Average job time:                 133s       2.22m     0.04h    0.00d
#Longest job:                      221s       3.68m     0.06h    0.00d
#Submission to last job:           328s       5.47m     0.09h    0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kksilo
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
#ave.cons.mod:TREE: (((dm2:0.028707,droYak1:0.019740):0.039091,dp2:0.065825):0.086142,anoGam1:0.086142);
#ave.noncons.mod:TREE: (((dm2:0.118202,droYak1:0.079147):0.162923,dp2:0.275544):0.360361,anoGam1:0.360361);
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  It's all downhill
    # from here.
    ssh kk9
    mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \
    --idpref $pref \
    --viterbi /cluster/bluearc/dm2/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 139 of 139 jobs
#Average job time:                  23s       0.38m     0.01h    0.00d
#Longest job:                       49s       0.82m     0.01h    0.00d
#Submission to last job:            52s       0.87m     0.01h    0.00d

    # back on kksilo:
    # combine predictions and transform scores to be in 0-1000 interval
    # do in a way that avoids limits on numbers of args
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /cluster/bluearc/dm2/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
    featureBits dm2 all.bed
#37574714 bases of 131698467 (28.531%) in intersection
    # OK, close enough.
    hgLoadBed dm2 phastConsElements all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib
    rm -rf /cluster/bluearc/dm2/phastCons/wib
    mkdir -p /cluster/bluearc/dm2/phastCons/wib
    cat << 'EOF' > doWigAsciiToBinary
#!/bin/csh -ef
set chr = $1
zcat `ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigAsciiToBinary -chrom=$chr \
    -wibFile=/cluster/bluearc/dm2/phastCons/wib/${chr}_phastCons stdin 
'EOF'
# << for emacs
    chmod a+x doWigAsciiToBinary
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigAsciiToBinary $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 13 of 13 jobs
#Average job time:                  14s       0.24m     0.00h    0.00d
#Longest job:                       36s       0.60m     0.01h    0.00d
#Submission to last job:            36s       0.60m     0.01h    0.00d

    # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/dm2/phastCons/wib .
    rsync -av /cluster/bluearc/dm2/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir -p /gbdb/dm2/wib/mzDy1Dp2Ag1_phast
    cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm2/wib/mzDy1Dp2Ag1_phast/
    hgLoadWiggle dm2 mzDy1Dp2Ag1_phast_wig \
      -pathPrefix=/gbdb/dm2/wib/mzDy1Dp2Ag1_phast wib/*.wig

#NOT DONE -- still using dm1 for this:
    # make top-5000 list and launcher on Adam's home page:
    sort -k5,5nr raw.bed | head -5000 > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/dm-top5000-4way \
      "top 5000 conserved elements (4way)" dm2

    # and clean up bluearc.
    rm -r /cluster/bluearc/dm2/phastCons
    rm -r /cluster/bluearc/dm2/chrom


# LIFTOVER BDGP 3.2 ANNOTATIONS FROM DM1 (TEMPORARY) (DONE 9/15/04 angie)
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/bdgp3.2.liftOver
    cd /cluster/data/dm2/bed/bdgp3.2.liftOver
    hgsql dm1 -N -e 'select * from bdgpGene' > dm1.bdgpGene.tab
    hgsql dm1 -N -e 'select * from bdgpNonCoding' > dm1.bdgpNonCoding.tab
    ssh kksilo
    cd /cluster/data/dm2/bed/bdgp3.2.liftOver
# lift files to try out:
# blastz:
#  /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain/dm1ToDm2.over.chain \
# blat on dm1 chroms vs. dm2 2Mb chunks:
#  /cluster/data/dm1/bed/blat.dm2.2004-09-16/dm1ToDm2.over.chain \
# blat on dm1 chroms vs. dm2 3kb chunks:
#  /cluster/data/dm1/bed/blat.dm2.2004-09-17/dm1ToDm2.over.chain \
    liftOver -genePred dm1.bdgpGene.tab \
      /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
      bdgpLiftGene.tab bdgpLiftGene.unmapped.tab
    liftOver -genePred dm1.bdgpNonCoding.tab \
      /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
      bdgpLiftNonCoding.tab bdgpLiftNonCoding.unmapped.tab
    # Not perfect but not bad:
    wc -l bdgp*.tab
#  18707 bdgpLiftGene.tab
#     78 bdgpLiftGene.unmapped.tab
#   2134 bdgpLiftNonCoding.tab
#     62 bdgpLiftNonCoding.unmapped.tab

    ssh hgwdev
    cd /cluster/data/dm2/bed/bdgp3.2.liftOver
    ldHgGene -predTab dm2 bdgpLiftGene bdgpLiftGene.tab
    ldHgGene -predTab dm2 bdgpLiftNonCoding bdgpLiftNonCoding.tab
    # Copy over the gene info tables
    hgsql dm2 -e 'create table bdgpLiftGeneInfo (PRIMARY KEY(bdgpName(7)), INDEX(flyBaseId(11))) select * from dm1.bdgpGeneInfo'
    hgsql dm2 -e 'create table bdgpLiftNonCodingInfo (INDEX(bdgpName(16)), INDEX(flyBaseId(11))) select * from dm1.bdgpNonCodingInfo'


# FLYBASE ANNOTATIONS (DONE 1/12/05 angie)
# REPLACED 2/23/05 -- SEE "FLYBASE 4.1" BELOW
    ssh kksilo
    mkdir /cluster/data/dm2/bed/flybase
    cd /cluster/data/dm2/bed/flybase
    foreach c (2L 2R 3L 3R 4 X)
      wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/gff/dmel-$c-r4.0.gff.gz
    end
    zcat *.gff.gz > flybase.gff3
    # What data sources are represented in this file?
    grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
    # excerpt (many other sources, including blastx:... , sim4:... and 
    # tblastx:...; also various other types for source "."):
  18747 .       CDS
  62629 .       exon
  13472 .       gene
  19301 .       mRNA
     70 .       ncRNA
     40 .       pseudogene
     96 .       rRNA
     28 .       snRNA
     28 .       snoRNA
    288 .       tRNA
  36921 .       transcription_start_site
   1571 .       transposable_element
   4680 .       transposable_element_insertion_site

    # What keywords are defined in the 9th field?
    grep -v '^#' flybase.gff3 \
    | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
    | sort | uniq -c
    # This incarnation of gff3 looks similar to the one encountered in 
    # dp3, but still uses slightly different keywords and there are different 
    # data sources of interest.  So one-shot perl-script again:
    extractGenes.pl flybase.gff3

    # Get predicted proteins (for main annotations only)
    wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/fasta/dmel-all-translation-r4.0.fasta.gz
    zcat dmel-all-translation-r4.0.fasta.gz \
    | perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa

    ssh hgwdev
    cd /cluster/data/dm2/bed/flybase
    # Protein-coding genes:
    ldHgGene -gtf dm2 flyBaseGene flybase.gtf
    hgPepPred dm2 generic flyBasePep flybasePep.fa
    # Noncoding genes:
    hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
    # Cross-referencing info for both coding and noncoding:
    hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql
    hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \
      into table flyBase2004Xref'

    # add upstream* downloadable files (added 2/10/05)
    cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
    foreach size (1000 2000 5000)
      echo upstream$size
      featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
      | gzip -c > upstream$size.fa.gz
    end
    md5sum upstream*.fa.gz >> md5sum.txt


# BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie)
# CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
# NET PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
# GENERATE DP3 AXTNET AND MAF FOR MULTIZ (DONE 1/18/05 angie)
# MAKE VSDP3 DOWNLOADABLES (DONE 1/18/05 angie)
# -- Originally run with default blastz params; replaced by a run with 
# better params, below.


# BLASTZ.V7 EVAL: BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie)
    ssh kk9
    mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
    cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
    ln -s blastzv7.dp3.2005-01-18 /cluster/data/dm2/bed/blastzv7.dp3
    # Note the full path for BLASTZ so we get blastz.v7 (which is not in 
    # PSU's CVS, argh!):
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.pseudoobscura
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=/cluster/bin/penn/blastz.2004-12-27/blastz-source/blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
# unused: SEQ1_RMSK=
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp3/nib
# unused: SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm2/bed/blastzv7.dp3.2005-01-18

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy

    # Create run dir, job list, and raw/ dir structure:
    mkdir run
    ~/kent/src/utils/blastz-make-joblist DEF \
      /cluster/data/dm2/chrom.sizes /cluster/data/dp3/chrom.sizes \
    > run/j
    csh -ef ./xdir.sh
    cd run
    # cluster run
    para create j
    para try, check, push, check, ....
#Completed: 552 of 552 jobs
#Average job time:                 100s       1.66m     0.03h    0.00d
#Longest job:                      480s       8.00m     0.13h    0.01d
#Submission to last job:           733s      12.22m     0.20h    0.01d

    # back on kksilo...
    mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
    cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
    /cluster/bin/scripts/blastz-make-out2lav ../DEF .. > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
    para create j
    para try, check, push, check, ....
#Completed: 23 of 23 jobs
#Average job time:                   8s       0.14m     0.00h    0.00d
#Longest job:                       29s       0.48m     0.01h    0.00d
#Submission to last job:            60s       1.00m     0.02h    0.00d

    # Translate .lav to axt:
    ssh kksilo
    cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
    rm -r raw
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm2/nib /cluster/data/dp3/nib stdout \
        | axtSort stdin ../../$out
      popd
    end


# BLASTZ.V7 EVAL: CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir chain
    ls -1S /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
  /iscratch/i/dm2/nib \
  /iscratch/i/dp3/nib stdout \
| chainAntiRepeat /iscratch/i/dm2/nib /iscratch/i/dp3/nib \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 13 of 13 jobs
#Average job time:                  10s       0.17m     0.00h    0.00d
#Longest job:                       22s       0.37m     0.01h    0.00d
#Submission to last job:            22s       0.37m     0.01h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm2 ${c}_chainBz7Dp3 $i
    end
    # Compare to regular (blastz.v6) chains:
    featureBits dm2 chainDp3Link
#76557614 bases of 131698467 (58.131%) in intersection
    featureBits dm2 chainBz7Dp3Link
#76340831 bases of 131698467 (57.966%) in intersection
    # Looks like v6 found more than v7 in general, but not too different.
    # Look at some cases where blastz.v7 found something but v6 didn't:
    featureBits dm2 chainBz7Dp3Link \!chainDp3Link -minSize=20 -bed=stdout
#chrYh   212776  212811  chrYh.1
...
#chr3R   18027130        18027160        chr3R.59
...
#chr2h   300968  301032  chr2h.1
...
#34843 bases of 131698467 (0.026%) in intersection


    # BLATZ EVAL: D. PSEUDOOBSCURA (DP3) (IN PROGRESS 1/24/05 angie)
    ssh kk9
    mkdir /cluster/data/dm2/bed/blatz.dp3.2004-01-19
    cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19
    mkdir chainRaw
    partitionSequence.pl 10000000 10000 /iscratch/i/dm2/nib \
      /cluster/data/dm2/chrom.sizes > dm2.lst
    partitionSequence.pl 10000000 10000 /iscratch/i/dp3/nib \
      /cluster/data/dp3/chrom.sizes > dp3.lst
    cat << '_EOF_' > gsub
#LOOP
blatz $(path1) $(path2) {check out line chainRaw/$(file1)_$(file2).chain}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 dm2.lst dp3.lst gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 549 of 552 jobs
#Crashed: 3 jobs
#Average job time:                 289s       4.82m     0.08h    0.00d
#Longest job:                     1664s      27.73m     0.46h    0.02d
#Submission to last job:          6206s     103.43m     1.72h    0.07d
    # With default blatz params, 3 jobs crashed due to out-of-mem, 
    # all others succeeded.  Try again with thresholds more like 
    # blastz human-mouse, i.e. -minGapless=3000?  No, Jim says forget 
    # the crashers for now (narrow down later) and just forge ahead 
    # with the comparison to blastz (v6).
    ssh kksilo
    cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19
    sed -e 's@/iscratch/i/d../nib/@@g; s/.nib//g;' chainRaw/* \
    | chainMergeSort stdin > all.chain
    chainSplit chain all.chain

    # take a look at score distr's -- LOTS of low-scoring chains
    foreach f (chain/*.chain)
      grep ^chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Jim recommends -minScore=5000 for cross-species.  Filter for now:
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    chainSplit chain all.chain
    gzip all.chain.unfiltered

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm2 ${c}_chainBz1Dp3 $i
    end
    # Compare to regular (blastz.v6) chains:
    featureBits dm2 -enrichment flyBaseGene:CDS chainDp3Link
#flyBaseGene:CDS 16.551%, chainDp3Link 58.131%, both 15.303%, cover 92.46%, enrich 1.59x
    featureBits dm2 -enrichment flyBaseGene:CDS chainBz1Dp3Link
#flyBaseGene:CDS 16.551%, chainBz1Dp3Link 79.858%, both 16.179%, cover 97.76%, enrich 1.22x
    # Look at some examples where blatz covers some CDS that blastz doesn't:
    featureBits -chrom=chr3R dm2 flyBaseGene:CDS chainBz1Dp3Link \
      \!chainDp3Link -minSize=30 -bed=/tmp/bz1Cds.bed


# BLASTZ D.YAKUBA (DONE 1/26/05 angie)
# CHAIN YAKUBA BLASTZ (DONE 1/26/05 angie)
# NET YAKUBA BLASTZ (DONE 1/26/05 angie)
# GENERATE DROYAK1 MAF FOR MULTIZ FROM NET (DONE 1/26/05 angie)
# MAKE VSDROYAK1 DOWNLOADABLES (DONE 1/26/05 angie)
# -- Originally run with default blastz params; replaced by a run with 
# better params, below.


# BLASTZ D.ANANASSAE (DONE 1/28/05 angie)
# CHAIN ANANASSAE BLASTZ (DONE 1/28/05 angie)
# NET ANANASSAE BLASTZ (DONE 1/28/05 angie)
# GENERATE DROANA1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
# MAKE VSDROANA1 DOWNLOADABLES (DONE 1/28/05 angie)
# -- Originally run with default blastz params; replaced by a run with 
# better params, below.


# BLASTZ D.VIRILIS (DONE 1/28/05 angie)
# CHAIN VIRILIS BLASTZ (DONE 1/28/05 angie)
# NET VIRILIS BLASTZ (DONE 1/28/05 angie)
# GENERATE DROVIR1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
# MAKE VSDROVIR1 DOWNLOADABLES (DONE 1/28/05 angie)
# -- Originally run with default blastz params; replaced by a run with 
# better params, below.


# BLASTZ D.MOJAVENSIS (DONE 1/28/05 angie)
# CHAIN MOJAVENSIS BLASTZ (DONE 1/28/05 angie)
# NET MOJAVENSIS BLASTZ (DONE 1/28/05 angie)
# GENERATE DROMOJ1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
# MAKE VSDROMOJ1 DOWNLOADABLES (DONE 1/28/05 angie)
# -- Originally run with default blastz params; replaced by a run with 
# better params, below.


# BLASTZ A.MELLIFERA (DONE 1/28/05 angie)
# CHAIN MELLIFERA BLASTZ (DONE 1/31/05 angie)
# NET MELLIFERA BLASTZ (DONE 1/31/05 angie)
# GENERATE APIMEL1 AXTNET AND MAF FOR MULTIZ (DONE 1/31/05 angie)
# MAKE VSAPIMEL1 DOWNLOADABLES (DONE 1/31/05 angie)
# -- Originally run on apiMel2 with "human-fugu" blastz params; outdated by 
# apiMel2 which was later run with better params, below.


# BLASTZ A.MELLIFERA 2.0 (DONE 2/8/05 Andy)
# CHAIN MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy)
# NET MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy)
# GENERATE APIMEL2 AXTNET AND MAF FOR MULTIZ (DONE 2/9/05 Andy)
# MAKE VSAPIMEL2 DOWNLOADABLES (DONE 2/9/2005 Andy)
# -- Originally run with "human-fugu" blastz params; replaced by a run with 
# better params, below.


# MULTIZ.V10 8WAY (6 FLIES, MOSQUITO, HONEYBEE) (REDONE 3/7/05 angie
# originally done 2/1/05
# REDONE 3/7/05 angie -- User found that yakuba was missing (due to 
# pairwise maf files having different .suffixes, and the script tolerating 
# missing files... yikes!!!).  :(
# REPLACED with 9WAY 5/23/05

# MULTIZ.V10 9WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 5/24/05 angie)
# Better, looser blastz params were used to redo alignments of all
# other insects to dm2.  Much better coverage on the pairwise inputs
# now, so re-run multiz and phastCons to get a better Conservation
# track... also using dp3 and apiMel2 instead of dp2 and apiMel1, 
# and adding droSim1.
    # Tree (9-way):
    # ((((((dm2 droYak1) droAna1) dp3) (droVir1 droMoj1)) anoGam1) apiMel2)
    ssh kkstore01
    mkdir /cluster/data/dm2/bed/multiz9way.2005-05-23
    ln -s /cluster/data/dm2/bed/multiz9way.2005-05-23 \
      /cluster/data/dm2/bed/multiz9way
    cd /cluster/data/dm2/bed/multiz9way
    # Setup: Copy pairwise MAF to /santest/scratch:
    mkdir /santest/scratch/flyMultiz9way
    foreach db (droSim1 droYak1 droAna1 dp3 droVir1 droMoj1 anoGam1 apiMel2)
      cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \
        /santest/scratch/flyMultiz9way/$db
    end
    ls -lLR /santest/scratch/flyMultiz9way
    # Make output dir:
    mkdir maf
    # Create script to run multiz.v10 in the right order:
    cat << '_EOF_' > doMultiz.csh
#!/bin/csh -fe
set chr = $1
set tmp = /scratch/flyMultiz9way.$chr
mkdir $tmp

set REF = dm2.$chr
set SIM = /santest/scratch/flyMultiz9way/droSim1/$chr.maf.gz
set YAK = /santest/scratch/flyMultiz9way/droYak1/$chr.maf.gz
set ANA = /santest/scratch/flyMultiz9way/droAna1/$chr.maf.gz
set PSE = /santest/scratch/flyMultiz9way/dp3/$chr.maf.gz
set VIR = /santest/scratch/flyMultiz9way/droVir1/$chr.maf.gz
set MOJ = /santest/scratch/flyMultiz9way/droMoj1/$chr.maf.gz
set ANO = /santest/scratch/flyMultiz9way/anoGam1/$chr.maf.gz
set API = /santest/scratch/flyMultiz9way/apiMel2/$chr.maf.gz
set DEST = /cluster/data/dm2/bed/multiz9way/maf/$chr.maf

set MZ10 = /cluster/bin/penn/multiz.v10
set PROJECT = /cluster/bin/penn/maf_project

if ( -s $SIM && -s $YAK ) then 
    echo "Aligning $SIM $YAK..."
    zcat $SIM > $tmp/$chr.Sim.maf
    zcat $YAK > $tmp/$chr.Yak.maf
    $MZ10 $tmp/$chr.Sim.maf $tmp/$chr.Yak.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYak.maf
else if ( -s $SIM ) then
    touch missing.$chr.Sim
    zcat $SIM > $tmp/$chr.SimYak.maf
else if ( -s $YAK ) then
    touch missing.$chr.Yak
    zcat $YAK > $tmp/$chr.SimYak.maf
endif

if ( -s $tmp/$chr.SimYak.maf && -s $ANA ) then 
    echo "Aligning $tmp/$chr.SimYak.maf $ANA..."
    zcat $ANA > $tmp/$chr.Ana.maf
    $MZ10 $tmp/$chr.SimYak.maf $tmp/$chr.Ana.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAna.maf
else if ( -s $tmp/$chr.SimYak.maf ) then
    touch missing.$chr.Ana
    cp $tmp/$chr.SimYak.maf $tmp/$chr.SimYakAna.maf
else if ( -s $ANA ) then
    touch missing.$chr.SimYak
    zcat $ANA > $tmp/$chr.SimYakAna.maf
endif

if ( -s $PSE && -s $tmp/$chr.SimYakAna.maf ) then
    echo "Adding $PSE..."
    zcat $PSE > $tmp/$chr.Pse.maf
    $MZ10 $tmp/$chr.SimYakAna.maf $tmp/$chr.Pse.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPse.maf
else if ( -s $PSE ) then
    touch missing.$chr.SimYakAna
    zcat $PSE > $tmp/$chr.SimYakAnaPse.maf
else if ( -s $tmp/$chr.SimYakAna.maf ) then
    touch missing.$chr.Pse
    cp $tmp/$chr.SimYakAna.maf $tmp/$chr.SimYakAnaPse.maf
endif 

# droVir1 and droMoj1 are a subtree -- run on just them, then fold in:
if ( -s $VIR && -s $MOJ ) then 
    echo "Aligning $VIR $MOJ..."
    zcat $VIR > $tmp/$chr.Vir.maf
    zcat $MOJ > $tmp/$chr.Moj.maf
    $MZ10 $tmp/$chr.Vir.maf $tmp/$chr.Moj.maf 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf
else if ( -s $VIR ) then
    touch missing.$chr.Moj
    zcat $VIR > $tmp/$chr.VirMoj.maf
else if ( -s $MOJ ) then
    touch missing.$chr.Vir
    zcat $MOJ > $tmp/$chr.VirMoj.maf
endif

if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.SimYakAnaPse.maf ) then
    echo "Adding $tmp/$chr.VirMoj.maf..."
    $MZ10 $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.VirMoj.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.VirMoj.maf ) then
    touch missing.$chr.SimYakAnaPse
    cp $tmp/$chr.VirMoj.maf $tmp/$chr.SimYakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.SimYakAnaPse.maf ) then
    touch missing.$chr.VirMoj
    cp $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.SimYakAnaPseVirMoj.maf
endif 

if ( -s $ANO && -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then
    echo "Adding $ANO..."
    zcat $ANO > $tmp/$chr.Ano.maf
    $MZ10 $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.Ano.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMojAno.maf
else if ( -s $ANO ) then
    touch missing.$chr.SimYakAnaPseVirMoj
    zcat $ANO > $tmp/$chr.SimYakAnaPseVirMojAno.maf
else if ( -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then
    touch missing.$chr.Ano
    cp $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.SimYakAnaPseVirMojAno.maf
endif 

if ( -s $API && -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then
    echo "Adding $API..."
    zcat $API > $tmp/$chr.Api.maf
    $MZ10 $tmp/$chr.SimYakAnaPseVirMojAno.maf $tmp/$chr.Api.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.final.maf
    cp $tmp/$chr.final.maf $DEST
else if ( -s $API ) then
    touch missing.$chr.SimYakAnaPseVirMojAno
    zcat $API > $DEST
else if ( -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then
    touch missing.$chr.Api
    cp $tmp/$chr.SimYakAnaPseVirMojAno.maf $DEST
endif 

rm $tmp/$chr.*.maf
rmdir $tmp
'_EOF_'
    # << keep emacs coloring happy    
    chmod 755 doMultiz.csh
    awk '{print "./doMultiz.csh " $1;}' /cluster/data/dm2/chrom.sizes \
      > jobs.lst
    # Run on small cluster
    ssh kki
    cd /cluster/data/dm2/bed/multiz9way
    para create jobs.lst
    para try, check, push, check, ...
#Completed: 13 of 13 jobs
#Average job time:                2000s      33.34m     0.56h    0.02d
#Longest finished job:            5969s      99.48m     1.66h    0.07d
#Submission to last job:         15605s     260.08m     4.33h    0.18d
    ls -1 missing*
#ls: No match.

    # make /gbdb/ links to 9way maf files:
    ssh hgwdev
    mkdir -p /gbdb/dm2/multiz9way/maf/multiz9way
    ln -s /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \
      /gbdb/dm2/multiz9way/maf/multiz9way/
    # load into database
    cd /tmp
    hgLoadMaf -warn dm2 multiz9way \
      -pathPrefix=/gbdb/dm2/multiz9way/maf/multiz9way
    # load summary table to replace pairwise
    time cat /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \
    | nice hgLoadMafSummary dm2 multiz9waySummary stdin

    # put 9way MAF out for download
    ssh kkstore01
    cd /cluster/data/dm2/bed/multiz9way
    mkdir mafDownload
    foreach f (maf/*.maf)
      nice gzip -c $f > mafDownload/$f:t.gz
    end
    cd mafDownload
    md5sum *.maf.gz > md5sum.txt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz9way
    ln -s /cluster/data/dm2/bed/multiz9way/mafDownload/{*.maf.gz,md5sum.txt} \
      /usr/local/apache/htdocs/goldenPath/dm2/multiz9way
    # make a README.txt

    # Cleanup
    rm -rf /santest/scratch/flyMultiz9way/


# PHASTCONS 8WAY WITH METHODS FROM PAPER (DONE 3/8/05 angie)
# originally done 2/2/05
# REDONE 3/8/05 angie -- after regenerating multiz alignments above.
    # Same procedure as for latest dm1 8way -- using the param estimation 
    # methods described in Adam's Genome Research paper.
# REPLACED with 9way 5/23/05


# PHASTCONS 9WAY WITH METHODS FROM PAPER (DONE 5/27/05 angie)
    ssh kkstore01
    mkdir -p /santest/scratch/dm2/chrom
    cp -p /cluster/data/dm2/?{,?}/chr*.fa /santest/scratch/dm2/chrom/
    # Split chrom fa into smaller windows for phastCons:
    ssh kki
    mkdir /cluster/data/dm2/bed/multiz9way/phastCons
    mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.split
    cd /cluster/data/dm2/bed/multiz9way/phastCons/run.split
    set WINDOWS = /santest/scratch/dm2/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/santest/scratch/dm2/chrom
set WINDOWS=/santest/scratch/dm2/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
   -O dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm2/bed/multiz9way/maf/*.maf)
      if (-s $file) then
        echo "doSplit.sh {check in exists+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 13 of 13 jobs
#Average job time:                  44s       0.73m     0.01h    0.00d
#Longest finished job:             117s       1.95m     0.03h    0.00d
#Submission to last job:           117s       1.95m     0.03h    0.00d

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  
    # Use phyloFit to make an initial model:
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz9way/phastCons
    /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
        --aggregate dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \
        -i MAF -o SS > all.ss
    /cluster/bin/phast/phyloFit all.ss \
      --tree "(((((((dm2,droSim1),droYak1),droAna1),dp3),(droVir1,droMoj1)),anoGam1),apiMel2)" \
      -i SS --out-root starting-tree
    cat starting-tree.mod 
#ALPHABET: A C G T 
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -393252389.008460
#BACKGROUND: 0.286960 0.213375 0.213302 0.286363 
#RATE_MAT:
#  -0.918686    0.202545    0.451023    0.265118 
#   0.272394   -1.108053    0.230681    0.604978 
#   0.606771    0.230760   -1.109751    0.272220 
#   0.265671    0.450782    0.202768   -0.919221 
#TREE: (((((((dm2:0.031118,droSim1:0.035465):0.029649,droYak1:0.076216):0.089605,droAna1:0.210217):0.044178,dp3:0.215369):0.046188,(droVir1:0.128011,droMoj1:0.139931):0.195030):0.109752,anoGam1:0.342365):0.228550,apiMel2:0.228550);
# also get GC content from model -- if similar enough, no need to extract it 
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod 
#0.426677
    # OK, use .427 for --gc below.

    # Use the values of --target-coverage and --expected-lengths from the 
    # last iteration of the 8way run, 0.573 and 36.8.
    # Multiply each subst rate on the TREE line by 3.5 which is roughly the 
    # ratio of noncons to cons in 
    # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
    /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
      > starting-tree.noncons.mod
    /cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.8 \
      starting-tree{,.noncons}.mod
#( Solving for new omega: 36.800000 36.416935 36.415050 )
#Transition parameters: gamma=0.573000, omega=36.800000, mu=0.027174, nu=0.036465
#Relative entropy: H=1.931731 bits/site
#Expected min. length: L_min=5.081081 sites
#Expected max. length: L_max=5.398876 sites
#Total entropy: L_min*H=9.815283 bits
#Recommended expected length: omega=36.415050 sites (for L_min*H=9.783400)
    # OK, use --expected-lengths 36.4.

    ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
    # We're here because the actual target coverage was not satisfactory,
    # so we're changing the --target-coverage param.  Given that we're 
    # changing that, take a guess at how we should change --expected-lengths
    # in order to also hit the total entropy target.
    cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.52 36.8 ave.{cons,noncons}.mod
#Recommended expected length: omega=32.240755 sites (for L_min*H=9.783400)
    # OK, --expected-lengths 32.2
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod
#Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400)
    # ==> 27.1
    # FOURTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.425 27.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=25.440532 sites (for L_min*H=9.783400)
    # ==> 25.4
    # FIFTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.393 25.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.420199 sites (for L_min*H=9.783400)
    # ==> 23.4
    # SIXTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.736472 sites (for L_min*H=9.783400)
    # ==> 23.7

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    ssh kk9
    mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
    cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
    # FIRST ITERATION: Use ../starting-tree.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 36.4 --target-coverage 0.573 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 23.7 --target-coverage 0.398 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    para create jobList
    para try, check, push, check, ...#
#Completed: 139 of 139 jobs
#Average job time:                4131s      68.85m     1.15h    0.05d
#Longest finished job:            8787s     146.45m     2.44h    0.10d
#Submission to last job:          8886s     148.10m     2.47h    0.10d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.019592,droSim1:0.027067):0.014875,droYak1:0.047389):0.041218,droAna1:0.116484):0.020166,dp3:0.125948):0.018479,(droVir1:0.073409,droMoj1:0.080389):0.106180):0.056105,anoGam1:0.235476):0.127958,apiMel2:0.127958);
#ave.noncons.mod:TREE: (((((((dm2:0.059164,droSim1:0.077944):0.047044,droYak1:0.144159):0.135352,droAna1:0.373788):0.067147,dp3:0.405695):0.062350,(droVir1:0.234677,droMoj1:0.258111):0.355922):0.191534,anoGam1:0.773044):0.423894,apiMel2:0.423894);
    # SECOND ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.019133,droSim1:0.026408):0.014525,droYak1:0.046282):0.040238,droAna1:0.113841):0.019632,dp3:0.123153):0.017914,(droVir1:0.071760,droMoj1:0.078554):0.104006):0.054630,anoGam1:0.230601):0.124952,apiMel2:0.124952);
#ave.noncons.mod:TREE: (((((((dm2:0.058357,droSim1:0.076814):0.046405,droYak1:0.142227):0.133457,droAna1:0.368999):0.065975,dp3:0.400678):0.061001,(droVir1:0.231694,droMoj1:0.254750):0.352032):0.188199,anoGam1:0.764503):0.418063,apiMel2:0.418063);
    # THIRD ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.018454,droSim1:0.025454):0.014002,droYak1:0.044650):0.038757,droAna1:0.109828):0.018811,dp3:0.118845):0.017034,(droVir1:0.069220,droMoj1:0.075740):0.100602):0.052271,anoGam1:0.222690):0.120267,apiMel2:0.120267);
#ave.noncons.mod:TREE: (((((((dm2:0.057221,droSim1:0.075276):0.045493,droYak1:0.139568):0.130778,droAna1:0.362074):0.064250,dp3:0.393146):0.058938,(droVir1:0.227290,droMoj1:0.249820):0.346379):0.183000,anoGam1:0.750810):0.409058,apiMel2:0.409058);
    # FOURTH ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.018200,droSim1:0.025097):0.013808,droYak1:0.044040):0.038208,droAna1:0.108335):0.018509,dp3:0.117244):0.016712,(droVir1:0.068273,droMoj1:0.074693):0.099332):0.051396,anoGam1:0.219717):0.118553,apiMel2:0.118553);
#ave.noncons.mod:TREE: (((((((dm2:0.056840,droSim1:0.074761):0.045186,droYak1:0.138661):0.129841,droAna1:0.359716):0.063642,dp3:0.390643):0.058209,(droVir1:0.225781,droMoj1:0.248129):0.344408):0.181133,anoGam1:0.745987):0.406113,apiMel2:0.406113);
    # FIFTH ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.017875,droSim1:0.024638):0.013564,droYak1:0.043255):0.037506,droAna1:0.106409):0.018128,dp3:0.115161):0.016304,(droVir1:0.067046,droMoj1:0.073340):0.097682):0.050261,anoGam1:0.215779):0.116296,apiMel2:0.116296);
#ave.noncons.mod:TREE: (((((((dm2:0.056321,droSim1:0.074061):0.044780,droYak1:0.137445):0.128624,droAna1:0.356472):0.062873,dp3:0.387024):0.057266,(droVir1:0.223703,droMoj1:0.245817):0.341759):0.178650,anoGam1:0.739131):0.401733,apiMel2:0.401733);
    # SIXTH ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.017924,droSim1:0.024709):0.013600,droYak1:0.043377):0.037613,droAna1:0.106707):0.018185,dp3:0.115488):0.016365,(droVir1:0.067237,droMoj1:0.073549):0.097937):0.050437,anoGam1:0.216405):0.116660,apiMel2:0.116660);
#ave.noncons.mod:TREE: (((((((dm2:0.056410,droSim1:0.074182):0.044846,droYak1:0.137649):0.128814,droAna1:0.357039):0.062987,dp3:0.387708):0.057413,(droVir1:0.224066,droMoj1:0.246214):0.342186):0.179046,anoGam1:0.740329):0.402594,apiMel2:0.402594);

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Check the total entropy figure to see if we're way off.
    # We probably don't need to do this, since it has always been very close 
    # if not the same as what we used above, but it only takes a second.
    # FIRST ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=36.779882 sites (for L_min*H=9.783400)
    # OK, tweak --expected-lengths to 36.8 below (and for next iteration).
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.52 32.2 ave.{cons,noncons}.mod
#Recommended expected length: omega=32.239216 sites (for L_min*H=9.783400)
    # ==> keep at 32.2.
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.45 27.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=27.102528 sites (for L_min*H=9.783400)
    # ==> keep at 27.1.
    # FOURTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.425 25.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=25.445150 sites (for L_min*H=9.783400)
    # ==> keep at 25.4
    # FIFTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.427025 sites (for L_min*H=9.783400)
    # ==> keep at 23.4
    # SIXTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.735901 sites (for L_min*H=9.783400)
    # ==> keep at 23.7

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  The we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh kk9
    mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.phast
    cd /cluster/data/dm2/bed/multiz9way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 23.7 --target-coverage 0.398 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /santest/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /santest/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 139 of 139 jobs
#Average job time:                  18s       0.31m     0.01h    0.00d
#Longest finished job:              29s       0.48m     0.01h    0.00d
#Submission to last job:            46s       0.77m     0.01h    0.00d

    # back on kolossus:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/dm2/bed/multiz9way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /santest/scratch/dm2/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    cd /cluster/data/dm2/bed/multiz9way/phastCons
    featureBits -enrichment dm2 flyBaseGene:cds all.bed
    # FIRST ITERATION: too high; decrease --target-coverage, re-estimate.
#flyBaseGene:cds 16.567%, all.bed 42.995%, both 12.251%, cover 73.95%, enrich 1.72x
    # SECOND ITERATION: still too high.
#flyBaseGene:cds 16.567%, all.bed 41.408%, both 12.030%, cover 72.62%, enrich 1.75x
    # THIRD ITERATION: still...
#flyBaseGene:cds 16.567%, all.bed 39.143%, both 11.693%, cover 70.58%, enrich 1.80x
    # FOURTH ITERATION: dang, should have used a calculator.
#flyBaseGene:cds 16.567%, all.bed 38.329%, both 11.563%, cover 69.80%, enrich 1.82x
    # FIFTH ITERATION: close... but overshot.  Back off just a bit.
#flyBaseGene:cds 16.567%, all.bed 37.277%, both 11.388%, cover 68.74%, enrich 1.84x
    # SIXTH ITERATION: done.
#flyBaseGene:cds 16.567%, all.bed 37.445%, both 11.417%, cover 68.92%, enrich 1.84x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed dm2 phastConsElements9way all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.wib
    cd /cluster/data/dm2/bed/multiz9way/phastCons/run.wib
    rm -rf /santest/scratch/dm2/phastCons/wib
    mkdir -p /santest/scratch/dm2/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /santest/scratch/dm2/phastCons/wib
zcat `ls -1 /santest/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /santest/scratch/dm2/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 13 of 13 jobs
#Average job time:                  10s       0.16m     0.00h    0.00d
#Longest finished job:              23s       0.38m     0.01h    0.00d
#Submission to last job:            23s       0.38m     0.01h    0.00d

    # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from santest
    cd /cluster/data/dm2/bed/multiz9way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /santest/scratch/dm2/phastCons/wib .
    rsync -av /santest/scratch/dm2/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/dm2/multiz9way/wib
    cd /cluster/data/dm2/bed/multiz9way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz9way/wib/
    hgLoadWiggle dm2 phastCons9way \
      -pathPrefix=/gbdb/dm2/multiz9way/wib wib/*.wig
    rm wiggle.tab

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/dm-top5000-9way \
      "top 5000 conserved elements (9way)" dm2

    # and clean up santest.
    rm -r /santest/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
    rm -r /santest/scratch/dm2/chrom
    # Offer raw scores for download since fly folks are likely to be interested:
    ssh kkstore01
    cd /cluster/data/dm2/bed/multiz9way/phastCons/POSTPROBS
    mkdir ../postprobsDownload
    foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
      zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
        > ../postprobsDownload/$chr.pp.gz
    end
    cd ../postprobsDownload
    md5sum *.gz > md5sum.txt
    # Make a README.txt there too.
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way
    cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way
    ln -s /cluster/data/dm2/bed/multiz9way/phastCons/postprobsDownload/* .


# MAP FEEP PROBES (DONE 2/3/05 angie)
    ssh kolossus
    mkdir /cluster/data/dm2/bed/flyFeep
    cd /cluster/data/dm2/bed/flyFeep
    set feepDir = /projects/compbio/data/microarray/flyFEEP
    # Make per-chrom probe FASTA files:
    foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
      echo $c
      awk '$3 == "'$c'" {printf ">%s\n%s\n", $1, $7;}' \
        $feepDir/probes3.1-Exons.1_apr11_exon \
        $feepDir/probes3.1-Noncoding.1_apr11_tiling \
        > chr${c}_exonNonExonProbes.fa
      awk '$1 != "ID" && $5 == "'$c'+" {printf ">%s\n%s\n", $1, $8;}' \
        $feepDir/probes3.1-junctions.txt \
        > chr${c}_junctionProbes.fa
    end
    # For exon/non-exon probes (36 consecutive bases), require 36 matching 
    # bases (for some reason -minScore=36 doesn't filter anything out!).
    foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
      blat -noHead \
        /cluster/data/dm2/nib/chr$c.nib chr${c}_exonNonExonProbes.fa stdout \
      | grep ^36 | uniq \
      > chr${c}_exonNonExonProbes.psl
    end
    # For splice junction probes (2 blocks of 18 consecutive bases), 
    # use a small tileSize and -fine, and still require 36 matching bases:
    # This was slow, do as small cluster job next time, will still take hours.
    foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
      blat -noHead -tileSize=6 -fine \
        /cluster/data/dm2/nib/chr$c.nib chr${c}_junctionProbes.fa stdout \
      | grep ^36 | uniq \
      > chr${c}_junctionProbes.psl
    end
    # Check on how many probes were aligned:
    foreach f (chr*exon*.fa chr*junc*.fa)
      set probes = `cat $f | wc -l`
      set probes = `expr $probes / 2`
      set aligned = `cat $f:r.psl | wc -l`
      set reallyAligned = `awk '{print $10;}' $f:r.psl | sort -u | wc -l `
      set not = `expr $probes - $reallyAligned`
      echo $f:r"\t"$probes"\t"$aligned"\t"$reallyAligned"\t"\($not\)
    end
    # set			count	aligned	reallyA	not
#chr2h_exonNonExonProbes	644	611	611	(33)
#chr2L_exonNonExonProbes	26631	26749	26627	(4)
#chr2R_exonNonExonProbes	26100	26111	26098	(2)
#chr3h_exonNonExonProbes	833	833	833	(0)
#chr3L_exonNonExonProbes	28756	28773	28749	(7)
#chr3R_exonNonExonProbes	35060	35064	35055	(5)
#chr4_exonNonExonProbes	    	1798	1797	1797	(1)
#chrX_exonNonExonProbes	    	28912	28917	28912	(0)
#chrXh_exonNonExonProbes	354	354	354	(0)
#chrYh_exonNonExonProbes	97	97	97	(0)
#chr2h_junctionProbes    	3       3       3       (0)
#chr2L_junctionProbes    	5048    5015    5012    (36)
#chr2R_junctionProbes        	5697    5709    5652    (45)
#chr3h_junctionProbes        	1       1       1       (0)
#chr3L_junctionProbes        	6498    6437    6429    (69)
#chr3R_junctionProbes        	8747    8695    8684    (63)
#chr4_junctionProbes         	1056    1048    1048    (8)
#chrXh_junctionProbes        	50      50      50      (0)
#chrX_junctionProbes         	3688    3684    3680    (8)
#chrYh_junctionProbes        	0       0       0       (0)
    # Most probes successfully mapped, but a fair number of duplicates.  
    # Dug into one of those (159937) and found that its probe sequence 
    # halves were from a tandem repeat!  (And didn't map to where they 
    # were supposed to in dm1 either.)  I suspect the probe sequences 
    # might not be correct in the files I downloaded.  Sent Chris Mason 
    # an email about that.  
    # If the probe dm1 locations are correct (I sure hope so), just not 
    # some given probe sequences, I could extract dm1 sequence and map 
    # that to dm2 instead of mapping the given probes as above.  

    # Translate PSL to bed12 (this is too simple for PSL in general, but 
    # since we require the entire query to map perfectly, it works here):
    awk '{printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t%d\t%s\t%s\n", \
                 $14, $16, $17, $10, $9, $16, $17, $18, $19, $20;}' \
      chr*.psl \
      > flyFeepProbesBed12.bed

    # Load probes
    ssh hgwdev
    cd /cluster/data/dm2/bed/flyFeep
    # Add scores to bed:
    hgMapMicroarray -bedIn flyFeep.bed hgFixed.flyFeepMedianRatio \
      flyFeepProbesBed12.bed
    hgLoadBed dm2 flyFeep flyFeep.bed
    # (back on fileserver) Winnow probes by Bussemaker lab's lists:
    set dm1FeepDir = /cluster/data/dm1/bed/flyFeep
    $dm1FeepDir/winnowBed.pl $dm1FeepDir/probesExpressedAboveBackground.txt \
      flyFeep.bed \
      > flyFeepPEAB.bed
    $dm1FeepDir/winnowBed.pl $dm1FeepDir/probesAnovaDiffExpressed.txt \
      flyFeep.bed \
      > flyFeepAnova.bed
    # (back on hgwdev) Load winnowed sets:
    hgLoadBed dm2 flyFeepPEAB flyFeepPEAB.bed
    hgLoadBed dm2 flyFeepAnova flyFeepAnova.bed


# FLYBASE 4.1 ANNOTATIONS (DONE 2/28/05 angie)
    ssh kksilo
    mkdir /cluster/data/dm2/bed/flybase4.1
    cd /cluster/data/dm2/bed/flybase4.1
    foreach c (2L 2R 3L 3R 4 X)
      wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/gff/dmel-$c-r4.1.gff.gz
    end
    zcat *.gff.gz > flybase.gff3
    # What data sources are represented in this file?
    grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
    # excerpt (many other sources, including blastx:... , sim4:... and 
    # tblastx:...; also various other types for source "."):
  18941 .       CDS
  63033 .       exon
  14066 .       gene
  18941 .       mRNA
    144 .       ncRNA
     39 .       pseudogene
     96 .       rRNA
     29 .       snRNA
     28 .       snoRNA
    295 .       tRNA
  36921 .       transcription_start_site
   1571 .       transposable_element
  16404 .       transposable_element_insertion_site
    # What keywords are defined in the 9th field?
    grep -v '^#' flybase.gff3 \
    | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
    | sort | uniq -c
    # Once again, the previous round's parsing needed some updates to 
    # handle the new data.
    extractGenes.pl flybase.gff3
#Oddball parentless ID=Sps2-exon-10336472..10336531 for exon

    # Get predicted proteins (for main annotations only)
    wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/fasta/dmel-all-translation-r4.1.fasta.gz
    zcat dmel-all-translation-r4.1.fasta.gz \
    | perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa

    ssh hgwdev
    cd /cluster/data/dm2/bed/flybase4.1
    # Protein-coding genes:
    ldHgGene -gtf dm2 flyBaseGene flybase.gtf
    hgPepPred dm2 generic flyBasePep flybasePep.fa
    # Fix typo caught by Galt & all.joiner:
    hgsql dm2 -e 'update flyBasePep set name = "CG6207-RC" where name = "GLCAT-P-PC"'
    # Noncoding genes:
    hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
    # Cross-referencing info for both coding and noncoding:
    hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql
    hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \
      into table flyBase2004Xref'
    # Some featureBits comparisons with refGene which is pretty much like 
    # version 4.0 but apparently includes some noncoding genes too:
    featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
#chr3R   8221731 8222987 chr3R.1
#...
#87577 bases of 131698467 (0.066%) in intersection
    # This shows only the dropped isoforms/duplicates (ignores noncoding):
    featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
       -bed=stdout
#chr3R   15620310        15621493        chr3R.1
#chr2L   4698094 4700940 chr2L.1
#chrX    3638700 3639895 chrX.1
#49911 bases of 131698467 (0.038%) in intersection

    # add upstream* downloadable files
    cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
    foreach size (1000 2000 5000)
      echo upstream$size
      nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
      | nice gzip -c > upstream$size.fa.gz
    end
    md5sum *.zip *.gz > md5sum.txt


# FLYREG (DONE 2/23/05 angie)
    ssh kksilo
    mkdir /cluster/data/dm2/bed/flyreg
    cd /cluster/data/dm2/bed/flyreg
    wget http://www.gen.cam.ac.uk/casey/data/Bergman2004/v2.0/Footprint.GFF
    # This is not GTF; it should really be bed +.  The contributor, 
    # Casey Bergman, says that coords are 0-based half-open, so 
    # translation will be even easier.
    grep -v '^#' Footprint.GFF \
    | perl -wpe 'if (! s/^(\S+)\tBergman_data\tbinding_site\t(\d+)\t(\d+)\t.\t.\t.\tFactor \"([^\"]+)\"; Target \"([^\"]+)\"; PMID \"(\d+)\"; FPID \"(\d+)\"$/chr$1\t$2\t$3\t$4\t$5\t$6\t$7/) { die "Cant parse line $.:\n$_\n"; }' \
      > flyreg2.bed
    ssh hgwdev
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/flyreg2.sql \
      dm2 flyreg2 /cluster/data/dm2/bed/flyreg/flyreg2.bed
    # Add Dan Pollard's MEME motif data for the footprints:
    ssh kksilo
    cd /cluster/data/dm2/bed/flyreg
    wget http://rana.lbl.gov/~dan/matrices/bergman2004/footprint_matrices.txt
    /cluster/data/dm1/bed/flyreg/extractMatrices.pl \
      footprint_matrices.txt > flyregMotif.tab
    ssh hgwdev
    cd /cluster/data/dm2/bed/flyreg
    sed -e 's/dnaMotif/flyregMotif/' $HOME/kent/src/hg/lib/dnaMotif.sql \
    > flyregMotif.sql
    hgsql dm2 < flyregMotif.sql
    hgsql dm2 -e 'load data local infile "flyregMotif.tab" into table flyregMotif'


# LIFTOVER DM1 FLYREG, COMPARE TO DM2 FLYREG (DONE 2/25/05 angie)
    # Casey Bergman is very eager to see alternate confirmation of his 
    # dm1->dm2 mapping of flyreg, so run liftOver on it:
    ssh kksilo
    cd /cluster/data/dm2/bed/flyreg
    liftOver /cluster/data/dm1/bed/flyreg/flyreg.bed \
      /cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
      flyregLift.bed flyregLift.unmapped
    wc -l flyreg2.bed flyregLift.bed flyregLift.unmapped 
#  1362 flyreg2.bed
#  1363 flyregLift.bed
#     0 flyregLift.unmapped
    awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyreg2.bed | sort \
      > /tmp/1
    awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyregLift.bed | sort \
      > /tmp/2
    diff -c /tmp/[12] > newVsLift
    # Analyzed the differences: 2 duplicates in original (uniquified in v2),
    # 1 new item in v2, and 4 items incorrectly mapped by liftOver (fooled 
    # by what looks like a slightly diverged local duplication) but probably
    # correctly mapped by Casey.


# SELF ALIGNMENTS (DONE 2/24/05 angie)
    # Doing this largely as a test of doBlastzChainNet.pl...
    ssh kksilo
    mkdir /cluster/data/dm2/bed/blastz.dm2.2005-02-23
    cd /cluster/data/dm2/bed/blastz.dm2.2005-02-23
    cat << '_EOF_' > DEF
# D. melanogaster vs. self

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. melanogaster
SEQ2_DIR=/iscratch/i/dm2/nib
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dm2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.dm2.2005-02-23

'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2SelfOut >& do.log &
    tail -f do.log
    # The script died at the start of the cat step when kki couldn't see 
    # /panasas (doh!).  Asked cluster-admin to fix that.  
    # When fixed, restarted with -continue, appending to do.log:
    rm -r run.cat
    doBlastzChainNet.pl -continue cat DEF \
      -blastzOutRoot /panasas/store/dm2SelfOut >>& do.log &
    tail -f do.log
    rmdir /panasas/store/dm2SelfOut
    ln -s blastz.dm2.2005-02-23 /cluster/data/dm2/bed/blastz.dm2
    # chainSelf is already in top-level trackDb.ra, so no need to add.
    # Add /usr/local/apache/htdocs/goldenPath/dm2/vsSelf/README.txt


# BLASTZ/CHAIN/NET DROSIM1 (DONE 4/13/05 angie)
# -- Originally run with default blastz params; replaced by a run with 
# better params, below.


# BLASTZ/CHAIN/NET ANOGAM1 -- LOOSER PARAMS (DONE 5/19/05 angie)
    # Lower the L threshold from 6000 to 4000 (5-18) and then 3000 (5-19), 
    # see if that increases coverage from this measurement from the previous 
    # "human-fugu" params:
    # featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link
#flyBaseGene 22.967%, chainAnoGam1Link 15.999%, both 12.567%, cover 54.72%, enrich 3.42x
# 5/19, L=3000:  too much...  low-complexity-anchored alignments cluttering.
# but interesting to know we can get this kind of coverage:
#flyBaseGene 22.967%, chainAnoGam1Link 60.227%, both 18.542%, cover 80.73%, enrich 1.34x
    # So use results of this L=4000 run:
    ssh kkstore01
    mkdir /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
    cd /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
    cat << '_EOF_' > DEF
# D.melanogaster vs. A. gambiae

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - A. gambiae
SEQ2_DIR=/iscratch/i/anoGam1/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/cluster/data/anoGam1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.anoGam1.2005-05-18

'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2anoGam1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link
# 5/18, L=4000:
#flyBaseGene 22.967%, chainAnoGam1Link 22.072%, both 14.077%, cover 61.29%, enrich 2.78x
    # Looks good in the browser -- top-level net chains extend farther 
    # without too much extra crap like we saw with L=3000.
    rm /cluster/data/dm2/bed/blastz.anoGam1
    ln -s blastz.anoGam1.2005-05-18 /cluster/data/dm2/bed/blastz.anoGam1


# BLASTZ/CHAIN/NET APIMEL2 -- LOOSER PARAMS (DONE 5/19/05 angie)
    # Since L=4000 noticeably helped mosquito, try it on the honeybee,
    # which had this coverage with L=6000:
#flyBaseGene 22.967%, chainApiMel2Link 13.767%, both 7.802%, cover 33.97%, enrich 2.47x
    mkdir /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
    cd /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
    cat << '_EOF_' > DEF
# D.melanogaster vs. A. mellifera

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - A. mellifer
SEQ2_DIR=/iscratch/i/apiMel2/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/cluster/data/apiMel2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.apiMel2.2005-05-19

'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2apiMel2 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainApiMel2Link
#flyBaseGene 22.967%, chainApiMel2Link 36.701%, both 11.095%, cover 48.31%, enrich 1.32x
    # Wow, quite an increase in coverage!  And drop in enrichment... some 
    # of the new alignments are on the low-complexity side but not nearly 
    # as bad as mosquito at L=3000.  Never know what to expect from these 
    # param experiments!  :|  Well, using L=4000 seems like a net gain so 
    # go with it...
    rm /cluster/data/dm2/bed/blastz.apiMel2
    ln -s blastz.apiMel2.2005-05-19 /cluster/data/dm2/bed/blastz.apiMel2


# BLASTZ/CHAIN/NET DROMOJ1 -- LOOSER PARAMS (DONE 5/19/05 angie)
    # Here's what L=6000 got us:
#flyBaseGene 22.967%, chainDroMoj1Link 59.129%, both 20.159%, cover 87.77%, enrich 1.48x
    # OK, let's see what droMoj1 looks like at L=4000...
    mkdir /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
    cd /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. mojavensis

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj1/droMoj1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droMoj1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2droMoj1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj1Link
#flyBaseGene 22.967%, chainDroMoj1Link 65.901%, both 20.698%, cover 90.12%, enrich 1.37x
    # Wow, actually looks pretty good in the browser.  Finds more non-top-
    # level alignments than L=6000, but not a huge number, and most look 
    # reasonable in the detailed view.
    rm -f /cluster/data/dm2/bed/blastz.droMoj1
    ln -s blastz.droMoj1.2005-05-19 /cluster/data/dm2/bed/blastz.droMoj1


# BLASTZ/CHAIN/NET DROVIR1 -- LOOSER PARAMS (DONE 5/20/05 angie)
    # coverage with default blastz params:
#flyBaseGene 22.967%, chainDroVir1Link 47.957%, both 18.585%, cover 80.92%, enrich 1.69x
    # OK, let's see what droVir1 looks like at L=4000...
    mkdir /cluster/data/dm2/bed/blastz.droVir1.2005-05-19
    cd /cluster/data/dm2/bed/blastz.droVir1.2005-05-19
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. virilis

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. virilis
SEQ2_DIR=/santest/scratch/droVir1/droVir1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/santest/scratch/droVir1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droVir1.2005-05-19
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2droVir1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link
#flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x
    rm -f /cluster/data/dm2/bed/blastz.droVir1
    ln -s blastz.droVir1.2005-05-19 /cluster/data/dm2/bed/blastz.droVir1


# BLASTZ/CHAIN/NET DP3 -- LOOSER PARAMS (DONE 5/20/05 angie)
    # coverage with default blastz params was not too bad but let's see if 
    # we can do better:
#flyBaseGene 22.967%, chainDp3Link 67.232%, both 20.112%, cover 87.57%, enrich 1.30x
    mkdir /cluster/data/dm2/bed/blastz.dp3.2005-05-20
    cd /cluster/data/dm2/bed/blastz.dp3.2005-05-20
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. pseudoobscura

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp3/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp3/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.dp3.2005-05-20
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2dp3 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp3Link
#flyBaseGene 22.967%, chainDp3Link 74.901%, both 21.335%, cover 92.89%, enrich 1.24x
    # Again, reasonable extension of coverage without too much crap.
    rm -f /cluster/data/dm2/bed/blastz.dp3
    ln -s blastz.dp3.2005-05-20 /cluster/data/dm2/bed/blastz.dp3


# BLASTZ/CHAIN/NET DROANA1 -- LOOSER PARAMS (DONE 5/21/05 angie)
    # coverage with default blastz params:
#flyBaseGene 22.967%, chainDroAna1Link 73.508%, both 20.981%, cover 91.35%, enrich 1.24x
    # OK, let's see what droAna1 looks like at L=4000...
    mkdir /cluster/data/dm2/bed/blastz.droAna1.2005-05-20
    cd /cluster/data/dm2/bed/blastz.droAna1.2005-05-20
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. ananassae
SEQ2_DIR=/santest/scratch/droAna1/droAna1.2bit
SEQ2_CHUNK=1000000
SEQ2_LAP=10000
SEQ2_LEN=/santest/scratch/droAna1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droAna1.2005-05-20
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2droAna1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna1Link
#flyBaseGene 22.967%, chainDroAna1Link 80.519%, both 21.949%, cover 95.57%, enrich 1.19x
    rm -f /cluster/data/dm2/bed/blastz.droAna1
    ln -s blastz.droAna1.2005-05-20 /cluster/data/dm2/bed/blastz.droAna1


# BLASTZ/CHAIN/NET DROYAK1 -- LOOSER PARAMS (DONE 5/21/05 angie)
    # coverage with default blastz params:
#flyBaseGene 22.967%, chainDroYak1Link 90.253%, both 22.586%, cover 98.34%, enrich 1.09x
    # OK, let's see what droYak1 looks like at L=4000...
    mkdir /cluster/data/dm2/bed/blastz.droYak1.2005-05-20
    cd /cluster/data/dm2/bed/blastz.droYak1.2005-05-20
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. yakuba

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. yakuba
SEQ2_DIR=/iscratch/i/droYak1/nib
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droYak1.2005-05-20
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2droYak1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak1Link
#flyBaseGene 22.967%, chainDroYak1Link 91.896%, both 22.736%, cover 98.99%, enrich 1.08x
    rm -f /cluster/data/dm2/bed/blastz.droYak1
    ln -s blastz.droYak1.2005-05-20 /cluster/data/dm2/bed/blastz.droYak1


# BLASTZ/CHAIN/NET DROSIM1 -- LOOSER PARAMS (DONE 5/21/05 angie)
    # coverage with default blastz params:
#flyBaseGene 22.967%, chainDroSim1Link 90.656%, both 22.139%, cover 96.40%, enrich 1.06x
    # OK, let's see what droSim1 looks like at L=4000...
    mkdir /cluster/data/dm2/bed/blastz.droSim1.2005-05-21
    cd /cluster/data/dm2/bed/blastz.droSim1.2005-05-21
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. simulans

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. simulans
SEQ2_DIR=/iscratch/i/droSim1/droSim1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droSim1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droSim1.2005-05-21
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/dm2droSim1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSim1Link
#flyBaseGene 22.967%, chainDroSim1Link 91.756%, both 22.420%, cover 97.62%, enrich 1.06x
    rm -f /cluster/data/dm2/bed/blastz.droSim1
    ln -s blastz.droSim1.2005-05-21 /cluster/data/dm2/bed/blastz.droSim1

# AUGUSTUS GENE PREDICTIONS (Done 6/1/2005 Andy)
    ssh hgwdev
    cd /cluster/data/dm2/bed
    mkdir augustus
    cd augustus/
    cat > cleanAugustus.awk << _EOF_
# Add the chrom name to the gene and transcript IDs.
{
   ch = \$1
   while (match(\$0, /"g[0-9\.]+"/))
       {
       before = substr(\$0, 1, RSTART)
       name = substr(\$0, RSTART+1, RLENGTH)
       after = substr(\$0, RSTART + RLENGTH + 1)
       \$0 = before "" ch "." name "" after
       }
   print
}
_EOF_
    wget http://augustus.gobics.de/predictions/dm2/dm2.allchr.augustus.gtf.gz
    zcat dm2.allchr.augustus.gtf.gz | awk -f cleanAugustus.awk | gzip > dm2.allchr.augustus.clean.gtf.gz
    ldHgGene -gtf dm2 augustus dm2.allchr.augustus.clean.gtf.gz
    rm dm2.allchr.augustus.gtf.gz

# BLAT FlyBase predicted DM proteins against DM (DONE 2005-06-27 braney)
    ssh hgwdev
    cd /cluster/data/dm2/bed
    mkdir blat.dm2FB
    cd blat.dm2FB
    pepPredToFa dm2 flyBasePep dm2FB.fa
    hgPepPred dm2 generic blastFBPep01 dm2FB.fa
    ssh kk
    cd /cluster/data/dm2/bed/blat.dm2FB
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /iscratch/i/dm2/nib/*.nib > bug.lst
    mkdir fbfas
    cd fbfas
    faSplit sequence ../dm2FB.fa 5010 fb
    cd ..
    ls -1S fbfas/*.fa > fb.lst
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    gensub2 bug.lst fb.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../bug.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push

# Completed: 63193 of 63193 jobs
# CPU time in finished jobs:     712543s   11875.72m   197.93h    8.25d  0.023 y
# IO & Wait Time:                172878s    2881.30m    48.02h    2.00d  0.005 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest finished job:            2172s      36.20m     0.60h    0.03d
# Submission to last job:          3192s      53.20m     0.89h    0.04d

    ssh eieio
    cd /cluster/data/dm2/bed/blat.dm2FB
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null
    sort -rn cov90.psl | pslUniq stdin dm2FB.psl
    pslxToFa dm2FB.psl dm2FB_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
    fbName dm2 dm2FB.psl blastFBRef01
    ssh hgwdev
    cd /cluster/data/dm2/bed/blat.dm2FB
    hgsql dm2 < ~/kent/src/hg/lib/blastRef.sql
    echo "rename table blastRef to blastFBRef01" | hgsql dm2
    echo "load data local infile 'blastFBRef01' into table blastFBRef01" | hgsql dm2

# MAKE Human Proteins track (DONE 06-30-05 braney)
    ssh kk
    mkdir -p /cluster/data/dm2/blastDb
    cd /cluster/data/dm2/blastDb
    for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/dm2/blastDb
    cp /cluster/data/dm2/blastDb/* /iscratch/i/dm2/blastDb
    iSync 2>&1 > sync.out

    mkdir -p /cluster/data/dm2/bed/tblastn.hg17KG
    cd /cluster/data/dm2/bed/tblastn.hg17KG
    ls -1S /iscratch/i/dm2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst

    mkdir kgfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\)
# 37365/(150000/288) = 71.740800

    split -l 72 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    rm -rf  /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
    mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
    ln -s  /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2  
	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3  
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3 $f.4
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/dm2/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 149472 of 149472 jobs
# CPU time in finished jobs:    5837692s   97294.87m  1621.58h   67.57d  0.185 y
# IO & Wait Time:               1404308s   23405.13m   390.09h   16.25d  0.045 y
# Average job time:                  48s       0.81m     0.01h    0.00d
# Longest finished job:            1095s      18.25m     0.30h    0.01d
# Submission to last job:         50465s     841.08m    14.02h    0.58d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    ssh kki
    cd /cluster/data/dm2/bed/tblastn.hg17KG
    tcsh
    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para push

# Completed: 519 of 519 jobs
# CPU time in finished jobs:       3835s      63.91m     1.07h    0.04d  0.000 y
# IO & Wait Time:                  5794s      96.57m     1.61h    0.07d  0.000 y
# Average job time:                  19s       0.31m     0.01h    0.00d
# Longest finished job:             114s       1.90m     0.03h    0.00d
# Submission to last job:           824s      13.73m     0.23h    0.01d

    ssh kkstore01
    cd /cluster/data/dm2/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/dm2/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/dm2/bed/tblastn.hg17KG
    hgLoadPsl dm2 blastHg17KG.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# BLASTZ/CHAIN/NET TRICAS1 (DONE 7/11/05 Andy)
    ssh kkstore01
    mkdir /cluster/data/dm2/bed/blastz.triCas1.2005-07-10
    cd /cluster/data/dm2/bed/blastz.triCas1.2005-07-10    
    cat << "_EOF_" > DEF
# D. melanogaster vs. T. castaneum

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. virilis
SEQ2_DIR=/panasas/store/triCas1/triCas1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/panasas/store/triCas1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.triCas1.2005-07-10
_EOF_
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2triCas1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link
#flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x
    rm -f /cluster/data/dm2/bed/blastz.triCas1
    ln -s blastz.triCas1.2005-05-19 /cluster/data/dm2/bed/blastz.triCas1


# GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie)
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/geneid
    cd /cluster/data/dm2/bed/geneid
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/D.melanogaster/golden_path_200404/geneidv1.2/$chr.gtf
    end
    ldHgGene -gtf -genePredExt dm2 geneid *.gtf


# BLASTZ/CHAIN/NET DROERE1 (DONE 8/8/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droEre1.2005-08-08
    cd /cluster/data/dm2/bed/blastz.droEre1.2005-08-08
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. erecta

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. erecta
SEQ2_DIR=/iscratch/i/droEre1/droEre1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droEre1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droEre1.2005-08-08
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droEre1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre1Link
#flyBaseGene 22.967%, chainDroEre1Link 91.103%, both 22.725%, cover 98.95%, enrich 1.09x
    rm -f /cluster/data/dm2/bed/blastz.droEre1
    ln -s blastz.droEre1.2005-08-08 /cluster/data/dm2/bed/blastz.droEre1


# BLASTZ/CHAIN/NET DROANA2 (DONE 8/8/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droAna2.2005-08-08
    cd /cluster/data/dm2/bed/blastz.droAna2.2005-08-08
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna2/droAna2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droAna2.2005-08-08
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droAna2 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna2Link
#flyBaseGene 22.967%, chainDroAna2Link 81.385%, both 22.050%, cover 96.01%, enrich 1.18x
    rm -f /cluster/data/dm2/bed/blastz.droAna2
    ln -s blastz.droAna2.2005-08-08 /cluster/data/dm2/bed/blastz.droAna2


# BLASTZ/CHAIN/NET DROMOJ2 (DONE 8/9/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
    cd /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. mojavensis

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj2/droMoj2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droMoj2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droMoj2 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj2Link
#flyBaseGene 22.967%, chainDroMoj2Link 66.524%, both 20.799%, cover 90.56%, enrich 1.36x
    rm -f /cluster/data/dm2/bed/blastz.droMoj2
    ln -s blastz.droMoj2.2005-08-08 /cluster/data/dm2/bed/blastz.droMoj2


# BLASTZ/CHAIN/NET DROGRI1 (DONE 8/9/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droGri1.2005-08-08
    cd /cluster/data/dm2/bed/blastz.droGri1.2005-08-08
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. grimshawi

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. grimshawi
SEQ2_DIR=/iscratch/i/droGri1/droGri1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droGri1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droGri1.2005-08-08
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droGri1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri1Link
#flyBaseGene 22.967%, chainDroGri1Link 68.520%, both 20.822%, cover 90.66%, enrich 1.32x
    rm -f /cluster/data/dm2/bed/blastz.droGri1
    ln -s blastz.droGri1.2005-08-08 /cluster/data/dm2/bed/blastz.droGri1


# BLASTZ/CHAIN/NET DROVIR2 (DONE 8/12/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droVir2.2005-08-11
    cd /cluster/data/dm2/bed/blastz.droVir2.2005-08-11
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. virilis

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. virilis
SEQ2_DIR=/iscratch/i/droVir2/droVir2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droVir2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droVir2.2005-08-11
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droVir2 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir2Link
#flyBaseGene 22.967%, chainDroVir2Link 67.025%, both 20.901%, cover 91.01%, enrich 1.36x
    rm -f /cluster/data/dm2/bed/blastz.droVir2
    ln -s blastz.droVir2.2005-08-11 /cluster/data/dm2/bed/blastz.droVir2


# RE-RUN NETTOAXT, AXTTOMAF IN PREP FOR MULTIZ (DONE 8/19/05 angie)
    # Kate checked in a fix to netToAxt, to prevent overlapping blocks 
    # which were causing problems for (display of) multiple alignments.  
    # Before running multiz, regenerate the axtNet/ and mafNet/ pairwise 
    # inputs (axtNet downloads should probably be repushed after this).  
    # The /cluster/data/dm2/bed/blastz.*/axtChain/netChains.csh files 
    # contain the original axtNet and mafNet commands, adapted here:
    ssh kolossus
    cd /tmp
    cat << '_EOF_' > /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh
#!/bin/csh -efx
foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
             anoGam1 apiMel2 triCas1)
  set oSeq = /iscratch/i/$oDb/$oDb.2bit
  if (! -e $oSeq) then
    set oSeq = /iscratch/i/$oDb/nib
  endif
  cd /cluster/data/dm2/bed/blastz.$oDb/axtChain
  netSplit dm2.$oDb.net.gz net
  chainSplit chain dm2.$oDb.all.chain.gz
  cd ..
  mv axtNet axtNet.bak
  mkdir axtNet
  foreach f (axtChain/net/*.net)
    netToAxt $f axtChain/chain/$f:t:r.chain \
      /iscratch/i/dm2/nib $oSeq stdout \
    | axtSort stdin stdout \
    | gzip -c > axtNet/$f:t:r.dm2.$oDb.net.axt.gz
  end
  mv mafNet mafNet.bak
  mkdir mafNet
  foreach f (axtNet/*.dm2.$oDb.net.axt.gz)
    axtToMaf -tPrefix=dm2. -qPrefix=$oDb. $f \
         /cluster/data/dm2/chrom.sizes /cluster/data/$oDb/chrom.sizes \
         stdout \
    | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
  end
end
'_EOF_'
    # << for emacs
    chmod 775 /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh
    /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh >& re.log
    echo $status
#0
    # Made sure everything ran smoothly.  Spot-checked some .bak's vs. 
    # new to make sure they're of comparable size -- newer can be slightly 
    # smaller due to lack of dup's, or slightly larger due to more headers.  
    # Clean up:
    foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
                 anoGam1 apiMel2 triCas1)
      rm -r /cluster/data/dm2/bed/blastz.$oDb/axtChain/{chain,net}
      rm -r /cluster/data/dm2/bed/blastz.$oDb/{axt,maf}Net.bak
    end


# MULTIZ.V10 12WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 8/20/05 angie)
    # Since 9way, adding D. erecta, D. grimshawi, T. castaneum;
    # dro{Ana,Moj,Vir}1 --> dro{Ana,Moj,Vir}2.
    # Tree (12-way):
    # ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1)
    ssh kkstore01
    mkdir /cluster/data/dm2/bed/multiz12way.2005-08-20
    ln -s /cluster/data/dm2/bed/multiz12way.2005-08-20 \
      /cluster/data/dm2/bed/multiz12way
    cd /cluster/data/dm2/bed/multiz12way
    # Setup: Copy pairwise MAF to /san/sanvol1/scratch:
    mkdir /san/sanvol1/scratch/flyMultiz12way
    foreach db (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
                anoGam1 apiMel2 triCas1)
      echo $db
      cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \
        /san/sanvol1/scratch/flyMultiz12way/$db
    end
    ls -lLR /san/sanvol1/scratch/flyMultiz12way
    # Make output dir:
    mkdir maf
    # Create scripts to run multiz.v10 in the right order:
    # (Thanks to Andy for the idea to use a subroutine for individual 
    # multiz invocations)
    cat << '_EOF_' > /cluster/bin/scripts/runMultizV10.csh
#!/bin/csh -fex
# Run multiz on two inputs.
set closerMaf = $1
set fartherMaf = $2
set v = $3
set ref = $4
set tmpDir = $5
set outMaf = $6

if ($outMaf == "") then
  echo "usage: $0 closerMaf fartherMaf v ref tmpDir outMaf"
  exit 1
endif

set MZ10 = /cluster/bin/penn/multiz.v10
set PROJECT = /cluster/bin/penn/maf_project

if (-e $closerMaf && $closerMaf:e == "gz") then
  gunzip -c $closerMaf > $tmpDir/closer.maf
  set closerMaf = $tmpDir/closer.maf
endif
if (-e $fartherMaf && $fartherMaf:e == "gz") then
  gunzip -c $fartherMaf > $tmpDir/farther.maf
  set fartherMaf = $tmpDir/farther.maf
endif

set closerSubbed = `echo $closerMaf | sed -e 's@/@_@g;'`
set fartherSubbed = `echo $fartherMaf | sed -e 's@/@_@g;'`

if ( -s $closerMaf && -s $fartherMaf ) then
    $MZ10 $fartherMaf $closerMaf $v > $tmpDir/tmp.maf
    $PROJECT $tmpDir/tmp.maf $ref > $outMaf
else if ( -s $closerMaf ) then
    touch missing.$fartherSubbed
    cp $closerMaf $outMaf
else if ( -s $fartherMaf ) then
    touch missing.$closerSubbed
    cp $fartherMaf $outMaf
else
    touch missing.$closerSubbed
    touch missing.$fartherSubbed
endif 
'_EOF_'
    # << for emacs
    cat << '_EOF_' > doMultizAll.csh
#!/bin/csh -fex
set chr = $1
set tmpDir = /scratch/flyMultiz12way.$chr
mkdir $tmpDir

set mafScratch = /san/sanvol1/scratch/flyMultiz12way

# Really should write a perl script to take a tree like this and generate 
# commands like the ones below:
# ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1)

/cluster/bin/scripts/runMultizV10.csh \
  $mafScratch/droYak1/$chr.maf.gz \
  $mafScratch/droEre1/$chr.maf.gz \
  0 dm2.$chr $tmpDir $tmpDir/$chr.YakEre.maf
/cluster/bin/scripts/runMultizV10.csh \
  $mafScratch/droSim1/$chr.maf.gz \
  $tmpDir/$chr.YakEre.maf \
  1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEre.maf
/cluster/bin/scripts/runMultizV10.csh \
  $tmpDir/$chr.SimYakEre.maf \
  $mafScratch/droAna2/$chr.maf.gz \
  1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAna.maf
/cluster/bin/scripts/runMultizV10.csh \
  $tmpDir/$chr.SimYakEreAna.maf \
  $mafScratch/dp3/$chr.maf.gz \
  1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPse.maf
/cluster/bin/scripts/runMultizV10.csh \
  $mafScratch/droVir2/$chr.maf.gz \
  $mafScratch/droMoj2/$chr.maf.gz \
  0 dm2.$chr $tmpDir $tmpDir/$chr.VirMoj.maf
/cluster/bin/scripts/runMultizV10.csh \
  $tmpDir/$chr.SimYakEreAnaPse.maf \
  $tmpDir/$chr.VirMoj.maf \
  1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMoj.maf
/cluster/bin/scripts/runMultizV10.csh \
  $tmpDir/$chr.SimYakEreAnaPseVirMoj.maf \
  $mafScratch/droGri1/$chr.maf.gz \
  1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf
/cluster/bin/scripts/runMultizV10.csh \
  $tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf \
  $mafScratch/anoGam1/$chr.maf.gz \
  1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf
/cluster/bin/scripts/runMultizV10.csh \
  $tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf \
  $mafScratch/apiMel2/$chr.maf.gz \
  1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf
/cluster/bin/scripts/runMultizV10.csh \
  $tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf \
  $mafScratch/triCas1/$chr.maf.gz \
  1 dm2.$chr $tmpDir maf/$chr.maf

rm -f $tmpDir/*.maf
rmdir $tmpDir
'_EOF_'
    # << for emacs
    chmod 775 /cluster/bin/scripts/runMultizV10.csh
    chmod 775 doMultizAll.csh
    awk '{print "./doMultizAll.csh " $1;}' /cluster/data/dm2/chrom.sizes \
      > jobs.lst
    # Run on small cluster
    ssh kki
    cd /cluster/data/dm2/bed/multiz12way
    para make jobs.lst
    para time
#Completed: 13 of 13 jobs
#Average job time:                 124s       2.07m     0.03h    0.00d
#Longest finished job:             538s       8.97m     0.15h    0.01d
#Submission to last job:         15659s     260.98m     4.35h    0.18d
    ls -1 missing*
#ls: No match.

    # make /gbdb/ links to 12way maf files:
    ssh hgwdev
    mkdir -p /gbdb/dm2/multiz12way/maf/multiz12way
    ln -s /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \
      /gbdb/dm2/multiz12way/maf/multiz12way/
    # load into database
    cd /tmp
    hgLoadMaf -warn dm2 multiz12way \
      -pathPrefix=/gbdb/dm2/multiz12way/maf/multiz12way
    # load summary table to replace pairwise
    cat /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \
    | nice hgLoadMafSummary dm2 multiz12waySummary stdin

    # Dropped unused indexes (2006-05-09 kate)
    # NOTE: this is not required in the future, as the loader
    # has been fixed to not generate these indexes
    hgsql dm2 -e "alter table multiz9waySummary drop index chrom_2"
    hgsql dm2 -e "alter table multiz9waySummary drop index chrom_3"

    # put 12way MAF out for download
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz12way
    mkdir mafDownload
    foreach f (maf/*.maf)
      nice gzip -c $f > mafDownload/$f:t.gz
    end
    cd mafDownload
    md5sum *.maf.gz > md5sum.txt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz12way
    ln -s /cluster/data/dm2/bed/multiz12way/mafDownload/{*.maf.gz,md5sum.txt} \
      /usr/local/apache/htdocs/goldenPath/dm2/multiz12waya
    # make a README.txt

    # Cleanup
    rm -rf /san/sanvol1/scratch/flyMultiz12way/


# PHASTCONS 12WAY WITH METHODS FROM PAPER (DONE 9/20/05 angie)
# ((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1)
    ssh kkstore01
    mkdir -p /san/sanvol1/scratch/dm2/chrom
    cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/
    # Split chrom fa into smaller windows for phastCons:
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz12way/phastCons
    mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.split
    cd /cluster/data/dm2/bed/multiz12way/phastCons/run.split
    set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/san/sanvol1/scratch/dm2/chrom
set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
   -O dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm2/bed/multiz12way/maf/*.maf)
      if (-s $file) then
        echo "doSplit.sh {check in exists+ $file}" >> jobList
      endif
    end
    para make jobList
    para time
#Completed: 13 of 13 jobs
#Average job time:                  81s       1.35m     0.02h    0.00d
#Longest finished job:             219s       3.65m     0.06h    0.00d
#Submission to last job:           219s       3.65m     0.06h    0.00d

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  
    # Use phyloFit to make an initial model:
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz12way/phastCons
    /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
        --aggregate dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \
        -i MAF -o SS > all.ss
    /cluster/bin/phast/phyloFit all.ss \
      --tree "((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1)" \
      -i SS --out-root starting-tree
    cat starting-tree.mod 
#ALPHABET: A C G T 
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -496988087.624743
#BACKGROUND: 0.288383 0.211930 0.211868 0.287819 
#RATE_MAT:
#  -0.922499    0.207559    0.437636    0.277304 
#   0.282436   -1.104124    0.227654    0.594033 
#   0.595687    0.227721   -1.105784    0.282377 
#   0.277848    0.437405    0.207862   -0.923114 
#TREE: ((((((((dm2:0.028572,droSim1:0.042955):0.010136,(droYak1:0.058491,droEre1:0.056179):0.048245):0.060386,droAna2:0.259454):0.024698,dp3:0.263678):0.012004,((droVir2:0.126666,droMoj2:0.147640):0.138076,droGri1:0.248877):0.126753):0.094211,anoGam1:0.382011):0.052522,apiMel2:0.456711):0.211776,triCas1:0.211776);
# also get GC content from model -- if similar enough, no need to extract it 
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod 
#0.423798
    # OK, use .424 for --gc below.

    # Use the values of --target-coverage and --expected-lengths from the 
    # last iteration of the 9way run, 0.398 and 23.7.
    # Multiply each subst rate on the TREE line by 3.5 which is roughly the 
    # ratio of noncons to cons in 
    # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
    /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
      > starting-tree.noncons.mod
    /cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 \
      starting-tree{,.noncons}.mod
#( Solving for new omega: 23.700000 24.343724 24.335125 )
#Transition parameters: gamma=0.398000, omega=23.700000, mu=0.042194, nu=0.027896
#Relative entropy: H=2.651656 bits/site
#Expected min. length: L_min=3.660302 sites
#Expected max. length: L_max=3.808284 sites
#Phylogenetic information threshold: PIT=L_min*H=9.705862 bits
#Recommended expected length: omega=24.335125 sites (for L_min*H=9.783400)
    # OK, use --expected-lengths 24.3.

    ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
    # We're here because the actual target coverage was not satisfactory,
    # so we're changing the --target-coverage param.  Given that we're 
    # changing that, take a guess at how we should change --expected-lengths
    # in order to also hit the total entropy target.
    cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.393 24.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.801397 sites (for L_min*H=9.783400)
    # OK, --expected-lengths 23.8
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod
#Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400)
    # ==> 27.1

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
    cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
    # FIRST ITERATION: Use ../starting-tree.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.424 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 24.3 --target-coverage 0.398 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.424 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 23.8 --target-coverage 0.393 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    para make jobList
    para time
#Completed: 139 of 139 jobs
#Average job time:                6022s     100.37m     1.67h    0.07d
#Longest finished job:           14397s     239.95m     4.00h    0.17d
#Submission to last job:         14419s     240.32m     4.01h    0.17d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: ((((((((dm2:0.015283,droSim1:0.028475):0.004064,(droYak1:0.031904,droEre1:0.033703):0.024654):0.021822,droAna2:0.128599):0.008507,dp3:0.135654):0.003923,((droVir2:0.064832,droMoj2:0.074814):0.064856,droGri1:0.130122):0.056057):0.035045,anoGam1:0.225562):0.032118,apiMel2:0.207067):0.100280,triCas1:0.100280);
#ave.noncons.mod:TREE: ((((((((dm2:0.049188,droSim1:0.086476):0.013855,(droYak1:0.101928,droEre1:0.105461):0.080386):0.076467,droAna2:0.435801):0.030536,dp3:0.463395):0.014225,((droVir2:0.218767,droMoj2:0.254568):0.225537,droGri1:0.443544):0.200405):0.126301,anoGam1:0.786660):0.117326,apiMel2:0.720501):0.348402,triCas1:0.348402);
    # SECOND ITERATION:
#ave.cons.mod:TREE: ((((((((dm2:0.015261,droSim1:0.028419):0.004049,(droYak1:0.031866,droEre1:0.033655):0.024630):0.021816,droAna2:0.128628):0.008503,dp3:0.135728):0.003923,((droVir2:0.064840,droMoj2:0.074832):0.064900,droGri1:0.130188):0.056115):0.035117,anoGam1:0.225895):0.032170,apiMel2:0.207375):0.100397,triCas1:0.100397);
#ave.noncons.mod:TREE: ((((((((dm2:0.049200,droSim1:0.086486):0.013823,(droYak1:0.101982,droEre1:0.105514):0.080420):0.076507,droAna2:0.436376):0.030537,dp3:0.464200):0.014233,((droVir2:0.219076,droMoj2:0.254936):0.225862,droGri1:0.444226):0.200721):0.126658,anoGam1:0.788468):0.117561,apiMel2:0.722456):0.349173,triCas1:0.349173);
# THIRD ITERATION:

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # FIRST ITERATION ONLY:
    # Check the total entropy figure to see if we're way off.
    # This takes an hour for 12way (exponential in #species) and has never 
    # produced a different answer from the input after the first iteration,
    # so do this for the first iteration only:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.398 24.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=24.095425 sites (for L_min*H=9.783400)
    # ==> use 24.1 instead of 24.3 below.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  The we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.phast
    cd /cluster/data/dm2/bed/multiz12way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 23.8 --target-coverage 0.393 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para make jobList
    para time
#Completed: 139 of 139 jobs
#Average job time:                  15s       0.25m     0.00h    0.00d
#Longest finished job:              19s       0.32m     0.01h    0.00d
#Submission to last job:            45s       0.75m     0.01h    0.00d

    # back on kolossus:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/dm2/bed/multiz12way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    cd /cluster/data/dm2/bed/multiz12way/phastCons
    featureBits -enrichment dm2 flyBaseGene:cds all.bed
    # FIRST ITERATION: just a bit too high, reduce target-coverage a bit:
#flyBaseGene:cds 16.567%, all.bed 35.522%, both 11.442%, cover 69.07%, enrich 1.94x
    # SECOND ITERATION: OK, this is the first time I've seen a reduction in 
    # target-coverage result in an increase in CDS coverage.  Dang.  Well, 
    # I'll take it as a sign that we're close enough (esp. since each iteration 
    # take a half-day now).
#flyBaseGene:cds 16.567%, all.bed 35.514%, both 11.446%, cover 69.09%, enrich 1.95x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed dm2 phastConsElements12way all.bed

    # Create wiggle
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.wib
    cd /cluster/data/dm2/bed/multiz12way/phastCons/run.wib
    rm -rf /san/sanvol1/scratch/dm2/phastCons/wib
    mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /san/sanvol1/scratch/dm2/phastCons/wib
zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para make jobList
    para time
#Completed: 13 of 13 jobs
#Average job time:                  10s       0.16m     0.00h    0.00d
#Longest finished job:              24s       0.40m     0.01h    0.00d
#Submission to last job:            99s       1.65m     0.03h    0.00d

    # back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from san/sanvol1
    cd /cluster/data/dm2/bed/multiz12way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /san/sanvol1/scratch/dm2/phastCons/wib .
    rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/dm2/multiz12way/wib
    cd /cluster/data/dm2/bed/multiz12way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz12way/wib/
    hgLoadWiggle dm2 phastCons12way \
      -pathPrefix=/gbdb/dm2/multiz12way/wib wib/*.wig
    rm wiggle.tab

    # and clean up san/sanvol1.
    rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
    rm -r /san/sanvol1/scratch/dm2/chrom
    # Offer raw scores for download since fly folks are likely to be interested:
    ssh kkstore01
    cd /cluster/data/dm2/bed/multiz12way/phastCons/POSTPROBS
    mkdir ../postprobsDownload
    foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
      zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
        > ../postprobsDownload/$chr.pp.gz
    end
    cd ../postprobsDownload
    md5sum *.gz > md5sum.txt
    # Make a README.txt there too.
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way
    cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way
    ln -s /cluster/data/dm2/bed/multiz12way/phastCons/postprobsDownload/* .


# MAKE 11.OOC FILE FOR BLAT (DONE 9/19/05 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh kolossus
    blat /cluster/data/dm2/dm2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/dm2/11.ooc -repMatch=100
#Wrote 3031 overused 11-mers to /cluster/bluearc/dm2/11.ooc


# BAC END PAIRS (DONE 9/20/05 angie)
    # Excerpt from an email from Roger Hoskins, 9/17/05:
    # > There are two end-sequenced BAC libraries. The RPCI-98 BAC library, 
    # > produced by Pieter and Kazu, is called "BDGP" on the Genoscope 
    # > website; individual clones are named BACR01A01 - BACR48H12 (96-well 
    # > format; R stands for EcoRI). The DrosBAC library, produced in France 
    # > for the European Drosophila Genome Project, is called "EDGP" on the 
    # > Genoscope website; individual clones are named BACN01A01 - BACN47H12 
    # > and BACH48A01-BACH61H12 (N stands for NdeII; H stands for HinDIII).
    # >
    # > It's not necessary to query GenBank for these data. The BAC end 
    # > sequences are available from the Genoscope web site, Resources page:
    # > http://www.genoscope.cns.fr/externe/English/Outils/
    # > For RPCI-98:
    # > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
    # > For DrosBAC (EDGP):
    # > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
    # >
    # > Note that if you decide to do this, it would be important to have a 
    # > fairly high stringency imposed on the sequence alignments for mapping. 
    # > BACs only go to one location in the genome; paired ends must point 
    # > toward each other and be separated by an interval of say 50kb to 250 
    ssh kolossus
    # fileserver is unhappy... so work on bluearc for now, copy over later:
    mkdir -p /cluster/bluearc/dm2/bed/bacends
    cd /cluster/bluearc/dm2/bed/bacends
    wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
    wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
    faSize *
#40016446 bases (3778317 N's 36238129 real 33260718 upper 2977391 lower) in 41500 sequences in 2 files
#Total size: mean 964.3 sd 202.7 min 1 (AL0AA030DD12A1) max 1373 (AM0AA015AE07BP1) median 1009
#N count: mean 91.0 sd 80.8
#U count: mean 801.5 sd 207.4
#L count: mean 71.7 sd 45.5
    cat banque_Projet_A* > bacends.fa
    # Edit the <!DOCTYPE.../HTML> part out of bacends.fa.
    # Fly is much smaller than human so we can get away with one job 
    # per chrom, no splitting...
    ssh kki
    cd /cluster/bluearc/dm2/bed/bacends
    mkdir out
    echo bacends.fa > bacends.lst
    ls -1S /iscratch/i/dm2/nib/* > dm2.lst
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/blat -noHead $(path1) $(path2) -ooc=/cluster/bluearc/dm2/11.ooc {check out exists out/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
    # << for emacs
    gensub2 dm2.lst bacends.lst gsub jobList
    para make jobList
    para time
#Completed: 13 of 13 jobs
#Average job time:                  79s       1.32m     0.02h    0.00d
#Longest finished job:             360s       6.00m     0.10h    0.00d
#Submission to last job:           360s       6.00m     0.10h    0.00d

    # Back on kolossus:
    cat out/*.psl \
    | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
        stdin bacEnds.unpaired.psl /dev/null
#..Processed 318892 alignments

    # Translate the particular style of FASTA headers into bacEndPairs.txt:
    perl -we 'while (<>) { \
                if (/^>(\w+)\s+BAC=(\w+)\s+END=(\w+)/) { \
                  if (! defined $bacs{$2}) { \
                    $bacs{$2}->[0] = "";  $bacs{$2}->[1] = ""; \
                  } \
                  if ($3 eq "T7") { \
                    $bacs{$2}->[0] .= "$1,"; \
                  } else { \
                    $bacs{$2}->[1] .= "$1,"; \
                  } \
                } elsif (/^>/) { \
                  die "parse:\n$_\n"; \
                } \
              } \
              foreach $bac (keys %bacs) { \
                $t7 = $bacs{$bac}->[0];  $t7 = "," if ($t7 eq ""); \
                $other = $bacs{$bac}->[1];   $other = "," if ($other eq ""); \
                print "$t7\t$other\t$bac\n"; \
              }' bacends.fa \
    > bacEndPairs.txt

    # Roger's suggested 50kb-250kb range lost some (especially on the short 
    # side), so I ended up going with Terry's range for human, 35kb-350kb:
    pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
             -slopval=10000 -hardMax=500000 \
             -slop -short -long -orphan -mismatch -verbose \
      bacEnds.unpaired.psl bacEndPairs.txt all_bacends bacEnds
    wc -l bacEnds.*
#      21 bacEnds.long
#     180 bacEnds.mismatch
#    7037 bacEnds.orphan
#   11417 bacEnds.pairs
#      37 bacEnds.short
#     126 bacEnds.slop
#  118041 bacEnds.unpaired.psl
      # (With Roger's sugg. 50kb-250kb, there were 575 short, 53 long)

    # Filter by score and sort by {chrom,chromStart}:
    awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
    cat bacEnds.{slop,short,long,mismatch,orphan} \
    | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
    wc -l *.bed
#  11410 bacEndPairs.bed
#   4842 bacEndPairsBad.bed

    # Filter the alignments into those we'll need in the all_bacends table:
    extractPslLoad -noBin bacEnds.unpaired.psl \
      bacEndPairs.bed bacEndPairsBad.bed \
    | sorttbl tname tstart | headchg -del \
    > bacEnds.load.psl

    cd ..
    mv /cluster/bluearc/dm2/bed/bacends /cluster/data/dm2/bed/bacends

    # load into database
    ssh hgwdev
    cd /cluster/data/dm2/bed/bacends
    hgLoadBed dm2 bacEndPairs bacEndPairs.bed \
      -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql 
    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed dm2 bacEndPairsBad bacEndPairsBad.bed \
      -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
    hgLoadPsl dm2 -table=all_bacends bacEnds.load.psl
    # load BAC end sequences
    mkdir -p /gbdb/dm2/bacends
    ln -s /cluster/data/dm2/bed/bacends/bacends.fa /gbdb/dm2/bacends/
    hgLoadSeq dm2 /gbdb/dm2/bacends/bacends.fa


##########################################################################
# NSCAN track - (2005-09-29 markd)  loaded proteins 2005-10-13
    cd /cluster/data/dm2/bed/nscan/
    # obtained NSCAN-EST predictions from michael brent's group at WUSTL
    wget http://genome.cse.wustl.edu/predictions/Drosophila/Dm2_09_19_05/Dm2_09_19_05.tar.gz
    tar -zxf Dm2_09_19_05.tar.gz

    # change protein fasta file to have transcript id in header
    foreach f (chr_ptx/*.ptx)
        awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
    end

    ldHgGene -gtf -genePredExt dm2 nscanGene  chr_gtf/chr*.gtf
    hgPepPred mm6 generic nscanPep chr_ptx/chr*.fix
    rm -rf chr_* *.tab

    # update trackDb; need a dm2-specific page to describe informants
    mouse/dm2/nscanGene.html    
    mouse/dm2/trackDb.ra


# BLASTZ/CHAIN/NET DROYAK2 (DONE 11/23/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
    cd /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. yakuba

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. yakuba
SEQ2_DIR=/iscratch/i/droYak2/droYak2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droYak2.2005-11-23
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /cluster/bluearc/dm2droYak2 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak2Link
#flyBaseGene 22.967%, chainDroYak2Link 91.943%, both 22.758%, cover 99.09%, enrich 1.08x
    rm -f /cluster/data/dm2/bed/blastz.droYak2
    ln -s blastz.droYak2.2005-11-23 /cluster/data/dm2/bed/blastz.droYak2


# BLASTZ/CHAIN/NET DROPER1 (DONE 11/29/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
    cd /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. persimilis

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. persimilis
SEQ2_DIR=/iscratch/i/droPer1/droPer1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droPer1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droPer1.2005-11-28
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droPer1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroPer1Link
#flyBaseGene 22.967%, chainDroPer1Link 74.744%, both 21.320%, cover 92.83%, enrich 1.24x
    rm -f /cluster/data/dm2/bed/blastz.droPer1
    ln -s blastz.droPer1.2005-11-28 /cluster/data/dm2/bed/blastz.droPer1


# BLASTZ/CHAIN/NET DROSEC1 (DONE 12/5/05 angie)
    mkdir /cluster/data/dm2/bed/blastz.droSec1.2005-11-30
    cd /cluster/data/dm2/bed/blastz.droSec1.2005-11-30
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. sechellia

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. sechellia
SEQ2_DIR=/iscratch/i/droSec1/droSec1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droSec1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droSec1.2005-11-30
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droSec1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSec1Link
#flyBaseGene 22.967%, chainDroSec1Link 92.860%, both 22.759%, cover 99.09%, enrich 1.07x
    rm -f /cluster/data/dm2/bed/blastz.droSec1
    ln -s blastz.droSec1.2005-11-30 /cluster/data/dm2/bed/blastz.droSec1


# FLYBASE 4.2 ANNOTATIONS (DONE (except for contacting flybase-help) 1/13/06 angie)
# RELOADED flyBaseNoncoding and flyBase2004Xref 5/30/06 after changing 
# extractGenes to catch some new RNA keywords after user reported loss of 
# many mir's.  (They were marked "ncRNA" in 4.1 but "miRNA" in 4.2.)  
# Now extractGenes catches [a-z]+RNA.
# RELOADED flyBaseGene 7/12/06 after user reported a problem with split
# start codons -- cause by error in extractGenes.pl that caused us not to
# notice a single-base overlap between CDS range and exon range.
    ssh kkstore01
    mkdir /cluster/data/dm2/bed/flybase4.2
    cd /cluster/data/dm2/bed/flybase4.2
    foreach c (2L 2R 3L 3R 4 X)
      wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/gff/dmel-$c-r4.2.1.gff.gz
    end
    zcat *.gff.gz > flybase.gff3
    # What data sources are represented in this file?
    grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
    # excerpt (many other sources, including blastx:... , sim4:... and 
    # tblastx:...; also various other types for source "."):
  19178 .       CDS
  63583 .       exon
  14380 .       gene
  19178 .       mRNA
     66 .       miRNA
    113 .       ncRNA
     49 .       pseudogene
     96 .       rRNA
     46 .       snRNA
     63 .       snoRNA
    294 .       tRNA
  36921 .       transcription_start_site
   6006 .       transposable_element
  33268 .       transposable_element_insertion_site
    # What keywords are defined in the 9th field?
    grep -v '^#' flybase.gff3 \
    | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
    | sort | uniq -c
 117483 Dbxref
3111982 ID
1428395 Name
 847565 Parent
  20326 cyto_range
  46928 dbxref_2nd
  15705 gbunit
      6 species
  92077 synonym
  10032 synonym_2nd
    # Wow... for the first time, it looks like the same flavor of GFF3
    # has been used as last time!
    cp ../flybase4.1/extractGenes.pl .
    extractGenes.pl flybase.gff3

    # Get predicted proteins (for main annotations only)
    wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/fasta/dmel-all-translation-r4.2.1.fasta
    perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' dmel-all-translation-r4.2.1.fasta \
      > flybasePep.fa

    ssh hgwdev
    cd /cluster/data/dm2/bed/flybase4.2
    # Protein-coding genes:
    ldHgGene -gtf dm2 flyBaseGene flybase.gtf
    hgPepPred dm2 generic flyBasePep flybasePep.fa
    # Noncoding genes:
    hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
    # Cross-referencing info for both coding and noncoding:
    hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
      flyBase2004Xref.tab
    # Some featureBits comparisons with refGene which shoudl be pretty much 
    # like version 4.0 with a few noncoding genes tossed in... but this is 
    # a bigger difference than we expect:
    featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
#516631 bases of 131698467 (0.392%) in intersection
    # Hmmm, flyBaseGene has nothing on the last 175,000bp of chrX!:
    # chrX:22,039,591-22,224,390
    # nor the first 40,000bp of chrX:
    # chrX:1-102,664
    # nor the last 270,000bp of chr3L:
    # chr3L:23,273,167-23,771,897
    # nor the first 500,000bp of chr2R:
    # chr2R:1-600,000
    # Also quite a few on the chr*h but I won't look at all those.
#TODO: contact flybase-help about the lack of annotations in those ranges...

    # This shows only the dropped isoforms/duplicates (ignores noncoding):
    featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
       -bed=stdout
#chr3R   15620310        15621493        chr3R.1
#chr2L   4698094 4700940 chr2L.1
#chrX    3638700 3639895 chrX.1
#474282 bases of 131698467 (0.360%) in intersection

    # add upstream* downloadable files
    cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
    foreach size (1000 2000 5000)
      echo upstream$size
      nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
      | nice gzip -c > upstream$size.fa.gz
    end
    nice md5sum *.zip *.gz > md5sum.txt

    # 2/6/06: fill in missing symbol values with accession so hgNear
    # has something to draw a link on...
    hgsql dm2 -e 'update flyBase2004Xref set symbol = name where symbol = ""'

    # 7/12/06: reload flyBaseGene after correcting extractGenes.pl.
    ldHgGene -gtf dm2 flyBaseGene flybase.gtf


# FLYBASE ACODE CROSS-REFERENCING DATA (DONE 1/27/06 angie)
    # 10/11/06 -- reloaded after updating flyBaseGene, but no change.
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/flyBase
    cd /cluster/data/dm2/bed/flyBase
    wget -O genes.txt.061011 \
      ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt
    # Had to edit genes.txt.060126 to get around these two lines that didn't 
    # start with acode symbols:
    #line 1609033
#Spiracle sheaths dark.
    #line 3664838 (3664839 of original)
#Ommatidia are no
    hgFlyBase dm2 genes.txt.061011 -geneTable=flyBaseGene 
#Processed 57068 records in 4067314 lines


# MAP UNIPROT IDS TO CG*-R* IDS (DONE 1/27/06 angie - REDONE 10/11/06)
    # The UniProt.description table actually contains the most specific 
    # mapping of UniProt IDs to CG*-R* transcript IDs.  Use that when 
    # available, and otherwise use the more generic mapping to gene IDs 
    # that we extracted from the acode into fbUniProt.
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/uniProtToFlyBase
    cd /cluster/data/dm2/bed/uniProtToFlyBase
    set taxon = `hgsql -N uniProt -e 'select id from taxon where binomial = "Drosophila melanogaster";'`
    # Grab UniProt descriptions of all D. mel. proteins so we can parse out
    # CG*-R* transcript IDs (and CG* gene IDs when -R* not specified).
    hgsql -N uniProt -e \
     'select description.* from description,accToTaxon where accToTaxon.taxon = '$taxon' and accToTaxon.acc = description.acc' \
      > uniProt.description.txt
    # fbUniProt maps FBgn* IDs to multiple UniProt IDs; use flyBase2004Xref
    # to map FBgn* to CG* (actually CG*-R* which implies more precision than
    # there really is).
    hgsql -N dm2 -e \
     'select name,uniProtId from flyBase2004Xref,fbUniProt where fbgn=geneId' \
      | sort > flyBaseMapping.txt
    # Grab the complete list of CG*-R* transcript IDs:
    hgsql -N dm2 -e 'select name from flyBaseGene' | sort > transcriptIds.txt
    ssh kolossus
    cd /cluster/data/dm2/bed/uniProtToFlyBase
    # Wrote a perl script, gleanDescription.pl, to parse the various patterns
    # used to encode real mappings (not mappings to homologs or things that 
    # look like CG*-P? IDs but aren't really).
    chmod a+x gleanDescription.pl
    ./gleanDescription.pl uniProt.description.txt | sort > uniProtMapping.txt
    # Look for duplicates:
    awk '{print $1;}' uniProtMapping.txt | uniq -c | awk '$1 != 1 {print;}'
#       2 CG17131-RA
#       5 CG32637-RA
    grep CG17131-PA uniProt.description.txt 
#Q8MS37  RE15579p (CG17131-PA).
#Q7PLU3  CG17131-PA.3.
    # I looked up both Q8MS37 and Q7PLU3 on http://www.pir.uniprot.org/
    # and both do mention CG17131-PA... Q8MS37 has been updated more recently
    # so I'll just delete the Q7PLU3 mapping.
    grep CG32637-PA uniProt.description.txt
#Q5K354  CG32637-PA protein (Fragment).
#Q5K356  CG32637-PA protein (Fragment).
#Q5K357  CG32637-PA protein (Fragment).
#Q5K361  CG32637-PA protein (Fragment).
#Q8IR71  CG32637-PA.
    # Here it's more clear-cut -- delete the fragments.
    egrep -v '(Q7PLU3|Q5K354|Q5K356|Q5K357|Q5K361)' uniProtMapping.txt \
      > tmp; mv tmp uniProtMapping.txt
    # Wrote a perl script, vagueDescription.pl, to parse out CG* without 
    # isoform specs -- can use those as backups when we don't have a more 
    # specific mapping for a transcript.
    chmod a+x vagueDescription.pl
    ./vagueDescription.pl uniProt.description.txt | sort \
      > uniProtMappingVague.txt
    # Write a perl script, combineMappings.pl, to add any additional mappings
    # available from acode to the UniProt mappings.
    chmod a+x combineMappings.pl
    ./combineMappings.pl transcriptIds.txt \
      uniProtMapping.txt uniProtMappingVague.txt flyBaseMapping.txt \
      > flyBaseToUniProt.txt
    # Make a 2-column version for loading:
    awk '{print $1 "\t" $2;}' flyBaseToUniProt.txt > flyBaseToUniProtLoad.txt
    # Load into a genericAlias-type table (this ignores the third column).
    ssh hgwdev
    cd /cluster/data/dm2/bed/uniProtToFlyBase
    sed -e 's/genericAlias/flyBaseToUniProt/' \
      ~/kent/src/hg/lib/genericAlias.sql > flyBaseToUniProt.sql
    hgLoadSqlTab dm2 flyBaseToUniProt flyBaseToUniProt.sql \
      flyBaseToUniProtLoad.txt


##########################################################################
# HGNEAR TABLES

# HGNEAR PROTEIN BLAST TABLES (DONE 1/31/06 angie - REDONE 4/6/07)
# mmBlastTab redone 5/1/06 -- makeMm8.doc run used outdated proteins
# hgBlastTab redone 5/9/06 -- makeHg18.doc run used outdated proteins
# Redone 4/3/07 with flybase4.3; mitochondrial proteins yanked 4/6/07 
# because they aren't in flyBase2004Xref and were causing joinerCheck errors.
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/hgNearBlastp
    cd /cluster/data/dm2/bed/hgNearBlastp
    mkdir 070403
    cd 070403
    # Get the proteins used by the other hgNear organisms:
    pepPredToFa hg18 knownGenePep hg18.known.faa
    pepPredToFa mm8 knownGenePep mm8.known.faa
    pepPredToFa rn4 knownGenePep rn4.known.faa
    pepPredToFa danRer3 ensPep danRer3.ensPep.faa
    pepPredToFa dm2 flyBasePep dm2.flyBasePep.faa
    pepPredToFa ce2 sangerPep ce2.sangerPep.faa
    pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa

    # Galt's synBlastp.csh filtering, which uses liftOver chains,
    # would be the next step if dm2 were reasonably closely related
    # (like mammal-mammal) to another Gene Sorter organism.  But it is 
    # pretty distant from all the others, so specify recipBest for all
    # query databases.
    cat << _EOF_ > config.ra
# Latest fly vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, worm, yeast

targetGenesetPrefix flyBase
targetDb dm2
queryDbs hg18 mm8 rn4 danRer3 ce2 sacCer1
recipBest hg18 mm8 rn4 danRer3 ce2 sacCer1

dm2Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/dm2.flyBasePep.faa
hg18Fa    /cluster/data/dm2/bed/hgNearBlastp/070403/hg18.known.faa
mm8Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/mm8.known.faa
rn4Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/rn4.known.faa
danRer3Fa /cluster/data/dm2/bed/hgNearBlastp/070403/danRer3.ensPep.faa
ce2Fa     /cluster/data/dm2/bed/hgNearBlastp/070403/ce2.sangerPep.faa
sacCer1Fa /cluster/data/dm2/bed/hgNearBlastp/070403/sacCer1.sgdPep.faa

buildDir /cluster/data/dm2/bed/hgNearBlastp/070403
scratchDir /san/sanvol1/scratch/dm2HgNearBlastp
_EOF_
    # Run with -noLoad so we can eyeball files, manually load dm2 tables now,
    # and later overload other databases' dmBlastTab tables.
    ~/kent/src/hg/utils/automation/doHgNearBlastp.pl -noLoad -workhorse kkr4u00 config.ra >& do.log &
    tail -f do.log
    # Ran these manually:
# *** -noLoad was specified -- you can run this script manually to load dm2 tables:
        run.dm2.dm2/loadPairwise.csh
#Loading database with 1171077 rows

# *** -noLoad was specified -- you can run these scripts manually to load dm2 tables:
        run.dm2.hg18/loadPairwise.csh
#Loading database with 5833 rows
        run.dm2.mm8/loadPairwise.csh
#Loading database with 5557 rows
        run.dm2.rn4/loadPairwise.csh
#Loading database with 3015 rows
        run.dm2.danRer3/loadPairwise.csh
#Loading database with 5253 rows
        run.dm2.ce2/loadPairwise.csh
#Loading database with 4537 rows
        run.dm2.sacCer1/loadPairwise.csh
#Loading database with 2172 rows

    # 4/6/07: remove mitochondrial genes from *BlastTab because they 
    # are missing from flyBase2004Xref and caused joinerCheck errors.
    # Would have been better to have joinerCheck'd flyBasePep before 
    # running doHgNearBlastp.
    hgsql dm2 -e 'delete from flyBaseBlastTab where target like "mt:%"'
    foreach table (hgBlastTab mmBlastTab rnBlastTab \
                   drBlastTab ceBlastTab scBlastTab)
      echo $table
      hgsql dm2 -e 'delete from '$table' where query like "mt:%"'
    end

    # 4/6/07: dm2 FlyBase/hgNear/hgGene update is in the pushQ, so I think 
    # it's a fine time to update dmBlastTab in other databases with the 
    # latest, so hgwdev is all consistent.  When that push is done, will 
    # need another push request for *.dmBlastTab loaded here.
# *** -noLoad was specified -- you can run these scripts manually to load dmBlastTab in query databases:
        run.hg18.dm2/loadPairwise.csh
        run.mm8.dm2/loadPairwise.csh
        run.rn4.dm2/loadPairwise.csh
        run.danRer3.dm2/loadPairwise.csh
        run.ce2.dm2/loadPairwise.csh
        run.sacCer1.dm2/loadPairwise.csh
    # Yank the "mt:%" targets from those to avoid joinerCheck errors.
    foreach db (hg18 mm8 rn4 danRer3 ce2 sacCer1)
      hgsql $db -e 'delete from dmBlastTab where target like "mt:%"'
    end


# CLUSTER GENES AND MAP TO OTHER DATA (DONE 2/6/06 angie - REDONE 4/3/07)
# flyBaseToRefSeq regenerated 5/9/06
    ssh hgwdev
    # First we have to add a proteinId column to flyBaseGene.
    hgClusterGenes dm2 flyBaseGene flyBaseIsoforms flyBaseCanonical \
      -protAccQuery='select name,alias from flyBaseToUniProt'
#Got 13566 clusters, from 19452 genes in 13 chromosomes
    # Create table that maps between flyBase genes and RefSeq
    # Re-ran 5/9/06 to resolve some joinerCheck errors about retired refSeqs:
    hgMapToGene dm2 refGene flyBaseGene flyBaseToRefSeq
    # Create table that maps between known genes and Pfam domains
    hgMapViaSwissProt dm2 flyBaseToUniProt name alias Pfam flyBaseToPfam

    # Create a table that maps between flyBase genes and the 
    # Stanford Microarray Project expression data. (see makeHgFixed.doc)
    hgsql dm2 -e 'drop table if exists flyBaseToCG; create table flyBaseToCG \
      select name,SUBSTRING_INDEX(name,"-",1) as value from flyBaseGene'
    hgsql dm2 -e 'create index name on flyBaseToCG(name(12))'
    hgExpDistance dm2 hgFixed.arbFlyLifeMedianRatio dummyArg \
      arbExpDistance -lookup=flyBaseToCG
#TODO: add the yale expr data too (feep)

    # Make sure that GO database is up to date.
    See README in /cluster/store1/geneOntology.


# MAKE A DESCRIPTION TABLE FOR HGNEAR KNOWNDETAILS (DONE 2/6/06 - REDONE 4/3/07)
    # hgNear's knownDetails column type expects a single table that links 
    # transcript ID with uniProt description, so whip one up here:
    # (used by hgGene too though hgGene could join on the fly)
    ssh hgwdev
    hgsql dm2 -e 'drop table if exists flyBaseToDescription; \
      create table flyBaseToDescription \
      select flyBaseToUniProt.name,uniProt.description.val \
      from flyBaseToUniProt,uniProt.description \
      where flyBaseToUniProt.alias = uniProt.description.acc'
    hgsql dm2 -e 'create index name on flyBaseToDescription(name(12))'


# FLYP2P (DONE 2/6/06 angie)
    # See hg/near/makeNear.doc...


# MAKE ORGANISM-SPECIFIC HGNEARDATA AND HGGENEDATA FILES (DONE 2/7/06 angie)
    cd ~/kent/src/hg/near/hgNear/hgNearData
    # Made dm1 and dm2 subdirs of D_melanogaster with separate sets of 
    # .ra files... see cvs revision history.
    # Similarly for hgGene/hgGeneData.


# ENABLE HGNEAR FOR DM2 IN HGCENTRALTEST (DONE 2/7/06 angie)
    hgsql -h genome-testdb hgcentraltest \
      -e  "update dbDb set hgNearOk = 1 where name = 'dm2';"

# REDO DM2-HG18 PAIRWISE BLAST (DONE 2/24/06 angie)
    # Oops, hgBlastTab got overwritten with dm2 4.1 genes by the 
    # hg18 build process.  Tweak back to 4.2:
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/hgNearBlastp
    cd /cluster/data/dm2/bed/hgNearBlastp
    cat << _EOF_ > config.ra.hg18redo
# Redo vs. hg18 with 4.2 genes:

targetGenesetPrefix flyBase
targetDb dm2
queryDbs hg18

dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa

buildDir /cluster/data/dm2/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/dm2HgNearBlastp
_EOF_
    doHgNearBlastp.pl -noSelf config.ra.hg18redo >& do.log & tail -f do.log

# END OF HGNEAR STUFF
#############################################################################


# RECIPROCAL BEST DM2-DROSIM1 (DONE 2/28/06 angie - REDONE 6/19/06)
    # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago)
    ssh kolossus
    cd /cluster/data/dm2/bed/blastz.droSim1/axtChain
    # Swap dm2-best chains to be droSim1-referenced:
    chainStitchId dm2.droSim1.over.chain.gz stdout \
    | chainSwap stdin stdout \
    | chainSort stdin droSim1.dm2.tBest.chain
    # Net those on droSim1 to get droSim1-ref'd reciprocal best net:
    chainPreNet droSim1.dm2.tBest.chain \
      /cluster/data/{droSim1,dm2}/chrom.sizes stdout \
    | chainNet -minSpace=1 stdin /cluster/data/{droSim1,dm2}/chrom.sizes \
        stdout /dev/null \
    | netSyntenic stdin droSim1.dm2.rbest.net
    # Extract droSim1-ref'd reciprocal best chain:
    netChainSubset droSim1.dm2.rbest.net droSim1.dm2.tBest.chain stdout \
    | chainStitchId stdin stdout \
    | chainSort stdin droSim1.dm2.rbest.chain
    # Swap to get dm2-ref'd reciprocal best chain:
    chainSwap droSim1.dm2.rbest.chain stdout \
    | chainSort stdin dm2.droSim1.rbest.chain
    # Net those on dm2 to get dm2-ref'd reciprocal best net:
    chainPreNet dm2.droSim1.rbest.chain \
      /cluster/data/{dm2,droSim1}/chrom.sizes stdout \
    | chainNet -minSpace=1 -minScore=0 \
      stdin /cluster/data/{dm2,droSim1}/chrom.sizes stdout /dev/null \
    | netSyntenic stdin dm2.droSim1.rbest.net
    # Compress and clean up.
    rm droSim1.dm2.tBest.chain
    gzip *.rbest.*
    md5sum *.rbest.*.gz > md5sum.txt

    # Test coverage of *.rbest.* -- should be equal.
    netToBed -maxGap=1 droSim1.dm2.rbest.net.gz droSim1.dm2.rbest.net.bed
    netToBed -maxGap=1 dm2.droSim1.rbest.net.gz dm2.droSim1.rbest.net.bed
    chainToPsl droSim1.dm2.rbest.chain.gz \
      /cluster/data/{droSim1,dm2}/chrom.sizes \
      /cluster/data/droSim1/droSim1.2bit /cluster/data/dm2/dm2.2bit \
      droSim1.dm2.rbest.chain.psl
    chainToPsl dm2.droSim1.rbest.chain.gz \
      /cluster/data/{dm2,droSim1}/chrom.sizes \
      /cluster/data/dm2/dm2.2bit /cluster/data/droSim1/droSim1.2bit \
      dm2.droSim1.rbest.chain.psl
    ssh hgwdev
    cd /cluster/data/dm2/bed/blastz.droSim1/axtChain
    featureBits droSim1 droSim1.dm2.rbest.net.bed
#104047585 bases of 127256433 (81.762%) in intersection
    featureBits droSim1 droSim1.dm2.rbest.chain.psl
#104047585 bases of 127256433 (81.762%) in intersection
    featureBits dm2     dm2.droSim1.rbest.chain.psl
#104047585 bases of 131698467 (79.004%) in intersection
    featureBits dm2     dm2.droSim1.rbest.net.bed
#104047585 bases of 131698467 (79.004%) in intersection
    mkdir experiments
    mv *.bed *.psl experiments

    # Put out for download (hgwdev-only)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest
    cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest
    rm -f *.gz md5sum.txt
    ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/*.rbest.*.gz .
    ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/md5sum.txt .
    # Make a README.txt.


# RECIPROCAL BEST DM2-DROYAK2 (DONE 6/20/06 angie)
    # Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago)
    ssh kolossus
    cd /cluster/data/dm2/bed/blastz.droYak2/axtChain
    # Swap dm2-best chains to be droYak2-referenced:
    chainStitchId dm2.droYak2.over.chain.gz stdout \
    | chainSwap stdin stdout \
    | chainSort stdin droYak2.dm2.tBest.chain
    # Net those on droYak2 to get droYak2-ref'd reciprocal best net:
    chainPreNet droYak2.dm2.tBest.chain \
      /cluster/data/{droYak2,dm2}/chrom.sizes stdout \
    | chainNet -minSpace=1 stdin /cluster/data/{droYak2,dm2}/chrom.sizes \
        stdout /dev/null \
    | netSyntenic stdin droYak2.dm2.rbest.net
    # Extract droYak2-ref'd reciprocal best chain:
    netChainSubset droYak2.dm2.rbest.net droYak2.dm2.tBest.chain stdout \
    | chainStitchId stdin droYak2.dm2.rbest.chain
    # Swap to get dm2-ref'd reciprocal best chain:
    chainSwap droYak2.dm2.rbest.chain stdout \
    | chainSort stdin dm2.droYak2.rbest.chain
    # Net those on dm2 to get dm2-ref'd reciprocal best net:
    chainPreNet dm2.droYak2.rbest.chain \
      /cluster/data/{dm2,droYak2}/chrom.sizes stdout \
    | chainNet -minSpace=1 -minScore=0 \
      stdin /cluster/data/{dm2,droYak2}/chrom.sizes stdout /dev/null \
    | netSyntenic stdin dm2.droYak2.rbest.net
    # Compress and clean up.
    rm droYak2.dm2.tBest.chain
    gzip *.rbest.*
    md5sum *.rbest.*.gz > md5sum.rbest.txt

    # Test coverage of *.rbest.* -- should be equal.
    netToBed -maxGap=1 droYak2.dm2.rbest.net.gz droYak2.dm2.rbest.net.bed
    netToBed -maxGap=1 dm2.droYak2.rbest.net.gz dm2.droYak2.rbest.net.bed
    chainToPsl droYak2.dm2.rbest.chain.gz \
      /cluster/data/{droYak2,dm2}/chrom.sizes \
      /cluster/data/droYak2/droYak2.2bit /cluster/data/dm2/dm2.2bit \
      droYak2.dm2.rbest.chain.psl
    chainToPsl dm2.droYak2.rbest.chain.gz \
      /cluster/data/{dm2,droYak2}/chrom.sizes \
      /cluster/data/dm2/dm2.2bit /cluster/data/droYak2/droYak2.2bit \
      dm2.droYak2.rbest.chain.psl
    ssh hgwdev
    cd /cluster/data/dm2/bed/blastz.droYak2/axtChain
    featureBits droYak2 droYak2.dm2.rbest.net.bed
#105927235 bases of 162681153 (65.113%) in intersection
    featureBits droYak2 droYak2.dm2.rbest.chain.psl
#105927235 bases of 162681153 (65.113%) in intersection
    featureBits dm2     dm2.droYak2.rbest.chain.psl
#105927235 bases of 131698467 (80.432%) in intersection
    featureBits dm2     dm2.droYak2.rbest.net.bed
#105927235 bases of 131698467 (80.432%) in intersection
    mkdir experiments
    mv *.bed *.psl experiments

    # Put out for download (hgwdev-only)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest
    cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest
    rm -f *.gz md5sum.txt
    ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/*.rbest.*.gz .
    ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/md5sum.rbest.txt md5sum.txt
    # Make a README.txt.


########################################################################
### microRNA targets tracks  (DONE - 2006-03-29 - 2006-04-26 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/picTar
    cd /cluster/data/dm2/bed/picTar
    wget --timestamping \
	'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S1_bed' \
	    -O newMelanogasterS1.bed

    wget --timestamping \
	'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S3_bed' \
	-O newMelanogasterS3.bed

    grep -v "^track" newMelanogasterS1.bed \
	| hgLoadBed -strict dm2 picTarMiRNAS1 stdin
    #	Loaded 24103 elements of size 9
    grep -v "^track" newMelanogasterS3.bed \
	| hgLoadBed -strict dm2 picTarMiRNAS3 stdin
    #	Loaded 11480 elements of size 9

    nice -n +19 featureBits dm2 picTarMiRNAS1
    #	59881 bases of 131698467 (0.045%) in intersection
    nice -n +19 featureBits dm2 picTarMiRNAS3
    #	26231 bases of 131698467 (0.020%) in intersection


# FLYBASE IN SITU IMAGES / EXPRESSION (DONE 5/3/06 angie - REDONE 4/7/07)
    # FlyBase has downloadable in situ images for BACs:
    # ftp://flybase.net/flybase/images/bac_insitu_pic/*
    # and FBti's:
    # ftp://flybase.net/flybase/images/in-situ-images/insitus.zip
    # but what Jim is interested in is the insitus for expression data.
    # Don't see that in the ftp listing, but they do make it easy to link in.
    ssh hgwdev
    cd /cluster/data/dm2/bed/flyBase
    wget -O summary.txt.070406 \
    'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext'
    # remove header line.
    hgLoadSqlTab dm2 bdgpExprLink ~/kent/src/hg/lib/bdgpExprLink.sql \
      summary.txt.070406


###########################################################################
# LIFTOVER CHAINS DM2 -> DM3 (DONE 8/1/06 angie)
    # Started 8/1 -- This creates the directory and scripts:
    ~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 -debug
    # Run it for real and log it:
    cd /cluster/data/dm2/bed/blat.dm3.2006-08-01
    ~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 \
       >& do.log & tail -f do.log


###########################################################################
# BLASTZ/CHAIN/NET DROERE2 (DONE 9/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.droEre2.2006-09-27
    cd /cluster/data/dm2/bed/blastz.droEre2.2006-09-27
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. erecta

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. erecta
SEQ2_DIR=/iscratch/i/droEre2/droEre2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droEre2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droEre2.2006-09-27
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droEre2 >& do.log &
    tail -f do.log

    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre2Link
#flyBaseGene 23.333%, chainDroEre2Link 90.665%, both 23.063%, cover 98.84%, enrich 1.09x
    rm -f /cluster/data/dm2/bed/blastz.droEre2
    ln -s blastz.droEre2.2006-09-27 /cluster/data/dm2/bed/blastz.droEre2


###########################################################################
# BLASTZ/CHAIN/NET DROGRI2 (DONE 9/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.droGri2.2006-09-27
    cd /cluster/data/dm2/bed/blastz.droGri2.2006-09-27
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. grimshawi

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. grimshawi
SEQ2_DIR=/iscratch/i/droGri2/droGri2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droGri2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droGri2.2006-09-27
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droGri2 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri2Link
#flyBaseGene 23.333%, chainDroGri2Link 63.808%, both 20.793%, cover 89.12%, enrich 1.40x
    rm -f /cluster/data/dm2/bed/blastz.droGri2
    ln -s blastz.droGri2.2006-09-27 /cluster/data/dm2/bed/blastz.droGri2


###########################################################################
# BLASTZ/CHAIN/NET DROWIL1 (DONE 9/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.droWil1.2006-09-27
    cd /cluster/data/dm2/bed/blastz.droWil1.2006-09-27
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. willistoni

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. willistoni
SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droWil1/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droWil1.2006-09-27
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /cluster/bluearc/dm2droWil1 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroWil1Link
#flyBaseGene 23.333%, chainDroWil1Link 67.722%, both 21.106%, cover 90.45%, enrich 1.34x
    rm -f /cluster/data/dm2/bed/blastz.droWil1
    ln -s blastz.droWil1.2006-09-27 /cluster/data/dm2/bed/blastz.droWil1


###########################################################################
# BLASTZ/CHAIN/NET DROANA3 (DONE 9/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.droAna3.2006-09-27
    cd /cluster/data/dm2/bed/blastz.droAna3.2006-09-27
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna3/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droAna3.2006-09-27
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droAna3 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna3Link
#flyBaseGene 23.333%, chainDroAna3Link 79.307%, both 22.211%, cover 95.19%, enrich 1.20x
    rm -f /cluster/data/dm2/bed/blastz.droAna3
    ln -s blastz.droAna3.2006-09-27 /cluster/data/dm2/bed/blastz.droAna3


###########################################################################
# BLASTZ/CHAIN/NET DROMOJ3 (DONE 9/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
    cd /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. mojavensis

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj3/droMoj3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droMoj3/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droMoj3 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj3Link
#flyBaseGene 23.333%, chainDroMoj3Link 60.956%, both 20.676%, cover 88.61%, enrich 1.45x
    rm -f /cluster/data/dm2/bed/blastz.droMoj3
    ln -s blastz.droMoj3.2006-09-27 /cluster/data/dm2/bed/blastz.droMoj3


###########################################################################
# BLASTZ/CHAIN/NET DROVIR3 (DONE 9/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.droVir3.2006-09-27
    cd /cluster/data/dm2/bed/blastz.droVir3.2006-09-27
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. virilis

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. virilis
SEQ2_DIR=/iscratch/i/droVir3/droVir3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droVir3/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.droVir3.2006-09-27
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm2droVir3 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir3Link
#flyBaseGene 23.333%, chainDroVir3Link 62.998%, both 20.854%, cover 89.38%, enrich 1.42x
    rm -f /cluster/data/dm2/bed/blastz.droVir3
    ln -s blastz.droVir3.2006-09-27 /cluster/data/dm2/bed/blastz.droVir3


###########################################################################
# BLASTZ/CHAIN/NET DP4 (DONE 9/27/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.dp4.2006-09-27
    cd /cluster/data/dm2/bed/blastz.dp4.2006-09-27
    cat << '_EOF_' > DEF
# D. melanogaster vs. D. pseudoobscura

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp4/dp4.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp4/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.dp4.2006-09-27
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /cluster/bluearc/dm2dp4 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp4Link
#flyBaseGene 23.333%, chainDp4Link 73.891%, both 21.518%, cover 92.22%, enrich 1.25x
    rm -f /cluster/data/dm2/bed/blastz.dp4
    ln -s blastz.dp4.2006-09-27 /cluster/data/dm2/bed/blastz.dp4


###########################################################################
# BLASTZ/CHAIN/NET TRICAS2 (DONE 9/28/06)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/blastz.triCas2.2006-09-28
    cd /cluster/data/dm2/bed/blastz.triCas2.2006-09-28    
    cat <<_EOF_ > DEF
# D. melanogaster vs. T. castaneum

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes

# QUERY - T. castaneum
SEQ2_DIR=/iscratch/i/triCas2/triCas2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/iscratch/i/triCas2/chrom.sizes

BASE=/cluster/data/dm2/bed/blastz.triCas2.2006-09-28
_EOF_

    doBlastzChainNet.pl DEF \
      -blastzOutRoot /cluster/bluearc/dm2triCas2 >& do.log &
    tail -f do.log
    featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainTriCas2Link
#flyBaseGene 23.333%, chainTriCas2Link 16.438%, both 10.740%, cover 46.03%, enrich 2.80x
    rm -f /cluster/data/dm2/bed/blastz.triCas2
    ln -s blastz.triCas2.2006-09-28 /cluster/data/dm2/bed/blastz.triCas2


###########################################################################
# MULTIZ15WAY (DONE 9/28/06 angie)
# (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28

    # copy MAF's to cluster-friendly server
    mkdir /san/sanvol1/scratch/dm2/mafNet
    foreach s (droSim1 droSec1 droYak2 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 anoGam1 apiMel2 triCas2)
      echo $s
      rsync -av /cluster/data/dm2/bed/blastz.$s/mafNet/* \
        /san/sanvol1/scratch/dm2/mafNet/$s/
    end

    echo '(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)' \
      > tree-commas.nh
    sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
    sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst

    ssh pk
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28
    mkdir maf run
    cd run

    # stash binaries 
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
    set db = dm2
    set c = $1
    set maf = $2
    set run = `pwd`
    set tmp = /scratch/tmp/$db/multiz.$c
    set pairs = /san/sanvol1/scratch/$db/mafNet
    rm -fr $tmp
    mkdir -p $tmp
    cp ../{tree.nh,species.lst} $tmp
    pushd $tmp
    foreach s (`cat species.lst`)
        set in = $pairs/$s/$c.maf
        set out = $db.$s.sing.maf
        if ($s == $db) then
            continue
        endif
        if (-e $in.gz) then
            zcat $in.gz > $out
        else if (-e $in) then
            cp $in $out
        else
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    end
    set path = ($run/penn $path); rehash
    $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $maf
    rm -fr $tmp
'EOF'
    # << emacs
    chmod +x autoMultiz.csh

cat  << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/dm2/bed/multiz15way.2006-09-28/maf/$(root1).maf}
#ENDLOOP
'EOF'
    # << emacs

    awk '{print $1}' /cluster/data/dm2/chrom.sizes > chrom.lst
    gensub2 chrom.lst single spec jobList
    para make jobList
    para time
#Completed: 13 of 13 jobs
#CPU time in finished jobs:      37955s     632.58m    10.54h    0.44d  0.001 y
#IO & Wait Time:                   315s       5.25m     0.09h    0.00d  0.000 y
#Average job time:                2944s      49.06m     0.82h    0.03d
#Longest finished job:            9401s     156.68m     2.61h    0.11d
#Submission to last job:          9429s     157.15m     2.62h    0.11d


# ANNOTATE MULTIZ15WAY MAF AND LOAD TABLES (DONE 9/28/2006 angie)
    ssh kolossus
    mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/anno
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno
    mkdir maf run
    cd run
    rm -f sizes nBeds
    foreach db (`cat /cluster/data/dm2/bed/multiz15way.2006-09-28/species.lst`)
      ln -s  /cluster/data/$db/chrom.sizes $db.len
      if (! -e /cluster/data/$db/$db.N.bed) then
        twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
      endif
      ln -s  /cluster/data/$db/$db.N.bed $db.bed
      echo $db.bed  >> nBeds
      echo $db.len  >> sizes
    end
    echo date > jobs.csh
    # do smaller jobs first:
    foreach f (`ls -1rS ../../maf/*.maf`)
      echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
        /cluster/data/dm2/dm2.2bit ../maf/`basename $f` >> jobs.csh
    end 
    echo date >> jobs.csh
    csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
    # 4min.

    # Load anno/maf
    ssh hgwdev
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
    mkdir -p /gbdb/dm2/multiz15way/anno/maf
    ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf/*.maf \
      /gbdb/dm2/multiz15way/anno/maf
    date
    nice hgLoadMaf -pathPrefix=/gbdb/dm2/multiz15way/anno/maf dm2 multiz15way
    date
    # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
    # machine and then load on hgwdev:
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
    cat *.maf \
    | nice hgLoadMafSummary dm2 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
             -test multiz15waySummary stdin
#Created 112880 summary blocks from 10610488 components and 1151094 mafs from stdin
    ssh hgwdev
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
    sed -e 's/mafSummary/multiz15waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
      > /tmp/multiz15waySummary.sql
    time nice hgLoadSqlTab dm2 multiz15waySummary /tmp/multiz15waySummary.sql \
      multiz15waySummary.tab
#0.000u 0.001s 0:01.59 0.0%      0+0k 0+0io 0pf+0w
    rm *.tab
    ln -s multiz15way.2006-09-28 /cluster/data/dm2/bed/multiz15way


# MULTIZ15WAY DOWNLOADABLES (UNANNOTATED) (DONE 10/3/2006 angie)
    # Annotated MAF is now documented, so use anno/maf for downloads.
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
    # upstream mafs 
cat > mafFrags.csh << 'EOF'
    date
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        nice featureBits dm2 refGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        nice mafFrags dm2 multiz15way up.bed upstream$i.maf \
                -orgs=../species.lst
        rm up.bed
    end
    date
'EOF'
    # << emacs
    time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#86.335u 42.090s 6:47.44 31.5%   0+0k 0+0io 0pf+0w

    ssh kolossus
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
cat > downloads.csh << 'EOF'
    date
    foreach f (../anno/maf/chr*.maf)
        set c = $f:t:r
        echo $c
        nice gzip -c $f > $c.maf.gz
    end
    md5sum *.gz > md5sum.txt
    date
'EOF'
    # << emacs
    time csh -efx downloads.csh >&! downloads.log
#325.699u 4.199s 5:34.17 98.7%   0+0k 0+0io 1pf+0w
    nice gzip up*.maf
    nice md5sum up*.maf.gz >> md5sum.txt
    # Make a README.txt .

    ssh hgwdev
    set dir = /usr/local/apache/htdocs/goldenPath/dm2/multiz15way
    mkdir $dir
    ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads/*.{gz,txt} $dir


# MULTIZ15WAY MAF FRAMES (DONE 9/29/2006 angie)
# multiz15wayFrames regenerated 11/28/06.
    ssh hgwdev
    mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/frames
    cd /cluster/data/dm2/bed/multiz15way.2006-09-28/frames
    # The following is adapted from MarkD's Makefile used for mm7...

    #------------------------------------------------------------------------
    # get the genes for all genomes (when available)
    # currently we have nothing for:
    # droSec1 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 triCas2

    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using mrna table as genes: droSim1 droYak2 anoGam1 apiMel2
    mkdir genes
    foreach queryDb (droSim1 droYak2 anoGam1 apiMel2)
      set tmpExt = `mktemp temp.XXXXXX`
      set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
      set tmpMrna = ${queryDb}.mrna.${tmpExt}
      set tmpCds = ${queryDb}.cds.${tmpExt}
      echo $queryDb
      hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                   from all_mrna,gbCdnaInfo,cds \
                   where (all_mrna.qName = gbCdnaInfo.acc) and \
                     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
       ${queryDb} > ${tmpMrnaCds}
      cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
      cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
      mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
        stdout \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$queryDb.tmp.gz
      rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
      mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
      rm -f $tmpExt
    end
    # using refGene for dm2
    # genePreds; (must keep only the first 10 columns for knownGene)
    foreach queryDb (dm2)
      if ($queryDb == "dm2") then
        set geneTbl = refGene
      endif
      hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$queryDb.tmp.gz
      mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
      rm -f $tmpExt
    end

    # MarkD's 1-pass maf methodology
    # This section was re-run *with* dm2 genes (oops, thanks Brooke) 11/28/06.
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz15way/frames
    nice tcsh # easy way to get process niced
    (cat  ../anno/maf/*.maf \
     | time genePredToMafFrames dm2 stdin stdout \
         dm2 genes/dm2.gp.gz \
         droSim1 genes/droSim1.gp.gz droYak2 genes/droYak2.gp.gz \
         anoGam1 genes/anoGam1.gp.gz apiMel2 genes/apiMel2.gp.gz \
     | gzip -c > multiz15way.mafFrames.gz) >& frames.log &

    ssh hgwdev
    cd /cluster/data/dm2/bed/multiz15way/frames
    hgLoadMafFrames dm2 multiz15wayFrames multiz15way.mafFrames.gz


# PHASTCONS 15WAY WITH METHODS FROM PAPER (DONE 10/3/06 angie)
# (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)
# NOTE FROM QA (5/7/08):  Overlapping data points were found in dm3 phastCons
# data by user Michael Hiller:
# http://www.soe.ucsc.edu/pipermail/genome/2008-April/016241.html
# The overlapping data points were caused by a bug in phastCons that Adam has
# since fixed.  The bug is present in dm2 phastCons data, but as it is not the
# most recent assembly for this organism, we are going to leave it as-is.  An
# example region containing the overlap is: chr3R:6,999,990-7,000,010 .

    ssh kkstore04
    mkdir -p /san/sanvol1/scratch/dm2/chrom
    cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/
    # Split chrom fa into smaller windows for phastCons:
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz15way/phastCons
    mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.split
    cd /cluster/data/dm2/bed/multiz15way/phastCons/run.split
    set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/san/sanvol1/scratch/dm2/chrom
set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
   -O dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm2/bed/multiz15way/maf/*.maf)
      if (-s $file) then
        echo "doSplit.sh {check in exists+ $file}" >> jobList
      endif
    end
    para make jobList
    para time
#Completed: 13 of 13 jobs
#CPU time in finished jobs:       1075s      17.92m     0.30h    0.01d  0.000 y
#IO & Wait Time:                   101s       1.68m     0.03h    0.00d  0.000 y
#Average job time:                  90s       1.51m     0.03h    0.00d
#Longest finished job:             259s       4.32m     0.07h    0.00d
#Submission to last job:           270s       4.50m     0.07h    0.00d

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  
    # Use phyloFit to make an initial model:
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz15way/phastCons
    /cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
        --aggregate dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \
        -i MAF -o SS > all.ss
    /cluster/bin/phast/phyloFit all.ss \
      --tree "(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)" \
      -i SS --out-root starting-tree
    cat starting-tree.mod 
#ALPHABET: A C G T 
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -533323268.267133
#BACKGROUND: 0.285852 0.214455 0.214368 0.285325 
#RATE_MAT:
#  -0.924927    0.204278    0.451988    0.268661 
#   0.272287   -1.098668    0.225404    0.600978 
#   0.602709    0.225494   -1.100568    0.272364 
#   0.269158    0.451705    0.204631   -0.925493 
#TREE: (((((((((dm2:0.032478,(droSim1:0.017650,droSec1:0.015740):0.017736):0.026088,(droYak2:0.058116,droEre2:0.055952):0.031922):0.084953,droAna3:0.218988):0.051563,(dp4:0.013624,droPer1:0.015374):0.210705):0.046101,droWil1:0.292357):0.019351,((droVir3:0.109131,droMoj3:0.142623):0.047595,droGri2:0.154583):0.189999):0.113599,anoGam1:0.357817):0.092848,apiMel2:0.386309):0.169225,triCas2:0.169225);
# also get GC content from model -- if similar enough, no need to extract it 
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod 
#0.428823
    # OK, use .429 for --gc below.

    # Use the values of --target-coverage and --expected-lengths from the 
    # last iteration of the 9way run, 0.398 and 23.7.
    # Multiply each subst rate on the TREE line by 3.5 which is roughly the 
    # ratio of noncons to cons in 
    # /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
    /cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
      > starting-tree.noncons.mod
    /cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.8 \
      starting-tree{,.noncons}.mod
    # Uh-oh.... out of memory.  OK!  Well, proceed with params from before.

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
    cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
    # FIRST ITERATION: Use ../starting-tree.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.429 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 23.8 --target-coverage 0.393 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.429 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 23.8 --target-coverage 0.393 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    para make jobList
    para time
#Completed: 139 of 139 jobs
#CPU time in finished jobs:    1771638s   29527.30m   492.12h   20.51d  0.056 y
#IO & Wait Time:                  1154s      19.23m     0.32h    0.01d  0.000 y
#Average job time:               12754s     212.56m     3.54h    0.15d
#Longest finished job:           18435s     307.25m     5.12h    0.21d
#Submission to last job:         26561s     442.68m     7.38h    0.31d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kolossus
    cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: (((((((((dm2:0.017975,(droSim1:0.012862,droSec1:0.012586):0.011123):0.012087,(droYak2:0.031478,droEre2:0.032714):0.015973):0.033568,droAna3:0.107755):0.019847,(dp4:0.008814,droPer1:0.009578):0.104435):0.016369,droWil1:0.150471):0.005457,((droVir3:0.056362,droMoj3:0.072079):0.020331,droGri2:0.083731):0.092042):0.049021,anoGam1:0.218175):0.046658,apiMel2:0.187335):0.106621,triCas2:0.106621);
#ave.noncons.mod:TREE: (((((((((dm2:0.059235,(droSim1:0.039799,droSec1:0.039025):0.035598):0.041349,(droYak2:0.104304,droEre2:0.106715):0.054017):0.119983,droAna3:0.377089):0.071962,(dp4:0.028079,droPer1:0.031175):0.369228):0.060569,droWil1:0.536488):0.019784,((droVir3:0.195542,droMoj3:0.254453):0.073077,droGri2:0.290773):0.336972):0.180643,anoGam1:0.783559):0.174661,apiMel2:0.670401):0.382291,triCas2:0.382291);
    # SECOND ITERATION:

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # No consEntropy check because it runs out of mem on an 8G-mem small 
    # cluster node and has not completed in a day on 16G-mem kolossus.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  The we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.phast
    cd /cluster/data/dm2/bed/multiz15way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 23.8 --target-coverage 0.393 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para make jobList
    para time
#Completed: 139 of 139 jobs
#CPU time in finished jobs:       1388s      23.14m     0.39h    0.02d  0.000 y
#IO & Wait Time:                   402s       6.70m     0.11h    0.00d  0.000 y
#Average job time:                  13s       0.21m     0.00h    0.00d
#Longest finished job:              17s       0.28m     0.00h    0.00d
#Submission to last job:           170s       2.83m     0.05h    0.00d

    # back on kolossus:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/dm2/bed/multiz15way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    cd /cluster/data/dm2/bed/multiz15way/phastCons
    featureBits -enrichment dm2 flyBaseGene:cds all.bed
    # FIRST ITERATION: 68.9'll do!
#flyBaseGene:cds 16.649%, all.bed 37.421%, both 11.471%, cover 68.90%, enrich 1.84x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed dm2 phastConsElements15way all.bed

    # Create wiggle
    ssh pk
    mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.wib
    cd /cluster/data/dm2/bed/multiz15way/phastCons/run.wib
    rm -rf /san/sanvol1/scratch/dm2/phastCons/wib
    mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /san/sanvol1/scratch/dm2/phastCons/wib
zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para make jobList
    para time
#Completed: 13 of 13 jobs
#CPU time in finished jobs:         71s       1.18m     0.02h    0.00d  0.000 y
#IO & Wait Time:                    28s       0.47m     0.01h    0.00d  0.000 y
#Average job time:                   8s       0.13m     0.00h    0.00d
#Longest finished job:              17s       0.28m     0.00h    0.00d
#Submission to last job:           101s       1.68m     0.03h    0.00d

    # back on kkstore04, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from san/sanvol1
    cd /cluster/data/dm2/bed/multiz15way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /san/sanvol1/scratch/dm2/phastCons/wib .
    rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/dm2/multiz15way/wib
    cd /cluster/data/dm2/bed/multiz15way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz15way/wib/
    hgLoadWiggle dm2 phastCons15way \
      -pathPrefix=/gbdb/dm2/multiz15way/wib wib/*.wig
    rm wiggle.tab

    # and clean up san/sanvol1.
    rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
    rm -r /san/sanvol1/scratch/dm2/chrom
    # Offer raw scores for download since fly folks are likely to be interested:
    ssh kkstore04
    cd /cluster/data/dm2/bed/multiz15way/phastCons/POSTPROBS
    mkdir ../postprobsDownload
    foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
      zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
        > ../postprobsDownload/$chr.pp.gz
    end
    cd ../postprobsDownload
    md5sum *.gz > md5sum.txt
    # Make a README.txt there too.
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way
    cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way
    ln -s /cluster/data/dm2/bed/multiz15way/phastCons/postprobsDownload/* .

    # make a tree picture using phyloGif.
    cd /cluster/data/dm2/bed/multiz15way
    /usr/local/apache/cgi-bin/phyloGif -phyloGif_tree=tree-commas-orgNames.nh \
      -phyloGif_width=260 -phyloGif_height=480 > dm2_15way.gif
    cp -p dm2_15way.gif ~/browser/images/phylo/dm2_15way.gif
    # Check in browser/images/phylo/dm2_15way.gif.  In a clean ~/browser:
    cd ~/browser
    make alpha


#########################################################################
# ORegAnno Open Regulatory Annotation 		Belinda Giardine April 2007
# 
# Update data via a reload
# trackDb is at top level
# parser for dump from Obi Griffith is at PSU.
# cp data to UCSC and load
hgsql dm2
	truncate table oreganno;
	truncate table oregannoAttr;
	truncate table oregannoLink;

#load the tables
hgsql hgFixed
    load data local infile "oregannoAttr.dm2.txt" into table oregannoAttr;
    load data local infile "oregannoLink.dm2.txt" into table oregannoLink;
grep "^chr" oreganno.dm2.txt | sort -k1,1 -k2,2n > oreganno.bed
hgLoadBed dm2 oreganno oreganno.bed -noSort -oldTable -tab
rm oreganno.bed

#########################################################################
# FLYBASE 4.3 ANNOTATIONS (DONE (except for contacting flybase-help) 10/11/06 angie - UPDATED 4/6/07)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/flybase4.3
    cd /cluster/data/dm2/bed/flybase4.3
    foreach c (2L 2R 3L 3R 4 X)
      wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/gff/$c.gff
    end
    cat *.gff > flybase.gff3
    # What data sources are represented in this file?
    grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
    # excerpt (many other sources, including blastx:... , sim4:... and 
    # tblastx:...; also various other types for source "FlyBase" and "."):
  85778 FlyBase CDS
  64452 FlyBase exon
  14449 FlyBase gene
  19376 FlyBase mRNA
     66 FlyBase miRNA
    116 FlyBase ncRNA
     51 FlyBase pseudogene
     96 FlyBase rRNA
     46 FlyBase snRNA
     63 FlyBase snoRNA
    292 FlyBase tRNA
  36921 .     transcription_start_site
   6003 FlyBase transposable_element
  35734 FlyBase transposable_element_insertion_site
    # Notable differences since 4.2: "FlyBase" instead of "." for most of 
    # those, and now there seens to be more than one CDS per exon while it 
    # use to be ~1 CDS per gene.  Hmmmm....

    # What keywords are defined in the 9th field?  Watch out for stray ;'s
    # from html-escaped special characters...
    grep -v '^#' flybase.gff3 \
    | awk '{print $9;}' \
    | perl -wpe 's/&\w\w\w\w?;//g; s/=[^;]+;/\n/g; s/=.*$//;' \
    | sort | uniq -c
4250095 
  74483 Alias
  31137 Dbxref
2795760 ID
1432800 Name
    765 Ontology_term
 515484 Parent
1046375 Synonym
  17901 associated_with
  15773 cyto_range
  36666 derived_cyto_location
  96162 evidence
  15657 gbunit
  12101 programversion
  11837 putative_ortholog_of
  11790 to_name
 121126 to_species
    # Notable differences: synonym -> Synonym;  no more synonym_2nd or 
    # dbxref_2nd (or species); some new keywords but I think we can ignore.

    # Copy over 4.2 script and modify to fit the latest Immortal Unchanging
    # Widely Adopted Standard GFF3.
    cp ../flybase4.2/extractGenes.pl .
    extractGenes.pl flybase.gff3

    # Get predicted proteins (for main annotations only)
    wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/fasta/dmel-all-translation-r4.3.fasta
    # save some space.
    nice gzip ?{,?}.gff dmel-all-transl*
    # Translate the fancy headers (some of which already start with CG-P's 
    # that match the FBtr's -- but not all!) into CG-R's to match flyBaseGene:
    zcat dmel-all-translation-r4.3.fasta.gz \
    | perl -we 'open(X, "flyBase2004Xref.tab") || die; \
                while (<X>) { \
                  @w = split("\t"); \
                  $fbtr2cgr{$w[3]} = $w[0]; \
                } \
                while (<>) { \
                  if (/^>.*parent=(FBtr\d+)/) { \
                    $cgr = $fbtr2cgr{$1}; \
                    s/^>.*/>$cgr/ if ($cgr); \
                  } \
                  print; \
                } \
               ' \
      > flybasePep.fa

    ssh hgwdev
    cd /cluster/data/dm2/bed/flybase4.3
    # Protein-coding genes:
    ldHgGene -gtf dm2 flyBaseGene flyBase*.gtf
    hgPepPred dm2 generic flyBasePep flybasePep.fa
    # Noncoding genes:
    hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
    # Cross-referencing info for both coding and noncoding:
    hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
      flyBase2004Xref.tab
    # -- had to remove a couple duplicate lines in flyBase2004Xref.tab.  
    # CG4110-RA and CG13904-RA had multiple mappings, both with consecutive 
    # FBtr's so I tossed out the earlier FBtr for both.  

    # Do this next time around, to catch joinerCheck errors before 
    # running doHgNearBlastp with these proteins:
    runJoiner.csh dm2 flyBasePep

    # Some featureBits comparisons with refGene which shoudl be pretty much 
    # like version 4.0 with a few noncoding genes tossed in... but this is 
    # a bigger difference than we expect:
    featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
#604659 bases of 131698467 (0.459%) in intersection
    # Hmmm, flyBaseGene has nothing on the last 165,000bp of chrX!:
    # chrX:22,058,584-22,224,390
    # Here's anothre 176,000 gap in chrX:
    # chrX:21,836,109-22,012,685
    # nor the last 496,000bp of chr3L:
    # chr3L:23,275,448-23,771,897
    # nor the first 527,000bp of chr2R:
    # chr2R:1-527,716
    # No files for chr2h, chr3h, chr4h, chrXh, chrYh, chrU, so those are 
    # entirely devoid of FlyBase Genes... yikes.
#TODO: contact flybase-help about the lack of annotations in those ranges...

    # This shows only the dropped isoforms/duplicates (ignores noncoding):
    featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
       -bed=stdout
#chr3R   15620310        15621493        chr3R.1
#chr2L   4698094 4700940 chr2L.1
#chrX    3638700 3639895 chrX.1
#474282 bases of 131698467 (0.360%) in intersection

    # add upstream* downloadable files
    cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
    foreach size (1000 2000 5000)
      echo upstream$size
      nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
      | nice gzip -c > upstream$size.fa.gz
    end
    nice md5sum *.zip *.gz > md5sum.txt
    # Now go redo all the sections above for tables related to flyBase*.

    # 4/6/07: remove mitochondrial genes from flyBasePep because they 
    # are missing from flyBase2004Xref and caused joinerCheck errors.
    hgsql dm2 -e 'delete from flyBasePep where name like "mt:%"'


#########################################################################
# AFFYMETRIX TIMECOURSE (DONE 12/21/06 angie)
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/affyDrosDev
    cd /cluster/data/dm2/bed/affyDrosDev
    # Download files from Affy:
    wget -r --level=1 --directory-prefix=transfrags --no-directories \
      http://transcriptome.affymetrix.com/download/publication/dros_dev/transfrags
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      mkdir $chr
      pushd $chr
      foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
        wget --timestamping http://transcriptome.affymetrix.com/download/publication/dros_dev/graphs/$chr/Dro_Total_AS_${t}_C01.sig.gr
      end
      popd
    end

    # Transfrags -- Add chromosome to make bed3 (start is 0-based per Sujit):
    foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
      cp /dev/null affyDrosDevTransfrags$t.bed
      foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
        if ($chr == "chrM") continue
        awk '{print "'$chr'\t" $1 "\t" $2;}' \
          transfrags/$chr.Dro_Total_AS_${t}_C01 >> affyDrosDevTransfrags$t.bed
      end
    end
    # Transcription level -- Add wig format variableStep lines and 
    # agglomerate into per-timepoint .wig files:
    foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
      cp /dev/null affyDrosDevSignal$t.varStep.wig
      foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
        if ($chr == "chrM") continue
        echo 'variableStep chrom='$chr >> affyDrosDevSignal$t.varStep.wig
        cat $chr/Dro_Total_AS_${t}_C01.sig.gr >> affyDrosDevSignal$t.varStep.wig
      end
      wigEncode affyDrosDevSignal$t.varStep.wig affyDrosDevSignal$t.wi{g,b}
    end
#Converted affyDrosDevSignal1.varStep.wig, upper limit 1156.00, lower limit -860.00
#Converted affyDrosDevSignal2.varStep.wig, upper limit 1499.75, lower limit -580.50
#Converted affyDrosDevSignal3.varStep.wig, upper limit 1627.50, lower limit -586.00
#Converted affyDrosDevSignal4.varStep.wig, upper limit 1794.50, lower limit -718.50
#Converted affyDrosDevSignal5.varStep.wig, upper limit 2277.25, lower limit -913.00
#Converted affyDrosDevSignal6.varStep.wig, upper limit 2095.25, lower limit -864.00
#Converted affyDrosDevSignal7.varStep.wig, upper limit 1071.75, lower limit -560.50
#Converted affyDrosDevSignal8.varStep.wig, upper limit 1182.50, lower limit -1019.00
#Converted affyDrosDevSignal9.varStep.wig, upper limit 2028.75, lower limit -942.50
#Converted affyDrosDevSignal10.varStep.wig, upper limit 1212.00, lower limit -779.00
#Converted affyDrosDevSignal11.varStep.wig, upper limit 1456.75, lower limit -454.50
#Converted affyDrosDevSignal12.varStep.wig, upper limit 1203.50, lower limit -459.50
    # --> trackDb setting: type wig -1019.0 2277.25

    # Load tables (--> composite tracks)
    ssh hgwdev
    cd /cluster/data/dm2/bed/affyDrosDev
    mkdir /gbdb/dm2/affyDrosDev
    ln -s /cluster/data/dm2/bed/affyDrosDev/affyDrosDevSignal*.wib \
      /gbdb/dm2/affyDrosDev
    foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
      hgLoadBed dm2 affyDrosDevTransfrags$t affyDrosDevTransfrags$t.bed
      hgLoadWiggle dm2 affyDrosDevSignal$t \
        -pathPrefix=/gbdb/dm2/affyDrosDev affyDrosDevSignal$t.wig
    end
    rm bed.tab wiggle.tab

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE,  2007-01-26, braney)
    ssh kkstore04
    bash # if not using bash shell already
    # use blastDb database made for hg17 protein track

    mkdir -p  /san/sanvol1/scratch/dm2/blastDb; 
    cd /cluster/data/dm2/blastDb
    for i in nhr nin nsq; 
    do 
	cp *.$i /san/sanvol1/scratch/dm2/blastDb; 
    done

   mkdir /cluster/data/dm2/bed/tblastn.hg18KG
   cd  /cluster/data/dm2/bed/tblastn.hg18KG
   echo  /san/sanvol1/scratch/dm2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
   wc -l query.lst
# 288 query.lst

   # we want around 250000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(250000/288) = 42.309504

   mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa
   split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/dm2/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2  
	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3  
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    
    # then run the Blast cluster jobs
    ssh pk
    cd /cluster/data/dm2/bed/tblastn.hg18KG
    para create blastSpec
    para try, check, push, check etc.
    para time

# Completed: 176544 of 176544 jobs
# CPU time in finished jobs:    1614468s   26907.81m   448.46h   18.69d  0.051 y
# IO & Wait Time:                589357s    9822.61m   163.71h    6.82d  0.019 y
# Average job time:                  12s       0.21m     0.00h    0.00d
# Longest finished job:              73s       1.22m     0.02h    0.00d
# Submission to last job:         58680s     978.00m    16.30h    0.68d

    ssh kkstore03
    cd /cluster/data/dm2/bed/tblastn.hg18KG
    tcsh
    mkdir chainRun
    cd chainRun
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS \
      /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/dm2/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para try, check, push, check etc.

# Completed: 613 of 613 jobs
# CPU time in finished jobs:       6560s     109.33m     1.82h    0.08d  0.000 y
# IO & Wait Time:                 31904s     531.74m     8.86h    0.37d  0.001 y
# Average job time:                  63s       1.05m     0.02h    0.00d
# Longest finished job:             176s       2.93m     0.05h    0.00d
# Submission to last job:          1338s      22.30m     0.37h    0.02d

    ssh kkstore03
    cd /cluster/data/dm2/bed/tblastn.hg18KG/blastOut
    bash # if using another shell
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl
    pslCheck /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl
    # this is ok.
    # load table 
    ssh hgwdev
    cd /cluster/data/dm2/bed/tblastn.hg18KG
    hgLoadPsl dm2 blastHg18KG.psl
    # check coverage
    featureBits dm2 blastHg18KG
# 5946352 bases of 131698467 (4.515%) in intersection

    featureBits dm2 refGene:cds blastHg18KG -enrichment
# refGene:cds 17.097%, blastHg17KG 4.427%, both 4.246%, cover 24.83%, enrich 5.61x

    ssh kkstore04
    rm -rf /cluster/data/dm2/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut

#end human proteins
#################################################################


#########################################################################
# CHROMOSOME BANDS (DONE 6/19/07 angie)
    ssh hgwdev
    cd /cluster/data/dm2/bed/flybase4.3
    grep chromosome_band flybase.gff3 \
    | perl -wpe 'chomp; @w = split; \
                 $w[3]--; $w[3] = 0 if ($w[3] < 0); \
                 $w[8] =~ s/^.*ID=band-(\w+);$/$1/; \
                 if ($w[4] <= 0) { s/^.*$//; } else { \
                 s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\t\n/; }' \
      > cytoBand.txt
    # Did a bit of manual cleanup... there are some rows that have negative
    # starts that should clearly have negative ends, but for some reason 
    # their end coords have been substituted with 10, 20, 30, 40 --
    # I just deleted those 12 rows.
    hgLoadSqlTab dm2 cytoBand ~/kent/src/hg/lib/cytoBand.sql cytoBand.txt
    # Drop or modify the items that are off the end of their chroms:
    awk '{print "delete from dm2.cytoBand where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBand set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \
      /cluster/data/dm2/chrom.sizes \
    | hgsql dm2
    checkTableCoords -verbose=2 dm2 cytoBand

    # Normally we would just create cytoBandIdeo by select * from cytoBand --
    # but that gives too high of a resolution for any bands to be visible!  
    # So make a boiled-down cytoBandIdeo that just has {number, letter} 
    # instead of {number, letter, number}.
    perl -we 'while (<>) { \
                chomp; @w=split(" "); \
                if ($w[3] =~ /(\d+[A-Z]+)\d+/) { \
                  $b = $1; \
                } else { die "doh!" } \
                if (! defined $lastB) { \
                  $start = $w[1]; \
                } elsif ($lastB ne $b) { \
                  print "$chrom\t$start\t$end\t$lastB\t\n"; \
                  $start = $w[1]; \
                } \
                ($chrom, $end, $lastB) = ($w[0], $w[2], $b); \
              } \
              print "$chrom\t$start\t$end\t$lastB\t\n";' cytoBand.txt \
      > cytoBandIdeo.txt
    hgLoadSqlTab dm2 cytoBandIdeo ~/kent/src/hg/lib/cytoBandIdeo.sql \
      cytoBandIdeo.txt
    # Drop or modify the items that are off the end of their chroms:
    awk '{print "delete from dm2.cytoBandIdeo where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBandIdeo set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \
      /cluster/data/dm2/chrom.sizes \
    | hgsql dm2
    checkTableCoords -verbose=2 dm2 cytoBandIdeo



#########################################################################
# EvoFold track (done, 10.02.2007, Jakob Skou Pedersen)
#
# EvoFold predictions were made based on the drosophila subset (12
# species) of the 15-way multiz insect alignment. The data generation
# process will change in the future and is therefore not described in
# detail here. Instead we start with the final set of predictions
# supplied by me.

  ssh hgwdev
  mkdir -p /cluster/data/dm2/bed/evofold
  cd /cluster/data/dm2/bed/evofold
  cp /cluster/home/jsp/projects/flies/finalDataSets/finalPredAll.bed foldsDm2.bed
  # The folds_hg17.bed is a 9-column bed file: column 1-6 provide
  # standard bed information, column 7 is element length, column 8 is
  # the RNA secondary structure in parentheses format, and column nine
  # is a commaseparated list of position specific confidence scores
  # (floats).
  hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql dm2 evofold foldsDm2.bed


#########################################################################
# BDTNP CHIP-CHIP (DONE 4/9/08 angie)
# New data added 2/2/09 and 8/25/10, see next sections.
    # Berkeley Drosophila Transcription Network Project's ChIP-Chip data,
    # from Mark Biggin (MDBiggin at lbl dot gov).
    ssh kkstore04
    mkdir /cluster/data/dm2/bed/bdtnpChipper
    cd /cluster/data/dm2/bed/bdtnpChipper
    wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_1FDR.wig.bz2
    wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_25FDR.wig.bz2
    # The files contain concatenated custom tracks; split those out into
    # one .wig file per track, with UCSC table names:
    bzcat all_public* | perl -we \
      'my $fh; \
       while (<>) { \
         if (/^track .* name="([\w-]+)"/) { \
           close($fh) if ($fh); \
           my $name = $1; \
           $name =~ s/([a-zI]+)_(H14)?(\d+)?_\d+_675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$4/ \
             || die "parse:\n$name\n\t"; \
           open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \
         } else { \
           die "data before track line?  at input line $.:\n$_" if (! $fh); \
           print $fh $_; \
         } \
       }'
    # These two warnings are for the polII filenames, no problem:
#Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 714856.
#Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 3415135.

    mkdir wigEncoded
    cd wigEncoded
    foreach f (../*Fdr*.wig)
      wigEncode $f $f:t:r.{wig,wib}
    end
#Converted ../bdtnpBcd1Fdr1.wig, upper limit 8.14, lower limit 0.80
#Converted ../bdtnpBcd1Fdr25.wig, upper limit 8.14, lower limit 0.80
#Converted ../bdtnpBcd2Fdr1.wig, upper limit 10.74, lower limit 0.68
#Converted ../bdtnpBcd2Fdr25.wig, upper limit 10.74, lower limit 0.68
#Converted ../bdtnpCad1Fdr1.wig, upper limit 16.20, lower limit 1.06
#Converted ../bdtnpCad1Fdr25.wig, upper limit 16.20, lower limit 0.48
#Converted ../bdtnpGt2Fdr1.wig, upper limit 22.22, lower limit 1.21
#Converted ../bdtnpGt2Fdr25.wig, upper limit 22.22, lower limit 0.49
#Converted ../bdtnpHb1Fdr1.wig, upper limit 14.38, lower limit 0.92
#Converted ../bdtnpHb1Fdr25.wig, upper limit 14.38, lower limit 0.86
#Converted ../bdtnpHb2Fdr1.wig, upper limit 24.25, lower limit 0.88
#Converted ../bdtnpHb2Fdr25.wig, upper limit 24.25, lower limit 0.78
#Converted ../bdtnpKni1Fdr1.wig, upper limit 8.08, lower limit 1.78
#Converted ../bdtnpKni1Fdr25.wig, upper limit 8.08, lower limit 1.01
#Converted ../bdtnpKni2Fdr1.wig, upper limit 8.44, lower limit 1.07
#Converted ../bdtnpKni2Fdr25.wig, upper limit 8.44, lower limit 0.76
#Converted ../bdtnpKr1Fdr1.wig, upper limit 14.47, lower limit 0.68
#Converted ../bdtnpKr1Fdr25.wig, upper limit 14.47, lower limit 0.68
#Converted ../bdtnpKr2Fdr1.wig, upper limit 12.92, lower limit 0.66
#Converted ../bdtnpKr2Fdr25.wig, upper limit 12.92, lower limit 0.66
#Converted ../bdtnpPolIIFdr1.wig, upper limit 30.86, lower limit 0.93
#Converted ../bdtnpPolIIFdr25.wig, upper limit 30.86, lower limit 0.70
#Converted ../bdtnpZ2Fdr1.wig, upper limit 11.49, lower limit 0.95
#Converted ../bdtnpZ2Fdr25.wig, upper limit 11.49, lower limit 0.79

    ssh hgwdev
    mkdir /gbdb/dm2/bdtnp
    ln -s /cluster/data/dm2/bed/bdtnpChipper/wigEncoded/*.wib /gbdb/dm2/bdtnp/
    cd /cluster/data/dm2/bed/bdtnpChipper/wigEncoded
    foreach f (*.wig)
      hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:r $f
    end
    rm wiggle.tab

    # Print out template for 24 subtrack defs (longLabel will need editing):
    set pri = 1
    foreach f (*.wig)
      echo "    track $f:r"
      echo "    subTrack bdtnpChipper"
      set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'`
      set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'`
      set ab  = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d)?F.*/$1/ || die;'`
      set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'`
      echo "    subGroups fdr=$fdr factor=$FAC"
      echo "    shortLabel $fac AB $ab FDR $fdr%"
      echo "    longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%"
      if ($fdr == 1) then
        echo "    color 50,50,200"
      else
        echo "    color 25,150,25"
      endif
      echo "    priority $pri"
      echo ""
      set pri = `expr $pri + 1`
    end


#########################################################################
# BDTNP CHIP-CHIP (DONE 2/2/09 angie)
    ssh hgwdev
    cd /hive/data/genomes/dm2/bed/bdtnpChipper
    wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_1FDR.wig.bz2
    wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_25FDR.wig.bz2
    # The files contain concatenated custom tracks; split those out into
    # one .wig file per track, with UCSC table names:
    bzcat BDTNP_all_new* | perl -we \
      'my $fh; \
       while (<>) { \
         if (/^track .* name="([\w-\.]+)"/) { \
           close($fh) if ($fh); \
           my $name = $1; \
           $name =~ s/_FQ_/_1_/;  $name =~ s/_BQ_/_2_/; \
           $name =~ s/([a-zA-Z0-9]+)_(H14)?(\d+)?_\d+_(FactorResults\.)?675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$5/ \
             || die "parse:\n$name\n\t"; \
           print "$name\n"; \
           open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \
         } elsif (/^track /) { \
           die "missed a track line:\n$_\n\t"; \
         } else { \
           die "data before track line?  at input line $.:\n$_" if (! $fh); \
           print $fh $_; \
         } \
       }'
    cd wigEncoded
    foreach f (../*Fdr*.wig)
      if (! -e $f:t:r.wig) then
        wigEncode $f $f:t:r.{wig,wib}
      endif
    end
#Converted ../bdtnpD1Fdr1.wig, upper limit 20.33, lower limit 0.51
#Converted ../bdtnpD1Fdr25.wig, upper limit 20.33, lower limit 0.51
#Converted ../bdtnpDa2Fdr1.wig, upper limit 26.09, lower limit 0.79
#Converted ../bdtnpDa2Fdr25.wig, upper limit 26.09, lower limit 0.49
#Converted ../bdtnpDl3Fdr1.wig, upper limit 16.97, lower limit 0.65
#Converted ../bdtnpDl3Fdr25.wig, upper limit 16.97, lower limit 0.62
#Converted ../bdtnpFtz3Fdr1.wig, upper limit 16.54, lower limit 1.07
#Converted ../bdtnpFtz3Fdr25.wig, upper limit 16.54, lower limit 0.84
#Converted ../bdtnpH1Fdr1.wig, upper limit 18.52, lower limit 0.68
#Converted ../bdtnpH1Fdr25.wig, upper limit 18.52, lower limit 0.68
#Converted ../bdtnpH2Fdr1.wig, upper limit 13.23, lower limit 0.68
#Converted ../bdtnpH2Fdr25.wig, upper limit 13.23, lower limit 0.65
#Converted ../bdtnpHkb1Fdr1.wig, upper limit 11.84, lower limit 1.03
#Converted ../bdtnpHkb1Fdr25.wig, upper limit 11.84, lower limit 0.35
#Converted ../bdtnpHkb2Fdr1.wig, upper limit 10.73, lower limit 0.87
#Converted ../bdtnpHkb2Fdr25.wig, upper limit 10.73, lower limit 0.33
#Converted ../bdtnpHkb3Fdr1.wig, upper limit 10.50, lower limit 0.90
#Converted ../bdtnpHkb3Fdr25.wig, upper limit 10.50, lower limit 0.37
#Converted ../bdtnpMad2Fdr1.wig, upper limit 12.58, lower limit 0.67
#Converted ../bdtnpMad2Fdr25.wig, upper limit 12.58, lower limit 0.67
#Converted ../bdtnpMed2Fdr1.wig, upper limit 12.58, lower limit 0.67
#Converted ../bdtnpMed2Fdr25.wig, upper limit 12.58, lower limit 0.67
#Converted ../bdtnpPrd1Fdr1.wig, upper limit 13.97, lower limit 0.78
#Converted ../bdtnpPrd1Fdr25.wig, upper limit 13.97, lower limit 0.78
#Converted ../bdtnpPrd2Fdr1.wig, upper limit 12.00, lower limit 0.84
#Converted ../bdtnpPrd2Fdr25.wig, upper limit 12.00, lower limit 0.84
#Converted ../bdtnpRun1Fdr1.wig, upper limit 10.02, lower limit 0.60
#Converted ../bdtnpRun1Fdr25.wig, upper limit 10.02, lower limit 0.60
#Converted ../bdtnpRun2Fdr1.wig, upper limit 7.58, lower limit 1.16
#Converted ../bdtnpRun2Fdr25.wig, upper limit 7.58, lower limit 0.58
#Converted ../bdtnpShn2Fdr1.wig, upper limit 12.62, lower limit 1.19
#Converted ../bdtnpShn2Fdr25.wig, upper limit 12.62, lower limit 0.81
#Converted ../bdtnpShn3Fdr1.wig, upper limit 10.28, lower limit 1.09
#Converted ../bdtnpShn3Fdr25.wig, upper limit 10.28, lower limit 0.95
#Converted ../bdtnpSlp11Fdr1.wig, upper limit 40.39, lower limit 1.06
#Converted ../bdtnpSlp11Fdr25.wig, upper limit 40.39, lower limit 0.62
#Converted ../bdtnpSna1Fdr1.wig, upper limit 8.41, lower limit 0.97
#Converted ../bdtnpSna1Fdr25.wig, upper limit 8.41, lower limit 0.77
#Converted ../bdtnpSna2Fdr1.wig, upper limit 14.30, lower limit 0.57
#Converted ../bdtnpSna2Fdr25.wig, upper limit 14.30, lower limit 0.57
#Converted ../bdtnpTFIIB1Fdr1.wig, upper limit 21.81, lower limit 0.59
#Converted ../bdtnpTFIIB1Fdr25.wig, upper limit 21.81, lower limit 0.59
#Converted ../bdtnpTll1Fdr1.wig, upper limit 16.66, lower limit 0.98
#Converted ../bdtnpTll1Fdr25.wig, upper limit 16.66, lower limit 0.57
#Converted ../bdtnpTwi1Fdr1.wig, upper limit 26.11, lower limit 0.77
#Converted ../bdtnpTwi1Fdr25.wig, upper limit 26.11, lower limit 0.68
#Converted ../bdtnpTwi2Fdr1.wig, upper limit 26.66, lower limit 0.79
#Converted ../bdtnpTwi2Fdr25.wig, upper limit 26.66, lower limit 0.55

    foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/wigEncoded/*.wib)
      if (! -l /gbdb/dm2/bdtnp/$f:t) then
        ln -s $f /gbdb/dm2/bdtnp/
        hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:t:r $f:t:r.wig
      endif
    end
    rm wiggle.tab

    # Print out template for new subtrack defs (longLabel will need editing):
    set pri = 25
    foreach f (`ls -l *.wig | grep Feb | awk '{print $9;}'`)
      echo "    track $f:r"
      echo "    subTrack bdtnpChipper off"
      set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'`
      set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'`
      set ab  = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d+)?Fdr.*/$1/ || die;'`
      set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'`
      echo "    subGroups fdr=$fdr factor=$FAC"
      echo "    shortLabel $fac AB $ab FDR $fdr%"
      echo "    longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%"
      if ($fdr == 1) then
        echo "    color 50,50,200"
      else
        echo "    color 25,150,25"
      endif
      echo "    priority $pri"
      echo ""
      set pri = `expr $pri + 1`
    end


#########################################################################
# BDTNP CHIP-CHIP UPDATE (DONE 9/14/10 angie)
    mkdir /hive/data/genomes/dm2/bed/bdtnpChipper/100819
    cd /hive/data/genomes/dm2/bed/bdtnpChipper/100819
    # Unpacked chip-chip and DNase accessible regions files emailed by 
    # Mark Biggins Aug. 9th, 13th, 16th and Sep. 13, 2010.
    # Download some additional DNase accessibility files:
    alias wg 'wget --timestamping --no-check-certificate --user=biggin --password=xxxx'
    # NOTE: these files were replaced 8/30, see below...
    wg https://data.htseq.org/biggin/wig/s5_rep9481.wig
    wg https://data.htseq.org/biggin/wig/s5_rep9482.wig
    wg https://data.htseq.org/biggin/wig/s9_rep9127.wig
    wg https://data.htseq.org/biggin/wig/s9_rep9128.wig
    wg https://data.htseq.org/biggin/wig/s10_rep8816.wig
    wg https://data.htseq.org/biggin/wig/s10_rep8820.wig
    wg https://data.htseq.org/biggin/wig/s11_rep9485.wig
    wg https://data.htseq.org/biggin/wig/s11_rep9486.wig
    wg https://data.htseq.org/biggin/wig/s14_rep9477.wig
    wg https://data.htseq.org/biggin/wig/s14_rep9478.wig

    # Accessibility levels:
    foreach f (s*_rep*.wig)
      set t = `echo $f | perl -wpe 's/^s(\d+)_rep(\d+).wig$/bdtnpDnaseS{$1}R$2/'`
      echo $f --\> $t
      tail -n +1 $f \
      | sed -e 's/step=65/step=20/' \
      | wigToBigWig -clip stdin /hive/data/genomes/dm2/chrom.sizes $t.bw
    end
    # All files have a few items off the end of some chroms:
#line 2243431 of stdin: chromosome chr2h has 1694122 bases, but item ends at 1694124
#line 3432024 of stdin: chromosome chr3L has 23771897 bases, but item ends at 23771904
#line 4975059 of stdin: chromosome chr3h has 2955737 bases, but item ends at 2955744
#line 5039139 of stdin: chromosome chr4 has 1281640 bases, but item ends at 1281644
    foreach f (bdtnpDnase*.bw)
      ln -s `pwd`/$f /gbdb/dm2/bdtnp/$f
      hgBbiDbLink dm2 $f:r /gbdb/dm2/bdtnp/$f
    end
    # Accessible regions (updated stage 9 9/14/10 with corrected data):
    foreach f (5%*.txt)
      mac2unix "$f"
      set t = `echo "$f" | perl -wpe 's/^.*stage (\d+).txt/bdtnpDnaseAccS$1/'`
      echo $f --\> $t
      tail -n +3 "$f" \
      | awk '{print $1, ($2-1), $3;}' \
      | hgLoadBed dm2 $t stdin
    end
#5% FDR regions stage 10.txt --> bdtnpDnaseAccS10
#Loaded 23766 elements of size 3
#5% FDR regions stage 11.txt --> bdtnpDnaseAccS11
#Loaded 19743 elements of size 3
#5% FDR regions stage 14.txt --> bdtnpDnaseAccS14
#Loaded 23065 elements of size 3
#5% FDR regions stage 5.txt --> bdtnpDnaseAccS5
#Loaded 22694 elements of size 3
#5% FDR regions stage 9.txt --> bdtnpDnaseAccS9
#Loaded 16217 elements of size 3
    # New ChIP/chip data:
    foreach f ({hb,med}*.wig)
      set t = `echo $f | perl -wpe 'm/^([a-z]+)_(\d+)(S\d+).*-(\d+).wig$/ || die; \
                                    ($fac, $ab, $s, $fdr) = ($1, $2, $3, $4); \
                                    $fac = ucfirst($fac); \
                                    $_ = "bdtnp$fac$ab${s}Fdr$fdr"'`
      echo $f --\> $t
      wigToBigWig $f /hive/data/genomes/dm2/chrom.sizes $t.bw
    end
    foreach f (bdtnp[HM]*.bw)
      echo $f:t
      ln -s `pwd`/$f /gbdb/dm2/bdtnp/$f
      hgBbiDbLink dm2 $f:r /gbdb/dm2/bdtnp/$f
    end

    # 8/30/10: new DNase data (normalized; password in email 8/27):
    alias wg 'wget --timestamping --no-check-certificate --user=encode --password=xxxx'
    wg http://www.uwencode.org/~sthomas/data/biggin/wiggle.tar.gz
    tar xvzf wiggle.tar.gz
    foreach f (wiggle/p5_S*_DS*.wig)
      set t = `echo $f | perl -wpe 's/^wiggle\/p5_(S\d+)_DS(\d+).wig$/bdtnpDnase{$1}R$2/'`
      echo $f --\> $t
      wigToBigWig -clip $f /hive/data/genomes/dm2/chrom.sizes $t.bw
    end
    # again, all files have a few items that extend past end of chrom, NBD:
#line 2243431 of wiggle/p5_S9_DS9128.wig: chromosome chr2h has 1694122 bases, but item ends at 1694124
#line 3432024 of wiggle/p5_S9_DS9128.wig: chromosome chr3L has 23771897 bases, but item ends at 23771904
#line 4975059 of wiggle/p5_S9_DS9128.wig: chromosome chr3h has 2955737 bases, but item ends at 2955744
#line 5039139 of wiggle/p5_S9_DS9128.wig: chromosome chr4 has 1281640 bases, but item ends at 1281644
    # No need to re-run hgBbiDbLink -- filenames are same.
    # Get new max for trackDb bigWig specs:
    foreach f ( bdtnpDnaseS*.bw )
      echo $f
      bigWigInfo $f | egrep '^(min|max)'
      echo ""
    end


#########################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
dm2.upstreamGeneTbl = refGene
dm2.upstreamMaf = multiz15way /hive/data/genomes/dm2/bed/multiz15way/species.lst
