# for emacs: -*- mode: sh; -*-


# Drosophila yakuba -- 
# 
# WUSTL's 
# http://www.genome.wustl.edu/???
#


# DOWNLOAD SEQUENCE (DONE 5/21/04 angie)
    ssh kksilo
    mkdir /cluster/store6/droYak1
    cd /cluster/data
    ln -s /cluster/store6/droYak1 droYak1
    cd /cluster/data/droYak1
    mkdir downloads
    cd downloads
    wget ftp://genome.wustl.edu/private/lhillier/old/dy040407/agp_040407.gz
    wget ftp://genome.wustl.edu/private/lhillier/old/dy040407/fa_040407.gz
    tar xvzf agp_040407.gz
    tar xvzf fa_040407.gz
    foreach c (2L 2R 2h 3L 3R 3h 4 U X Xh Yh)
      mkdir ../$c
      if (-e chr$c.fa) then
        mv chr$c.fa ../$c
        mv chr$c.agp ../$c
      endif
      if (-e chr${c}_random.fa) then
        mv chr${c}_random.fa ../$c
        mv chr${c}_random.agp ../$c
      endif
    end
    # download mitochondrion sequence
    mkdir M
    cd M
    # go to http://www.ncbi.nih.gov/ and search Genome for 
    # "yakuba mitochondrion".  That shows the gi number:
    # 5834829
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834829&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the 
    # Drosophila yakuba mitochondrion complete genome, and then replace the 
    # header line with just ">chrM".
    cd ..
    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    foreach c (?{,?})
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo checking consistency of $agp and $fa
          checkAgpAndFa $agp $fa | tail -1
        endif
      end
    end
    faSize */chr*.fa
#179076649 bases (9662478 N's 169414171 real) in 18 sequences in 18 files
#Total size: mean 9948702.7 sd 13848622.8 min 4082 (chr4_random) max 45280584 (chrU) median 1395135
#N count: mean 536804.3 sd 1230439.2
    # see what kind of gap types we have:
    awk '$5 == "N" {print $7;}' */chr*.agp | uniq | sort | uniq
#contig
#fragment
    # wow, biggest gap size is 1800!
    awk '$5 == "N" {print $6;}' */chr*.agp | sort -nr | head -1
    # rough hist of gap sizes from AGP:
    awk '$5 == "N" {print $6;}' */chr*.agp | textHistogram stdin -binSize=100
    # fancy hist of gap size from FASTA:
    cat */chr*.fa | faGapSizes -niceSizes=10,50,100,200,300,700,1000,1800 stdin



# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 5/21/04 angie)
    ssh kksilo
    cd /cluster/data/droYak1
    foreach c (?{,?})
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end
    # splitFaIntoContigs makes new dirs for _randoms.  Move their contents 
    # back into the main chrom dirs and get rid of the _random dirs.
    foreach d (*_random)
      set base = `echo $d | sed -e 's/_random$//'`
      mkdir -p $base/lift
      mv $d/lift/oOut.lst $base/lift/rOut.lst
      mv $d/lift/ordered.lft $base/lift/random.lft
      mv $d/lift/ordered.lst $base/lift/random.lst
      rmdir $d/lift
      mv $d/* $base
      rmdir $d
    end
    # Make a "pseudo-contig" for processing chrM too:
    mkdir M/chrM_1
    sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
    mkdir M/lift
    echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
    echo "chrM_1" > M/lift/ordered.lst
    # 7/13/04: Oops, first time around, used the wrong hardcoded value!
    # So liftAll.lft had too big of an upper limit for chrM until now:
    set msize = `faSize M/chrM.fa | awk '{print $1;}'`
    echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 5/21/04 angie)
    # This used to hold scripts -- better to keep them inline here so 
    # they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/droYak1/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/droYak1/bed


# CREATING DATABASE (DONE 5/21/04 angie)
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
    hgsql -e 'create database droYak1'


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2/23/04 angie)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
      | hgsql droYak1


# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 5/21/04 angie)
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits; 
    # can also load up other tables that don't depend on masking.  
    ssh kksilo
    cd /cluster/data/droYak1
    mkdir nib
    foreach c (?{,?})
      foreach f ($c/chr${c}{,_random}.fa)
        if (-e $f) then
          echo "nibbing $f"
          /cluster/bin/i386/faToNib $f nib/$f:t:r.nib
        endif
      end
    end

    # Make symbolic links from /gbdb/droYak1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/droYak1/nib
    foreach f (/cluster/data/droYak1/nib/chr*.nib)
      ln -s $f /gbdb/droYak1/nib
    end
    # Load /gbdb/droYak1/nib paths into database and save size info.
    cd /cluster/data/droYak1
    hgsql droYak1  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib droYak1 /gbdb/droYak1/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N droYak1 > chrom.sizes
    # take a look at chrom.sizes, should be 18 lines
    wc chrom.sizes


# REPEATMASKER (DONE 5/21/04 angie)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kksilo
    cd /cluster/data/droYak1
    foreach c (?{,?})
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
          -minGapSize=100
        cd ../..
      end
    end

    #- Make the run directory and job list:
    cd /cluster/data/droYak1
    cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/droYak1/$2
/bin/cp $2 /tmp/droYak1/$2/
cd /tmp/droYak1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec drosophila $2
popd
/bin/cp /tmp/droYak1/$2/$2.out ./
if (-e /tmp/droYak1/$2/$2.align) /bin/cp /tmp/droYak1/$2/$2.align ./
if (-e /tmp/droYak1/$2/$2.tbl) /bin/cp /tmp/droYak1/$2/$2.tbl ./
if (-e /tmp/droYak1/$2/$2.cat) /bin/cp /tmp/droYak1/$2/$2.cat ./
/bin/rm -fr /tmp/droYak1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/droYak1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/droYak1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDrosophila
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (?{,?})
      foreach d ($c/chr${c}{,_random}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/droYak1/jkStuff/RMDrosophila \
                 /cluster/data/droYak1/$d $f \
               '{'check out line+ /cluster/data/droYak1/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh kk9
    cd /cluster/data/droYak1/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 452 of 452 jobs
#Average job time:                2607s      43.44m     0.72h    0.03d
#Longest job:                     4138s      68.97m     1.15h    0.05d
#Submission to last job:         14079s     234.65m     3.91h    0.16d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/droYak1
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (?{,?})
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
        > /dev/null
      endif
      if (-e lift/random.lft && ! -z lift/random.lft) then
        liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
        > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/droYak1
    hgLoadOut droYak1 */chr*.fa.out


# VERIFY REPEATMASKER RESULTS (DONE 5/21/04 angie)
    # Eyeball some repeat annotations in the browser, compare to lib seqs.
    # Run featureBits on droYak1 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits droYak1 rmsk
#29021326 bases of 169423277 (17.129%) in intersection
    # compare to dm1:
    featureBits dm1 rmsk
#15452875 bases of 126527731 (12.213%) in intersection
    # wow, new coverage is a lot higher though lib doesn't seem to have 
    # grown so much.  running by LaDeana & Jim.


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 5/21/04 angie)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("droYak1", "Apr. 2004", "/gbdb/droYak1/nib", "D.yakuba", \
               "chr2L:827700-845800", 1, 57, "D.yakuba", \
                "Drosophila yakuba", "/gbdb/droYak1/html/description.html", \
                0, 0, "WUSTL version 1.0");'
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO defaultDb (genome, name) values ("D.yakuba", "droYak1");'

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add droYak1 in all the right places and do
    make update

    mkdir /gbdb/droYak1/html
    # go public on genome-test
    #make alpha
    cvs commit makefile

    # Add trackDb directories
    mkdir drosophila/droYak1
    cvs add drosophila/droYak1
    cvs commit drosophila/droYak1


# MAKE LIFTALL.LFT (DONE 5/21/04 angie)
    # Redone 7/13/04: chrM/lift/ordered.lft was regenerated w/correct size.
    ssh kksilo
    cd /cluster/data/droYak1
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft


# SIMPLE REPEATS (TRF) (DONE 5/21/04 angie)
    ssh kksilo
    mkdir /cluster/data/droYak1/bed/simpleRepeat
    cd /cluster/data/droYak1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/droYak1/?{,?}/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
      end
    end
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \
      trf/*.bed > /dev/null

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed droYak1 simpleRepeat \
      /cluster/data/droYak1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 5/21/04 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/droYak1/bed/simpleRepeat
    mkdir trfMask
    foreach f (trf/*.bed)
      echo -n "filtering $f... "
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords:
    mkdir trfMaskChrom
    foreach f (../../?{,?}/chr*.fa)
      set c = $f:t:r
      liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \
        trfMask/${c}_[0-9]*.bed > /dev/null
    end


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 5/21/04 angie)
    ssh kksilo
    cd /cluster/data/droYak1
    # Soft-mask (lower-case) the contig and chr .fa's, 
    # then make hard-masked versions from the soft-masked.  
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
# This warning is rare (but less so this year, hmm)
# -- if it indicates a problem, it's only 
# with the repeat annotation and doesn't affect our masking.
#WARNING: negative rEnd: -1937 chrU:5728141-5728199 SAR_DM
#WARNING: negative rEnd: -1555 chrU:5728312-5728374 SAR_DM
#WARNING: negative rEnd: -2302 chrU:38943439-38943488 SAR_DM
#WARNING: negative rEnd: -2121 chrU:38943621-38943652 SAR_DM
    foreach c (?{,?})
      echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
# same deal here:
#WARNING: negative rEnd: -1937 chrU_2:725794-725852 SAR_DM
#WARNING: negative rEnd: -1555 chrU_2:725965-726027 SAR_DM
#WARNING: negative rEnd: -2302 chrU_8:3896563-3896612 SAR_DM
#WARNING: negative rEnd: -2121 chrU_8:3896745-3896776 SAR_DM
    #- Rebuild the nib files, using the soft masking in the fa:
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end
    # Make one big 2bit file as well, and make a link to it in 
    # /gbdb/droYak1/nib because hgBlat looks there:
    faToTwoBit */chr*.fa droYak1.2bit
    ssh hgwdev
    ln -s /cluster/data/droYak1/droYak1.2bit /gbdb/droYak1/nib/


# GOLD AND GAP TRACKS (DONE 5/21/04 angie)
    ssh hgwdev
    cd /cluster/data/droYak1
    cp /dev/null chrom.lst
    foreach f (?{,?}/chr*.agp chrM)
      echo $f:t:r >> chrom.lst
    end
    hgGoldGapGl -noGl -chromLst=chrom.lst droYak1 /cluster/data/droYak1 .
    # featureBits fails if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql droYak1
    # oops, that won't work until v4.1, so do this for the time being:
    hgsql droYak1 -e 'create table chrM_gap select * from chr2L_gap where 0=1'


# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 5/22/04 angie)
    ssh kksilo
    cd /cluster/data/droYak1
    #- Build the .zip files -- no genbank for now.
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
zip -j zip/chromOut.zip */chr*.fa.out
zip -j zip/chromFa.zip */chr*.fa
zip -j zip/chromFaMasked.zip */chr*.fa.masked
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
'_EOF_'
    # << this line makes emacs coloring happy
    csh ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/droYak1/zip
    set gp = /usr/local/apache/htdocs/goldenPath/droYak1
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f ( ../*/chr*.fa )
      zip -j $gp/chromosomes/$f:t.zip $f
    end

    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    cd $gp/chromosomes
    md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's
    # Can't make refGene upstream sequence files - no refSeq for yakuba.


# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 5/22/04 angie)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/droYak1/nib
    mkdir -p /iscratch/i/droYak1/nib
    cp -p /cluster/data/droYak1/nib/chr*.nib /iscratch/i/droYak1/nib
    # Pseudo-contig fa that have been repeat- and trf-masked:
    rm -rf /iscratch/i/droYak1/trfFa
    mkdir /iscratch/i/droYak1/trfFa
    foreach d (/cluster/data/droYak1/*/chr*_?{,?})
      cp $d/$d:t.fa /iscratch/i/droYak1/trfFa
    end
    iSync


# PRODUCING GENSCAN PREDICTIONS (DONE 5/22/04 angie)
    # Run on small cluster -- genscan needs big mem.
    ssh hgwdev
    mkdir /cluster/data/droYak1/bed/genscan
    cd /cluster/data/droYak1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/droYak1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    cp /dev/null genome.list
    foreach f ( `ls -1S /cluster/data/droYak1/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 49 of 49 jobs
#Average job time:                 110s       1.84m     0.03h    0.00d
#Longest job:                      171s       2.85m     0.05h    0.00d
#Submission to last job:           425s       7.08m     0.12h    0.00d
    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".

    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/droYak1/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/droYak1/bed/genscan
    ldHgGene droYak1 genscan genscan.gtf
    hgPepPred droYak1 generic genscanPep genscan.pep
    hgLoadBed droYak1 genscanSubopt genscanSubopt.bed


# SWAP BLASTZ MELANOGASTER-YAKUBA TO YAKUBA-MEL (dm1) (DONE 5/22/04 angie)
    ssh kolossus
    mkdir /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22
    set aliDir = /cluster/data/dm1/bed/blastz.droYak1.2004-05-22
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
#623M    /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChrom
#623M    unsorted
#623M    axtChrom
    rm -r unsorted


# CHAIN MELANOGASTER BLASTZ (DONE 5/27/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn_random $1 \
| axtChain stdin -verbose=0 \
  /iscratch/i/droYak1/nib \
  /cluster/bluearc/drosophila/dm1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 18 of 18 jobs
#Average job time:                  16s       0.26m     0.00h    0.00d
#Longest job:                       59s       0.98m     0.02h    0.00d
#Submission to last job:            59s       0.98m     0.02h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain droYak1 ${c}_chainDm1 $i
    end


# NET MELANOGASTER BLASTZ (DONE 5/27/04 angie)
    ssh kksilo
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain
    netClass -noAr noClass.net droYak1 dm1 melanogaster.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain
    rm noClass.net
    netFilter -syn melanogaster.net > melanogasterSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain
    netFilter -minGap=10 melanogaster.net |  hgLoadNet droYak1 netDm1 stdin
    netFilter -minGap=10 melanogasterSyn.net | hgLoadNet droYak1 netSyntenyDm1 stdin


# MAKE VSDM1 DOWNLOADABLES (DONE 5/27/04 angie)
    ssh kksilo
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22/axtChain
    cp all.chain melanogaster.chain
    zip /cluster/data/droYak1/zip/melanogaster.chain.zip melanogaster.chain
    rm melanogaster.chain
    zip /cluster/data/droYak1/zip/melanogaster.net.zip melanogaster.net
    cd /cluster/data/droYak1/bed/blastz.dm1.swap.2004-05-22
    zip /cluster/data/droYak1/zip/axtChrom.zip axtChrom/chr*.axt

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/droYak1/vsDm1
    cd /usr/local/apache/htdocs/goldenPath/droYak1/vsDm1
    mv /cluster/data/droYak1/zip/melanogaster*.zip .
    mv /cluster/data/droYak1/zip/axtChrom.zip .
    md5sum *.zip > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# AUTO UPDATE GENBANK MRNA RUN  (DONE 7/15/04 angie)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf and add these lines:
# droYak1 (D. yakuba)
droYak1.genome = /iscratch/i/droYak1/nib/chr*.nib
droYak1.lift = /cluster/data/droYak1/jkStuff/liftAll.lft
droYak1.refseq.mrna.native.load = no
droYak1.refseq.mrna.xeno.load = yes
droYak1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
droYak1.genbank.mrna.xeno.load = yes
droYak1.genbank.est.xeno.load = no
droYak1.downloadDir = droYak1

    cvs ci etc/genbank.conf
    # Since D. yakuba is a new species for us, edit src/lib/gbGenome.c.  
    # Pick some other browser species, & monkey-see monkey-do.  
    cvs diff src/lib/gbGenome.c
    make
    cvs ci src/lib/gbGenome.c

    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, (xeno) RefSeq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial droYak1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droYak1
    featureBits droYak1 xenoRefGene
#26062889 bases of 169423277 (15.383%) in intersection
    # Clean up:
    rm -rf work/initial.droYak1

    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial droYak1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droYak1
    featureBits droYak1 mrna
#350085 bases of 169423277 (0.207%) in intersection
    featureBits droYak1 xenoMrna
#22644430 bases of 169423277 (13.366%) in intersection
    # Clean up:
    rm -rf work/initial.droYak1

    ssh eieio
    # -initial for ESTs
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial droYak1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 droYak1 &
    # Wow, very very little for yakuba, even for ESTs!
    featureBits droYak1 intronEst
#277291 bases of 169423277 (0.164%) in intersection
    featureBits droYak1 est
#420049 bases of 169423277 (0.248%) in intersection
    # Clean up:
    rm -rf work/initial.droYak1


# MAKE GCPERCENT (DONE 7/21/04 angie)
    ssh hgwdev
    mkdir /cluster/data/droYak1/bed/gcPercent
    cd /cluster/data/droYak1/bed/gcPercent
    # create and load gcPercent table
    hgsql droYak1  < ~/src/hg/lib/gcPercent.sql
    hgGcPercent droYak1 ../../nib


# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 7/21/04 angie)
    ssh hgwdev
    echo 'insert into blatServers values("droYak1", "blat7", "17780", 1, 0); \
          insert into blatServers values("droYak1", "blat7", "17781", 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest


# MAKE Drosophila Proteins track
    ssh kksilo
    mkdir -p /cluster/data/droYak1/blastDb
    cd /cluster/data/droYak1/blastDb
    for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/droYak1/blastDb
    cp /cluster/data/droYak1/blastDb/* /iscratch/i/droYak1/blastDb
    (~kent/bin/iSync) 2>&1 > sync.out
    
    mkdir -p /cluster/data/droYak1/bed/tblastn.dm1FB
    cd /cluster/data/droYak1/bed/tblastn.dm1FB
    ls -1S /iscratch/i/droYak1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
    exit

    # back to kksilo
    cd /cluster/data/droYak1/bed/tblastn.dm1FB
    mkdir fbfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
    # 18735/(150000/463) = 57.828700
    split -l 58 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droYak1/jkStuff/subLift.lft warn $f.2
        liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droYak1/jkStuff/liftAll.lft warn $f.3
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.4
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 bug.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/droYak1/bed/tblastn.dm1FB
    para create blastSpec
    para shove   # jobs will need to be restarted, but they should all finish
# Completed: 150012 of 150012 jobs
# CPU time in finished jobs:    2892894s   48214.90m   803.58h   33.48d  0.092 y
# IO & Wait Time:                748046s   12467.43m   207.79h    8.66d  0.024 y
# Average job time:                  24s       0.40m     0.01h    0.00d
# Longest job:                      562s       9.37m     0.16h    0.01d
# Submission to last job:         14761s     246.02m     4.10h    0.17d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/droYak1/bed/tblastn.dm1FB
    para create chainSpec
    para push
# Completed: 324 of 324 jobs
#  CPU time in finished jobs:        409s       6.81m     0.11h    0.00d  0.000 y
#  IO & Wait Time:                  8539s     142.32m     2.37h    0.10d  0.000 y
#  Average job time:                  28s       0.46m     0.01h    0.00d
#  Longest job:                       89s       1.48m     0.02h    0.00d
#  Submission to last job:           585s       9.75m     0.16h    0.01d

    exit
    # back to kksilo
    cd /cluster/data/droYak1/bed/tblastn.dm1FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* |  sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../preblastDm1FB.psl
    cd ..
    blatDir=/cluster/data/dm1/bed/blat.dm1FB
    protDat -fb preblastDm1FB.psl $blatDir/dm1FB.psl $blatDir/dm1FB.mapNames blastDm1FB.psl

    ssh hgwdev
    cd /cluster/data/droYak1/bed/tblastn.dm1FB
    hgLoadPsl droYak1 blastDm1FB.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# MAKE 11.OOC FILE FOR BLAT (DONE 11/4/04 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh kkr1u00
    mkdir /cluster/bluearc/droYak1
    blat /cluster/data/droYak1/droYak1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/droYak1/11.ooc -repMatch=100
#Wrote 7023 overused 11-mers to /cluster/bluearc/droYak1/11.ooc
    cp -p /cluster/bluearc/droYak1/*.ooc /iscratch/i/droYak1/
    iSync


# SWAP CHAINS FROM DM2, BUILD NETS ETC. (REDONE 5/23/05 angie)
# Originally done 3/2/05 -- redone (better params) 5/23/05.
    mkdir /cluster/data/droYak1/bed/blastz.dm2.swap
    cd /cluster/data/droYak1/bed/blastz.dm2.swap
    doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droYak1/DEF \
      >& do.log &
    tail -f do.log
    # Add {chain,net}Dm2 to trackDb.ra if necessary.


# MAKE Drosophila Proteins track  (DONE braney 06-30-05)
    ssh kk
    mkdir -p /cluster/data/droYak1/bed/tblastn.dm2FB
    cd /cluster/data/droYak1/bed/tblastn.dm2FB
    ls -1S /iscratch/i/droYak1/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
    mkdir fbfa
    # calculate a reasonable number of jobs 
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc target.lst | awk "{print \\\$1}"`\)
# 18929/(164630/452) = 51.970528

    split -l 52 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir -p /cluster/bluearc/droYak1/bed/tblastn.dm2FB/blastOut  
    ln -s /cluster/bluearc/droYak1/bed/tblastn.dm2FB/blastOut  
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droYak1/jkStuff/subLift.lft warn $f.2
        liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droYak1/jkStuff/liftAll.lft warn $f.3
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3 $f.4
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst fb.lst blastGsub blastSpec

    para create blastSpec
    para push

# Completed: 164980 of 164980 jobs
# CPU time in finished jobs:    1200354s   20005.90m   333.43h   13.89d  0.038 y
# IO & Wait Time:                648791s   10813.19m   180.22h    7.51d  0.021 y
# Average job time:                  11s       0.19m     0.00h    0.00d
# Longest finished job:             194s       3.23m     0.05h    0.00d
# Submission to last job:         23476s     391.27m     6.52h    0.27d

    ssh kki
    cd /cluster/data/droYak1/bed/tblastn.dm2FB
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para push
 
# Completed: 153 of 153 jobs
# CPU time in finished jobs:       8279s     137.98m     2.30h    0.10d  0.000 y
# IO & Wait Time:                  4933s      82.22m     1.37h    0.06d  0.000 y
# Average job time:                  86s       1.44m     0.02h    0.00d
# Longest finished job:            3445s      57.42m     0.96h    0.04d
# Submission to last job:          3823s      63.72m     1.06h    0.04d

    cd /cluster/data/droYak1/bed/tblastn.dm2FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/droYak1/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/droYak1/bed/tblastn.dm2FB
    hgLoadPsl droYak1 blastDm2FB.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie)
    ssh hgwdev
    mkdir /cluster/data/droYak1/bed/geneid
    cd /cluster/data/droYak1/bed/geneid
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/D.yakuba/golden_path_200404/geneidv1.2/$chr.gtf
    end
    ldHgGene -gtf -genePredExt droYak1 geneid *.gtf


# GENEMAPPER PREDICTIONS FROM UCB (DONE 1/24/06 angie)
    ssh hgwdev
    mkdir /cluster/data/droYak1/bed/geneMapper
    cd /cluster/data/droYak1/bed/geneMapper
    wget http://bio.math.berkeley.edu/genemapper/GFF/rel0.2/DroYak_1.gff
    # Get rid of custom track header lines:
    egrep -v '^(track|browser) ' DroYak_1.gff > geneMapper.gtf
    # Don't use -genePredExt... there are no start/stop_codon items, so 
    # all get marked "incmpl", and name2 always gets the same value as name.
    ldHgGene -gtf droYak1 geneMapper geneMapper.gtf


# SWAP CHAINS FROM DROSIM1, BUILD NETS ETC. (DONE 6/26/06 angie)
    mkdir /cluster/data/droYak1/bed/blastz.droSim1.swap
    cd /cluster/data/droYak1/bed/blastz.droSim1.swap
    doBlastzChainNet.pl -swap /cluster/data/droSim1/bed/blastz.droYak1/DEF \
      >& do.log &
    tail -f do.log
    ln -s blastz.droSim1.swap /cluster/data/droYak1/bed/blastz.droSim1


