#############################################################################
## 26-Way Multiz (DONE - 2015-08-13 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/ce11/bed/multiz26way
    cd /hive/data/genomes/ce11/bed/multiz26way

    # constructed a tree from NCBI taxonomy and then edit to make it
    # a binary tree:
(((((((((((((ce11:0.1, (caePb3:0.1, (caeRem4:0.1, cb4:0.1):0.1):0.1):0.1,
 caeJap4:0.1):0.1, caeSp111:0.1):0.1, caeAng2:0.1):0.1, caeSp51:0.1):0.1,
  hetBac1:0.1):0.1, (strRat2:0.1, panRed1:0.1):0.1):0.1, ((ancCey1:0.1,
   necAme1:0.1):0.1, haeCon2:0.1):0.1):0.1, ascSuu1:0.1):0.1, (priExs1:0.1,
    priPac3:0.1):0.1):0.1, ((melHap1:0.1, melInc2:0.1):0.1,
     burXyl1:0.1):0.1):0.1, (((dirImm1:0.1, loaLoa1:0.1):0.1, oncVol1:0.1):0.1,
      bruMal2:0.1):0.1):0.1, (triSpi1:0.1, triSui1:0.1):0.1);


    #	what that looks like:
 ~/kent/src/hg/utils/phyloTrees/asciiTree.pl ce11.26way.nh | sed -e 's/^/# /;'
# (((((((((((((ce11:0.1,
#             (caePb3:0.1,
#             (caeRem4:0.1,
#             cb4:0.1):0.1):0.1):0.1,
#            caeJap4:0.1):0.1,
#           caeSp111:0.1):0.1,
#          caeAng2:0.1):0.1,
#         caeSp51:0.1):0.1,
#        hetBac1:0.1):0.1,
#       (strRat2:0.1,
#       panRed1:0.1):0.1):0.1,
#      ((ancCey1:0.1,
#       necAme1:0.1):0.1,
#      haeCon2:0.1):0.1):0.1,
#     ascSuu1:0.1):0.1,
#    (priExs1:0.1,
#    priPac3:0.1):0.1):0.1,
#   ((melHap1:0.1,
#    melInc2:0.1):0.1,
#   burXyl1:0.1):0.1):0.1,
#  (((dirImm1:0.1,
#    loaLoa1:0.1):0.1,
#   oncVol1:0.1):0.1,
#  bruMal2:0.1):0.1):0.1,
# (triSpi1:0.1,
# triSui1:0.1):0.1);

    # extract species list from that .nh file
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        ce11.26way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
        | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt

    # construct db to name translation list:
    cat species.list.txt | while read DB
do
hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \
        | sed -e 's/-nosed/_nosed/; s/-eating/_eating/;' > db.to.name.txt

    # construct a common name .nh file:
    /cluster/bin/phast/tree_doctor --rename \
    "`cat db.to.name.txt`" ce11.26way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > ce11.26way.commonNames.nh
    cat ce11.26way.commonNames.nh | sed -e 's/^/# /;'
# (((((((((((((C__elegans:0.1,
#             (C__brenneri:0.1,
#             (C__remanei:0.1,
#             C__briggsae:0.1):0.1):0.1):0.1,
#            C__japonica:0.1):0.1,
#           C__tropicalis:0.1):0.1,
#          C__angaria:0.1):0.1,
#         C__sp__5_ju800:0.1):0.1,
#        H__bacteriophora:0.1):0.1,
#       (S__ratti_ed321:0.1,
#       P__redivivus:0.1):0.1):0.1,
#      ((Hookworm:0.1,
#       Hookworm:0.1):0.1,
#      H__contortus:0.1):0.1):0.1,
#     Pig_roundworm:0.1):0.1,
#    (P__exspectatus:0.1,
#    P__pacificus:0.1):0.1):0.1,
#   ((M__hapla:0.1,
#    M__incognita:0.1):0.1,
#   Pine_wood_nematode:0.1):0.1):0.1,
#  (((Dog_heartworm_nematode:0.1,
#    Eye_worm:0.1):0.1,
#   O__volvulus:0.1):0.1,
#  B__malayi:0.1):0.1):0.1,
# (T__spiralis:0.1,
# Whipworm:0.1):0.1);

#	Use this specification in the phyloGif tool:
#	http://genome.ucsc.edu/cgi-bin/phyloGif
#	to obtain a png image for src/hg/htdocs/images/phylo/ce11_26way.png
    grep TREE 4d/all.mod | awk '{print $NF}' \
       | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin > t.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \
	| $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
	    > ce11.26way.scientificNames.nh

    ~/kent/src/hg/utils/phyloTrees/asciiTree.pl ce11.26way.nh > t.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
          > ce11.26way.scientificNames.nh
    rm -f t.nh
    cat ce11.26way.scientificNames.nh | sed -e 's/^/# /;'
# (((((((((((((Caenorhabditis_elegans:0.1,
#             (Caenorhabditis_brenneri:0.1,
#             (Caenorhabditis_remanei:0.1,
#             Caenorhabditis_briggsae:0.1):0.1):0.1):0.1,
#            Caenorhabditis_japonica:0.1):0.1,
#           Caenorhabditis_tropicalis:0.1):0.1,
#          Caenorhabditis_angaria:0.1):0.1,
#         Caenorhabditis_sp5_ju800:0.1):0.1,
#        Heterorhabditis_bacteriophora:0.1):0.1,
#       (Strongyloides_ratti:0.1,
#       Panagrellus_redivivus:0.1):0.1):0.1,
#      ((Ancylostoma_ceylanicum:0.1,
#       Necator_americanus:0.1):0.1,
#      Haemonchus_contortus:0.1):0.1):0.1,
#     Ascaris_suum:0.1):0.1,
#    (Pristionchus_exspectatus:0.1,
#    Pristionchus_pacificus:0.1):0.1):0.1,
#   ((Meloidogyne_hapla:0.1,
#    Meloidogyne_incognita:0.1):0.1,
#   Bursaphelenchus_xylophilus:0.1):0.1):0.1,
#  (((Dirofilaria_immitis:0.1,
#    Loa_loa:0.1):0.1,
#   Onchocerca_volvulus:0.1):0.1,
#  Brugia_malayi:0.1):0.1):0.1,
# (Trichinella_spiralis:0.1,
# Trichuris_suis:0.1):0.1);

    /cluster/bin/phast/all_dists ce11.26way.nh | grep ce11 \
        | sed -e "s/ce11.//" | sort -k2n > 26way.distances.txt
    #	Use this output to create the table below
    cat 26way.distances.txt | sed -e 's/^/# /;'
# caeJap4       0.300000
# caePb3        0.300000
# caeRem4       0.400000
# caeSp111      0.400000
# cb4   0.400000
# caeAng2       0.500000
# caeSp51       0.600000
# hetBac1       0.700000
# panRed1       0.900000
# strRat2       0.900000
# ascSuu1       1.000000
# haeCon2       1.000000
# ancCey1       1.100000
# necAme1       1.100000
# priExs1       1.200000
# priPac3       1.200000
# burXyl1       1.300000
# bruMal2       1.400000
# melHap1       1.400000
# melInc2       1.400000
# oncVol1       1.500000
# triSpi1       1.500000
# triSui1       1.500000
# dirImm1       1.600000
# loaLoa1       1.600000

    cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH, "<26way.distances.txt") or
        die "can not read 26way.distances.txt";

my $count = 0;
while (my $line = <FH>) {
    chomp $line;
    my ($D, $dist) = split('\s+', $line);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/ce11/bed/lastz.$D/fb.ce11." .
        $chain . "Link.txt";
    my $chainLinkMeasure =
        `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainLinkMeasure;
    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
    $chainLinkMeasure =~ s/\%//;
    my $swapFile="/hive/data/genomes/${D}/bed/lastz.ce11/fb.${D}.chainCe11Link.txt";
    my $swapMeasure = "N/A";
    if ( -s $swapFile ) {
	$swapMeasure =
	    `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
	chomp $swapMeasure;
	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
	$swapMeasure =~ s/\%//;
    }
    my $orgName=
    `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        $orgName="N/A";
    }
    ++$count;
    printf "# %02d  %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist,
        $chainLinkMeasure, $swapMeasure, $orgName, $D;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./sizeStats.pl
    ./sizeStats.pl
#

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure

#       featureBits chainLink measures
#               chainLink
#  N distance  on ce11  on other     other species
# 01  0.3000 (% 27.709) (% 19.533) - C. japonica caeJap4
# 02  0.3000 (% 40.638) (% 32.296) - C. brenneri caePb3
# 03  0.4000 (% 41.715) (% 33.416) - C. remanei caeRem4
# 04  0.4000 (% 37.870) (% 47.362) - C. tropicalis caeSp111
# 05  0.4000 (% 39.384) (% 36.204) - C. briggsae cb4
# 06  0.5000 (% 17.682) (% 20.987) - C. angaria caeAng2
# 07  0.6000 (% 40.332) (% 31.721) - C. sp. 5 ju800 caeSp51
# 08  0.7000 (% 10.486) (% 13.556) - H. bacteriophora/m31e hetBac1
# 09  0.9000 (% 06.585) (% 10.359) - Microworm panRed1
# 10  0.9000 (% 05.669) (% 11.848) - Threadworm strRat2
# 11  1.0000 (% 05.722) (% 02.224) - Pig roundworm ascSuu1
# 12  1.0000 (% 08.936) (% 03.405) - Barber pole worm haeCon2
# 13  1.1000 (% 09.736) (% 03.237) - A. ceylanicum ancCey1
# 14  1.1000 (% 08.902) (% 04.115) - N. americanus necAme1
# 15  1.2000 (% 06.222) (% 04.321) - P. exspectatus priExs1
# 16  1.2000 (% 06.308) (% 04.534) - P. pacificus priPac3
# 17  1.3000 (% 06.291) (% 08.293) - Pine wood nematode burXyl1
# 18  1.4000 (% 05.065) (% 05.521) - Filarial worm bruMal2
# 19  1.4000 (% 03.916) (% 06.840) - M. hapla melHap1
# 20  1.4000 (% 03.516) (% 05.539) - M. incognita melInc2
# 21  1.5000 (% 05.193) (% 05.092) - O. volvulus oncVol1
# 22  1.5000 (% 02.848) (% 04.678) - Trichinella triSpi1
# 23  1.5000 (% 02.917) (% 03.714) - Whipworm triSui1
# 24  1.6000 (% 05.005) (% 05.474) - Dog heartworm dirImm1
# 25  1.6000 (% 05.056) (% 05.274) - Eye worm loaLoa1

# None of this concern for distances matters in building the first step, the
# maf files.

# Survey N50 stats to determine quality of assembly:

echo -e "# db\tcontigCount\tN50\ttotalSize\tperCentAtN50"
export T=/tmp/txy.$$
cat species.list.txt | while read D
do
  rm -f ${T}
  n50.pl /hive/data/genomes/${D}/chrom.sizes > ${T} 2>&1
  cc=`grep "contig count:" ${T} | awk '{print $4}' | sed -e 's/,//g;'`
  ts=`grep "contig count:" ${T} | awk '{print $7}' | sed -e 's/,//g;'`
  n50Size=`tail -1 ${T} | awk '{print $NF}'`
  perCent=`echo $n50Size $ts | awk '{printf "%.6f", 100*$1/$2}'`
  echo -e "# ${D}\t${cc}\t${n50Size}\t${ts}\t% ${perCent}"
done | sort -k4,4nr

# db    contigCount     N50     totalSize       perCentAtN50
# oncVol1       710     25485961        97402144        % 26.165708
# caeSp111      665     20921866        79321433        % 26.376057
# ce11  7       17493829        100286401       % 17.443870
# cb4   13      17485439        108434085       % 16.125408
# strRat2       135     11693564        43150242        % 27.099649
# triSpi1       6864    6373445 63542128        % 10.030267
# priPac3       18084   1244534 172510819       % 0.721424
# burXyl1       5528    949830  74576239        % 1.273636
# ancCey1       1736    668412  313092710       % 0.213487
# triSui1       4293    503034  74248995        % 0.677496
# caeRem4       3670    435512  145442736       % 0.299439
# ascSuu1       12989   413062  269573965       % 0.153228
# caePb3        3305    381961  190369721       % 0.200642
# hetBac1       1241    312328  76992477        % 0.405660
# panRed1       941     262414  65110236        % 0.403030
# necAme1       11865   211861  244088665       % 0.086797
# bruMal2       9781    200860  95156665        % 0.211083
# loaLoa1       5765    174388  91379422        % 0.190839
# priExs1       4369    142245  177536987       % 0.080121
# caeJap4       18817   94149   166256191       % 0.056629
# haeCon2       23823   83299   369720058       % 0.022530
# caeAng2       34621   79858   105997628       % 0.075339
# dirImm1       16062   71281   88323343        % 0.080705
# melInc2       2996    62516   86079534        % 0.072626
# melHap1       3452    37608   53017507        % 0.070935
# caeSp51       15261   25228   131797386       % 0.019142

# Conclusion: Syntenic Net: oncVol1 caeSp111 cb4 strRat2 triSpi1
# ordinary net: the rest
# XXX took a look at the syntenic net alignments, way too much sequence
#     was missing, therefore always use the 'ordinary' net

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	ce11.26way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh

    sed 's/[()]//g; s/,/ /g' tree.nh > species.list
    # should be 26:
    wc -w species.list
    #  26 species.list

    # ce11 caePb3 caeRem4 cb4 caeJap4 caeSp111 caeAng2 caeSp51 hetBac1
    # strRat2 panRed1 ancCey1 necAme1 haeCon2 ascSuu1 priExs1 priPac3 melHap1
    # melInc2 burXyl1 dirImm1 loaLoa1 oncVol1 bruMal2 triSpi1 triSui1

    #	bash shell syntax here ...
    cd /hive/data/genomes/ce11/bed/multiz26way
    export H=/hive/data/genomes/ce11/bed
    mkdir mafLinks

    # using 'ordinary' net for all:
    for G in `sed -e 's/ce11 //' species.list`
    do
      mkdir -p mafLinks/$G
      echo ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
      ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    # verify the symLinks are good:
    ls -ogrtL mafLinks/*/* | sed -e 's#.*/chr#chr#;' | sort | uniq -c
    # 25 chrI.maf.gz
    # 25 chrII.maf.gz
    # 25 chrIII.maf.gz
    # 25 chrIV.maf.gz
    # 24 chrM.maf.gz
    # 25 chrV.maf.gz
    # 25 chrX.maf.gz

    # it appears one of them did not have any matching sequence to chrM
    ls -lR | grep chrM | awk '{print $NF}' \
      | sed -e 's#.*lastz.##; s#/mafNet.*##;' | sort \
        | xargs echo | fold -w72 -s
# ancCey1 ascSuu1 bruMal2 burXyl1 caeAng2 caeJap4 caePb3 caeRem4 caeSp51
# cb4 dirImm1 haeCon2 hetBac1 loaLoa1 melHap1 melInc2 necAme1 oncVol1
# panRed1 priExs1 priPac3 strRat2 triSpi1 triSui1

    # compared to all of them:
    sort species.list.txt |  xargs echo | fold -w 72 -s
# ancCey1 ascSuu1 bruMal2 burXyl1 caeAng2 caeJap4 caePb3 caeRem4 caeSp111
# caeSp51 cb4 ce11 dirImm1 haeCon2 hetBac1 loaLoa1 melHap1 melInc2
# necAme1 oncVol1 panRed1 priExs1 priPac3 strRat2 triSpi1 triSui1

    # looks like it is caeSp111 that has no chrM alignment
    # found same pattern on ce10, curious.

    # even though there are 26 of these, they can still be done as
    # full chromosomes without the splitting procedure

    mkdir /hive/data/genomes/ce11/bed/multiz26way/fullChromRun
    cd /hive/data/genomes/ce11/bed/multiz26way/fullChromRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    find ../../mafLinks -type l | grep ".maf.gz$" | xargs -L 1 basename \
       | sed -e 's/.gz//;' | sort -u > maf.list

    wc -l maf.list

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = ce11
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/ce11/bed/multiz26way/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/ce11/bed/multiz26way/fullChromRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    ssh ku
    cd /hive/data/genomes/ce11/bed/multiz26way/fulChromRun/run
    gensub2 maf.list single template jobList
    para -ram=32g create jobList
# Completed: 7 of 7 jobs
# CPU time in finished jobs:      11928s     198.80m     3.31h    0.14d  0.000 y
# IO & Wait Time:                    29s       0.48m     0.01h    0.00d  0.000 y
# Average job time:                1708s      28.47m     0.47h    0.02d
# Longest finished job:            2176s      36.27m     0.60h    0.03d
# Submission to last job:          2189s      36.48m     0.61h    0.03d


    cd /hive/data/genomes/ce11/bed/multiz26way
    du -hsc maf
    # 1.3G    maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/ce11/bed/multiz26way/fullChromRun/maf
    mkdir /gbdb/ce11/multiz26way
    ln -s `pwd`/*.maf /gbdb/ce11/multiz26way
    cd /dev/shm
    time hgLoadMaf ce11 multiz26way
    # Loaded 1469313 mafs in 7 files from /gbdb/ce11/multiz26way
    # real    0m28.115s

    time cat /gbdb/ce11/multiz26way/*.maf \
	| hgLoadMafSummary ce11 -minSize=10000 -verbose=2 \
		-mergeGap=500 -maxSize=50000  multiz26waySummary stdin
    # Created 397940 summary blocks from 12243969 components and 1469313 mafs
    # from stdin
    # real    0m38.197s

# -rw-rw-r-- 1 70164926 Aug 13 14:34 multiz26way.tab
# -rw-rw-r-- 1 17851130 Aug 13 14:35 multiz26waySummary.tab

    wc -l multiz26way*
    # 1469313 multiz26way.tab
    #  397940 multiz26waySummary.tab

    rm multiz26way*.tab

##############################################################################
# GAP ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2015-08-13 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	are in a single file.  If there is a single maf file, need to split
    #   into individual per-chrom maf files
    mkdir -p /hive/data/genomes/ce11/bed/multiz26way/anno
    cd /hive/data/genomes/ce11/bed/multiz26way/anno

    # this would split a single file into per-chrom files
#    mkdir -p /hive/data/genomes/ce11/bed/multiz26way/anno/mafSplit
#    cd /hive/data/genomes/ce11/bed/multiz26way/anno/mafSplit
#    time mafSplit -outDirDepth=1 -byTarget -useFullSequenceName \
#        /dev/null . ../../multiz26way.maf

    # already per-chrom files here:
    find ../fullChromRun/maf -type f | wc -l
    #   7

    # check for N.bed files everywhere:
    cd /hive/data/genomes/ce11/bed/multiz26way/anno
    for DB in `cat ../species.list`
do
    if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
        echo "MISS: ${DB}"
        cd /hive/data/genomes/${DB}
        twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
    else
        echo "  OK: ${DB}"
    fi
done
    cd /hive/data/genomes/ce11/bed/multiz26way/anno
    # they end up empty for ce11 and melHap1, no gaps:
    hgsql -e 'select count(*) from gap;' ce11
# +----------+
# | count(*) |
# +----------+
# |        0 |
# +----------+
    hgsql -e 'select count(*) from gap;' melHap1
# +----------+
# | count(*) |
# +----------+
# |        0 |
# +----------+

    cd /hive/data/genomes/ce11/bed/multiz26way/anno
    for DB in `cat ../species.list`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done
    # make sure they all are successful symLinks:
    ls -ogrtL

    screen -S ce11      # use a screen to control this longish job
    ssh ku
    cd /hive/data/genomes/ce11/bed/multiz26way/anno
    mkdir result

    cat << '_EOF_' > template
#LOOP
mafAddIRows -nBeds=nBeds ../fullChromRun/maf/$(path1) /hive/data/genomes/ce11/ce11.2bit {check out exists+ result/$(path1)}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls ../maf > maf.list
    gensub2 maf.list single template jobList
    # limit jobs on a node with the -maxJob=64 requirement because they go fast
    # XXX - use more memory next time to allow all big jobs to finish
    para -ram=32g create jobList
    para try ... check ... push ...
    # don't run too many at once, these go very fast
    para -maxJob=64 push
# Completed: 7 of 7 jobs
# CPU time in finished jobs:         89s       1.48m     0.02h    0.00d  0.000 y
# IO & Wait Time:                    18s       0.30m     0.01h    0.00d  0.000 y
# Average job time:                  15s       0.25m     0.00h    0.00d
# Longest finished job:              21s       0.35m     0.01h    0.00d
# Submission to last job:            25s       0.42m     0.01h    0.00d

    # verify all result files have some content, look for 0 size files:
    find ./result -type f -size 0
    # should see none
    # or in this manner:
    find ./result -type f | xargs ls -og | sort -k3nr | tail
# -rw-rw-r-- 1    785220 Aug 13 14:41 ./result/chrM.maf

    du -hsc result
    # 2.4G    result

    # construct symlinks to get the individual maf files into gbdb:
    cd /hive/data/genomes/ce11/bed/multiz26way/anno/result
    rm /gbdb/ce11/multiz26way/*.maf   # remove previous results
    ln -s `pwd`/*.maf /gbdb/ce11/multiz26way

    # Load into database
    cd /dev/shm
    time hgLoadMaf -pathPrefix=/gbdb/ce11/multiz26way \
        ce11 multiz26way
    # Loaded 1512814 mafs in 7 files from /gbdb/ce11/multiz26way
    # real    0m42.327s

    time cat /gbdb/ce11/multiz26way/*.maf \
	| hgLoadMafSummary ce11 -minSize=10000 -verbose=2 \
		-mergeGap=500 -maxSize=50000  multiz26waySummary stdin
    # Indexing and tabulating stdin
    # Created 397940 summary blocks from 12243969 components and
    #    1512814 mafs from stdin
    # real    0m51.963s

# -rw-rw-r-- 1 72568758 Aug 13 14:44 multiz26way.tab
# -rw-rw-r-- 1 18647010 Aug 13 14:45 multiz26waySummary.tab

    wc -l multiz26way*
    #   1512814 multiz26way.tab
    #    397940 multiz26waySummary.tab

    rm multiz26way*.tab

######################################################################
# MULTIZ7WAY MAF FRAMES (DONE - 2015-09-08 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/ce11/bed/multiz26way/frames
    cd /hive/data/genomes/ce11/bed/multiz26way/frames
#   survey all the genomes to find out what kinds of gene tracks they have
    cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
    echo -n "${db}: "
    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
    foreach table ($tables)
        if ($table == "ensGene" || $table == "refGene" || \
           $table == "mgcGenes" || $table == "knownGene" || \
           $table == "ws245Genes" || $table == "ncbiGenes" || \
           $table == "xenoRefGene" ) then
           set count = `hgsql $db -N -e "select count(*) from $table"`
            echo -n "${table}: ${count}, "
        endif
    end
    set orgName = `hgsql hgcentraltest -N -e \
            "select scientificName from dbDb where name='$db'"`
    set orgId = `hgsql ce11 -N -e \
            "select id from organism where name='$orgName'"`
    if ($orgId == "") then
        echo "Mrnas: 0"
    else
        set count = `hgsql ce11 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
        echo "Mrnas: ${count}"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./showGenes.csh
    time ./showGenes.csh
# ce11: ensGene: 57834, refGene: 53470, ws245Genes: 57528, Mrnas: 52714
# caePb3: ws245Genes: 33210, xenoRefGene: 189584,
# caeRem4: ws245Genes: 32899, xenoRefGene: 155137,
# cb4: ws245Genes: 23050, xenoRefGene: 130721,
# caeJap4: ws245Genes: 38144, xenoRefGene: 158453,
# caeSp111: ws245Genes: 27721,
# caeAng2: ws245Genes: 33934,
# caeSp51: ws245Genes: 46280,
# hetBac1: ws245Genes: 20928,
# strRat2: ncbiGenes: 12459,
# panRed1: ws245Genes: 26372,
# ancCey1: ws245Genes: 65583,
# necAme1: ws245Genes: 19643,
# haeCon2: ws245Genes: 26359,
# ascSuu1: ws245Genes: 3082, xenoRefGene: 134962,
# priExs1: ws245Genes: 24642, xenoRefGene: 0,
# priPac3: ws245Genes: 24243, xenoRefGene: 89943,
# melHap1: ws245Genes: 14420, xenoRefGene: 88300,
# melInc2: ws245Genes: 20365,
# burXyl1: ws245Genes: 18074,
# dirImm1: ws245Genes: 12857,
# loaLoa1: ws245Genes: 15577,
# oncVol1: ws245Genes: 12385,
# bruMal2: ws245Genes: 18709,
# triSpi1: ws245Genes: 16380,
# triSui1: ws245Genes: 14435,

    # from that summary, use these gene sets:
    # ws245Genes - ce11 caePb3 caeRem4 cb4 caeJap4 caeSp111 caeAng2 caeSp51
    #              hetBac1 panRed1 ancCey1 necAme1 haeCon2
    #              priExs1 priPac3 melHap1 melInc2 burXyl1 dirImm1 loaLoa1
    #              oncVol1 bruMal2 triSpi1 triSui1
    # ncbiGenes - strRat2
    # ascSuu1 - assembly confusion, can not determine genes

    mkdir genes

    #   1. ws245Genes
    for DB in ce11 caePb3 caeRem4 cb4 caeJap4 caeSp111 caeAng2 caeSp51 hetBac1 panRed1 ancCey1 necAme1 haeCon2 priExs1 priPac3 melHap1 melInc2 burXyl1 dirImm1 loaLoa1 oncVol1 bruMal2 triSpi1 triSui1
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ws245Genes" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo -n "$DB: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
ce11: checked: 20412 failed: 0
caePb3: checked: 30626 failed: 0
caeRem4: checked: 31433 failed: 0
cb4: checked: 21737 failed: 0
caeJap4: checked: 29913 failed: 0
caeSp111: checked: 22306 failed: 0
caeAng2: checked: 27919 failed: 0
caeSp51: checked: 34679 failed: 0
hetBac1: checked: 20928 failed: 0
panRed1: checked: 24071 failed: 0
ancCey1: checked: 35775 failed: 0
necAme1: checked: 18984 failed: 0
haeCon2: checked: 21837 failed: 0
priExs1: checked: 23767 failed: 0
priPac3: checked: 22872 failed: 0
melHap1: checked: 14103 failed: 0
melInc2: checked: 19189 failed: 0
burXyl1: checked: 18074 failed: 0
dirImm1: checked: 12857 failed: 0
loaLoa1: checked: 14903 failed: 0
oncVol1: checked: 12133 failed: 0
bruMal2: checked: 13471 failed: 0
triSpi1: checked: 16370 failed: 0
triSui1: checked: 13600 failed: 0

    #   1. ncbiGenes: strRat2
    for DB in strRat2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiGenes" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo -n "$DB: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
    # strRat2: checked: 12439 failed: 0

    # verify counts for genes are reasonable:
    for T in genes/*.gz
do
    echo -n "# $T: "
    zcat $T | cut -f1 | sort | uniq -c | wc -l
done
# genes/ancCey1.gp.gz: 35775
# genes/bruMal2.gp.gz: 13471
# genes/burXyl1.gp.gz: 18074
# genes/caeAng2.gp.gz: 27919
# genes/caeJap4.gp.gz: 29913
# genes/caePb3.gp.gz: 30626
# genes/caeRem4.gp.gz: 31433
# genes/caeSp111.gp.gz: 22306
# genes/caeSp51.gp.gz: 34679
# genes/cb4.gp.gz: 21737
# genes/ce11.gp.gz: 20412
# genes/dirImm1.gp.gz: 12857
# genes/haeCon2.gp.gz: 21837
# genes/hetBac1.gp.gz: 20928
# genes/loaLoa1.gp.gz: 14903
# genes/melHap1.gp.gz: 14103
# genes/melInc2.gp.gz: 19189
# genes/necAme1.gp.gz: 18984
# genes/oncVol1.gp.gz: 12133
# genes/panRed1.gp.gz: 24071
# genes/priExs1.gp.gz: 23767
# genes/priPac3.gp.gz: 22872
# genes/strRat2.gp.gz: 12439
# genes/triSpi1.gp.gz: 16370
# genes/triSui1.gp.gz: 13600

    time (cat ../anno/result/*.maf \
	| genePredToMafFrames ce11 stdin stdout \
           `egrep -v "ascSuu1" ../species.list.txt | xargs echo | sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g;"` \
		| gzip > multiz26wayFrames.bed.gz)
    #   real    1m40.091s

    # verify there are frames on everything, should be 15 species:
    zcat multiz26wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
       | sed -e 's/^/# /;'
#  102228 ancCey1
#   62865 bruMal2
#   93717 burXyl1
#  129846 caeAng2
#  162169 caeJap4
#  183626 caePb3
#  219355 caeRem4
#  190425 caeSp111
#  214566 caeSp51
#  204669 cb4
#  123221 ce11
#   62882 dirImm1
#   89757 haeCon2
#   75007 hetBac1
#   62443 loaLoa1
#   53145 melHap1
#   46477 melInc2
#   83965 necAme1
#   65069 oncVol1
#   75620 panRed1
#   59341 priExs1
#   61095 priPac3
#   75605 strRat2
#   43472 triSpi1
#   41941 triSui1

    #   load the resulting file
    ssh hgwdev
    cd /hive/data/genomes/ce11/bed/multiz26way/frames
    time hgLoadMafFrames ce11 multiz26wayFrames multiz26wayFrames.bed.gz
    #   real    0m29.914s

    time featureBits -countGaps ce11 multiz26wayFrames
    # 29102814 bases of 100286401 (29.020%) in intersection
    #  real    0m16.783s

    #   enable the trackDb entries:
# frames multiz26wayFrames
# irows on
    #   appears to work OK

#########################################################################
# Phylogenetic tree from 26-way (DONE - 2015-08-18 - Hiram)
    mkdir /hive/data/genomes/ce11/bed/multiz26way/4d
    cd /hive/data/genomes/ce11/bed/multiz26way/4d

    # the annotated mafs are in
    ../anno/result/*.maf

    # using ensGene for ce11
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ce11 > ce11.ensGene.gp

    genePredSingleCover ce11.ensGene.gp stdout | sort > ce11.ensGeneNR.gp
    wc -l *
    #    57834 ce11.ensGene.gp
    #    20412 ce11.ensGeneNR.gp

    ssh ku
    mkdir /hive/data/genomes/ce11/bed/multiz26way/4d/run
    cd /hive/data/genomes/ce11/bed/multiz26way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set r = "/hive/data/genomes/ce11/bed/multiz26way"
set c = $1:r
set infile = $r/anno/result/$2
set outDir = $r/4d/mfa/$3:h
set outfile = $r/4d/mfa/$3
/bin/mkdir -p $outDir
cd /scratch/tmp
/bin/awk -v C=$c '$2 == C {print}' $r/4d/ce11.ensGeneNR.gp | sed -e "s/\t$c\t/\tce11.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
echo $NL
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
else
    echo "" > $outfile
endif
/bin/rm -f $c.gp $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh


    cat << '_EOF_' > template
#LOOP
4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls ../../anno/result > maf.list

    gensub2 maf.list single template jobList
    # -ram=32g to make sure they don't fail out of memory and don't run too fast
    para -ram=32g create jobList
    para try ... check   # the try runs all 7 jobs
    para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs:         95s       1.59m     0.03h    0.00d  0.000 y
# IO & Wait Time:                    21s       0.35m     0.01h    0.00d  0.000 y
# Average job time:                  17s       0.28m     0.00h    0.00d
# Longest finished job:              21s       0.35m     0.01h    0.00d
# Submission to last job:            25s       0.42m     0.01h    0.00d

    # Not all results have contents, that is OK

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/ce11/bed/multiz26way/4d
    # none of these results are empty:
    ls -ogrt mfa
# -rw-rw-r-- 1    1417 Aug 18 15:14 chrM.mfa
# -rw-rw-r-- 1 5577097 Aug 18 15:14 chrII.mfa
# -rw-rw-r-- 1 4732903 Aug 18 15:14 chrIV.mfa
# -rw-rw-r-- 1 4185213 Aug 18 15:14 chrIII.mfa
# -rw-rw-r-- 1 4402105 Aug 18 15:14 chrX.mfa
# -rw-rw-r-- 1 6934921 Aug 18 15:14 chrV.mfa
# -rw-rw-r-- 1 4628253 Aug 18 15:14 chrI.mfa

    #want comma-less species.list
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # real    0m4.558s

    # check they are all in there:
    grep "^>" 4d.all.mfa | wc -l
    # 26
    grep "^>" 4d.all.mfa | sed -e 's/^/# /;'
# >ce11
# >caePb3
# >caeRem4
# >cb4
# >caeJap4
# >caeSp111
# >caeAng2
# >caeSp51
# >hetBac1
# >strRat2
# >panRed1
# >ancCey1
# >necAme1
# >haeCon2
# >ascSuu1
# >priExs1
# >priPac3
# >melHap1
# >melInc2
# >burXyl1
# >dirImm1
# >loaLoa1
# >oncVol1
# >bruMal2
# >triSpi1
# >triSui1

    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../ce11.26way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh
    # tree_commas.nh looks like:
    # (((((((((((((ce11,(caePb3,(caeRem4,cb4))),caeJap4),caeSp111),caeAng2),
    #   caeSp51),hetBac1),(strRat2,panRed1)),((ancCey1,necAme1),haeCon2)),
    #      ascSuu1),(priExs1,priPac3)),((melHap1,melInc2),burXyl1)),
    #          (((dirImm1,loaLoa1),oncVol1),bruMal2)),(triSpi1,triSui1))

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree_commas.nh 4d.all.mfa
    #   real    102m23.619s

    mv phyloFit.mod all.mod

    grep TREE all.mod
# TREE: (((((((((((((ce11:0.641433,(caePb3:0.487648,
# (caeRem4:0.443246,cb4:0.445294):0.0464154):0.0165647):0.00599635,
# caeJap4:0.828929):0.00213273,caeSp111:0.526296):0.00774532,
# caeAng2:0.983829):0.0266384,caeSp51:0.415189):0.385011,
# hetBac1:1.35734):0.0946305,(strRat2:1.37255,
# panRed1:1.10699):0.139767):0.0425575,
# ((ancCey1:0.387375,necAme1:0.494722):0.308076,
# haeCon2:0.755625):0.31865):0.0426445,ascSuu1:1.32614):0.0433534,
# (priExs1:0.112444,priPac3:0.0999032):0.752898):0.0633886,
# ((melHap1:0.237743,melInc2:0.205426):1.14699,
# burXyl1:0.685906):0.156962):0.334957,(((dirImm1:0.213158,
# loaLoa1:0.198781):0.00222742,oncVol1:0.16899):0.0727414,
# bruMal2:0.174965):1.0837):0.137915,(triSpi1:0.820134,
# triSui1:0.850407):0.137915);

    # resulting distance table:
    /cluster/bin/phast/all_dists ce11.26way.nh | grep ce11 \
        | sed -e "s/ce11.//" | sort -k2n | sed -e 's/^/#    /;'
#    caeSp51    1.099135
#    caePb3     1.145646
#    caeRem4    1.147659
#    cb4        1.149707
#    caeSp111   1.175858
#    caeJap4    1.476358
#    caeAng2    1.641136
#    priPac3    2.144944
#    priExs1    2.157485
#    burXyl1    2.198399
#    ancCey1    2.220246
#    haeCon2    2.280420
#    necAme1    2.327593
#    panRed1    2.410344
#    hetBac1    2.426297
#    ascSuu1    2.574929
#    strRat2    2.675904
#    triSpi1    2.786452
#    triSui1    2.816725
#    melInc2    2.864909
#    melHap1    2.897226
#    bruMal2    2.949153
#    oncVol1    3.015920
#    loaLoa1    3.047938
#    dirImm1    3.062315


#########################################################################
# phastCons 26-way (DONE - 2015-09-08 - Hiram)
    # split 26way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh ku
    mkdir -p /hive/data/genomes/ce11/bed/multiz26way/cons/SS
    cd /hive/data/genomes/ce11/bed/multiz26way/cons/SS
    mkdir result done

    cat << '_EOF_' > mkSS.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/ce11/bed/multiz26way/anno/result/$c.maf
set WINDOWS = /hive/data/genomes/ce11/bed/multiz26way/cons/SS/result/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

/bin/date >> $2.running

/bin/rm -fr $WINDOWS
/bin/mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
/bin/date >> $2
/bin/rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x mkSS.csh

    cat << '_EOF_' > template
#LOOP
mkSS.csh $(root1) {check out line+ done/$(root1)}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls ../../anno/result > maf.list

    gensub2 maf.list single template jobList
    # beware overloaded the cluster with these fast running high I/O jobs
    # however, in this case, there are only 7
    para -ram=32g create jobList
    para try   # this runs all 7
    para time > run.time
# Completed: 7 of 7 jobs
# CPU time in finished jobs:        161s       2.68m     0.04h    0.00d  0.000 y
# IO & Wait Time:                    15s       0.25m     0.00h    0.00d  0.000 y
# Average job time:                  25s       0.42m     0.01h    0.00d
# Longest finished job:              34s       0.57m     0.01h    0.00d
# Submission to last job:            45s       0.75m     0.01h    0.00d

    find ./result -type f | wc -l
    #	 14

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh ku
    mkdir -p /hive/data/genomes/ce11/bed/multiz26way/cons/run.cons
    cd /hive/data/genomes/ce11/bed/multiz26way/cons/run.cons

    #	This is setup for multiple runs based on subsets, but only running
    #   the 'all' subset here.
    #   It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  Running:
    #	all and vertebrates

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set d = $2
set f = $3
set len = $4
set cov = $5
set rho = $6
set grp = $cwd:t
set cons = /hive/data/genomes/ce11/bed/multiz26way/cons
set tmp = $cons/tmp/${d}_${c}
mkdir -p $tmp
set ssSrc = $cons/SS/result
set useGrp = "$grp.mod"
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.non-inf $tmp
  ln -s $ssSrc/$d/$f $tmp
else
  ln -s $ssSrc/$d/$f $tmp
  ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
else
  $PHASTBIN/phastCons $f $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
endif
popd > /dev/null
mkdir -p pp/$d bed/$d
sleep 4
touch pp/$d bed/$d
rm -f pp/$d/$c.pp
rm -f bed/$d/$c.bed
mv $tmp/$c.pp pp/$d
mv $tmp/$c.bed bed/$d
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list
    wc -l ss.list
    #	14 ss.list

    # Create parasol batch and run it
    # run for all species
    mkdir /hive/data/genomes/ce11/bed/multiz26way/cons/all
    cd /hive/data/genomes/ce11/bed/multiz26way/cons/all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    # beware overwhelming the cluster with these fast running high I/O jobs
    # however, in this case, there are only 14 jobs
    para -ram=32g create jobList
    para try ... check ...
# Completed: 14 of 14 jobs
# CPU time in finished jobs:        412s       6.87m     0.11h    0.00d  0.000 y
# IO & Wait Time:                    81s       1.35m     0.02h    0.00d  0.000 y
# Average job time:                  35s       0.59m     0.01h    0.00d
# Longest finished job:              51s       0.85m     0.01h    0.00d
# Submission to last job:           108s       1.80m     0.03h    0.00d

    # create Most Conserved track
    cd /hive/data/genomes/ce11/bed/multiz26way/cons/all
    time cut -f1 ../../../../chrom.sizes | while read C
do
    ls -d bed/${C} 2> /dev/null | while read D
    do
        echo ${D}/${C}*.bed 1>&2
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
    # real    0m40.828s


    /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
    # -rw-rw-r-- 1 13454157 Sep  8 11:44 tmpMostConserved.bed
    # -rw-rw-r-- 1 13671554 Sep  8 11:45 mostConserved.bed

    wc -l *.bed
    # 406479 mostConserved.bed
    # 406479 tmpMostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/ce11/bed/multiz26way/cons/all
    time hgLoadBed ce11 phastConsElements26way mostConserved.bed
    # Read 406479 elements of size 5 from mostConserved.bed
    # real    0m3.735s

    # on human we often try for 5% overall cov, and 70% CDS cov
    # most bets are off here for that goal, these alignments are too few
    #	and too far between
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits ce11 -enrichment ensGene:cds phastConsElements26way
    # ensGene:cds 25.321%, phastConsElements26way 41.217%, both 19.293%,
    # cover 76.20%, enrich 1.85x

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/ce11/bed/multiz26way/cons/all
    mkdir downloads

    # the third sed fixes the chrom names, removing the partition extensions
    time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \
        | gzip -c > downloads/phastCons26way.wigFix.gz)
    #   real    0m43.836s

    # check integrity of data with wigToBigWig
    time (zcat downloads/phastCons26way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/ce11/chrom.sizes \
	    phastCons26way.bw) > bigWig.log 2>&1
    egrep "real|VmPeak" bigWig.log
    # pid=22748: VmPeak:      676500 kB
    # real    0m49.827s

    bigWigInfo phastCons26way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 118,518,400
# primaryIndexSize: 2,746,816
# zoomLevels: 10
# chromCount: 7
# basesCovered: 58,852,043
# mean: 0.686605
# min: 0.000000
# max: 1.000000
# std: 0.385078

    #	encode those files into wiggle data
    time (zcat downloads/phastCons26way.wigFix.gz \
	| wigEncode stdin phastCons26way.wig phastCons26way.wib)
    #   Converted stdin, upper limit 1.00, lower limit 0.00
    #   real    0m17.307s


    du -hsc *.wi?
    # 57M     phastCons26way.wib
    # 7.8M    phastCons26way.wig
# -rw-rw-r-- 1  58852043 Sep  8 11:49 phastCons26way.wib
# -rw-rw-r-- 1   8107673 Sep  8 11:49 phastCons26way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons26way.wib /gbdb/ce11/multiz26way/phastCons26way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/ce11/multiz26way \
	ce11 phastCons26way phastCons26way.wig
    #   real    0m1.059s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh ce11 phastCons26way
# db.table           min max mean       count sumData
# ce11.phastCons26way 0 1 0.686605 58852043 4.04081e+07
#    stdDev  viewLimits
#   0.385078 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram -db=ce11 \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons26way > histogram.data 2>&1
    #	real    0m3.288s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > ce11.26way.phastCons.histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " C. elegans - ce11 Histogram phastCons26way track"
set xlabel " phastCons26way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display ce11.26way.phastCons.histo.png &

#########################################################################
# phyloP for 26-way (DONE - 2015-09-08 - Hiram)
    # run phyloP with score=LRT
    ssh ku
    mkdir -p /cluster/data/ce11/bed/multiz26way/consPhyloP/run.phyloP
    cd /cluster/data/ce11/bed/multiz26way/consPhyloP/run.phyloP

    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.567
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../cons/all/all.mod 0.567 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.216500 0.283500 0.283500 0.216500 

    cat << '_EOF_' > doPhyloP.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
set d = $f:h
set file1 = $f:t
set out = $2
set cName = $f:t:r
set grp = $cwd:t
set cons = /hive/data/genomes/ce11/bed/multiz26way/consPhyloP
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "/hive/data/genomes/ce11/bed/multiz26way/cons/SS/result/$f"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $file1.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$file1.wigFix $out
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhyloP.csh

    # Create list of chunks
    find ../../cons/SS/result -type f | grep ".ss$" \
	| sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list
    # make sure the list looks good
    wc -l ss.list
    #	14 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/ce11/bed/multiz26way/consPhyloP/all
    cd /hive/data/genomes/ce11/bed/multiz26way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    # beware overloading the cluster with these quick and high I/O jobs
    # -ram=32g will ensure they can all complete and will not run too many
    #   at the same time, there are only 14 here anyway
    para -ram=32g create jobList
    para try ... check ... push ... etc ...
    para time > run.time
# Completed: 14 of 14 jobs
# CPU time in finished jobs:      10816s     180.26m     3.00h    0.13d  0.000 y
# IO & Wait Time:                    88s       1.47m     0.02h    0.00d  0.000 y
# Average job time:                 779s      12.98m     0.22h    0.01d
# Longest finished job:            1283s      21.38m     0.36h    0.01d
# Submission to last job:          1294s      21.57m     0.36h    0.01d

    # back on dev
    cd /hive/data/genomes/ce11/bed/multiz26way/consPhyloP/all
    mkdir downloads

    time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phyloP26way.wigFix.gz)
    #   real    0m54.344s

    # -rw-rw-r-- 1 102990761 Sep  8 12:40 phyloP26way.wigFix.gz

    # check integrity of data with wigToBigWig
    time (zcat downloads/phyloP26way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/ce11/chrom.sizes \
	phyloP26way.bw) > bigWig.log 2>&1
    egrep "real|VmPeak" bigWig.log
    # pid=39486: VmPeak:      676500 kB
    # real    0m54.488s

    bigWigInfo phyloP26way.bw  | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 136,567,148
# primaryIndexSize: 2,746,816
# zoomLevels: 10
# chromCount: 7
# basesCovered: 58,852,043
# mean: 0.970667
# min: -3.992000
# max: 9.159000
# std: 1.577332

    #	encode those files into wiggle data
    time (zcat downloads/phyloP26way.wigFix.gz \
	| wigEncode stdin phyloP26way.wig phyloP26way.wib)
    # Converted stdin, upper limit 9.16, lower limit -3.99
    # real    0m19.281s

    du -hsc *.wi?
    # 57M     phyloP26way.wib
    # 8.0M    phyloP26way.wig
# -rw-rw-r-- 1  58852043 Sep  8 12:46 phyloP26way.wib
# -rw-rw-r-- 1   8322927 Sep  8 12:46 phyloP26way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP26way.wib /gbdb/ce11/multiz26way/phyloP26way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/ce11/multiz26way ce11 \
	phyloP26way phyloP26way.wig
    # real    0m0.842s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh ce11 phyloP26way
# db.table      min max mean count sumData
# ce11.phyloP26way -3.992 9.159 0.970667 58852043 5.71257e+07
#       stdDev viewLimits
#      1.57733 viewLimits=-3.992:8.85733

    #	that range is: 9.159+3.992 = 13.151 for hBinSize=0.013151

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
	-hBinSize=0.013151 -hBinCount=1000 -hMinVal=-3.992 -verbose=2 \
	    -db=ce11 phyloP26way > histogram.data 2>&1
    #   real    0m3.666s

    # find the Y range for the 2:5 graph
    grep "^[0-9]" histogram.data | ave -col=5 stdin | sed -e 's/^/# /;'
# Q1 0.000065
# median 0.000179
# Q3 0.000924
# average 0.001026
# min 0.000000
# max 0.030633
# count 975
# total 0.999992
# standard deviation 0.002321

    # find the X range for the 2:5 graph
    grep "^[0-9]" histogram.data | ave -col=2 stdin | sed -e 's/^/# /;'
# Q1 -0.533287
# median 2.662410
# Q3 5.884400
# average 2.660747
# min -3.992000
# max 9.159000
# count 975
# total 2594.228452
# standard deviation 3.707122

    #	create plot of histogram:
    cat << '_EOF_' | gnuplot > ce11.26way.phyloP.histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " C. elegans ce11 Histogram phyloP26way track"
set xlabel " phyloP26way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set xrange [-2:4]
set yrange [0:0.015]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display ce11.26way.phyloP.histo.png

#############################################################################
# construct download files for 26-way (DONE - 2015-09-09 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz26way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phastCons26way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phyloP26way
    mkdir /hive/data/genomes/ce11/bed/multiz26way/downloads
    cd /hive/data/genomes/ce11/bed/multiz26way/downloads
    mkdir multiz26way phastCons26way phyloP26way
    cd multiz26way
    # using the separate per-chrom maf files as more
    #    convenient than the single one
    time rsync -a -P ../../anno/result/ .
    # sent 2499852297 bytes  received 148 bytes  161280802.90 bytes/sec
    # total size is 2499546722  speedup is 1.00
    # real    0m15.117s

    du -hsc *.maf
    #  2.4G    maf

    time gzip *.maf
    #   real    2m32.825s

    du -hsc maf.gz
    #  333M    maf

    grep TREE ../../4d/all.mod | sed -e 's/TREE: //' \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > ce11.26way.nh
    ~/kent/src/hg/utils/phyloTrees/commonNames.sh ce11.26way.nh \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         | sed -e 's/__/_/;' > ce11.26way.commonNames.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh ce11.26way.nh \
	| $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
	    > ce11.26way.scientificNames.nh
    time md5sum *.nh > md5sum.txt
    #   real    1m55.320s
    ln -s `pwd`/*.nh `pwd`/*.txt `pwd`/*.gz \
        /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz26way

    # obtain the README.txt from ce10/multiz9way and update for this
    #   situation

    #####################################################################
    cd /hive/data/genomes/ce11/bed/multiz26way/downloads/phastCons26way

    ln -s ../../cons/all/downloads/phastCons26way.wigFix.gz \
        ./ce11.phastCons26way.wigFix.gz
    ln -s ../../cons/all/phastCons26way.bw ./ce11.phastCons26way.bw
    ln -s ../../cons/all/all.mod ./ce11.phastCons26way.mod
    time md5sum *.gz *.mod *.bw > md5sum.txt
    #   real    0m33.880s

    # obtain the README.txt from ce10/phastCons9way and update for this
    #   situation
    ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
      /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phastCons26way

    #####################################################################
    cd /hive/data/genomes/ce11/bed/multiz26way/downloads/phyloP26way

    ln -s ../../consPhyloP/all/downloads/phyloP26way.wigFix.gz \
        ./ce11.phyloP26way.wigFix.gz
    ln -s ../../consPhyloP/run.phyloP/all.mod ce11.phyloP26way.mod
    ln -s ../../consPhyloP/all/phyloP26way.bw ce11.phyloP26way.bw

    time md5sum *.mod *.bw *.gz > md5sum.txt
    #   real    0m40.489s

    # obtain the README.txt from ce10/phyloP9way and update for this
    #   situation
    ln -s `pwd`/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/phyloP26way

    ###########################################################################
    ## create upstream refGene maf files
    cd /hive/data/genomes/ce11/bed/multiz26way/downloads/multiz26way
    # going to construct upstream for ws245Genes, refGene and ensGene

    # bash script
#!/bin/sh
export geneTbl="ws245Genes"
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ce11 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags ce11 multiz26way \
                stdin stdout \
                -orgs=/hive/data/genomes/ce11/bed/multiz26way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    #   real    13m46.813s

    # and run up the refGene upstreams
#!/bin/sh
export geneTbl="refGene"
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ce11 ${geneTbl}:upstreamAll:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags ce11 multiz26way \
                stdin stdout \
                -orgs=/hive/data/genomes/ce11/bed/multiz26way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    # real    63m58.714s

    # and run up the ensGene upstreams
#!/bin/sh
export geneTbl="ensGene"
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ce11 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags ce11 multiz26way \
                stdin stdout \
                -orgs=/hive/data/genomes/ce11/bed/multiz26way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    # real    14m4.262s

    # obtain the README.txt from ce9/multiz10way and update for this
    #   situation
    # information for table of species in the README files, need to
    # edit it in after adding it to the end of this file:

    cat ../../species.list | tr '[ ]' '[\n]' | while read D
do
 netType=`ls ../../mafLinks/${D}/ce11.${D}.*.maf.gz | sed -e "s#.*ce11.${D}.##; s#.maf.gz##;" | sed -e 's/synNet/syntenic/; s/rbest/reciprocal best/;'`
 info=`hgsql -N -e "select organism,\" - \",scientificName,description from dbDb where name=\"$D\";" hgcentraltest`
 echo "${info} ${netType}"
done | tr '[\t]' '[ ]' >> README.txt

C. elegans         Caenorhabditis elegans   Aug. 2014 (WBcel245/ce11) reference

A. ceylanicum      Ancylostoma ceylanicum   Mar. 2014
                                         (WS243/Acey_2013.11.30.genDNA/ancCey1)
Barber pole worm   Haemonchus contortus     Jul. 2013  (WormBase WS239/haeCon2)
C. angaria         Caenorhabditis angaria   Mar. 2012           (WS245/caeAng2)
C. brenneri        Caenorhabditis brenneri  Nov. 2010
                                                    (C. brenneri 6.0.1b/caePb3)
C. briggsae        Caenorhabditis briggsae  Apr. 2011               (WS225/cb4)
C. japonica        Caenorhabditis japonica  Aug. 2010     (WUSTL 7.0.1/caeJap4)
C. remanei         Caenorhabditis remanei   Jul. 2007           (WS220/caeRem4)
C. sp. 5 ju800     Caenorhabditis sp5 ju800 Jan. 2012
                                  (WS230/Caenorhabditis_sp_5-JU800-1.0/caeSp51)
C. tropicalis      Caenorhabditis tropicalis Nov. 2010   (WUSTL 3.0.1/caeSp111)
Dog heartworm      Dirofilaria immitis      Sep. 2013
                                                (WS240/D. immitis v2.2/dirImm1)
Eye worm           Loa loa                  Jul. 2012
                                         (WS235/L_loa_Cameroon_isolate/loaLoa1)
Filarial worm      Brugia malayi            May. 2014
                                                   (WS244/B_malayi-3.1/bruMal2)
H. bacteriophora/m31e Heterorhabditis bacteriophora Aug. 2011
                                           (WS229/H. bacteriophora 7.0/hetBac1)
M. hapla           Meloidogyne hapla        Sep. 2008
                                                   (M. hapla VW9 WS210/melHap1)
M. incognita       Meloidogyne incognita    Feb. 2008
                                        (M. incognita WS245/PRJEA28837/melInc2)
Microworm          Panagrellus redivivus    Feb. 2013           (Pred3/panRed1)
N. americanus      Necator americanus       Dec. 2013
                                                (WS242/N_americanus_v1/necAme1)
O. volvulus        Onchocerca volvulus      Nov. 2013
                                         (WS241/O_volvulus_Cameroon_v3/oncVol1)
P. exspectatus     Pristionchus exspectatus Mar. 2014
                                               (WS243/P_exspectatus_v1/priExs1)
P. pacificus       Pristionchus pacificus   Aug. 2014  (P_pacificus-v2/priPac3)
Pig roundworm      Ascaris suum             Sep. 2012
                                                    (WS229/AscSuum_1.0/ascSuu1)
Pine wood nematode Bursaphelenchus xylophilus Nov. 2011
                                            (WS229/B. xylophilus Ka4C1/burXyl1)
Threadworm         Strongyloides ratti      Sep. 2014  (S. ratti ED321/strRat2)
Trichinella        Trichinella spiralis     Jan. 2011
                                     (WS225/Trichinella_spiralis-3.7.1/triSpi1)
Whipworm           Trichuris suis           Jul. 2014 (WS243/T. suis
                                                       DCEP-RM93M male/triSui1)

    # some other symlinks were already made above
    ln -s `pwd`/upstream*.gz README.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/ce11/multiz26way

#############################################################################
# hgPal downloads (DONE - 2015-09-09 - Hiram)
#   FASTA from 26-way for ws245Genes, refGene and ensGene

    ssh hgwdev
    screen -S ce11HgPal
    mkdir /hive/data/genomes/ce11/bed/multiz26way/pal
    cd /hive/data/genomes/ce11/bed/multiz26way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    export mz=multiz26way
    export gp=ws245Genes
    export db=ce11
    export I=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
        if [ $I -gt 6 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    time sh -x ./$gp.jobs > $gp.jobs.log 2>&1
    #   real    23m34.032s

    time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    #   real    0m18.841s

    time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    2m8.738s

    export mz=multiz26way
    export gp=ws245Genes
    export db=ce11
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    rm -rf exonAA exonNuc

    ### need other gene track alignments also
    # running up refGene
    cd /hive/data/genomes/ce11/bed/multiz26way/pal
    export mz=multiz26way
    export gp=refGene
    export db=ce11
    export I=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
        if [ $I -gt 6 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    time sh -x $gp.jobs > $gp.jobs.log 2>&1
    #   real    22m52.785s

    export mz=multiz26way
    export gp=refGene
    export db=ce11
    time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    #   real    0m17.738s
    time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    1m55.287s

    du -hsc exonAA exonNuc $gp.*fa.gz
    # 88M     exonAA
    # 172M    exonNuc
    # 88M     refGene.multiz26way.exonAA.fa.gz
    # 172M    refGene.multiz26way.exonNuc.fa.gz

    rm -rf exonAA exonNuc

    # we're only distributing exons at the moment
    export mz=multiz26way
    export gp=refGene
    export db=ce11
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    # running up ensGene
    cd /hive/data/genomes/ce11/bed/multiz26way/pal
    export mz=multiz26way
    export gp=ensGene
    export db=ce11
    export I=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
        if [ $I -gt 6 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    time sh -x $gp.jobs > $gp.jobs.log 2>&1
XXX - running - Wed Sep  9 10:57:22 PDT 2015
    #   real    24m14.022s

    export mz=multiz26way
    export gp=ensGene
    export db=ce11
    time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    #   real    0m19.157s
    time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    2m4.686s

    du -hsc exonAA exonNuc $gp.*fa.gz
    # 88M     exonAA
    # 187M    exonNuc
    # 88M     ensGene.multiz26way.exonAA.fa.gz
    # 187M    ensGene.multiz26way.exonNuc.fa.gz

    rm -rf exonAA exonNuc

    # we're only distributing exons at the moment
    export mz=multiz26way
    export gp=ensGene
    export db=ce11
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

#############################################################################
# wiki page for 26-way (DONE - 2015-07-28 - Hiram)
    mkdir /hive/users/hiram/bigWays/ce11.26way
    cd /hive/users/hiram/bigWays
    echo "ce11" > ce11.26way/ordered.list
 awk '{print $1}' /hive/data/genomes/ce11/bed/multiz26way/26way.distances.txt \
       >> ce11.26way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They may already be done.
    ./sizeStats.sh ce11.26way/ordered.list
    # dbDb.sh constructs ce11.26way/Ce11_26-way_conservation_alignment.html
    ./dbDb.sh ce11 26way
    # sizeStats.pl constructs ce11.26way/Ce11_26-way_Genome_size_statistics.html
    ./sizeStats.pl ce11 26way

    # defCheck.pl constructs Ce11_26-way_conservation_lastz_parameters.html
    ./defCheck.pl ce11 26way

    # this constructs the html pages in ce11.26way/:
# -rw-rw-r-- 1 4153 Jun  5 11:03 Ce11_26-way_conservation_alignment.html
# -rw-rw-r-- 1 5833 Jun  5 11:04 Ce11_26-way_Genome_size_statistics.html
# -rw-rw-r-- 1 3854 Jun  5 11:04 Ce11_26-way_conservation_lastz_parameters.html

    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
#  Ce11_26-way_conservation_alignment
#  Ce11_26-way_Genome_size_statistics
#  Ce11_26-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.

#############################################################################
