#!/bin/csh -f
exit

# NHGRI DNASE I HYPERSENSITIVE SITES (2004-10-15 kate)
    # Updated 2005-06-01 angie, see below
    # Data from Greg Crawford <crawford@mail.nih.gov> at the Collins Lab
    cd /cluster/data/encode/NHGRI
    set data = lab/DNaseI_HS_NHGRI_ENCODE2.txt
    sed '/^track/d' $data > DNAseHS.bed
    hgLoadBed hg16 encode_NHGRI_DNAseHS DNAseHS.bed

    # Updated 2005-06-01 angie
    cd /cluster/data/encode/NHGRI/crawford/2005-06-01
    # Jim asked to add scores for grayscale-coloring:
    #   clusters of 2 drawn in 50%,  clusters of 3 drawn in 75%,
    #   and clusters of 4 or more drawn in 100% black.
    #   Shorten names: CD4_not_activated_500bp_13381_2 --> 13381_2
    tail +2 CD4_Activated.txt \
    | perl -wpe 'if (/CD4_\S+500bp_(\d+)_(\d+)/) { \
                   $id = $1 . "_" . $2; \
                   $score = ($2 >= 4) ? 1000 : $2 * 250; \
                   s/CD4\S+/$id\t$score/; } else { die "parse"; }' \
    | hgLoadBed hg16 encodeNhgriDnaseHsAct stdin
    tail +2 CD4_Not_Activated.txt \
    | perl -wpe 'if (/CD4_\S+500bp_(\d+)_(\d+)/) { \
                   $id = $1 . "_" . $2; \
                   $score = ($2 >= 4) ? 1000 : $2 * 250; \
                   s/CD4\S+/$id\t$score/; } else { die "parse"; }' \
    | hgLoadBed hg16 encodeNhgriDnaseHsNonAct stdin


# STANFORD PROMOTERS FROM RICK MYERS LAB (2004-10-17 kate)
# Updated with new data 2005-05-26 angie, see below
# Updated with new data 2005-09-15 kate, see below

    /cluster/data/encode/StanfordPromoters
    mkdir StanfordPromoters_20040825
    cd StanfordPromoters_20040825

    csh makePromoterFiles.csh > promoterExps.tab
    wc -l promoterExps.exp
        # 14

    # currently hgTracks only looks in hgFixed for the experiment table
    #  (add a trackDb option)
    hgExperiment hgFixed encode_Stanford_Promoters \
                    promoterExps.tab promoterPos.bed promoterData.tab
    hgLoadBed hg16 encode_Stanford_Promoters encode_Stanford_Promoters.tab

    # data update (2005-04-12 kate) 
    /cluster/data/encode/StanfordPromoters
    mkdir -p 2005-04-12; cd 2005-04-12
    mkdir lab; cd lab
    # save from email
    ls
    # ENCODE_DATA_SUBMIT_2005.04.12.txt  UCSCREADME.txt
    # NOTE: scores are outside of 1-1000.  I've asked for
    # a redo from Sara.

    # UPDATED 2005-09-07 kate (using angie's procedure)
    cd /cluster/data/encode/StanfordPromoters
    mkdir 2005-08-23
    cd 2005-08-23
    rm latest previous
    ln -s 2005-05-25 previous
    ln -s 2005-08-23 latest
    mkdir latest/lab
    cd latest/lab
    # Saved email attachments from Sara Hartman.  
    # Angie's notes...
    # Used ooffice to translate Description.doc to Description.html and 
    # NegativeControlDataStanfordPromoters.xls to tab-sep'd ....txt.
    # Datafile column descriptions from Sara's email 5/25/2005, augmented 
    # with the bed9+ fields that we will shuffle them into:
    # 1->4) Accession number of the mRNA used to predict the promoter
    # 2->10) Gene model ID. If a gene has more than one predicted promoter 
    #        they would have the same gene model ID.
    # 3->1) chromosome
    # 4->6) strand (0 for negative, 1 for positive)
    # 5->2) Promoter Start coordinate
    # 6->3) Promoter End Coordinate
    # 7->X) Promoter length
    # 8->11) Promoter Name [really a description -- some of these are quoted]
    # 9->12) Luciferase signal A
    # 10->13) Renilla Signal A
    # 11->14) Luciferase Signal B
    # 12->15) Renilla Signal B
    # 13->16) Average Luciferase/Renilla Ratio
    # 14->17) Normalized Luciferase/Renilla Ratio
    # 15->18) Normalized and log2 transformed Luciferase Renilla Ratio
    # 16->5) Score (from 0-1000)
    # -- The Average file is like that but missing columns 9 to 14.
    # Some cell-line data files have some rows with "Bad Txfn" in the last 
    # 4 columns; Sara said we should ignore those rows.

    # Kate tweaked Angie's scripting for newer data, and wrapped in a file...
cat > doProm.hg16.csh << 'EOF'
    foreach f (lab/StanfordPromoters_[A-Z]*.txt)
      set cellType = `echo $f | perl -wpe 's^lab/StanfordPromoters_(.*)_.*^$1^'`
      echo $cellType
      if ($cellType == "Average") then
        tail +2 $f \
        | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \
                     $w[3] =~ tr/01/-+/; \
                     $_ = join("\t", \
  $w[2], $w[4], $w[5], $w[0], $w[9], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \
  $w[8]) . "\n";' \
	| makeColoredBed > encodeStanfordPromoters$cellType.hg16.bed
      else
        tail +2 $f \
	| grep -v "Bad Txfn" \
        | perl -wpe 'chomp; @w = split("\t"); $w[7] =~ s/^\"(.*)\"$/$1/; \
                     $w[3] =~ tr/01/-+/; \
                     $_ = join("\t", \
  $w[2], $w[4], $w[5], $w[0], $w[15], $w[3], $w[4], $w[5], 0, $w[1], $w[7], \
  $w[8], $w[9], $w[10], $w[11], $w[12], $w[13], $w[14]) . "\n";' \
	| makeColoredBed > encodeStanfordPromoters$cellType.hg16.bed
      endif
    end
'EOF'
    csh doProm.hg16.csh >&! doProm.hg16.log
cat > doLoad.hg16.csh << 'EOF'
    foreach f (encode*.hg16.bed)
      set track = $f:r:r
      if ($track == "encodeStanfordPromotersAverage") then
        hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/$track.sql \
	  hg16 $track $f
      else
        sed -e "s/encodeStanfordPromoters/$track/" \
          $HOME/kent/src/hg/lib/encodeStanfordPromoters.sql > /tmp/esp.sql
        hgLoadBed -tab -noBin -sqlTable=/tmp/esp.sql hg16 $track $f
      endif
    end
'EOF'
    csh doLoad.hg16.csh > &! doLoad.hg16.log

    #Angie did enrichment analysis on May data...
    featureBits hg16 -enrichment encode_Stanford_Promoters \
      encodeStanfordPromotersSnu182
#encode_Stanford_Promoters 0.011%, encodeStanfordPromotersSnu182 0.011%, both 0.011%, cover 99.84%, enrich 9194.85x
    featureBits hg16 encode_Stanford_Promoters \!encodeStanfordPromotersSnu182 \
      -bed=stdout
#chr21   32827801        32828259        chr21.1
#chr21   33617743        33617751        chr21.2
#chr11   1749652 1749673 chr11.1
#chr7    126437698       126437700       chr7.1
#chr5    131669622       131669641       chr5.1
#508 bases of 2865248791 (0.000%) in intersection
    # -- of those, only the first is not in the new set at all.  
    featureBits hg16 \!encode_Stanford_Promoters encodeStanfordPromotersSnu182 \
      -bed=stdout
#chr21   33617194        33617203        chr21.1
#chr18   32827801        32828259        chr18.1
#chr7    126438195       126438196       chr7.1
#chr2    235023631       235024097       chr2.1
#934 bases of 2865248791 (0.000%) in intersection
    # -- only the second and the last are not in the old set at all.
    # Put the negative control data spreadsheet out for download.
    mkdir -p \
  /usr/local/apache/htdocs/goldenPath/hg16/encode/datafiles/stanfordPromoters
    nice gzip NegativeControlDataStanfordPromoters.txt
    cp -p NegativeControlDataStanfordPromoters.{xls,txt.gz} \
  /usr/local/apache/htdocs/goldenPath/hg16/encode/datafiles/stanfordPromoters/
    # Added a README.txt.


# UVA DNA REPLICATION TEMPORAL PROFILING (2004-10-22 kate)
# Updated with new data 2005-05-25 angie, see below
    # Dutta lab (Univ. Virginia) replication data
    #       Submitted as custom track, coords on hg15

    cd /cluster/data/encode/UVa
    mkdir 2004-10-22
    ln -s 2004-10-22  latest
    cd latest
    mkdir -p lab
    cd lab
    # download data
    # data (was MS-WORD)
    dos2unix lab/Dutta_Rep_profile.txt
    # documentation (ditto)
    dos2unix lab/UCSC-_data_release.txt

    # split file of customtracks into multiple files
    ./splitCustomTracks.pl lab/Dutta_Rep_profile.txt repl
    wc -l *.?-?{,?}
        # 163 repl.0-2
        # 301 repl.2-4
        # 296 repl.4-6
        # 523 repl.6-8
        # 421 repl.8-10
        # 1704 total

    wc -l lab/Dutta_Rep_profile.txt
      # 1704 lab/Dutta_Rep_profile.txt

    cat > checkEncodeTable.csh << 'EOF'
    hgsql hg16 -N -s -e "SELECT er.name, count(*) FROM encodeRegions er, $1 hs WHERE er.chrom=hs.chrom AND hs.chromStart between er.chromStart AND er.chromEnd group by er.name"
'EOF'
    # << for emacs

    # load w/o binning to maintain ordering by chromStart
    foreach f (repl.?-?{,?})
        liftOver $f /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain \
                    $f.hg16.bed $f.unmapped
        set time = `echo $f:e | sed 's/-/_/'`
        hgLoadBed -noBin hg16 encode_UVA_DNARep_$time $f.hg16.bed
        csh checkEncodeTable.csh encode_UVA_DNARep_$time > $time.ct
    end
    wc -l *.hg16.bed
        # 160 repl.0-2.hg16.bed
        # 298 repl.2-4.hg16.bed
        # 294 repl.4-6.hg16.bed
        # 520 repl.6-8.hg16.bed
        # 416 repl.8-10.hg16.bed
       # 1688 total
    wc -l *.unmapped
          # 0 repl.0-2.unmapped
          # 0 repl.2-4.unmapped
          # 0 repl.4-6.unmapped
          # 0 repl.6-8.unmapped
          # 0 repl.8-10.unmapped
          # 0 total
    cut -f 1 *.ct | sort | uniq > regions
    wc -l regions
     # 43 regions (1 missing)
    hgsql  hg16 -N -s -e "SELECT name from encodeRegions" | sort | diff - regions 
    # UPDATED 2005-05-25 WITH NEW DATA (angie)
    cd /cluster/data/encode/UVa
    mkdir 2005-05-25
    rm latest
    ln -s 2005-05-25 latest
    ln -s 2004-10-22 previous
    cd latest
    mkdir lab
    cd lab
    # saved email attachments here.
    # symlink to nicer names:
    ln -s encode0to2hrAuD4030.bed.txt repl.0
    ln -s encode2to4hrAuD4030.bed.txt repl.2
    ln -s encode4to6hrAuD4030.bed.txt repl.4
    ln -s encode6to8hrAuD4030.bed.txt repl.6
    ln -s encode8to10hrAuD4030.bed.txt repl.8
    wc -l repl.*
#    249 repl.0
#    618 repl.2
#    305 repl.4
#    569 repl.6
#    429 repl.8
#   2170 total
    # Any 0-length items?
    foreach f (encode*)
      echo $f
      awk '$3 == $2 {print;}' $f
    end
    # Any >1Mbp items?  (Previously there were some of these items spanning 
    # multiple encode regions...
    cd lab
    foreach f (encode*)
      echo $f
      awk '$3 - $2 > 1000000 {print;}' $f
    end
    cd ..
    foreach F (lab/repl.?)
      set f = $F:t
      echo $f
      tail +2 $F \
      | liftOver stdin /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain \
          $f.hg16.bed $f.unmapped
      set time = $f:e
      hgLoadBed -noBin hg16 encodeUvaDnaRep$time $f.hg16.bed
      csh /cluster/data/encode/UVa/2004-10-22/checkEncodeTable.csh \
        encodeUvaDnaRep$time > $time.ct
    end
    wc -l *.bed
#    248 repl.0.hg16.bed
#    616 repl.2.hg16.bed
#    304 repl.4.hg16.bed
#    568 repl.6.hg16.bed
#    428 repl.8.hg16.bed
#   2164 total
    wc -l *.unmapped
#      0 repl.0.unmapped
#      2 repl.2.unmapped
#      0 repl.4.unmapped
#      0 repl.6.unmapped
#      0 repl.8.unmapped
#      2 total
    cat repl.2.unmapped 
##Split in new
#chr21   39652869        39703581
    # The main chr21 over.chain is interrupted by a little inversion chain 
    # 39661243 39662854... so try liftOvering the start and end:
    cat > hg15.chr21troublespot.bed <<EOF
chr21 39652869 39652870
chr21 39703580 39703581
EOF
    liftOver hg15.chr21troublespot.bed \
       /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain \
       hg16.chr21troublespot.{bed,unmapped}
    cat hg16.chr21troublespot.*
#chr21   39652870        39652871
#chr21   39703140        39703141
    # OK, manually add an entry for chr21 39652870 39703141 to repl.2.hg16.bed:
    $EDITOR repl.2.hg16.bed
    wc -l repl.2.hg16.bed
#    617 repl.2.hg16.bed
    # Reload:
    hgLoadBed -noBin hg16 encodeUvaDnaRep2 repl.2.hg16.bed

    # Look for ENCODE regions completely missing from all 5 tables:
    cut -f 1 *.ct | sort | uniq > regions
    wc -l regions
    # 43 regions (1 missing)
    hgsql  hg16 -N -s -e "SELECT name from encodeRegions" | sort \
    | diff - regions 
    # < ENm011
    foreach t (0 2 4 6 8)
      set t2 = `expr $t + 2`
      featureBits hg16 -enrichment encode_UVA_DNARep_${t}_$t2 \
        encodeUvaDnaRep$t
    end
#encode_UVA_DNARep_0_2 0.154%, encodeUvaDnaRep0 0.154%, both 0.154%, cover 99.90%, enrich 648.98x
#encode_UVA_DNARep_2_4 0.621%, encodeUvaDnaRep2 0.621%, both 0.620%, cover 99.93%, enrich 161.04x
#encode_UVA_DNARep_4_6 0.181%, encodeUvaDnaRep4 0.181%, both 0.181%, cover 99.75%, enrich 551.92x
#encode_UVA_DNARep_6_8 0.418%, encodeUvaDnaRep6 0.418%, both 0.418%, cover 99.90%, enrich 239.17x
#encode_UVA_DNARep_8_10 0.178%, encodeUvaDnaRep8 0.177%, both 0.177%, cover 99.42%, enrich 562.02x


# UCSD CHIP/CHIP (2004-09-30 kate)
    # Bing Ren's Lab
    # Provided by Chunxu Qu [mailto:qchunxu@ucsd.edu] on Oct. 1, 2004

    # Unpack data and rename files 

    ssh kksilo
    cd /cluster/data/encode
    mkdir -p UCSD/project1
    cd UCSD/project1
    gunzip project1.tar.gz
    foreach f (*.wig)
        mv $f $f.bed
    end

    # consists of 8 custom track wiggle BED files
    ssh hgwdev
    mkdir /gbdb/hg16/encode/UCSD_ChIP/wib
    cd /cluster/data/encode/UCSD/project1
    mkdir wig
    cd wig
    cat > makeWig.csh << 'EOF'
    foreach f (../*.bed)
        # get base name
        set b = $f:t:r:r
        # strip non-informative parts of name
        set n = `echo $b | sed -e 's/ave_//' -e 's/_p//' | tr '[a-z]' '[A-Z]'`
        echo $n
        # create wiggle data files from BED
        # NOTE: wigBedToBinary has been replaced by wigEncode
        sed -e '/^browser/d' -e '/^track/d' $f | \
            wigBedToBinary stdin $n.wig $n.wib
        ln -s `pwd`/$n.wib /gbdb/hg16/encode/UCSD_ChIP/wib
        # load into database
        hgLoadWiggle hg16 encode_UCSD_ChIP_$n $n.wig \
                    -pathPrefix=/gbdb/hg16/encode/UCSD_ChIP/wib
    end
    'EOF'
    # << for emacs

    csh makeWig.csh >&! makeWig.log &
    # Converted stdin, upper limit 16.00, lower limit 0.00, etc.

    hgsql hg16 -s -e "select count(*) from encode_UCSD_ChIP_RNAP_HELA"
        # 19571

    # create entries in trackDb, default height 16 pixels
    # Use Jim's suggestion to associate a color with a cell line
         # Pol2 Hela (black)
         # Pol2 THP1 (darkish blue)
         # Pol2 IMR90 (darkish red)
         # Pol2 HCT116 (darkish green)
         # TAF1 Hela (black)
         # TAF1 THP1 (darkish blue)
         # TAF1 IMR90 (darkish red)
         # TAF1 HCT116 (darkish green)

    # UCSD/Ludwig Institute Histone Methylation (2005-04-21 kate)
    # NOTE: this data was submitted 12/14/04, but somehow fell thru
    # the cracks (it was the week before CSHL).
    # Files received from email:  ave_AcH3_imr90.wig ave_MeH3K4_imr90.wig
# Provided by Chunxu Qu [mailto:qchunxu@ucsd.edu]

    cd /cluster/data/encode/UCSD
    cd project1
    mv ave_AcH3_imr90.wig ave_AcH3_imr90_p.wig.bed
    mv ave_MeH3K4_imr90.wig ave_MeH3K4_imr90_p.wig.bed
    mkdir -p /gbdb/hg16/encode/UcsdChip/wib
    mkdir wig.2005-04-21
    cd latest
    # edit makeWig.csh to use wigEncode and standard table naming
    cat > makeWig.csh << 'EOF'
    foreach f (../*.bed)
        # get base name
        set b = $f:t:r:r
        # convert to standard format (initial caps) "AntibodyCell"
        set n = `echo $b | perl -wpe 's/ave_(.*)_(.*)_p/\u\L$1\E\u\L$2\E/'`
        echo $n
        # create wiggle data files from BED
        sed -e '/^browser/d' -e '/^track/d' $f | \
            wigEncode stdin $n.wig $n.wib
        ln -s `pwd`/$n.wib /gbdb/hg16/encode/UcsdChip/wib
        # load into database
        hgLoadWiggle hg16 encodeUcsdChip$n $n.wig \
                    -pathPrefix=/gbdb/hg16/encode/UcsdChip/wib
    end
'EOF'
    # << for emacs
    csh makeWig.csh >&! makeWig.log &

    # cleanup TODO: remove old /gbdb files, doc files, tables 
    # on hgwdev, beta & RR

    # try as bedGraph type 
    cd /cluster/data/encode/UCSD
    cd project1
    mkdir bed.2005-04-21
    ln -s bed.2005-04-21 latest
    cd latest 
    cat > makeBed.csh << 'EOF'
    foreach f (../*.bed)
        # get base name
        set b = $f:t:r:r
        # convert to standard format (initial caps) "AntibodyCell"
        set n = `echo $b | perl -wpe 's/ave_(.*)_(.*)_p/\u\L$1\E\u\L$2\E/'`
        echo $n
        # create wiggle data files from BED
        sed -e '/^browser/d' -e '/^track/d' $f > $n.bed
        # load into database
        hgLoadBed hg16 encodeUcsdChip$n $n.bed
    end
'EOF'
    # << for emacs
    csh makeBed.csh >&! makeBed.log &
    # set up 4 composite tracks (per antibody), 4 cell lines each
    # using colors to highlight cell types:
         # Hela (black)
         # THP1 (darkish blue)
         # IMR90 (darkish red)
         # HCT116 (darkish green)

# UCSD Histone Methylation Chip/chip on IFN-g induced Hela (2005-05-26)
    cd /cluster/data/encode/UCSD
    mkdir -p 2005-05-26
    cd 2005-05-26
    # copied files from ftp dir
    # TODO: load as bedGraph, as above

    # bed format; use vi to strip header lines
    cp tmH3K4_p0.wig tmH3K4_p0.bed
    cp tmH3K4_p30.wig tmH3K4_p30.bed
    
    # save originals
    mv tmH3K4_p0.wig tmH3K4_p0.wig.orig
    mv tmH3K4_p30.wig tmH3K4_p30.wig.orig

    # load
    wigEncode tmH3K4_p0.bed tmH3K4_p0.wig tmH3K4_p0.wib
    ln -s `pwd`/tmH3K4_p0.wib /gbdb/hg16/encode/UcsdChip/wib
    hgLoadWiggle hg16 encode_UCSD_ChIP_tmH3K4_p0 tmH3K4_p0.wig -pathPrefix=/gbdb/hg16/encode/UcsdChip/wib


# STANFORD MLAGAN MSA OF FREEZE4 (2005-06-13 kate)
#       From George Asimenos, Batzoglou lab (asimenos@stanford.edu)
    
    cd /cluster/data/encode
    mkdir -p MLAGAN/freeze4/{cons,maf}
    cd MLAGAN/freeze4/maf
    # move maf submission in
    tar xvfz ENCODE_LAGAN_maf.tgz
    mkdir out tmp

        # human   hg16
        # mouse   mm6
        # rat     rn3
        # macaque rheMac1       rhesus
        # dog     canFam1
        # chimp   panTro1
        # monodelphis     monDom1       opossum
        # xenopus xenTro1       x_tropicalis
        # fugu    fr1
        # zebrafish       danRer2
        # tetraodon       tetNig1

cat > mafProject.csh << 'EOF'
    set tmpDir = /cluster/data/encode/MLAGAN/freeze4/maf/tmp
    set outDir = /cluster/data/encode/MLAGAN/freeze4/maf/out
    foreach r (EN[mr]*)
        echo $r
        set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg16`
        set start =  \
                `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg16`
        set size = \
                `echo "SELECT size from chromInfo WHERE chrom='$c'" | \
                        hgsql -N hg16`
        /cluster/data/encode/MLAGAN/mafCoord.pl < $r/$r.maf \
                human.1 hg16.$c $start $size | \
            sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf
        /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg16.$c \
                        > $outDir/$r.maf
    end
'EOF'
    # << for emacs
    csh mafProject.csh >&! mafProject.log 
    # seg faults in ENm008 and ENr324
    # move maf files for these regions to "bad" dir
    # Minmei created projected files for these regions.
    # (our installed maf_project must be outdated)

    # change species to match browser
    cd /cluster/data/encode/MLAGAN/freeze4/maf
    mkdir load
    foreach r (EN[mr]*)
        sed -e 's/monodelphis/opossum/' -e 's/macaque/rhesus/' \
            -e 's/xenopus/x_tropicalis/' out/$r.maf > load/$r.maf
    end

    mkdir -p /gbdb/hg16/encode/MLAGAN/freeze4/mafs
    set d = /gbdb/hg16/encode/MLAGAN/freeze4/mafs
    rm $d/*.maf
    ln -s `pwd`/load/*.maf $d
    hgLoadMaf -WARN hg16 encodeMlaganAlign -pathPrefix=$d
        # lots of "score too small" errors. Allow this for now.
        # Submitters may want to fix this later.

    # create summary table for zoomed out views of pairwise
    cd /cluster/data/encode/MLAGAN/freeze4/maf/latest
    cat out/*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 \
                                -maxSize=50000 hg16 encodeMlaganSummary stdin

    # conservation wiggles
    cat lab/chr*.wig | \
        wigEncode stdin GerpCons.wig GerpCons.wib
        # Converted stdin, upper limit 3.00, lower limit 0.00

    # GERP (from Greg Cooper)
    mkdir -p /gbdb/hg16/encode/MLAGAN/freeze4
    rm -f /gbdb/hg16/encode/MLAGAN/freeze4/GerpCons.wib
    ln -s `pwd`/GerpCons.wib /gbdb/hg16/encode/MLAGAN/freeze4
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/MLAGAN/freeze4 hg16 \
                        encodeMlaganGerpCons GerpCons.wig 

    # binCons and phastCons (from Elliott)
    # Reload on 6/30 (phastCons changed)
    cd /cluster/data/encode/MLAGAN/freeze4/cons/latest/lab/jun30
    wget -r -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode_lagan
    cd ../..
    gunzip -c lab/jun30/human.EN*.pC.*.wig.gz | \
        wigEncode stdin phastCons.wig phastCons.wib
            # Converted stdin, upper limit 1.00, lower limit 0.00
    gunzip -c lab/jun30/EN*.binCons.lagan.*.gz | \
        wigEncode stdin binCons.wig binCons.wib
            # Converted stdin, upper limit 1.00, lower limit 0.00
    set d = /gbdb/hg16/encode/MLAGAN/freeze4
    rm -f $d/{phastCons,binCons}.wib
    ln -s `pwd`/phastCons.wib $d
    ln -s `pwd`/binCons.wib $d
    hgLoadWiggle -pathPrefix=$d hg16 encodeMlaganPhastCons phastCons.wig 
    hgLoadWiggle -pathPrefix=$d hg16 encodeMlaganBinCons binCons.wig 

    # conserved elements (from GERP)
    # NOTE: abbreviate item names and round float scores
    perl -we 'while (<>) { \
      if (/^chr/) { \
        ($chrom,$start,$end,$name,$score) = split; \
        $name =~ s/GERP.score=/GERP./; \
        printf("%s\t%d\t%d\t%s\t%.0f\n", $chrom,$start,$end,$name,$score); \
      } }' lab/ENCODE_GERP_cons.bed > encodeMlaganGerpEl.tab
    hgLoadBed -noBin hg16 encodeMlaganGerpEl encodeMlaganGerpEl.tab
    # Loaded 48480 elements of size 5

    # load conserved elements (6 tables)
    # scored elements
    # Updated 6/30 for Elliott
    cd /cluster/data/encode/MSA/latest
    hgLoadBed -noBin hg16 encodeMlaganBinConsEl lab/lagan.binCons.bed
        # loaded 18677
    hgLoadBed -noBin hg16 encodeMlaganPhastConsEl lab/lagan.phastCons.bed
        # loaded 36105 elements
    # unscored - fix at score=500
    ln -s lab/lagan.or.bed MlaganUnionEl.bed
    ln -s lab/lagan.or.nc.bed MlaganNcUnionEl.bed
    ln -s lab/lagan.and.bed MlaganIntersectEl.bed
    ln -s lab/lagan.and.nc.bed MlaganNcIntersectEl.bed

    foreach f (Mlagan*.bed)
        set b = $f:r
        set t = encode$b
        sed -e 's/$/\t500/' $f | hgLoadBed -noBin hg16 $t stdin
        checkTableCoords hg16 $t
    end

# CONSENSUS ELEMENTS: MLAGAN AND TBA CONSERVED ELEMENTS (2005-06-29 kate)
    # From Elliott Margulies
    
    cd /cluster/data/encode
    mkdir -p MSA/freeze4
    cd MSA/freeze4
    mkdir jun30/lab
    cd jun30/lab
    wget -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/constrained_elements.tar.gz
    tar xvfz *.gz
    cd ../../

    # load conserved elements (combined lagan and tba) (6 tables)
    cd /cluster/data/encode/MSA/freeze4
    mkdir -p jun30/lab
    cd jun30/lab
    wget -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/constrained_elements.tar.gz
    cd ../..
    ln -s lab/tl.or.bed AllUnionEl.bed
    ln -s lab/tl.or.nc.bed AllNcUnionEl.bed
    ln -s lab/tl.and.bed AllIntersectEl.bed
    ln -s lab/tl.and.nc.bed AllNcIntersectEl.bed

    foreach f (All*.bed)
        set b = $f:r
        set t = encode$b
        sed -e 's/$/\t500/' $f | hgLoadBed -noBin hg16 $t stdin
        checkTableCoords hg16 $t
    end


# STANFORD  MSA OF FREEZE2 (2004-11-17 kate)
#       From George Asimenos, Batzoglou lab (asimenos@stanford.edu)
    
    cd /cluster/data/encode
    cd MLAGAN/freeze2
    mkdir lab out tmp
    set tmpDir = /cluster/data/encode/MLAGAN/freeze2/tmp
    set outDir = /cluster/data/encode/MLAGAN/freeze2/out
    cd lab
    wget -r http://ai.stanford.edu/~asimenos/beta_encode_Nov_2004/
    mv ai.stanford.edu/*asimenos/beta* .
    rm -fr ai.stanford.edu
    cd beta_encode_Nov_2004/data
cat > makeMaf.csh << 'EOF'
    set tmpDir = /cluster/data/encode/MLAGAN/freeze2/tmp
    set outDir = /cluster/data/encode/MLAGAN/freeze2/out
    foreach r (EN*)
        echo $r
        set c = `echo "SELECT chrom from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg16`
        set start =  \
                `echo "SELECT chromStart from encodeRegions WHERE name='$r'" | \
                        hgsql -N hg16`
        set size = \
                `echo "SELECT size from chromInfo WHERE chrom='$c'" | \
                        hgsql -N hg16`
        bunzip2 -c $r/$r.maf.bz2 | \
          /cluster/data/encode/MLAGAN/mafCoord.pl human hg16.$c $start $size |\
            sed 's/^a$/a score=0.0/' > $tmpDir/$r.db.maf
        /cluster/bin/penn/maf_project $tmpDir/$r.db.maf hg16.$c \
                        > $outDir/$r.human.maf
    end
'EOF'
    # << for emacs
    csh makeMaf.csh >&! makeMaf.log &

    mkdir /gbdb/hg16/encode_MSA2_MLAGAN
    ln -s $outDir/*.human.maf /gbdb/hg16/encode_MSA2_MLAGAN
    hgLoadMaf -WARN hg16 encode_MSA2_MLAGAN

    # create description page from lab/index.html


# AFFY TRANSCRIPTION AND CHIP/CHIP TEMPORAL PROFILING (2004-12-08 kate)
#	(RELOADED TABLES 2005-05-31 as bedGraph types - Hiram)
# UPDATED 2005-06-06 (ChIP/chip) and 2005-06-08 (RNA) angie - see below

    # HL60 cells stimulated with Retinoic Acid
    # Transcription and PolII ChIP/Chip at 4 timepoints (8 experiments)
    # wig & bed for each experiment 
    # filenames: EC_HL60_RA_RNA_00hr_AS.hs.NCBIv34.{bed,wig}
    #            EC_HL60_RA_RNApol2_00hr_AS_b.hs.NCBIv34.{bed,wig}
    # Use these to construct 16 tracks:
    # track encodeAffyRnaHl60Hr{00,02,08,32}
    # track encodeAffyChIpRnapHl60Hr{00,02,08,32}
    # RNA/Affy HL60 0hr
    # ChIP/Affy Pol2 0hr

    mkdir -p /gbdb/hg16/encode/Affy/2004-12-03/wib
    cd /cluster/data/encode
    mkdir -p Affy/Dec03_2004
    cd Affy/Dec03_2004
    wget http://transcriptome.affymetrix.com/download/ENCODE/ucsc_formats.tar.bz2
    bunzip2 ucsc_formats.tar.bz2
        # trailing garbage after EOF ignored
    tar xvf ucsc_formats.tar
    # NOTE: the script below creates $hrs as 0,2,8,32
    # and tables were renamed to 00,02,08,32, so change script
    # next time it's loaded.
    cd ucsc_formats
cat > load.csh << 'EOF'
    foreach f (bed/*.bed)
        set base = $f:t:r
        set info = `echo $f:t | sed 's/EC_HL60_RA_\([^_][^_]*\)_\([0-9][0-9]*\)hr.*/\1.\2/'`
        set hrs = `expr $info:e + 0`
        set exp = `echo $info:r |  \
                        sed -e 's/RNApol2/ChIpRnap/' -e 's/RNA/Rna/'`
        if ($exp == "Rna") then
            set meas = "Sig"
        else
            set meas = "Pval"
        endif
        set track = encodeAffy${exp}Hl60SitesHr$hrs
        sed '1d' $f | \
            hgLoadBed -noBin hg16 $track stdin
        set ct = `hgsql -N -s -e "select count(*) from $track" hg16`
        echo "track $track"
        echo "$f:t	$ct"
        set track = encodeAffy${exp}Hl60${meas}Hr$hrs
        sed '/^track/d' wig/$base.wig | wigEncode stdin $track.wig $track.wib
        ln -s `pwd`/$track.wib /gbdb/hg16/encode/Affy/2004-12-03/wib
        # load into database
        hgLoadWiggle hg16 ${track} $track.wig \
                    -pathPrefix=/gbdb/hg16/encode/Affy/2004-12-03/wib
    end
'EOF'
    # << for emacs
    csh load.csh >&! load.log &
    grep EC_HL60 load.log | grep -v wig > loaded.txt
    # email loaded.txt to contributor
    awk '{print $1}' bed/*.bed | sort | uniq > chroms.txt
    # use in trackDb chromosomes setting
    grep encodeAffy load.log > trackDb.ra
    # use as basis of hg16 trackDb entries
    # (create 4 composite tracks)

    cd description
    # edit
    cp EC_HL60_RA_RNA_00hr_AS_hs.NCBIv34.bed.dsc encodeAffyRnaHl60Sites.html
    cp EC_HL60_RA_RNA_00hr_AS_hs.NCBIv34.wig.dsc encodeAffyRnaHl60Sig.html
    cp EC_HL60_RA_RNApol2_00hr_AS_hs.NCBIv34.bed.dsc encodeAffyChIpRnapHl60Sites.html
    cp EC_HL60_RA_RNApol2_00hr_AS_hs.NCBIv34.wig.dsc encodeAffyChIpRnapHl60Pval.html

    # UPDATED ChIP/chip 2005-06-06 angie
    cd /cluster/data/encode/Affy/2005-06-05
    # Load up the 41 bed3 tables of ChIP/chip sites...
    foreach f (chipchip/bed/*.bed)
      set track = `echo $f:t:r:r:r | perl -wpe \
                     's/EC_AS_HL60_DN_RA_//; \
                      s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \
                      s/Pol2/Rnap/;  s/RARecA/Rara/;  s/TFIIB-R/TFIIB/; \
                      s/(\S+)_(\d+)hr/encodeAffyChIpHl60Sites\u\L$1\EHr$2/; \
                      s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'`
      echo $track
      tail +2 $f \
      | hgLoadBed -noBin hg16 $track stdin
    end
    # Loop over those bed files again to make trackDb.ra contents...
    set pri = 2
    foreach f (chipchip/bed/*.bed)
      set track = `echo $f:t:r:r:r | perl -wpe \
                     's/EC_AS_HL60_DN_RA_//; \
                      s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \
                      s/Pol2/Rnap/;  s/RARecA/Rara/;  s/TFIIB-R/TFIIB/; \
                      s/(\S+)_(\d+)hr/encodeAffyChIpHl60Sites\u\L$1\EHr$2/; \
                      s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'`
      set factor = `echo $f:t:r:r:r | perl -wpe \
                     's/EC_AS_HL60_DN_RA_//; \
                      s/_\d+hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \
                      s/RARecA/RARA/;  s/TFIIB-R/TFIIB/; \
                      s/H3K27T/H3K27me3/; s/HisH4/H4Kac4/;'`
      set hh = `echo $f:t:r:r:r | perl -wpe 's/.*_(\d+)hr_.*/$1/'`
      set hr = `expr $hh + 0`
      set fIndex = `echo $factor | sed -e \
            's/Brg1/0/; s/CEBPe/1/; s/CTCF/2/; s/H3K27me3/3/; s/H4Kac4/4/; \
             s/P300/5/; s/PU1/6/; s/Pol2/7/; s/RARA/8/; s/SIRT1/9/; \
             s/TFIIB/10/;'`
      if ($fIndex == 10) then
        set r = 0
	set g = 0
	set b = 200
      else
        set r = `expr 25 \* \( 9 - $fIndex \)`
        set g = `expr 25 \* $fIndex`
        set b = 0
      endif
      echo track $track
      echo subTrack encodeAffyChIpHl60SitesHr$hh
      echo shortLabel Affy $factor RA ${hr}h
      echo longLabel Affymetrix ChIP/Chip \($factor retinoic acid-treated HL-60, ${hr}hrs\) Sites
      echo color $r,$g,$b
      echo priority $pri
      echo ""
      set pri = `expr $pri + 2`
    end
    ssh kolossus
    cd /cluster/data/encode/Affy/2005-06-01
    # chr7 has >150,000 rows, and >100,000 means trouble for bedGraph 
    # performance; so load this as wiggle instead.
    #  Fixup off-by-one error in these files: 2005-08-01 - Hiram
    #	Add the -lift=1 to the wigEncode command
    #	The data in the .wig files is actually 0-relative, it should
    #	have been 1-relative.  Add 1 to get it to be 1-relative.
    mkdir wig wib
    foreach f (chipchip/wig/*.median.wig.bz2)
      set track = `echo $f:t:r:r:r:r | perl -wpe \
                     's/EC_AS_HL60_DN_RA_//; \
                      s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \
                      s/Pol2/Rnap/;  s/RARecA/Rara/;  s/TFIIB-R/TFIIB/; \
                      s/(\S+)_(\d+)hr/encodeAffyChIpHl60Pval\u\L$1\EHr$2/; \
                      s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'`
      echo $track
      bzcat $f | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib
    end
    # lower limit is 0 for all.  Upper limit varies quite a bit.
    # *Hr00: 72.60 (encodeAffyChIpHl60PvalCtcfHr00) to 534.54 (H4Kac4)
    # *Hr02: 65.62 (encodeAffyChIpHl60PvalCtcfHr02) to 425.79 (H4Kac4)
    # *Hr08: 85.14 (encodeAffyChIpHl60PvalSirt1Hr08) to 505.86 (H4Kac4)
    # *Hr32: 50.18 (encodeAffyChIpHl60PvalCtcfHr32) to 490.03 (H4Kac4)
    # Use the top upper value as the limit for now... if the other 
    # subtracks look too smushed, maybe we can adjust?
    # back on hgwdev:
    cd /cluster/data/encode/Affy/2005-06-01
    set gbdbDir = /gbdb/hg16/encode/Affy/2005-06-01
    mkdir -p $gbdbDir/wib
    foreach f (chipchip/wig/*.median.wig.bz2)
      set track = `echo $f:t:r:r:r:r | perl -wpe \
                     's/EC_AS_HL60_DN_RA_//; \
                      s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \
                      s/Pol2/Rnap/;  s/RARecA/Rara/;  s/TFIIB-R/TFIIB/; \
                      s/(\S+)_(\d+)hr/encodeAffyChIpHl60Pval\u\L$1\EHr$2/; \
                      s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'`
      echo $track
      ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
      nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir
    end
    # same trackDb-helper:
    set pri = 1
    foreach f (chipchip/wig/*.median.wig.bz2)
      set track = `echo $f:t:r:r:r:r | perl -wpe \
                     's/EC_AS_HL60_DN_RA_//; \
                      s/_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \
                      s/Pol2/Rnap/;  s/RARecA/Rara/;  s/TFIIB-R/TFIIB/; \
                      s/(\S+)_(\d+)hr/encodeAffyChIpHl60Pval\u\L$1\EHr$2/; \
                      s/H3k27t/H3K27me3/; s/Hish4/H4Kac4/;'`
      set factor = `echo $f:t:r:r:r:r | perl -wpe \
                     's/EC_AS_HL60_DN_RA_//; \
                      s/_\d+hr_C01_EC_AS_HL60_DN_RA_Input_All_B1_B2_B3//; \
                      s/RARecA/RARA/;  s/TFIIB-R/TFIIB/; \
                      s/H3K27T/H3K27me3/; s/HisH4/H4Kac4/;'`
      set hh = `echo $f:t:r:r:r | perl -wpe 's/.*_(\d+)hr_.*/$1/'`
      set hr = `expr $hh + 0`
      set fIndex = `echo $factor | sed -e \
            's/Brg1/0/; s/CEBPe/1/; s/CTCF/2/; s/H3K27me3/3/; s/H4Kac4/4/; \
             s/P300/5/; s/PU1/6/; s/Pol2/7/; s/RARA/8/; s/SIRT1/9/; \
             s/TFIIB/10/;'`
      if ($fIndex == 10) then
        set r = 0
	set g = 0
	set b = 200
      else
        set r = `expr 25 \* \( 9 - $fIndex \)`
        set g = `expr 25 \* $fIndex`
        set b = 0
      endif
      echo track $track
      echo subTrack encodeAffyChIpHl60PvalHr$hh
      echo shortLabel Affy $factor RA ${hr}h
      echo longLabel Affymetrix ChIP/Chip \($factor retinoic acid-treated HL-60, ${hr}hrs\) P-Value
      echo color $r,$g,$b
      echo priority $pri
      echo ""
      set pri = `expr $pri + 2`
    end

    # UPDATED RNA 2005-06-08 angie
    cd /cluster/data/encode/Affy/2005-06-01
    tail +2 rna/bed/EC_AS_GM06990_RCyP+_C01vsNULL.sig.gr.bed \
    | hgLoadBed -noBin hg16 encodeAffyRnaGm06990Sites stdin
    tail +2 rna/bed/EC_AS_HeLa_RCyP+_C01vsNULL.sig.gr.bed \
    | hgLoadBed -noBin hg16 encodeAffyRnaHeLaSites stdin
    foreach f (rna/bed/*HL60*.bed)
      set track = `echo $f:t:r:r:r | perl -wpe \
        's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SitesHr$1/;'`
      echo $track
      tail +2 $f \
      | hgLoadBed -noBin hg16 $track stdin
    end
    set gbdbDir = /gbdb/hg16/encode/Affy/2005-06-01
    mkdir -p $gbdbDir/wib
    set track = encodeAffyRnaGm06990Signal
    #  Fixup off-by-one error in these files: 2005-08-01 - Hiram
    bzcat rna/wig/EC_AS_GM06990_RCyP+_C01vsNULL.sig.wig.bz2 \
    | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib
#    bzcat rna/wig/EC_AS_GM06990_RCyP+_C01vsNULL.sig.wig.bz2 \
#    | wigEncode stdin wig/$track.wig wib/$track.wib
#Converted stdin, upper limit 1686.50, lower limit -899.25
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir
    set track = encodeAffyRnaHeLaSignal
#  Fixup off-by-one error in these files: 2005-08-01 - Hiram
    bzcat rna/wig/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig.bz2 \
    | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib
#    bzcat rna/wig/EC_AS_HeLa_RCyP+_C01vsNULL.sig.wig.bz2 \
#    | wigEncode stdin wig/$track.wig wib/$track.wib
#Converted stdin, upper limit 1250.00, lower limit -763.50
    ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
    nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir
    # Reloaded these four 6/29/05 after Jim and Hiram found that they'd 
    # been loaded with the HeLa data due to a copy/paste error:
#  Fixup off-by-one error in these files: 2005-08-01 - Hiram
#	add -lift=1 to the wigEncode command
    foreach f (rna/wig/*HL60*C01vsNULL.sig.wig.bz2)
      set track = `echo $f:t:r:r:r | perl -wpe \
       's/EC_AS_HL60_RWP\+_RA_(\d+)hr_C01vsNULL/encodeAffyRnaHl60SignalHr$1/;'`
      echo $track
      bzcat $f \
      | wigEncode -lift=1 stdin wig/$track.wig wib/$track.wib
      ln -s `pwd`/wib/$track.wib $gbdbDir/wib/
      nice hgLoadWiggle hg16 $track wig/$track.wig -pathPrefix=$gbdbDir
    end
#Converted stdin, upper limit 1405.00, lower limit -1052.50
#Converted stdin, upper limit 1238.50, lower limit -1019.75
#Converted stdin, upper limit 1587.00, lower limit -1168.00
#Converted stdin, upper limit 1290.00, lower limit -910.75


# BOSTON UNIVERSITY / ZHIPING WENG LAB FIRST EXON DATA (WORKING 2005-01-27 kate)

    ssh kksilo
    cd /cluster/data/encode
    mkdir -p BU
    cd BU
    #mkdir -p 2005-01-06
    #ln -s 2005-01-06/latest
    #cd latest
    # copy file from email:  BED.tar.gz
    #tar xvfz BED.tar.gz

    #mkdir -p 2005-04-29
    #ln -s 2005-04-29/latest
    mkdir -p 2005-05-04
    ln -s 2005-05-04 latest
    cd latest
    # save file from email
    tar xvfz BED.BUMay4.tar.gz

    cd BED
    cp rcPCREncodeDataDescription.html ~/kent/src/hg/makeDb/trackDb/human/hg16/encodeBuFirstExon.html
    # checkin description file and edit
    cd ..

    # adjust chromStart
    cat > lowerStart.awk << 'EOF'
        {printf ("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n",
                $1, $2-1, $3, $4, $5, $6, $7-1, $8, $9, $10, $11, $12, $13);}
'EOF'
    # << for emacs
    # rearrange data colums to be BED 13, add RGB values to "reserved" field
    # based on score
    makeColoredBed < BED/cerebralcortex.bed | \
                        awk -f lowerStart.awk > encodeBuFirstExonCerebrum.bed
    makeColoredBed < BED/colon.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonColon.bed
    makeColoredBed < BED/heart.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonHeart.bed
    makeColoredBed < BED/kidney.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonKidney.bed
    makeColoredBed < BED/liver.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonLiver.bed
    makeColoredBed < BED/lung.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonLung.bed
    makeColoredBed < BED/skeletalmuscle.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonSkMuscle.bed
    makeColoredBed < BED/spleen.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonSpleen.bed
    makeColoredBed < BED/stomach.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonStomach.bed
    makeColoredBed < BED/testis.bed  | \
                        awk -f lowerStart.awk > encodeBuFirstExonTestis.bed

    # take a look at data distribution (previous was skewed toward 0)
    awk '{print $5}' *.bed | sort -n > scores.txt
    textHistogram scores.txt -binSize=100 -maxBinCount=11
    # 0       107.000000
    # 100     8.000000
    # 200     32.000000
    # 300     53.000000
    # 400     91.000000
    # 500     130.000000
    # 600     186.000000
    # 700     118.000000
    # 800     33.000000
    # 900     1.000000
    # 1000    1.000000
    ## cluster at 0, followed by a nice bell curve

    cat > load.csh << 'EOF'
    foreach f (Cerebrum Colon Heart Kidney Liver Lung SkMuscle Spleen Stomach Testis)
        hgLoadBed hg16 -noBin encodeBuFirstExon encodeBuFirstExon$f.bed -sqlTable=${HOME}/kent/src/hg/lib/encodeBuFirstExon.sql
        hgsql hg16 -e "DROP TABLE IF EXISTS encodeBuFirstExon$f"
        hgsql hg16 -e "ALTER TABLE encodeBuFirstExon RENAME TO encodeBuFirstExon$f"
        checkTableCoords hg16 encodeBuFirstExon$f
    end
'EOF'

    csh load.csh >&! load.log

    # generate list of chromosomes with data for trackDb entry
    foreach f (Cerebrum Colon Heart Kidney Liver Lung SkMuscle Spleen Stomach Testis)
        /cluster/data/encode/scripts//showChromList.csh hg16 encodeBuFirstExon$f
    end
        #chr11,chr13,chr15,chr16,chr19,chr2,chr5,chr7,chr9,chrX


# GENCODE Sanger Havana annotations  (2005-06-07 kate)
    # previous versions are in other dirs
    # reloaded class table 6/13 based on request of Havana team
    ssh hgwdev
    mkdir -p /cluster/data/encode/Gencode
    cd /cluster/data/encode/Gencode
    mkdir 2005-06-07
    cd 2005-06-07
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=3 --no-parent \
	--no-host-directories --cut-dirs=5 \
    ftp://genome.imim.es/pub/other/gencode/data/havana-encode/current/CHR_coord_hg17/global_files/44regions_CHR_coord.gtf
    # Previous: Downloaded:  7,718,475 bytes in 2 files (note: 1/2 as many as previous)
    # Current: Downloaded: 7,753,057 bytes in 2 files
    mv current/CHR_coord_hg17/global_files/44regions_CHR_coord.gtf .
    rm -fr current
    wc -l 44regions_CHR_coord.gtf
        # CURRENT:   46418 44regions_CHR_coord.gtf
        # PREVIOUS: 46191 44regions_CHR_coord.gtf
    ln -s 44regions_CHR_coord.gtf gencode.gtf

    # converting to genePred before lifting to create gene track
    #  due to problems in liftOver -gff found by Mark D.
    # Exclude non-Vega sources as per France Denoued at IMIM 
    grep VEGA gencode.gtf > gencode.vega.gtf
    ldHgGene -out=gencode.gp -gtf -genePredExt null null gencode.vega.gtf
        # CURRENT: Read 2888 transcripts in 46127 lines in 1 files
        # PREVIOUS: Read 2542 transcripts in 40133 lines in 1 files
    liftOver -genePred gencode.gp \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                gencode.hg16.gp gencode.gp.unmapped
     wc -l gencode.hg16.gp
        #  previous 2540 gencode.hg16.gp
        #  2851 gencode.hg16.gp
      wc -l gencode.gp.unmapped
        # PREVIOUS
        # 4 -> 2 problem transcripts
        # AC005538.4-001 and AC005538.4-006
        # UPDATE
        # 37 unmapped, all on chrX except:
        # AC005538.4-001 and AC005538.4-006 on chr2
         Each transcript had 1 exon unmapped -- notifying submitter
    ldHgGene -predTab -genePredExt hg16 encodeGencodeGene gencode.hg16.gp
        # 2851 gene predictions
    checkTableCoords hg16 encodeGencodeGene

    # introns track
    liftOver -gff gencode.gtf \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                gencode.hg16.gtf gencode.gtf.unmapped
    wc -l gencode.hg16.gtf
    #  CURRENT:   46239 gencode.hg16.gtf
    #  PREVIOUS: 46014 gencode.hg16.gtf

    grep intron gencode.hg16.gtf | wc -l
        # 15695
    grep intron gencode.hg16.gtf | grep -v not_tested | wc -l
        # CURRENT: 467
        # PREVIOUS: 397
    grep -v not_tested gencode.hg16.gtf | sed -e 's/-intron/-/g' | \
        ldGencodeIntron hg16 encodeGencodeIntron stdin
            # CURRENT: 467 introns in 1 files
            # PREVIOUS: 397 introns in 1 files
            # OLDER: 425 introns in 1 files

    # create gene class table
    hgsql hg16 < ~/kent/src/hg/lib/gencodeGeneClass.sql
    awk '{printf "%s\t%s\n", $10, $2}' gencode.vega.gtf | \
        sed -e 's/"//g' -e 's/;//' -e 's/VEGA_//' \
            -e 's/_val/_gencode_conf/' -e 's/Antisense/Novel_transcript/' | \
        sort | uniq > gencodeGeneClass.tab
    wc -l gencodeGeneClass.tab
        # 2888 gencodeGeneClass.tab
    echo "LOAD DATA LOCAL INFILE 'gencodeGeneClass.tab' into table gencodeGeneClass" | hgsql hg16

    # create table of accessions in reference set
    # These are genes with classes: Known, Novel_CDS,
    #   Novel_transcript_gencode_conf, Putative_gencode_conf 
    egrep 'Known|Novel_CDS|Novel_transcript_gencode_conf|Putative_gencode_conf' \
        gencodeGeneClass.tab > gencodeGeneRef.tab
    sed 's/gencodeGeneClass/gencodeGeneRef/' \
        ~/kent/src/hg/lib/gencodeGeneClass.sql | \
            hgsql hg16 
    wc -l encodeGencodeRef.tab
    ldHgGene -predTab -genePredExt hg16 encodeGencodeGene gencode.hg16.gp
    # add to all.joiner

    # create table of reference annotaton (unfortunate redundant processing)
    # for data analysis fair.  We may want to replace this with a composite
    # track with 2 subtracks: one for reference annotation, plus "other"
    # Very hokey -- I used TB to filter by the 4 categories
    # (Known, Novel_CDS, Novel_transcript_gencode_conf, Putative_gencode_conf)

    # inventory sources for genepreds (genes will be colored by these)
    hgsql -N hg16 \
        -e "select  distinct(class) from gencodeGeneClass" > classes.txt
    wc -l classes.txt
        # 10

    # run Mark D's gene checker on VEGA_Known and email results to submitter
    echo " select encodeGencodeGene.* from encodeGencodeGene, gencodeGeneClass where encodeGencodeGene.name=gencodeGeneClass.name and gencodeGeneClass.class='Known'" | hgsql hg16 -N > gencode.known.gp
    /cluster/bin/i386/gene-check -incl-ok -nib-dir /cluster/data/hg16/nib -details-out encodeGencodeGene.chk-details gencode.known.gp encodeGencodeGene.chk


    # create a filtered table for use at data analysis workshop
    cd /cluster/data/encode/Gencode/2005-06-07
    hgsql hg16 < encodeGencodeGeneFiltered.sql
    insert into encodeGencodeGeneFiltered select t1.* from encodeGencodeGene as t1,
      gencodeGeneClass as t2 where t1.name = t2.name and t2.class = "Known";
    # 2291 rows
    insert into encodeGencodeGeneFiltered select t1.* from encodeGencodeGene as t1,
      gencodeGeneClass as t2 where t1.name = t2.name and t2.class = "Novel_CDS";
    # 104 rows


##########################################################################
# Sanger Chip/chip  (DONE 2005-03-14 kate)
    ssh hgwdev
    cd /cluster/data/encode/sanger/
    mkdir -p chipchip/2005-02-15
    cd chipchip/2005-02-15

    # obtain data files from ftp site
    cp /var/ftp/encode/H3K4me3_GM06990_1.description.txt .
    cp /var/ftp/encode/H3K4me3_GM06990_1.wig.txt .
    rm /var/ftp/encode/H3K*.txt

    # 4 columns: chrom, start, end, score (float)
    # data value range is .12 to 74.875
    # format as BED file
    grep "^chr" H3K4me3_GM06990_1.wig.txt > H3K4me3_GM06990_1.bed

    # create table schema for BED 4 with float score
    cat > table.sql << 'EOF'
    CREATE TABLE tablename (
        chrom varchar(255) not null,        # Chromosome (this species)
        chromStart int unsigned not null,   # Start position in chromosome (forward strand)
        chromEnd int unsigned not null,     # End position in chromosome
        score float not null,       # Overall score
                  #Indices
        INDEX(chrom(13),chromStart),
        INDEX(chrom(13),chromEnd)
    );
    'EOF'
    # << for emacs

    #	NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load
    #	these bedGraph types without this special sql business.

    set tablename = encodeSangerChipH3me3GM06990
    sed "s/tablename/$tablename/" table.sql > $tablename.sql

    hgsql hg16 < $tablename.sql
    hgLoadBed -noBin -sqlTable=$tablename.sql hg16 $tablename H3K4me3_GM06990_1.bed
    # Loaded 17604 elements of size 4

    # create trackDb entry:

    # type bedGraph 4
    # shortLabel ChIP/Sanger
    # longLabel Sanger ChIP/Chip (H3K4me3 GM06990) in ENCODE Regions
    # minLimit 0
    # maxLimit 75


##########################################################################
# Genome Institute of Singapore -ChIP/PET of p53 TFBS (2005-03-31 kate)
# UPDATED (2005-06-13 angie)
    cd /cluster/data/encode
    mkdir -p GIS/2005-04-07
    cd GIS/2005-04-07
    # files: description.html, p53ChIP-PET.bed
    sed '1d' p53ChIP-PET.bed | hgLoadBed hg16 encodeGisChipPet stdin
    # Loaded 65513 elements of size 12

    # UPDATE (2005-06-13 angie)
    # Use cluster-count info, now embedded into the name, to make scored BED:
    cd /cluster/data/encode/GIS/p53/2005-06-10
    tail +2 p53ChIP-PET-hg16.bed \
    | perl -wpe 'chomp; @w = split; \
                 if ($w[3] =~ /^\d+-(\d+)$/) { \
                   $w[4] = ($1 >= 4 ? 1000 : ($1 >= 3 ? 800 : 333)); \
                 } else { die "parse"; } \
                 $_ = join("\t", @w) . "\n";' \
    | hgLoadBed hg16 encodeGisChipPet stdin
    # checkTableCoords cron job caught some overlapping blocks.  
    # I would like to fix it, but Atif prefers to leave in the overlapping 
    # blocks for consistency with what is on their site!  :[  Oh well.


##########################################################################
# TEST RE-LOADING AFFY TRANSCRIPTION AND CHIP/CHIP TEMPORAL PROFILING
#	- Hiram 2005-05-24
#	as bedGraph type tracks instead of wiggle to provide accurate
#	data retrieval of the original data rather than have the wiggle
#	transformations interferring.  An experiment to see if there is
#	a large difference in storage requirements for this data.
    ssh hgwdev
    mkdir /cluster/data/encode/Affy/Dec03_2004/bedGraph
    cd /cluster/data/encode/Affy/Dec03_2004/bedGraph

    #	NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load
    #	these bedGraph types without this special sql business.

    for HR in 00 02 08 32
    do
	rm -f encodeAffyRnaHl60SigHr${HR}.bed
	rm -f bedGraph.sql
	zcat ../ucsc_formats/wig/EC_HL60_RA_RNA_${HR}hr_AS.hs.NCBIv34.wig.gz | \
	    /cluster/bin/scripts/varStepToBedGraph.pl /dev/stdin | \
		sort -k1,1 -k2,2n > encodeAffyRnaHl60SigHr${HR}.bed
	hgsql -e "drop table encodeAffyRnaHl60SigHr${HR}" hg16 2> /dev/stderr
	echo "CREATE TABLE encodeAffyRnaHl60SigHr${HR} (" > bedGraph.sql
	echo 'bin smallint unsigned not null,' >> bedGraph.sql
	echo 'chrom varchar(255) not null,' >> bedGraph.sql
	echo 'chromStart int unsigned not null,' >> bedGraph.sql
	echo 'chromEnd int unsigned not null,' >> bedGraph.sql
	echo 'dataValue float not null,' >> bedGraph.sql
	echo 'INDEX(chrom(6),bin)' >> bedGraph.sql
	echo ');' >> bedGraph.sql
	hgLoadBed -sqlTable=bedGraph.sql hg16 encodeAffyRnaHl60SigHr${HR} \
		encodeAffyRnaHl60SigHr${HR}.bed
    done
    #
    #	Measuring previous data requirements:
    ssh hgwdev
    cd /gbdb/hg16/encode/Affy/2004-12-03/wib
    du -hscL encodeAffyRna*
# 19M     encodeAffyRnaHl60SigHr00.wib
# 19M     encodeAffyRnaHl60SigHr02.wib
# 19M     encodeAffyRnaHl60SigHr08.wib
# 19M     encodeAffyRnaHl60SigHr32.wib
# 76M     total
    #	examining the "show table status" on these tables shows:
    #	index length: 277,504 * 4 = 1,110,016 = 1.06 Mb
    #   data length: 3,291,136 * 4 = 13,181,472 = 12.57 Mb
    #	therefore total size: 76 + 1 + 13 = 90 Mb
    #	These new tables loaded are:
    #	index length: 9,033,728 * 4 = 36,134,912 = 34.46 Mb
    #   data length: 17,331,152 * 4 = 69,096,304 = 65.90 Mb
    #	thus total length: 34 + 66  = 100 Mb
    #	At this density of data these storage areas are certainly
    #	comparable and practically equivalent.

    #	NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load
    #	these bedGraph types without this special sql business.

    #	do the other set of data too
    for HR in 00 02 08 32
    do
	rm -f encodeAffyChIpRnapHl60PvalHr${HR}.bed
	rm -f bedGraph.sql
	zcat \
	../ucsc_formats/wig/EC_HL60_RA_RNApol2_${HR}hr_AS_b.hs.NCBIv34.wig.gz \
	    | /cluster/bin/scripts/varStepToBedGraph.pl /dev/stdin | \
		sort -k1,1 -k2,2n > encodeAffyChIpRnapHl60PvalHr${HR}.bed
	hgsql -e "drop table encodeAffyChIpRnapHl60PvalHr${HR}" hg16 2> /dev/stderr
	echo "CREATE TABLE encodeAffyChIpRnapHl60PvalHr${HR} (" > bedGraph.sql
	echo 'bin smallint unsigned not null,' >> bedGraph.sql
	echo 'chrom varchar(255) not null,' >> bedGraph.sql
	echo 'chromStart int unsigned not null,' >> bedGraph.sql
	echo 'chromEnd int unsigned not null,' >> bedGraph.sql
	echo 'dataValue float not null,' >> bedGraph.sql
	echo 'INDEX(chrom(6),bin)' >> bedGraph.sql
	echo ');' >> bedGraph.sql
	hgLoadBed -sqlTable=bedGraph.sql hg16 \
	    encodeAffyChIpRnapHl60PvalHr${HR} \
		encodeAffyChIpRnapHl60PvalHr${HR}.bed
    done
    #

########################################################################
# Sanger Chip/chip updated  (DONE - 2005-05-25 - Hiram)
#	data supplier contacts: Christoph Koch <cmk@sanger.ac.uk>
#	Ian Dunham <id1@sanger.ac.uk> Robert Andrews <rma@sanger.ac.uk>
    ssh hgwdev
    mkdir /cluster/data/encode/sanger/chipchip/2005-05-25
    cd /cluster/data/encode/sanger/chipchip
    rm latest
    ln -s 2005-05-25 latest
    cd 2005-05-25

    cp -p /var/ftp/encode/*GM06990* .

    #	convert to simple bedGraph files:  (stripping browser and track
    #	line definitions from the custom track format), make sure they
    #	are sorted properly.
    grep "^chr" H3K4me1_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
	H3K4me1_GM06990_1.bed
    grep "^chr" H3K4me3_GM06990_2.wig.txt | sort -k1,1 -k2,2n > \
	H3K4me3_GM06990_2.bed
    grep "^chr" H4ac_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
	H4ac_GM06990_1.bed 
    grep "^chr" H3K4me2_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
	H3K4me2_GM06990_1.bed 
    grep "^chr" H3ac_GM06990_1.wig.txt | sort -k1,1 -k2,2n > \
	H3ac_GM06990_1.bed 

    #	find min and max in the data:
    awk '{print $4}' *.bed | sort -n | head -1
    #	0.02
    awk '{print $4}' *.bed | sort -n | tail -1
    #	72.34

    #	This used to be a single track with only one data set
    #	Place these new five data sets into a single composite track

    #	Simplify the naming scheme here:
    ln -s H3K4me1_GM06990_1.bed H3K4me1.bed
    ln -s H3K4me2_GM06990_1.bed H3K4me2.bed
    ln -s H3K4me3_GM06990_2.bed H3K4me3.bed
    ln -s H3ac_GM06990_1.bed H3ac.bed
    ln -s H4ac_GM06990_1.bed H4ac.bed

    #	NOTE !*! hgLoadBed has been fixed 2005-06-01 to be able to load
    #	these bedGraph types without this special sql business.

    #	so the names can now be used in a for loop:
    for T in H3K4me1 H3K4me2 H3K4me3 H3ac H4ac
    do
	hgsql -e "drop table encodeSangerChip${T}" hg16 2> /dev/stderr
	echo "CREATE TABLE encodeSangerChip${T} (" > bedGraph.sql
	echo 'bin smallint unsigned not null,' >> bedGraph.sql
	echo 'chrom varchar(255) not null,' >> bedGraph.sql
	echo 'chromStart int unsigned not null,' >> bedGraph.sql
	echo 'chromEnd int unsigned not null,' >> bedGraph.sql
	echo 'score float not null,' >> bedGraph.sql
	echo 'INDEX(chrom(6),bin)' >> bedGraph.sql
	echo ');' >> bedGraph.sql
	hgLoadBed -sqlTable=bedGraph.sql hg16 encodeSangerChip${T} ${T}.bed
    done

    #	transform the five description pages into one single page.
    #	see hg16/trackDb.ra for encodeSangerChipH3H4 composite track
    #	definitions
    #	clean up /var/ftp/encode
    
    rm -f /var/ftp/encode/Description_H3K4me1_GM06990_1.html
    rm -f /var/ftp/encode/Description_H3K4me2_GM06990_1.html
    rm -f /var/ftp/encode/Description_H3K4me3_GM06990_2.html
    rm -f /var/ftp/encode/Description_H3ac_GM06990_1.html
    rm -f /var/ftp/encode/Description_H4ac_GM06990_1.html
    rm -f /var/ftp/encode/H3K4me1_GM06990_1.wig.txt
    rm -f /var/ftp/encode/H3K4me2_GM06990_1.wig.txt
    rm -f /var/ftp/encode/H3K4me3_GM06990_2.wig.txt
    rm -f /var/ftp/encode/H3ac_GM06990_1.wig.txt
    rm -f /var/ftp/encode/H4ac_GM06990_1.wig.txt

    #	Sanger confirmed they no longer need the old data, remove the
    #	table from hgwdev:
    hgsql -e "drop table encodeSangerChip;" hg16

    #   Additional data submitted 2005-06-02
    ssh hgwdev
    cd /cluster/data/encode/sanger/chipchip/2005-06-02
    ln -s 
    grep "^chr" H3K4me2_K562_1.wig.txt | sort -k1,1 -k2,2n > \
	H3K4me2K562.bedGraph4
    grep "^chr" H4ac_K562_1.wig.txt | sort -k1,1 -k2,2n > \
	H4acK562.bedGraph4
    grep "^chr" H3ac_K562_1.wig.txt | sort -k1,1 -k2,2n > \
	H3acK562.bedGraph4
    for T in H3K4me2K562 H4acK562 H3acK562
    do
	hgLoadBed -bedGraph=4 hg16 encodeSangerChip${T} ${T}.bedGraph4
    done

    # Additional data submitted 2005-06-27 (Heather)
    ssh hgwdev
    cd /cluster/data/encode/sanger/chipchip/2005-06-27
    grep "^chr" H3K4me3_K562_1.wig.txt | sort -k1,1 -k2,2n > H3K4me3_K562.bedGraph4
    hgLoadBed -bedGraph=4 hg16 encodeSangerChipH3K4me3K562 H3K4me3_K562.bedGraph4
    

########################################################################
#  DONE - 2005-06-06 - Hiram
#  Loading tables UCSD Ludwig Institute ChIP/Chip(H3K4me3 Hela and
#	H3K4Me2 Hela/IFN-g - two tables here replace the data we
#	received on 2005-05-26 -> tmH3K4_p0.wig and tmH3K4_p30.wig
#	The data in the two files is identical except for their track
#	label comments.
#	data supplier contact: Bing Ren biren@UCSD.EDU

    ssh hgwdev
    cd /cluster/data/encode/UCSD/2005-05-31
    for T in acH3 acH4 dmH3K4 stat1 tmH3K4
    do
	grep "^chr" ${T}_p0.wig | sort -k1,1 -k2,2n \
		> encodeUcsdChipHeLaH3H4${T}_p0.bedGraph4
	grep "^chr" ${T}_p30.wig | sort -k1,1 -k2,2n \
		> encodeUcsdChipHeLaH3H4${T}_p30.bedGraph4
	hgLoadBed -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p0 \
		encodeUcsdChipHeLaH3H4${T}_p0.bedGraph4
	hgLoadBed -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p30 \
		encodeUcsdChipHeLaH3H4${T}_p30.bedGraph4
    done

    # a couple of later additions, just under the wire
    cd /cluster/data/encode/UCSD/2005-06-06
    #	rename to track name
    ln -s RNAP/rnap_p0.wig encodeUcsdChipHeLaH3H4RNAP_p0.wig
    ln -s Taf250/taf250_p0.wig encodeUcsdChipHeLaH3H4TAF250_p0.wig
    ln -s RNAP/rnap_p30.wig encodeUcsdChipHeLaH3H4RNAP_p30.wig
    ln -s Taf250/taf250_p30.wig encodeUcsdChipHeLaH3H4TAF250_p30.wig
for T in RNAP TAF250
do
    grep "^chr" encodeUcsdChipHeLaH3H4${T}_p0.wig | sort -k1,1 -k2,2n \
        > encodeUcsdChipHeLaH3H4${T}_p0.bed
    grep "^chr" encodeUcsdChipHeLaH3H4${T}_p30.wig | sort -k1,1 -k2,2n \
        > encodeUcsdChipHeLaH3H4${T}_p30.bed
    hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p0 \
        encodeUcsdChipHeLaH3H4${T}_p0.bed
    hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipHeLaH3H4${T}_p30 \
        encodeUcsdChipHeLaH3H4${T}_p30.bed
done

    #	extensive edits were performed on the delivered description files
    #	they were pretty sparse

# GIS RNA PET (2005-06-03 angie)
# Data updated (previous data was mistakenly in hg17 coords) (2006-10-19 kate)
    # Data saved to:
    #cd /cluster/data/encode/GIS/rna5fu/2005-05-31
    #ln -s 2005-05-31 /cluster/data/encode/GIS/rna5fu/latest
    cd /cluster/data/encode/GIS/rna/2005-10-31

    # Jim requested colored bed.  Names are of the form $id-$mapCount-$atCount.
    # Coloring scheme (inspired by Chuck's transcriptome track):
    #  ($mapCount == 1 && $atCount > 1):  dark blue
    #  ($mapCount == 1 && $atCount == 1): light blue
    #  ($mapCount > 1  && $atCount > 1):  medium brown
    #  ($mapCount > 1  && $atCount == 1): light brown
    # Angie's process, slightly modified (kate)
cat > load.hg16.csh << 'EOF'
    foreach f (lab/HCT116-hg16.bed lab/MCF7-hg16.bed)
      set cellType = `echo $f:t:r | sed -e 's/-hg16//'`
      echo $cellType
      set table = encodeGisRnaPet$cellType
      grep '^chr' $f | \
      perl -wpe \
     'chomp; @w = split; \
      if ($w[3] =~ /\d+-(\d+)-(\d+)/) { \
        ($mc, $ac) = ($1, $2, $3); \
        if ($mc == 1)   { $w[8] = ($ac > 1) ? "35,35,175" : "160,160,188"; } \
        elsif ($mc > 1) { $w[8] = ($ac > 1) ? "180,120,0" : "225,150,0"; } \
        else { die "mc $mc" } \
      } else { die "parse"; } \
      $_ = join(" ", @w) . "\n";'  > $table.bed
      hgLoadBed hg16 $table $table.bed
    end
'EOF'
csh load.hg16.csh >&! load.hg16.log
    rm *.bed


# UCSD/LI Nimblegen whole genome TAF1 & condensed validation (2005-06-02 angie)
    # Data saved to:
    cd /cluster/data/encode/UCSD/nimblegen/2005-05-31
    # TAF1.bed contains two separate tracks: known and novel sites.
    # I manually split it into two files: TAF1_known.bed and TAF1_novel.bed.
    # Coordinates were off-by-1 so adjust before loading:
    awk '{print $1 "\t" $2-1 "\t" $3-1;}' TAF1_known.bed \
    | hgLoadBed -noBin hg16 encodeUcsdNgChipKnownSites stdin
    awk '{print $1 "\t" $2-1 "\t" $3-1;}' TAF1_novel.bed \
    | hgLoadBed -noBin hg16 encodeUcsdNgChipNovelSites stdin
    # There are >1M rows on the big chromosomes, while 100k seems to be 
    # kind of an upper limit for bedGraph, performance-wise (>3sec load).
    # So load as wiggle; adjust coords here too:
    ssh kolossus
    cd /cluster/data/encode/UCSD/nimblegen/2005-05-31
    set track = encodeUcsdNgChipSignal
    awk '$1 == "variableStep" {print;} \
         $1 ~ /[0-9]+/ {print $1+1 "\t" $2;}' chr*.sub \
    > $track.varStep
    wigEncode $track.varStep $track.wig $track.wib
#Converted stdin, upper limit 6.47, lower limit -5.61
    # back on hgwdev:
    set track = encodeUcsdNgChipSignal
    set gbdbDir = /gbdb/hg16/encode/UCSD/2005-05-31/wib
    mkdir -p $gbdbDir
    ln -s `pwd`/$track.wib $gbdbDir/
    nice hgLoadWiggle hg16 $track $track.wig -pathPrefix=$gbdbDir
    # Load validation subtracks; standardize hist mod names and adjust coords:
    foreach f (*.wig)
      set cellType = `echo $f:r | perl -wpe 's/(\S+)_condensed_sorted/\u$1/; \
                                  s/Ach3/H3ac/g;  s/Meh3k4/H3K4me/g;'`
      set table = encodeUcsdNgValChip$cellType
      echo $f ' -> ' $table
      tail +5 $f | awk '{print $1 "\t" $2-1 "\t" $3-1 "\t" $4;}' \
      | nice hgLoadBed -onServer -bedGraph=4 hg16 $table stdin
    end
    # back on kolossus, clean up:
    gzip encodeUcsdNgChipSignal.wig encodeUcsdNgChipSignal.varStep 
    rm *.tab

# UCSD/LI Nimblegen RnaP & H3K4me3 (2005-06-03 angie)
    ssh kolossus
    cd /cluster/data/encode/UCSD/nimblegen/2005-06-01
    # Data are in hg17 coords, so must liftOver to hg16.
    foreach f (lab/Nim*/*.wig)
      set track = `echo $f:t:r | sed -e \
         's/rnap/encodeUcsdNgHeLaRnap/; s/tmh3k4/encodeUcsdNgHeLaH3K4me3/;'`
      echo $track
      tail +5 $f \
      | liftOver stdin /cluster/data/hg17/bed/liftOver/hg17ToHg16.over.chain \
          hg16.$track.bed hg16.$track.unmapped
    end
    wc -l hg16*
# 384969 hg16.encodeUcsdNgHeLaH3K4me3_p0.bed
#    360 hg16.encodeUcsdNgHeLaH3K4me3_p0.unmapped
# 384969 hg16.encodeUcsdNgHeLaH3K4me3_p30.bed
#    360 hg16.encodeUcsdNgHeLaH3K4me3_p30.unmapped
# 384969 hg16.encodeUcsdNgHeLaRnap_p0.bed
#    360 hg16.encodeUcsdNgHeLaRnap_p0.unmapped
# 384969 hg16.encodeUcsdNgHeLaRnap_p30.bed
#    360 hg16.encodeUcsdNgHeLaRnap_p30.unmapped
    # All but one of the unmapped's were from chrX, and most of those were 
    # "Deleted in new" (i.e. in hg17 but not hg16).  Seems OK.
    # There are <100,000 per chrom, so load as bedGraph.
    ssh hgwdev
    cd /cluster/data/encode/UCSD/nimblegen/2005-06-01
    foreach f (hg16.*.bed)
      hgLoadBed -onServer -bedGraph=4 hg16 $f:t:r $f
    end

###########################################################################
# TBA Alignments from Elliott Margulies (2005-06-06 kate)
    mkdir /cluster/data/encode/TBA
    cd /cluster/data/encode/TBA
    mkdir 2005-06-08
    cd 2005-06-08
    mkdir lab; cd lab
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=3 --no-parent \
	--no-host-directories --cut-dirs=5 \
    ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode_tba/MAY-2005/

    # load mafs
    gunzip *.maf.gz
    mkdir -p /gbdb/hg16/encode/TBA/maf
    rm -f /gbdb/hg16/encode/TBA/maf/*.maf
    ln -s /cluster/data/encode/TBA/latest/lab/*.maf /gbdb/hg16/encode/TBA/maf
    cd ..
    hgLoadMaf -pathPrefix=/gbdb/hg16/encode/TBA/maf -WARN \
        hg16 encodeTbaAlign >&! load.log
    # Loaded 229243 mafs in 44 files from /gbdb/hg16/encode/TBA/maf
    # NOTE: many warnings about "Score too small"

    # load phastCons conservation wiggle
    gunzip -c lab/wig/human.EN*.pC.*.wig.gz | \
        wigEncode stdin phastCons.wig phastCons.wib
    rm -f /gbdb/hg16/encode/TBA/phastCons.wib
    ln -s `pwd`/phastCons.wib /gbdb/hg16/encode/TBA
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/TBA hg16 \
                        encodeTbaPhastCons phastCons.wig 

    # Updated with 6/22 binCons data at Elliott's request (2005-06-29 kate)
    cd /cluster/data/encode/TBA/latest/lab/wig/
    mkdir jun30
    cd jun30
    wget -r -nd ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/encode_tba/MAY-2005/wig/
    cd ../../
    gunzip -c lab/wig/jun30/EN*.binCons.*.wig.gz | \
        wigEncode stdin binCons.wig binCons.wib
    gunzip -c lab/wig/jun30/human*.pC.*.wig.gz | \
        wigEncode stdin phastCons.wig phastCons.wib
    set d = /gbdb/hg16/encode/TBA
    rm -f $d/binCons.wib $d/phastCons.wib
    ln -s `pwd`/binCons.wib $d
    ln -s `pwd`/phastCons.wib $d
    hgLoadWiggle -pathPrefix=$d hg16 encodeTbaPhastCons phastCons.wig 
    hgLoadWiggle -pathPrefix=$d hg16 encodeTbaPhastCons phastCons.wig 

    # load MAF summary table
    cat lab/*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 \
                                -maxSize=50000 hg16 encodeTbaSummary stdin


    # load conserved elements (6 tables)
    # scored elements
    hgLoadBed -noBin hg16 encodeTbaBinConsEl lab/tba.binCons.bed
        # loaded 18677
    hgLoadBed -noBin hg16 encodeTbaPhastConsEl lab/tba.phastCons.bed
        # loaded 36105 elements
    # unscored - fix at score=500
    ln -s lab/tba.or.bed TbaUnionEl.bed
    ln -s lab/tba.or.nc.bed TbaNcUnionEl.bed
    ln -s lab/tba.and.bed TbaIntersectEl.bed
    ln -s lab/tba.and.nc.bed TbaNcIntersectEl.bed

    foreach f (Tba*.bed)
        set b = $f:r
        set t = encode$b
        sed -e 's/$/\t500/' $f | hgLoadBed -noBin hg16 $t stdin
        checkTableCoords hg16 $t
    end

    # load GERP conservation score and elements
    # conservation wiggle 
    cat lab/GERP/chr*.wig | \
        wigEncode stdin GerpCons.wig GerpCons.wib
        # Converted stdin, upper limit 3.00, lower limit 0.00
    mkdir -p /gbdb/hg16/encode/TBA
    rm -f /gbdb/hg16/encode/TBA/GerpCons.wib
    ln -s `pwd`/GerpCons.wib /gbdb/hg16/encode/TBA
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/TBA hg16 \
                        encodeTbaGerpCons GerpCons.wig 

    # conserved elements 
    # NOTE: round float scores
    set t = encodeTbaGerpEl
    perl -we 'while (<>) { \
      if (/^chr/) { \
        ($chrom,$start,$end,$name,$score) = split; \
        $name =~ s/GERP.score=/GERP./; \
        printf("%s\t%d\t%d\t%s\t%.0f\n", $chrom,$start,$end,$name,$score); \
      } }' lab/GERP/ENC_GERP_TBA_cons.bed > $t.tab
    hgLoadBed -noBin hg16 $t $t.tab
    # Loaded 40576 elements of size 5
    
###########################################################################
#  Loading Yale Affy ChIP/Chip data (DONE - 2005-06-03 - Hiram)
    #	Two files were found to have bad start,end coordinates, they
    #	were resubmitted by Joel Rozowsky:
    #	encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Pvalue_Map.wig
    #	encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Signal_Map.wig
    #	This prompted the addition of the -strict option to hgLoadBed
    #	data supplier contact: Joel Rozowsky joel.rozowsky@yale.edu

    cd /cluster/data/encode/yale/affy/2005-06-01/CHIPCHIP_STAT1
    #	rename the data files to their resulting table names:
    ln -s encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Pvalue_Map.wig \
	encodeYaleChIPSTAT1HeLaBingRenPval.wig
    ln -s encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Signal_Map.wig \
	encodeYaleChIPSTAT1HeLaBingRenSig.wig
    ln -s encode_Yale_Chipchip_STAT1_Hela_BingRenArray_Sites.bed \
	encodeYaleChIPSTAT1HeLaBingRenSites.bed
    ln -s \
	encode_Yale_Chipchip_STAT1_Hela_Maskless36merevery36bp_Pvalue_Map.wig \
	encodeYaleChIPSTAT1HeLaMaskLess36mer36bpPval.wig
    ln -s \
	encode_Yale_Chipchip_STAT1_Hela_Maskless36merevery36bp_Signal_Map.wig \
	encodeYaleChIPSTAT1HeLaMaskLess36mer36bpSig.wig
    ln -s encode_Yale_Chipchip_STAT1_Hela_Maskless36merevery36bp_Sites.bed \
	encodeYaleChIPSTAT1HeLaMaskLess36mer36bpSite.bed
    ln -s \
	encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery38bp_Pvalue_Map.wig \
	encodeYaleChIPSTAT1HeLaMaskLess50mer38bpPval.wig
    ln -s \
	encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery38bp_Signal_Map.wig \
	encodeYaleChIPSTAT1HeLaMaskLess50mer38bpSig.wig
    ln -s encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery38bp_Sites.bed \
	encodeYaleChIPSTAT1HeLaMaskLess50mer38bpSite.bed
    ln -s \
	encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery50bp_Pvalue_Map.wig \
	encodeYaleChIPSTAT1HeLaMaskLess50mer50bpPval.wig
    ln -s \
	encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery50bp_Signal_Map.wig \
	encodeYaleChIPSTAT1HeLaMaskLess50mer50bpSig.wig
    ln -s encode_Yale_Chipchip_STAT1_Hela_Maskless50merevery50bp_Sites.bed \
	encodeYaleChIPSTAT1HeLaMaskLess50mer50bpSite.bed
    #	now they can simply be loaded, the .wig files:
for T in BingRenPval BingRenSig MaskLess36mer36bpPval MaskLess50mer38bpPval \
        MaskLess50mer50bpPval MaskLess36mer36bpSig MaskLess50mer38bpSig \
        MaskLess50mer50bpSig
do
    TBL="encodeYaleChIPSTAT1HeLa${T}"
    grep "^chr" ${TBL}.wig | sort -k1,1 -k 2,2n |
        hgLoadBed -strict -bedGraph=4 hg16 ${TBL} stdin
done
    #	The bed files:
for T in BingRenSites MaskLess36mer36bpSite MaskLess50mer38bpSite \
	MaskLess50mer50bpSite
do
    TBL="encodeYaleChIPSTAT1HeLa${T}"
    grep "^chr" ${TBL}.bed | sort -k1,1 -k 2,2n |
        hgLoadBed -strict hg16 ${TBL} stdin
done
    #	very good description html files were delivered with this data
    #	set.

    #	The four Signal data sets were packaged into the composite track:
    #	encodeYaleChIPSTAT1Sig
    #	The four Sites data sets were packaged into the composite track:
    #	encodeYaleChIPSTAT1Sites
    #	The four Pvalue data sets were packaged into the composite track:
    #	encodeYaleChIPSTAT1Pval

#######################################################################
#  Yale Affy RNA data sets (DONE - 2005-06-07 - Hiram)
#	data supplier contact: Joel Rozowsky joel.rozowsky@yale.edu
#  NOTE 2005-08-01 ### *** It turns out something was off-by-one with the
#  coordinates on all these data points.  At this time we are not yet
#  certain what happened.  At any rate, this data is being reloaded to
#  fix these coordinate problems.
    cd /cluster/data/encode/yale/affy/2005-06-01/RNA_NEUTROPHIL
    #	create symlinks to rename the files to their table names:
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars.html \
	encodeYaleAffyNeutRNATars.html
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep1.bed \
	encodeYaleAffyNeutRNATars01.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep2.bed \
	encodeYaleAffyNeutRNATars02.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep3.bed \
	encodeYaleAffyNeutRNATars03.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep4.bed \
	encodeYaleAffyNeutRNATars04.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep5.bed \
	encodeYaleAffyNeutRNATars05.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep6.bed \
	encodeYaleAffyNeutRNATars06.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep7.bed \
	encodeYaleAffyNeutRNATars07.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep8.bed \
	encodeYaleAffyNeutRNATars08.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep9.bed \
	encodeYaleAffyNeutRNATars09.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars_biorep10.bed \
	encodeYaleAffyNeutRNATars10.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Tars.bed \
	encodeYaleAffyNeutRNATarsAll.bed
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map.html \
	encodeYaleAffyNeutRNATransMap.html
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep1.wig \
	encodeYaleAffyNeutRNATransMap01.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep2.wig \
	encodeYaleAffyNeutRNATransMap02.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep3.wig \
	encodeYaleAffyNeutRNATransMap03.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep4.wig \
	encodeYaleAffyNeutRNATransMap04.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep5.wig \
	encodeYaleAffyNeutRNATransMap05.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep6.wig \
	encodeYaleAffyNeutRNATransMap06.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep7.wig \
	encodeYaleAffyNeutRNATransMap07.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep8.wig \
	encodeYaleAffyNeutRNATransMap08.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep9.wig \
	encodeYaleAffyNeutRNATransMap09.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map_biorep10.wig \
	encodeYaleAffyNeutRNATransMap10.wig
   ln -s encode_Yale_Affy_Neutrophil_RNA_Transcript_Map.wig \
	encodeYaleAffyNeutRNATransMapAll.wig
    #	The TransMap tracks were too slow loaded as bedGraph, load them as
    #	wiggle data.  Something was fishy with trying to pipe wigEncode
    #	stdout to hgLoadWiggle stdin.  It kept complaining about some
    #	line as if the stderr of the wigEncode was getting through the
    #	pipe.  Hence the temporary w.${TBL} file:
    #	Re-loaded 2005-08-01 to add 1 to start,end coordinates
    mkdir -p /gbdb/hg16/encode/Yale/Affy/wib
    mkdir wib
for T in All
do
    TBL="encodeYaleAffyNeutRNATransMap${T}"
    rm -f w.${TBL} wib/${TBL}.wib t.wig
    echo -e "chr1\t148374651\t148374672\t0" > t.wig
    grep "^chr" ${TBL}.wig | \
	awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' >> t.wig
    sort -k1,1 -k2,2n t.wig | \
	wigEncode stdin w.${TBL} wib/${TBL}.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 ${TBL} w.${TBL}
    rm -f /gbdb/hg16/encode/Yale/Affy/wib/${TBL}.wib
    ln -s `pwd`/wib/${TBL}.wib \
        /gbdb/hg16/encode/Yale/Affy/wib
    rm -f w.${TBL} t.wig
done
    
for T in 01 02 03 04 05 06 07 08 09 10
do
    TBL="encodeYaleAffyNeutRNATransMap${T}"
    rm -f w.${TBL} wib/${TBL}.wib
    grep "^chr" ${TBL}.wig | \
	awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' \
	| sort -k1,1 -k2,2n | \
	wigEncode stdin w.${TBL} wib/${TBL}.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 ${TBL} w.${TBL}
    rm -f /gbdb/hg16/encode/Yale/Affy/wib/${TBL}.wib
    ln -s `pwd`/wib/${TBL}.wib \
        /gbdb/hg16/encode/Yale/Affy/wib
    rm -f w.${TBL}
done
    #	Loading the Tars files:
for T in All 01 02 03 04 05 06 07 08 09 10
do
    TBL="encodeYaleAffyNeutRNATars${T}"
    echo "${TBL}.bed"
    grep "^chr" ${TBL}.bed | \
	awk '{printf "%s\t%d\t%d\n", $1, $2+1, $3+1}' \
	| sort -k1,1 -k2,2n | \
	hgLoadBed -strict hg16 ${TBL} stdin
done
    #	The file encode_Yale_Affy_Neutrophil_RNA_Transcript_Map.wig
    #	had an unusual error in its chromEnd column, it was
    #	manually fixed here and verified with Joel that this was the
    #	correct fix:
    #	51506c51506
    #	< chr2   234999988       2.35e+08        -5.7625 
    #	---
    #	> chr2   234999988       235000006       -5.7625 
    #	717784c717784
    #	< chrX   151999980       1.52e+08        0.6375 
    #	---
    #	> chrX   151999980       152000005       0.6375 
    #	This is a pattern of errors that will be seen in other files in
    #	this data set.  Some kind of PERL thing Joel suspects.

    #	The 11 TransMap data sets are grouped into the composite track:
    #	encodeYaleAffyRNATransMap
    #	And the 11 TARs are grouped into: encodeYaleAffyRNATars

    #	Same procedure for the PLACENTA tissue samples:
    ln -s encode_Yale_Affy_Placenta_RNA_Tars.bed \
	encodeYaleAffyPlacRNATars.bed
    ln -s encode_Yale_Affy_Placenta_RNA_Tars.html \
	encodeYaleAffyPlacRNATars.html
    ln -s encode_Yale_Affy_Placenta_RNA_Transcript_Map.html \
	encodeYaleAffyPlacRNATransMap.html
    ln -s encode_Yale_Affy_Placenta_RNA_Transcript_Map.wig \
	encodeYaleAffyPlacRNATransMap.wig

    #	There is only one of each type here
    #	Reloaded 2005-08-01, added 1 to all start,end coordinates and
    #	added the missing first probe at chr1 148374651 148374672
    mkdir wib
    rm -f t.wig
    echo -e "chr1\t148374651\t148374672\t0" > t.wig
    grep "^chr" encodeYaleAffyPlacRNATransMap.wig  | \
	awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' >> t.wig
    sort -k1,1 -k2,2n t.wig | \
        wigEncode stdin tmp.wig wib/encodeYaleAffyPlacRNATransMap.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 \
        encodeYaleAffyPlacRNATransMap tmp.wig
    rm -f /gbdb/hg16/encode/Yale/Affy/wib/encodeYaleAffyPlacRNATransMap.wib
    ln -s `pwd`/wib/encodeYaleAffyPlacRNATransMap.wib \
        /gbdb/hg16/encode/Yale/Affy/wib
    rm -f tmp.wig
    rm -f t.wig

    grep "^chr" encodeYaleAffyPlacRNATars.bed | \
	awk '{printf "%s\t%d\t%d\n", $1, $2+1, $3+1}' \
	| sort -k1,1 -k2,2n | \
        hgLoadBed -strict hg16 encodeYaleAffyPlacRNATars stdin

    #	These two were added to the previously mentioned composites:
    #	encodeYaleAffyRNATransMap and encodeYaleAffyRNATars

    #	Same procedure for the NB4 data:
    cd /cluster/data/encode/yale/affy/2005-06-01/RNA_NB4
    ln -s encode_Yale_Affy_NB4_RA_RNA_Tars.bed \
	encodeYaleAffyNB4RARNATars.bed
    ln -s encode_Yale_Affy_NB4_Untreated_RNA_Tars.bed \
	encodeYaleAffyNB4UntrRNATars.bed
    ln -s encode_Yale_Affy_NB4_TPA_RNA_Tars.bed \
	encodeYaleAffyNB4TPARNATars.bed
    ln -s encode_Yale_Affy_NB4_RA_RNA_Transcript_Map.wig \
	encodeYaleAffyNB4RARNATransMap.wig
    ln -s encode_Yale_Affy_NB4_TPA_RNA_Transcript_Map.wig \
	encodeYaleAffyNB4TPARNATransMap.wig
    ln -s encode_Yale_Affy_NB4_Untreated_RNA_Transcript_Map.wig \
	encodeYaleAffyNB4UntrRNATransMap.wig

for T in NB4RA NB4TPA NB4Untr
do
    TNAME=encodeYaleAffy${T}RNATars
    grep "^chr" ${TNAME}.bed | \
	awk '{printf "%s\t%d\t%d\n", $1, $2+1, $3+1}' \
	| sort -k1,1 -k2,2n | \
        hgLoadBed -strict hg16 ${TNAME} stdin
done

mkdir wib

for T in NB4RA NB4TPA NB4Untr
do
    TBL=encodeYaleAffy${T}RNATransMap
    rm -f w.${T} t.wig wib/${TBL}.wib
    echo -e "chr1\t148374651\t148374672\t0" > t.wig
    grep "^chr" ${TBL}.wig | \
	awk '{printf "%s\t%d\t%d\t%s\n", $1, $2+1, $3+1, $4}' >> t.wig
    sort -k1,1 -k2,2n t.wig | \
        wigEncode stdin w.${T} wib/${TBL}.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Yale/Affy hg16 \
            ${TBL} w.${T}
    rm -f /gbdb/hg16/encode/Yale/Affy/wib/${TBL}.wib
    ln -s `pwd`/wib/${TBL}.wib /gbdb/hg16/encode/Yale/Affy/wib
    rm -f w.${T} t.wig
done
    #	These six were added to the previously mentioned composites:
    #	encodeYaleAffyRNATransMap and encodeYaleAffyRNATars

    #	The primary .html file was enhanced to include the specifics of
    #	each of these three data sets into the one document

#########################################################################
#  Farnham Lab/UC Davis Nimblegen track (DONE - 2005-06-07 - Hiram)
#	Data supplier contact: Kyle J. Munn - kmunn@nimblegen.com
    cd /cluster/data/encode/UcDavis/2005-06-01
    awk '{printf "%s\t%s\t%s\t%s\n", $1,$4,$5,$6}' E2F1_Median.gff | \
	sort -k1,1 -k2,2n > encodeUCDavisE2F1Median.bedGraph4
    hgLoadBed -strict -bedGraph=4 hg16 encodeUCDavisE2F1Median \
        encodeUCDavisE2F1Median.bedGraph4


#########################################################################
#  Rearrangement of existing UCSD tracks, combining into a single
#  composite track: encodeLIChIP - (DONE - 2005-06-07 - Hiram)
#	In general, the data supplier contact for UCSD data is:
#	Bing Ren biren@UCSD.EDU
# long label: Ludwig Institute/UCSD ChIP/Chip (Pol2, TAF1, AcH3, H3K4me,
# 	H3K27me3 antibodies)
# short label: LI ChIP Various
    ssh hgwdev
    #	two new data tracks:
    cd /cluster/data/encode/UCSD/2005-06-01
    grep "^chr" suz12.wig | sort -k1,1 -k2,2n > \
	encodeUcsdChipH3K27me3Suz12.bedGraph
    grep "^chr" tmh3k27.wig | sort -k1,1 -k2,2n > \
	encodeUcsdChipH3K27me3.bedGraph

    hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipH3K27me3Suz12 \
        encodeUcsdChipH3K27me3Suz12.bedGraph
    hgLoadBed -strict -bedGraph=4 hg16 encodeUcsdChipH3K27me3 \
        encodeUcsdChipH3K27me3.bedGraph

    #	Reloading existing tracks to change the data value column from a
    #	string to a float:
    mkdir /cluster/data/encode/UCSD/fix_columns
    cd /cluster/data/encode/UCSD/fix_columns
    #	fetch data from existing tables
for T in encodeUcsdChipMeh3k4Imr90 encodeUcsdChipAch3Imr90 \
        encodeUcsdChipTaf250Hct116 encodeUcsdChipTaf250Imr90 \
        encodeUcsdChipTaf250Thp1 encodeUcsdChipTaf250Hela \
        encodeUcsdChipRnapHct116 encodeUcsdChipRnapImr90 \
        encodeUcsdChipRnapThp1 encodeUcsdChipRnapHela
do
    hgsql -N -e "describe ${T}" hg16 > ${T}.desc
    hgsql -N -e \
        "select chrom,chromStart,chromEnd,name from ${T};" hg16 | \
        sort -k1,1 -k2,2n > ${T}.bed
done

    #	reload these tables as bed graph, adding a _f to the table names:
    for T in encodeUcsdChipMeh3k4Imr90 encodeUcsdChipAch3Imr90 \
        encodeUcsdChipTaf250Hct116 encodeUcsdChipTaf250Imr90 \
        encodeUcsdChipTaf250Thp1 encodeUcsdChipTaf250Hela \
        encodeUcsdChipRnapHct116 encodeUcsdChipRnapImr90 \
        encodeUcsdChipRnapThp1 encodeUcsdChipRnapHela
do
    hgLoadBed -strict -bedGraph=4 hg16 ${T}_f ${T}.bed
done
    #	each individual track's short and long label:
# LI Pol2 HeLa - Ludwig Institute ChIP/Chip (Pol2 ab, HeLa cells)
# LI Pol2 THP1 - Ludwig Institute ChIP/Chip (Pol2 ab, THP1 cells)
# LI Pol2 IMR90 - Ludwig Institute ChIP/Chip (Pol2 ab, IMR90 cells)
# LI Pol2 HCT116 - Ludwig Institute ChIP/Chip (Pol2 ab, HCT116 cells)
# LI TAF1 HeLa - Ludwig Institute ChIP/Chip (TAF1 ab, HeLa cells)
# LI TAF1 THP1 - Ludwig Institute ChIP/Chip (TAF1 ab, THP1 cells)
# LI TAF1 IMR90 - Ludwig Institute ChIP/Chip (TAF1 ab, IMR90 cells)
# LI TAF1 HCT116 - Ludwig Institute ChIP/Chip (TAF1 ab, HCT116 cells)
# LI AcH3 IMR90 - Ludwig Institute ChIP/Chip (AcH3 ab, IMR90 cells)
# LI H3K4me IMR90 - Ludwig Institute ChIP/Chip (MeH3K4 ab, IMR90 cells)
# LI SUZ12 HeLa - Ludwig Institute ChIP/Chip (SUZ12 protein ab, HeLa cells
#	unstimulated)
# LI H3K27me3 HeLa - Ludwig Institute ChIP/Chip (H3K27me3 ab, HeLa cells
#	unstimulated)
    #	Replaces the composite track: encodeUcsdChipRnap
    # LI ChIP Pol2 - Ludwig Institute ChIP/Chip (Pol2 initiation form antibody)
    #		which had four subtracks.
    #   And: encodeUcsdChipTaf250
    # LI ChIP TAF1 - Ludwig Institute ChIP/Chip (TAF1 antibody)
    #		which had four subtracks.
    #	And the single track: encodeUcsdChipAch3Imr90
    # LI AcH3 IMR90 - Ludwig Institute ChIP/Chip (AcH3 antibody, IMR90 cells)
    #   And the single track:  encodeUcsdChipMeh3k4Imr90
    #LI MeH3K4 IMR90 - Ludwig Institute ChIP/Chip (MeH3K4 antibody, IMR90 cells)

##########################################################################
# EGASP Partial (2005-06-22 kate)
# Gene tracks submitted for the EGASP competition, submitted
#       by the Gencode group (Roderic Guigo, Julien Legarde, IMIM) 
    cd /cluster/data/encode
    mkdir -p EGASP/Partial/lab
    cd EGASP/Partial/lab
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=3 --no-parent \
	--no-host-directories --cut-dirs=5 -nd \
       ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_partial/

    wc -l lab/*.gtf
       # 1778 lab/ASPic.gtf
       # 4215 lab/AceSCAN.gtf
       # 2692 lab/Augustus_EST-Protein.gtf
       # 2347 lab/Augustus_abinitio.gtf
       # 2736 lab/Augustus_any.gtf
       # 2567 lab/Augustus_dualgenome.gtf
       # 3458 lab/GeneZilla.gtf
       # 2194 lab/SAGA.gtf
    # NOTE: exclude ASPic, which contains only intron records
    # Filenames above, with _CHR_COORDS_hg17.gff appended, are chrom coordinate versions

    # GeneZilla
    ldHgGene -out=genezilla.gp null null lab/GeneZilla.*.gff
        # Read 454 transcripts in 3949 lines in 1 files
        # 454 groups 20 seqs 1 sources 1 feature types
        # 454 gene predictions
    liftOver -genePred genezilla.gp \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                genezilla.hg16.gp genezilla.gp.unmapped
    wc -l genezilla.hg16.gp genezilla.gp.unmapped
      # 655 genezilla.hg16.gp
      # 2 genezilla.gp.unmapped
        # ENr131-19
    ldHgGene -predTab hg16 encodeEgaspPartGenezilla genezilla.hg16.gp
        # 655 gene predictions
    genePredCheck -db=hg16 encodeEgaspPartGenezilla

    # SAGA
    # Strip out trailing ## on lines where manual changes were made
    #   (see notes in .gtf file)
    sed -e 's/ ##.*//' lab/SAGA.*.gff | \
        ldHgGene -out=saga.gp null null stdin
            # 378 gene predictions
    liftOver -genePred saga.gp \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                saga.hg16.gp saga.gp.unmapped
    wc -l saga.hg16.gp saga.gp.unmapped
      # 377 saga.hg16.gp
      # 2 saga.gp.unmapped
        # ENr131.3.0
    ldHgGene -predTab hg16 encodeEgaspPartSaga saga.hg16.gp
        # 377 gene predictions
    genePredCheck -db=hg16 encodeEgaspPartSaga

    # Augustus
   wc -l lab/Augustus*.gff
       # 2646 lab/Augustus_EST-Protein.gtf_CHR_COORDS_hg17.gff
       # 2301 lab/Augustus_abinitio.gtf_CHR_COORDS_hg17.gff
       # 2690 lab/Augustus_any.gtf_CHR_COORDS_hg17.gff
       # 2521 lab/Augustus_dualgenome.gtf_CHR_COORDS_hg17.gff
   ln -s lab/Augustus_EST-Protein.gtf_CHR_COORDS_hg17.gff augustus.est.gff
   ln -s lab/Augustus_abinitio.gtf_CHR_COORDS_hg17.gff augustus.abinitio.gff
   ln -s lab/Augustus_any.gtf_CHR_COORDS_hg17.gff augustus.any.gff
   ln -s lab/Augustus_dualgenome.gtf_CHR_COORDS_hg17.gff augustus.dual.gff

    # convert to genePred
    foreach f (augustus.*.gff)
        set b = $f:r
        ldHgGene -out=$b.gp -genePredExt null null $f
    end
        # Reading abinitio.gff
        # 418 gene predictions
        # Reading any.gff
        # 399 gene predictions
        # Reading dual.gff
        # 413 gene predictions
        # Reading est.gff
        # 381 gene predictions

    # lift to hg16
    foreach f (augustus.*.gp)
        set b = $f:r
        liftOver -genePred $b.gp \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                $b.hg16.gp $b.gp.unmapped
    end
    wc -l augustus.*.hg16.gp
        # 417 augustus.abinitio.hg16.gp
        # 398 augustus.any.hg16.gp
        # 412 augustus.dual.hg16.gp
        # 380 augustus.est.hg16.gp

    wc -l augustus.*.unmapped
      # 2 augustus.abinitio.gp.unmapped
      # 2 augustus.any.gp.unmapped
      # 2 augustus.dual.gp.unmapped
      # 2 augustus.est.gp.unmapped

    # load
    foreach f (augustus.*.hg16.gp)
        genePredCheck $f
        set t = `echo $f | sed -e 's/augustus.\(.*\).hg16.gp/encodeEgaspPartAugustus\u\1/'`
        ldHgGene -predTab -genePredExt hg16 $t $f
        checkTableCoords hg16 $t
    end

    # AceSCAN
    # Split into two tracks -- conserved, and other, based on feature
    wc -l lab/AceSCAN*.gff
        # 4163 lab/AceSCAN.gtf_CHR_COORDS_hg17.gff
    sed -n '/ACE\[+\]exon/s//exon/p' lab/AceSCAN*.gff > aceCons.scored.gff
    sed -n '/ACE\[-\]exon/s//exon/p' lab/AceSCAN*.gff > aceOther.scored.gff
    wc -l ace*.gff
        #    159 aceCons.gff
        #    4004 aceOther.gff

cat > doGene.csh << 'EOF'
    foreach f (ace*.scored.gff)
        echo $f
        set b = $f:r:r
        awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' $f | \
            ldHgGene -out=$b.gp null null stdin
        liftOver -genePred $b.gp \
                /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                    $b.hg16.gp $b.gp.unmapped
        wc -l $b.hg16.gp $b.gp.unmapped
        set t = `echo $b | perl -wpe 's/(.*)/encodeEgaspPart\u$1/'`
        ldHgGene -predTab hg16 $t $b.hg16.gp
        genePredCheck -db=hg16 $t
    end
'EOF'
csh doGene.csh >&! doGene.log
# aceCons
# 117 gene predictions
# aceOther
# 727 gene predictions (708 mapped)

##########################################################################
# EGASP Full (2005-06-27 kate)
# Gene tracks submitted for the EGASP competition, submitted
#       by the Gencode group (Roderic Guigo, Julien Legarde, IMIM) 
    cd /cluster/data/encode
    mkdir -p EGASP/Full/lab
    cd EGASP/Full/lab
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=3 --no-parent \
	--no-host-directories --cut-dirs=5 \
            ftp://genome.imim.es/pub/projects/gencode/data/egasp05/submitted_predictions/EGASP_on_all_regions/ -nd
    wc -l *.gtf
    cd ..
    mkdir custom
      # 34059 Aceview.gtf
    ln -s lab/Aceview.*.gff Aceview.gff
      # 11968 CSTminer.gtf
            # SKIP THIS (it's sequence tags, not gene preds)
       # 3241 DOGFISH-C.gtf
    ln -s lab/DOGFISH-C.*.gff Dogfish.gff
      # 13106 EnsEMBL_predictions.gtf
    ln -s `pwd`/lab/EnsEMBL*gff custom/Ensembl.gff
        # Has frames
      # 19004 ExoGean.gtf
    ln -s lab/ExoGean.*.gff Exogean.gff
       # 9273 ExonHunter.gtf
    ln -s `pwd`/lab/ExonHunter.*.gff custom/Exonhunter.gff
       # NOTE: has frame and start_codon annotations
      # 11679 FGenesh++.gtf
    ln -s lab/FGene*.gff Fgenesh.gff
        # 688 FProm.gtf
        # SKIPPING (promoter, not genepred)
       # 4178 GeneID_U12.gtf
    ln -s `pwd`/lab/GeneID_U12.*.gff custom/GeneIdU12.gff
      # 12107 GeneMark_on_masked_seq.gtf
    ln -s `pwd`/lab/GeneMark*.gff custom/Genemark.gff
        # Has transcript_ID but not gene_id

    # Redo Jigsaw 2005-08-16 kate
       # 4004 JigSaw.gtf
    ln -s `pwd`/lab/JigSaw.*.gff custom/Jigsaw.gff

        # 740 McPromoter.gtf
        # SKIPPING (promoters)
      # 12816 N-SCAN-Pairagon_any_v1.gtf
    sed -e 's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_any*.gff > PairagonAny.gff
      # 12798 N-SCAN-Pairagon_mRNA_EST_v1.gtf
    sed -e 's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_mRNA*.gff > PairagonMrna.gff
       # 7676 N-SCAN-Pairagon_multiple_v2.gtf
    sed -e 's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_multiple*.gff > PairagonMultiple.gff
       # 7676 N-SCAN-Pairagon_novel_v2.gtf
    sed -e 's/.pmasked.100Kpad.fa//g' lab/N-SCAN-Pairagon_novel*.gff > PairagonNovel.gff
       # 6107 SGP_U12.gtf
    ln -s `pwd`/lab/SGP_U12.*.gff custom/SGP2U12.gff
      # 10007 SPIDA.gtf
    ln -s lab/SPIDA.*.gff Spida.gff
        # 208 SoftBerry_pseudogenes.gtf
    sed -e 's/pseudogene\t/CDS\t/' lab/SoftBerry*.gff > SoftberryPseudo.gff
      # 11540 TwinScan-MARS.gtf
    sed -e 's/transcript_id "W-U-Mm/transcript_id "/' lab/Twin*.gff > custom/Twinscan.gff
        # Has start_codon annotation and frames
        # 252 Uncover.gtf
        # SKIPPING (skipped exons, not gene preds)
    
    # Process "standard" gff files
    # NOTE: must dummy out scores -- float values
cat > doGene.csh << 'EOF'
foreach f (*.gff)
    set b = $f:r
    echo $f
    awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' $f | \
    ldHgGene -out=$b.gp null null stdin
    liftOver -genePred $b.gp /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                $b.hg16.gp $b.gp.unmapped
    wc -l $b.hg16.gp $b.gp.unmapped
    set t = encodeEgaspFull$b
    ldHgGene -predTab hg16 $t $b.hg16.gp
    genePredCheck -db=hg16 $t
end
'EOF'
csh doGene.csh >&! doGene.log
# NOTE: warnings about missing frames are OK.  We're using -genePredExt
# just to pick up the gene name

    # process special files
    cd custom
    grep -v pseudogene Ensembl.gff | \
        ldHgGene -genePredExt -out=Ensembl.gp null null stdin
        # 735 gene predictions
    grep pseudogene Ensembl.gff | \
        ldHgGene -genePredExt -out=EnsemblPseudo.gp null null stdin
        # 34
    ldHgGene -gtf -genePredExt -out=Exonhunter.gp null null Exonhunter.gff
        # 1435 gene predictions
    ldHgGene -gtf -genePredExt -out=Jigsaw.gp null null Jigsaw.gff
    awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' Twinscan.gff | \
        ldHgGene -genePredExt -gtf -out=Twinscan.gp null null stdin
            # 954 gene predictions

    # GENEID and SGP2 with special intron tracks
    # NOTE: dummy out scores (wrong format -- float, and neg. values)
    awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < GeneIdU12.gff > geneId.gff
    ldHgGene -genePredExt -out=GeneId.gp null null geneId.gff
        # 476 gene predictions

    awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < SGP2U12.gff > sgp2.gff
    ldHgGene -genePredExt -out=Sgp2.gp null null sgp2.gff
        # 930 gene predictions

    # lift and load some customs
cat > doGene.csh << 'EOF'
foreach f (*.gp)
    set b = $f:r
    echo $f
    liftOver -genePred $b.gp /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                $b.hg16.gp $b.gp.unmapped
    wc -l $b.hg16.gp $b.gp.unmapped
    set t = encodeEgaspFull$b
    ldHgGene -genePredExt -predTab hg16 $t $b.hg16.gp
    genePredCheck -db=hg16 $t
end
'EOF'
# << for emacs
csh doGene.csh >&! doGene.log

    # process others
    perl -wpe 's/(.*transcript_id) (\S+)$/$1 $2 gene_id $2/' Genemark.gff | \
        ldHgGene -out=Genemark.gp null null stdin
        # 880 gene predictions
    liftOver -genePred  \
        Genemark.gp /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                Genemark.hg16.gp Genemark.gp.unmapped
    set t = "encodeEgaspFullGenemark"
    ldHgGene -predTab hg16 $t Genemark.hg16.gp
        # 872 gene predictions
    genePredCheck -db=hg16 $t

    # create genepreds containing just exons flanking U12 introns
    # use U12 annotation as gene name, so it appears on details page
    grep U12 geneId.gff | perl -wpe \
 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
        > geneId.introns.gff
    ldHgGene -genePredExt -out=geneId.introns.gp null null geneId.introns.gff
        #  24 gene predictions
    liftOver -genePred geneId.introns.gp \
        /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                geneId.introns.hg16.gp geneId.introns.gp.unmapped
    set t = encodeEgaspFullGeneIdU12
    ldHgGene -predTab -genePredExt hg16 $t geneId.introns.hg16.gp
    genePredCheck -db=hg16 $t

    set b = sgp2
    grep U12 $b.gff | perl -wpe \
 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
        > $b.introns.gff
    ldHgGene -genePredExt -out=$b.introns.gp null null $b.introns.gff
        #  20 gene predictions
    liftOver -genePred $b.introns.gp \
        /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                $b.introns.hg16.gp $b.introns.gp.unmapped
    set t = encodeEgaspFullSgp2U12
    ldHgGene -predTab -genePredExt hg16 $t $b.introns.hg16.gp
    genePredCheck -db=hg16 $t


##########################################################################
# TIGR Jigsaw Genes from EGASP (2005-06-20 kate)
# Contrib: Jonathan Allen <jallen@tigr.org>
# NOTE: hg17 coords

    cd /cluster/data/encode
    mkdir -p EGASP/Jigsaw/2005-06-01/lab
    cd EGASP/Jigsaw/2005-06-01/lab

    # convert to genepred, in preparation for dropping to hg16
    # NOTE: no start_codons, so omit -gtf option
    grep '^chr' lab/jigsaw_ucsc.gtf | \
        ldHgGene -out=jigsaw.gp -genePredExt null null stdin
        # Read 454 transcripts in 3949 lines in 1 files
        # 454 groups 20 seqs 1 sources 1 feature types
        # 454 gene predictions
    liftOver -genePred jigsaw.gp \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                jigsaw.hg16.gp jigsaw.gp.unmapped
    wc -l jigsaw.hg16.gp jigsaw.gp.unmapped
        # 450 jigsaw.hg16.gp
        #   8 jigsaw.gp.unmapped
    ldHgGene -predTab -genePredExt hg16 encodeEgaspUpdJigsaw jigsaw.hg16.gp
        # 450 gene predictions
    genePredCheck -db=hg16 encodeEgaspUpdJigsaw


##########################################################################
# AUGUSTUS GENES FROM EGASP (2005-06-10 kate)
    # Contrib: Mario Stanke (mstanke@gwdg.de)
    # Department of Bioinformatics of the University of Gottingen in Germany
    # submitted in hg17 coords.
    # NOTE: these were dropped from Update track -- Mario says they
        # belong in the Partial.

    ssh hgwdev
    cd /cluster/data/encode
    mkdir -p EGASP/Augustus
    cd EGASP/Augustus
    mkdir 2005-06-22
    cd 2005-06-22
    mkdir lab
    cd lab
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=3 --no-parent \
	--no-host-directories --cut-dirs=5 http://augustus.gobics.de/predictions/encode/chr_coords/ --accept=gtf

    # NOTE: remove non .gtf files
    wc -l *.gtf
   # 4075 44encode.augustus.ESTprotein.chrcoord.sgtf
   # 3701 44encode.augustus.abinitio.chrcoords.gtf
   # 4162 44encode.augustus.anyInformation.chrcoords.gtf
   # 4026 44encode.augustus.dualgenome.chrcoords.gtf
   cd ..
   ln -s lab/44encode.augustus.ESTprotein.chrcoords.gtf est.gtf
   ln -s lab/44encode.augustus.abinitio.chrcoords.gtf  abinitio.gtf
   ln -s lab/44encode.augustus.anyInformation.chrcoords.gtf any.gtf
   ln -s lab/44encode.augustus.dualGenome.chrcoords.gtf dual.gtf

    # convert to genePred
    foreach f (*.gtf)
        set b = $f:t:r
        ldHgGene -out=$b.gp -genePredExt null null $f
    end
    #Reading est.gtf
    #543 gene predictions
    #Reading abinitio.gtf
    #622 gene predictions
    #Reading any.gtf
    #571 gene predictions
    #Reading dual.gtf
    #617 gene predictions

    # lift to hg16
    foreach f (*.gp)
        set b = $f:r
        liftOver -genePred $b.gp \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                $b.hg16.gp $b.gp.unmapped
    end
    wc -l *.hg16.gp
        # 616 abinitio.hg16.gp
        # 568 any.hg16.gp
        # 612 dual.hg16.gp
        # 541 est.hg16.gp

    wc -l *.unmapped
     # 12 abinitio.gp.unmapped
        # Missing 1 or 2 exons: ENr131.g14.1, ENm006.g18.1, ENm006.g20.1,
                        # ENm006.g23.1, ENm006.g28.1, ENm006.g49.1
     # 6 any.gp.unmapped
        # ENr131.g14.1, ENm006.g21.1, ENm006.g41.1
     # 10 dual.gp.unmapped
        # ENr131.g14.1, ENm006.g16.1, ENm006.g20.1, ENm006.g25.1 , ENm006.g42.1
     # 4 esg.gp.unmapped
        # Missing 1 exon: ENr131.g12.1, ENm006.g21.1

    # load
    foreach f (*.hg16.gp)
        genePredCheck $f
        set t = `echo $f | sed -e 's/\(.*\).hg16.gp/encodeEgaspUpdAugustus\u\1/'`
        ldHgGene -predTab -genePredExt hg16 $t $f
        checkTableCoords hg16 $t
    end

##########################################################################
# EXOGEAN GENES FROM EGASP (2005-06-10 kate)
    # Contrib: Sarah Djebali, ENS-Dyogean (France)
    cd /cluster/data/encode
    mkdir -p EGASP/Exogean
    cd EGASP/Exogean
    mkdir 2005-06-23
    cd 2005-06-23
    mkdir lab
    # retrieve files from FTP site
    wc -l lab/*.gtf
    # 18958 All44_EXOGEAN_new3_ucsc.gtf
    
    # reformat file with tab delimiters , add dummy frame so we
    # can pick up transcript ID
    # convert to genePred
    perl -wpe 's/(chr\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)/$1\t$2\t$3\t$4\t$5\t$6\t$7\t.\t$8/' lab/*exogean*.gtf > exogean.gtf
    ldHgGene -out=exogean.gp null null exogean.gtf
        # 850 gene predictions
    liftOver -genePred exogean.gp \
            /cluster/data/hg17/bed/liftOver/hg17ToHg16.chain \
                exogean.hg16.gp exogean.gp.unmapped
    wc -l exogean.hg16.gp exogean.gp.unmapped
        # 834 exogean.hg16.gp
        # 32 exogean.gp.unmapped
          # 16 unmapped
          # ENr131.2.{13,14,15}
          # ENm006.2.{7,9,10,11,12,13,14}
          # ENm006.9.{1,2,21,22,23,24}

    ldHgGene -predTab hg16 encodeEgaspUpdExogean exogean.hg16.gp
        # 834 gene predictions
    genePredCheck -db=hg16 encodeEgaspUpdExogean


##########################################################################
# GENEIDU12 and SGPU12 GENES FROM EGASP (2005-06-24 kate)
    # Contrib: Tyler Alioto, IMIM, Barcelona
    cd /cluster/data/encode
    mkdir -p EGASP/GeneIdU12
    cd EGASP/GeneIdU12
    mkdir -p 2005-06-10/lab
    cd 2005-06-10
    wc -l lab/*.gtf
        # 3551 lab/GeneIDu12_hg16.gtf
        # 5020 lab/SGP2u12_hg16.gtf
    # load gene preds, replacing real-valued score with '.'
    awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < lab/GeneIDu12_hg16.gtf > geneId.gtf
    awk -F\\t '/^chr/ {printf "%s\t%s\t%s\t%s\t%s\t.\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $7, $8, $9}' < lab/SGP2u12_hg16.gtf > sgp2.gtf

    ldHgGene -genePredExt hg16 encodeEgaspUpdGeneId geneId.gtf
        # 475 gene predictions
    genePredCheck -db=hg16 encodeEgaspUpdGeneId

    ldHgGene -genePredExt hg16 encodeEgaspUpdSgp2 sgp2.gtf
        # 929 gene predictions
    genePredCheck -db=hg16 encodeEgaspUpdSgp2

    # create genepreds containing just exons flanking U12 introns
    # use U12 annotation as gene name, so it appears on details page
    grep U12 geneId.gtf | perl -wpe \
 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
        > geneId.introns.gtf
    ldHgGene -genePredExt hg16 encodeEgaspUpdGeneIdU12 geneId.introns.gtf
        #  27 gene predictions

    grep U12 sgp2.gtf | perl -wpe \
 's/(^.*gene_id) (\S+) (.*exon_type) (.*)(U12[^-]+)(.*)/$1 "$5"; $3 $4$5$6/' \
        > sgp2.introns.gtf
    ldHgGene -genePredExt hg16 encodeEgaspUpdSgp2U12 sgp2.introns.gtf
        # 20 gene predictions

##########################################################################
# EGASP Yale Pseudogenes (2005-06-25 kate)
#       Contact: Deyou Zheng <deyou.zheng@yale.edu>

    cd /cluster/data/encode
    mkdir -p EGASP/yale/2005-06-20/lab
    cd EGASP/yale/2005-06-20
    wc -l lab/*.submitted
        #  203 lab/YalePgene-NCBI34.gtf.submitted

    # munge to create CDS entries to display, and assign pseudogene
    # name as transcript_id, and pseudogene type as gene_id so
    # it displays on details page
    sed -e 's/pseudogene\t/CDS\t/' -e 's/pgene_type/gene_id/'  \
        -e 's/alt_name ENCODE_Yale/transcript_id /' \
                lab/YalePgene-NCBI34.gtf.submitted > yale.gtf
    ldHgGene -genePredExt hg16 encodeEgaspUpdYalePseudo yale.gtf
        # 203 gene predictions 


##########################################################################
# Yale Nimblegen data (DONE - 2005-06-08 - Hiram)
#	Data supplier contact: Olof Emanuelsson olof.emanuelsson@yale.edu
    ssh hgwdev
    cd /cluster/data/encode/yale/nimblegen/2005-06-01
    # symlink data files to the names as their table names will be:
D="encode_Yale_MAS_pbclassicprot_01Jun2005"

fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars-FWD_Maskless36
merevery36bp.html > \
        encodeYaleMASPlacRNATarsFwdMless36mer36bp.html

fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars-REV_Maskless36
merevery36bp.html > \
        encodeYaleMASPlacRNATarsRevMless36mer36bp.html

ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars_FWD.Maskless36merever
y36bp.bed \
        encodeYaleMASPlacRNATarsFwdMless36mer36bp.bed

ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Tars_REV.Maskless36merever
y36bp.bed \
        encodeYaleMASPlacRNATarsRevMless36mer36bp.bed

fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-FWD_
Maskless36merevery36bp.html > \
        encodeYaleMASPlacRNATransMapFwdMless36mer36bp.html

ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-FWD_Maskles
s36merevery36bp.wig \
        encodeYaleMASPlacRNATransMapFwdMless36mer36bp.wig

fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-REV_
Maskless36merevery36bp.html > \
        encodeYaleMASPlacRNATransMapRevMless36mer36bp.html

ln -s ${D}/encode_Yale_MAS_Placenta_RNA_pbclassicprot_Transcript-Map-REV_Maskles
s36merevery36bp.wig \
        encodeYaleMASPlacRNATransMapRevMless36mer36bp.wig
    #  load the tables
for T in encodeYaleMASPlacRNATarsFwdMless36mer36bp \
        encodeYaleMASPlacRNATarsRevMless36mer36bp
do
    grep "^chr" ${T}.bed | sort -k1,1 -k2,2n | \
        hgLoadBed -strict hg16 ${T} stdin
done
    # Combine these two tables into the composite: encodeYaleMASPlacRNATars
    # Yale MAS TAR - Yale MAS array, RNA Transcriptionally Active Regions,
    #	Placenta, Nimblegen Protocol

for T in encodeYaleMASPlacRNATransMapFwdMless36mer36bp \
        encodeYaleMASPlacRNATransMapRevMless36mer36bp
do
    grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \
        hgLoadBed -bedGraph=4 -strict hg16 ${T} stdin
done
    # Combine these two tables into the composite: encodeYaleMASPlacRNATransMap
    # Yale MAS RNA - Yale MAS array, RNA Transcript Map, Placenta,
    #	Nimblegen Protocol

#######################################################################
#	More Yale Nimblegen data - 2005-07-05 - Hiram
    cd /cluster/data/encode/yale/nimblegen/2005-05-31
    tar xvzf encode_Yale_MAS_nimbleprot_31May2005.tar.gz

D="encode_Yale_MAS_nimbleprot_31May2005"

fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars-FWD_Maskless36merevery36bp.html > \
	encodeYaleMASNB4RNANProtTarsFWDMless36mer36bp.html
fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars-REV_Maskless36merevery36bp.html > \
	encodeYaleMASNB4RNANProtTarsREVMless36mer36bp.html
#
fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.html > \
	encodeYaleMASNB4RNANprotTMFWDMless36mer36bp.html
fold -s -w72 ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.html > \
	encodeYaleMASNB4RNANprotTMREVMless36mer36bp.html
#
fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars-FWD_Maskless36merevery36bp.html > \
	encodeYaleMASPlacRNANprotTarsFWDMless36mer36bp.html
fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars-REV_Maskless36merevery36bp.html > \
	encodeYaleMASPlacRNANprotTarsREVMless36mer36bp.html
#
fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.html > \
	encodeYaleMASPlacRNANprotTMFWDMless36mer36bp.html
fold -s -w72 ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.html > \
	encodeYaleMASPlacRNANprotTMREVMless36mer36bp.html
####
ln -s \
  ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars_FWD.Maskless36merevery36bp.bed \
	encodeYaleMASNB4RNANProtTarsFWDMless36mer36bp.bed
ln -s \
  ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Tars_REV.Maskless36merevery36bp.bed \
	encodeYaleMASNB4RNANProtTarsREVMless36mer36bp.bed
#
ln -s \
  ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.wig \
	encodeYaleMASNB4RNANprotTMFWDMless36mer36bp.wig
ln -s \
  ${D}/encode_Yale_MAS_NB4_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.wig \
        encodeYaleMASNB4RNANprotTMREVMless36mer36bp.wig
#
ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars_FWD.Maskless36merevery36bp.bed \
	encodeYaleMASPlacRNANprotTarsFWDMless36mer36bp.bed
ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Tars_REV.Maskless36merevery36bp.bed \
	encodeYaleMASPlacRNANprotTarsREVMless36mer36bp.bed
#
ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-FWD_Maskless36merevery36bp.wig \
	encodeYaleMASPlacRNANprotTMFWDMless36mer36bp.wig
ln -s ${D}/encode_Yale_MAS_Placenta_RNA_nimbleprot_Transcript-Map-REV_Maskless36merevery36bp.wig \
	encodeYaleMASPlacRNANprotTMREVMless36mer36bp.wig

    #  load the tables
for T in encodeYaleMASNB4RNANProtTarsFWDMless36mer36bp \
	encodeYaleMASNB4RNANProtTarsREVMless36mer36bp \
	encodeYaleMASPlacRNANprotTarsREVMless36mer36bp \
	encodeYaleMASPlacRNANprotTarsFWDMless36mer36bp
do
    grep "^chr" ${T}.bed | sort -k1,1 -k2,2n | \
        hgLoadBed -strict hg16 ${T} stdin
done
    # Combine these four tables into the existing composite:
    #	encodeYaleMASPlacRNATars
    # Yale MAS TAR - Yale MAS array, RNA Transcriptionally Active Regions,
    #	Placenta, Nimblegen Protocol

for T in encodeYaleMASNB4RNANprotTMFWDMless36mer36bp \
    encodeYaleMASNB4RNANprotTMREVMless36mer36bp \
    encodeYaleMASPlacRNANprotTMFWDMless36mer36bp \
    encodeYaleMASPlacRNANprotTMREVMless36mer36bp
do
    grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \
        hgLoadBed -bedGraph=4 -strict hg16 ${T} stdin
done
    # Combine these four tables into the existing composite:
    #	encodeYaleMASPlacRNATransMap
    # Yale MAS RNA - Yale MAS array, RNA Transcript Map, Placenta,
    #	Nimblegen Protocol

#######################################################################
# Boston University ORChID track - (DONE - 2005-06-09 - Hiram)
#	data developer contact:  Jay Greenbaum jj@bu.edu
    ssh hgwdev
    cd /cluster/data/encode/BU/orchid/2005-06-09
    wget --timestamping "http://orchid.bu.edu/encode/tracks/description.html" \
        -O description.html
    wget --timestamping "http://orchid.bu.edu/encode/tracks/oh_cleavage.wig" \
        -O oh_cleavage.wig
    #	It turns out this oh_cleavage.wig file has data for three
    #	tracks in it, and all the data is precisely exactly identically
    #	the same.  So, only one of the tracks is needed.
    #	Use this perl script to split up this data into the three
    #	data files: t0 t1 t2
    ./splitTracks.pl oh_cleavage.wig
    #	encode one of the data files:
    mkdir wib
    wigEncode t0 encodeBu_ORChID1.wig wib/encodeBu_ORChID1.wib
    #	and load
    hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Bu/2005-06-09 hg16 \
        encodeBu_ORChID1 encodeBu_ORChID1.wig
    mkdir -p /gbdb/hg16/encode/Bu/2005-06-09/wib
    ln -s `pwd`/wib/encodeBu_ORChID1.wib /gbdb/hg16/encode/Bu/2005-06-09/wib
    #	trackDb entry is for: track encodeBu_ORChID1
#	shortLabel BU ORChID
#	longLabel Boston University ORChID
#	type wig -0.56 1.58
#	maxHeightPixels 128:36:16
#	windowingFunction Mean
#	color 44,44,200
#	altColor 200,44,44
#	viewLimits 0.22:0.5
#	autoScale Off


# STANFORD DNA METHYLATION, RTPCR AND CHIP (2005-06-20 angie)
    # RTPCR:
    cd /cluster/data/encode/stanford/2005-05-31/
    tail +2 lab/stanford_rtpcr \
    | perl -wpe 'chomp; @w = split("\t"); \
         $w[1] =~ s/\>/gt/;  $w[1] =~ s/(EN\w\d+) /$1_/; \
         $score = int(300 + ($w[6] / 10));  $score = 1000 if ($score > 1000); \
         $_ = "$w[2]\t$w[3]\t$w[4]\t$w[1]\t$score\t$w[5]\t$w[6]\n";' \
    | hgLoadBed hg16 \
        -noBin -sqlTable=$HOME/kent/src/hg/lib/encodeStanfordRtPcr.sql \
        encodeStanfordRtPcr stdin
    # Methylation:
    cd /cluster/data/encode/stanford/2005-05-31/
    set scoreColumn = 5
    foreach cellType (Be2C CRL1690 HCT116 HT1080 HepG2 JEG3 Snu182 U87)
      tail +2 lab/stanford_meth \
      | awk '$'$scoreColumn' != "outlier" { \
               print $2 " " $3-1 " " $4 " " $'$scoreColumn';}' \
      | hgLoadBed -bedGraph=4 hg16 encodeStanfordMeth$cellType stdin
      set scoreColumn = `expr $scoreColumn + 1`
    end
    set pri = 1
    foreach cellType (Be2C CRL1690 HCT116 HT1080 HepG2 JEG3 Snu182 U87)
      echo track encodeStanfordMeth$cellType
      echo subTrack encodeStanfordMeth
      echo shortLabel Stan Meth $cellType
      echo longLabel Stanford Methylation \($cellType cells\)
      echo priority $pri
      echo ""
      set pri = `expr $pri + 1`
    end
    ssh kolossus
    cd /cluster/data/encode/stanford/2005-05-31/
    perl -we 'while (<>) { \
      if (/^track name=(\S+)/) { \
        $track = "encodeStanfordMethSmoothed$1"; \
        close(OUT); open(OUT, ">$track.bed") || die; \
      } else { \
        @w = split; print OUT "$w[0]\t$w[1]\t$w[2]\t$w[4]\n"; \
      } }' lab/stanford_meth.bed
    sort -k4n,4n encodeStanfordMethSmoothed*.bed | head
    sort -k4n,4n encodeStanfordMethSmoothed*.bed | tail
    # range of scores: 0 to 364088
    # back on hgwdev:
    foreach f (encodeStanfordMethSmoothed*.bed)
      hgLoadBed -bedGraph=4 hg16 $f:r $f
    end
    # ChIP/chip:
    cd /cluster/data/encode/stanford/2005-05-31/
    set scoreColumn = 5
    foreach specifier (HCT116Sp1 HCT116Sp3 JurkatSp1 JurkatSp3 K562Sp1 K562Sp3)
      tail +2 lab/stanford_sp_ChIP \
      | sed -e 's/chr0/chr/;' \
      | awk '$'$scoreColumn' != "outlier" { \
               print $2 " " $3-1 " " $4 " " $'$scoreColumn';}' \
      | hgLoadBed -bedGraph=4 hg16 encodeStanfordChip$specifier stdin
      set scoreColumn = `expr $scoreColumn + 1`
    end
    set pri = 1
    foreach specifier (HCT116Sp1 HCT116Sp3 JurkatSp1 JurkatSp3 K562Sp1 K562Sp3)
      set cellType = `echo $specifier | perl -wpe 's/(Sp1|Sp3)//'`
      set factor = `echo $specifier | perl -wpe 's/(HCT116|Jurkat|K562)//'`
      echo track encodeStanfordChip$specifier
      echo subTrack encodeStanfordChip
      echo shortLabel Stan $cellType $factor
      echo longLabel Stanford ChIP/chip \($cellType cells, $factor ChIP\)
      echo priority $pri
      echo ""
      set pri = `expr $pri + 1`
    end
    ssh kolossus
    cd /cluster/data/encode/stanford/2005-05-31/
    perl -we 'while (<>) { \
      if (/^track name=(\S+)/) { \
        $track = "encodeStanfordChipSmoothed$1"; \
        $track =~ s@_ChIP/total@@;  $track =~ s@_@@; \
        close(OUT); open(OUT, ">$track.bed") || die $!; \
      } else { \
        @w = split; print OUT "$w[0]\t$w[1]\t$w[2]\t$w[4]\n"; \
      } }' lab/stanford_sp_ChIP.bed
    sort -k4n,4n encodeStanfordChipSmoothed*.bed | head
    sort -k4n,4n encodeStanfordChipSmoothed*.bed | tail
    # range of scores: 0 to 721474
    # back on hgwdev:
    foreach f (encodeStanfordChipSmoothed*.bed)
      hgLoadBed -bedGraph=4 hg16 $f:r $f
    end
    rm bed.tab


# SANGER GENOTYPE/EXPRESSION ASSOCIATION (angie 6/25/05)
    cd /cluster/data/encode/sanger/genotypeExpressionAssociation
    sed -e 's/bed5FloatScore/encodeSangerGenoExprAssociation/' \
      $HOME/kent/src/hg/lib/bed5FloatScore.sql > /tmp/tmp.sql
    grep --no-filename -v encode_region sanger*.txt \
    | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", \
           $1, $2, $3, $6, (250 + ($4 * 375)), $4}' \
    | hgLoadBed hg16 encodeSangerGenoExprAssociation stdin \
      -sqlTable=/tmp/tmp.sql
    rm bed.tab


##########################################################################
# Indels from Jim Mullikin
# Heather, June 2005

    ssh hgwdev
    cd /cluster/data/encode/NHGRI/mullikin
    hgsql hg16 < encodeIndels.sql
    split4.pl < hg16.ENCODE.DIPtrack.Q23.bed4+ > split4.out
    # use a modified makeColoredBed
    ./makeColoredBed < split4.out > encodeIndels.bed  
    # don't use -strict because we have lots of simple insertions (where chromStart = chromEnd)
    hgLoadBed hg16 encodeIndels -tab -sqlTable=encodeIndels.sql encodeIndels.bed
    # check reference length
    mysql> select chrom, chromStart, chromEnd, (chromEnd-chromStart) as size, traceName, reference, length(reference) as refsize from encodeIndels where (chromEnd-chromStart) != length(reference) and length(reference) > 1;
    # Empty set (0.07 sec)


##########################################################################
# Recombination Rates from Gil McVean
# Heather, June 2005

# 10 ENCODE regions that were resequenced:
# ENr112 (chr2)
# ENr131 (chr2)
# ENr113 (chr4)
# ENm010 (chr7)
# ENm013 (chr7)
# ENm014 (chr7)
# ENr321 (chr8)
# ENr232 (chr9)
# ENr123 (chr12)
# ENr213 (chr18)

# units are cM/Mb

# extra precision in input files

    ssh hgwdev
    cd /cluster/data/encode/oxford/mcvean

#   a few notes on the data: had to add "chr" and flip start/end
    fix.pl < ENCODE_release16.txt.2 > ENCODE_release16.bed

    hgLoadBed -strict -sqlTable=bedGraph.sql hg16 encodeRecomb ENCODE_release16.bed

##########################################################################
# Recombination Rates from Gil McVean
# Daryl, July 15, 2005

    ## data from Chris Spencer and GilMcVean in Oxford
    cd /cluster/data/encode/oxford/mcvean 
    tail +2 hotspot.txt | sed 's/p/\t/;s/q/\t/;s/ /\t/' | awk '{printf "chr%s\t%d\t%d\n", $1, $3-1, $3}' > hotspot.bed
    hgLoadBed hg16 encodeRecombHotspot hotspot.bed

##########################################################################
# HapMap SNPs 
# Heather, June 2005

# 10 ENCODE regions that were resequenced:
# ENr112 (chr2)
# ENr131 (chr2)
# ENr113 (chr4)
# ENm010 (chr7)
# ENm013 (chr7)
# ENm014 (chr7)
# ENr321 (chr8)
# ENr232 (chr9)
# ENr123 (chr12)
# ENr213 (chr18)

    ssh hgwdev
    cd /cluster/data/encode/hapmap/16c.1/alleleFrequencies
    ./formatAll.csh

	#!/bin/csh
	foreach pop (CEU CHB JPT YRI)
	    rm -f allele_freqs_$pop.bed
	    zcat allele_freqs_EN*${pop}* | awk -f split.awk > allele_freqs_$pop.bed
	end

	!/chrom/ {printf "%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%d\n",tolower($2),$3-1,$3,$1,500+(1000*($12>$15?$15:$12)),$4,$6,$11,$14,$12,$15,($12>$15?$15:$12),$17;}

    hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqCEU.sql hg16 encodeHapMapAlleleFreqCEU allele_freqs_CEU.bed
    hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqCHB.sql hg16 encodeHapMapAlleleFreqCHB allele_freqs_CHB.bed
    hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqJPT.sql hg16 encodeHapMapAlleleFreqJPT allele_freqs_JPT.bed
    hgLoadBed -strict -sqlTable=encodeHapMapAlleleFreqYRI.sql hg16 encodeHapMapAlleleFreqYRI allele_freqs_YRI.bed


##########################################################################
# HapMap resequencing coverage
# Daryl, July 11, 2005

    foreach pop (CEU CHB JPT YRI)
	grep $pop coverage.bed|awk '{printf "chr%s\t%d\t%d\t%s\t%d\n",$2,$3-1,$3,$1,$4}'|cut -f1-3,5>encodeHapMapCov$pop.bedGraph 
    end
    
    grep CHB coverage.bed|awk '{printf "chr%s\t%d\t%d\t%s\t%d\n",$2,$3-1,$3,$1,$4}'|cut -f1-3,5>encodeHapMapCovHCB.bedGraph 
    grep JAP coverage.bed|awk '{printf "chr%s\t%d\t%d\t%s\t%d\n",$2,$3-1,$3,$1,$4}'|cut -f1-3,5>encodeHapMapCovJPT.bedGraph 
    
    
    foreach pop (CEU CHB JPT YRI)
	    nice time hgLoadBed -bedGraph=4 -strict hg16 encodeHapMapCovBedGraph$pop encodeHapMapCov$pop.bedGraph > & load$pop.log
    end
    
    set wibPath = /gbdb/hg16/encode/sanger/wib
    foreach pop (CEU CHB JPT YRI)
	time wigEncode encodeHapMapCov$pop.bedGraph encodeHapMapCov$pop.wig encodeHapMapCov$pop.wib
	rm -f $wibPath/encodeHapMapCov$pop.wib
	ln -s `pwd`/encodeHapMapCov$pop.wib $wibPath
	chmod o+r . encodeHapMapCov$pop.wib
	time hgLoadWiggle -pathPrefix=$wibPath hg16 encodeHapMapCov$pop encodeHapMapCov$pop.wig
    end


##########################################################################
# ENCODE Regulome 
# Heather, June 2005

    ssh hgwdev

    # I am hand spliting these files on account of being rushed
    # Could have used /cluster/data/encode/BU/2005-06-09/splitTracks.pl

    cd /cluster/data/encode/Regulome/sens
    # waiting for more granular data from John Stam
    # this is wiggle data
    # using /gbdb/hg16/encode/Regulome/june2005

    # John Stam told me amplicons are the same for all cell lines
    # this is bed 5
    cd /cluster/data/encode/Regulome/2005-06-12/amplicons/odd
    grep -h "^chr" *CACO2.txt > odd.bed
    hgLoadBed -strict -sqlTable=odd.sql hg16 encodeRegulomeAmpliconOdd odd.bed
    cd /cluster/data/encode/Regulome/2005-06-12/amplicons/even
    grep -h "^chr" *CACO2.txt > even.bed
    hgLoadBed -strict -sqlTable=even.sql hg16 encodeRegulomeAmpliconEven even.bed

    cd /cluster/data/encode/Regulome/2005-06-12/prob
    # this is bedGraph
    grep -h "^chr" *CACO2.txt > CACO2.bed
    grep -h "^chr" *GM06900.txt > GM06900.bed
    grep -h "^chr" *SKNSH.txt > SKNSH.bed
    grep -h "^chr" *ERY.txt > ERY.bed
    grep -h "^chr" *K562.txt > K562.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeProbCACO2.sql hg16 encodeRegulomeProbCACO2 CACO2.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeProbGM06990.sql hg16 encodeRegulomeProbGM06990 GM06990.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeProbSKNSH.sql hg16 encodeRegulomeProbSKNSH SKNSH.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeProbERY.sql hg16 encodeRegulomeProbERY ERY.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeProbK562.sql hg16 encodeRegulomeProbK562 K562.bed

    cd /cluster/data/encode/Regulome/2005-06-12/quality
    # this is bed 5
    grep -h "^chr" *CACO2.txt > CACO2.bed
    grep -h "^chr" *GM06900.txt > GM06900.bed
    grep -h "^chr" *SKNSH.txt > SKNSH.bed
    grep -h "^chr" *ERY.txt > ERY.bed
    grep -h "^chr" *K562.txt > K562.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeQualityCACO2.sql hg16 encodeRegulomeQualityCACO2 CACO2.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeQualityGM06990.sql hg16 encodeRegulomeQualityGM06990 GM06990.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeQualitySKNSH.sql hg16 encodeRegulomeQualitySKNSH SKNSH.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeQualityERY.sql hg16 encodeRegulomeQualityERY ERY.bed
    hgLoadBed -strict -sqlTable=encodeRegulomeQualityK562.sql hg16 encodeRegulomeQualityK562 K562.bed


    ## Last Minute data fixes and additions
    ## Daryl; July 10, 2005
    cd /cluster/data/encode/Regulome/2005-07-08/old
    sed 's/XXX/HepG2/' ~/kent/src/hg/lib/encodeRegulomeProb.sql    > encodeRegulomeProbHepG2.sql
    sed 's/XXX/HepG2/' ~/kent/src/hg/lib/encodeRegulomeQuality.sql > encodeRegulomeQualityHepG2.sql
    splitTracks.pl stdin < ENm003_QCP_HepG2.txt
    grep -vh track t0 | hgLoadBed -strict -sqlTable=encodeRegulomeProbHepG2.sql    hg16 encodeRegulomeProbHepG2    stdin
    grep -vh track t3 | hgLoadBed -strict -sqlTable=encodeRegulomeQualityHepG2.sql hg16 encodeRegulomeQualityHepG2 stdin

    sed 's/XXX/Huh7/' ~/kent/src/hg/lib/encodeRegulomeProb.sql    > encodeRegulomeProbHuh7.sql
    sed 's/XXX/Huh7/' ~/kent/src/hg/lib/encodeRegulomeQuality.sql > encodeRegulomeQualityHuh7.sql
    cat EN*_QCP_Huh7.txt | splitTracks.pl stdin
    grep -vh track t0 t5 t10 t15 | hgLoadBed -strict -sqlTable=encodeRegulomeProbHuh7.sql    hg16 encodeRegulomeProbHuh7    stdin
    grep -vh track t3 t8 t13 t18 | hgLoadBed -strict -sqlTable=encodeRegulomeQualityHuh7.sql hg16 encodeRegulomeQualityHuh7 stdin

    cd /cluster/data/encode/Regulome/2005-07-08/new
    foreach cellLine (CACO2 GM06990 HepG2 Huh7 K562 SKNSH Adult_Erythroblast)
	set f = encodeRegulomeBase$cellLine
	cat EN*_QCP_${cellLine}_Baseline.wig | wigEncode stdin $f.wig $f.wib
	ln -s `pwd`/$f.wib       /gbdb/hg16/encode/Regulome/wib
	hgLoadWiggle -pathPrefix=/gbdb/hg16/encode/Regulome/wib hg16 $f $f.wig
    end


##########################################################################
# ENCODE RNA transcription compilation (Matt Weirauch and Todd Lowe)
# Daryl and Matt Weirauch, July 8, 2005 (reloaded with new evoFold scores on July 10 and again on July12)

    cd /cluster/data/encode/ucsc/lowe
    hgLoadBed -sqlTable=/cluster/home/daryl/kent/src/hg/lib/encodeRna.sql hg16 encodeRna encodeRna.bed -strict


##########################################################################
# TBA23 EVOFOLD  - RNA secondary structure predictions based on TBA23 (Jakob Skou Pedersen)
# Jakob Skou Pedersen, July 12, 2005
#
# The evofold program is still in development and the procedures used
# to generate the data are likely to change some over the next few
# months. The documentation of the data generation is therefore
# skipped and we start with a data file provided by me.
    ssh -C hg17
    mkdir -p /cluster/data/encode/evofold
    cd /cluster/data/encode/evofold
    cp /cluster/home/jsp/data/rnass/genome-scan/vertebrate/encode/tba23/tba_folds.bed tbaFolds.bed
    # tbaFolds.bed is a 9-column bed file: column 1-6 provide standard
    # bed information, column 7 is element length, column 8 is the RNA
    # secondary structure in parentheses format, and column nine is a
    # commaseparated list of position specific confidence scores
    # (floats).

    # do a little table swapping to use existing sql file  
    echo "RENAME TABLE evofold to jsp_tmp" | hgsql hg16
    hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg16 evofold tbaFolds.bed
    echo "RENAME TABLE evofold to encodeTba23EvoFold" | hgsql hg16
    echo "RENAME TABLE jsp_tmp to evofold" | hgsql hg16
  

##########################################################################
# ENCODE HapMap Primate Alleles
# Daryl, July 1, 2005

    ##########################################################################
    ## data acquisition, reformatting, cleaning, and checking
    ## run most of this on local disk

    ## get the data
    wget ftp://www.hapmap.org/frequencies/2005-06_16c.1_phaseI/ENCODE/non-redundant/allele_freqs_EN*.txt.gz....

    ## reformat the data and remove misformatted lines
    formatAll.csh

	#!/bin/csh
	foreach pop (CEU CHB JPT YRI)
	    rm -f allele_freqs_$pop.bed
	    zcat allele_freqs/allele_freqs_EN*${pop}* | awk -f split.awk > allele_freqs_$pop.bed
	end
	rm -f alleles.bed
	awk '$0 !~ /\?/ {printf "%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\n",$1,$2,$3,$4,$6,$8,$9}' allele_freqs_*.bed | sort -u > alleles.bed

    rm -f hg16.bed
    removeErrors.pl < alleles.bed

	#!/usr/bin/perl -W
	%line=();%dups=();
	### collect the input data from the alleles.bed file
	while (<>)
	{
	    chomp();
	    @f=split;
	    $key="$f[0].$f[1].$f[2]";
	    $val="$f[3]\t$f[4]\t$f[5]\t$f[6]\t$f[7]";
	    if (defined $line{$key})
	    {
		$val .= "\t$line{$key}";
		$dups{$key} = "$val";
	    }
	    $line{$key}=$val;
	}
	### write the output file of unique information for each position
	open(OUT,">alleles.out");
	foreach $key (sort keys %line)
	{
	    @f=split /\t/, $line{$key};
	    @k=split /\./, $key;
	    printf OUT "$k[0]\t$k[1]\t$k[2]\t$f[0]\t$f[1]\t$f[2]\t$f[3]\t$f[4]\n";
	}
	### write a file of observed errors (duplications); "?" alleles have been removed before this step.
	open(ERR,">alleles.err");
	foreach $key (sort keys %dups)
	{
	    @k=split /\./, $key;
	    printf ERR "$k[0]\t$k[1]\t$k[2]\t$dups{$key}\n";
	}

    ## QA -- counts should be the same, strand should be "+ -"; 4 nucleotides should be shown
    wc -l alleles.*
    cut -f 1-3 alleles.out | sort -u | wc -l
    cut -f 1-4 alleles.out | sort -u | wc -l
    cut -f 4   alleles.out | sort -u | wc -l
    cut -f 6   alleles.out | sort -u | xargs echo "strand " 
    cut -f 7   alleles.out | sort -u | xargs echo "ref    "
    cut -f 8   alleles.out | sort -u | xargs echo "other  "
    
    ##########################################################################
    ## Start the real work ... lift to the other assemblies
    ## requires local over.chain files or symlinks

    ln -s alleles.out hg16.bed
    liftOver hg16.bed hg16ToPanTro1.over.chain panTro1.bed hg16toPanTro1.unmapped
    grep -v "#" hg16toPanTro1.unmapped > hg16toPanTro1.unmapped.bed
    wc -l hg16.bed panTro1.bed hg16toPanTro1.unmapped.bed

    liftOver hg16.bed hg16ToHg17.over.chain    hg17.bed    hg16toHg17.unmapped
    grep -v "#" hg16toHg17.unmapped > hg16toHg17.unmapped.bed
    wc -l hg16.bed hg17.bed hg16toHg17.unmapped.bed

    liftOver hg17.bed hg17ToRheMac1.over.chain rheMac1.bed hg17toRheMac1.unmapped
    grep -v "#" hg17toRheMac1.unmapped > hg17toRheMac1.unmapped.bed
    wc -l hg17.bed rheMac1.bed hg17toRheMac1.unmapped.bed

    ##########################################################################
    ## Now that the positions are available, get the bases at those positions
    ## requires nib files on /scratch
    nice getPanTro1Base.pl < panTro1.bed > panTro1Base.bed
    #13.440u 29.590s 1:30.91 47.3%   0+0k 0+0io 2333377pf+0w

	#!/usr/bin/perl -W
	$nibPath="/scratch/panTro1/nib";
	$foo1=$foo2=@bar="";
	while (<>)
	{
	    chomp();
	    $seq="#";
	    ($chrom, $chromStart, $chromEnd, $foo1, $foo2, $strand, @bar) = split /\t/;
	    $cmd="/cluster/home/daryl/bin/i386/nibFrag $nibPath/$chrom.nib $chromStart $chromEnd $strand stdout";
	    open(RESULT, "$cmd |") or die "can't start nibFrag:\n$cmd\n";
	    <RESULT>;
	    $seq=<RESULT>; #assumes a single line
	    print "$_\t";
	    print "$seq";
	    close(RESULT);
	}

    ## requires 2bit file on /scratch
    nice getRheMac1Base.pl < rheMac1.bed > rheMac1Base.bed
    #5571.970u 501.900s 1:49:14.10 92.6%     0+0k 0+0io 2455781pf+0w

	#!/usr/bin/perl -W
	$twoBitPath="/cluster/data/rheMac1/rheMac1.2bit";
	@bar="";
	open(TMP,"</scratch/getRheMac1Base.tmp");
	while ($line=<>)
	{
	    chomp($line);
	    $seq=".";
	    ($chrom, $chromStart, $chromEnd, @bar) = split /\t/, $line;
	    $cmd="twoBitToFa ${twoBitPath}:${chrom}:${chromStart}-${chromEnd} stdout";
	    open(RESULT, "$cmd |") or die "can't start twoBitToFa:\n$cmd\n";
	    while (<RESULT>)    { $seq =  "$_";  }
	    if ($seq  =~ /^\#/) { $seq =  ".\n"; }
	    if ($line =~ /-/)   { $seq =~ tr/acgtACGT/tgcaTGCA/; }
	    print "$line\t$seq";
	    close(RESULT);
	}

    ##########################################################################
    ## Get the qual scores to go with the bases
    ## run on hgwdev -- requires the database
    nice getOneQual.pl panTro1 < panTro1Base.bed > panTro1BaseQual.bed
    #42.530u 95.540s 7:19.14 31.4%   0+0k 0+0io 4260405pf+0w

	#!/usr/bin/perl -W
	$db = shift;
	$foo=@bar="";
	while ($bed = <>)
	{
	    chomp($bed);
	    $score="0";
	    ($chrom, $foo, $chromEnd, @bar) = split /\t/, $bed;
	    $cmd="hgWiggle -db=$db -position=$chrom:$chromEnd-$chromEnd -rawDataOut quality";
	    open(RESULT, "$cmd |") or die "can't start '$cmd'\n";
	    while ($line = <RESULT>)
	    {
		if ($line =~ /^\#/) { next; }
		$score=int($line+.5);
	    }
	    print "$bed\t$score\n";
	}

    nice getOneQual.pl rheMac1 < rheMac1Base.bed > rheMac1BaseQual.bed
    #35.170u 69.300s 5:37.85 30.9%   0+0k 0+0io 4043288pf+0w

    ##########################################################################
    ## Put it all together
    ## run on local machine
    nice mergeResults.pl hg16.bed panTro1BaseQual.bed rheMac1BaseQual.bed hg16.panTro1.rheMac1.bed

	#!/usr/bin/perl -W
	if (scalar (@ARGV) != 4)  {die "usage:  mergeResults.pl hg16.bed panTro1BaseQual.bed rheMac1BaseQual.bed hg16.panTro1.rheMac1.bed"}
	$hf = shift; $cf = shift; $rf = shift; $of = shift; 
	%c=(); %r=();
	$sameC=$sameR=$sameB=$diffC=$diffR=$diffB=$missC=$missR=$missB=0;
	open(H,"<$hf") or die "can't open $hf"; 
	open(C,"<$cf") or die "can't open $cf";
	open(R,"<$rf") or die "can't open $rf";
	open(O,">$of") or die "can't open $of";
	print O "rs# humanChrom humanPos humanStrand refallele otherallele chimpChrom chimpPos chimpStrand chimpAllele chimpQual rhesusChrom rhesusPos rhesusStrand rhesusAllele rhesusQual\n";
	while (<C>) 
	{ 
	    chomp(); 
	    if    (/\s(rs\d+)\s/)          { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    elsif (/\s(BCM-HGSC\S*)\s/)    { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    elsif (/\s(JCM\S+)\s/)         { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    else  { print "Chimp  ERR: $_\n"; } 
	}
	while (<R>) 
	{ 
	    chomp(); 
	    if    (/\s(rs\d+)\s/)          { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    elsif (/\s(BCM-HGSC\S*)\s/)    { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    elsif (/\s(JCM\S+)\s/)         { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    else  { print "Rhesus ERR: $_\n"; }
	}
	while (<H>) 
	{ 
	    chomp(); 
	    if    (/\s(rs\d+)\s/)          { $rsId=$1; $rsId =~ s/ //g; }
	    elsif (/\s(BCM-HGSC\S*)\s/)    { $rsId=$1; $rsId =~ s/ //g; }
	    elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; }
	    elsif (/\s(JCM\S+)\s/)         { $rsId=$1; $rsId =~ s/ //g; }
	    else  { print "Human ERR: $_\n"; }
	    $Hchrom=$HchromStart=$HchromEnd=$Hname=$Hscore=$Hstrand=$Hnuc1=$Hnuc2=".";
	    $Cchrom=$CchromStart=$CchromEnd=$Cname=$Cscore=$Cstrand=$Cnuc1=$Cnuc2=$Cnuc=$Cqual=$C=".";
	    $Rchrom=$RchromStart=$RchromEnd=$Rname=$Rscore=$Rstrand=$Rnuc1=$Rnuc2=$Rnuc=$Rqual=$R=".";
	    ($Hchrom, $HchromStart, $HchromEnd, $Hname, $Hscore, $Hstrand, $Hnuc1, $Hnuc2) = split /\t/, $_;
	    if (defined $c{$rsId}) { $C = $c{$rsId}; ($Cchrom, $CchromStart, $CchromEnd, $Cname, $Cscore, $Cstrand, $Cnuc1, $Cnuc2, $Cnuc, $Cqual) = split /\t/, $C; 
				    if (!defined($Cqual)) { $Cqual=".";} if (!defined($Cnuc)) { $Cnuc=".";} }
	    else { $Cchrom=$CchromStart=$CchromEnd=$Cname=$Cscore=$Cstrand=$Cnuc1=$Cnuc2=$Cnuc=$Cqual=$C="."; }
	    if (defined $r{$rsId}) { $R = $r{$rsId}; ($Rchrom, $RchromStart, $RchromEnd, $Rname, $Rscore, $Rstrand, $Rnuc1, $Rnuc2, $Rnuc, $Rqual) = split /\t/, $R; 
				    if (!defined($Cqual)) { print "$_\n"; $Rqual=".";}}
	    else { $Rchrom=$RchromStart=$RchromEnd=$Rname=$Rscore=$Rstrand=$Rnuc1=$Rnuc2=$Rnuc=$Rqual=$R="."; }
	    print O "$Hname $Hchrom $HchromEnd $Hstrand $Hnuc1 $Hnuc2 ";
	    print O "$Cchrom $CchromEnd $Cstrand $Cnuc $Cqual ";
	    print O "$Rchrom $RchromEnd $Rstrand $Rnuc $Rqual\n"; 
	    if (!(defined $Cnuc) || $Cnuc eq ""|| $Cnuc eq ".") { $Cnuc="x"; $missC++; }
	    if (!(defined $Rnuc) || $Rnuc eq ""|| $Rnuc eq ".") { $Rnuc="x"; $missR++; }
	    if ( $Cnuc eq "x" && $Rnuc eq "x") { $missB++; }
	    else
	    {
		$Hnuc1 =~ tr/a-z/A-Z/; $Hnuc2 =~ tr/a-z/A-Z/; $Cnuc  =~ tr/a-z/A-Z/;
		if ( ($Cnuc eq $Hnuc1) || ($Cnuc eq $Hnuc2) ) { $sameC++; } elsif ($Cnuc ne "x"){$diffC++;}
		if ( ($Rnuc eq $Hnuc1) || ($Rnuc eq $Hnuc2) ) { $sameR++; } elsif ($Rnuc ne "x"){$diffR++;}
		if ((($Cnuc eq $Hnuc1) || ($Cnuc eq $Hnuc2)) && (($Rnuc eq $Hnuc1) || ($Rnuc eq $Hnuc2))) { $sameB++; }
		if ((($Cnuc ne $Hnuc1) && ($Cnuc ne $Hnuc2)) && (($Rnuc ne $Hnuc1) || ($Rnuc ne $Hnuc2)) && ( $Cnuc ne "x" && $Rnuc ne "x")) { $diffB++; }
	    }
	}

    ##########################################################################
    ## Get summary results
    countDiffs.pl hg16.bed panTro1Base.bed rheMac1Base.bed

	#!/usr/bin/perl -W
	if (scalar (@ARGV) != 3){die "usage:  countDiffs.pl hg16.bed panTro1Base.bed rheMac1Base.bed"}
	$hf = shift; $cf = shift; $rf = shift;
	open(H,"<$hf") or die "can't open $hf"; open(C,"<$cf") or die "can't open $cf"; open(R,"<$rf") or die "can't open $rf";
	%c=(); %r=(); $Hcount=0;
	while (<C>) 
	{
	    chomp(); 
	    if    (/\s(rs\d+)\s/)          { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    elsif (/\s(BCM-HGSC\S*)\s/)    { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    elsif (/\s(JCM\S+)\s/)         { $rsId=$1; $rsId =~ s/ //g; $c{$rsId}=$_; } 
	    else  { print "Chimp  ERR: $_\n"; } 
	}
	while (<R>) 
	{ 
	    chomp(); 
	    if    (/\s(rs\d+)\s/)          { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    elsif (/\s(BCM-HGSC\S*)\s/)    { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    elsif (/\s(JCM\S+)\s/)         { $rsId=$1; $rsId =~ s/ //g; $r{$rsId}=$_; } 
	    else  { print "Chimp  ERR: $_\n"; } 
	}
	$Csame=$Cdiff=$Cmiss=$Rsame=$Rdiff=$Rmiss=0;
	$Hchrom=$HchromStart=$HchromEnd=$Hname=$Hscore=$Hstrand=$Hnuc1=$Hnuc2=".";
	while (<H>)
	{
	    $Hcount++;
	    chomp();
	    $C=$R=".";
	    if    (/\s(rs\d+)\s/)          { $rsId=$1; $rsId =~ s/ //g; }
	    elsif (/\s(BCM-HGSC\S*)\s/)    { $rsId=$1; $rsId =~ s/ //g; }
	    elsif (/\s(CSHL-HapMap\S+)\s/) { $rsId=$1; $rsId =~ s/ //g; }
	    elsif (/\s(JCM\S+)\s/)         { $rsId=$1; $rsId =~ s/ //g; }
	    else  { print "Human  ERR: $_\n"; } 
	    if (defined $c{$rsId}) { ($Cchrom,$CchromStart,$CchromEnd,$Cname,$Cscore,$Cstrand,$Cnuc1,$Cnuc2,$Cnuc)=split /\t/,$c{$rsId};} 
	    else { $Cchrom=$CchromStart=$CchromEnd=$Cname=$Cscore=$Cstrand=$Cnuc1=$Cnuc2=$Cnuc="."; }
	    if (defined $r{$rsId}) { ($Rchrom,$RchromStart,$RchromEnd,$Rname,$Rscore,$Rstrand,$Rnuc1,$Rnuc2,$Rnuc)=split /\t/,$r{$rsId};} 
	    else { $Rchrom=$RchromStart=$RchromEnd=$Rname=$Rscore=$Rstrand=$Rnuc1=$Rnuc2=$Rnuc="."; }
	    ($Hchrom, $HchromStart, $HchromEnd, $Hname, $Hscore, $Hstrand, $Hnuc1, $Hnuc2)        = split /\t/, $_;
	    $Hnuc1 =~ tr/a-z/A-Z/; $Hnuc2 =~ tr/a-z/A-Z/; $Cnuc  =~ tr/a-z/A-Z/; $Rnuc  =~ tr/a-z/A-Z/;
	    if ($Cnuc eq "" || $Cnuc eq ".") { $Cmiss++; } elsif (($Cnuc eq $Hnuc1) || ($Cnuc eq $Hnuc2)) { $Csame++; } 
	    else { $Cdiff++;}# print "C $Hnuc1 $Hnuc2 $Cnuc $Rnuc $rsId \n";}
	    if ($Rnuc eq "" || $Rnuc eq ".") { $Rmiss++; } elsif (($Rnuc eq $Hnuc1) || ($Rnuc eq $Hnuc2)) { $Rsame++; } 
	    else { $Rdiff++;}# print "R $Hnuc1 $Hnuc2 $Cnuc $Rnuc $rsId \n";}
	}
	$Ctotl = $Csame+$Cdiff; $Cfrac = $Csame/$Ctotl;
	$Rtotl = $Rsame+$Rdiff; $Rfrac = $Rsame/$Rtotl;
	print  "\tChimp\tRhesus\tHuman\n";
	print  "totl:\t$Ctotl\t$Rtotl\t$Hcount\n";
	print  "same:\t$Csame\t$Rsame\n";
	print  "diff:\t$Cdiff\t$Rdiff\n";
	printf "frac:\t%.3f\t%.3f\n",$Cfrac,$Rfrac;
	print  "miss:\t$Cmiss\t$Rmiss\n";

    ##########################################################################
    ## Reformat once again and load the database
    awk -f makeTableBed.awk hg16.panTro1.rheMac1.bed > encodePrimateAlleles.bed
	
	$0 !~ /Chrom/ {printf "%s\t%d\t%d\t%s\t%d\t%c\t%c\t%c\t%s\t%d\t%c\t%s\t%d\t%s\t%d\t%c\t%c\t%d\n", $2,$3-1,$3,$1,0,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16}

    hgLoadBed hg16 encodePrimateAlleles encodePrimateAlleles.bed -sqlTable=encodePrimateAlleles.sql -tab -strict

#######################################################################
#	loading Riken CAGE data - 2005-06-10 - Hiram
#			completed 2005-07-06 - Hiram
#	data submitter contact: Albin Sandelin - sandelin@gsc.riken.go.jp
#
    cd /cluster/data/encode/Riken/2005-06-08
    tar xvzf CAGE_tracks.tar.gz

    D="CAGE_tracks"

    ln -s ${D}/CAGE_MINUS_STRAND_ENCODE.html encodeRikenCageMinus.html
    ln -s ${D}/CAGE_MINUS_STRAND_ENCODE.wig encodeRikenCageMinus.wig
    ln -s ${D}/CAGE_PLUS_STRAND_ENCODE.html encodeRikenCagePlus.html
    ln -s ${D}/CAGE_PLUS_STRAND_ENCODE.wig encodeRikenCagePlus.wig
    ln -s ${D}/CAGE_mapped_tags.gff encodeRikenCageMappedTags.gff
    ln -s ${D}/CAGE_mapped_tags.html encodeRikenCageMappedTags.html

    for T in encodeRikenCageMinus encodeRikenCagePlus
    do
    grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \
        hgLoadBed -strict -bedGraph=4 hg16 ${T} stdin
    done

    #	And some extra test data to help verify those tags are in the
    #	correct location:
    ./gffToBed6.pl encodeRikenCageMappedTags.gff | \
        sort -k1,1 -k2,2n > encodeRikenCageMappedTags.bed6
    hgLoadBed -strict hg16 encodeRikenCageMappedTags \
	encodeRikenCageMappedTags.bed6

    grep "\-$" encodeRikenCageMappedTags.bed.6 | sort -k1,1 -k2,2n | \
        bedItemOverlapCount hg16 -verbose=2 stdin > negOverlaps.bedGraph4

    grep "+$" encodeRikenCageMappedTags.bed.6 | sort -k1,1 -k2,2n | \
        bedItemOverlapCount hg16 -verbose=2 stdin > posOverlaps.bedGraph4

    hgLoadBed -strict -bedGraph=4 hg16 \
	encodeRikenCageMappedTagsPositive posOverlaps.bedGraph4
    hgLoadBed -strict -bedGraph=4 hg16 \
	encodeRikenCageMappedTagsNegative negOverlaps.bedGraph4

    ##### - an off by one error was discovered in the Minus.wig data
    # Albin resubmitted data, but then the PLUS data was broken and the Minus
    # was not correct either.  So, apply the following fixup script to the data:
    cd /cluster/data/encode/Riken/2005-07-04
    cat << '_EOF_' > fixup.pl
#!/usr/bin/env perl
#
use strict;
use warnings;

open (FH,"../2005-06-08/encodeRikenCageMinus.wig") or
        die "Can not open ../2005-06-08/encodeRikenCageMinus.wig";
while (my $line=<FH>)
    {
    chomp $line;
    if ($line =~ m/^chr/)
        {
        my ($chr, $start, $end, $score) = split('\s+', $line);
        printf "%s\t%d\t%d\t%d\n", $chr, $start-1, $end-1, $score;
        }
    else
        {
        print "$line\n";
        }
    }

close (FH);
'_EOF_'
    #	happy emacs
    chmod +x fixup.pl
    ./fixup.pl > encodeRikenCageMinus.wig
    # examine the region: chr4:118,973,382-118,973,413
    #	to verify that the plus and minus predicted gene start sites line
    #	up properly with the raw Mapped Tags
    #	reload this fixed data
    for T in encodeRikenCageMinus
    do
    grep "^chr" ${T}.wig | sort -k1,1 -k2,2n | \
        hgLoadBed -strict -bedGraph=4 hg16 ${T} stdin
    done

    #	The track's long label: "Riken CAGE - Predicted Gene Start Sites"
    #	Short label: "Riken CAGE"
    #	table names: encodeRikenCagePlus and encodeRikenCageMinus
    #	in the composite track: encodeRikenCage
    #	and the extra TEST TRACK ONLY tables:
    #	composite: encodeRikenCageMappedTagsScore
    #		short label: Riken CAGE MT
    #		long: Riken CAGE Mapped Tags overlap count - TEST TRACK ONLY
    #		subtracks: encodeRikenCageMappedTagsPositive
    #			encodeRikenCageMappedTagsNegative
    #   table: encodeRikenCageMappedTags
    #		Riken CAGE Tags
    #		Riken CAGE Mapped Tags - TEST TRACK ONLY


#########################################################################
# Affy 25bp Probe locations (DONE - 2005-07-29 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/encode/Affy/2005-07-18
    wget --timestamping \
'http://transcriptome.affymetrix.com/download/ENCODE/affy_june1/soft/RNA/EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft.bz2' \
	-O EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft.bz2
    bunzip2 EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft.bz2
    #       After some experimentation, it was determined that the
    #       coordinates in this EC* file are actually zero-relative
    #       and thus should be taken at face value.  No need to translate
    #       anything.  There remains a question of why there was
    #       confusion about whether this was a 1 or 0 relative file.

grep "^chr" EC_AS_GM06990_RCyP+_C01vsNULL.sig.soft \
	| awk '{print $1}' | sed -e "s/_/ /" | awk '
BEGIN {count=1}
{
printf "%s\t%d\t%d\t%d\n", $1, $2, $2+25, count; ++count;
}
' | sort -k1,1 -k2,2n | gzip > encodeAffyEncode25bpProbes.bed.gz

    hgLoadBed -strict hg16 encodeAffyEncode25bpProbes \
        encodeAffyEncode25bpProbes.bed.gz
    #	Create trackDb entry, was in human/hg16/trackDb.ra later moved
    #	up to human/trackDb.ra as ENCODE moves to hg17:
track encodeAffyEncode25bpProbes
shortLabel Affy 25bp Probes
longLabel Affymetrix 25 bp Probe Locations
group encodeTxLevels
visibility hide
type bed 4 .
priority 41

#########################################################################
#  Gencode Unannotated Region Classification (DONE - 2005-08-02 - Hiram)
    #	A number of table browser operations are going to be performed.
    #	Saving the results into this directory:
    ssh hgwdev
    mkdir /cluster/data/encode/analysis/genes_transcripts
    #	Using the table browser on genome-test
    #	clade: Vertebrate, genome: Human, Assembly: July 2003
    #	track: Gencode Genes, group: encodeGencodeGene, 
    #	region: genome
    #	no filter, no intersection, no correlation
    #	output format: "chosen fields from selected and related tables"
    #	output file: genCodeTxStartEnd.gtfLike
    #	file type returned: gzip compressed
    #	press the "get output" button and select the "Linked Tables"
    #	of gencodeGeneClass, press the "Allow Selection from Checked Tables"
    #	and then select fields:
    #	name, chrom, strand, txStart, txEnd, gencodeGeneClass.class
    #	the "get output" will now send you the file:
    #	genCodeTxStartEnd.gtfLike.gz

    #	convert that output to a bed file:
zcat genCodeTxStartEnd.gtfLike.gz | awk '
{
printf "%s\t%d\t%d\t%s_%s\t0\t%s\n", $2, $4, $5, $1, $6, $3
}
' | sort -k1,1 -k2,2n > genCode.bed

    #	And filter that for the "reference set" of Gencode genes:
    egrep \
"Known|Novel_CDS|Novel_transcript_gencode_conf|Putative_gencode_conf" \
        genCode.bed > referenceGeneExtents.bed

    #	This becomes our filter to find the exons of interest,
    #	Add a custom track line to this file:
    echo \
'track name="referenceGencodeExtents" description="reference Gencode gene set, txStart to txEnd"' \
	> referenceGeneExtents.bed.ct
    cat referenceGeneExtents.bed >> referenceGeneExtents.bed.ct
    #	And then load up this file as a custom track on hg16
    #	Back in the table browser, select Gencode Genes, full genome,
    #	custom track type of output, and ask for
    #	"Exons plus 0 bases at each end"
    #	get custom track in table browser
    #	Now you have two custom tracks,
    #	Intersect those two custom tracks, AND type of intersection,
    #	to produce a bed file referenceGencodeExons.bed
    #	Add a custom track header to this file:
    echo \
'track name="referenceGencodeExons" description="Reference Gencode Exons" visibility=3' > referenceGencodeExons.bed.ct
    cat referenceGencodeExons.bed >> referenceGencodeExons.bed.ct
    #	add the referenceGeneExtents.bed.ct to that:
    cat referenceGeneExtents.bed.ct > customTracks.bed.ct
    cat referenceGencodeExons.bed.ct >> customTracks.bed.ct
    #	And load that file as a new set of two custom tracks.
    #	Again in the table browser, do an intersection between the
    #	geneExtents track and this Exon track, but invert the Exons
    #	to give you a result that is NOT Exon which == Introns
    # Complement referenceGencodeExons before intersection/union
    #	AND of referenceGencodeExtents and referenceGencodeExons
    #	Select bed file output, file name:
    #	referenceGencodeIntrons.bed

    #	Create a special custom track that is simply a definition of all
    #	the chrom extents in hg16 to be used for intersection purposes
    echo \
'track name="chromDefinitions" description="hg16 chromosome extents, 0 to N"'
	> hg16.chroms.bed.ct
    hgsql -N -e "select chrom, size from chromInfo;" hg16 | grep -v _random \
	| awk '{ printf "%s\t0\t%d\n", $1, $2}' >> hg16.chroms.bed.ct
    #	Add this to the growing collection of tracks:
    cat hg16.chroms.bed.ct >> customTracks.bed.ct
    #	And load it up for more intersections.
    #	Again in the table browser, intersect the chromDefinitions with
    #	the referenceGencodeExtents
    #	AND of chromDefinitions and referenceGencodeExtents 
    #	Complement referenceGencodeExtents before intersection/union
    #	A bed file result of this gives you the Intergenic regions
    #	referenceIntergenicGencode.bed
    #	Now you are ready to calculate the regions
for T in referenceGencodeIntrons referenceIntergenicGencode
do
zcat ${T}.bed.ct.gz | grep -v track | sort -k1,1 -k2,2n |
awk '
    {
    if (($3 - $2) > 10000)
        { printf "%s\t%d\t%d\t%s\n", $1, $2+5000, $3-5000, $4 }
    }
' > ${T}.distal.bed4

zcat ${T}.bed.ct.gz | grep -v track | sort -k1,1 -k2,2n |
awk '
{
if (($3 - $2) > 10000)
        {
        printf "%s\t%d\t%d\t%s_5\n", $1, $2, $2+5000, $4
        printf "%s\t%d\t%d\t%s_3\n", $1, $3-5000, $3, $4
        } else {
        printf "%s\t%d\t%d\t%s_unshaved\n", $1, $2, $3, $4
        }
}
' > ${T}.proximal.bed4

done
    #	The results of these are the files to load:
-rw-rw-r--    1    23830 Aug  2 16:21 referenceIntergenicGencode.proximal.bed4
-rw-rw-r--    1     8343 Aug  2 16:21 referenceIntergenicGencode.distal.bed4
-rw-rw-r--    1   194964 Aug  2 16:21 referenceGencodeIntrons.proximal.bed4
-rw-rw-r--    1    12032 Aug  2 16:21 referenceGencodeIntrons.distal.bed4
    #	Load them into database and create trackDb entries:
    hgLoadBed -strict hg16 encodeGencodeIntronsDistal \
	referenceGencodeIntrons.distal.bed4
    hgLoadBed -strict hg16 encodeGencodeIntronsProximal \
	referenceGencodeIntrons.proximal.bed4
    hgLoadBed -strict hg16 encodeGencodeIntergenicDistal \
	referenceIntergenicGencode.distal.bed4
    hgLoadBed -strict hg16 encodeGencodeIntergenicProximal \
	referenceIntergenicGencode.proximal.bed4
    #	composite track: encodeGencodeRegions
    #	with those four subTracks
    #	track encodeGencodeRegions
    #	shortLabel Gencode Unann
    #	longLabel Gencode Unannotated Region Classification

#########################################################################
#	Classifing 6 Affy transfrag tracks and 5 Yale TAR tracks into
#	four location catagories  - (DONE - 2005-08-03 - Hiram)
#  With the Gencode Unannotated Region Classification completed above
#	And loaded into the database, this is partially a tables browser
#	operation and partially a command line operation.
#  In the table browser, create an intersection between one of
#	the four regions, for example IntronsDistal and one of these
#	eleven tracks, for example encodeAffyRnaGm06990Sites
#	Note the URL in your WEB browser for that intersected result,
#	then using your own hgTables binary, run the following loop with
#	the hgsid from the URL you see you your WEB browser.  What's
#	happening here is that there are some variables that are coming
#	from your cart, and some being changed on this command line:
ssh hgwdev
cd /cluster/data/encode/analysis/genes_transcripts
mkdir -p fetchedTracks

hgTables=$HOME/kent/src/hg/hgTables/hgTables

for G in encodeGencodeIntronsDistal encodeGencodeIntronsProximal \
        encodeGencodeIntergenicDistal encodeGencodeIntergenicProximal
do
for Q in encodeAffyRnaGm06990Sites encodeAffyRnaHeLaSites \
    encodeAffyRnaHl60SitesHr00 encodeAffyRnaHl60SitesHr02 \
    encodeAffyRnaHl60SitesHr08 encodeAffyRnaHl60SitesHr32
do
echo "Genome: ${G}"
echo "Query:  ${Q}"
${hgTables} \
"hgsid=1235494&hgta_printCustomTrackHeaders=on&boolshad.hgta_printCustomTrackHea
ders=1&hgta_ctName=tb_${G}&hgta_ctDesc=table+browser+query+on+${G}&hgta_ctVis=pa
ck&hgta_ctUrl=&fbQual=whole&fbUpBases=200&fbDownBases=200&hgta_doGetBed=get+BED"
 \
hgta_table=${G} \
hgta_intersectTable=${Q} | \
sed -e "/^Set-/d; /^Content-/d; /^$/d" > "fetchedTracks/${G}.${Q}.txt"
done
done
    #	Then, loading those 44 result files:
for R in IntergenicDistal IntergenicProximal \
        IntronsDistal IntronsProximal
do
    for T in encodeAffyRnaGm06990Sites \
        encodeAffyRnaHeLaSites \
        encodeAffyRnaHl60SitesHr00 \
        encodeAffyRnaHl60SitesHr02 \
        encodeAffyRnaHl60SitesHr08 \
        encodeAffyRnaHl60SitesHr32 \
        encodeYaleAffyNB4RARNATars \
        encodeYaleAffyNB4TPARNATars \
        encodeYaleAffyNB4UntrRNATars \
        encodeYaleAffyNeutRNATarsAll \
        encodeYaleAffyPlacRNATars
    do
    grep -v "^track" fetchedTracks/encodeGencode${R}.${T}.txt | \
        sort -k1,1 -k2,2n | hgLoadBed -strict hg16 ${T}${R} stdin
    done
done
    #	Create the trackDb entry for these 44 tables with a single
    #	composite container track:
    #	track encodeNoncodingTransFrags
    #	shortLabel Unann Transfrags
    #	longLabel Yale and Affymetrix Unannotated TransFrags

#########################################################################
#	Collapse the 44 tables into four tables (DONE - 2005-08-03 - Hiram)
#	Using the table browser, select 11 of the tables from that track
#	to use in the subtrack merge, do this four times for each of the
#	four regions.  Merging 11 tables at a time, saving four files:
ssh hgwdev
cd /cluster/data/encode/analysis/genes_transcripts
-rw-rw-r--    1    29102 Aug  3 15:34 allintergenicDistal.bed.gz
-rw-rw-r--    1    15294 Aug  3 15:38 allintergenicProximal.bed.gz
-rw-rw-r--    1    16319 Aug  3 15:43 allintronsDistal.bed.gz
-rw-rw-r--    1    79296 Aug  3 15:46 allintronsProximal.bed.gz

#	And loading those four tracks:
zcat allintergenicDistal.bed.gz | hgLoadBed -strict hg16 \
        encodeAllIntergenicDistal stdin
zcat allintergenicProximal.bed.gz | hgLoadBed -strict hg16 \
        encodeAllIntergenicProximal stdin
zcat allintronsDistal.bed.gz | hgLoadBed -strict hg16 \
        encodeAllIntronsDistal stdin
zcat allintronsProximal.bed.gz | hgLoadBed -strict hg16 \
        encodeAllIntronsProximal stdin 
    #	Create trackDb entry for a composite track containing those four:
    #	track encodeWorkshopSelections
    #	shortLabel Consens Unann TF
    #	longLabel ENCODE Consensus Unannotated Transfrags
