# for emacs: -*- mode: sh; -*-
#############################################################################
# ENCODE Analysis Working Group (AWG) tracks on hg19 
#
# Data in these tracks are generated by uniform processing pipelines that merge replicates
# and apply consistent analysis across different data production labs, so they are
# of greater interest to most users (and easier to use) than the ENCODE lab tracks.
#
# Wiki pages for AWG: 

#############################################################################
# Uniform DNase peaks (2013-1-23 DONE kate)
#
# Note - this work started in conjunction with update of ENCODE Regulation super-track
# as it is needed for the DNase Clusters track.
#
# Bob Thurman at UW ENCODE (Stam lab) generated this data by running the UW HotSpot
# pipeline on UW and Duke datasets (aligned reads).  
# It is currently hosted on the ENCODE Analysis Hub.
# Note: DNase signal tracks from AWG are also available on the hub -- these
# were generated from aligned reads using the wiggler pipeline, 
# without any replicate or lab merging.

    cd /hive/data/genomes/hg19/bed
    mkdir -p wgEncodeAwgDnaseUniform

    # Track composite will be wgEncodeAwgDnaseUniform

    # get track description, trackDb, and files from EBI site that hosts analysis hub
    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html
    wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
    cp uniformDnase.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgDnaseUniform.html
    sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh

    mkdir bb narrowPeak
    cd bb
    csh ../wget.csh >&! ../wget.out &
# 125 files (.bb), dated Nov 22, 2011
# Sample case had ~150K peaks (by bigBedInfo)
# These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats
#$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head
chr1    10180   10330   .       0       .       26.000000       -1      -1      -1

    # rename to UCSC style and convert to bed: wgEncodeAwgDnase<lab><cell>UniPk.narrowPeak
    ls *.fdr01peaks.hg19.bb | \
        perl -wpe 's^(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)^bigBedToBed $1$2$3$4$5 ../narrowPeak/$1Awg$3\u\L$2\E\u\L$4\EUniPk.narrowPeak^' > ../unbig.csh

    # convert to bed and rename
    csh ../unbig.csh >&! ../unbig.out &
    # unbig.out is empty, all went OK

    cd narrowPeak
cat > ../loadDnase.csh << 'EOF'
    set b = wgEncodeAwgDnase
    set v = ${b}Uniform
    set t = ../$b.ra
    set m = ../$b.metaDb.ra
    rm -f $t $m
    #gunzip *.gz
    foreach f (${b}*.narrowPeak)
        set table = $f:r
        hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f
        gzip $f
        set md5sum = `md5sum $f.gz | awk '{print $1}'`
        set lab = `echo $table | perl -wpe 's/(wgEncodeAwgDnase)([A-Z][a-z]+)(.*)UniPk/$2/'`
        set cell = `echo $table | perl -wpe 's/(wgEncodeAwgDnase)([A-Z][a-z]+)(.*)UniPk/$3/'`
        echo "    track $table" >> $t
        echo "    type narrowPeak" >> $t
        echo "    parent $v" >> $t
        echo "    shortLabel $cell DNase" >> $t
        echo "    longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t
        echo "    subGroups cellType=$cell treatment=None lab=$lab view=Peaks" >> $t
        echo "" >> $t

        echo "    metaObject $table" >> $m
        echo "    objType table" >> $m
        echo "    cell $cell" >> $m
        echo "    treatment None" >> $m
        echo "    composite $b" >> $m
        echo "    dataType DnaseSeq" >> $m
        echo "    dataVersion ENCODE Jan 2011 Freeze" >> $m
        echo "    dccAccession X" >> $m
        echo "    project wgEncode" >> $m
        echo "    tableName $table" >> $m
        echo "    fileName $f.gz" >> $m
        echo "    md5sum $md5sum" >> $m
        echo "    view Peaks" >> $m
        echo "" >> $m
    end
'EOF'


csh ../loadDnase.csh >&! ../loadDnase.out &

    # NOTE: error loading wgEncodeAwgDnaseDukeHuh7.5UniPk due to presence of '.'
    # I didn't catch this till much later...  renaming to *Huh75*
    hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 wgEncodeAwgDnaseDukeHuh75UniPk wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak
    # Reading wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak
    # Read 182986 elements of size 10 from wgEncodeAwgDnaseDukeHuh7.5UniPk.narrowPeak
    # Sorted
    # Creating table definition for wgEncodeAwgDnaseDukeHuh75UniPk
    # Saving bed.tab
    # Loading hg19

    set trackDb = ~/kent/src/hg/makeDb/trackDb/human/hg19
    cp wgEncodeAwgDnaseUniform.ra $trackDb
    # merge in composite settings from analysis hub, edit labels for length, treatment
    # note: raToTab and google spreadsheet greatly help with 17-char shortLabel adjustment
    # We could use a tabToRa too...

    cp wgEncodeAwgDnase.metaDb.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.ra
    pushd $trackDb/human/hg19/metaDb/alpha
    # add composite settings (as in ENCODE UW Dnase track)
    # edit cells, treatments to match CV.  
    mdbUpdate hg19 -test wgEncodeAwgDnaseUniform.ra
    mdbUpdate hg19 wgEncodeAwgDnaseUniform.ra

    popd
    # obtain lab and DCC accession from metaDb
    rm -f accession.metaDb.ra
    mdbPrint hg19 -vars="lab=UW replicate=1 dataType=DnaseSeq view=Peaks" | egrep 'metaObject|objType|lab |cell|treatment|dccAccession|dataVersion|^$' | sed -e 's/wgEncodeUwDnase/wgEncodeAwgDnaseUw/' -e 's/PkRep1/UniPk/' > accession.metaDb.ra
    mdbPrint hg19 -vars="lab=Duke dataType=DnaseSeq view=Peaks" | egrep 'metaObject|objType|lab |cell|treatment|dccAccession|dataVersion|^$' | sed -e 's/wgEncodeOpenChromDnase/wgEncodeAwgDnaseDuke/' -e 's/Pk/UniPk/' >> accession.metaDb.ra
    raMerge accession.metaDb.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.ra > metaDb.new.ra
    mv metaDb.new.ra $trackDb/metaDb/alpha/wgEncodeAwgDnaseUniform.new.ra
    # note -- could have used above to get CV cell type as well (I did this manually)

    # Add object for composite (copy from UW DNase)
    # edit to remove orphan stanzas, create wgEncodeAwgDnaseUniform.ra

    # load and print with magic number
    pushd $trackDb; make; popd
    mdbPrint hg19 -vars="composite=wgEncodeAwgDnaseUniform" > wgEncodeAwgDnaseUniform.ra
    # check in to source control

    # set up file downloads in expected locale
    mkdir downloads
    set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19

    # must use name of composite
    mkdir -p $dir/wgEncodeAwgDnaseUniform
    cd $dir/wgEncodeAwgDnaseUniform

    # setup downloads area in ENCODE 2 standard fashion (as on encodewiki 'Data Wrangler HOWTO')
    mkdir release1
    ln -s release1 beta
    ln -s release1 releaseLatest

    cd /hive/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeaks
cat  << 'EOF' > link.csh
    foreach f (*.gz)
        echo $f
        ln $f /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgDnaseUniform/releaseLatest
    end
'EOF'
    csh link.csh
    cd $dir/wgEncodeAwgDnaseUniform/releaseLatest

    # add files.txt, md5sum.txt
    encodeMkFilesList hg19 
    ln md5sum.txt releaseLatest

    # add README
    sed 's/wgEncodeUwDnase/wgEncodeAwgDnaseUniform/' $dir/wgEncodeUwDnase/README.txt > $dir/wgEncodeAwgDnaseUniform
    cp $dir/wgEncodeUwDnase/README.txt $dir/wgEncodeAwgDnaseUniform/releaseLatest

    # get file list for pushQ
    cd ~/kent/src/hg/makeDb/doc/encodeDccHg19
    /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgDnaseUniform 1 > wgEncodeAwgDnaseUniform.release1.notes

    # QA preview (vs. redmine issue)
    encodeQaInit hg19 wgEncodeAwgDnaseUniform 1 9850 -t 


#############################################################################
# Uniform TFBS (2013-1-24 IN PROGRESS kate)
#
# This track is based on data from the March 2012 ENCODE data freeze, processed by
# Anshul Kundaje's pipeline (https://sites.google.com/site/anshulkundaje/projects/idr).
# This round of uniform processing has a slightly more relaxed threshold (2% FDR vs 1% based on
# input Anshul received from users than used for the Jan 2011 data freeze (Analysis pubs and
# and UCSC Txn Factor ChIP-seq V2).  And there are more data sets (708 vs. 495)
#
# Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering
# (as in previous track).

    cd /hive/data/genomes/hg19/bed
    mkdir -p wgEncodeAwgTfbsUniform
    cd wgEncodeAwgTfbsUniform
    set build = `pwd`
    mkdir logs

    mkdir ebi; cd ebi
    # set user = encode-box-01
    # set pwd = enc*deDOWN
    wget -r -nd -np --ftp-user=$user --ftp-password=$pwd ftp://ftp-private.ebi.ac.uk/byDataType/peaks/june2012/spp/optimal
    # using the user name and password in the ENCODE wiki at:
    # http://encodewiki.ucsc.edu/EncodeDCC/index.php/Notes_on_using_the_Aspera_client_(and_EBI_ftp)
    # These are also available at:
    # http://www.ebi.ac.uk/~anshul/public/encodeRawData/peaks_spp/mar2012/distinct/idrOptimalBlackListFilt

    # NOTE: quality metrics are available here:
    # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=7
    # -> Check with Anshul for list of data sets to exclude from clustering based on quality.
    # For now, use all.
    # sample filename:
    #   spp.idrOptimal.bf.wgEncodeSydhTfbsHct116Pol2UcdAlnRep0.bam_VS_wgEncodeSydhTfbsHct116InputUcdAlnRep1.bam.narrowPeak.gz

    ls -l filelist.txt
    # 708 filelist.txt
    cd ..

    # extract UCSC object names (experiment and control) from peaks file name
    ls ebi/*.narrowPeak.gz > peak.lst
    regClusterMakeTableOfTables ans02 peak.lst partial.table -verbose=3 >&! logs/regTable.out &

    # attach metadata, by antibody and by target
    regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table -verbose=3
    regClusterAttachMetadataToTableOfTables -antibodyTarget hg19 partial.fixed.table \
        withDupes.target.table -verbose=3
    # NOTE: partial.fixed.table corrected HAIB GM12878 Egr-1 to Rep3, but lacks controls

    # NOTE next time rename files immediately, and use UCSC names in scripting

    # 91 cell types, 201 antibodies (177 targets), 28 treatments, 9 labs
    ######################

    # unzip and rename to UCSC style : wgEncodeAwgTfbs<lab><cell><antibody>[<treatment>]UniPk.narrowPeak
    # load table
    # create trackDb and metaDb entries
    # zip and md5sum
    # save cellType and antibody targets for subGroups at the end

cat > nameTable.csh << 'EOF'
    # create table name for an experiment object
    set base = $1
    set exp = $2
    set vars = `echo $exp | \
        sed -e 's/Control.*AlnRep1//' -e 's/StdAlnRep1//' -e 's/AlnRep1//' -e 's/wgEncode//' \
            -e 's/AlnRep3//' -e 's/Tfbs//' -e 's/BroadHistone/Broad/' -e 's/OpenChromChip/Uta/'`
    set table = ${base}${vars}UniPk
    echo $table
'EOF'

cat > loadOne.csh << 'EOF'
    # load table
    set infile = $1
    set exp = $2
    set outdir = $3
    set base = $4
    set table = `csh nameTable.csh $base $exp`

    hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable \
                -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \
                -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $infile
    # return value 
'EOF'

cat > makeTag.csh << 'EOF'
    echo $1 | perl -wpe 's/[^a-z0-9]+//ig; s/(.*)/\U$1/'
'EOF'

cat > targetToFactor.csh << 'EOF'
    set factor = $1
    if ($factor =~ eGFP-*) then
        echo $factor | sed 's/^eGFP-//'
    else
        echo $factor
    endif
'EOF'

cat > trackOne.csh << 'EOF'
    #create track

    set exp = $1
    set composite = $2
    set base = $3
    set cellFile = $4
    set factorFile = $5

    # get essential metadata
    set meta = `mdbPrint hg19 -obj=$exp | \
                    raToTab stdin -cols=lab,cell,antibody,treatment stdout`
    if ($status != 0) then
        echo "ERROR: $exp"
        continue
    endif

    set lab = $meta[1]
    set cell = $meta[2]
    set tier = `encodeCvTerm cell $cell tier`

    # color subtracks with tier 1 and 2 cells
    if ($tier != "3") then
        set color = `encodeCvTerm cell $cell color`
        if ($status != 0) then
            unset color
        endif
    endif
    set cellTag = `csh makeTag.csh $cell`

    set antibody = $meta[3]
    set factor = `encodeCvTerm antibody $antibody target`

    # special handling of UChicago GFP tagged targets -- strip prefix to get factor
    set factor = `csh targetToFactor.csh $factor`

    if ($#meta == 4) then
        set treatment = $meta[4]
    else
        set treatment = "None"
    endif
    echo -n "$exp $lab $cell $tier $antibody $factor $treatment"

    set factorTag = `csh makeTag.csh $factor`
    echo "$factorTag=$factor" >> $factorFile

    if ($tier != "3") then
        set cellTag = "a${tier}${cellTag}"
        echo "$cellTag=$cell&nbsp;(Tier_${tier})" >> $cellFile
    else
        echo "$cellTag=$cell" >> $cellFile
    endif

    set table = `csh nameTable.csh $base $exp`
    set trackDb = ${composite}.ra
    echo "    track $table" >> $trackDb

    # first cut at default tracks on
    if ($tier != "3" && \
        ($factor == "CTCF" || $factor == "JUN" || $factor == "E2F4" || $factor == "MYC" )) then
        echo "    parent $composite" >> $trackDb
    else
        echo "    parent $composite off" >> $trackDb
    endif

    if ($?treatment && $treatment != "None") then
        set treatmentLabel = " ($treatment)"
    else
        set treatmentLabel = ""
    endif

    echo "    shortLabel ${cell}${treatmentLabel} $factor" >> $trackDb
    echo "    longLabel $cell${treatmentLabel} TFBS Uniform Peaks of ${antibody} from ENCODE/${lab}/Analysis" >> $trackDb
    echo "    subGroups tier=a${tier}0 cellType=$cellTag factor=$factorTag lab=$lab">> $trackDb
    if ($?color) then
        echo "    color $color" >> $trackDb
    endif
    echo "" >> $trackDb
'EOF'

cat > metaOne.csh << 'EOF'
    # create metadata file

    set exp = $1
    set composite = $2
    set base = $3
    set table = `csh nameTable.csh $base $exp`
    set metaDb = ${composite}.metaDb.ra

    # get essential metadata
    set meta = `mdbPrint hg19 -obj=$exp | \
                    raToTab stdin -cols=lab,cell,antibody,treatment stdout`
    if ($status != 0) then
        echo "ERROR: $exp"
        continue
    endif

    set lab = $meta[1]
    set cell = $meta[2]
    set antibody = $meta[3]
    set factor = `encodeCvTerm antibody $antibody target`

    # special handling of UChicago GFP tagged targets -- strip prefix to get factor
    set factor = `csh targetToFactor.csh $factor`

    if ($#meta == 4) then
        set treatment = $meta[4]
    else
        set treatment = None
    endif

    # now the control info (very inconsistent)
    set meta = `mdbPrint hg19 -obj=$exp | \
                    raToTab stdin -cols=controlId,control stdout`
    if ($status != 0) then
        echo "ERROR: $exp"
        continue
    endif
    set controlId = $meta[1]

    # the 'control' term is used In SYDH, but not HAIB
    if ($#meta == 2) then
        set control = $meta[2]
    endif

    # finally, the DCC accession
    set accession = "X"
    set meta = `mdbPrint hg19 -obj=$exp | \
                    raToTab stdin -cols=dccAccession stdout`
    if ($status == 0) then
        set accession = $meta
    endif

    echo "$table :  $exp $lab $cell $antibody $treatment $controlId"
    echo "metaObject $table" >> $metaDb
    echo "objType table" >> $metaDb
    echo "cell $cell" >> $metaDb
    echo "target $factor" >> $metaDb
    echo "antibody $antibody" >> $metaDb
    if ($?control) then
        echo "control $control" >> $metaDb
    endif
    echo "controlId $controlId" >> $metaDb
    echo "lab $lab" >> $metaDb
    echo "treatment $treatment" >> $metaDb
    echo "composite $composite" >> $metaDb
    echo "dataType ChipSeq" >> $metaDb
    echo "dataVersion ENCODE Mar 2012 Freeze" >> $metaDb
    echo "dccAccession $accession" >> $metaDb
    echo "project wgEncode" >> $metaDb
    echo "tableName $table" >> $metaDb
    echo "fileName $table.narrowPeak.gz" >> $metaDb
    echo "view Peaks" >> $metaDb
    echo "" >> $metaDb
'EOF'

cat > renameOne.csh << 'EOF'
    # rename files based on generated table names
    set exp = $1
    set base = $2
    set infile = $3
    set outdir = $4
    set table = `csh nameTable.csh $base $exp`
    cp -p $infile $outdir/$table.narrowPeak.gz
'EOF'


cat > loadTfbs.csh << 'EOF'
    # parse filename and metaObjects for experiment and control from output
    # of regClusterMakeTableOfTables
    set fileList = $1
    set base = wgEncodeAwgTfbs
    set composite = ${base}Uniform

    set outdir = downloads
    mkdir -p $outdir

    @ ct = 1
    foreach x (`sed 's/\t/|/g' $fileList`)
        set obj = `echo $x | awk -F\| '{print $1, $7, $8}'`
        set infile = $obj[1]
        set exp = $obj[2]
        set ctrl = $obj[3]

        echo "${ct} Track $exp composite=$composite base=$base infile=$infile ctrl=$ctrl"
        @ ct++
        # csh loadOne.csh $infile $exp $outdir $base
        # csh trackOne.csh $exp $composite $base cells.txt factors.txt 
        # csh metaOne.csh $exp $composite $base
        #csh renameOne.csh $exp $base $infile $outdir
    end
'EOF'

#ERROR: wgEncodeHaibTfbsGm12878Egr1Pcr2xAlnRep1
# Seems there is only a Rep3 for this prefix.  There exist Rep1 and Rep2 for a different protocol
#V0416101 ( vs Pcr2x).  And the two protocols have different dccAccessions...  We will
# consider the Rep3 was used as it was later data and Anshul's filename has Pcr2x 
# (and try to verify with Anshul).
# -> handle this one manually
# NOTE: instructions for this patch may be rough

    # wgEncodeAwgTfbsHaibGm12878Egr1Pcr2xUniPk.narrowPeak.gz is an orphaned file, there is no matching met adata
    # add in missing dataset
    cd $build
    grep Gm12878Egr1 partial.table > haibFix.table  
    # edit to change to Rep3
    csh loadTfbs.csh haibFix.table >&! logs/loadHaib.out
    cp wgEncodeAwgTfbsUniform.metaDb.ra haibFix.metaDb.ra
    # edit out all but new entry
    mdbUpdate hg19 haibFix.metaDb.ra
    mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.metaDb.new.ra
    diff wgEncodeAwgTfbsUniform.metaDb.new.ra $trackDb/human/hg19/metaDb/alpha
    # just the HAIB fix
    pushd $trackDb/human/hg19/metaDb/alpha
    mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra
    cd $dir/wgEncodeAwgTfbsUniform/releaseLatest

    csh loadTfbs.csh partial.table >&! logs/loadTfbs.out &

    sort factors.txt | uniq | sed 's/$/ \\/' > factors.uniq.txt
    sort cells.txt | uniq | sed 's/$/ \\/' > cells.uniq.txt
    # insert above as subGroups in trackDb

    grep track wgEncodeAwgTfbsUniform.ra
    grep metaObject wgEncodeAwgTfbsUniform.metaDb.ra | wc -l
    # 707
    wc -l partial.table
    # 708
    grep metaObject *.metaDb.ra | grep -v Uchi | wc -l
    # 694
    # NOTE:  lost one here -- need to investigate.  And remove UChicago from metadata.
    # The lost file (from below): wgEncodeAwgTfbsHaibGm12878Egr1Pcr2xUniPk 
    # It is a single replicate, Rep3 (?!), so couldn't be located in metadata as Rep1

    set trackDb = ~/kent/src/hg/makeDb/trackDb
    cp wgEncodeAwgTfbsUniform.ra human/hg19/$trackDb
    # add composite settings (as in ENCODE SYDH TFBS track)
    # merge in composite settings from analysis hub, edit labels for length, treatment
    # note: raToTab and google spreadsheet greatly help with 17-char shortLabel adjustment
    # We could use a tabToRa too...

    # INVALID cv lookup: control = 'n/a' not found. in obj: wgEncodeAwgTfbsUtaK562CtcfUniPk
    # 274 of these 
    # INVALID cv lookup: antibody = 'Pol2(b)' DEPRECATED. in obj: wgEncodeAwgTfbsBroadHelas3Pol2bUniPk
    # 4 of these
    pushd $trackDb/human/hg19/metaDb/alpha
    mdbUpdate -delete hg19 -composite=wgEncodeAwgTfbsUniform.ra
    mdbUpdate hg19 wgEncodeAwgTfbsUniform.ra
    # 707 (note -- need to remove UChicago)
    mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra
    git add wgEncodeAwgTfbsUniform.ra  # commit

    # grab track description from previous version on the Analysis hub
    # - edit for conformance to browser style
    # - update with new counts for cell types, etc.
    # - have Anshul update (done)
    mkdir html
    cd html
    wget -nd -np http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/uniformTfbs.html
    wget -nd -np http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/trackDb.txt
    cp uniformTfbs.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgTfbsUniform.html
    cd ..

    # fix up labels
    # short (<=17)
    hgsql hg19 -e 'select shortLabel from trackDb_kate where tableName like "wgEncodeAwgTfbs%" and length(shortLabel) > 17' > badShorts.txt
    # use <cell>+<treatment> <factor>
    # to distinguish same experiment diff labs, add single char lab id:
    # h=HudsonAlpha, s=Stanford, t=UTexas-Austin, w=UWashington, c=USC, v=Harvard, y=Yale, b=Broad, i=UChicago (Illinois?)
    # Add '2, e.g. 'h2, if 2 experiments from same lab (e.g. different control or antibody)
    hgsql hg19 -e 'select longLabel from trackDb_kate where tableName like "wgEncodeAwgTfbs%" and length(longLabel) > 80' > badLongs.txt
    # Shorten by abbreviating treatments, and excise 'from' 

    # Add quality scores from Anshul's manual curation :
    # https://docs.google.com/spreadsheet/ccc?key=0Am6FxqAtrFDwdHdRcHNQUy03SjBoSVMxdUNyZV9Rdnc#gid=9 
    # add metadata term 'quality'=['good','caution'] for integrated metric [1,0]
    # drop failed datasets (quality == -1)
    # add passing U Chicago (total 5)

    cd $build
    mkdir quality
    cd quality
    # download .csv from google spreadsheet above to 
    wc -l tfbsQualityReport.csv
    # 712

    ####
    # rescue good Chicago datasets
    grep Chicago tfbsQualityReport.csv | awk -F',' '$2 != -1 {print $3}'
    #wgEncodeUchicagoTfbsK562Ehdac8ControlAlnRep0
    #wgEncodeUchicagoTfbsK562EjundControlAlnRep0
    #wgEncodeUchicagoTfbsK562EjunbControlAlnRep0
    #wgEncodeUchicagoTfbsK562Egata2ControlAlnRep0
    #wgEncodeUchicagoTfbsK562EfosControlAlnRep0

    grep Uchicago partial.table > chicago.table
    # edit out all but good datasets, listed above -> chicago.good.table

    csh loadTfbs.csh chicago.good.table >&! logs/loadTfbs.chicago.out &

    ####
    # remove failed datasets
    awk -F',' '$2 == -1 {print $3}' tfbsQualityReport.csv | grep -v chicago > bad.table
    #wgEncodeSydhTfbsGm12878Spt20StdAlnRep0
    #wgEncodeSydhTfbsGm12878Gcn5StdAlnRep0
    #wgEncodeHaibTfbsPfsk1Pol24h8V0416101AlnRep0
    #wgEncodeHaibTfbsA549Sp1V0422111Etoh02AlnRep0
        # NOTE: this one not in Anshul's file directory
    #wgEncodeSydhTfbsHepg2CebpzIggrabAlnRep0
    #wgEncodeSydhTfbsHelas3Gcn5StdAlnRep0
    #wgEncodeSydhTfbsGm12878Irf3IggmusAlnRep0
    #wgEncodeSydhTfbsK562Xrcc4StdAlnRep0
    #wgEncodeSydhTfbsHepg2Srebp2PravastStdAlnRep0
    #wgEncodeSydhTfbsHepg2Srebp1PravastStdAlnRep0
    #wgEncodeSydhTfbsHepg2Pol2PravastStdAlnRep0
    # 11 total

    # update trackDb: add stanzas, uniquify short labels, add 1 new lab, 2 new factors to subgroups
        # tweak cell type tags for tiers>3
    # update track description
    # update metaDb

    # was 695.  Added 5, removed 10 -> -5. Should be 690.
    # Read 691 metadata objects from hg19
    # Yes! (includes composite object)

    # TODO: Histogram of score distributions (may want to normalize to >1std)
    # Ref JK suggestion during code review 3/26/12

    # TODO: Add 5 chicago and remove 10 baddies links from downloads dir (and releaseLatest)

    # add quality metric from Anshul's spreadsheet
    cd quality
cat > getQuality.pl << 'EOF'
    while (<>) {
        ($id, $qual, $obj, $junk) = split(",");
        if ($qual eq "0") {
            $qual = "caution";
        } else if ($qual eq "1") {
            $qual = "good";
        } else {
            next;
        }
        $obj =~ s/Rep0/Rep1/;
        print "$obj.$qual\n";
    }
'EOF'

    perl getQuality.pl < tfbsQualityReport.csv > quality.lst
    wc -l quality.lst
    # 690

cat > addQuality.csh << 'EOF'
    set qualList = $1
    foreach q (`cat $qualList`)
    set obj = $q:r
    set qual = $q:e
    set obj = `csh ../nameTable.csh wgEncodeAwgTfbs $obj`
    echo "metaObject $obj"
    echo "quality $qual"
    echo ""
    end
'EOF'

    csh addQuality.csh quality.lst < quality.ra
    mdbUpdate hg19 quality.ra
    #Affected 689 row(s) in hg19.metaDb_kate
    # ugh, lost 1

    mdbPrint hg19 -composite=wgEncodeAwgTfbsUniform > wgEncodeAwgTfbsUniform.ra

    # NOTE: add quality typeOfTerm and the two values (caution, good) to cv.ra

# hand-tweak 3 HudsonAlpha that have problematic V02, etc. in filenames
#wgEncodeAwgTfbsHaibK562CtcfcPcr1xUniPk - peak count matches: wgEncodeHaibTfbsK562CtcfcPcr1xAlnRep0V2 (quality 1)
#wgEncodeAwgTfbsHaibU87NrsfPcr2xUniPk - peak count matches:wgEncodeHaibTfbsU87NrsfPcr2xAlnRep0V2 (quality 1)
#wgEncodeAwgTfbsHaibU87Pol24h8V0416101UniPk - peak count matches: wgEncodeHaibTfbsU87Pol24h8V0416101AlnRep0V2  (quality 1)

# Redo scores in tables and files.  Files had no scores -- we will assign using 'reg' scoring
# method (after evaluating multiple methods with 'bedScore').  Tables had encode pipeline
# scoring method -- these will be replaced.
# Tried all 4 scoring methods -- reviewing graphs with Jim, it was decided to use 'reg' scoring,
# transformed with single normalization value.


cat > scoreNarrowPeaks.csh << 'EOF'
    set method = $1
    set peakDir = $2
   set uniformDir = $peakDir/${method}.uniform
   rm -fr $dir $uniformDir
   mkdir $dir $uniformDir
   bedScore -col=7 -verbose=2 -method=$method peaks/*.narrowPeak $dir
   bedScore -uniform -col=7 -verbose=2 -method=$method peaks/*.narrowPeak $uniformDir
'EOF'

cat > doScorePeaks.csh << 'EOF'
    set outDir = scoredPeaks
    mkdir -p $outDir $outDir/logs
    foreach method (encode reg std2 asinh)
        echo "Scoring by $method"
        csh scoreNarrowPeaks.csh $method $outDir >&! $outDir/logs/$method.log
    end
'EOF'

    csh doScorePeaks.csh >&! doScoredPeaks.log

    # We will use files in scoredPeaks/reg.uniform

    # reload tables

cat > loadScored.csh << 'EOF'
    cd scoredPeaks/reg.uniform
    foreach path (*.narrowPeak)
        set table = $path:r
        echo $table
        /cluster/bin/x86_64/hgLoadBed -noNameIx -trimSqlTable \
                -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \
                -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $path
    end
'EOF'
    csh loadScored.csh >&! logs/loadScored.out &

    # replace downloads

    mv downloads downloads.orig

cat > downloadScored.csh << 'EOF'
    mkdir -p downloads
    cd scoredPeaks/reg.uniform
    foreach file (*.narrowPeak)
        echo $file
        gzip -c $file > ../../downloads/$file.gz
    end
'EOF'
    csh downloadScored.csh >&! logs/downloadScored.out &

    # set up file downloads in expected locale
    set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19
# must use name of composite
    mkdir -p $dir/wgEncodeAwgTfbsUniform
    cd $dir/wgEncodeAwgTfbsUniform

    # link files from build area
    cd /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads
cat  << 'EOF' > link.csh
    foreach f (*.gz)
        echo $f
        ln $f /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgTfbsUniform
    end
'EOF'
    csh link.csh

    # rescue dataset missing from peaks/ so not rescored
    cd $build
    csh renameOne.csh wgEncodeSydhTfbsHepg2Srebp1InslnStdAlnRep1 wgEncodeAwgTfbs ebi/spp.idrOptimal.bf.wgEncodeSydhTfbsHepg2Srebp1InslnStdAlnRep0.bam_VS_wgEncodeSydhTfbsHepg2InputInslnStdAlnRep0.bam.narrowPeak.gz downloads.orig
    set f = wgEncodeAwgTfbsSydhHepg2Srebp1InslnUniPk.narrowPeak
    zcat downloads.orig/$f.gz > peaks/$f
    bedScore -col=7 -verbose=2 -method=reg -uniform peaks/$f scoredPeaks/reg.uniform/$f
    set table = $f:r
    /cluster/bin/x86_64/hgLoadBed -noNameIx -trimSqlTable \
            -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \
            -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table scoredPeaks/reg.uniform/$f
    cd scoredPeaks/reg.uniform
    gzip -c $f > ../../downloads/$f.gz
    cd /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads
    ln $f.gz /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgTfbsUniform

    # add files.txt, md5sum.txt
    cd /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgTfbsUniform
    encodeMkFilesList hg19 -md5 -cv=/cluster/home/kate/kent/src/hg/makeDb/trackDb/cv/alpha
    # wgEncodeAwgTfbsHaibPfsk1Pol24h8V0416101UniPk.narrowPeak.gz is an orphaned file, there is no matching metadata
    rm wgEncodeAwgTfbsHaibPfsk1Pol24h8V0416101UniPk.narrowPeak.gz

    # setup downloads area in ENCODE 2 standard fashion (as on encodewiki 'Data Wrangler HOWTO')
    mkdir release1
    ln -s release1 beta
    ln -s release1 releaseLatest

    # add composite object to metaDb (use wgEncodeAwgDnaseUniform as example)

    # add README
    sed 's/wgEncodeHaibTfbs/wgEncodeAwgTfbsUniform/' $dir/wgEncodeHaibTfbs/README.txt > $dir/wgEncodeAwgTfbsUniform/README.txt
    # edit
    cp $dir/wgEncodeTfbsUniform/README.txt $dir/wgEncodeAwgTfbsUniform/releaseLatest

    # get file list for pushQ
    cd ~/kent/src/hg/makeDb/doc/encodeDccHg19
    /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgTfbsUniform 1 > wgEncodeAwgTfbsUniform.release1.notes
    # check in notes file

    # QA preview (vs. redmine issue)
    encodeQaInit hg19 wgEncodeAwgTfbsUniform 1 10042 -t 

###############################################
# Master list of 2.9M DNase sites from Analysis   (in progress kate)
# (Chromatin landscape Nature paper, Sep 6 2012)
# Filename is in Supplementary data PDF, p. 49, 
#       Supplementary Methods section 1.2, "DHS Master List and its annotation"

    cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseMasterSites
    set track = wgEncodeAwgDnaseMasterSites

    mkdir ebi; cd ebi

    wget ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/byDataType/openchrom/jan2011/combined_peaks/multi-tissue.master.ntypes.simple.hg19.bed

    wc -l *
    # 2890742 multi-tissue.master.ntypes.simple.hg19.bed

    head -3 *
    chr1    10120   10270   MCV-37  39.232800
    chr1    10440   10590   MCV-4   36.369500
    chr1    16140   16290   MCV-5   18.768900

    ave -col=5 ebi/*.bed
    # Q1 6.028710
    # median 9.877880
    # Q3 34.639700
    # average 37.963365
    # min 4.215660
    # max 5445.570000
    # count 2890742
    # total 109742293.141479
    # standard deviation 69.927595

    # These are z-scores (by my reading of the Chromatin landscape supplementary inf)

    # histogram recommends log transform when filling in score
    # Truncate to reasonable significant digits (%.2f)

    # From Bob Thurman, UW, a file with cell sources for each DHS site:
    # paste this on to bed
    multi-tissue.master.ntypes.simple.names.hg19.txt

    # create source table:  id  name
    #tr ';' '\n' < multi*.txt | sort | uniq > cells.txt
    #wc -l cells.txt
    #125 cells.txt

    # Format to use: BED5 +
    # 
    # chr start end name score floatScore sourceCount sources (cellId1, cellId2...)
    autoSql -addBin -dbLink bed5Sources.as bed5Sources

    # Paste together bed and sources, trim MCV- from item name to leave just the count
    # of cell types (Bob Thurman suggestion), replace semi-colons in source list with commas, 
    # and add score field, log transformed and scaled to 100-1000 range:
    # m = 900/(ln(max)-ln(min))
    #   -> 125.6
    # b = ln(max) * m - 1000
    #   -> 80
    # score = floatScore * m + b
    awk '{sub(/MCV-/,"",$4); printf("%s\t%u\t%u\t%u\t%d\t%.2f\n", $1,$2,$3,$4,125.6*log($5)-80,$5)}' \
        ebi/multi-tissue.master.ntypes.simple.hg19.bed > dhs.bed

    sed -e 's/;/,/g' multi-tissue.master.ntypes.simple.names.hg19.txt | \
        paste dhs.bed - > dhs.sources.bed

# Create files for loading database tables: track table (bed5Sources) 
    # and sourceTable (idName).  Use new tool to split above format (bed5 + floatScore + sources)
    # Name field has count of cell types having DHS overlapping the master site
    hgBedSources dhs.sources.bed wgEncodeAwgDnaseMaster

    # 
cat checklinks.csh  << 'EOF'
     foreach t (`awk '{print $2}' < wgEncodeAwgDnaseMasterSources.tab`)
         echo -n "$t :"
         curl -s "http://genome.ucsc.edu/cgi-bin/hgEncodeVocab?ra=encode/cv.ra&deprecated=true&term=$t" \
                 | grep -c 'argument not found'
     end
'EOF'

    # manually map to CV to create maplinks.txt
    csh maplinks.csh wgEncodeAwgDnaseMasterSources.tab > sources.fixed.tab
    mv sources.fixed.tab wgEncodeAwgDnaseMasterSources.unfixed.tab
    csh maplinks.csh wgEncodeAwgDnaseMasterSources.unfixed.tab > wgEncodeAwgDnaseMasterSources.tab

$ csh checklinks.csh wgEncodeDnaseMasterSources.tab| grep -v ':0'
# ignore those with +
CD14+ :1
CD20+ :1
CD34+_Mobilized :1
# fix these (add hyphen)
HMVEC-dBlAd :1
HMVEC-dBlNeo :1
HMVEC-dLyAd :1
HMVEC-dLyNeo :1
# needs 7.5
Huh-75 :1
# live with these (treated)
UrotheliaUt189 :1
WI-38_TAM :1
HeLa-S3s3Ifna4h :1

    # edit maplinks2.csh
    mv wgEncodeAwgDnaseMasterSources.tab wgEncodeAwgDnaseMasterSources.unfixed2.tab
    csh maplinks2.csh wgEncodeAwgDnaseMasterSources.unfixed2.tab > wgEncodeAwgDnaseMasterSources.tab

    # Load sources table
    autoSql $HOME/kent/src/hg/lib/idName.as idName
    hgLoadSqlTab hg19 wgEncodeAwgDnaseMasterSources idName.sql wgEncodeAwgDnaseMasterSources.tab

    # BED table
    mv wgEncodeAwgDnaseMaster.bed wgEncodeAwgDnaseMasterSites.bed
    hgLoadBed -noNameIx -type=bed5+3 \
        -sqlTable=$HOME/kent/src/hg/lib/bed5Sources.sql \
        -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5Sources.as \
            hg19 wgEncodeAwgDnaseMasterSites wgEncodeAwgDnaseMasterSites.bed
    # Read 2890742 elements of size 8 from wgEncodeAwgDnaseMasterSites.bed

    # Downloads
    mkdir downloads
    cp wgEncodeAwgDnaseMasterSources.tab wgEncodeAwgDnaseMasterSites.bed downloads
    cd downloads
    gzip wgEncodeAwgDnaseMasterSites.bed
    pushd

    set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19
    mkdir -p $dir/wgEncodeAwgDnaseMasterSites
    cd $dir/wgEncodeAwgDnaseMasterSites
    mkdir release1
    ln -s release1 beta
    ln -s release1 releaseLatest

    popd
    ln wgEncodeAwgDnaseMasterSites.bed.gz wgEncodeAwgDnaseMasterSources.tab \
        $dir/wgEncodeAwgDnaseMasterSites
    cd $dir/wgEncodeAwgDnaseMasterSites

    # add metadata file trackDb/human/hg19/metaDb/alpha/wgEncodeAwgDnaseMasterSites.ra
    encodeMkFilesList hg19 -md5 -cv=/cluster/home/kate/kent/src/hg/makeDb/trackDb/cv/alpha

    # Add README.txt

    # Notes file for QA
    cd ~/kent/src/hg/makeDb/doc/encodeDccHg19
    /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgDnaseMasterSites 1 > wgEncodeAwgDnaseMasterSites.release1.notes

    # QA preview (vs. redmine issue)
    encodeQaInit hg19 wgEncodeAwgDnaseMasterSites 1 13230 -t 


###############################################
# DNAseI Fingerprinting  (in progress Kate)

#From SeqAnswers!

    cd /hive/data/genomes/hg19/bed/wgEncodeAwg/dnase/footprints
    set track = wgEncodeAwgDnaseFootprint

    ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/supplementary/integration_data_jan2011/byDataType/footprints/jan2011/

    mkdir ebi; cd ebi
    set dir = ftp.ebi.ac.uk/pub/databases/ensembl/encode/supplementary/integration_data_jan2011/byDataType/footprints/jan2011/

    wget ftp://$dir/README
    wget ftp://$dir/combined.fps.gz
    wget ftp://$dir/all.footprints.gz

    gunzip combined.fps.gz

    wc -l ebi/combined.fps
    #  8374968 ebi/combined.fps
    # big file -> use bigBed

    mkdir /gbdb/hg19/$track
    set table = wgEncodeAwgDnaseCombinedFootprint
    bedToBigBed combined.fps /cluster/data/hg19/chrom.sizes $table.bb
    ln  -s `pwd`/$table.bb /gbdb/hg19/$track/$table.bb
    hgBbiDbLink hg19 $table /gbdb/hg19/$track/$table.bb

    
#############################################################################
# Segmentations (2014-2-18 kate)

# There are 3 segmentations on the AWG hub:
#       1. ChromHMM
#       2. Segway
#       3. Combined
# UCSC already has a ChromHMM track (currently in the Broad Histone supertrack, but
# this is a newer segmentation using that method (10-state vs. 15)).
# The segmentations will be formatted similarly to existing track (BED 9)
# Docs and data will be pulled from the AWG hub.

    cd /hive/data/genomes/hg19/bed
    mkdir -p wgEncodeAwgGenomeSegmentation
    cd wgEncodeAwgGenomeSegmentation
    mkdir ebi
    cd ebi

    # get track description, trackDb, and files from EBI site that hosts analysis hub
    wget http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/trackDb.txt 
    wget http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/hg19/genomeSegmentation.html 
    cp genomeSegmentation.html ~/kent/src/hg/makeDb/trackDb/human/hg19/wgEncodeAwgSegmentation.html 

    grep segmentation trackDb.txt | grep bigDataUrl | sed 's/.*\///' > files.txt
    wc -l files.txt
    # 18 (6 each for the 3 methods)

cat > getFiles.csh << 'EOF'
    foreach f (`cat files.txt`)
        echo  $f
        wget http://ftp.ebi.ac.uk/pub/databases/ensembl/encode/integration_data_jan2011/byDataType/segmentations/jan2011/hub/$f
    end
'EOF'

    # NOTE: bigBedInfo shows Segway with roughly 10x item count of ChromHMM (12M vs. 1.2M)
    # ChromHMM:  1.1M - 1.4M range
    # Segway: 12.7M - 15.8M
    # Combined: .7M - 3.2M

    # Compare hub ChromHMM files with current UCSC track
    cd ..
    zcat /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeBroadHmm/wgEncodeBroadHmmGm12878HMM.bed.gz > \
        bed/gm12878.broad.bed
    bigBedToBed ebi/gm12878.ChromHMM.bb bed/gm123878.chromHmm.bed

    #These look very different -- confirming they were redone for Sep 2012 papers.

    cd ebi
cat > expandFiles.csh << 'EOF'
    foreach f (`cat files.txt`)
        echo $f
        set b = $f:r
        set cell = `echo $b:r | perl -wpe 's/(.)(.*)/\u$1\L$2/'`
        set method = `echo $b:e | perl -wpe 's/(.)(.*)/\u$1\L$2/'`
        set out = wgEncodeAwgSegmentation${method}${cell}
        bigBedToBed $f ../bed/$out.bed
    end
'EOF'
    csh expandFiles.csh >&! expandFiles.out &

    # Load into tables
    cd bed
cat > loadFiles.csh << 'EOF'
    foreach f (*.bed)
        hgLoadBed -tab hg19 $f:r $f
    end
'EOF'
    csh loadFiles.csh >&! loadFiles.out &
    cd ..

    # Add metadata
    # add+edit trackDb/human/hg19/metaDb/alpha/wgEncodeAwgSegmentation.ra
    pushd ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha
    mdbUpdate hg19 -test wgEncodeAwgSegmentation.ra
    mdbUpdate hg19 wgEncodeAwgSegmentation.ra
    mdbPrint hg19 -composite=wgEncodeAwgSegmentation > wgEncodeAwgSegmentation.ra
    # add to makefile
    # commit to git
    popd

    # Create downloads
    cd bed
    foreach f (*.bed)
        echo $f
        gzip -c $f > ../downloads/$f.gz
    end
    cd ..

    # set up file downloads in expected locale
    set dir = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeAwgSegmentation
    mkdir -p $dir
    ln downloads/*.gz $dir

    # release1, releaseLatest ?
    ln -s release1 beta

    # make alpha in trackDb (install metaDb)

    # add files.txt, md5sum.txt
    cd $dir
    encodeMkFilesList hg19 
    md5sum *.gz > md5sum.txt
    # create README file

    /cluster/bin/scripts/encodeMkChangeNotes hg19 wgEncodeAwgSegmentation 1 > wgEncodeAwgSegmentation.release1.notes
    encodeQaInit hg19 wgEncodeAwgSegmentation 1 12259 -t

# Responses to QA

# Fixes to track description from Steve Wilder (kate 4/11/14)

    cd /hive/data/genomes/hg19/bed/wgEncodeAwgGenomeSegmentation/ebi
    cp genomeSegmentation.html genomeSegmentation.html.old
    wget  http://www.ebi.ac.uk/~swilder/ENCODE/awgHubdevel/hg19/genomeSegmentation.html
    diff genomeSegmentation.html genomeSegmentation.html.old




        





