# for emacs: -*- mode: sh; -*-

# Regulation tracks for hg38 / GRCh38

#############################################################################
# Building UW DNAse I ENCODE2 tracks (In progress 2014-09-03 Jim Kent)

#These tracks contain the results of DNAse I hypersensitivity experiments from the 
#John Stamatoyannopoulos lab at the University of Washington done for the ENCODE Project
#phase 2.

#The data was processed according to the July 2014 version of the ENCODE 3 DNAse
#processing pipeline.  At a high level this means aligning the reads
#with the bwa program against hg38 with the 'sponge' sequence, removing multiple
#mapping reads, and reads that aligned to the sponge or mitochondria,  pooling
#the results for all replicates, and running the hotspot program.  The bigWig output
#was normalized so that the average value genome-wide is 1.

#The bam files were created by the encode analysis pipeline on each replicate separately
#and the process for doing this won't be described here.  It is a bit complex, and
#really will just need to be reworked into something simpler now that we are no longer
#working directly on that contract.  This build assumes that the relevant bam
#files are in the /hive/groups/encode/3/eap/cach directory.

# To do the mapping again you'd start with fastq files and use the script
# eap_run_bwa_se on an index that included the sponge as well as hg38
# chromosomes (but not alternative haplotypes).   Bwa itself is run in a
# rather vanilla mode, with no options beyond -t 4 to parallelize the
# first pass of the alignment in 4 threads.
# The first section of methods here are to create a hub with peaks,  hotspots, and signal
# from pooled replicates.

#The detailed instructions after the bam files are available are: 

##In more detail.  First 
# Working directory for the whole UW DNase build; later steps cd back here.
mkdir /hive/data/genomes/hg38/bed/uwDnase1

## Run program to generate most of parasol batches
# The batch is written into ./batchDir; its meta.tab is picked up again by the
# "Augment metadata" step.
ssh encode-02
cd /hive/data/genomes/hg38/bed/uwDnase1
dnaseHg38Batch batchDir

## By hand, split batchDir into pooled and single replicate versions in directories
## run_pooled and run_replicates (sorry for the hand work)

## Do parasol runs on pooled
# Run the pooled batch on the ku cluster from the hand-made run_pooled directory.
# (A doubled "cd cd <dir>" is rejected by csh as too many arguments, leaving the
# shell in the login directory, so only a single cd is used here.)
ssh ku
cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled
para make
para time
#Completed: 95 of 95 jobs
#CPU time in finished jobs:    2908517s   48475.28m   807.92h   33.66d  0.092 y
#IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
#Average job time:               27838s     463.96m     7.73h    0.32d
#Longest finished job:          128043s    2134.05m    35.57h    1.48d
#Submission to last job:        128747s    2145.78m    35.76h    1.49d
#Estimated complete:                 0s       0.00m     0.00h    0.00d

## Do parasol runs on replicates (these are not actually currently used)
# Same commands as the pooled run, issued from the hand-made run_replicates directory.
ssh ku
cd /hive/data/genomes/hg38/bed/uwDnase1/run_replicates
para make
para time
#completed: 189 of 189 jobs
#CPU time in finished jobs:    4025020s   67083.66m  1118.06h   46.59d  0.128 y
#IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
#Average job time:               20115s     335.25m     5.59h    0.23d
#Longest finished job:          110245s    1837.42m    30.62h    1.28d
#Submission to last job:        111410s    1856.83m    30.95h    1.29d
#Estimated complete:                 0s       0.00m     0.00h    0.00d
#Note that one of the experiments only has replicate 2.  It's because both
#iterations of replicate 1 were deprecated.

## Augment metadata
# Input is the meta.tab inside batchDir; output is meta.tab in the build directory,
# which later steps read (label extraction for clustering, uwDnaseTrackHub).
ssh hgwdev
cd /hive/data/genomes/hg38/bed/uwDnase1
dnaseHg38AddTreatments batchDir/meta.tab meta.tab

## Do correlations between all pooled experiments 
ssh ku
cd /hive/data/genomes/hg38/bed/uwDnase1
mkdir run_correlations
cd run_correlations
# Every job writes into out/ (see the gsub template below), so create it up front
mkdir out

# Create little script to make tab separated output out of bigWigCorrelate results
#   $1, $2 - the two bigWig files to compare
#   $3     - output file: "$1<tab>$2<tab>" followed by whatever bigWigCorrelate prints
cat << '_EOF_' > corr2
#!/bin/tcsh -efx
echo -n "$1\t$2\t" > $3
bigWigCorrelate $1 $2 >> $3
'_EOF_'
  # << happy emacs
# The jobs execute the script directly, so it needs the execute bit
chmod a+x corr2

# Create gensub2 input
# corr2 is called with an explicit ./ so the jobs do not depend on "." being in the path
cat << '_EOF_' > gsub
#LOOP
./corr2 $(path1) $(path2) out/$(root1)_vs_$(root2)
#ENDLOOP
'_EOF_'
  # << happy emacs

# Run gensub2 with brand new selfPair method on all pooled files
ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig > fileList
gensub2 fileList selfPair gsub jobList

# The parasol run using just 10 CPUs because we are i/o heavy
para create jobList
para push -maxJob=10
para time
#Completed: 4465 of 4465 jobs
#CPU time in finished jobs:     349724s    5828.74m    97.15h    4.05d  0.011 y
#IO & Wait Time:                 58019s     966.98m    16.12h    0.67d  0.002 y
#Average job time:                  91s       1.52m     0.03h    0.00d
#Longest finished job:             701s      11.68m     0.19h    0.01d
#Submission to last job:         47080s     784.67m    13.08h    0.54d
#Estimated complete:                 0s       0.00m     0.00h    0.00d

# Concatenate results
# All pairwise results go into one table in the build directory (parent of run_correlations)
cat out/* > ../correlation.tab

# Set up inputs for clustering run to choose colors and make tree
cd /hive/data/genomes/hg38/bed/uwDnase1
# pooled.lst and pooled.labels both belong in the build directory: paste reads
# pooled.lst from here, and bigWigCluster (run from calcGraph) refers to ../pooled.*
ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig > pooled.lst
# Column 6 of meta.tab supplies the label that is pasted next to each pooled file
# NOTE(review): this relies on meta.tab rows being in the same order as the ls listing -- confirm
grep -v '^#' meta.tab | cut -f 6 > foo
paste pooled.lst foo > pooled.labels

# Run clustering program, which takes about 20 hours
mkdir /hive/data/genomes/hg38/bed/uwDnase1/calcGraph
cd /hive/data/genomes/hg38/bed/uwDnase1/calcGraph
mkdir -p /scratch/kent/tmpDir
bigWigCluster ../pooled.lst /hive/data/genomes/hg38/chrom.sizes uwDnase1.json uwDnase1.tab -precalc=../correlation.tab -threads=10 -tmpDir=/scratch/kent/tmpDir -labels=../pooled.labels

## Make normalized versions of wigs (might be able to incorporate this into
# the pooled job maker in the future)
ssh ku
cd /hive/data/genomes/hg38/bed/uwDnase1
mkdir run_normalized
# Strip the ".pooled.bigWig" suffix so each fileList entry is a bare experiment path;
# the gsub template below appends the suffix again for the input and uses the root
# for the output name.  The pattern must match the capital W in ".bigWig".
ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig | \
	sed 's/\.pooled\.bigWig$//' > run_normalized/fileList
cd run_normalized
mkdir out

# Make normalization script
#   $1 - input bigWig
#   $2 - output bigWig
# The scale factor is 1/mean as reported by bigWigInfo, which is what makes the
# genome-wide average of the output 1.
cat << '_EOF_' > norm1
#!/bin/tcsh -efx
set m = `bigWigInfo $1 | awk '/mean/ {print 1.0/$2}'`
bigWigToBedGraph $1 stdout | colTransform 4 stdin 0 $m tmp.bedGraph
bedGraphToBigWig tmp.bedGraph /hive/data/genomes/hg38/chrom.sizes tmp.bw
rm tmp.bedGraph
mv tmp.bw $2
'_EOF_'
  # << happy emacs
# The jobs execute the script by path, so it needs the execute bit
chmod a+x norm1
 
# Create gensub2 input
# NOTE(review): norm1 uses fixed names tmp.bedGraph/tmp.bw; presumably edwCdJob gives
# each job its own working directory -- confirm
cat << '_EOF_' > gsub
#LOOP
edwCdJob /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/norm1 $(path1).pooled.bigWig /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out/$(root1).norm.bw
#ENDLOOP
'_EOF_'
  # << happy emacs

# Do parasol run
gensub2 fileList single gsub jobList
para make jobList -maxJob=20
para time
#Completed: 95 of 95 jobs
#CPU time in finished jobs:      20273s     337.88m     5.63h    0.23d  0.001 y
#IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
#Average job time:                 189s       3.15m     0.05h    0.00d
#Longest finished job:             364s       6.07m     0.10h    0.00d
#Submission to last job:          2006s      33.43m     0.56h    0.02d
#Estimated complete:                 0s       0.00m     0.00h    0.00d


# Link results into pooled directory
ln -s /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out/*.bw /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/

# Run program to generate trackDb file.  The source is in /hg/makeDb/outside/uwDnaseTrackHub
# Move to the build directory first: calcGraph/, meta.tab and run_pooled are all
# relative to it, and colors.tab has to be written where uwDnaseTrackHub reads it.
cd /hive/data/genomes/hg38/bed/uwDnase1
# colors.tab is the clustering output with file names pointed at the normalized bigWigs
sed 's/pooled.bigWig/norm.bw/' < calcGraph/uwDnase1.tab > colors.tab
uwDnaseTrackHub meta.tab run_pooled colors.tab hub


##########################################################
# Create DNase tracks from the hub files (In progress 2014-12-09 Kate)

# There are 3 tracks (Redmine #14353):
#       1) Composite with peaks, hotspots, and signal (as on hub)
#       2) Multiwig of signal, colored by similarity (as on hub)
#       3) Clusters (as on hg19)

# The hub data file dir is:  /hive/data/genomes/hg38/bed/uwDnase1/run_pooled
# (and normalized signal files are linked into that dir from :
# /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out
# The hub trackDb file is: /hive/data/genomes/hg38/bed/uwDnase1/hub/hg38/trackDb.txt

#################################
# DNAse peaks, hotspots, and signal track, and the multiwig

# Add scores to bigBeds (use signalValue)

    cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled
    mkdir -p scored/in scored/out
    # Unpack each peak/hotspot bigBed to plain BED under scored/in
    foreach f (*.narrowPeak *.broadPeak)
        bigBedToBed $f scored/in/$f
    end
    cd scored/in
    # Score from column 7; results go to ../out, messages are collected in score.out.
    # Both runs stay in the foreground because the loop below reads ../out/*.broadPeak
    # and must not start before scoring has finished.
    bedScore -uniform -method=reg -col=7 *.narrowPeak ../out >&! score.out
    bedScore -uniform -method=reg -col=7 *.broadPeak ../out >>&! score.out

    cd ../out
    # Repack the scored hotspots as bigBed, one level up in scored/
    foreach f (*.broadPeak)
        echo $f
        bedToBigBed -type=bed6+3 -as=$HOME/kent/src/hg/lib/encode/broadPeak.as \
               $f /hive/data/genomes/hg38/chrom.sizes ../$f.bb
    end

# Link data files into gbdb and create bbi tables
    # NOTE(review): the directory is created under /hive/data/gbdb but addressed as
    # /gbdb below -- presumably /gbdb resolves to the same place; confirm
    mkdir /hive/data/gbdb/hg38/bbi/uwDnase
    cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/scored
    set bbi = /gbdb/hg38/bbi/uwDnase/scored
    mkdir $bbi

    # Hotspots: symlink each bigBed into $bbi and register it as table
    # uwEnc2DnaseHot<Exp>, where <Exp> is the file's wgEncodeEH<digits> prefix
    # with its first letter capitalized by the sed expression
    foreach f (*.broadPeak.bb)
        echo $f
        ln -s `pwd`/$f $bbi
        set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
        hgBbiDbLink hg38 uwEnc2DnaseHot${exp} $bbi/$f
    end

    # Peaks: same procedure, tables are uwEnc2DnasePeaks<Exp>
    # NOTE(review): no earlier step in this doc builds *.narrowPeak.bb in this
    # directory (only broadPeak bigBeds are made) -- confirm how these were produced
    foreach f (*.narrowPeak.bb)
        echo $f
        ln -s `pwd`/$f $bbi
        set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
        hgBbiDbLink hg38 uwEnc2DnasePeaks${exp} $bbi/$f
    end

    # Signal: each normalized bigWig is registered under two table names
    # (uwEnc2DnaseSignal<Exp> and uwEnc2DnaseWig<Exp>) that point at the same file
    cd /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out
    foreach f (*.bw)
        ln -s `pwd`/$f $bbi/$f
        set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
        hgBbiDbLink hg38 uwEnc2DnaseSignal${exp} $bbi/$f
        hgBbiDbLink hg38 uwEnc2DnaseWig${exp} $bbi/$f
    end

# Load peaks into database (needed by hgc. we may be able to drop these with code changes)
    # Load every scored narrowPeak BED as table uwEnc2DnaseBedPeaks<Exp>, using the
    # narrowPeak .sql/.as definitions from the kent source tree
    cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/scored/out
    foreach f (*.narrowPeak)
        echo $f
        # <Exp> is the wgEncodeEH<digits> prefix of the file name, first letter capitalized
        set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
        # earlier variant with -fillInScore=signalValue, kept for reference:
        #hgLoadBed -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 uwEnc2DnaseBedPeaks${exp} $f
        hgLoadBed -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 uwEnc2DnaseBedPeaks${exp} $f
    end

# Use cell curation to make more informative long labels
    cd /hive/data/genomes/hg38/bed/uwDnase1
    # Dump the wgEncodeCell table (no header, -N) and hand it to uwDnaseTrackHub;
    # the hub is regenerated into kateHub4.
    # NOTE: the cells/ directory and the wgEncodeCell table are made in the
    # "Cell table" section further down in this doc.
    hgsql hg38 -Ne 'select * from wgEncodeCell' > cells/cellInfo.tab
    uwDnaseTrackHub -cellFile=cells/cellInfo.tab meta.tab run_pooled colors.tab kateHub4

# Convert trackDb from hub to native
    cd /hive/data/genomes/hg38/bed/uwDnase1/
    mkdir tracks
    cd tracks
    cp ../kateHub4/hg38/trackDb.txt .
    # Two edits: "type bigBed" becomes "type bigBed 6 +", and every line containing
    # bigDataUrl is deleted
    sed -e 's/type bigBed/type bigBed 6 +/' -e '/bigDataUrl/d' trackDb.txt > trackDb.ra
    cp trackDb.ra ~/kent/src/hg/makeDb/trackDb/human/hg38/uwDnase.ra

# NOTE: hotspot5 had a bug causing it to silently abort peak identification on some datasets 
# before completing all chromosomes.  I fixed this issue, and created hotspot5.1 version, which
# was used to remake the broad and narrow peaks.  The corrected bigBeds are dated Feb. 13-19.
#
# The hotspot tool is hosted on hive at:  /hive/groups/encode/encode3/tools/hotspot-distr
# The version is v4 (though the tool prints hotspot5 as its name)
# The fix is:
#
# diff  hotspot-deploy-made/src/InputDataReader.cpp  hotspot-deploy-fixed/src/InputDataReader.cpp
# 74a75
# >                         numLines++;
# I ran this by Bob Thurman who currently maintains the package, and he thought it reasonable -- said
# he would test but we've not heard back.  I will post it to gitHub as an issue sometime.

# Scored files are:
#       /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_pooled_hotspot5.1/scored/*broadPeak.bb
#       /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_pooled_hotspot5.1/scored/out/*.narrowPeak

#################################
# DNase clusters track (Kate Feb 2015)

    cd /hive/data/genomes/hg38/bed/uwDnase1
    mkdir clusters
    # cd run_pooled/scored/out
    # Work from the remade (hotspot5.1) peaks; all outputs go to ../../../clusters
    cd run_pooled_hotspot5.1/scored/out
    ls *.pooled.narrowPeak > ../../../clusters/peak.lst

    # calculate normalization factor
    # Kept in the foreground: regCluster below reads peak.table, so it must be complete
    regClusterMakeTableOfTables -verbose=3 eapDnase01Hg38 \
        ../../../clusters/peak.lst ../../../clusters/peak.table >&! ../../../clusters/regTable.out

    # cluster
    # Also in the foreground: the filtering step that follows reads peak.bed
    regCluster -bedSources ../../../clusters/peak.table /dev/null ../../../clusters/peak.bed \
        >&! ../../../clusters/regCluster.out
    # 2011652 singly-linked clusters, 2076756 clusters in 96 chromosomes
    # NOTE: more clusters (2.2M) in hg19 (which included Duke data)

    # filter out low scoring
    cd ../../../clusters
    # Trial filter on score (column 5) >= 100; only the line count is used, the
    # unfiltered peak.bed is what gets loaded below
    awk '$5 >= 100' peak.bed > peak.filtered.bed
    wc -l peak.filtered.bed
    # 1330766 peak.filtered.bed
    # retained 66% vs 83% in hg19 (seems low ??)

    # --> keep them all for now, filter with UI

    # format to BED5+floatscore+sources for hgBedSources 
    #   which will extract, uniquify, and assign ID's to sources
    # (column 6 is written as a constant 0; column 7 of peak.bed is carried over)
    awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, 0, $7;}' peak.bed > peak.bed6
    hgBedSources peak.bed6 regDnase
    mv regDnaseSources.tab uwEnc2DnaseSources.tab

    # hand edit to fix curation --  add RA treatments where needed
    # NHBE_RA -> NHBE_RA+RA
    # SK-N-SH_RA -> SK-N-SH_RA+RA

    # load sources table
    autoSql $HOME/kent/src/hg/lib/idName.as idName
    hgLoadSqlTab hg38 uwEnc2DnaseSources idName.sql uwEnc2DnaseSources.tab

    # merge files and format to BED5+sourceCount+sourceIds+sourceVals
    # (column 8 of peak.bed is pasted on as the last column of the reformatted regDnase.bed)
    awk '{print $8}' peak.bed > peak.vals
    awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, $7, $8;}' regDnase.bed | \
        paste - peak.vals > uwEnc2DnaseClustered.bed
    hgLoadBed hg38 uwEnc2DnaseClustered -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \
        -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as uwEnc2DnaseClustered.bed

    # create inputs file to display metadata on details page
    # NOTE: this can probably be jettisoned in favor of new code, since source info
    # is now in the BED file
# makeInputs.csh prints one "table<tab>cell<tab>treatment" line per uwEnc2DnasePeaks*
# table: the experiment id is recovered from the table name, the table name is
# rewritten from ...Peaks... to ...BedPeaks..., and cell/treatment come from encodeExp.
cat > makeInputs.csh << 'EOF'
    set tables = `hgsql hg38 -Ne "show tables like 'uwEnc2DnasePeaks%'"`
    foreach t ($tables)
        set exp = `echo $t | sed 's/uwEnc2DnasePeaksWgEncode/wgEncode/'`
        set t = `echo $t | sed 's/Peaks/BedPeaks/'`
        set cell = `encodeExp show $exp cellType`
        set treatment = `encodeExp show $exp treatment`
        echo "$t\t$cell\t$treatment"
    end
'EOF'
    csh makeInputs.csh > inputs.tab
    hgLoadSqlTab hg38 uwEnc2DnaseInputs ~/kent/src/hg/lib/clusterInputEapDnase.sql inputs.tab

    # try bigBed version
    # Same rows with the table names switched back from BedPeaks to Peaks; the
    # BED-based table is kept as uwEnc2DnaseInputs_old
    sed 's/BedPeaks/Peaks/' inputs.tab > bigInputs.tab
    hgsql hg38 -e 'alter table uwEnc2DnaseInputs rename to uwEnc2DnaseInputs_old'
    hgLoadSqlTab hg38 uwEnc2DnaseInputs ~/kent/src/hg/lib/clusterInputEapDnase.sql bigInputs.tab

    #Hmm, hgc peakClusters doesn't appear to work with bigBed peak files...
    # Revert trackDb to BED peak files


#################
# Load up data.  Clean up peaks & hotspots a little first (truncate signal, -log10 of pVal) 
# and then rename all -- replacing wgEncodeEH* with cell and treatment, to help users.  
# Change prefix to match hg19
# (kate 2015-06-17)

    cd /hive/data/genomes/hg38/bed/uwDnase1
    mkdir load
    cd load
# expToFile.csh: given a name starting with wgEncodeEH<digits>, print the
# concatenation <Cell><Treatment> used to build the new table/file names.
# The digits are passed to encodeExp; each value has -, + and _ removed, its first
# character upper-cased and the rest lower-cased (\U and \L in the sed replacement).
cat > expToFile.csh << 'EOF'
#!/bin/csh -ef
    set exp = `echo $1 | sed 's/wgEncodeEH\([0-9]*\).*/\1/'`
    set cell = `encodeExp show $exp cellType | sed -e 's/[-+_]//g' -e 's/\(.\)\(.*\)/\U\1\L\2/'`
    set treatment = `encodeExp show $exp treatment | sed -e 's/[-+_]//g' -e 's/\(.\)\(.*\)/\U\1\L\2/'`
    echo ${cell}${treatment}
'EOF'

# renameWig.csh: for every normalized signal bigWig, derive the new table name
# wgEncodeRegDnaseUw<Cell><Treatment>Signal (via expToFile.csh, found through the
# current directory being prepended to the path), symlink the file into $bbi under
# that name, register it with hgBbiDbLink, and append a matching sed expression to
# edit.csh in the directory the script was started from.
# edit.csh is removed with "rm -f": the script runs under "csh -e", so a plain rm
# would abort the whole script on the first run, when no edit.csh exists yet.
cat > renameWig.csh << 'EOF'
#!/bin/csh -ef
set bbi = /gbdb/hg38/bbi/wgEncodeRegDnase
mkdir -p $bbi
set path = (`pwd` $path)
set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase
set data = run_normalized/out
set wd = `pwd`
pushd $build/$data
rm -f $wd/edit.csh
foreach f (*.norm.bw)
    echo "${f} "
    set exp = $f:r:r
    set vars = `expToFile.csh $exp`
    set t = wgEncodeRegDnaseUw${vars}Signal
    echo "-e s/uwEnc2DnaseSignal${exp}/$t/ \" >> $wd/edit.csh
    ln -s $build/$data/$f $bbi/$t.bw
    hgBbiDbLink hg38 $t $bbi/$t.bw
end
popd
'EOF'

# loadPeak.csh: load every scored narrowPeak file as table
# wgEncodeRegDnaseUw<Cell><Treatment>Peak.  The awk filter rewrites two columns on
# the way in: column 7 is truncated to an integer, and column 8 is replaced by
# -log10 of its value (a value of exactly 0 is first replaced by 4.94e-324 so the
# logarithm is defined).
cat > loadPeak.csh << 'EOF'
#!/bin/csh -ef
set path = (`pwd` $path)
set wd = `pwd`
set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase
set data = run_pooled/scored/out
pushd $build/$data
foreach f (*.narrowPeak)
    set exp = $f:r:r
    set vars = `expToFile.csh $exp`
    set t = wgEncodeRegDnaseUw${vars}Peak
    echo "${f}   ${t}"
    # truncate signal value to int and transform pValue to -log10
    # -log10 pValue
    awk 'BEGIN {OFS="\t"} {{if ($8==0) $8=4.94e-324; {$7=int($7); $8=-log($8)/log(10);print}}}' $f | \
        hgLoadBed  -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql \
        -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 $t stdin
end
'EOF'

# loadHot.csh: rebuild each hotspot (broadPeak) bigBed after cleaning the BED the
# same way as the peaks: column 7 truncated to an integer, column 8 replaced by
# -log10 of its value (0 is first replaced by 4.94e-324).  The bigBed is written
# one directory above the scored BED files; linking and registration are left
# commented out.
# The backslash ending the bedToBigBed line must be the last character on that
# line: with a blank after it the continuation is lost and the command runs
# without its input and output arguments.
cat > loadHot.csh << 'EOF'
#!/bin/csh -ef
set bbi = /gbdb/hg38/bbi/wgEncodeRegDnase
mkdir -p $bbi
set path = (`pwd` $path)
set wd = `pwd`
set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase
set data = run_pooled/scored/out
pushd $build/$data
foreach f (*.broadPeak)
    set exp = $f:r:r
    set vars = `expToFile.csh $exp`
    set t = wgEncodeRegDnaseUw${vars}Hotspot
    echo "${f}   ${t}"
    #echo "-e s/uwEnc2DnaseHot${exp}/$t/ \\" >> $wd/edit2.csh
    awk 'BEGIN {OFS="\t"} {{if ($8==0) $8=4.94e-324; {$7=int($7); $8=-log($8)/log(10);print}}}' $f > bed.tmp
    bedToBigBed -type=bed6+3 -as=${HOME}/kent/src/hg/lib/encode/broadPeak.as \
                bed.tmp /hive/data/genomes/hg38/chrom.sizes ../$f.bb
    #ln -s $build/$data/$f $bbi/$t.broadPeak.bb
    #hgBbiDbLink hg38 $t $bbi/$t.broadPeak.bb
end
'EOF'

# Manually tweak edit.csh and run to replace table and track names in trackDb.ra
# Do same for broadPeak bigBeds, and narrowPeak tables

# Apply the hand-tweaked rename script to the cluster inputs list and load the
# result under the hg19-style name
# NOTE(review): edit5.csh is presumably a manually edited descendant of the
# generated edit.csh -- it is not created anywhere in this doc
csh edit5.csh ../clusters/bigInputs.tab > ../clusters/wgEncodeRegDnaseClusteredInputs.tab
cd ../clusters
hgLoadSqlTab hg38 wgEncodeRegDnaseClusteredInputs \
        ~/kent/src/hg/lib/clusterInputEapDnase.sql wgEncodeRegDnaseClusteredInputs.tab

# Rename the cluster and sources tables to the wgEncodeRegDnase prefix
hgsql hg38 -e "alter table uwEnc2DnaseClustered rename to wgEncodeRegDnaseClustered"
hgsql hg38 -e "alter table uwEnc2DnaseSources rename to wgEncodeRegDnaseClusteredSources"


#################
# Cell table (use for cell metadata, instead of metaDb)

    cd /hive/data/genomes/hg38/bed/uwDnase1
    mkdir cells
    cd cells

    # collect cell info from ENCODE2 and ENCODE3
    ~/kent/src/hg/encode3/cellsFromEncode3.py > cells.tsv

    # load to google spreadsheet and clean 
    #https://docs.google.com/a/soe.ucsc.edu/spreadsheets/d/10EWdr-JTtDvfLKKLPvP3T2ft6MZ5KVdKbBzY76SRaug/edit#gid=1783710206

    # extract useful columns to file and load
    # (tail drops the first line of the file; nl -v 0 prepends a line number
    # counting from 0 to every remaining row)
    # NOTE(review): wgEncodeCell.tab is presumably the export of the cleaned
    # spreadsheet -- it is not produced by a command in this doc
    tail -n +2 wgEncodeCell.tab | \
        nl -v 0 | \
        hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql stdin

    # add order URL's
    #https://docs.google.com/spreadsheets/d/14HvZfqJdClt6mfcwf2w0xRPvdhk5o7bgV77LMc1qTcU/edit?usp=sharing
    # Same load again, this time from the sheet that includes the URLs
    tail -n +2 wgEncodeCellUrl.tsv | \
        nl -v 0 | \
        hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql stdin

    # to verify links
    checkUrlsInTable hg38 wgEncodeCell > errs.txt

# Update table when cron of above reports errors

# Record of the URLs reported as bad, each followed by the HTTP status returned
cat > badUrls.txt << 'EOF'
http://www.sciencellonline.com/site/productInformation.php?keyword=1830 302
http://www.sciencellonline.com/site/productInformation.php?keyword=1820 302
http://www.sciencellonline.com/site/productInformation.php?keyword=7110 302
http://www.sciencellonline.com/site/productInformation.php?keyword=1810 302
http://www.sciencellonline.com/site/productInformation.php?keyword=1000 302
http://www.sciencellonline.com/site/productInformation.php?keyword=1100 302
http://www.sciencellonline.com/site/productInformation.php?keyword=6300 302
http://www.sciencellonline.com/site/productInformation.php?keyword=6320 302
http://www.sciencellonline.com/site/productInformation.php?keyword=200  302
http://www.sciencellonline.com/site/productInformation.php?keyword=1310 302
http://www.sciencellonline.com/site/productInformation.php?keyword=6570 302
http://www.sciencellonline.com/site/productInformation.php?keyword=2720 302
http://www.sciencellonline.com/site/productInformation.php?keyword=2620 302
http://www.sciencellonline.com/site/productInformation.php?keyword=6560 302
http://www.sciencellonline.com/site/productInformation.php?keyword=7630 302
http://www.sciencellonline.com/site/productInformation.php?keyword=6580 302
http://www.sciencellonline.com/site/productInformation.php?keyword=3120 302
http://www.sciencellonline.com/site/productInformation.php?keyword=3300 302
http://www.sciencellonline.com/site/productInformation.php?keyword=4000 302
http://www.sciencellonline.com/site/productInformation.php?keyword=6540 302
http://www.sciencellonline.com/site/productInformation.php?keyword=7130 302
'EOF'

# NOTE: site query string has changed to:
#       http://sciencellonline.com/catalogsearch/result/?q=1820
    # Dump without the column-name header (-N): the file is loaded straight back
    # into the table, and a header line would come back as a bogus data row
    hgsql hg38 -Ne 'select * from wgEncodeCell' > wgEncodeCell.2015-05-06.tab
    # Rewrite host/path and the query parameter name; ^ is the sed delimiter so the
    # slashes in the URLs need no escaping
    sed -e 's^www.sciencellonline.com/site/productInformation.php^sciencellonline.com/catalogsearch/result/^' -e 's^keyword=^q=^' wgEncodeCell.2015-05-06.tab > wgEncodeCell.2015-05-07.tab
    ln -s wgEncodeCell.2015-05-07.tab wgEncodeCell.latest.tab
    # save old table for now
    hgsql hg38 -e "alter table wgEncodeCell rename to wgEncodeCell_old"
    hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql wgEncodeCell.latest.tab
    # check, then remove old table
    checkUrlsInTable hg38 wgEncodeCell > errs.txt
    hgsql hg38 -e "drop table wgEncodeCell_old"

# Nov 2015 -- sciencell links are reported as permanently moved (HTTP status code 301)
# Time to fix them!  21 total
# navigate to links and grab correct URL
# These must be fixed in the wgEncodeCell table (for hg38) and also in cv.ra (for hg19 tracks)
#  (2015-11 kate)

cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/cells
checkUrlsInTable hg38 wgEncodeCell > wgEncodeCell.errs

# hand edited below
# Each pair is: the failing URL with its HTTP status, then (indented) the
# replacement URL and the cell name it belongs to.  The listing is kept as
# comments so that pasting this section into a shell does not try to run it.
# http://sciencellonline.com/catalogsearch/result/?q=1830 301
#         http://www.sciencellonline.com/human-astrocytes-hippocampal.html HA-h
# http://sciencellonline.com/catalogsearch/result/?q=1820 301
#         http://www.sciencellonline.com/human-astrocytes-spinal-cord.html HA-sp
# http://sciencellonline.com/catalogsearch/result/?q=7110 301
#         http://www.sciencellonline.com/haepic.html HAEpiC
# http://sciencellonline.com/catalogsearch/result/?q=1810 301
#         http://www.sciencellonline.com/human-astrocytes-cerebellar.html HA-c
# http://sciencellonline.com/catalogsearch/result/?q=1000 301
#         http://www.sciencellonline.com/human-brain-microvascular-endothelial-cells.html HBMEC
# http://sciencellonline.com/catalogsearch/result/?q=1100 301
#         http://www.sciencellonline.com/human-brain-vascular-smooth-muscle-cells.html HBVSMC
# http://sciencellonline.com/catalogsearch/result/?q=6300 301
#         http://www.sciencellonline.com/human-cardiac-fibroblasts.html HCF
# http://sciencellonline.com/catalogsearch/result/?q=6320 301
#         http://www.sciencellonline.com/human-cardiac-fibroblasts-adult-atrial.html HCF-aa
# http://sciencellonline.com/catalogsearch/result/?q=200  301
#         http://www.sciencellonline.com/human-cardiac-myocytes.html HCM
# http://sciencellonline.com/catalogsearch/result/?q=1310 301
#         http://www.sciencellonline.com/human-choroid-plexus-epithelial-cells.html HCPEpiC
# http://sciencellonline.com/catalogsearch/result/?q=6570 301
#         http://www.sciencellonline.com/hconf.html HConF
# http://sciencellonline.com/catalogsearch/result/?q=2720 301
#         http://www.sciencellonline.com/heepic.html HEEpiC
# http://sciencellonline.com/catalogsearch/result/?q=2620 301
#         http://www.sciencellonline.com/human-gingival-fibroblasts.html HGnF
# http://sciencellonline.com/catalogsearch/result/?q=6560 301
#         http://www.sciencellonline.com/hipepic.html HIPEpiC
# http://sciencellonline.com/catalogsearch/result/?q=7630 301
#         http://www.sciencellonline.com/hmf.html HMF
# http://sciencellonline.com/catalogsearch/result/?q=6580 301
#         http://www.sciencellonline.com/hnpcepic.html HNPCEpiC
# http://sciencellonline.com/catalogsearch/result/?q=3120 301
#         http://www.sciencellonline.com/hpaaf.html HPAAF
# http://sciencellonline.com/catalogsearch/result/?q=3300 301
#         http://www.sciencellonline.com/hpf.html HPF
# http://sciencellonline.com/catalogsearch/result/?q=4000 301
#         http://www.sciencellonline.com/human-renal-glomerular-endothelial-cells.html HRGEC
# http://sciencellonline.com/catalogsearch/result/?q=6540 301
#         http://www.sciencellonline.com/hrpepic.html HRPEpiC
# http://sciencellonline.com/catalogsearch/result/?q=7130 301
#         http://www.sciencellonline.com/hvmf.html HVMF
###
# hand-munge above into a shell script to edit .tab file
# sed -e 's^bad-url^good-url^'
# (that hand-made script is fixCells.csh; it is not generated by this doc)
hgsql hg38 -Ne 'select * from wgEncodeCell' > wgEncodeCell.2015-11-11.tab
csh fixCells.csh wgEncodeCell.2015-11-11.tab > wgEncodeCell.2015-11-11.fixed.tab
# keep the current table as wgEncodeCell_old until the reload has been checked
hgsql hg38 -e "alter table wgEncodeCell rename to wgEncodeCell_old"
hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql wgEncodeCell.2015-11-11.fixed.tab
# check, then remove old table
checkUrlsInTable hg38 wgEncodeCell > errs.txt
hgsql hg38 -e "drop table wgEncodeCell_old"





# Treatment table
# TBD

# create term/description tab sep file (currently just treatments in UW DNase)
#cd /hive/data/genomes/hg38/bed/uwDnase1
#cd cells
#tail -n +2 treatments.tab | \
    #nl -v 0 | \
    #hgLoadSqlTab hg38 wgEncodeTreatment ~/kent/src/hg/lib/encode/wgEncodeTreatment.sql stdin


# Comparison to lifted track (Chris Eisenhart)
# Visually, the new track appears noisy. This is not unexpected as it excludes Duke data.
# There are more elements in new track: 2076756 vs 1867194

# Coverage is similar:

# Genome-wide base coverage: new track first, lifted track second
featureBits hg38 wgEncodeRegDnaseClustered
# 451551920 bases of 3049335806 (14.808%) in intersection

featureBits hg38 wgEncodeRegDnaseClusteredLifted
# 477271764 bases of 3049335806 (15.652%) in intersection

# Comparing chr1:

# Same two tables, restricted to chr1 with -chrom
featureBits hg38 -chrom=chr1 wgEncodeRegDnaseClustered
#41505780 bases of 230486321 (18.008%) in intersection
featureBits hg38 -chrom=chr1 wgEncodeRegDnaseClusteredLifted
#46543116 bases of 230486321 (20.193%) in intersection

# Greater number of elements must be due to mappings on new alt chroms
# (96 chroms in new track, 38 in old)


##############################################################################
# wgEncodeReg ENCODE Regulatory tracks (Done Chris Eisenhart)
# Transcription, Layered H3K4Me1, Layered H3K4Me3, Layered H3K27Ac
    # One directory per lifted track under hg19MassiveLift/wgEncodeReg.
    # -p on the first mkdir creates the hg19MassiveLift parent if needed and makes
    # the step safe to repeat.
    mkdir -p /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg
    mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k27ac
    # NOTE(review): spelled H3k4me1 to match the "Layered H3K4Me1" track named in the
    # header above -- confirm the paths in bigWigList.ra use the same spelling
    mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k4me1
    mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k4me3
    mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3
    mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTxn
    # The list of bigWigs to lift is read from the .ra file
    liftManyBigWigs /cluster/home/ceisenhart/kent/src/hg/utils/liftList/bigWigList.ra

##############################################################################
# ENCODE Registry of Candidate cis-Regulatory Elements
#
# 2020-04-14  kate
#
# From ENCODE 3 Data Analysis Center at U Mass Med Center (Zlab)
# Data contacts:  Henry Pratt, Jill Moore, Zhiping Weng PI
#
# RM #24668
#
# Download BED file (hosted on their integrative hub)

cd /hive/data/outside/encode3/ccre
wget http://gcp.wenglab.org/hubs/integrative1/data/GRCh38/CTA/GRCh38-ccREs.bigBed
# The scoring steps further down read GRCh38-ccREs.bed, so unpack the bigBed to plain BED
bigBedToBed GRCh38-ccREs.bigBed GRCh38-ccREs.bed

# Later Jill asked to add scores, download that

wget -nd https://users.wenglab.org/moorej3/Human-maxZ-DNase.txt.gz
gunzip Human-maxZ-DNase.txt.gz

# check score distribution
# Histogram of column 2 (the score) of the score file; the output is kept below as comments
textHistogram -real -col=2 Human-maxZ-DNase.txt
# 1.000000 ***************** 102913
# 2.000000 ************************************************************ 362463
# 3.000000 ************************************************* 294351
# 4.000000 ********************* 128201
# 5.000000 ***** 29345
# 6.000000 * 6991
# 7.000000  2096
# 8.000000  168
# 9.000000  5
# 10.000000  2

# noting that order of accessions in score file doesn't match bed file ;-(
# Put both files in the same order before pasting them side by side.  Each sort is
# keyed on a single column only and run in the C locale, so the two orderings
# cannot diverge because of other columns or locale collation rules.
# NOTE(review): assumes the accession is column 1 of the score file and column 4
# of the BED file -- confirm
env LC_ALL=C sort -k1,1 Human-maxZ-DNase.txt > Human-maxZ-DNase.sorted.txt
env LC_ALL=C sort -k4,4 GRCh38-ccREs.bed > GRCh38-ccREs.sorted.bed

paste GRCh38-ccREs.sorted.bed Human-maxZ-DNase.sorted.txt > ccres.prescored.bed
# manually sanity check start and end of file to see that accessions match

# score using zscore: int(zscore * 100), capped at 1000 for zscore > 10, and reformat
# (column 13 of the pasted file is the zscore; it is also kept as the last output column)
awk '{OFS="\t"; print $1, $2, $3, $4, ($13>10)? 1000 : int($13 * 100), $6, $7, $8, $9, $10, $13}' \
        ccres.prescored.bed | bedSort stdin ccres.scored.bed

# Reformat to add fields for filtering and mouseover, etc.
# $f is the base name shared by the BED file, the autoSql (.as) file and the bigBed
set f = encodeCcreCombined
set bin = ~/kent/src/hg/makeDb/outside/encode3/ccre
# makeCcreCombined.pl reads the scored BED on stdin and writes the final BED on stdout
perl $bin/makeCcreCombined.pl < ccres.scored.bed > $f.bed
set lib = ~/kent/src/hg/lib
# Tab-separated bed9+6, described by $lib/encodeCcreCombined.as
bedToBigBed -tab -type=bed9+6 -as=$lib/$f.as $f.bed /hive/data/genomes/hg38/chrom.sizes $f.bb
# Expose the bigBed under /gbdb
ln -s `pwd`/$f.bb /gbdb/hg38/encode3/ccre/

###############################

