#########################################################################
# Phylogenetic tree from 160-way (DONE - 2014-10-17 - Hiram)
    mkdir /hive/data/genomes/eboVir3/bed/multiz160way/4d
    cd /hive/data/genomes/eboVir3/bed/multiz160way/4d

    # the annotated maf is:
    ../noDups/multiz160way.maf

    # using ncbiGene for eboVir3
    hgsql -N -e 'select * from ncbiGene;' eboVir3 \
       | cut -f2- > eboVir3.ncbiGene.gp

    genePredSingleCover eboVir3.ncbiGene.gp stdout \
       | sort > eboVir3.ncbiGeneNR.gp
    wc -l *.gp
    #  9 eboVir3.ncbiGene.gp
    #  7 eboVir3.ncbiGeneNR.gp

    sed -e 's/eboVir3.KM034562v1/KM034562v1.KM034562v1/' \
       ../noDups/multiz160way.maf > multiz160way.KM034562v1.maf

    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
       --4d --features eboVir3.ncbiGeneNR.gp \
         -i MAF multiz160way.KM034562v1.maf -o SS > multiz160way.ss

    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
        -i SS --tuple-size 1 multiz160way.ss > multiz160way.mfa

    #want comma-less species.list
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.list`" multiz160way.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # check they are all in there:
    grep "^>" 4d.all.mfa | sort -u | wc -l
    #  160

    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../160way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh
    # tree_commas.nh looks like:
    #   (((((eboVir3,panTro4),rheMac3),(mm10,rn5)),canFam3),monDom5)
    # use phyloFit to create tree model (output is phyloFit.mod)
    time nice -n +19 \
	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree_commas.nh 4d.all.mfa
    #   real    0m6.583s


    mv phyloFit.mod all.mod

#########################################################################
# phastCons 160-way (DONE - 2014-06-04 - Hiram)
    # split 160way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh ku
    mkdir -p /hive/data/genomes/eboVir3/bed/multiz160way/cons/SS
    cd /hive/data/genomes/eboVir3/bed/multiz160way/cons/SS
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
  /hive/data/genomes/eboVir3/bed/multiz160way/noDups/multiz160way.maf -i MAF \
    -o SS -r multiz160way -w 10000000,0 -I 1000 -B 5000

    # Run phastCons

export len=45
export cov=0.3
export rho=0.3
export c=eboVir3

sed -e 's/KM034562v1/eboVir3/g' ../4d/all.mod > all.mod

/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phastCons \
  SS/multiz160way.1-18957.ss all.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp

awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "KM034562v1", $2, $3, $5, $5}' \
   eboVir3.bed > tmpMostConserved.bed
/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    wigToBigWig -verbose=2 KM034562v1.pp \
      /hive/data/genomes/eboVir3/chrom.sizes KM034562v1.bw
    bigWigInfo KM034562v1.bw
version: 4
isCompressed: yes
isSwapped: 0
primaryDataSize: 21,741
primaryIndexSize: 6,348
zoomLevels: 6
chromCount: 1
basesCovered: 18,957
mean: 0.764529
min: 0.000000
max: 1.000000
std: 0.397315

    wigEncode KM034562v1.pp phastCons160way.wig phastCons160way.wib
    #  Converted KM034562v1.pp, upper limit 1.00, lower limit 0.00

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/eboVir3/bed/multiz160way/cons
    hgLoadBed eboVir3 phastConsElements160way mostConserved.bed
    hgLoadBed eboVir3 strainPhastConsElements160way mostConserved.bed
    # Read 129 elements of size 5 from mostConserved.bed
    featureBits eboVir3 phastConsElements160way
    #  14451 bases of 18957 (76.230%) in intersection

    ln -s `pwd`/phastCons160way.wib \
          /gbdb/eboVir3/multiz160way/phastCons160way.wib
    hgLoadWiggle -pathPrefix=/gbdb/eboVir3/multiz160way \
	eboVir3 phastCons160way phastCons160way.wig
    hgLoadWiggle -pathPrefix=/gbdb/eboVir3/multiz160way \
	eboVir3 strainPhastCons160way phastCons160way.wig

    # on human we often try for 5% overall cov, and 70% CDS cov
    # most bets are off here for that goal, these alignments are too few
    #	and too far between
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits eboVir3 -enrichment ncbiGene:cds phastConsElements160way
    # ncbiGene:cds 76.573%, phastConsElements160way 76.230%, both 70.127%,
    #  cover 91.58%, enrich 1.20x


    wigTableStats.sh eboVir3 phastCons160way
# db.table          min max mean       count sumData      stdDev  viewLimits
eboVir3.phastCons160way 0 1 0.764529 18957 14493.2 0.397315 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram -db=eboVir3 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons160way > histogram.data 2>&1
    #	real    2m40.179s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Ebola eboVir3 Histogram phastCons160way track"
set xlabel " phastCons160way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# phyloP for 7-way (DONE - 2014-10-17 - Hiram)
    # run phyloP with score=LRT
    mkdir /cluster/data/eboVir3/bed/multiz160way/consPhyloP
    cd /cluster/data/eboVir3/bed/multiz160way/consPhyloP

    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.320
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../4d/all.mod 0.320 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.340000 0.160000 0.160000 0.340000 

    sed -e 's/eboVir3/KM034562v1/' ../cons/SS/multiz160way.1-18957.ss \
      > KM034562v1.ss

    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloP \
      --method LRT --mode CONACC --wig-scores --chrom KM034562v1 \
        -i SS all.mod KM034562v1.ss > KM034562v1.wigFix

    # check integrity of data with wigToBigWig
    wigToBigWig -verbose=2 KM034562v1.wigFix \
      /hive/data/genomes/eboVir3/chrom.sizes phyloP160way.bw

    # pid=63708: VmPeak:       33732 kB

    bigWigInfo phyloP160way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 52,079
# primaryIndexSize: 6,348
# zoomLevels: 6
# chromCount: 1
# basesCovered: 18,957
# mean: 2.048103
# min: -20.000000
# max: 7.305000
# std: 2.709788

    #	encode those files into wiggle data
    wigEncode KM034562v1.wigFix phyloP160way.wig phyloP160way.wib

    #   Converted KM034562v1.wigFix, upper limit 7.30, lower limit -20.00

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP160way.wib /gbdb/eboVir3/multiz160way/phyloP160way.wib
    hgLoadWiggle -pathPrefix=/gbdb/eboVir3/multiz160way eboVir3 \
	phyloP160way phyloP160way.wig
    hgLoadWiggle -pathPrefix=/gbdb/eboVir3/multiz160way eboVir3 \
	strainPhyloP160way phyloP160way.wig

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh eboVir3 phyloP160way
# db.table      min max mean count sumData
# eboVir3.phyloP160way    -20 7.305 2.0481 18957 38825.9
#       stdDev viewLimits
#  2.70979 viewLimits=-11.5008:7.305

    #	that range is: 20 + 7.405 = 27.305 for hBinSize=0.027305

    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram -hBinSize=0.027305 -hBinCount=1000 \
       -hMinVal=-20 -verbose=2 \
	    -db=eboVir3 phyloP160way > histogram.data 2>&1

    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin
# Q1 0.000158
# median 0.000791
# Q3 0.002532
# average 0.002123
# min 0.000053
# max 0.045049
# count 471
# total 1.000058
# standard deviation 0.004132

    #	create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Ebola eboVir3 Histogram phyloP160way track"
set xlabel " phyloP160way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# 160-way downloads (DONE - 2014-10-22 - Hiram)
    mkdir -p \
       /hive/data/genomes/eboVir3/bed/multiz160way/downloads/phastCons160way
    cd /hive/data/genomes/eboVir3/bed/multiz160way/downloads/phastCons160way

    cp -p ../../cons/KM034562v1.pp ./eboVir3.phastCons160way.wigFix
    gzip eboVir3.phastCons160way.wigFix
    ln -s ../../cons/all.mod ./eboVir3.phastCons160way.mod
    ln -s ../../cons/KM034562v1.bw ./eboVir3.phastCons160way.bw

    sed -e 's/^s#//; s/#/\t/; s/#g;//' \
       ~/kent/src/hg/makeDb/doc/eboVir3/ucscName.strainName.sed \
          > accession.to.sequenceIdentifier.txt
 

    # reusing the README.txt file from:
# /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way/README.txt
    # edited here to reflect this construction

    md5sum *.txt *.gz *.mod *.bw > md5sum.txt

    mkdir -p \
       /hive/data/genomes/eboVir3/bed/multiz160way/downloads/phyloP160way
    cd /hive/data/genomes/eboVir3/bed/multiz160way/downloads/phyloP160way

    ln -s ../../consPhyloP/all.mod ./eboVir3.phyloP160way.mod
    cp -p ../../consPhyloP/KM034562v1.wigFix ./eboVir3.phyloP160way.wigFix
    ln -s ../../consPhyloP/phyloP160way.bw ./eboVir3.phyloP160way.bw
    gzip *.wigFix

    sed -e 's/^s#//; s/#/\t/; s/#g;//' \
       ~/kent/src/hg/makeDb/doc/eboVir3/ucscName.strainName.sed \
          > accession.to.sequenceIdentifier.txt

    # reusing the README.txt file from:
# /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way/README.txt
    # edited here to reflect this construction

    md5sum *.txt *.gz *.mod *.bw > md5sum.txt


    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/eboVir3/phastCons160way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/eboVir3/phastCons160way
    ln -s \
     /hive/data/genomes/eboVir3/bed/multiz160way/downloads/phastCons160way/* ./

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/eboVir3/phyloP160way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/eboVir3/phyloP160way
    ln -s \
     /hive/data/genomes/eboVir3/bed/multiz160way/downloads/phyloP160way/* ./
    
##############################################################################
