#########################################################################
# Phylogenetic tree from 44-way (DONE - 2020-03-13 - Hiram)
    mkdir /hive/data/genomes/wuhCor1/bed/multiz44way/4d
    cd /hive/data/genomes/wuhCor1/bed/multiz44way/4d

    # tried using the 'defraged' maf:
    ../defraged.multiz44way.maf
    # that did not work
    # Skipping 4d.all.mfa; insufficient informative sites ...

    # using the full maf:
    ../multiz44way.maf

    # using ncbiGene for wuhCor1
    hgsql -N -e 'select * from ncbiGene;' wuhCor1 \
       | cut -f2- > wuhCor1.ncbiGene.gp

    genePredSingleCover wuhCor1.ncbiGene.gp stdout \
       | sort > wuhCor1.ncbiGeneNR.gp
    wc -l *.gp
    #  13 wuhCor1.ncbiGene.gp
    #  10 wuhCor1.ncbiGeneNR.gp

    sed -e 's/wuhCor1.NC_045512v2/NC_045512v2.NC_045512v2/' \
       ../multiz44way.maf > multiz44way.NC_045512v2.maf

    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
       --4d --features wuhCor1.ncbiGeneNR.gp \
         -i MAF multiz44way.NC_045512v2.maf -o SS > multiz44way.ss
    # real    0m1.273s

    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
        -i SS --tuple-size 1 multiz44way.ss > multiz44way.mfa
    # real    0m0.015s

    #want comma-less species.list
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
       --aggregate "`cat ../species.list`" multiz44way.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # real    0m0.019s

    # check they are all in there:
    grep "^>" 4d.all.mfa | sort -u | wc -l
    #  44

    sed -e 's/ /,/g;' ../tree.nh > tree_commas.nh

    # tree_commas.nh looks like:
# (((NC_045512v2,MN996532v1),((((DQ022305v2,GQ153547v1),GQ153542v1),
# (MG772933v1,MG772934v1)),((((((DQ071615v1,KJ473815v1),((((FJ588686v1,
# KY770858v1),((((((KF367457v1,KY417144v1),(KY417151v1,KY417152v1)),
# ((KY417142v1,MK211377v1),(MK211376v1,MK211378v1))),((((KT444582v1,
# KY417143v1),KY417149v1),KY417146v1),(KY417147v1,KY417148v1))),(KJ473816v1,
# KY417145v1)),MK211375v1)),NC_004718v3),KP886808v1)),MK211374v1),
# (KF569996v1,KU973692v1)),JX993988v1),((((DQ412042v1,DQ648856v1),
# (KJ473812v1,KY770860v1)),KY938558v1),((DQ412043v1,KJ473814v1),
# JX993987v1))))),(KY352407v1,NC_014470v1))

    # use phyloFit to create tree model (output is phyloFit.mod)
    time nice -n +19 \
	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree_commas.nh 4d.all.mfa
    #   real    0m1.520s

    mv phyloFit.mod all.mod

    grep "TREE:" all.mod | sed -e 's/TREE: //;' > 44way.nh

    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/all_dists \
      44way.nh | grep NC_045512v2 | sed -e 's/NC_045512v2.//;' \
         | sort -k2n > 44way.distances.txt
    sed -e 's/^/# /;' 44way.distances.txt | head
# MN996532v1    0.111391
# DQ022305v2    0.756533
# GQ153542v1    0.758373
# GQ153547v1    0.758589
# JX993987v1    0.825373
# KJ473814v1    0.844563

    sed -e 's/^/# /;' 44way.distances.txt | tail
# KT444582v1    0.961789
# KY352407v1    1.063753
# NC_014470v1   1.075344
# MG772933v1    1.076854
# MG772934v1    1.106462

#########################################################################
# phastCons 44-way (DONE - 2020-03-13 - Hiram)
    # split 44way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh hgwdev
    mkdir -p /hive/data/genomes/wuhCor1/bed/multiz44way/cons/SS
    cd /hive/data/genomes/wuhCor1/bed/multiz44way/cons/SS
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
       ../../defraged.multiz44way.maf -i MAF -o SS \
          -r multiz44way -w 10000000,0 -I 1000 -B 5000

    # Run phastCons

export len=45
export cov=0.3
export rho=0.3
export c=wuhCor1


    cd /hive/data/genomes/wuhCor1/bed/multiz44way/cons

sed -e 's/NC_045512v2/wuhCor1/g' ../4d/all.mod > all.mod

time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phastCons \
  SS/multiz44way.1-29903.ss all.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $c.bed --score \
     | sed -e "s/$c/NC_045512v2/;" > $c.pp
# real    0m1.058s

awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "NC_045512v2", $2, $3, $5, $5}' \
   wuhCor1.bed > tmpMostConserved.bed
/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    wigToBigWig -verbose=2 $c.pp /hive/data/genomes/wuhCor1/chrom.sizes $c.bw
# pid=31961: VmPeak:       40836 kB

    bigWigInfo wuhCor1.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 34,334
# primaryIndexSize: 6,436
# zoomLevels: 6
# chromCount: 1
# basesCovered: 29,903
# mean: 0.855761
# min: 0.000000
# max: 1.000000
# std: 0.317335

    wigEncode $c.pp phastCons44way.wig phastCons44way.wib
    #  Converted wuhCor1.pp, upper limit 1.00, lower limit 0.00


    # load into database
    ssh hgwdev
    cd /hive/data/genomes/wuhCor1/bed/multiz44way/cons
    hgLoadBed wuhCor1 phastConsElements44way mostConserved.bed
    # Read 550 elements of size 5 from mostConserved.bed
    hgLoadBed wuhCor1 strainPhastConsElements44way mostConserved.bed
    # Read 550 elements of size 5 from mostConserved.bed

    featureBits wuhCor1 phastConsElements44way
    #  25779 bases of 29903 (86.209%) in intersection
    featureBits wuhCor1 strainPhastConsElements44way
    #  25779 bases of 29903 (86.209%) in intersection

    ln -s `pwd`/phastCons44way.wib \
          /gbdb/wuhCor1/multiz44way/phastCons44way.wib
    hgLoadWiggle -pathPrefix=/gbdb/wuhCor1/multiz44way \
	wuhCor1 phastCons44way phastCons44way.wig
    hgLoadWiggle -pathPrefix=/gbdb/wuhCor1/multiz44way \
	wuhCor1 strainPhastCons44way phastCons44way.wig

    # on human we often try for 5% overall cov, and 70% CDS cov
    # most bets are off here for that goal, these alignments are too few
    #	and too far between
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits wuhCor1 -enrichment ncbiGene:cds phastConsElements44way
# ncbiGene:cds 97.850%, phastConsElements44way 86.209%, both 84.062%,
#   cover 85.91%, enrich 1.00x

    wigTableStats.sh wuhCor1 phastCons44way
# db.table          min max mean       count sumData      stdDev  viewLimits
wuhCor1.phastCons44way  0 1 0.855761 29903 25589.8 0.317335 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram -db=wuhCor1 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons44way > histogram.data 2>&1
    #	real    2m40.179s

XXX - to be done
    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Ebola wuhCor1 Histogram phastCons44way track"
set xlabel " phastCons44way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# phyloP for 7-way (DONE - 2020-03-13 - Hiram)
    # run phyloP with score=LRT
    mkdir /cluster/data/wuhCor1/bed/multiz44way/consPhyloP
    cd /cluster/data/wuhCor1/bed/multiz44way/consPhyloP

    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.248
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../4d/all.mod 0.248 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.376000 0.124000 0.124000 0.376000 

    sed -e 's/wuhCor1/NC_045512v2/' ../cons/SS/multiz44way.1-29903.ss \
      > NC_045512v2.ss

    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloP \
      --method LRT --mode CONACC --wig-scores --chrom NC_045512v2 \
        -i SS all.mod NC_045512v2.ss > NC_045512v2.wigFix
    # real    0m6.054s


    # check integrity of data with wigToBigWig
    wigToBigWig -verbose=2 NC_045512v2.wigFix \
      /hive/data/genomes/wuhCor1/chrom.sizes phyloP44way.bw
    # pid=57048: VmPeak:       40872 kB

    bigWigInfo phyloP44way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 61,491
# primaryIndexSize: 6,436
# zoomLevels: 6
# chromCount: 1
# basesCovered: 29,903
# mean: 1.653331
# min: -11.968000
# max: 4.256000
# std: 1.896185

    #	encode those files into wiggle data
    wigEncode NC_045512v2.wigFix phyloP44way.wig phyloP44way.wib
    # Converted NC_045512v2.wigFix, upper limit 4.26, lower limit -11.97

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP44way.wib /gbdb/wuhCor1/multiz44way/phyloP44way.wib
    hgLoadWiggle -pathPrefix=/gbdb/wuhCor1/multiz44way wuhCor1 \
	phyloP44way phyloP44way.wig
    hgLoadWiggle -pathPrefix=/gbdb/wuhCor1/multiz44way wuhCor1 \
	strainPhyloP44way phyloP44way.wig

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh wuhCor1 phyloP44way
# db.table                  min max   mean  count sumData
# wuhCor1.phyloP44way     -11.968 4.256 1.65333 29903 49439.5
#    stdDev viewLimits
#    1.89619 viewLimits=-7.8276:4.256

    #	that range is: 4.256 + 11.968 = 16.224 for hBinSize=0.016224

    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram -hBinSize=0.027305 -hBinCount=1000 \
       -hMinVal=-20 -verbose=2 \
	    -db=wuhCor1 phyloP44way > histogram.data 2>&1

XXX - to be done
    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin
# Q1 0.000158
# median 0.000791
# Q3 0.002532
# average 0.002123
# min 0.000053
# max 0.045049
# count 471
# total 1.000058
# standard deviation 0.004132

    #	create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Ebola wuhCor1 Histogram phyloP44way track"
set xlabel " phyloP44way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# 44-way downloads (DONE - 2020-03-18 - Hiram)
    mkdir -p \
       /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/phastCons44way
    cd /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/phastCons44way

    cp -p ../../cons/wuhCor1.pp ./wuhCor1.phastCons44way.wigFix
    gzip wuhCor1.phastCons44way.wigFix
    ln -s ../../cons/all.mod ./wuhCor1.phastCons44way.mod
    ln -s ../../cons/wuhCor1.bw ./wuhCor1.phastCons44way.bw
    ln -s ../../acc.date.description.list wuhCor1.44way.nameList.txt

    # reusing the README.txt file from the 119way already completed:
# /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phastCons119way/README.txt
    # edited here to reflect this construction

    md5sum *.txt *.gz *.mod *.bw > md5sum.txt

    mkdir -p \
       /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/phyloP44way
    cd /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/phyloP44way

    ln -s ../../consPhyloP/all.mod ./wuhCor1.phyloP44way.mod
    cp -p ../../consPhyloP/NC_045512v2.wigFix ./wuhCor1.phyloP44way.wigFix
    ln -s ../../consPhyloP/phyloP44way.bw ./wuhCor1.phyloP44way.bw
    gzip *.wigFix

    ln -s ../../acc.date.description.list wuhCor1.44way.nameList.txt

    # reusing the README.txt file from:
# /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phyloP119way/README.txt
    # edited here to reflect this construction

    md5sum *.txt *.gz *.mod *.bw > md5sum.txt


    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phastCons44way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phastCons44way
    ln -s \
     /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/phastCons44way/* ./

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phyloP44way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phyloP44way
    ln -s \
     /hive/data/genomes/wuhCor1/bed/multiz44way/downloads/phyloP44way/* ./
    
##############################################################################
