#############################################################################
# GTEx V6 (October 2015) Kate
# Create BED from hgFixed tables (see doc/gtex)
# Reloading during QA of track (fixing gene classes, adding scores).  (March 2016) Kate

cd /hive/data/outside/gtex/V6

# see doc/hg19.txt for how this genePred was made
set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz
liftOver -genePred gencodeV19.hg19.genePred $chain gtexGeneModelV6.hg38.genePred \
                gencode.V19.hg38.unmapped
# 926 unmapped
hgLoadGenePred hg38 gtexGeneModelV6 gtexGeneModelV6.hg38.genePred

# OLD: creates gtexGeneModelV6.hg38.genePred
# OLD: NOTE: drops 192 transcripts.  One I spot-checked indeed didn't exist in our hg38 genes

cd /hive/data/genomes/hg38/bed
mkdir -p gtex
cd gtex

# table renamed to gtexGeneModel later

# Use latest GENCODE attrs file to get biotypes

# create bed table
~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed  hg38 -gtexVersion=V6 \
        -noLoad -gencodeVersion=$gencodeVersion gtexGeneBedV6 -verbose=2 >&! makeGeneBed.log

# 45 genes not found in GENCODE attributes table

#Max score: 219385.906250

wc -l *.tab
wc -l *.tab
52896 gtexGeneBedV6.tab
# 55810 gtexGeneBedV6.tab

# exploratory for assigning score based on sum of scores (expScores field)
set bedScore = ~/kent/src/utils/bedScore/bedScore
$bedScore -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.bed
$bedScore -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.bed
$bedScore -col=10 -minScore=1 -method=reg gtexGeneBedV6.tab gtexGeneBedV6.reg.bed
$bedScore -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.bed

$bedScore -col=10 -minScore=1 -method=reg -log gtexGeneBedV6.tab gtexGeneBedV6.reg.log.bed
$bedScore -log -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.log.bed
$bedScore -log -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.log.bed
$bedScore -log -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.log.bed

# Using -log -method=encode, as this is closest to density plot of all scores
# as here: GtexTotalExpressionDensity.png, GtexTotalExpressionFrequency.png
textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV6.encode.log.bed
1.000000 ************************************************************ 22889
72.357214 *************************************************** 4862
143.714428 ************************************************** 4199
215.071643 ************************************************* 3931
286.428857 ************************************************** 4329
357.786071 *************************************************** 5419
429.143285 ************************************************** 4472
500.500500 ********************************************* 1953
571.857714 ************************************** 564
643.214928 ******************************** 200
714.572142 ************************* 61
785.929356 *********** 6
857.286571 ******** 4
928.643785 ************ 7

$bedScore -col=10 -minScore=0 -log -method=encode gtexGeneBedV6.tab gtexGeneBedV6.bed


# load up
set lib = ~/kent/src/hg/lib

hgLoadBed hg38 -noBin -tab -type=bed6+4 \
        -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                gtexGeneBedNewV6 gtexGeneBedV6.bed

# Add GTEx to Gene Sorter (2016-08-18 kate)
# See hg/near/makeNear.doc

#############################################################################
# GTEx V8 (Apr 2020) Kate
# Create BED from hgFixed tables (see doc/gtex)
 
# Load gene models (Gencode V26 transcript union from GTEx)

cd /hive/data/outside/gtex/V8/rnaSeq
gtfToGenePred gencode.v26.GRCh38.genes.gtf gencodeV26.hg38.genePred \
                -infoOut=gtexGeneModelInfoV8.tab
hgLoadGenePred hg38 gtexGeneModelV8 gencodeV26.hg38.genePred

# Get transcript for each gene (why ?)
tail -n +2 gtexGeneModelInfoV8.tab | awk '{printf("%s\t%s\n", $1, $9)}' > gtexGeneTranscriptsV8.tab
#hgLoadSqlTab hgFixed gtexTranscriptV8 ~/kent/src/hg/lib/gtexTranscript.sql gtexGeneTranscriptsV8.tab
# no schema (or table on hgwdev.hgFixed)

# Load BED table
cd /hive/data/genomes/hg38/bed/gtex
mkdir V8
cd V8

set gencode = V26
~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \
        hg38 -noLoad -gtexVersion=V8 -gencodeVersion=$gencode gtexGeneV8 -verbose=2 >&! log.txt &

Reading wgEncodeGencodeAttrs table
Reading gtexGeneModelV8 table
Reading gtexTissueMedian table
Writing tab file gtexGeneV8
Max score: 267400.000000

# Add scores
set bedScore = ~/kent/src/utils/bedScore/bedScore
$bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV8.tab gtexGeneBedV8.bed
textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV8.bed
0.000000 ************************************************************ 21287
71.428643 **************************************************** 5635
142.857286 **************************************************** 5513
214.285929 *************************************************** 4683
285.714571 *************************************************** 4480
357.143214 *************************************************** 4748
428.571857 **************************************************** 5466
500.000500 ************************************************ 3117
571.429143 ***************************************** 911
642.857786 ********************************* 252
714.286429 ************************** 81
785.715071 ************** 11
857.143714 ******** 4
928.572357 *************** 12

# load up
set lib = ~/kent/src/hg/lib
hgLoadBed hg38 -noBin -tab -type=bed6+4 \
        -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                gtexGeneV8 gtexGeneBedV8.bed
#Read 56200 elements of size 10 from gtexGeneBedV8.bed


### TODO 
# Add GTEx to Gene Sorter (2016-08-18 kate)
# See hg/near/makeNear.doc

#############################################################################
# GTEx V8 cis-eQTLs CAVIAR High Confidence (Apr 2022) Matt

cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_CAVIAR

# Download files from
wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar
# unpack files
tar xvf GTEx_v8_finemapping_CAVIAR.tar

# Other files used:
# Lookup table for all variants genotyped in GTEx
wget https://storage.googleapis.com/gtex_analysis_v8/reference/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
# Gene-level model based on the GENCODE 26 transcript model, where isoforms were collapsed to a single transcript per gene.
wget https://storage.googleapis.com/gtex_analysis_v8/reference/gencode.v26.GRCh38.genes.gtf

# Initially planned to use this file: # CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz
# as it seemed to be a filtered subset of eQTLs
# Description from GTEx_v8_finemapping_CAVIAR/README.txt
# ***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report
# all the high causal variants (variants that have posterior probability of > 0.1).

# Sample header line:
# TISSUE  GENE    eQTL    CHROM   POS     Probability
# Brain_Caudate_basal_ganglia     ENSG00000248485.1       1_161274374     1       161274374       0.157456

# However, the names/positions in the eQTL column are not unique meaning
# We want file with unique variant/eQTL names that match those in the GTEx
# variant mapping file: GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
# Looks like the CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz has names 
# that match the names in the variant/eQTL mapping file

# Description from GTEx_v8_finemapping_CAVIAR/README.txt
# ***CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz  —> is a single file for all GTEx tissues and all eGenes where we reported
# the CPP (Causal Posterior Probability). Each eQTL contains the allele information. Sample header file:
# TISSUE  GENE    eQTL    CHROM   POS Probability
# Brain_Caudate_basal_ganglia ENSG00000248485.1   chr1_161274374_G_A_b38  1   161274374   0.157456

# So, take NOCUTOFF_with_Allele file and then filter for probability >0.1 (which is how HighConfidentVariants file was created according to GTEx README)
zcat CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz | awk '$6 > 0.1' | gzip -c > CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz

# Confirm that it has the same size as original HighConfidentVariants file
zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | wc -l
1257158
zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | wc -l
1257158

# There seem to 31 duplicate eQTLs, meaning that the two lines for these entries are duplicated (probability values, etc.)
# These will essentially be collapsed into a single entry by the buildInteract script
zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt
wc -l dupe_eQtls.txt
31 dupe_eQtls.txt

# Wrote buildGtexEqtlBed script to help build bed files from eQTL files:
cat << '_EOF_' > buildGtexEqtlBed
#!/usr/bin/env python3

import sys, gzip

qtlFile = sys.argv[1]
gpFile = sys.argv[2]
alleleFile = sys.argv[3]
qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL
probCol = sys.argv[5] # Column name in file header, e.g. Probability
# DAP-G files have an extra probability column that we want to capture
if len(sys.argv) > 6:
    clusterProbCol = sys.argv[6]
else:
    clusterProbCol = False

# Open up all of our files
qfh = gzip.open(qtlFile,"r") # QTL file
gfh = open(gpFile,"r") # genePred file
afh = gzip.open(alleleFile,"r") # allele file

# Set up dicts for each
qtlDict = dict()
gpDict = dict()
alleleDict = dict()

# Get index of field that we want to replace values in
header = qfh.readline().decode('ASCII').rstrip().split("\t")
qtl_col = header.index(qtlIdCol)
prob_col = header.index(probCol)
if clusterProbCol:
    clus_prob_col = header.index(clusterProbCol)

# Process eQTL file
for line in qfh:
    qtlLine = line.decode('ASCII').strip().split("\t")
    # Skip header line in file
    #if "TISSUE" not in qtlLine:
    tissue = qtlLine[0]
    gene = qtlLine[1]
    e_qtl = qtlLine[qtl_col]
    # each of these elements on their own aren't unique, but together they are
    qtlKey = e_qtl + "|" + tissue + "|" + gene
    # Put these into a dict where key is unique name we've made
    qtlDict[qtlKey] = qtlLine

# Process genePred file
for line in gfh:
    gpLine = line.strip().split("\t")
    gene_id = gpLine[0]
    # Put into dict with key on ENSG* ID, e.g.
    gpDict[gene_id] = gpLine

# Process allele file from GTEx
for line in afh:
    # gzip file, so we need to decode to asii first
    aLine = line.decode('ASCII').strip().split("\t")

    # Key our dict using the allele, which is unique in the file
    allele = aLine[0]
    alleleDict[allele] = aLine

for entry in qtlDict:
    # Get gene information from the genePred dict
    # eQTL file/dict has ENSG id in it
    gene_id = qtlDict[entry][1]
    if gene_id in gpDict.keys():
        gene_sym = gpDict[gene_id][11]
        gene_chrom = gpDict[gene_id][1]
        gene_strand = gpDict[gene_id][2]
        gene_start = int(gpDict[gene_id][3])
        gene_end = int(gpDict[gene_id][4])
    # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID
    var_name = qtlDict[entry][qtl_col]
    if var_name in alleleDict.keys():
        rsID = alleleDict[var_name][6]
        if rsID == '.':
            rsID = var_name
        var_start = int(alleleDict[var_name][2]) - 1
        var_end = int(alleleDict[var_name][2])
        chrom = alleleDict[var_name][1]

    # This block of if/else sets up the blockSizes/blockStart/blockCount for
    # various orientations of eQTLs/genes
    if chrom == gene_chrom:
        # Handles case where eQTL is internal to a gene
        if var_start > gene_start and var_start < gene_end:
            chromStart = gene_start
            chromEnd = gene_end
            blockCount = "1"
            blockSizes = gene_end - gene_start
            blockStarts = "0"
            strand = "."
        # Handles case where eQTL is downstream of gene
        elif var_start > gene_end:
            chromStart = gene_start
            chromEnd = var_end
            bs1 = gene_end - gene_start
            blockCount = "2"
            blockSizes = str(bs1) + ",1"
            blockStarts = "0," + str(var_start - gene_start)
            strand = "-"
        # Handles case where eQTL is upstream of gene
        elif var_start < gene_start:
            chromStart = var_start
            chromEnd = gene_end
            blockCount = "2"
            bs2 = gene_end - gene_start
            blockSizes = "1," + str(bs2)
            blockStarts = "0," + str(gene_start - var_start)
            strand = "+"
        # Handles case where eQTL and start of gene overlap
        elif var_start == gene_start:
            chromStart = var_start
            chromEnd = gene_end
            blockCount = "1"
            blockSizes = str(gene_end - var_start)
            blockStarts = "0"
            strand = "."
        # Handles case where eQTL and end of gene overlap
        elif gene_end == var_start:
            chromStart = gene_start
            chromEnd = var_end
            blockCount = "1"
            blockSizes = str(var_end - gene_start)
            blockStarts = "0"
            strand = "."

        # Add each element of our bed line to a list to print later
        bedLine = list()
        bedLine.append(chrom) # chrom
        bedLine.append(chromStart) # chrStart
        bedLine.append(chromEnd) # chrEnd
        bedLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name
        bedLine.append(int("1000"))# score
        bedLine.append(strand)# strand
        bedLine.append(var_start) # thickStart
        bedLine.append(var_end) # thickEnd
        bedLine.append("0,0,0") #color - eventually replaced by 'addTissueColors' script
        bedLine.append(blockCount) # blockCount
        bedLine.append(blockSizes) # blockSizes
        bedLine.append(blockStarts) # blockSizes
        bedLine.append(chrom + ":" + str(var_start+1) + "-" + str(var_end)) # prettified gene pos
        bedLine.append(rsID) # sourceName
        bedLine.append(chrom + ":" + str(gene_start+1) + "-" + str(gene_end)) # prettified gene pos
        bedLine.append(gene_sym) # geneName
        bedLine.append(gene_id) # geneId
        bedLine.append(gene_strand) # targetStrand
        bedLine.append(qtlDict[entry][0]) # tissue
        bedLine.append(qtlDict[entry][prob_col]) # CAVIAR == Causal Posterior Probability; DAPG == SNP posterior inclusion probability
        if clusterProbCol:
            # Signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster)
            bedLine.append(qtlDict[entry][clus_prob_col])

    # Print bed line to stdout, can redirect to a file to save it
    print(*bedLine, sep="\t")
'_EOF_'


# Then run buildGtexEqtlBed to build an bed file
./buildGtexEqtlBed CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz ../gencode.v26.GRCh38.genes.gpExt \
  ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz eQTL Probability | \
  bedSort stdin gtexCaviar.sorted.bed

# Create script to add colors to bed file
cat << '_EOF_' > addTissueColors

#!/usr/bin/env python3

####
# Intially created bed file w/o colors
# This script will replace col 9 with tissue-specific colors
####

import sys

bedFile = sys.argv[1]
colorFile = sys.argv[2]

# Read in color file from //
cfh = open(colorFile, "r")
# Process this file and turn it into a dictionary
colors = dict()
for line in cfh:
    splitLine = line.strip().split("\t")
    # Keys in dictionary are tissue, e.g. Adipose_Subcutaneous
    colors[splitLine[0]] = splitLine

# Read in bed file
ifh = open(bedFile, "r")
for line in ifh:
    splitLine = line.strip().split("\t")
    # Extract tisue information from bed file
    tissue = splitLine[18]
    # If tissue is in color dict keys, then we're good
    if tissue in colors.keys():
        color = colors[tissue][2]
        # Set color field equal to tissue color
        splitLine[8] = color
    # Output modified line to stdout, can save to file by redirecting stdout to a file
    print(*splitLine, sep="\t")

'_EOF_'

# Then add colors:
./addTissueColors gtexCaviar.sorted.bed ../gtexTissueColor.v8.tab > gtexCaviar.sorted.colors.bed

# Create custom as file for this bigBed:
cat << '_EOF_' > dapg.as
table bed
"bed12+9 describing interaction between an eQTL and a target gene"
    (
    string          chrom;       "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records"
    uint            chromStart;  "Start position of eQTL"
    uint            chromEnd;    "End position of gene"
    string          name;        "Name of item for display in pattern eQTL name/Gene Symbol/Tissue"
    uint            score;       "Score. Always 1000."
    char[1]         strand;      "+ or - for strand"
    uint            thickStart;  "eQTL start"
    uint            thickEnd;    "eQTL end"
    uint            reserved;    "Item color, based on tissue colors GTEx Gene track"
    int             blockCount;  "Number of blocks"
    int[blockCount] blockSizes;  "Comma separated list of block sizes"
    int[blockCount] chromStarts; "Start positions of blocks relative to chromStart"
    string          eqtlPos;     "eQTL position"
    string          eqtlName;    "eQTL name (most often a dbSNP identifier)"
    string          genePos;     "Gene position"
    string          geneName;    "Gene symbol of target gene"
    string          geneId;      "ENSG ID of target gene"
    string          geneStrand;  "Strand of target gene"
    string          tissue;      "Tissue"
    double          cpp;         "Causal Posterior Probability (CPP) that quantifies the probability that a variant is causal"
    )
'_EOF_'

# Build bigBed
bedToBigBed -as=caviar.as -type=bed12+8 gtexCaviar.sorted.colors.bed /hive/data/genomes/hg38/chrom.sizes gtexCaviar.bb

#############################################################################
# GTEx V8 cis-eQTLs DAP-G High Confidence (Apr 2022) Matt

cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_dap-g

# Download files from
wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_DAPG.tar

# unpack files
tar xvf GTEx_v8_finemapping_DAPG.tar

# Most files in VCF, but there is which is a TSV, GTEx_v8_finemapping_DAPG.txt.gz
# Sample header lines
tissue_id       gene_id cluster_id      cluster_pip     variant_id      variant_pip
Adipose_Subcutaneous    ENSG00000000457.13      1       1.000   chr1_169891332_G_A_b38  9.881e-01
Adipose_Subcutaneous    ENSG00000000457.13      1       1.000   chr1_169913898_G_A_b38  1.170e-02
Adipose_Subcutaneous    ENSG00000000457.13      2       0.994   chr1_169894024_A_C_b38  1.208e-01

# That's a big file:
zcat GTEx_v8_finemapping_DAPG.txt.gz |wc -l
21648585
# How can we filter that down?
# Looks like they provide a filtered file, GTEx_v8_finemapping_DAPG.CS95.txt.gz,
# but it's in VCF format. Can we replicate it using their TSV file?

# That VCF is filtered on 'SPIP' only including those clusters with SPIP > 0.95, where
# SPIP: signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster)
# SPIP seems equivalent to the 'cluster_pip' in our TSV file
# So, create a new file where we only include those entries with cluster pip > 0.95
zcat GTEx_v8_finemapping_DAPG.txt.gz | awk '$4>0.95' | gzip -c > GTEx_v8_finemapping_DAPG.pip_95.txt.gz

# A much smaller file (though still quite a lot of variants!)
zcat GTEx_v8_finemapping_DAPG.pip_95.txt.gz |wc -l
7608545

# Wrote buildGtexEqtlBed script to help build bed files from eQTL files:
cat << '_EOF_' > buildGtexEqtlBed
#!/usr/bin/env python3

import sys, gzip

qtlFile = sys.argv[1]
gpFile = sys.argv[2]
alleleFile = sys.argv[3]
qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL
probCol = sys.argv[5] # Column name in file header, e.g. Probability
# DAP-G files have an extra probability column that we want to capture
if len(sys.argv) > 6:
    clusterProbCol = sys.argv[6]
else:
    clusterProbCol = False

# Open up all of our files
qfh = gzip.open(qtlFile,"r") # QTL file
gfh = open(gpFile,"r") # genePred file
afh = gzip.open(alleleFile,"r") # allele file

# Set up dicts for each
qtlDict = dict()
gpDict = dict()
alleleDict = dict()

# Get index of field that we want to replace values in
header = qfh.readline().decode('ASCII').rstrip().split("\t")
qtl_col = header.index(qtlIdCol)
prob_col = header.index(probCol)
if clusterProbCol:
    clus_prob_col = header.index(clusterProbCol)

# Process eQTL file
for line in qfh:
    qtlLine = line.decode('ASCII').strip().split("\t")
    # Skip header line in file
    #if "TISSUE" not in qtlLine:
    tissue = qtlLine[0]
    gene = qtlLine[1]
    e_qtl = qtlLine[qtl_col]
    # each of these elements on their own aren't unique, but together they are
    qtlKey = e_qtl + "|" + tissue + "|" + gene
    # Put these into a dict where key is unique name we've made
    qtlDict[qtlKey] = qtlLine

# Process genePred file
for line in gfh:
    gpLine = line.strip().split("\t")
    gene_id = gpLine[0]
    # Put into dict with key on ENSG* ID
    gpDict[gene_id] = gpLine

# Process allele file from GTEx
for line in afh:
    # gzip file, so we need to decode to asii first
    aLine = line.decode('ASCII').strip().split("\t")

    # Key our dict using the allele, which is unique in the file
    allele = aLine[0]
    alleleDict[allele] = aLine

for entry in qtlDict:
    # Get gene information from the genePred dict
    # eQTL file/dict has ENSG id in it
    gene_id = qtlDict[entry][1]
    if gene_id in gpDict.keys():
        gene_sym = gpDict[gene_id][11]
        gene_chrom = gpDict[gene_id][1]
        gene_strand = gpDict[gene_id][2]
        gene_start = int(gpDict[gene_id][3])
        gene_end = int(gpDict[gene_id][4])
    # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID
    var_name = qtlDict[entry][qtl_col]
    if var_name in alleleDict.keys():
        rsID = alleleDict[var_name][6]
        if rsID == '.':
            rsID = var_name
        var_start = int(alleleDict[var_name][2]) - 1
        var_end = int(alleleDict[var_name][2])
        chrom = alleleDict[var_name][1]

    # This block of if/else sets up the blockSizes/blockStart/blockCount for
    # various orientations of eQTLs/genes
    if chrom == gene_chrom:
        # Handles case where eQTL is internal to a gene
        if var_start > gene_start and var_start < gene_end:
            chromStart = gene_start
            chromEnd = gene_end
            blockCount = "1"
            blockSizes = gene_end - gene_start
            blockStarts = "0"
            strand = "."
        # Handles case where eQTL is downstream of gene
        elif var_start > gene_end:
            chromStart = gene_start
            chromEnd = var_end
            bs1 = gene_end - gene_start
            blockCount = "2"
            blockSizes = str(bs1) + ",1"
            blockStarts = "0," + str(var_start - gene_start)
            strand = "-"
        # Handles case where eQTL is upstream of gene
        elif var_start < gene_start:
            chromStart = var_start
            chromEnd = gene_end
            blockCount = "2"
            bs2 = gene_end - gene_start
            blockSizes = "1," + str(bs2)
            blockStarts = "0," + str(gene_start - var_start)
            strand = "+"
        # Handles case where eQTL and start of gene overlap
        elif var_start == gene_start:
            chromStart = var_start
            chromEnd = gene_end
            blockCount = "1"
            blockSizes = str(gene_end - var_start)
            blockStarts = "0"
            strand = "."
        # Handles case where eQTL and end of gene overlap
        elif gene_end == var_start:
            chromStart = gene_start
            chromEnd = var_end
            blockCount = "1"
            blockSizes = str(var_end - gene_start)
            blockStarts = "0"
            strand = "."

        # Add each element of our bed line to a list to print later
        bedLine = list()
        bedLine.append(chrom) # chrom
        bedLine.append(chromStart) # chrStart
        bedLine.append(chromEnd) # chrEnd
        bedLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name
        bedLine.append(int("1000"))# score
        bedLine.append(strand)# strand
        bedLine.append(var_start) # thickStart
        bedLine.append(var_end) # thickEnd
        bedLine.append("0,0,0") #color - eventually replaced by 'addTissueColors' script
        bedLine.append(blockCount) # blockCount
        bedLine.append(blockSizes) # blockSizes
        bedLine.append(blockStarts) # blockSizes
        bedLine.append(chrom + ":" + str(var_start+1) + "-" + str(var_end)) # prettified gene pos
        bedLine.append(rsID) # sourceName
        bedLine.append(chrom + ":" + str(gene_start+1) + "-" + str(gene_end)) # prettified gene pos
        bedLine.append(gene_sym) # geneName
        bedLine.append(gene_id) # geneId
        bedLine.append(gene_strand) # targetStrand
        bedLine.append(qtlDict[entry][0]) # tissue
        bedLine.append(qtlDict[entry][prob_col]) # CAVIAR == Causal Posterior Probability; DAPG == SNP posterior inclusion probability
        if clusterProbCol:
            # Signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster)
            bedLine.append(qtlDict[entry][clus_prob_col])

    # Print bed line to stdout, can redirect to a file to save it
    print(*bedLine, sep="\t")
'_EOF_'


# Then run buildGtexEqtlBed to build an bed file
./buildGtexEqtlBed GTEx_v8_finemapping_DAPG.pip_95.txt.gz ../gencode.v26.GRCh38.genes.gpExt \
  ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz variant_id variant_pip cluster_pip | \
  bedSort stdin gtexDapg.sorted.bed

# Create script to add colors to bed file
cat << '_EOF_' > addTissueColors

#!/usr/bin/env python3

####
# Intially created bed file w/o colors
# This script will replace col 9 with tissue-specific colors
####

import sys

bedFile = sys.argv[1]
colorFile = sys.argv[2]

# Read in color file from //
cfh = open(colorFile, "r")
# Process this file and turn it into a dictionary
colors = dict()
for line in cfh:
    splitLine = line.strip().split("\t")
    # Keys in dictionary are tissue, e.g. Adipose_Subcutaneous
    colors[splitLine[0]] = splitLine

# Read in bed file
ifh = open(bedFile, "r")
for line in ifh:
    splitLine = line.strip().split("\t")
    # Extract tisue information from bed file
    tissue = splitLine[18]
    # If tissue is in color dict keys, then we're good
    if tissue in colors.keys():
        color = colors[tissue][2]
        # Set color field equal to tissue color
        splitLine[8] = color
    # Output modified line to stdout, can save to file by redirecting stdout to a file
    print(*splitLine, sep="\t")

'_EOF_'

# Then add colors:
./addTissueColors gtexDapg.sorted.bed ../gtexTissueColor.v8.tab > gtexDapg.sorted.colors.bed

# Create custom as file for this bigBed:
cat << '_EOF_' > dapg.as
table bed
"bed12+9 describing interaction between an eQTL and a target gene"
    (
    string          chrom;       "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records"
    uint            chromStart;  "Start position of eQTL"
    uint            chromEnd;    "End position of gene"
    string          name;        "Name of item for display in pattern eQTL name/Gene Symbol/Tissue"
    uint            score;       "Score. Always 1000."
    char[1]         strand;      "+ or - for strand"
    uint            thickStart;  "eQTL start"
    uint            thickEnd;    "eQTL end"
    uint            reserved;    "Item color, based on tissue colors GTEx Gene track"
    int             blockCount;  "Number of blocks"
    int[blockCount] blockSizes;  "Comma separated list of block sizes"
    int[blockCount] chromStarts; "Start positions of blocks relative to chromStart"
    string          eqtlPos;     "eQTL position"
    string          eqtlName;    "eQTL name (most often a dbSNP identifier)"
    string          genePos;     "Gene position"
    string          geneName;    "Gene symbol of target gene"
    string          geneId;      "ENSG ID of target gene"
    string          geneStrand;  "Strand of target gene"
    string          tissue;      "Tissue"
    double          pip;         "SNP posterior inclusion probability (higher PIP value indicates the SNP is more likely to be the casual eQTL)"
    double          clusterPip;  "Signal-level posterior inclusion probability (sum of the PIPs from all members of the signal cluster)"
    )
'_EOF_'

# Build bigBed
bedToBigBed -as=dapg.as -type=bed12+9 gtexDapg.sorted.colors.bed /hive/data/genomes/hg38/chrom.sizes gtexDapg.bb

#############################################################################
# GTEx V9 Single Nuclei Expression August 31, 2022 ChrisL
#############################################################################
For building single cell rna-seq tracks from this paper:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9383269/

Files:
https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad

WORKDIR=/hive/data/outside/GTEx/V9/snRnaSeq/
mkdir -p $WORKDIR && cd $WORKDIR

wget https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad

# Turn scanpy files into matrices and metadata:
time hcaUnpack5 GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad atlasOut
# Data on 209126 cells x 17695 genes
# Sparse matrix with 173681687 elements
# outputting 17695 row matrix, a dot every 100 rows
# 
# real    3m24.718s
# user    2m39.348s
# sys 0m27.629s

# Look at the column summaries to see what is interesting:
./getSimpleMetaStats atlasOut/meta.tsv | less -S
cd atlasOut
mkdir bbi
mkdir clust
~/bin/x86_64/matrixClusterColumns -makeIndex=clust/exprMatrix.ix exprMatrix.tsv meta.tsv \
    prep clust/prepMatrix.tsv clust/prepStats.tsv \
    "Tissue Composition" clust/TissueCompMatrix.tsv clust/TissueCompStats.tsv \
    "Tissue Site Detail" clust/TissueSiteDetailMatrix.tsv clust/TissueSiteDetailStats.tsv \
    "Sample Ischemic Time (mins)" clust/SampleIschemicTimeMatrix.tsv clust/SampleIschemicTimeStats.tsv \
    "Granular cell type" clust/GranularMatrix.tsv clust/GranularStats.tsv \
    "Broad cell type" clust/BroadCellTypeMatrix.tsv clust/BroadCellTypeStats.tsv \
    Age_bin clust/Age_binMatrix.tsv clust/Age_binStats.tsv \
    Sex clust/SexMatrix.tsv clust/SexStats.tsv

# Figure out gene set
gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed

# Turn into bigBarCharts
for s in prep TissueComp TissueSiteDetail SampleIschemicTime Granular BroadCellType Age_bin Sex
do
    matrixToBarChartBed clust/${s}Matrix.tsv mapping.bed clust/${s}.bed -stats=clust/${s}Stats.tsv -trackDb=clust/${s}.ra
    bedSort clust/${s}.bed clust/${s}.bed
    bedToBigBed clust/${s}.bed /hive/data/genomes/hg38/chrom.sizes bbi/${s}.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as
done

# Make a combined column for facetting
tawk '{print $0,$31 "-" $46}' meta.tsv > meta.tsv.plusCombined
~/bin/x86_64/matrixClusterColumns exprMatrix.tsv meta.tsv.plusCombined \
    "Tissue-Broad cell type" clust/combined.tsv clust/combinedStats.tsv
matrixToBarChartBed clust/combined.tsv mapping.bed clust/combined.bed -stats=clust/combinedStats.tsv -trackDb=clust/combined.ra
bedSort clust/combined.bed clust/combined.bed 
bedToBigBed clust/combined.bed /hive/data/genomes/hg38/chrom.sizes bbi/combined.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as

# now un extract the original combined columns back into the counted up meta.tsv
# which will give you the columns to facet on
echo -e "#Organ and Cell Type	count	total	Organ	Broad_Cell_Type" > combinedStats.withFacets
paste <(tail -n +2 clust/combinedStats.tsv) <(tail -n +2 meta.tsv | cut -f31,46 | sort -u) >> combinedStats.withFacets
# manually make cells.colors by taking colors for Broad Cell Type from the paper
# make categories file for trackDb
join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | cut -f2,6 | sort > atlasOut.categories
# Make stats file for trackDb 
echo -e "#Organ and Cell Type      count   total   Organ   Broad_Cell_Type color" > combinedStats.withFacets.withColors.tsv
join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | tawk '{print $2,$3,$4,$5,$1,$6}' | sort >> combinedStats.withFacets.withColors.tsv 

# link the tracks
cd /gbdb/hg38/gtex
mkdir snRnaSeq
ln -s /hive/data/outside/gtex/V9/snRnaSeq/atlasOut/bbi/*.bb .
