# GWAS from the COVID-19 Host Genetics Initiative (HGI)
# (2020-07-02 kate)

# From: covid19hg.org/results

# Contacts:  Rachel Liao,  Juha Karjalainen (Broad)
juha.karjalainen@helsinki.fi

# Create build dir
cd /hive/data/outside/covid/covidHostGenetics

# GWAS meta-analyses file format

1 #CHR    chromosome
2 POS     chromosome position in build 37
3 REF     non-effect allele
4 ALT     effect allele (beta is for this allele)
5 SNP     #CHR:POS:REF:ALT
{STUDY}_AF_Allele2      allele frequency in {STUDY}
{STUDY}_AF_fc   allele frequency in {SeUDY} / allele frequency in gnomAD v3 (1000000 if frequency in gnomAD is 0). Calculated based on each study'ss ancestry in gnomAD
{STUDY}_N
6 + (X = #studies * 3) all_meta_N      number of studies that had the variant after AF and INFO filtering and as such were used for the meta
7 + X all_inv_var_meta_beta   effect size on log(OR) scale
8 + X all_inv_var_meta_sebeta standard error of effect size
9 + X all_inv_var_meta_p      p-value
10 + X all_inv_var_het_p       p-value from Cochran's Q heterogeneity test

# additional columns:
11 + X "all_meta_sample_N"
12 + X "all_meta_AF"
13 + X "rsid"

# additional for hg19 liftover. Values in hg38.
14 + X "anew_chr"
15 + X "anew_pos"
16 + X "REF.1"
17 + X "ALT.1"

# Studies

1 Genetic determinants of COVID-19 complications in the Brazilian population      BRACOVID
2 Genetic modifiers for COVID-19 related illness  BelCovid
3 deCODE  DECODE
4 FinnGen FinnGen
5 GEN-COVID, reCOVID      GENCOVID
6 UK 100,000 Genomes Project      genomicsengland100kgp_EUR
7 Genes & Health  GNH
8 Generation Scotland     GS
9 COVID19-Host(a)ge       HOSTAGE
10 Helix Exome+ COVID-19 Phenotypes        Helix
11 UK Blood Donors Cohort  INTERVAL
12 LifeLines CytoSNP       LifelinesCyto
13 LifeLines Global Screening Array        LifelinesGsa
14 Netherlands Twin Register       NTR
15 Partners Healthcare Biobank     PHBB
16 Qatar Genome Program    QGP
17 UK Biobank      UKBB

#####################
# Consult with Ana on Aug 12

1. Restrict to analyses with enough power (based on plots):
        B2: hospitalized covid vs. population  (3199 cases, 8 studies)
        C2: covid vs. population (6696 cases, 18 studies)

2. Label options: rsID and/or ALT/REF (use ALT/REF if no label)

3. Mouseover: pValue, effect size, #studies

4. Filters: pValue (default=5), #studies,

######################
# Make with new .as (input from Juha), and hg38 tracks
# (2020-09-04 kate)

# Analysis B2: hospitalized covid vs. population
# cases: 3199
# studies: 8

# Analysis C2: covid vs. population
# cases: 6696
# studies: 18


# download hg19 files
wget https://storage.googleapis.com/covid19-hg-public/20200619/results/build_37/COVID19_HGI_ANA_B2_V2_20200701.b37.txt.gz
wget https://storage.googleapis.com/covid19-hg-public/20200619/results/build_37/COVID19_HGI_ANA_C2_V2_20200701.b37.txt.gz

# download hg38 files
wget https://storage.googleapis.com/covid19-hg-public/20200619/results/COVID19_HGI_ANA_B2_V2_20200701.txt.gz
wget https://storage.googleapis.com/covid19-hg-public/20200619/results/COVID19_HGI_ANA_C2_V2_20200701.txt.gz
gunzip *.z

wc -l covidHgiGwas*.hg38.txt
   15392647 covidHgiGwas.B2.hg38.txt
   24600933 covidHgiGwas.C2.hg38.txt


# rename
ln -s COVID19_HGI_ANA_B2_V2_20200701.txt covidHgiGwas.B2.hg38.txt
ln -s COVID19_HGI_ANA_C2_V2_20200701.txt covidHgiGwas.C2.hg38.txt
ln -s COVID19_HGI_ANA_B2_V2_20200701.b37.txt covidHgiGwas.B2.hg19.txt
ln -s COVID19_HGI_ANA_C2_V2_20200701.b37.txt covidHgiGwas.C2.hg19.txt

wc -l covidHgiGwas*
   15392647 covidHgiGwas.B2.hg38.txt
   24600933 covidHgiGwas.C2.hg38.txt

mkdir -p /gbdb/hg19/covidHgiGwas /gbdb/hg38/covidHgiGwas

cat > makeHgi.csh << 'EOF'
set bin = ~/kent/src/hg/makeDb/outside/covid
foreach d (B2.hg19 B2.hg38 C2.hg19 C2.hg38)
    set db = $d:e
    set a = $d:r
    set sizes = /hive/data/genomes/$db/chrom.sizes
    set b = covidHgiGwas
    set f = $b.$a.$db
    set bb = $b$a.$db
    echo $b.$a.$db
    if ($a == "B2") then
        set studies = 8
    else
        set studies = 18
    endif
    perl $bin/makeCovidHgiGwas.pl $db $studies $f.txt > $f.bed
    bedSort $f.bed $f.sorted.bed
    bedToBigBed -type=bed9+12 -as=$bin/covidHgiGwas.as -tab $f.sorted.bed $sizes $bb.bb
    ln -s `pwd`/$bb.bb /gbdb/hg19/covidHgiGwas
end
'EOF'

make makeHgi.csh >&! makeHgi.out &

##################
# Get effect size distribution to inform color gradient selection

% textHistogram -real -minVal=-25  -binSize=1 -maxBinCount=50  effectSizes.txt 
-13.000000  1
-12.000000  1
-11.000000  0
-10.000000  2
-9.000000  3
-8.000000  6
-7.000000  6
-6.000000 * 19
-5.000000 *** 53
-4.000000 ********************************************************* 944
-3.000000 ************************************************************ 1000
-2.000000 ************************************************************ 1000
-1.000000 ************************************************************ 1000
0.000000 ************************************************************ 1001
1.000000 ************************************************************ 1000
2.000000 ************************************************************ 1000
3.000000 ************************************************************ 1000
4.000000 ************************************************************ 994
5.000000 ********************************************************** 971
6.000000 **************************************************** 866
7.000000 ******************************************* 717
8.000000 ****************************** 504
9.000000 ********************** 371
10.000000 *********** 183
11.000000 ***** 78
12.000000 *** 43
13.000000 ** 37
14.000000 * 13
15.000000  4
16.000000  6
17.000000  2
18.000000  0
19.000000  0
20.000000  1


##################################
# Release 4 (October 2020)
# Dec. 2020 Kate

#23 contributors world-wide, including 23 and me
#14 European contributors in 23andme leave-out analysis

# download, all leaving out 23andme (to allow full download)
# using 4 analyses this time, as per Ana recommendation (enough power now in A2 and C1)

# Note the analysis methods have changed slightly.  Effect size ranges are much smaller.

B2: Hospitalized Covid vs population
Cases: 8638 - 23andme (613+140) =  7885
Controls: 1736547 - 23andme (680416+94327) = 961804

C2: Covid vs population
Cases: 30937 - 23andme (506+9913+2553) = 17965
Controls: 1471815 - 23andme (3110+85072+13086) =

A2: Very severe respratory confirmed covid vs population
Cases: 4933 - 23andme (495 + 102) = 4336
Controls: 1398672 (680440 + 94330) = 623902

C1: Covid vs. lab/self-reported negative
Cases: 24057 - 23andme (506 + 9913 + 2553) = 11085
Controls: 218062 - 23andme (85072 + 13086 + 3110) = 116794
Studies: 23 - 23andme (3) = 20

# hg38
wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_B2_ALL_leave_23andme_20201020.txt.gz
wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_C2_ALL_leave_23andme_20201020.txt.gz

wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_A2_ALL_leave_23andme_20201020.txt.gz
wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_C1_ALL_leave_23andme_20201020.txt.gz

# hg19
wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_B2_ALL_leave_23andme_20201020.b37.txt.gz
wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_C2_ALL_leave_23andme_20201020.b37.txt.gz

wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_A2_ALL_leave_23andme_20201020.b37.txt.gz
wget https://storage.googleapis.com/covid19-hg-public/20200915/results/20201020/COVID19_HGI_C1_ALL_leave_23andme_20201020.b37.txt.gz

gunzip *.z

# rename
ln -s COVID19_HGI_B2_ALL_leave_23andme_20201020.b37.txt covidHgiGwasR4.B2.hg19.txt
ln -s COVID19_HGI_B2_ALL_leave_23andme_20201020.txt covidHgiGwasR4.B2.hg38.txt
ln -s COVID19_HGI_C2_ALL_leave_23andme_20201020.b37.txt covidHgiGwasR4.C2.hg19.txt
ln -s COVID19_HGI_C2_ALL_leave_23andme_20201020.txt covidHgiGwasR4.C2.hg38.txt

ln -s COVID19_HGI_A2_ALL_leave_23andme_20201020.txt covidHgiGwasR4.A2.hg38.txt
ln -s COVID19_HGI_A2_ALL_leave_23andme_20201020.b37.txt covidHgiGwasR4.A2.hg19.txt
ln -s COVID19_HGI_C1_ALL_leave_23andme_20201020.txt covidHgiGwasR4.C1.hg38.txt
ln -s COVID19_HGI_C1_ALL_leave_23andme_20201020.b37.txt covidHgiGwasR4.C1.hg19.txt

wc -l covidHgiGwas*.txt

# previous (release 3) track
#15392647 covidHgiGwas.B2.hg38.txt
# 15M
#24600933 covidHgiGwas.C2.hg38.txt

   11830413 covidHgiGwasR4.A2.hg19.txt
   11846556 covidHgiGwasR4.A2.hg38.txt
   11434012 covidHgiGwasR4.B2.hg19.txt
   11449434 covidHgiGwasR4.B2.hg38.txt
# 11M
   13060711 covidHgiGwasR4.C1.hg19.txt
   13075183 covidHgiGwasR4.C1.hg38.txt
   12706061 covidHgiGwasR4.C2.hg19.txt
   12722892 covidHgiGwasR4.C2.hg38.txt

# Note more filtered than release 3 data


# Columns
#
# head -2 covidHgiGwasR4.B2.hg19.txt

CHR    POS     REF     ALT     SNP
all_meta_N      all_inv_var_meta_beta   all_inv_var_meta_sebeta all_inv_var_meta_p      all_inv_var_het_p
all_meta_sample_N       all_meta_AF     rsid

# 13 fields

# These are same as Release 3, except:
# 1. hg19 lacks trailing columns from lift
# 2. there are no study-specific values (between SNP and all_meta_N)

# determine range of effect size for filter settings, excluding low significance (log pvalue<3 items)

# parse and generate bigBed files

cat > makeHgi.csh << 'EOF'
    set bin = ~/kent/src/hg/makeDb/outside/covid
foreach d (B2.hg19 B2.hg38 C2.hg19 C2.hg38 A2.hg19 A2.hg38 C1.hg19 C1.hg38)
    set db = $d:e
    set a = $d:r
    set sizes = /hive/data/genomes/$db/chrom.sizes
    set b = covidHgiGwasR4
    set f = $b.$a.$db
    set bb = $b$a.$db
    echo $b.$a.$db
    if ($a == "B2") then
        set studies = 21
    else if ($a == "C2") then
        set studies = 33
    else if ($a == "A2") then
        set studies = 12
    else if ($a == "C1") then
        set studies = 20
    endif

    # colored by pvalue  (effect size determines lolly height)
    # NOTE: this track will not be on RR (perhaps will host on genome-test for comparison with
    #   earlier data
    perl $bin/makeCovidHgiGwas.pl $db $studies $f.txt R4 > $f.bed
    bedSort $f.bed $f.sorted.bed
    bedToBigBed -type=bed9+12 -as=$bin/covidHgiGwas.as -tab $f.sorted.bed $sizes $bb.bb
    ln -s `pwd`/$bb.bb /gbdb/$db/covidHgiGwas

    # colored by effect size (pvalue determines lolly height)
    #perl $bin/makeCovidHgiGwas.pl $db $studies $f.txt R4 .025 > $f.EC.bed
# Use medians: positive theshold .178, negative threshold .119
    perl $bin/makeCovidHgiGwas.pl $db $studies $f.txt R4 .066 .115 > $f.EC.bed
    bedSort $f.EC.bed $f.sorted.EC.bed
    bedToBigBed -type=bed9+12 -as=$bin/covidHgiGwas.as -tab $f.sorted.EC.bed $sizes $bb.EC.bb
    ln -s `pwd`/$bb.EC.bb /gbdb/$db/covidHgiGwas
    end
'EOF'

# At Ana's suggestion, remake track with per-track effect size coloring thresholds

cat >getEffectSizes.csh << 'EOF'
foreach a (A2 B2 C1 C2)
    echo -n "$a\t"
    awk '$13 >= 3.0 {print}' covidHgiGwasR4.$a.hg38.EC.bed > $a.highPvalues.bed
    awk '$10 < 0 {print $21}' $a.highPvalues.bed | sort -n >  negEffects.$a.txt
    awk '$10 > 0 {print $21}' $a.highPvalues.bed | sort -n > posEffects.$a.txt
    set neg = `ave negEffects.$a.txt  | grep median | awk '{print $2}'`
    set pos = `ave posEffects.$a.txt  | grep median | awk '{print $2}'`
    echo "$neg\t$pos"
end
'EOF'

cat getEffectsizes.out
A2      0.130000        0.276000
B2      0.118000        0.206000
C1      0.094000        0.169000
C2      0.066000        0.115000


cat > reColor.csh << 'EOF'
    set bin = ~/kent/src/hg/makeDb/outside/covid
foreach d (A2.hg38 B2.hg38 C1.hg38 C2.hg38 C2.hg19 A2.hg19 C1.hg19 B2.hg19 )
    set db = $d:e
    set a = $d:r
    set sizes = /hive/data/genomes/$db/chrom.sizes
    set b = covidHgiGwasR4
    set f = $b.$a.$db
    set bb = $b.$a.$db
    echo $b.$a.$db
    if ($a == "A2") then
        set studies = 12
        # median negative and positive effect sizes
        set neg = 0.130000; set pos = 0.276000
    else if ($a == "B2") then
        set studies = 21
        set neg = 0.118000 ; set pos = 0.206000
    else if ($a == "C1") then
        set studies = 20
        set neg = 0.094000; set pos = 0.169000
    else if ($a == "C2") then
        set studies = 33
        set neg = 0.066000; set pos = 0.115000
    endif

    # colored by pvalue  (effect size determines lolly height)
    #perl $bin/makeCovidHgiGwas.pl $db $studies $f.txt R4 > $f.beta.bed
    #bedSort $f.beta.bed $f.beta.sorted.bed
    #bedToBigBed -type=bed9+12 -as=$bin/covidHgiGwas.as -tab $f.beta.sorted.bed $sizes $bb.beta.bb
    #ln -s `pwd`/$bb.beta.bb /gbdb/$db/covidHgiGwas

    # colored by effect size (pvalue determines lolly height)
    perl $bin/makeCovidHgiGwas.pl $db $studies $f.txt R4 $neg $pos > $f.bed
    bedSort $f.bed $f.sorted.bed
    bedToBigBed -type=bed9+12 -as=$bin/covidHgiGwas.as -tab $f.sorted.bed $sizes $bb.bb
    ln -s `pwd`/$bb.bb /gbdb/$db/covidHgiGwas
    end
'EOF'

csh reColor.csh >&! reColor.out &
tail -100f reColor.out

