#############################################################################
# Illumina Infinium Microarrays - May 2022 - Jairo
# 
# Fetch the zipped EPIC v1.0 B5 manifest CSV from Illumina
wget https://webdata.illumina.com/downloads/productfiles/methylationEPIC/infinium-methylationepic-v-1-0-b5-manifest-file-csv.zip

# Decompress the archive to stdout and save it as a plain CSV
gunzip -c infinium-methylationepic-v-1-0-b5-manifest-file-csv.zip > infinium-methylationepic-v-1-0-b5-manifest-file.csv

# Convert every comma to a tab, editing the CSV in place
sed -i 's/,/\t/g' infinium-methylationepic-v-1-0-b5-manifest-file.csv

# Pull the columns needed for the BED file out of the tab-separated manifest.
# Manifest columns, listed in output order:
#   49 - chrom
#   50 - start
#   51 - end
#    1 - Illumina ID
#    0 - score (not used; a literal 0 is printed)
#   52 - strand
#   17 - UCSC_RefGene_Accession
#   44 - SNP_ID (rsID)
#   46 - SNP_MinorAlleleFrequency
#    7 - Infinium_Design_Type
#    4 - AlleleA_ProbeSeq
#    6 - AlleleB_ProbeSeq

awk -F'\t' '{ print $49, $50, $51, $1, 0, $52, $17, $44, $46, $7, $4, $6 }' infinium-methylationepic-v-1-0-b5-manifest-file.csv > hg38.EPIC.bed

# Drop every line containing "NA" (probes whose hg38 chrom, start, and end are NA)
grep -v -e "NA" hg38.EPIC.bed > EPIC.hg38

# Drop lines that begin with a space, i.e. have an empty first column (controls)
grep -v -e "^ " EPIC.hg38 > EPIC.hg38.clean

# Sort by chrom, then numerically by start
sort -k1,1 -k2,2n -o EPIC.hg38.final.bed EPIC.hg38.clean

# Turn the space separators written by awk into tabs
sed -i 's/ /\t/g' EPIC.hg38.final.bed

# Create a list of possible 450K sites: probe ID plus manifest column 24,
# which is TRUE for probes that are also 450K sites
# (presumably the Methyl450_Loci column - confirm against the manifest header)
awk -F'\t' '{ print $1,$24 }' infinium-methylationepic-v-1-0-b5-manifest-file.csv > possible450K.sites
# Get the list of 450K sites by name
grep "TRUE" possible450K.sites | awk -F' ' '{ print $1 }' > 450K.only
# Filter the 850K bed file down to those sites.
# -F treats the IDs as fixed strings (far faster than ~450K regexes) and -w
# requires whole-word matches so one ID cannot match inside a longer one.
# Run in the foreground: the bigBed step reads 450k.final.bed and must not
# start before this file is completely written.
grep -F -w -f 450K.only EPIC.hg38.final.bed > 450k.final.bed

# Build one bigBed per array (bed6 plus 6 extra fields, tab-separated input)
bedToBigBed -type=bed6+6 -tab -as=epic.as EPIC.hg38.final.bed hg38.chrom.sizes EPIC.bb
bedToBigBed -type=bed6+6 -tab -as=450k.as 450k.final.bed hg38.chrom.sizes 450k.bb

# EPIC 850K: copy into /hive, then symlink from /gbdb
cp EPIC.bb /hive/data/genomes/hg38/bed/illumina/epic850K.bb
ln -s /hive/data/genomes/hg38/bed/illumina/epic850K.bb /gbdb/hg38/bbi/illumina/epic850K.bb

# 450K: copy into /hive, then symlink from /gbdb
cp 450k.bb /hive/data/genomes/hg38/bed/illumina/illumina450K.bb
ln -s /hive/data/genomes/hg38/bed/illumina/illumina450K.bb /gbdb/hg38/bbi/illumina/illumina450K.bb

#############################################################################
# Illumina CytoSNP-850K Probe array - June 2022 - Jairo
# 
# Fetch the zipped CytoSNP-850K v1.2 manifest CSV from Illumina
wget https://webdata.illumina.com/downloads/productfiles/cytosnp-850k/v1-2/cytosnp-850k-v1-2-ns550-d2-manifest-file-csv.zip

# Extract the archive
unzip cytosnp-850k-v1-2-ns550-d2-manifest-file-csv.zip

# Give the extracted CSV a simpler name (the original name contains a space)
mv "CytoSNP-850Kv1-2_NS550_D2 CSV.csv" cytosnp-850k-hg38.csv

# Strip the header from the CSV (done by hand in the editor)
vi cytosnp-850k-hg38.csv

# Convert every comma to a tab, editing the CSV in place
sed -i 's/,/\t/g' cytosnp-850k-hg38.csv

# Get necessary fields and write tab-separated bed6+15 rows:
#   chrom = "chr" + column 10
#   start = column 11 - 1, end = column 11
#   name  = column 1, score = constant 0, strand = column 21
#   followed by 15 extra manifest columns
awk -F'\t' -vOFS='\t' '{ print "chr"$10,$11-1,$11,$1,0,$21,$3,$2,$4,$5,$6,$7,$8,$12,$14,$15,$16,$17,$18,$19,$20 }' cytosnp-850k-hg38.csv > cytoSnp.bed

# Sorting the file by chrom, then numerically by start
sort -k1,1 -k2,2n cytoSnp.bed > cytoSnp.bed.sorted

# Create the bigbed from the sorted file.
# NOTE(review): this step was originally recorded as reading
# cytoSnp.bed.final, a file that no recorded command creates. If rows were
# removed by hand before this step (e.g. chromosomes missing from
# hg38.chrom.sizes), that cleanup still has to be applied to
# cytoSnp.bed.sorted - confirm with the track author.
bedToBigBed -tab -type=bed6+15 -as=../cytoSNP.as cytoSnp.bed.sorted http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes cytoSnp.bb

# Move the bigBed into /hive and create symlinks to /gbdb
cp cytoSnp.bb /hive/data/genomes/hg38/bed/cytoSnp/cytoSnp850k.bb
ln -s /hive/data/genomes/hg38/bed/cytoSnp/cytoSnp850k.bb /gbdb/hg38/bbi/cytoSnp/cytoSnp850k.bb

#############################################################################

# Illumina Methylation EPICv2 - May 2025 - Lou

#EPIC2 files downloaded from https://support.illumina.com/array/array_kits/infinium-methylationepic-beadchip-kit/downloads.html
#.as file details: https://hgwdev.gi.ucsc.edu/~lrnassar/temp/methylationepic%20v2-manifest-column-headings%20A2.pdf

#e.g of line 1:
#cg25324105_BC11,cg25324105,1754126,ATTTATAAACTAATAACCCAAAATACATTTCCCAAAAACCTTCACAACCA,99753217,GTTTATAAACTAATAACCCGAAATACATTTCCCAAAAACCTTCGCGACCG,A,Red,R,cg,F,B,C,1,I,19,37692358,Human,GRCh

#The coordinate, 37692358, needs to become -1 and +1: chr19 37692357 37692359

#For hg19, same thing. e.g. 38183259 should be:
#chr19 38183258 38183260

#The plan for how to make the file:
#add chr to the start of all strings in field 16 and field 50

#Make a new field 52 called hg38chromStart that is field 17-1 and field 53 called hg38chromEnd that is field 17+1
#Make a new field 54 called hg19chromStart that is field 51-1 and field 55 called hg19chromEnd that is field 51+1

################# USED PYTHON SCRIPT BELOW TO CREATE FIXED FILE #############

# Rewrite the Illumina EPIC v2 manifest CSV as a tab-separated file with
# "chr"-prefixed chromosomes, +/-/. strands and five appended columns
# (hg38chromStart, hg38chromEnd, hg19chromStart, hg19chromEnd, score).

EPIC2_CSV = '/hive/data/outside/illuminaEPIC2/EPIC-8v2-0_A2.csv'
FIXED_TSV = '/hive/data/outside/illuminaEPIC2/fixedFile.tsv'

# Manifest strand codes (0-based column 10) mapped to BED strand symbols.
STRAND_SYMBOLS = {"F": "+", "R": "-", "0": "."}

# Column names appended to the header row, in output order.
NEW_COLUMNS = ["hg38chromStart", "hg38chromEnd",
               "hg19chromStart", "hg19chromEnd", "score"]


def fixRow(fields):
    """Return the output columns for one probe row, or None to skip it.

    fields is the list of manifest columns for a single probe (0-based):
    10 = strand code, 15 = hg38 chromosome, 16 = hg38 position,
    49 = hg19 chromosome, 50 = hg19 position.

    Rows whose hg38 chromosome is "0" are unmapped and yield None. The
    check happens before the position is parsed so that an unmapped row
    with a blank position is skipped instead of raising ValueError.

    The result holds columns 0-50 (strand and chromosomes rewritten)
    followed by hg38 start/end, hg19 start/end and a constant score "0".
    Start is position - 1 and end is position + 1.
    """
    if fields[15] == '0':
        return None

    hg38Pos = int(fields[16])

    # Unknown strand codes fall back to "." rather than reusing the strand
    # of the previous row (or failing on the first row).
    strand = STRAND_SYMBOLS.get(fields[10], ".")

    if fields[50] != '':
        hg19Pos = int(fields[50])
        hg19chrom = 'chr' + fields[49]
        hg19chromStart = str(hg19Pos - 1)
        hg19chromEnd = str(hg19Pos + 1)
    else:
        # No hg19 position: leave the hg19 columns as they are (empty
        # start/end, chromosome without a "chr" prefix).
        hg19chrom = fields[49]
        hg19chromStart = ''
        hg19chromEnd = ''

    return (fields[0:10] + [strand] + fields[11:15]
            + ['chr' + fields[15]] + fields[16:49]
            + [hg19chrom, fields[50],
               str(hg38Pos - 1), str(hg38Pos + 1),
               hg19chromStart, hg19chromEnd, '0'])


def fixManifest(inPath=EPIC2_CSV, outPath=FIXED_TSV):
    """Convert the manifest CSV at inPath into the fixed TSV at outPath.

    Only the header line (starting with "IlmnID") and probe lines starting
    with "cg" are written; every other line is ignored. Both files are
    closed even if a row fails to parse.
    """
    with open(inPath, 'r') as EPIC2File, open(outPath, 'w') as fixedFile:
        for line in EPIC2File:
            if line.startswith("IlmnID"):
                header = line.rstrip().replace(',', '\t')
                fixedFile.write(header + "\t" + "\t".join(NEW_COLUMNS) + "\n")
            elif line.startswith("cg"):
                row = fixRow(line.rstrip().replace(',', '\t').split("\t"))
                if row is not None:
                    fixedFile.write("\t".join(row) + "\n")


if __name__ == "__main__":
    fixManifest()

################ SCRIPT DONE ##############

#hg38
# Convert the fixed TSV to BED using 0-based column indexes:
# chrom=15, start=51 (hg38chromStart), end=52 (hg38chromEnd), strand=10,
# name=0 (IlmnID), score=55. Also writes a matching .as file.
~max/usr/scripts/tabToBed --bedFields=chrom=15,start=51,end=52,strand=10,name=0,score=55 /hive/data/outside/illuminaEPIC2/fixedFile.tsv /hive/data/outside/illuminaEPIC2/outputFile.bed /hive/data/outside/illuminaEPIC2/outputFile.as

#Had to remove the following bad IDs that were bigger than the chromosome.
#This was recorded as a manual edit; done here as an explicit, repeatable
#filter (-F fixed strings, -w whole-word matches only).
grep -v -F -w \
    -e cg07146279_TC21 \
    -e cg11930929_TC21 \
    -e cg18295427_TC21 \
    -e cg01734724_TC21 \
    outputFile.bed > outputFile.filtered.bed
mv outputFile.filtered.bed outputFile.bed

bedToBigBed -sort -as=backup.as -type=bed12+ -tab outputFile.bed http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes illuminaEpicMethylationV2.bb

ln -s /hive/data/outside/illuminaEPIC2/illuminaEpicMethylationV2.bb /gbdb/hg38/bbi/illumina/illuminaEPICv2.bb

#hg19

# Collect the IDs of probes with no hg19 mapping: field 54 (1-based) is
# hg19chromStart, which is empty when the manifest has no hg19 position.
awk -F'\t' 'NF >= 54 && $54 == ""' fixedFile.tsv | cut -f1 > toSkipInHg19.txt

wc -l toSkipInHg19.txt
# 301 toSkipInHg19.txt

# Drop those probes. -F treats the IDs as fixed strings and -w requires
# whole-word matches, so a skipped ID cannot also remove a probe whose ID
# merely starts with it.
grep -v -F -w -f toSkipInHg19.txt fixedFile.tsv > hg19.fixedFile.tsv

# Convert to BED using 0-based column indexes: chrom=49, start=53
# (hg19chromStart), end=54 (hg19chromEnd), strand=10, name=0, score=55.
~max/usr/scripts/tabToBed --bedFields=chrom=49,start=53,end=54,strand=10,name=0,score=55 /hive/data/outside/illuminaEPIC2/hg19.fixedFile.tsv /hive/data/outside/illuminaEPIC2/hg19File.bed /hive/data/outside/illuminaEPIC2/outputFile.as

#Manual tweaks to .as file resulting in hg19IlluminaEPIC.as

#Delete the items on "chr0" - that means they are unmapped (2 items when
#this was first done by hand)
awk -F'\t' '$1 != "chr0"' /hive/data/outside/illuminaEPIC2/hg19File.bed > /hive/data/outside/illuminaEPIC2/hg19File.noChr0.bed
mv /hive/data/outside/illuminaEPIC2/hg19File.noChr0.bed /hive/data/outside/illuminaEPIC2/hg19File.bed

bedToBigBed -sort -as=hg19IlluminaEPIC.as -type=bed12+ -tab /hive/data/outside/illuminaEPIC2/hg19File.bed http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes illuminaEpicMethylationV2Hg19.bb

ln -s /hive/data/outside/illuminaEPIC2/illuminaEpicMethylationV2Hg19.bb /gbdb/hg19/bbi/illumina/illuminaEPICv2.bb

######################################

#########################################################################

# Bionano, max, July 4 2025
# received files from apang@bionano.com
# converted to bigBed with standard bedToBigBed
