#########################################################################
# MaveDB (04-14-2025) Jonathan

# MaveDB data for this track was taken from mappings they created and linked to
# from https://github.com/VariantEffect/dcd_mapping2.  More specifically, from
# https://mavedb-mapping.s3.us-east-2.amazonaws.com/mappings.tar.gz.

mkdir -p /hive/data/outside/mavedb/dcd_mappings
cd /hive/data/outside/mavedb/dcd_mappings
wget https://mavedb-mapping.s3.us-east-2.amazonaws.com/mappings.tar.gz
tar xzvf mappings.tar.gz

mkdir -p /hive/data/genomes/hg38/bed/mavedb/2025_04_12
cd /hive/data/genomes/hg38/bed/mavedb/2025_04_12

# We skip 00000053-a-1.json because it's a very large dataset that also only
# contains haplotype data, which we don't have a display for.  57-d-1 is excluded
# because it's haplotype data, but with a few records that can be misinterpreted
# as single variants.
time for fil in `grep -l hgvs.p /hive/data/outside/mavedb/dcd_mappings/mappings/*.json`;
    do if [[ "$fil" == *"00000053-a-1.json" || "$fil" == *"00000057-d-1.json" ]]; then continue; fi
    echo $fil
    perl ~/kent/src/oneShot/parseMave/parseMave.pl < $fil >> combined.bed;
    done

bedSort combined.bed combined.bed
cp $HOME/kent/src/hg/lib/heatmap.as .
# edit heatmap.as to add these fields to the end
#    lstring maveLink; "Link to MaveDB"
#    lstring abstractText; "Abstract"
#    lstring methodText; "Methods"

bedToBigBed -type=bed12+ -tab -as=heatmap.as combined.bed /hive/data/genomes/hg38/chrom.sizes all_mave.bb

mkdir -p /gbdb/hg38/maveDB
cd /gbdb/hg38/maveDB
ln -s /hive/data/genomes/hg38/bed/mavedb/2025_04_12/all_mave.bb

# Now for the sequence alignments.  parseMave.pl generated a corresponding sequence file: seqFile.fa
# Split the contents of seqFile.fa into two files: dna_seqs.fa and aa_seqs.fa (this was done manually).
# The plan is to align the DNA sequences both directly to the genome and projected through knownGene,
# while the AA sequences are only going to be projected through knownGene (those seemed consistently
# a bit better than direct alignments in testing).

mkdir alignments
cd alignments

# Align DNA sequences to genome
blat -noHead -stepSize=5 -repMatch=2253 -minScore=20 -minIdentity=0 /hive/data/genomes/hg38/hg38.2bit ../dna_seqs.fa dna_align.psl

# Align DNA and AA to knownGene, then project onto the genome
blat -noHead -q=prot -t=dnax -stepSize=5 -repMatch=2253 -minScore=20 -minIdentity=0 /gbdb/hg38/targetDb/hg38KgSeqV48.2bit ../aa_seqs.fa aa_kg_align.psl
blat -noHead -stepSize=5 -repMatch=2253 -minScore=20 -minIdentity=0 /gbdb/hg38/targetDb/hg38KgSeqV48.2bit ../dna_seqs.fa dna_kg_align.psl

# Map to genome; note that the target names from hg38KgSeqV48.2bit have gene symbols attached; need to remove those
# to link with the gene names mapped to the genome
pslMap <(cat aa_kg_align.psl | perl -pe 's/(ENST\S+)__\S+/$1/;' ) /hive/data/genomes/hg38/bed/gencodeV48/build/ucscGenes.psl aa_kg_genome_align.psl
pslMap <(cat dna_kg_align.psl | perl -pe 's/(ENST\S+)__\S+/$1/;' ) /hive/data/genomes/hg38/bed/gencodeV48/build/ucscGenes.psl dna_kg_genome_align.psl.tmp

# Fixing the projected DNA alignment scores so they can be more directly compared with the direct alignments
pslRecalcMatch dna_kg_genome_align.psl.tmp /hive/data/genomes/hg38/hg38.2bit ../dna_seqs.fa dna_kg_genome_align.psl

# Short script for filtering alignments to look for overlap with the loci of corresponding heatmaps

printf '$pslName = shift;

open($exppos, "<", "../combined.bed");
while ($line = <$exppos>)
{
    chomp $line;
    @f = split /\s+/, $line;
    ($chr, $start, $end, $name) = ($f[0], $f[1], $f[2], $f[3]);
    #$chr = lc($chr);
    $posMap{$name} = "$chr $start $end";
}
close($exppos);

open($alignF, "<", $pslName);
while ($line = <$alignF>)
{
    chomp $line;
    @f = split /\t/, $line;
    ($match, $name, $chr, $start, $end) = ($f[0], $f[9], $f[13], $f[15], $f[16]);
    $name =~ m/^([^_]+)_/ or die "Unable to get name from $name";
    $expId = $1;
    $heatmapPos = $posMap{$expId};
    $heatmapPos =~ m/^(\S+) (\S+) (\S+)$/ or die "Unable to parse position from $heatmapPos for $expId";
    ($hChr, $hStart, $hEnd) = ($1, $2, $3);
    if ($chr eq $hChr)
    {
        if (($hStart < $end) && ($hEnd > $start))
        {
            print "$line\n";
        }
    }
}
close($alignF);
' > filter.pl

# Filter for alignments overlapping with the loci of the heatmaps
perl filter.pl dna_align.psl | sort -k1,1nr -k10,10 | pslUniq stdin dna_align_uniq.psl
perl filter.pl dna_kg_genome_align.psl | sort -k1,1nr -k10,10 | pslUniq stdin dna_kg_genome_align_uniq.psl
perl filter.pl aa_kg_genome_align.psl | sort -k1,1nr -k10,10 | pslUniq stdin mavedb_aa.psl

# favor high match counts, then after that high substitution counts, then finally favor knownGene-mapped
# alignments (for more consistent exon boundaries)
sort -k2,2nr -k3,3nr -k1,1 <(sed -e 's/^/a\t/' dna_kg_genome_align_uniq.psl) <(sed -e 's/^/b\t/' dna_align_uniq.psl) |
    cut -f 2- | pslUniq stdin mavedb_dna.psl

# Two squences weren't mapped by this process: 53-a-2 and 02-a-2.  Try those again with dnax alignments.
# The knownGene projected versions looked better again, so we went with those.

grep -A 1 -P '(53|02)-a-2' ../dna_seqs.fa > stragglers.fa
blat -q=dnax -t=dnax -noHead -stepSize=5 -repMatch=2253 -minScore=20 -minIdentity=0 /gbdb/hg38/targetDb/hg38KgSeqV48.2bit ../stragglers.fa stragglers_kg_align.psl
pslMap <(cat stragglers_kg_align.psl | perl -pe 's/(ENST\S+)__\S+/$1/;' ) /hive/data/genomes/hg38/bed/gencodeV48/build/ucscGenes.psl stragglers_kg_genome_align.psl
perl filter.pl stragglers_kg_genome_align.psl | sort -k1,1nr -k10,10 | pslUniq stdin stragglers_kg_genome_align_uniq.psl
cat stragglers_kg_genome_align_uniq.psl >> mavedb_dna.psl

# Now build the bigBeds.  We can't include sequence for the projected AA alignments because pslMap changes the
# sequence length (from AA to cDNA), and that breaks the sequence linkage system for pslToBigPsl.
pslToBigPsl -fa=../seqFile.fa mavedb_dna.psl stdout | sort -k1,1 -k2,2n > mavedb_dna.bigPslInput
pslToBigPsl mavedb_aa.psl stdout | sort -k1,1 -k2,2n > mavedb_aa.bigPslInput
bedToBigBed -as=../bigPsl.as -type=bed12+13 -tab mavedb_dna.bigPslInput /hive/data/genomes/hg38/chrom.sizes mavedb_dna.bb
bedToBigBed -as=../bigPsl.as -type=bed12+13 -tab mavedb_aa.bigPslInput /hive/data/genomes/hg38/chrom.sizes mavedb_aa.bb

#########################################################################

$ pslRecalcMatch dna_alignments.psl /gbdb/hg38/hg38.2bit ../../seqFile.fa recalc_dna_alignments.psl
[jcasper@hgwdev altPslChoice]$ sort -k10,10 recalc_dna_alignments.psl | pslCDnaFilter -globalNearBest=0 stdin best_of_dna_align.psl
                        seqs    aligns
             total:     112     2891
        weird over:     24      395
   drop globalBest:     104     1758
        kept weird:     24      253
              kept:     112     1133
[jcasper@hgwdev altPslChoice]$ pslToBigPsl -fa=../../seqFile.fa best_of_dna_align.psl stdout | sort -k1,1 -k2,2n > alt_mavedb_dna.bigPslInput
[jcasper@hgwdev altPslChoice]$ bedToBigBed -as=../../bigPsl.as -type=bed12+13 -tab alt_mavedb_dna.bigPslInput /hive/data/genomes/hg38/chrom.sizes alt_mavedb_dna.bb
