# Convert the current UniProt download into tab-separated files under
# uniprotTab/, restricted to 9606 (presumably the NCBI taxon ID for human).
~/kent/src/utils/uniprotToTab \
  /hive/data/outside/uniProt/current/ \
  9606 \
  uniprotTab/

# ABO = SwissProt P16442 (hg19) or TrEMBL A0A087X0C2 (exact match against GENCODE on hg38)

# Bring the bed12UniProt autoSql (.as) files and the mapUniprot_doBlast helper
# from the kent source tree into the current working directory.
cp \
  ~/kent/src/hg/utils/uniprotMutations/bed12UniProt*.as \
  ~/kent/src/hg/utils/uniprotMutations/mapUniprot_doBlast \
  ./

# First attempt at building a lift file uniprot -> hg38 from the tab files
# in uniprotTab/.
~/kent/src/hg/utils/uniprotMutations/makeUniProtToHg.sh \
  uniprotTab/ \
  hg38

# GENCODE fix for hg38:
# GENCODE currently maps some genes to TrEMBL sequences
# (confirmed by email from Branwen).

# Manually fix the mapping uniprot -> knownGene -> genome by using the
# HGNC symbols.
#
# 1) Mapping uniprot -> symbol, taken from the uniprot tab file: keep columns
#    2 and 18, drop rows whose second kept column is empty, then pass the
#    result through tabExpand on column 1 (presumably this splits a
#    multi-valued symbol field into one row per symbol -- TODO confirm).
cut -f2,18 uniprotTab/uniprot.9606.tab \
  | awk '($2!="")' \
  | tabExpand 1 /dev/stdin \
  > upToSym.tab

# 2) Mapping known gene -> symbol, taken from the hg38 kgXref table
#    (-N: no column header, -B: tab-separated batch output).
hgsql hg38 -NB -e 'select kgId, geneSymbol from kgXref' > kgToSym.tab

# 3) Filter the alignments to keep only the best ones from the same gene.
#    'find -exec cat {} +' is used instead of 'find | xargs cat' so that paths
#    containing whitespace, quotes or backslashes are passed to cat intact.
#    The final sort orders by PSL columns 14, 16 and 17 (target name, then
#    target start and end numerically).
find upMap/work/aligns -name '*.psl' -exec cat {} + \
  | pslSameGene upToSym.tab kgToSym.tab /dev/stdin \
  | pslCDnaFilter stdin -globalNearBest=0  -bestOverlap -filterWeirdOverlapped stdout \
  | sort -k 14,14 -k 16,16n -k 17,17n \
  > uniProtTohg38.psl

# 4) Use this to create a mapping uniprot -> genome.
# NOTE(review): the last argument here is the same uniProtTohg38.psl that
# step 3 just wrote. If pslMap treats its last argument as the output file
# (usage: pslMap inPsl mapFile outPsl), step 3's result is overwritten without
# ever being read -- confirm whether step 3 was meant to write
# upMap/work/uniProtVsUcscMRna.psl instead.
pslMap upMap/work/uniProtVsUcscMRna.psl upMap/work/ucscMRna.psl uniProtTohg38.psl

# Run uniprotLift on the uniprotTab/ files for 9606, passing the hg38
# chromosome sizes and the uniprot -> hg38 PSL mapping built above.
~/kent/src/utils/uniprotLift \
  uniprotTab/ \
  9606 \
  /hive/data/genomes/hg38/chrom.sizes \
  uniProtTohg38.psl

