# T CELLS
# Massage the input tables: convert the compact and full T cell exports
# from CSV to tab-separated files.
csvToTab < tcell_compact.csv > tcell_compact.tab
csvToTab < tcell_full.csv > tcell_full.tab

# Keep every line that mentions "PubMed ID" (presumably the header row, which
# the column-name based tools below need), "ebola" or "filovirus",
# case-insensitively. grep -E replaces the deprecated egrep.
grep -E -i "PubMed ID|ebola|filovirus" tcell_compact.tab > tcell_ebola.tab
grep -E -i "PubMed ID|ebola|filovirus" tcell_full.tab > tcell_ebola_full.tab

# Collapse the compact table on the epitope sequence column.
# NOTE(review): tabCollapse presumably merges rows that share the same value
# in the named column - confirm against the tool.
tabCollapse tcell_ebola.tab Epitope_Linear_Sequence > tcell_ebola_coll.tab

# From the full table take the antigen sequence and MHC allele columns, drop
# rows where either one is empty, and collapse on the sequence column.
tabCut -n "Antigen Object Primary Molecule Sequence,MHC Allele Name" tcell_ebola_full.tab \
  | tawk '($1!="" && $2!="")' \
  | tabCollapse /dev/stdin "Antigen_Object_Primary_Molecule_Sequence" \
  > tcellAlleles_coll.tab

# map sequences
# Write a FASTA file from column 10 of the collapsed table; each record uses
# the column value both as its name and as its sequence. The file is read
# with cut directly instead of going through the pager "less".
cut -f10 tcell_ebola_coll.tab | tawk '{print ">"$1"\n"$1""}' > tcell.fa

# Align the protein queries against the translated eboVir3 genome with
# short-peptide settings (tile size 4, step 1, min score 7), keep alignments
# with at least 85% identity and 85% coverage, and convert them to BED.
blat -maxIntron=6 -t=dnax -q=prot \
    /hive/data/genomes/eboVir3/eboVir3.2bit tcell.fa stdout \
    -noHead -minScore=7 -tileSize=4 -stepSize=1 \
  | pslCDnaFilter stdin stdout -minId=0.85 -minCover=0.85 \
  | pslToBed stdin tcell.bed

# Pull the alleles out of the big file, add everything to the bed file and
# convert. bedAppend writes tcellPlus.bed together with the tcell.as schema.
# NOTE(review): this joins against tcell_ebola.tab, while tcell.fa was built
# from tcell_ebola_coll.tab and the B cell step joins against its collapsed
# table - confirm that the uncollapsed table is intended here.
bedAppend tcell.bed tcell_ebola.tab 9 tcellPlus.bed tcell.as \
    -t tcellAlleles_coll.tab

# Run bedFixBlockOverlaps over the result, then keep only lines that contain
# "Positive".
bedFixBlockOverlaps stdin < tcellPlus.bed \
  | grep Positive \
  > tcellPlusFix.bed

# Sort into MHC I and MHC II based on two rules:
# - if the allele was annotated (field 92 is not empty), use the allele to
#   derive the MHC class
# - if the allele wasn't annotated, use the length of field 4:
#   <=12 is class I, >12 is class II
# The two filters are independent: a row whose allele matches neither pattern
# ends up in neither file, and a row matching both ends up in both.
tawk '$92 ~ /[Cc]lass I,|HLA-[ABCEFG]|H-2-[bdKDL]/ || ($92=="" && length($4)<=12)' \
    < tcellPlusFix.bed > tcellPlusI.bed
tawk '$92 ~ /[Cc]lass II,|HLA-D|-IA|-IE/ || ($92=="" && length($4)>12)' \
    < tcellPlusFix.bed > tcellPlusII.bed

# Build one bigBed per class using the tcell.as schema.
bedToBigBed tcellPlusI.bed /hive/data/genomes/eboVir3/chrom.sizes tcellPlusI.bb \
    -tab -type=bed12+ -as=tcell.as
bedToBigBed tcellPlusII.bed /hive/data/genomes/eboVir3/chrom.sizes tcellPlusII.bb \
    -tab -type=bed12+ -as=tcell.as

# Link the bigBeds into /gbdb and register them as tables in eboVir3.
# The working directory is quoted so paths with whitespace stay intact.
ln -s "$(pwd)/tcellPlusI.bb" /gbdb/eboVir3/bbi/iedb/tcellPlusI.bb
ln -s "$(pwd)/tcellPlusII.bb" /gbdb/eboVir3/bbi/iedb/tcellPlusII.bb
hgBbiDbLink eboVir3 iedbTcellII /gbdb/eboVir3/bbi/iedb/tcellPlusII.bb
hgBbiDbLink eboVir3 iedbTcellI /gbdb/eboVir3/bbi/iedb/tcellPlusI.bb

# B CELLS
# Convert the compact and full B cell exports from CSV to tab-separated files.
csvToTab < bcell_compact.csv > bcell_compact.tab
csvToTab < bcell_full.csv > bcell_full.tab

# Keep lines that mention "PubMed ID" / "PubMed" (presumably the header row)
# or "ebola", case-insensitively. grep -E replaces the deprecated egrep.
grep -E -i "PubMed ID|ebola" bcell_compact.tab > bcell_ebola.tab
grep -E -i "PubMed|ebola" bcell_full.tab > bcell_full_ebola.tab

# Collapse on the epitope sequence column, then write a FASTA file from
# column 10 in which each record's name is the sequence itself.
tabCollapse bcell_ebola.tab Epitope_Linear_Sequence > bcell_ebola_coll.tab
cut -f10 bcell_ebola_coll.tab | tawk '{print ">"$1"\n"$1""}' > bcell.fa

# BLAST finds 7% more matches than BLAT, so the BLAT variant is kept below
# only for reference.
#blat -t=dnax -q=prot /hive/data/genomes/eboVir3/eboVir3.2bit bcell.fa stdout -noHead -minScore=8 -tileSize=6 -stepSize=1 -repMatch=1000000000 | sort -k1,1n -r | pslCDnaFilter stdin stdout -minId=0.85 -minCover=0.85 | pslToBed stdin stdout | bedSort stdin bcell.bed
#bedAppend bcell.bed bcell_ebola_coll.tab 9 bcellPlus.bed bcell.as

# Build a nucleotide BLAST database from the genome.
twoBitToFa ../../eboVir3.2bit db.fa
formatdb -i db.fa -p F

# Search the peptides against the translated genome, keep alignments with at
# least 85% identity and 85% coverage, and write a sorted BED file.
# This runs in the foreground on purpose: the next step reads tblastn.bed, so
# it must be complete before the script continues.
blastall -e 10000 -p tblastn -i bcell.fa -d db.fa -W 2 2> err.log \
  | blastToPsl stdin stdout \
  | pslCDnaFilter -minId=0.85 stdin stdout -minCover=0.85 \
  | pslToBed stdin stdout \
  | bedSort stdin tblastn.bed

# Join the additional fields from the collapsed B cell table onto the
# tblastn hits; bedAppend also creates the bcell.as schema file.
bedAppend tblastn.bed bcell_ebola_coll.tab 9 \
    bcellPlus.bed bcell.as

# Keep only lines that contain "Positive", then run bedFixBlockOverlaps.
# Here we remove the majority of the data! Keep it in a different track?
grep Positive bcellPlus.bed \
  | bedFixBlockOverlaps stdin \
  > bcellPlusPos.bed

# Replace names with ab-Name/species.
# The species translation table is copied from the eboVir2 build.
cp ../../eboVir2/bed/iedb/specTrans.tab ./

# Build pepNames.tab, a two-column lookup table:
#   1. take columns 26, 75, 290 and 291 of the full B cell table
#   2. run tabReplace with specTrans.tab (presumably translating species
#      names - confirm against the tool)
#   3. emit field 1 plus a label: field3/field2 when field 3 is set,
#      otherwise the first six characters of field 1, then "/" and field 2
#   4. strip everything from the first "<" to the last "sub>" on each line
#   5. drop rows where either output column is empty
#   6. remove the space after each comma
cut -f26,75,290,291 bcell_full_ebola.tab \
  | tabReplace specTrans.tab 1 stdin \
  | tawk '{if ($3!="") {print $1, $3"/"$2} else {print $1, substr($1,1,6)"/"$2;}}' \
  | sed -e 's/<.*sub>//g' \
  | tawk '($2!="")' \
  | tawk '($1!="")' \
  | sed -e 's/, /,/g' \
  > pepNames.tab

# Apply the lookup table to the positive B cell BED file.
tabReplace pepNames.tab 3 bcellPlusPos.bed > bcellPlusPosName.bed

# Build the bigBed for the positive B cell epitopes, link it into /gbdb and
# register it as table iedbBcell in eboVir3.
bedToBigBed bcellPlusPosName.bed /hive/data/genomes/eboVir3/chrom.sizes bcellPlus.bb \
    -type=bed12+ -as=bcell.as -tab
ln -s "$(pwd)/bcellPlus.bb" /gbdb/eboVir3/bbi/iedb/bcellPlus.bb
# The table must point at the eboVir3 symlink created just above, not at the
# eboVir2 tree.
hgBbiDbLink eboVir3 iedbBcell /gbdb/eboVir3/bbi/iedb/bcellPlus.bb

# Same for negative results - probably not worth it, all from a single screen
# in PLoS ONE.
# Keep lines that contain "Negative", run bedFixBlockOverlaps, apply the
# pepNames.tab lookup, then build, link and register the bigBed as table
# iedbBcellNeg in eboVir3.
grep Negative bcellPlus.bed | bedFixBlockOverlaps stdin > bcellPlusNeg.bed
tabReplace pepNames.tab 3 bcellPlusNeg.bed > bcellPlusNegName.bed
bedToBigBed bcellPlusNegName.bed /hive/data/genomes/eboVir3/chrom.sizes bcellPlusNeg.bb \
    -type=bed12+ -as=bcell.as -tab
# The working directory is quoted so paths with whitespace stay intact.
ln -s "$(pwd)/bcellPlusNeg.bb" /gbdb/eboVir3/bbi/iedb/bcellPlusNeg.bb
hgBbiDbLink eboVir3 iedbBcellNeg /gbdb/eboVir3/bbi/iedb/bcellPlusNeg.bb
