
###########################################################################
### obtain the Ensembl sequence to create idKeys
###########################################################################

mkdir -p /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent
cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent

wget --timestamping \
http://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz

# alternate sequences are too large to place everything into one 2bit file
mkdir split
faSplit byname Homo_sapiens.GRCh38.dna.toplevel.fa.gz split/

# make 2bit files for each sequence

mkdir -p twoBit

for F in split/*
do
  echo $F | sed -e 's/.fa$//; s#split/##;' | while read C
  do
     printf "faToTwoBit ${F} twoBit/${C}.2bit\n"
  done
done > run.2bit.list

# run 32 of them at a time
parallel -j 32 < run.2bit.list

# create chrom.sizes from those 2bit files:
for F in twoBit/*.2bit
do
  twoBitInfo $F stdout
done | sort -k2,2nr > ensembl.GRCh38.p13.chrom.sizes

# create twoBitDup idKeys for each sequence
mkdir -p keys

for F in twoBit/*
do
  echo $F | sed -e 's/.2bit$//; s#twoBit/##;' | while read C
  do
     printf "twoBitDup ${F} -keyList=keys/${C}.txt\n"
  done
done > run.twoBitDup.list

# run 32 at a time:
parallel -j 32 < run.twoBitDup.list

# put all those keys together into one idKeys.txt file:

cat keys/* | sort > ensembl.GRCh38.p13.idKeys.txt

###########################################################################
# ready to match idKeys
###########################################################################

# chrY doesn't match since Ensembl has masked the PAR on Y
# but it is the same length sequence and does match except for the PAR
# NOT doing the 'grep -v 270752' to eliminate the contamination sequence

cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent

( printf "chrY\tY\n"
join -t$'\t' ../../../idKeys.p14/hg38.idKeys.txt \
   ensembl.GRCh38.p13.idKeys.txt | cut -f2-) | sort > perfect.ucsc.ensembl.tsv

# the ones that don't match are because of the construction of the alt
# sequence at Ensembl:

join -v2 -t$'\t' ../../../idKeys.p14/hg38.idKeys.txt \
   ensembl.GRCh38.p13.idKeys.txt | grep -v -w "Y" | cut -f2- \
     | sort > noKeyMatch.ucsc.ensembl.tsv

###########################################################################
# Extract the alt sequences from the artifical Ensembl chromosomes
###########################################################################

cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent

# this script will remove the leading and trailing N sequences from
# the fasta, and it will measure the leading N length to get the lift
# coordinate

printf '#!/usr/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: ./trimEnds.pl file.fa > file.fa 2> lift.file\n";
  exit 255;
}

my $skipBeginning = 1;
my @endTrim;
my $liftOffset = 0;
my $inFile = shift;
my $seqName = $inFile;
$seqName =~ s/.fa//;
$seqName =~ s#split/##;
open (FH, "<$inFile") or die "can not read $inFile";
while (my $line = <FH>) {
  chomp $line;
  if ($line =~ m/^>/) {
    printf "%s\n", $line;
  } elsif ($skipBeginning && ($line =~ m/^NN*N$/)) {
    $liftOffset += length($line);
  } elsif ($skipBeginning) {
       $skipBeginning = 0;
       my $lineLength = length($line);
       $line =~ s/^N+//;
       $liftOffset += $lineLength - length($line);
       printf STDERR "%s\t%d\n", $seqName, $liftOffset;
       push @endTrim, $line;
  } else {
       push @endTrim, $line;
  }
}
close (FH);

my $endTrimmed = 0;
my $arrayLength = scalar(@endTrim);
for (my $i = $arrayLength - 1; $endTrimmed < 1 && $i > 0; --$i) {
  if ($endTrim[$i] =~ m/^$/) {
     delete $endTrim[$i];
  } elsif ($endTrim[$i] =~ m/^N*$/) {
     delete $endTrim[$i];
  } else {
     $endTrimmed = 1;
     $endTrim[$i] =~ s/N+$//;
  }
}

$arrayLength = scalar(@endTrim);
for (my $i = 0; $i < $arrayLength; ++$i) {
  printf "%s\n", $endTrim[$i];
}
' > trimEnds.pl

chmod +x trimEnds.pl

mkdir -p noNs

cut -f2 ../../noKeyMatch.ucsc.ensembl.tsv | grep -v -w Y | while read C
do
  if [ -s "noNs/${C}.fa" ]; then
    printf "# done ${C}\n" 1>&2
  else
    printf "./runTrim ${C}\n"
  fi
done > trim.run.list

mkdir noNs lift

printf '#!/bin/bash
export C=$1
./trimEnds.pl split/${C}.fa > noNs/${C}.fa 2> lift/${C}.tsv
' > runTrim

chmod +x runTrim

# run 32 at a time
parallel -j32 < trim.run.list

sort lift/*.tsv > ensemblLift.tsv

### and make up a 2bit file with all these trimmed sequences
faToTwoBit noNs/*.fa noNs.2bit
### and idKeys via twoBitDup
twoBitDup -keyList=stdout noNs.2bit | sort > noNs.idKeys.txt

###########################################################################
# continue to match idKeys for these alternate sequences
###########################################################################

cd /hive/data/genomes/hg38/bed/chromAlias/update.2022-11-09/ensemblRecent

join -t$'\t' ../../../idKeys.p14/hg38.idKeys.txt  noNs.idKeys.txt  \
  | cut -f2- | sort > alts.OK.ucsc.ensembl.tsv

# the outstanding mis-matches are due to Ensembl reversed sequence

comm -23 <(cut -f2 ../../../idKeys.p14/hg38.idKeys.txt | sort) \
   <(cut -f1 alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv | sort) \
      > to.be.determined.ucsc.ensembl.txt

# reverse those UCSC sequences:

mkdir -p ucscRc

for C in `cat to.be.determined.ucsc.ensembl.txt`
do
   printf "twoBitToFa /hive/data/genomes/hg38/hg38.2bit:${C} stdout | faRc stdin ucscRc/${C}_r.fa\n"
done > faRc.run.list

# run 32 at a time
parallel -j 32 < faRc.run.list

faToTwoBit ucscRc/*_r.fa ./ucscRc.2bit
twoBitDup -keyList=stdout ucscRc.2bit | sort > ucscRc.idKeys.txt

join -t$'\t' ucscRc.idKeys.txt ./update.2021-09-20/ensemblRecent/noNs.idKeys \
  | cut -f2- | sed -e 's/RC_//;' > reversedAlts.ucsc.ensembl.tsv

### and finally, all together now:

sort alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv \
   reversedAlts.ucsc.ensembl.tsv > ucscToEnsembl.hg38.p14.tsv

# verify have matches to each Ensembl sequence:

wc -l ensembl.GRCh38.p13.chrom.sizes ucscToEnsembl.hg38.p14.tsv
#   639 ensembl.GRCh38.p13.chrom.sizes
#   639 ucscToEnsembl.hg38.p14.tsv

# save previous ucscToEnsembl table:

hgsql -N -e 'select * from ucscToEnsembl;' hg38 \
   > hg38.ucscToEnsembl.previous.tsv

# this new one is only different since the contamination sequence
# has not been removed:

diff ucscToEnsembl.hg38.p14.tsv hg38.ucscToEnsembl.previous.tsv
619d618
< chrUn_KI270752v1      KI270752.1


# load the ucscToEnsembl table:

printf '
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(23))
);
' > ucscToEnsembl.sql

hgsql hg38 -e 'drop table ucscToEnsembl;'
hgsql hg38 < ucscToEnsembl.sql
hgsql hg38 \
  -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.hg38.p14.tsv" INTO TABLE ucscToEnsembl'

# save previous ensemblLift table:

hgsql -N -e 'select * from ensemblLift;' hg38 \
  | sort > hg38.ensemblLift.previous.tsv

# appear to have identical results:

   sum  hg38.ensemblLift.previous.tsv ensemblLift.tsv
#    59375    13 hg38.ensemblLift.previous.tsv
#    59375    13 ensemblLift.tsv

# 'offset' is now a reserved MariaDb word, need to back quote it so it will work

printf '
CREATE TABLE ensemblLift (
    chrom varchar(255) not null,      # Ensembl chromosome name
    `offset` int unsigned,     # offset to add to UCSC position
              #Indices
    PRIMARY KEY(chrom(38))
);
' > ensemblLift.sql

hgsql hg38 -e 'drop table ensemblLift;'
hgsql hg38 < ensemblLift.sql
hgsql hg38 \
  -e 'LOAD DATA LOCAL INFILE "ensemblLift.tsv" INTO TABLE ensemblLift'


###########################################################################
### obtain the Ensembl sequence to create idKeys
###########################################################################

mkdir -p /hive/data/genomes/hg38/bed/chromAlias/update.2021-09-20/ensemblRecent
cd /hive/data/genomes/hg38/bed/chromAlias/update.2021-09-20/ensemblRecent

wget --timestamping \
http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz

# alternate sequences are too large to place everything into one 2bit file

faSplit byname Homo_sapiens.GRCh38.dna.toplevel.fa.gz split/

# make 2bit files for each sequence

mkdir -p twoBit

for F in split/*
do
  echo $F | sed -e 's/.fa$//; s#split/##;' | while read C
  do
     printf "faToTwoBit ${F} twoBit/${C}.2bit\n"
  done
done

# create twoBitDup idKeys for each sequence
mkdir -p keys

for F in twoBit/*
do
  echo $F | sed -e 's/.2bit$//; s#twoBit/##;' | while read C
  do
     twoBitDup ${F} -keyList=keys/${C}.txt
  done
done

# put all those keys together into one idKeys.txt file:

cat keys/* | sort > ensembl.GRCh38.p13.idKeys.txt

###########################################################################
# ready to match idKeys
###########################################################################

# chrY doesn't match since Ensembl has masked the PAR on Y
# but it is the same length sequence and does match except for the PAR
# the 'grep -v 270752' eliminates the contamination sequence

cd /hive/data/genomes/hg38/bed/chromAlias
( printf "chrY\tY\n"
join -t$'\t' ../idKeys.p13/hg38.idKeys.txt \
  ./update.2021-09-20/ensemblRecent/ensembl.GRCh38.p13.idKeys.txt \
    | cut -f2- ) | grep -v 270752 | sort > perfect.ucsc.ensembl.tsv

# the ones that don't match are because of the construction of the alt
# sequence at Ensembl:

join -v2 -t$'\t' ../idKeys.p13/hg38.idKeys.txt \
  ./update.2021-09-20/ensemblRecent/ensembl.GRCh38.p13.idKeys.txt \
    | grep -v -w "Y" | cut -f2- | sort > noKeyMatch.ucsc.ensembl.tsv

###########################################################################
# Extract the alt sequences from the artifical Ensembl chromosomes
###########################################################################

cd /hive/data/genomes/hg38/bed/chromAlias/update.2021-09-20/ensemblRecent

# this script will remove the leading and trailing N sequences from
# the fasta, and it will measure the leading N length to get the lift
# coordinate

printf '#!/usr/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: ./trimEnds.pl file.fa > file.fa 2> lift.file\n";
  exit 255;
}

my $skipBeginning = 1;
my @endTrim;
my $liftOffset = 0;
my $inFile = shift;
my $seqName = $inFile;
$seqName =~ s/.fa//;
$seqName =~ s#split/##;
open (FH, "<$inFile") or die "can not read $inFile";
while (my $line = <FH>) {
  chomp $line;
  if ($line =~ m/^>/) {
    printf "%s\n", $line;
  } elsif ($skipBeginning && ($line =~ m/^NN*N$/)) {
    $liftOffset += length($line);
  } elsif ($skipBeginning) {
       $skipBeginning = 0;
       my $lineLength = length($line);
       $line =~ s/^N+//;
       $liftOffset += $lineLength - length($line);
       printf STDERR "%s\t%d\n", $seqName, $liftOffset;
       push @endTrim, $line;
  } else {
       push @endTrim, $line;
  }
}
close (FH);

my $endTrimmed = 0;
my $arrayLength = scalar(@endTrim);
for (my $i = $arrayLength - 1; $endTrimmed < 1 && $i > 0; --$i) {
  if ($endTrim[$i] =~ m/^$/) {
     delete $endTrim[$i];
  } elsif ($endTrim[$i] =~ m/^N*$/) {
     delete $endTrim[$i];
  } else {
     $endTrimmed = 1;
     $endTrim[$i] =~ s/N+$//;
  }
}

$arrayLength = scalar(@endTrim);
for (my $i = 0; $i < $arrayLength; ++$i) {
  printf "%s\n", $endTrim[$i];
}
' > trimEnds.pl

chmod +x trimEnds.pl

mkdir -p noNs

cut -f2 ../../noKeyMatch.ucsc.ensembl.tsv | grep -v -w Y | while read C
do
  if [ -s "noNs/${C}.fa" ]; then
    printf "# done ${C}\n"
  else
    printf "# working ${C}\n"
    ./trimEnds.pl split/${C}.fa > noNs/${C}.fa
  fi
done 2> ensemblLift.tsv

### and make up a 2bit file with all these trimmed sequences
faToTwoBit noNs/*.fa noNs.2bit
### and idKeys via twoBitDup
twoBitDup -keyList=stdout noNs.2bit | sort > noNs.idKeys.txt

###########################################################################
# continue to match idKeys for these alternate sequences
###########################################################################

cd /hive/data/genomes/hg38/bed/chromAlias

join -t$'\t' ../idKeys.p13/hg38.idKeys.txt \
   ./update.2021-09-20/ensemblRecent/noNs.idKeys.txt \
    | cut -f2- | sort > alts.OK.ucsc.ensembl.tsv

# the outstanding mis-matches are due to Ensembl reversed sequence

comm -23 <(cut -f2 ../idKeys.p13/hg38.idKeys.txt | sort) \
  <(cut -f1 alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv | sort) \
     | grep -v 270752 > to.be.determined.ucsc.ensembl.txt

# reverse those UCSC sequences:

mkdir -p ucscRc

for C in `cat to.be.determined.ucsc.ensembl.txt`
do
   twoBitToFa ../../hg38.p13.2bit:${C} stdout | faRc stdin ucscRc/${C}_r.fa
done

faToTwoBit ucscRc/*_r.fa ./ucscRc.2bit
twoBitDup -keyList=stdout ucscRc.2bit | sort > ucscRc.idKeys.txt

join -t$'\t' ucscRc.idKeys.txt ./update.2021-09-20/ensemblRecent/noNs.idKeys \
  | cut -f2- | sed -e 's/RC_//;' > reversedAlts.ucsc.ensembl.tsv

### and finally, all together now:

sort alts.OK.ucsc.ensembl.tsv perfect.ucsc.ensembl.tsv \
   reversedAlts.ucsc.ensembl.tsv > ucscToEnsembl.hg38.p13.tsv

### after all this, there is one sequence in UCSC: chr15_KN538374v1_fix
### which can not be found in Ensembl.  I don't know why.  When I blat
### it at Ensembl, it says it is a section of chr15.  This sequence
### is also highly related to chr15_KI270905v1_alt

# load the ucscToEnsembl table:

printf '
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(23))
);
' > ucscToEnsembl.sql

hgsql hg38 -e 'drop table ucscToEnsembl;'
hgsql hg38 < ucscToEnsembl.sql
hgsql hg38 \
  -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.hg38.p13.tsv" INTO TABLE ucscToEnsembl'

printf '
CREATE TABLE ensemblLift (
    chrom varchar(255) not null,      # Ensembl chromosome name
    offset int unsigned not null,     # offset to add to UCSC position
              #Indices
    PRIMARY KEY(chrom(38))
);
' > ensemblLift.sql

hgsql hg38 -e 'drop table ensemblLift;'
hgsql hg38 < ensemblLift.sql
hgsql hg38 \
  -e 'LOAD DATA LOCAL INFILE "update.2021-09-20/ensemblRecent/ensemblLift.tsv" INTO TABLE ensemblLift'

