
idKeys for Ensembl assemblies were
calculated in:  /hive/data/outside/ensembl/genomes/release-103/

New assemblies have been built for genbank, refseq and ucsc

Working in:
mkdir /hive/data/inside/assemblyEquivalence/2021-03-10
cd /hive/data/inside/assemblyEquivalence/2021-03-10

mkdir ensembl refseq genbank ucsc

###########  refseq keys  ##################
cd ../refseq

find /hive/data/genomes/asmHubs/refseqBuild/GCF \
    -maxdepth 4 -mindepth 4 -type d | while read buildDir
do
  asmId=`basename "${buildDir}"`
  keySig="${buildDir}/idKeys/${asmId}.keySignature.txt"
  idKeysDir=`dirname "${keySig}"`
  if [ -s "${keySig}" ]; then
     idKeys="$idKeysDir/$asmId.idKeys.txt"
     fullCount=`cat $idKeys | wc -l`
     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
     keySig=`cat "${keySig}"`
  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}"
  fi
done | sort -k1,1 > refseq.keySignatures.txt

###########  genbank keys  ##################
cd ../genbank

find /hive/data/genomes/asmHubs/genbankBuild/GCA \
    -maxdepth 4 -mindepth 4 -type d | while read buildDir
do
  asmId=`basename "${buildDir}"`
  keySig="${buildDir}/idKeys/${asmId}.keySignature.txt"
  idKeysDir=`dirname "${keySig}"`
  if [ -s "${keySig}" ]; then
     idKeys="$idKeysDir/$asmId.idKeys.txt"
     fullCount=`cat $idKeys | wc -l`
     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
     keySig=`cat "${keySig}"`
 printf "%s\t%s\t%d\t%d\n" "${keySig}" "${asmId}" "${fullCount}" "${uniqueCount}"
  fi
done | sort -k1,1 > genbank.keySignatures.txt

###########  UCSC keys  ##################
cd ../ucsc

ls -d /hive/data/genomes/* | while read dbDir
do
  db=`basename "${dbDir}"`
  keySig="${dbDir}/bed/idKeys/${db}.keySignature.txt"
  idKeysDir=`dirname "${keySig}"`
  if [ -s "${keySig}" ]; then
     idKeys="$idKeysDir/$db.idKeys.txt"
     fullCount=`cat $idKeys | wc -l`
     uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
     keySig=`cat "${keySig}"`
   printf "%s\t%s\t%d\t%d\n" "${keySig}" "${db}" "${fullCount}" "${uniqueCount}"
  fi
done | sort -k1,1 > ucsc.keySignatures.txt

cd /hive/data/inside/assemblyEquivalence/2021-03-10
ln -s /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/exact.sh .

###########  Ensembl keys  ##################
cd ensembl

# does not always work, use the wget instead
# rsync -av --stats \
# rsync://ftp.ensembl.org/ensembl/pub/release-103/species_EnsemblVertebrates.txt \
#    ./

wget --timestamping  \
ftp://ftp.ensembl.org/pub/release-103/species_EnsemblVertebrates.txt

# extract the GCx assembly names:

awk -F$'\t' '{printf "%s_%s\n", $6,$5}' species_EnsemblVertebrates.txt \
   | grep "^G" | sort > ensembl.v103.asmId.txt

# compare this to the ones we have on hand here

cut -f2 ../genbank/genbank.keySignatures.txt  | sort > genbank.asmId.here.txt

wc -l ensembl.v103.asmId.txt genbank.asmId.here.txt | sed -e 's/^/# /;'
#   295 ensembl.v103.asmId.txt
#  1428 genbank.asmId.here.txt

comm -12 ensembl.v103.asmId.txt genbank.asmId.here.txt | wc -l
#	270

# will need to catch up the missing 25:
comm -23 ensembl.v103.asmId.txt genbank.asmId.here.txt

find -L /hive/data/outside/ensembl/genomes/release-103/idKeys -type f \
  | grep keySignature.txt | while read K
do
  idKeysDir=`dirname "${K}"`
  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
  idKeys="$idKeysDir/$id.idKeys.txt"
  fullCount=`cat $idKeys | wc -l`
  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
  keySig=`cat "${K}"`
  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
done | sort -k1,1 > ensembl-103.keySignatures.txt

find /hive/data/outside/ensembl/genomes/release-101/idKeys -type f \
  | grep keySignature.txt | while read K
do
  idKeysDir=`dirname "${K}"`
  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
  idKeys="$idKeysDir/$id.idKeys.txt"
  fullCount=`cat $idKeys | wc -l`
  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
  keySig=`cat "${K}"`
  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
done | sort -k1,1 > ensembl-101.keySignatures.txt

find /hive/data/outside/ensembl/genomes/release-100/idKeys -type f \
  | grep keySignature.txt | while read K
do
  idKeysDir=`dirname "${K}"`
  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
  idKeys="$idKeysDir/$id.idKeys.txt"
  fullCount=`cat $idKeys | wc -l`
  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
  keySig=`cat "${K}"`
  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
done | sort -k1,1 > ensembl-100.keySignatures.txt

find /hive/data/outside/ensembl/genomes/release-99/idKeys -type f \
  | grep keySignature.txt | while read K
do
  idKeysDir=`dirname "${K}"`
  id=`basename "${K}" | sed -e 's/.keySignature.txt//;'`
  idKeys="$idKeysDir/$id.idKeys.txt"
  fullCount=`cat $idKeys | wc -l`
  uniqueCount=`cut -f1 $idKeys | sort -u | wc -l`
  keySig=`cat "${K}"`
  printf "%s\t%s\t%d\t%d\n" "${keySig}" "${id}" "${fullCount}" "${uniqueCount}"
done | sort -k1,1 > ensembl-99.keySignatures.txt

sort -u ensembl-103.keySignatures.txt ensembl-101.keySignatures.txt \
  ensembl-100.keySignatures.txt \
  ensembl-99.keySignatures.txt > ensembl.keySignatures.txt

###########  Exact matches  ##################

./exact.sh

~/kent/src/hg/makeDb/doc/assemblyEquivalence/exactTableTsv.pl \
  | sort > table.2021-03-10.tsv

###########  Near matches  ##################

mkdir /hive/data/inside/assemblyEquivalence/2021-03-10/notExact
cd /hive/data/inside/assemblyEquivalence/2021-03-10/notExact

sed -e 's/release-99/release-103/;' \
  /cluster/home/hiram/kent/src/hg/makeDb/doc/assemblyEquivalence/runOne > runOne
chmod +x runOne

time (~/kent/src/hg/makeDb/doc/assemblyEquivalence/allByAll.sh) > all.log 2>&1
# real    22m56.438s

sed -e 's/\tcount A .*//;' results/match.*.txt > notExact.table.2021-03-10.tsv

###########  Table contents to load  ##################

sort -u notExact.table.2021-03-10.tsv ../table.2021-03-10.tsv \
    > hgFixed.asmEquivalent.tsv

### Compare to existing:
hgsql -N -e 'select * from asmEquivalent;' hgFixed | sort \
   > existing.2021-03-10.tsv

wc -l hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv
  2546 hgFixed.asmEquivalent.tsv
  2344 existing.2021-03-10.tsv

### How about the important ones:

egrep "mm10|mm39|hg38|hg19" hgFixed.asmEquivalent.tsv
GCA_000001405.27_GRCh38.p12     hg38    genbank ucsc    595     595     595
GCF_000001635.27_GRCm39 mm39    refseq  ucsc    61      61      61
Mus_musculus.GRCm39     mm39    ensembl ucsc    61      61      61
hg38    GCA_000001405.27_GRCh38.p12     ucsc    genbank 595     595     595
mm39    GCF_000001635.27_GRCm39 ucsc    refseq  61      61      61
mm39    Mus_musculus.GRCm39     ucsc    ensembl 61      61      61

### most should be identical to before
join hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv | wc -l
         5144

### probably should *not* be losing any from before:
join -v 2 hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv | wc -l
      0
# if not 0, investigate.  Sometimes a new assembly is now an
# exact match to something where it was a near match before to
# a previous assembly of that organism.

### Showing 5 this time:
### This is because the nearMiss calculation is only taking place on the
### most recent Ensembl release, not all the past ones.  These two assemblies
### have new version in ens103 release, thus these older Ensembl assemblies
### are not being checked for near miss.  For now, going to leave these
### missing from the table.  Can't maintain all older Ensembl equivalents.

Esox_lucius.Eluc_V3 GCA_000721915.3_Eluc_V3 ensembl genbank 1018 1019 1018
GCA_000721915.3_Eluc_V3 Esox_lucius.Eluc_V3 genbank ensembl 1018 1018 1019
Sarcophilus_harrisii.DEVIL7.0 GCA_000189315.1_Devil_ref_v7.0 ensembl genbank 35974 35975 35974
Sarcophilus_harrisii.DEVIL7.0 GCF_000189315.1_Devil_ref_v7.0 ensembl refseq 35974 35975 35974
Sarcophilus_harrisii.DEVIL7.0 sarHar1 ensembl ucsc 35974 35975 35974


### there should be some new ones
join -v 1 hgFixed.asmEquivalent.tsv existing.2021-03-10.tsv | wc -l
         165

### There should be no duplicate equivalents:
cut -f1,2 hgFixed.asmEquivalent.tsv | sort | uniq -c | sort -rn | head
      1 zonAlb1 Zonotrichia_albicollis.Zonotrichia_albicollis-1.0.1
      1 zonAlb1 GCF_000385455.1_Zonotrichia_albicollis-1.0.1
      1 zonAlb1 GCA_000385455.1_Zonotrichia_albicollis-1.0.1
      1 xipMac1 GCA_000241075.1_Xiphophorus_maculatus-4.4.2
      1 xenTro9 Xenopus_tropicalis.Xenopus_tropicalis_v9.1
... etc ...

#### To load up new table contents:
hgLoadSqlTab hgFixed asmEquivalent ~/kent/src/hg/lib/asmEquivalent.sql \
   hgFixed.asmEquivalent.tsv

hgsql -N -e 'select * from asmEquivalent;' hgFixed \
   | sort > updated.2021-03-10.tsv

wc -l updated.2021-03-10.tsv existing.2021-03-10.tsv
  2546 updated.2021-03-10.tsv
  2344 existing.2021-03-10.tsv

# Previously:
  2344 updated.2021-03-10.tsv
  2320 existing.2021-03-10.tsv
