# for emacs: -*- mode: sh; -*-

# This file describes the construction of NCBI RefSeq track -- latest first

#########################################################################
# ncbiRefSeq.p11 Genes (DONE - 2018-01-09 - Angie)
# Previously done 2017-09-28; redone 11-16 to include mito "rna" from chrM genomic seq;
# redone 2018-01-09 to pick up changes in NCBI download files from 2017-12-22
    mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09
    cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.37_GRCh38.p11 hg38) > do.log 2>&1 & tail -f do.log
    # *** All done !  Elapsed time: 23m51s
    # real    23m51.091s

    cat fb.ncbiRefSeq.hg38.txt
    # 129593530 bases of 3049335806 (4.250%) in intersection

#########################################################################

#########################################################################
# ncbiRefSeq.p8 Genes (DONE - 2016-08-18 - Hiram)

    mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p8
    cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p8

    # running step wise as this script is still under development
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.34_GRCh38.p8 hg38) > download.log 2>&1
    # real    12m35.601s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=process -bigClusterHub=ku -dbHost=hgwdev \
      -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.34_GRCh38.p8 hg38) > process.log 2>&1
    # real    14m18.953s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=load -bigClusterHub=ku -dbHost=hgwdev \
      -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.34_GRCh38.p8 hg38) > load.log 2>&1
    # real    0m59.189s

    cat fb.ncbiRefSeq.hg38.txt
    #  141131948 bases of 3049335806 (4.628%) in intersection

#############################################################################

#########################################################################
# ncbiRefSeq.p7 Genes (DONE - 2016-05-20 - Hiram)

    mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p7
    cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p7

    # running step wise as this script is still under development
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.33_GRCh38.p7 hg38) > download.log 2>&1
    # real    12m35.601s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=process -bigClusterHub=ku -dbHost=hgwdev \
      -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.33_GRCh38.p7 hg38) > process.log 2>&1
    # real    14m18.953s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=load -bigClusterHub=ku -dbHost=hgwdev \
      -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.33_GRCh38.p7 hg38) > load.log 2>&1
    # real    0m59.189s

    cat fb.ncbiRefSeq.hg38.txt
    #  141131948 bases of 3049335806 (4.628%) in intersection

#############################################################################
mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq
cd /hive/data/genomes/hg38/bed/ncbiRefSeq

export asmName="GCF_000001405.31_GRCh38.p5"

# most of these files could be linked in from the refseq download, for example:
ln -s /hive/data/genomes/hg38/refseq/${asmName}_rna.gbff.gz ./

# or, fetched freshly from  NCBI refseq assemblies:
for F in ${asmName}_genomic.fna.gz ${asmName}_genomic.gff.gz \
   ${asmName}_rna.fna.gz ${asmName}_assembly_report.txt
do
 time rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/${asmName}/${F} ./
done

# measure these items:

faCount ${asmName}_rna.fna.gz > rna.faCount.txt
faCount ${asmName}_genomic.fna.gz > genomic.faCount.txt

egrep -v "^#seq|^total" rna.faCount.txt | cut -f1,2 \
   | sort -k2nr > rna.chrom.sizes
egrep -v "^#seq|^total" genomic.faCount.txt | cut -f1,2 \
   | sort -k2nr > genomic.chrom.sizes

# this constructes the unlifted genePred file with NCBI chrom names
gff3ToGenePred -useName -attrsOut=${asmName}.attrs.useName.txt \
    ${asmName}_genomic.gff.gz \
       ${asmName}.useName.gp

# extract additional names from column nine to use as an index for
#  the variety of names that identify each item
~/kent/src/hg/makeDb/doc/hg38/gffToIx.pl ${asmName}_genomic.gff.gz \
   > ${asmName}_genomic.ix.txt

# for psl alignments, only need to work on the items with Gap specifications
# The NG_ named items do not function correctly
zgrep -v "NG_" ${asmName}_genomic.gff.gz | grep "Gap=" >> only.gap.gff

# construct the PSL alignments from the Gap records
sed -e 's/cDNA_match/match/;' only.gap.gff \
  | gff3ToPsl genomic.chrom.sizes rna.chrom.sizes stdin ${asmName}.psl

# this doesn't work, too many UCSC names missing from the NCBI file:
grep -v "^#" ${asmName}_assembly_report.txt | sed -e 's/^M//g;' \
    | cut -f7,9,10 \
       |awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $3, $2}' \
           > insdcToHg38.lift

# the automated hubs process does construct a good NCBI to UCSC lift file:

export liftFile="/hive/data/inside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/${asmName}/${asmName}.ncbiToUcsc.lift"

# liftUp the genePred from NCBI chrom names to UCSC chrom names
liftUp -extGenePred -type=.gp stdout \
     ${liftFile} warn ${asmName}.useName.gp | gzip -c \
        > ${asmName}.ucsc.gp.gz
touch -r ${asmName}.useName.gp ${asmName}.ucsc.gp.gz
# some items will fail because they are on patches or alternative sequence
# from Patch 5 that does not exist in hg38
genePredCheck -db=hg38 ${asmName}.ucsc.gp.gz
# checked: 150709 failed: 1455

# filter out the patches and alternative sequences that are not in hg38
genePredFilter -db=hg38 ${asmName}.ucsc.gp.gz stdout \
    | gzip -c > ${asmName}.hg38.gp.gz
# this should be perfectly good
genePredCheck -db=hg38 ${asmName}.hg38.gp.gz
# checked: 149254 failed: 0


# liftUp the psl file from NCBI chrom names to UCSC chrom names
pslSwap ${asmName}.psl stdout \
   | liftUp -type=.psl stdout ${liftFile} drop stdin \
      | gzip -c > ${asmName}.ucsc.psl.gz
touch -r ${asmName}.psl ${asmName}.ucsc.psl.gz
# same problem here as above, not everything is in hg38,
#   filter patch 5 items out:
pslCheck -pass=stdout -db=hg38 ${asmName}.ucsc.psl.gz 2> /dev/null \
  | gzip -c > ${asmName}.hg38.psl.gz
pslCheck -db=hg38 ${asmName}.hg38.psl.gz
# checked: 1062 failed: 0 errors: 0

# separate categories of genes:

zegrep "^N(M|R)|^YP" ${asmName}.hg38.gp.gz > curated.gp
zegrep "^X(M|R)" ${asmName}.hg38.gp.gz > predicted.gp
zegrep -v "^N(M|R)|^YP|X(M|R)" ${asmName}.hg38.gp.gz > other.gp

# verify none lost
wc -l curated.gp predicted.gp other.gp
   54869 curated.gp
   88703 predicted.gp
    5682 other.gp
  149254 total

zcat ${asmName}.hg38.gp.gz | wc -l
149254

# extract CDS records from the rna.gbff.gz for bigPsl construction:
~/kent/src/hg/makeDb/doc/hg38/gbffToCds.pl ${asmName}_rna.gbff.gz > rna.cds

# ensure simple names are in the rna.fa file
zcat ${asmName}_rna.fna.gz | sed -e 's/ .*//;' > rna.fa
pslToBigPsl -fa=rna.fa -cds=rna.cds ${asmName}.hg38.psl.gz test.bigPsl
sort -k1,1 -k2,2n test.bigPsl > ${asmName}.hg38.bigPsl
bedToBigBed -type=bed12+12 -tab -as=$HOME/kent/src/hg/lib/bigPsl.as \
  ${asmName}.hg38.bigPsl ../../chrom.sizes ${asmName}.hg38.bigPsl.bb

# loading genePred tracks:

hgLoadGenePred -genePredExt hg38 ncbiRefSeqCurated curated.gp
genePredCheck -db=hg38 ncbiRefSeqCurated
# checked: 54869 failed: 0

hgLoadGenePred -genePredExt hg38 ncbiRefSeqPredicted predicted.gp
genePredCheck -db=hg38 ncbiRefSeqPredicted
# checked: 88703 failed: 0

hgLoadGenePred -genePredExt hg38 ncbiRefSeqOther other.gp
genePredCheck -db=hg38 ncbiRefSeqOther
# checked: 5682 failed: 0

########## early experiment, not used later
# # and the bigPsl file:
# mkdir -p /gbdb/hg38/bbi/ncbiRefSeq
# ln -s `pwd`/${asmName}.hg38.bigPsl.bb /gbdb/hg38/bbi/ncbiRefSeqBigPsl.bb
# hgBbiDbLink hg38 ncbiRefSeqBigPsl /gbdb/hg38/bbi/ncbiRefSeqBigPsl.bb
########## early experiment, not used later

#############################################################################
# addition of HGMD-restricted subset, Max, Jan 29 2019
# updated Dec 2019
# updated Dec 2020
# updated Aug 2023
# updated May 2024
# updated Dec 2024
year=2024
cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.2023-11-23/
# change in 2019: ignore the version numbers, otherwise only 1815 transcripts left, big update by HGMD in 2019?
# adding "." so NM_123 doesn't match NM_123123                                                                             
cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | cut -d. -f1 | sort -u | awk '{print $1"."}' > hgmdTranscripts.txt
zcat process/hg38.curated.gp.gz | fgrep -f hgmdTranscripts.txt - > hgmd.curated.gp
hgLoadGenePred -genePredExt hg38 ncbiRefSeqHgmd hgmd.curated.gp
# wc -l says:
10772 hgmd.curated.gp for the 2021 version
12602 hgmd.curated.gp for the 2023 version
15167 hgmd.curated.gp for the 2024 version
#############################################################################
# ncbiRefSeq.p13 update (DONE - 2019-12-06 - Hiram)

# current version information
    cat /gbdb/hg38/ncbiRefSeq/ncbiRefSeqVersion.txt
    # NCBI Homo sapiens Annotation Release 109 (2018-03-29)

# Version information from the file:

# /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/
#   all_assembly_versions/GCF_000001405.39_GRCh38.p13/
#   GCF_000001405.39_GRCh38.p13_genomic.gff.gz

#!annotation-date 09/05/2019
#!annotation-source NCBI Homo sapiens Updated Annotation Release 109.20190905

    mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06
    cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06

    ### BEFORE loading this updated table

    featureBits -countGaps hg38 ncbiRefSeq
 # 134109466 bases of 3257347282 (4.117%) in intersection

    featureBits -enrichment hg38 refGene ncbiRefSeq
 # refGene 3.098%, ncbiRefSeq 4.332%, both 2.920%, cover 94.23%, enrich 21.75x

    featureBits -enrichment hg38 ncbiRefSeq refGene
 # ncbiRefSeq 4.332%, refGene 3.098%, both 2.920%, cover 67.40%, enrich 21.75x

    featureBits -enrichment hg38 ncbiRefSeqCurated refGene
 # ncbiRefSeqCurated 2.880%, refGene 3.098%, both 2.846%, cover 98.84%, enrich 31.90x

    featureBits -enrichment hg38 refGene ncbiRefSeqCurated
 # refGene 3.098%, ncbiRefSeqCurated 2.880%, both 2.846%, cover 91.86%, enrich 31.90x

    # running step wise just to be careful
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.39_GRCh38.p13 hg38) > download.log 2>&1
    # real    3m23.090s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=process -bigClusterHub=ku -dbHost=hgwdev \
      -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.39_GRCh38.p13 hg38) > process.log 2>&1
    # real    6m10.922s


    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=load -bigClusterHub=ku -dbHost=hgwdev \
      -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Homo_sapiens \
      GCF_000001405.39_GRCh38.p13 hg38) > load.log 2>&1
    # real    0m41.366s

    ### AFTER loading this updated table
    # compare this result:
    cat fb.ncbiRefSeq.hg38.txt
    # 136778258 bases of 3095998939 (4.418%) in intersection

    # with previous version existing table (from fb above):
    # 134109466 bases of 3257347282 (4.117%) in intersection

    featureBits -enrichment hg38 refGene ncbiRefSeq
 # refGene 3.098%, ncbiRefSeq 4.418%, both 3.073%, cover 99.19%, enrich 22.45x
 # previous:
 # refGene 3.098%, ncbiRefSeq 4.332%, both 2.920%, cover 94.23%, enrich 21.75x

    featureBits -enrichment hg38 ncbiRefSeq refGene
 # ncbiRefSeq 4.418%, refGene 3.098%, both 3.073%, cover 69.56%, enrich 22.45x
 # previous:
 # ncbiRefSeq 4.332%, refGene 3.098%, both 2.920%, cover 67.40%, enrich 21.75x

    featureBits -enrichment hg38 ncbiRefSeqCurated refGene
 # ncbiRefSeqCurated 3.073%, refGene 3.098%, both 3.067%, cover 99.81%, enrich 32.22x
 # previous:
 # ncbiRefSeqCurated 2.880%, refGene 3.098%, both 2.846%, cover 98.84%, enrich 31.90x

    featureBits -enrichment hg38 refGene ncbiRefSeqCurated
 # refGene 3.098%, ncbiRefSeqCurated 3.073%, both 3.067%, cover 98.99%, enrich 32.22x
 # previous:
 # refGene 3.098%, ncbiRefSeqCurated 2.880%, both 2.846%, cover 91.86%, enrich 31.90x

#########################################################################
# addition of RefSeq Select-restricted subset, Max, Feb 10 2019
cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06/
zcat download/*_genomic.gff.gz | egrep 'tag=(RefSeq|MANE) Select'  | cut -f9- | tr ';' '\n' | grep Name= | grep -v NP_ | cut -d= -f2 | sort -u > refseqSelectTranscripts.txt
cat process/hg38.curated.gp | fgrep -f refseqSelectTranscripts.txt - > refseqSelect.curated.gp
hgLoadGenePred -genePredExt hg38 ncbiRefSeqSelect refseqSelect.curated.gp
wc -l refseqSelect.curated.gp
21071 refseqSelect.curated.gp
#############################################################################

#############################################################################
# update 2020-10-27 (DONE - Hiram - 2020-10-27)

  mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27
  cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27

  time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
      GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 &
  # real    11m46.506s

  cat fb.ncbiRefSeq.hg38.txt
  # 137385668 bases of 3110768607 (4.416%) in intersection

#############################################################################
# update 2021-05-13 (DONE - Hiram - 2021-05-13)

  mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13
  cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13

  time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
      GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 &
  # real    11m46.506s

  cat fb.ncbiRefSeq.hg38.txt
  # 137385668 bases of 3110768607 (4.416%) in intersection

#############################################################################

#############################################################################
# Add psuedo-track of old transcripts (DONE - ChrisL - 2023-09-21)
# Updated ChrisL 2023-10-06
#############################################################################
# see also /hive/data/outside/refSeqHistorical/newBuild.sh
# get the gff3
wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_genomic.gff.gz
wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam
wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam.bai
wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz

# extract the sequences
cp /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.txt chromAlias.txt
samtools fasta GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam > refSeqHistorical.fa
faToTwoBit -ignoreDups refSeqHistorical.fa refSeqHistorical.2bit
bamToPsl -nohead -chromAlias=chromAlias.txt \
        -allowDups GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam out.psl 
~/kent/src/hg/utils/automation/gbffToCds.pl GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \
    | sort > refSeqHistorical.cds
# Check our cds is correct:
hgsql -Ne "select * from ncbiRefSeqCds where id like 'NM%' order by rand() limit 6" hg38
# +----------------+-----------+
# | NM_001110798.2 | 263..1552 |
# | NM_001190818.2 | 181..2766 |
# | NM_002009.4    | 466..1050 |
# | NM_006896.4    | 131..823  |
# | NM_025190.4    | 193..4254 |
# | NM_177402.5    | 215..1474 |
# +----------------+-----------+
grep "NM_025190.4\|NM_001110798.2\|NM_006896.4\|NM_177402.5\|NM_002009.4\|NM_001190818.2" refSeqHistorical.cds
# NM_001110798.2    263..1552
# NM_001190818.2  181..2766
# NM_002009.4 466..1050
# NM_006896.4 131..823
# NM_025190.4 193..4254
# NM_177402.5 215..1474

# Now we have psl and cds for searching, make a genePred track for display
zcat GCF_000001405.40-RS_2023_03_genomic.gff.gz \
    | gff3ToGenePred -maxParseErrors=-1 -maxConvertErrors=-1 -warnAndContinue \
        -attrsOut=attrs.out -unprocessedRootsOut=unprocessedRoots.out \
        -refseqHacks stdin out.gp 2>err

# rename chromosomes to ucsc names, and remove the ids
# from the current version of the refSeq track
hgsql -Ne "select distinct(name) from ncbiRefSeq" hg38 | sort > ncbiRefSeq.currentIds
chromToUcsc -k 2 -a chromAlias.txt -i out.gp \
    | sort > out.ucscChrom.gp
chromToUcsc -k 14 -a chromAlias.txt -i out.psl \
    | sort > out.ucscChrom.psl
cut -f10 out.ucscChrom.psl | sort -u | comm -23 - ncbiRefSeq.currentIds > old.ids
grep -Fwf old.ids out.ucscChrom.gp > refSeqHistorical.gp
grep -Fwf old.ids out.ucscChrom.psl > refSeqHistorical.psl
# make sure we have the right sequences:
twoBitToFa -seqList=old.ids refSeqHistorical.2bit refSeqHistorical.deDuped.fa

# make the bed file for the track
hgLoadGenePred -genePredExt hg38 ncbiRefSeqHistorical refSeqHistorical.gp
#genePredToBigGenePred refSeqHistorical.gp stdout | sort -k1,1 -k2,2n > refSeqHistorical.bigGp
#bedToBigBed -type=bed12+8 -tab -sizesIsChromAliasBb \
#    -as=$HOME/kent/src/hg/lib/bigGenePred.as -extraIndex=name \
#    refSeqHistorical.bigGp \
#    /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.bb \
#    refSeqHistorical.bb

# ensure only coding transcripts have a cds
awk -F$'\\t' '$6 != $7 {print $1;}' refSeqHistorical.gp | sort -u > coding.cds.names
join -t$'\t' coding.cds.names <(sort -u refSeqHistorical.cds) > refSeqHistorical.cds.coding

# load psl table
hgLoadPsl hg38 -table=ncbiRefSeqPslHistorical refSeqHistorical.psl 

# load cds table
hgLoadSqlTab hg38 ncbiRefSeqCdsHistorical ~/kent/src/hg/lib/cdsSpec.sql refSeqHistorical.cds.coding

# load seq and ext tables
ln -sf `pwd`/refSeqHistorical.deDuped.fa /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa
hgLoadSeq -drop -seqTbl=seqNcbiRefSeqHistorical -extFileTbl=extNcbiRefSeqHistorical hg38 /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa

# link the files into /gbdb
ln -s `pwd`/refSeqHistorical.bb /gbdb/hg38/ncbiRefSeq/refSeqHistorical.bb

# Update 10-10-2023, need to build the ncbiRefSeqOld and ncbiRefSeqLinkOld
# tables for associated hgvs searching
# extract labels from semi-structured text in gbff COMMENT/description sections:
zcat GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \
  | (grep ' :: ' || true) \
  | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \
  | sort -u > pragmaLabels.txt
# genbank processor extracts infomation about the RNAs
/hive/data/outside/genbank/bin/x86_64/gbProcess \
    /dev/null raFile.txt GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \
    2>gbProcess.err
# Processing GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz into /dev/null and raFile.txt

# gbProcess has a bunch of bogus info, trim it to what we need:
cat /hive/data/outside/refSeqHistorical/pragmaLabels.txt \
    | tr -s '[:blank:]' \
    | cut -d':' -f1 \
    | sort -u \
    | cut -d' ' -f2- > pragmaLabels.cleaned.txt
~/kent/src/hg/utils/automation/gff3ToRefLink.pl \
    raFile.txt \
    GCF_000001405.40-RS_2023_03_genomic.gff.gz \
    pragmaLabels.cleaned.txt \
    continueOnParentErrors \
    2> refLink.stderr \
    | sort > refLink.tab
# join the refLink metadata with curated+predicted names
cut -f1 refSeqHistorical.gp | sort -u > name.list
join -t$'\t' name.list refLink.tab > ncbiRefSeqLinkOld.tab
# loading the cross reference data
hgLoadSqlTab hg38 ncbiRefSeqLinkHistorical ~/kent/src/hg/lib/ncbiRefSeqLink.sql ncbiRefSeqLinkOld.tab
