##############################################################################
### chain the chains obtained from the HPRC project
### done - 2023-08-14 - Hiram

mkdir /hive/data/genomes/hg38/bed/hprc/chain
cd /hive/data/genomes/hg38/bed/hprc/chain

# avoid the broken one GCA_018504655.1
ls ../bigChainToChain/hg38.chainHprc*.chain | grep -v GCA_018504655.1 \
  > chain.list

# using this 'runOne' script for each one:
#######################################################################

#!/bin/bash

set -beEu -o pipefail

export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit"
export chainFile="${1}"
export inChain="../bigChainToChain/${chainFile}"
export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`

# may already be done
if [ -s "${chainFile}" ]; then
  exit 0
fi

export buildDir=""
export twoBit=""
export asmId=""

case "${asmName}" in
    GCA_*) gcX="${asmName:0:3}"
       d0="${asmName:4:3}"
       d1="${asmName:7:3}"
       d2="${asmName:10:3}"
       tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`"
       buildDir="`realpath ${tB}`"
       asmId="`basename $buildDir`"
       twoBit="${buildDir}/${asmId}.2bit"
       ;;
     T2T-*)
       asmId="hs1"
       buildDir="/hive/data/genomes/$asmId"
       twoBit="${buildDir}/${asmId}.2bit"
       ;;
     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
       exit 255
       ;;
esac

printf "%s\n" "${chainFile}" 1>&2

time (chainToAxt "${inChain}" "${hg38TwoBit}" \
   "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
      stdin "${hg38TwoBit}" "${twoBit}" stdout | chainDupFilter stdin /scratch/tmp/${asmName}.noDup.chain /scratch/tmp/${asmName}.dup.chain ) > err/${chainFile} 2>&1
time (chainToAxt /scratch/tmp/${asmName}.dup.chain   "${hg38TwoBit}" \
   "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
      stdin "${hg38TwoBit}" "${twoBit}" stdout | cat - /scratch/tmp/${asmName}.noDup.chain | chainSort stdin stdout | awk '/chain/ {$13=NR} { print}' > "${chainFile}"  ) > err/${chainFile} 2>&1
rm -f /scratch/tmp/${asmName}.dup.chain /scratch/tmp/${asmName}.noDup.chain


##############################################################

to generate a jobList

printf "#LOOP
./runOne $(path1)
#ENDLOOP
" > template

gensub2 chain.list single template jobList

# running them all, one at a time

mkdir featBits counts err

time (~/kent/src/hg/utils/automation/perlPara.pl 1 jobList) > do.log 2>&1 
# real    236m9.479s

##############################################################################
# turn those chains into nets (DONE - 2023-08-15 - Hiram)

mkdir /hive/data/genomes/hg38/bed/hprc/net
cd /hive/data/genomes/hg38/bed/hprc/net

ls ../chain  | grep hg38.chainHprc > chain.list

printf '#LOOP
runOne $(path1) {check out exists+ $(root1).net}
#ENDLOOP
' > template

gensub2 chain.list single template jobList

para -ram=4g create jobList
para push
para time
Completed: 88 of 88 jobs
CPU time in finished jobs:        285s       4.74m     0.08h    0.00d  0.000 y
IO & Wait Time:                   206s       3.44m     0.06h    0.00d  0.000 y
Average job time:                   6s       0.09m     0.00h    0.00d
Longest finished job:               8s       0.13m     0.00h    0.00d
Submission to last job:             8s       0.13m     0.00h    0.00d

###########################

Loading these nets:

#!/bin/bash

set -beEu -o pipefail

ls hg38.chainHprc*.net | while read netFile
do
export asmName=`echo $netFile | sed -e 's/hg38.chainHprc//; s/.net//;'`

export buildDir=""
export twoBit=""
export asmId=""

case "${asmName}" in
    GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'`
       ;;
     T2T-*)
       asmId="Hs1"
       ;;
     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
       exit 255
       ;;
esac

printf "hgLoadNet -warn hg38 netHprc${asmId} ${netFile}\n" 1>&2
hgLoadNet -warn hg38 netHprc${asmId} ${netFile}

done
