# for emacs: -*- mode: sh; -*-

# This file describes how we made the browser database on
# NCBI build 38 (December 2013 freeze) aka:
#	GRCh38 - Genome Reference Consortium Human Reference 38
#	Assembly Accession: GCA_000001405.2

#############################################################################
## Download sequence - DONE - 2013-12-24
    # create the assembly directory and a genbank/ directory to hold an
    # unaltered copy of the files delivered in the NCBI release
    mkdir /hive/data/genomes/hg38
    mkdir /hive/data/genomes/hg38/genbank
    cd /hive/data/genomes/hg38/genbank
    # copy the entire GRCh38 release directory into the current directory
    time rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/ ./
# sent 19643 bytes  received 4914689807 bytes  4490369.53 bytes/sec
# total size is 4914019581  speedup is 1.00

# real    18m14.497s

#############################################################################
## convert to UCSC names - DONE - 2013-12-24
#  with this release, NCBI has adopted a naming convention that is similar
#  to UCSC.  The delivered sequence with these names can be found in:
#  /hive/data/genomes/hg38/genbank/seqs_for_alignment_pipelines/
#
#  The following scripts reproduce this naming scheme from the separate
#  files in the release
#
    mkdir /hive/data/genomes/hg38/ucsc
    # the scripts below read ../genbank/ relative paths and write their
    # chr*.fa and chr*.agp results into the current directory, which the
    # config file expects to find in ucsc/, so work from there:
    cd /hive/data/genomes/hg38/ucsc
    cat << '_EOF_' > ucscCompositeAgp.pl
#!/usr/bin/env perl

# Convert the assembled chromosomes of the primary assembly to UCSC names.
# Reads the chromosome to accession table in chr2acc, then for each
# chromosome N writes chrN.agp and chrN.fa into the current directory with
# the genbank accession replaced by the name chrN.  All input paths are
# relative to ../genbank/ so this runs from a sibling directory of genbank/

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/assembled_chromosomes";

# key is genbank accession, value is chromosome name without the chr prefix
my %accToChr;

open (my $chr2acc, "<", "$srcDir/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc: $!";
while (my $line = <$chr2acc>) {
    next if ($line =~ m/^#/);
    next if ($line !~ m/\S/);   # a blank line has no accession to record
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close ($chr2acc);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";

    # AGP: replace the leading accession on each data line with chrN
    my $agpGz = "$srcDir/AGP/chr${chrN}.comp.agp.gz";
    # list form of the pipe open, zcat is run without going through a shell
    open (my $agpIn, "-|", "zcat", $agpGz)
        or die "can not read chr${chrN}.comp.agp.gz: $!";
    open (my $agpOut, ">", "chr${chrN}.agp")
        or die "can not write to chr${chrN}.agp: $!";
    while (my $line = <$agpIn>) {
        # comment lines pass through untouched; \Q..\E keeps the '.' in
        # the accession a literal character instead of a regex wildcard
        $line =~ s/^\Q$acc\E/chr${chrN}/ if ($line !~ m/^#/);
        print $agpOut $line;
    }
    # the open of a pipe succeeds even when zcat can not read its file,
    # that failure is only reported by the close
    close ($agpIn) or die "zcat failed on $agpGz, exit status $?";
    close ($agpOut) or die "error writing chr${chrN}.agp: $!";

    # FASTA: every header line becomes >chrN, sequence lines are copied
    my $faGz = "$srcDir/FASTA/chr${chrN}.fa.gz";
    open (my $faIn, "-|", "zcat", $faGz)
        or die "can not read chr${chrN}.fa.gz: $!";
    open (my $faOut, ">", "chr${chrN}.fa")
        or die "can not write to chr${chrN}.fa: $!";
    while (my $line = <$faIn>) {
        if ($line =~ m/^>/) {
            print $faOut ">chr${chrN}\n";
        } else {
            print $faOut $line;
        }
    }
    close ($faIn) or die "zcat failed on $faGz, exit status $?";
    close ($faOut) or die "error writing chr${chrN}.fa: $!";
}
'_EOF_'
    # << happy emacs
    chmod +x ucscCompositeAgp.pl

    cat << '_EOF_' > unlocalized.pl
#!/usr/bin/env perl

# Convert the unlocalized scaffolds of the primary assembly to UCSC names.
# Reads unlocalized.chr2scaf to learn which chromosome each scaffold
# accession belongs to, then for each chromosome N writes chrN_random.agp
# and chrN_random.fa into the current directory, with each scaffold named
# chrN_<accession>_random where the '.' in the accession becomes 'v'.
# All input paths are relative to ../genbank/

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/unlocalized_scaffolds";

# key is accession in UCSC form (. changed to v), value is chromosome name
my %accToChr;
# key is chromosome name, value is count of scaffolds on that chromosome
my %chrNames;

open (my $chr2scaf, "<", "$srcDir/unlocalized.chr2scaf") or
        die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf: $!";
while (my $line = <$chr2scaf>) {
    next if ($line =~ m/^#/);
    next if ($line !~ m/\S/);   # a blank line has no accession to record
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $acc =~ s/\./v/;
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close ($chr2scaf);

foreach my $chrN (keys %chrNames) {
    my $agpFile =  "$srcDir/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile =  "$srcDir/FASTA/chr$chrN.unlocalized.scaf.fa.gz";

    # AGP: the first column of each data line is the scaffold accession
    open (my $agpIn, "-|", "zcat", $agpFile) or die "can not read $agpFile: $!";
    open (my $agpOut, ">", "chr${chrN}_random.agp")
        or die "can not write to chr${chrN}_random.agp: $!";
    while (my $line = <$agpIn>) {
        if ($line =~ m/^#/) {
            print $agpOut $line;
            next;
        }
        chomp $line;
        my (@a) = split('\t', $line);
        my $acc = $a[0];
        $acc =~ s/\./v/;
        # every accession in this file must belong to this chromosome
        die "ERROR: chrN $chrN not correct for $acc"
            if (!exists($accToChr{$acc}) || $accToChr{$acc} ne $chrN);
        my $ucscName = "chr${chrN}_${acc}_random";
        print $agpOut join("\t", $ucscName, @a[1 .. $#a]), "\n";
    }
    # the open of a pipe succeeds even when zcat can not read its file,
    # that failure is only reported by the close
    close ($agpIn) or die "zcat failed on $agpFile, exit status $?";
    close ($agpOut) or die "error writing chr${chrN}_random.agp: $!";
    printf "chr%s\n", $chrN;

    # FASTA: pull the accession out of the gb|accession| part of the header
    open (my $faIn, "-|", "zcat", $fastaFile)
        or die "can not read $fastaFile: $!";
    open (my $faOut, ">", "chr${chrN}_random.fa")
        or die "can not write to chr${chrN}_random.fa: $!";
    while (my $line = <$faIn>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            $acc =~ s/.*gb\|//;
            $acc =~ s/. Homo.*//;
            $acc =~ s/\./v/;
            die "ERROR: chrN $chrN not correct for $acc"
                if (!exists($accToChr{$acc}) || $accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${acc}_random";
            # print, not printf: the name is data, not a format string
            print $faOut ">$ucscName\n";
        } else {
            print $faOut $line;
        }
    }
    close ($faIn) or die "zcat failed on $fastaFile, exit status $?";
    close ($faOut) or die "error writing chr${chrN}_random.fa: $!";
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl

    cat << '_EOF_' > unplaced.pl
#!/usr/bin/env perl

# Convert the unplaced scaffolds of the primary assembly to UCSC names.
# Writes chrUn.agp and chrUn.fa into the current directory, with each
# scaffold named chrUn_<accession> where the '.' in the accession
# becomes 'v'.  All input paths are relative to ../genbank/

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# list form of the pipe open, zcat is run without going through a shell
open (my $agpIn, "-|", "zcat", $agpFile) or die "can not read $agpFile: $!";
open (my $agpOut, ">", "chrUn.agp") or die "can not write to chrUn.agp: $!";
while (my $line = <$agpIn>) {
    if ($line !~ m/^#/) {
        # only the first '.' on the line is changed, as in the original
        # procedure; that is expected to be the one in the accession
        $line =~ s/\./v/;
        $line = "chrUn_" . $line;
    }
    print $agpOut $line;
}
# the open of a pipe succeeds even when zcat can not read its file,
# that failure is only reported by the close
close ($agpIn) or die "zcat failed on $agpFile, exit status $?";
close ($agpOut) or die "error writing chrUn.agp: $!";

open (my $faIn, "-|", "zcat", $fastaFile) or die "can not read $fastaFile: $!";
open (my $faOut, ">", "chrUn.fa") or die "can not write to chrUn.fa: $!";
while (my $line = <$faIn>) {
    if ($line =~ m/^>/) {
        chomp $line;
        # reduce the header to the accession found after gb|
        $line =~ s/.*gb\|//;
        $line =~ s/. Homo.*//;
        $line =~ s/\./v/;
        # print, not printf: the name is data, not a format string
        print $faOut ">chrUn_$line\n";
    } else {
        print $faOut $line;
    }
}
close ($faIn) or die "zcat failed on $fastaFile, exit status $?";
close ($faOut) or die "error writing chrUn.fa: $!";
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    cat << '_EOF_' > altSequence.pl
#!/usr/bin/env perl

# Convert the alternate sequence scaffolds to UCSC names.
# For every alt_scaffold_placement.txt found under ../genbank/ALT*, read
# the placement table to construct the UCSC name chr<parent>_<acc>_alt for
# each scaffold (the '.' in the accession becomes 'v'), then rewrite the
# AGP and FASTA files found next to that placement file.  All results are
# collected in chrAlt.agp and chrAlt.fa in the current directory, and each
# "accession ucscName" pair is printed to stdout.

use strict;
use warnings;
use File::Basename;

open (my $agpOut, ">", "chrAlt.agp") or die "can not write to chrAlt.agp: $!";
open (my $faOut, ">", "chrAlt.fa") or die "can not write to chrAlt.fa: $!";
# a single command string so the shell expands ALT* and runs the pipeline
open (my $placements, "-|", "find ../genbank/ALT* -type f | grep alt_scaffold_placement.txt") or die "can not find alt_scaffold_placement.txt files";
while (my $file = <$placements>) {
  chomp $file;
  my $dirName = dirname($file);
  my $agpFile = "$dirName/AGP/alt.scaf.agp.gz";
  my $fastaFile = "$dirName/FASTA/alt.scaf.fa.gz";
  # key is genbank acc name, value is UCSC chr name
  my %nameDelta;
#  printf STDERR "# %s\n", $file;
  open (my $placement, "<", $file) or die "can not read $file: $!";
  while (my $line = <$placement>) {
     next if ($line =~ m/^#/);
     chomp $line;
     # columns in the placement file:
     #  0 alt_asm_name, 1 prim_asm_name, 2 alt_scaf_name, 3 alt_scaf_acc,
     #  4 parent_type, 5 parent_name, 6 parent_acc, 7 region_name, 8 ori,
     #  9 alt_scaf_start, 10 alt_scaf_stop, 11 parent_start, 12 parent_stop,
     #  13 alt_start_tail, 14 alt_stop_tail
     # only the scaffold accession and the parent name are needed here
     my ($alt_scaf_acc, $parent_name) = (split('\t', $line))[3, 5];
     my $ucscAcc = $alt_scaf_acc;
     $ucscAcc =~ s/\./v/;
     my $ucscName = sprintf("chr%s_%s_alt", $parent_name, $ucscAcc);
     printf "%s %s\n", $alt_scaf_acc, $ucscName;
     if (exists ($nameDelta{$alt_scaf_acc})) {
         # a repeated accession is fine as long as it gets the same name
         die "duplicate name incorrect ? $alt_scaf_acc $nameDelta{$alt_scaf_acc} ne $ucscName" if ($nameDelta{$alt_scaf_acc} ne $ucscName);
     } else {
         $nameDelta{$alt_scaf_acc} = $ucscName;
     }
  }
  close ($placement);

  # AGP: swap the accession in the first column for the UCSC name
  open (my $agpIn, "-|", "zcat", $agpFile) or die "can not read $agpFile: $!";
  while (my $line = <$agpIn>) {
     if ($line =~ m/^#/) {
       print $agpOut $line;
     } else {
       my ($acc, $rest) = split('\t', $line, 2);
       die "can not find ucsc name for $acc" if (!exists($nameDelta{$acc}));
       printf $agpOut "%s\t%s", $nameDelta{$acc}, $rest;
     }
  }
  # the open of a pipe succeeds even when zcat can not read its file,
  # that failure is only reported by the close
  close ($agpIn) or die "zcat failed on $agpFile, exit status $?";

  # FASTA: reduce each header to its accession and look up the UCSC name
  open (my $faIn, "-|", "zcat", $fastaFile) or die "can not read $fastaFile: $!";
  while (my $line = <$faIn>) {
     chomp $line;
     if ($line =~ m/^>/) {
       $line =~ s/.*gb.//;
       $line =~ s/. Homo.*//;
       die "can not find ucsc name for $line" if (!exists($nameDelta{$line}));
       printf $faOut ">%s\n", $nameDelta{$line};
     } else {
       printf $faOut "%s\n", $line;
     }
  }
  close ($faIn) or die "zcat failed on $fastaFile, exit status $?";
}
# grep exits non-zero when no placement file was found, without this check
# the result would be two empty output files and no error
close ($placements) or die "can not find alt_scaffold_placement.txt files, exit status $?";
close ($agpOut) or die "error writing chrAlt.agp: $!";
close ($faOut) or die "error writing chrAlt.fa: $!";
'_EOF_'
    # << happy emacs
    chmod +x altSequence.pl

    # run the four naming scripts created above, each one writes its
    # chr*.agp and chr*.fa results into the current directory
    ./ucscCompositeAgp.pl
    ./unlocalized.pl
    ./unplaced.pl
    ./altSequence.pl

    # temporarily verify the fasta and AGP are complete and compatible
    # (only the last line of the checkAgpAndFa output is of interest)
    faToTwoBit chr*.fa hg38.test.2bit
    cat chr*.agp > hg38.agp
    checkAgpAndFa hg38.agp hg38.test.2bit 2>&1 | tail -1
# All AGP and FASTA entries agree - both files are valid

    rm -f hg38.agp hg38.test.2bit

    # comparing faCounts of this 2bit file and the sequences delivered
    # in genbank/seqs_for_alignment_pipelines/
    # result in the exact same sequence

#############################################################################
## initial db build - DONE - 2013-12-24 - Hiram

    cd /hive/data/genomes/hg38
    cat << '_EOF_' > hg38.config.ra
# Config parameters for makeGenomeDb.pl:
db hg38
scientificName Homo sapiens
commonName Human
assemblyDate Dec. 2013
assemblyLabel GRCh38 Genome Reference Consortium Human Reference 38 (GCA_000001405.2)
assemblyShortLabel GRCh38
orderKey 13
mitoAcc none
fastaFiles /hive/data/genomes/hg38/ucsc/chr*.fa
agpFiles /hive/data/genomes/hg38/ucsc/chr*.agp
# qualFiles /dev/null
dbDbSpeciesDir human
photoCreditURL http://www.cbse.ucsc.edu/
photoCreditName Graphic courtesy of CBSE
ncbiGenomeId 51
ncbiAssemblyId 883148
ncbiAssemblyName GRCh38
ncbiBioProject 31257
# same accession as given in the assemblyLabel above
genBankAccessionID GCA_000001405.2
taxId   9606
'_EOF_'
    # << happy emacs

    # step wise to first verify AGP and Fasta files
    time makeGenomeDb.pl -stop=agp hg38.config.ra > agp.log 2>&1

    # looking good, continue:
    time makeGenomeDb.pl -continue=db hg38.config.ra > db.log 2>&1

    # add the files produced by the trackDb build to the source tree

    # this path is fixed in the makeGenomeDb.pl for next time
    # honor new convention for bbi location files:
    cd /gbdb/hg38/bbi
    mkdir gc5BaseBw
    mv gc5Base.bw gc5BaseBw
    cd gc5BaseBw
    # before
    hgsql -e 'select * from gc5BaseBw;' hg38
# +---------------------------+
# | fileName                  |
# +---------------------------+
# | /gbdb/hg38/bbi/gc5Base.bw |
# +---------------------------+
    # and fixed
    hgBbiDbLink hg38 gc5BaseBw `pwd`/gc5Base.bw
    hgsql -e 'select * from gc5BaseBw;' hg38
# +-------------------------------------+
# | fileName                            |
# +-------------------------------------+
# | /gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw |
# +-------------------------------------+

#############################################################################
## RepeatMasker with CrossMatch - DONE - 2013-12-24,27 - Hiram
    mkdir /hive/data/genomes/hg38/bed/repeatMaskerCM
    cd /hive/data/genomes/hg38/bed/repeatMaskerCM
    # running this step wise so it can be loaded into its own table
    time doRepeatMasker.pl -stop=mask -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
    # real    3443m13.026s
# RepeatMasker version June 20 2013 open-4.0.3
# Search Engine: cross-match version 1.090518
# RepeatMasker Database: 20130422

    # take the install script from this -debug run and alter it to load
    # the table into rmskCM
    time doRepeatMasker.pl -continue=install -stop=install -debug \
       -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38
    cat fb.hg38.rmskCM.txt
    # 1586326530 bases of 3209286105 (49.429%) in intersection

    # profile of repeat elements:
#  1852545 rmskClass/SINE.tab
#  1570523 rmskClass/LINE.tab
#   748597 rmskClass/LTR.tab
#   703682 rmskClass/Simple_repeat.tab
#   499108 rmskClass/DNA.tab
#   102856 rmskClass/Low_complexity.tab
#     7962 rmskClass/Satellite.tab
#     5750 rmskClass/Retroposon.tab
#     5667 rmskClass/LTR?.tab
#     5622 rmskClass/Unknown.tab
#     4516 rmskClass/snRNA.tab
#     3294 rmskClass/DNA?.tab
#     2026 rmskClass/tRNA.tab
#     1840 rmskClass/rRNA.tab
#     1784 rmskClass/RC.tab
#     1672 rmskClass/srpRNA.tab
#     1420 rmskClass/scRNA.tab
#      704 rmskClass/RNA.tab
#      411 rmskClass/RC?.tab
#       38 rmskClass/SINE?.tab

    # using this RM result with trfMask for the final masked sequence
    cd /hive/data/genomes/hg38
    twoBitMask hg38.rmskCM.2bit -add bed/simpleRepeat/trfMask.bed hg38.2bit
    twoBitToFa hg38.2bit stdout | faSize stdin > faSize.hg38.2bit.txt
# 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper 1588630985 lower) in 455 sequences in 1 files
# %49.50 masked total, %52.10 masked real

    featureBits -countGaps hg38 rmskCM '!rmskHmmer' -bed=crossMatchUnique.bed
    # 24868153 bases of 3209286105 (0.775%) in intersection
    hgLoadBed hg38 crossMatchUnique crossMatchUnique.bed
    # Read 2352219 elements of size 4 from crossMatchUnique.bed

#############################################################################
## repeating RepeatMasker Blastn run (DONE - 2014-01-07 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/rmskBlastn
    cd /hive/data/genomes/hg38/bed/rmskBlastn

    # record stderr in the log too, same as the other logged mask runs
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
      -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
        -stop=mask -buildDir=`pwd` hg38 > mask.log 2>&1
    # real    203m33.670s

# 3209286105 bases (159970322 N's 3049315783 real 1491207906 upper 1558107877 lower) in 455 sequences in 1 files
# %48.55 masked total, %51.10 masked real

    # install step with debug so the script can be altered to load into
    # a specific rmskBlastn table:

    $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
      -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
        -continue=install -debug -buildDir=`pwd` hg38

#############################################################################
## repeating RepeatMasker cross-match run (DONE - 2014-01-07 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/rmskCM
    cd /hive/data/genomes/hg38/bed/rmskCM

    # no -useRMBlastn or -useHMMER option here, this is the cross-match
    # run of the section title, stopping after the mask step
    # missed recording stderr ....  forgot the 2>&1
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
      -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
        -stop=mask -buildDir=`pwd` hg38 > mask.log
    # real    1897m33.517s
    # running from Tue Jan  7 16:10:33 PST 2014 thru 08 Jan 23:48
#  *** All done!  (through the 'mask' step) - Elapsed time: 1897m34s
#  *** Steps were performed in /hive/data/genomes/hg38/bed/rmskCM
    # running install manually to allow edit of the script to load
    # a specific rmskCM table
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
      -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
        -continue=install -stop=install -buildDir=`pwd` hg38 -debug

#############################################################################
## RepeatMasker with RM Blastn - DONE - 2013-12-24,25 - Hiram
    mkdir /hive/data/genomes/hg38/bed/repeatMaskerBlastn
    cd /hive/data/genomes/hg38/bed/repeatMaskerBlastn
    # running this step wise so it can be loaded into its own table
    time doRepeatMasker.pl -stop=mask -useRMBlastn -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
    # real    354m55.842s

    # take the install script from this -debug run and alter it to load
    # the table into rmskBlastn
    doRepeatMasker.pl -useRMBlastn -bigClusterHub=ku  -continue=install \
     -stop=install -debug -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38
    # 1560264046 bases of 3209286105 (48.617%) in intersection
    # profile of repeat elements:
#   1824560 rmskClass/SINE.tab
#   1552814 rmskClass/LINE.tab
#    738435 rmskClass/LTR.tab
#    715998 rmskClass/Simple_repeat.tab
#    486591 rmskClass/DNA.tab
#    105026 rmskClass/Low_complexity.tab
#      7712 rmskClass/Satellite.tab
#      5638 rmskClass/Retroposon.tab
#      5276 rmskClass/Unknown.tab
#      5100 rmskClass/LTR?.tab
#      4548 rmskClass/snRNA.tab
#      3033 rmskClass/DNA?.tab
#      1987 rmskClass/tRNA.tab
#      1809 rmskClass/rRNA.tab
#      1710 rmskClass/RC.tab
#      1633 rmskClass/srpRNA.tab
#      1428 rmskClass/scRNA.tab
#       614 rmskClass/RNA.tab
#       376 rmskClass/RC?.tab
#        38 rmskClass/SINE?.tab
#         3 rmskClass/Unspecified.tab
#   5464329 total

#############################################################################
## repeating RepeatMasker run with HMMER - DONE - 2014-01-08 - Hiram
    mkdir /hive/data/genomes/hg38/bed/rmskHmmer
    cd /hive/data/genomes/hg38/bed/rmskHmmer

    # trying cpu=4 and ram=32g
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
      -stop=mask -useHMMER -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
    # 6 jobs required more than 32 Gb of memory to complete, ran them on
    # hgwdev to complete, then continuing:
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
      -continue=cat -stop=mask -useHMMER -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > cat.log 2>&1
    #  real    24m5.274s
# 3209286105 bases (159970322 N's 3049315783 real 1314916231 upper 1734399552 lower) in 455 sequences in 1 files
# %54.04 masked total, %56.88 masked real

    # running install manually to allow edit of the script to load
    # a specific rmskHmmer table
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
      -continue=install -debug -useHMMER -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38

    time ./doLoad_rmskHmmer.bash > load.log 2>&1
    # real    4m47.432s

    featureBits -countGaps hg38 rmskHmmer > fb.hg38.rmskHmmer.txt 2>&1
    # 1734398971 bases of 3209286105 (54.043%) in intersection

    grep rmskClass hg38.class.profile.txt \
        | sed -e 's#rmskClass/##; s/.tab//;' | sort -rn
    # profile of repeat elements:
#  1884179 SINE
#  1702529 LINE
#   805427 LTR
#   636906 Simple_repeat
#   565171 DNA
#    95480 Low_complexity
#    11861 Retroposon
#    10852 Satellite
#     9181 LTR?
#     6783 scRNA
#     4582 DNA?
#     3914 Unknown
#     2059 RC
#     1517 srpRNA
#     1484 RNA
#      970 SINE?
#      806 RC?
#      464 rRNA
#  5744165 total

    featureBits -countGaps hg38 rmskHmmer '!rmskCM' -bed=hmmerUnique.bed
    # 172940594 bases of 3209286105 (5.389%) in intersection
    hgLoadBed hg38 hmmerUnique hmmerUnique.bed
    # Read 3099505 elements of size 4 from hmmerUnique.bed

#############################################################################
## RepeatMasker with HMMER - DONE - 2013-12-24,26 - Hiram
    mkdir /hive/data/genomes/hg38/bed/repeatMaskerHMMER
    cd /hive/data/genomes/hg38/bed/repeatMaskerHMMER

    # running this step wise so it can be loaded into its own table
    time doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
    # take the install script from this -debug run and alter it to load
    # the table into rmskHmmer
    # (the -debug option is required for that, as in the cross-match and
    #  blastn procedures; the log goes to its own file so the mask.log
    #  from the step above is not overwritten)
    doRepeatMasker.pl -continue=install -stop=install -debug -useHMMER \
      -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
         -buildDir=`pwd` hg38 > install.log 2>&1
    # 1702017722 bases of 3209286105 (53.034%) in intersection
    # profile of repeat elements:
#   1879864 rmskClass/SINE.tab
#   1678216 rmskClass/LINE.tab
#    794231 rmskClass/LTR.tab
#    651561 rmskClass/Simple_repeat.tab
#    551965 rmskClass/DNA.tab
#     97186 rmskClass/Low_complexity.tab
#     10756 rmskClass/Retroposon.tab
#     10448 rmskClass/Satellite.tab
#      8393 rmskClass/LTR?.tab
#      5849 rmskClass/scRNA.tab
#      4282 rmskClass/Unknown.tab
#      4276 rmskClass/DNA?.tab
#      2000 rmskClass/RC.tab
#      1573 rmskClass/srpRNA.tab
#      1291 rmskClass/RNA.tab
#       906 rmskClass/snRNA.tab
#       747 rmskClass/SINE?.tab
#       723 rmskClass/RC?.tab
#       722 rmskClass/rRNA.tab
#       468 rmskClass/tRNA.tab
#   5705457 total

#############################################################################
# rmsk from genbank release (DONE - 2014-12-25 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/repeatMaskerGenbank
    cd /hive/data/genomes/hg38/bed/repeatMaskerGenbank

    # begin with the three header lines of an existing RepeatMasker out file
    head -3 ../repeatMaskerBlastn/hg38.fa.out > genbank.rm.out
    # append the body of every rm.out file delivered in the release, less
    # the first three lines of each, not using the placed_scaffolds copies.
    # The sort key must stop at column 5 (-k5,5) so that column 6 is
    # actually used as the numeric second key.
find ../../genbank -type f | grep rm.out | grep -v "/placed_scaffolds/" | while read F
do
  headRest 3 $F
done | sort -k5,5 -k6,6n >> genbank.rm.out
    grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \
       | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt

    sed -e "`cat accessionToUcsc.sed.txt`" genbank.rm.out > ucscNames.rm.out

    head -3 ucscNames.rm.out > hg38.sorted.fa.out
    tail -n +4 ucscNames.rm.out  | sort -k5,5 -k6,6n >> hg38.sorted.fa.out

    hgLoadOut -table=rmskGenbank -nosplit hg38 hg38.sorted.fa.out
    hgLoadOut -verbose=2 -tabFile=hg38.rmskGenbank.tab -table=rmskGenbank \
       -nosplit hg38 hg38.sorted.fa.out 2> bad.records.txt
    # fixed up one of the masking scripts from the other runs to construct
    # the bbi files

    # 1581568556 bases of 3209286105 (49.281%) in intersection
    # profile of repeat elements:
#   1849444 rmskClass/SINE.tab
#   1586141 rmskClass/LINE.tab
#    759248 rmskClass/LTR.tab
#    502186 rmskClass/DNA.tab
#    433789 rmskClass/Simple_repeat.tab
#    396378 rmskClass/Low_complexity.tab
#     10198 rmskClass/Satellite.tab
#      5884 rmskClass/LTR?.tab
#      4595 rmskClass/snRNA.tab
#      4163 rmskClass/Retroposon.tab
#      2802 rmskClass/Unknown.tab
#      2157 rmskClass/DNA?.tab
#      2154 rmskClass/tRNA.tab
#      1915 rmskClass/rRNA.tab
#      1860 rmskClass/RC.tab
#      1784 rmskClass/srpRNA.tab
#      1397 rmskClass/scRNA.tab
#       822 rmskClass/RNA.tab
#       488 rmskClass/SINE?.tab
#       445 rmskClass/RC?.tab
#   5567850 total

#############################################################################
## running TRF simple repeats - DONE - 2013-12-24,29 - Hiram
    # this procedure ran into much trouble on this release.  The new
    # repeat sequences in the centromeres caused trf to run indefinitely.
    # I tried different sizes of chunks, working down to 20 Mbase chunks.
    # Even still, some jobs would not complete.  Those broke down even
    # more, eventually to the smallest bit of 30 Kbase that needed to
    # run all the way down to 3,000 base chunks with 1,000 base overlaps.

    # this did not work:
    screen # use screen to manage this day-long job
    mkdir /hive/data/genomes/hg38/bed/simpleRepeat
    cd /hive/data/genomes/hg38/bed/simpleRepeat
    time doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \
	-smallClusterHub=ku -buildDir=`pwd` hg38 > do.log 2>&1
    cd /hive/data/genomes/hg38/bed
    # move it aside:
    mv simpleRepeat simpleRepeat.2013-12-24

    # Instead, something like this:
    mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap
    cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap
    mkdir -p noGap

    twoBitToFa ../../../hg38.unmasked.2bit stdout \
       | faSplit -lift=noGap.lift gap stdin 5000000 noGap/hg38_
    # make sure nothing has gone missing:
    faCount noGap/*.fa > faCount.txt
    tail -1 faCount.txt
# total 3068387174 898285419 623727342 626335137 900967885  19071391 30979734
    # compared to the full sequence, same numbers for ACGT:
    twoBitToFa ../../../hg38.unmasked.2bit stdout | faCount stdin
# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743
    faToTwoBit noGap/*.fa hg38.nogap.2bit
    twoBitInfo hg38.nogap.2bit stdout | sort -k2,2nr > hg38.nogap.sizes


    mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
    cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
    rm -rf /hive/data/genomes/hg38/TrfPart20M
    /cluster/bin/scripts/simplePartition.pl \
/hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap/hg38.nogap.2bit \
   20000000 /hive/data/genomes/hg38/TrfPart20M
   rm -f /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M
   ln -s /hive/data/genomes/hg38/TrfPart20M \
      /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M
   ssh ku
   cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
   gensub2 /hive/data/genomes/hg38/TrfPart20M/partitions.lst single gsub jobList
   para create jobList
   para push
   # 20 jobs would not complete:
# Completed: 143 of 163 jobs
# Jobs currently running: 20
# CPU time in finished jobs:      76994s    1283.24m    21.39h    0.89d  0.002 y
# IO & Wait Time:                  1095s      18.24m     0.30h    0.01d  0.000 y
# Time in running jobs:         1807279s   30121.32m   502.02h   20.92d  0.057 y
# Average job time:                 546s       9.10m     0.15h    0.01d
# Longest running job:            90422s    1507.03m    25.12h    1.05d
# Longest finished job:           43348s     722.47m    12.04h    0.50d
# Submission to last job:         43363s     722.72m    12.05h    0.50d
   # determine which are the last jobs as individual bits:
   # record the last two words of each job line that is not done,
   # keeping only the TrfRun job lines
   para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \
     > not.done.list
   # the last word is the bed output name, removing a literal trailing
   # .bed (escaped and anchored so nothing else in the path can match)
   # gives the file of sequence specs that job was working on
   awk '{print $NF}' not.done.list | sed -e 's/\.bed$//' | while read F
do
   cat $F
done > seq.specs.not.done

   mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs
   cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs
   mkdir fasta
   # extract each unfinished sequence spec into its own fasta file, named
   # from the part of the spec after the last colon
   for seqSpec in `cat ../seq.specs.not.done`
do
  fName=`echo $seqSpec | sed -e 's/.*://'`
  echo $fName
  twoBitToFa $seqSpec fasta/$fName.fa
done
  # list the fasta files, largest first, with full path names as in the
  # part.list of the later batches; an ls of the directory itself would
  # only give bare file names that can not be found from the run directory
  ls -1S `pwd`/fasta/*.fa > part.list
  cat << '_EOF_' > template
#LOOP
./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
#ENDLOOP
'_EOF_'
  # << happy emacs

  cat << '_EOF_' > runTrf
#!/bin/bash
# Run trfBig on one fasta file in a private /dev/shm work directory and
# copy the resulting bed file back to the lastJobs run directory.
#   $1 - path to the input fasta file
#   $2 - name of the bed result file, relative to the lastJobs directory
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
# the work directory is named after the output file name
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
# the bed result is written to a file called $outputFN in the work directory
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
# return to the run directory since $outpath is relative to it
cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
# clean up the work directory
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
  # << happy emacs
  chmod +x runTrf

  gensub2 part.list single template jobList
  para create jobList
  para push
  # not all of these jobs will finish either:
# Completed: 85 of 106 jobs
# Jobs currently running: 21
# CPU time in finished jobs:      58076s     967.93m    16.13h    0.67d  0.002 y
# IO & Wait Time:                   828s      13.81m     0.23h    0.01d  0.000 y
# Time in running jobs:         1988997s   33149.95m   552.50h   23.02d  0.063 y
# Average job time:                 693s      11.55m     0.19h    0.01d
# Longest running job:            94730s    1578.83m    26.31h    1.10d
# Longest finished job:           34216s     570.27m     9.50h    0.40d
# Submission to last job:         34342s     572.37m     9.54h    0.40d

  # can use what we have here:
  liftUp result.bed ../../splitGap/noGap.lift error bed/*.bed
  # find jobs not done
  # the jobs in this batch are ./runTrf <fasta path> <bed path>, there is
  # no TrfRun in those job lines to grep for, so record the fasta path
  # (next to last word) of every job not done, same as is done for the
  # will.not.finish.txt list of the following batch
  para status | grep -v -w done | awk '{print $(NF-1)}' > not.done.list
  # splitting up those last jobs:
  mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits
  cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits
  mkdir noGap
  # copy in the fasta files of the jobs that did not finish
  cat ../lastJobs/not.done.list | while read F
do
  cp -p $F ./noGap/
done

  # split into 1,000,000 chunks with 10,000 overlap:
  mkdir -p 1M_10K

# split each unfinished fasta file into 1,000,000 base pieces with 10,000
# bases of overlap, writing a lift file per input file
for F in noGap/*.fa
do
  # basename removes the directory and the literal .fa suffix
  B=`basename $F .fa`
  # ${B}_ with braces, $B_ would be the value of an unset variable named B_
  echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_"
  faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_
done

  ls -1S `pwd`/1M_10K/*.fa > part.list
  # job script: run trfBig on one fasta file ($1) in a /dev/shm work
  # directory and copy the bed result to $2 in the splitBits directory
  cat << '_EOF_' > runTrf
#!/bin/bash
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
  # << happy emacs
  # the jobs call ./runTrf directly, so it needs to be executable
  chmod +x runTrf

  cat << '_EOF_' > template
#LOOP
./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
#ENDLOOP
'_EOF_'
  # << happy emacs

  gensub2 part.list single template jobList
  para create jobList
  para push
  # not all of these jobs will complete either:
# Completed: 53 of 96 jobs
# CPU time in finished jobs:     212403s    3540.05m    59.00h    2.46d  0.007 y
# IO & Wait Time:                  1851s      30.85m     0.51h    0.02d  0.000 y
# Average job time:                4043s      67.38m     1.12h    0.05d
# Longest finished job:           68726s    1145.43m    19.09h    0.80d
# Submission to last job:         68890s    1148.17m    19.14h    0.80d
  # use what results we have here:
  cat *.lift  | liftUp parts.bed stdin error bed/*.bed
  liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed \
    | sort -u | sort -k1,1 -k2,2n > hg38.result.bed

  para status | grep -v -w done | awk '{print $(NF-1)}' > will.not.finish.txt

  # split those last bits:
  mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits
  cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits
  mkdir splitBits
  cat ../splitBits/will.not.finish.txt | while read F
do
  cp -p $F splitBits
done

  #  100K chunks with 10K overlap
  mkdir -p 100K_10K

# split each unfinished fasta file into 100,000 base pieces with 10,000
# bases of overlap, writing a lift file per input file
for F in splitBits/*.fa
do
  # basename removes the directory and the literal .fa suffix
  B=`basename $F .fa`
  # the echo shows the command actually run below: 100000 size into 100K_10K/
  # (${B}_ with braces, $B_ would be the value of an unset variable named B_)
  echo "faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_"
  faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_
done

  cat << '_EOF_' > runTrf
#!/bin/bash
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
  # << happy emacs
  chmod +x runTrf

  cat << '_EOF_' > template
#LOOP
./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
#ENDLOOP
'_EOF_'
  # << happy emacs

  ls -1S `pwd`/100K_10K/*.fa > part.list
  gensub2 part.list single template jobList
  para create jobList
  para push
  # one last bit does not complete:
# Completed: 420 of 421 jobs
# CPU time in finished jobs:      19862s     331.04m     5.52h    0.23d  0.001 y
# IO & Wait Time:                  2360s      39.33m     0.66h    0.03d  0.000 y
# Average job time:                  53s       0.88m     0.01h    0.00d
# Longest finished job:             368s       6.13m     0.10h    0.00d
# Submission to last job:           448s       7.47m     0.12h    0.01d

  # can use the results obtained here:
  cat *.lift  | liftUp splitParts.bed stdin error bed/*.bed
  cat ../splitBits/*.lift | liftUp parts.bed  stdin error splitParts.bed
  liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \
    | sort -k1,1 -k2,2n > hg38.result.bed

  para status | grep -v -w done | awk '{print $(NF-1)}'
  # last chunk: 100K_10K/hg38_89_2_00.fa

  mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K
  cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K
  cp -p ../splitSplitBits/100K_10K/hg38_89_2_00.fa .

  # 20K chunks with 10K overlap:
  mkdir -p 20K_10K

# split the one remaining fasta file into 20,000 base pieces with 10,000
# bases of overlap, writing its lift file
for F in hg38_89_2_00.fa
do
  # basename removes the literal .fa suffix
  B=`basename $F .fa`
  # ${B}_ with braces, $B_ would be the value of an unset variable named B_
  echo "faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/${B}_"
  faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/${B}_
done

  ls -1S `pwd`/20K_10K/*.fa > part.list
  # job script: run trfBig on one fasta file ($1) in a /dev/shm work
  # directory and copy the bed result to $2 in the last100K directory
  cat << '_EOF_' > runTrf
#!/bin/bash
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
  # << happy emacs
  # +x to make the script executable (+s would set the setuid/setgid
  # bits instead and leave it not executable)
  chmod +x runTrf
  cat << '_EOF_' > template
#LOOP
./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
#ENDLOOP
'_EOF_'
  # << happy emacs

  gensub2 part.list single template jobList
  para create jobList
  para push
  # one of these jobs will not finish:
# Completed: 4 of 5 jobs
# CPU time in finished jobs:         10s       0.17m     0.00h    0.00d  0.000 y
# IO & Wait Time:                    16s       0.26m     0.00h    0.00d  0.000 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest finished job:               8s       0.13m     0.00h    0.00d
# Submission to last job:            16s       0.27m     0.00h    0.00d

  # can use the results we have here:
  cat *.lift  | liftUp 20Kparts.bed stdin error bed/*.bed
  cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kparts.bed
  cat ../splitBits/*.lift | liftUp parts.bed  stdin error 100Kpart.bed
  liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \
    | sort -k1,1 -k2,2n > hg38.result.bed

  # finally, what turns out to be the last batch:
  mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K
  cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K
  cp -p ../last100K/20K_10K/hg38_89_2_00_3.fa .

  # 2K chunks with 1K overlap
  mkdir -p 2K_1K

# Split the one unfinished 20K chunk into 2,000 base pieces with 1,000
# bases of overlap (-extra=1000), writing a lift file named after the chunk.
# The echo shows the exact faSplit command about to be run; note the braces
# in ${B}_ so the shell does not look for a variable called "B_".
for F in hg38_89_2_00_3.fa
do
  B=`basename $F .fa`
  echo "faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/${B}_"
  faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/${B}_
done

  ls -1S `pwd`/2K_1K/*.fa > part.list
  cat << '_EOF_' > runTrf
#!/bin/bash
set -beEu -o pipefail
export path1=$1
export inputFN=`basename $1`
export outpath=$2
export outputFN=`basename $2`
mkdir -p /dev/shm/$outputFN
cp -p $path1 /dev/shm/$outputFN
cd /dev/shm/$outputFN
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
      $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K
rm -f $outpath
cp -p /dev/shm/$outputFN/$outputFN $outpath
rm -fr /dev/shm/$outputFN/*
rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
'_EOF_'
  # << happy emacs
  chmod +x runTrf
  cat << '_EOF_' > template
#LOOP
./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
#ENDLOOP
'_EOF_'
  # << happy emacs

  # expand the template over part.list into jobList, then create the
  # parasol batch from that jobList and start it running
  gensub2 part.list single template jobList
  para create jobList
  para push
# Completed: 15 of 15 jobs
# CPU time in finished jobs:          1s       0.02m     0.00h    0.00d  0.000 y
# IO & Wait Time:                    26s       0.43m     0.01h    0.00d  0.000 y
# Average job time:                   2s       0.03m     0.00h    0.00d
# Longest finished job:               4s       0.07m     0.00h    0.00d
# Submission to last job:            14s       0.23m     0.00h    0.00d

  cat *.lift  | liftUp 2Kparts.bed stdin error bed/*.bed
  cat ../last100K/*.lift | liftUp 20Kpart.bed stdin error 2Kparts.bed
  cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kpart.bed
  cat ../splitBits/*.lift | liftUp parts.bed  stdin error 100Kpart.bed
  liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \
    | sort -k1,1 -k2,2n > hg38.result.bed

  ## To put it all together:
  cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
  cat /hive/data/genomes/hg38/TrfPart20M/???/*.bed lastJobs/bed/*.bed \
     splitBits/parts.bed splitSplitBits/parts.bed last100K/parts.bed \
     last30K/parts.bed > beforeLift.simpleRepeat.bed
  liftUp -type=.bed stdout ../splitGap/noGap.lift error \
     beforeLift.simpleRepeat.bed | sort -u \
       | sort -k1,1 -k2,2n > simpleRepeat.bed

  awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

  hgLoadBed hg38 simpleRepeat simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
  featureBits hg38 simpleRepeat > fb.simpleRepeat 2>&1
  cat fb.simpleRepeat
# 146785521 bases of 3049335806 (4.814%) in intersection

  cd /hive/data/genomes/hg38/bed
  ln -s simpleRepeat.2013-12-27/run20M simpleRepeat

############################################################################

 # WINDOWMASKER - DONE - 2013-12-24 - Hiram
    mkdir /hive/data/genomes/hg38/bed/windowMasker
    cd /hive/data/genomes/hg38/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev hg38 > do.log 2>&1 &

############################################################################
# Verify all gaps are marked - DONE - 2013-12-24 - Hiram
    mkdir /hive/data/genomes/hg38/bed/gap
    cd /hive/data/genomes/hg38/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../hg38.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m28.634s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits hg38 -not gap -bed=notGap.bed
    #	3049335806 bases of 3049335806 (100.000%) in intersection
    time featureBits hg38 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   20023 bases of 3049335806 (0.001%) in intersection
    # real    0m20.427s
    # this indicates that 20,023 bases are not marked as N's
    # with this element size profile:
    awk '{print $3-$2}' new.gaps.bed | ave stdin
# Q1 1.000000
# median 1.000000
# Q3 100.000000
# average 44.894619
# min 1.000000
# max 1000.000000
# count 446
# total 20023.000000
# standard deviation 81.743447

    # the four largest ones:
# 1000 chr2         32916625        32917625        chr2.7
# 1000 chr2         32867130        32868130        chr2.6
#  348 chr20        36314371        36314719        chr20.36
#  200 chr12       123443533       123443733        chr12.10

#########################################################################
## CYTOBAND - fixing the ideogram track (DONE - 2014-06-11 - Hiram)
    ## the file we used before was broken
    mkdir -p /hive/data/outside/ncbi/ideogram/2014-06
    cd /hive/data/outside/ncbi/ideogram/2014-06
    # fetch all the ideogram files:
    rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./
    mkdir /hive/data/genomes/hg38/bed/cytoBandUpdate
    cd /hive/data/genomes/hg38/bed/cytoBandUpdate

    # Create bed file
    $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \
/hive/data/outside/ncbi/ideogram/2014-06/ideogram_9606_GCF_000001305.14_850_V1

    # add in the other genome data:
    hgsql -N -e 'select * from cytoBand;' hg38 \
        | egrep "chrU|chrM|_alt|_random" >> cytoBand.bed

    $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
    #   everything checks out OK on 455 chroms

    # Load the bed file
    hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
	hg38 cytoBand cytoBand.bed
    cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head
    #  23
    sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql
    sort -k1,1 -k2,2n cytoBand.bed \
	| hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin

    # Make cytoBandIdeo track for ideogram gif on hgTracks page.
    # cytoBandIdeo is just a replicate of the cytoBand track.
    hgsql -e "drop table cytoBandIdeo;" hg38
    hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;"

#########################################################################
##  CYTOBAND - ideogram track (DONE - 2014-03-04 - Hiram)
    ssh hgwdev
    mkdir -p /hive/data/outside/ncbi/ideogram/2014-03
    cd /hive/data/outside/ncbi/ideogram/2014-03

    # fetch all the ideogram files:
    rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./

    mkdir /hive/data/genomes/hg38/bed/cytoBand
    cd /hive/data/genomes/hg38/bed/cytoBand

    # Create bed file
    $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \
/hive/data/outside/ncbi/ideogram/2014-03/ideogram_9606_GCF_000001305.14_850_V1

    # add in the other genome data:
    hgsql -N -e 'select * from cytoBand;' hg38 > bobTable.bed

    egrep "chrU|chrM|_alt|_random" bobTable.bed >> cytoBand.bed

    ## can now verify before load:
    $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
    #   everything checks out OK on 455 chroms

    # Load the bed file
    hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
	hg38 cytoBand cytoBand.bed
    cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head
    #  23
    sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql
    sort -k1,1 -k2,2n cytoBand.bed \
	| hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin

    # Make cytoBandIdeo track for ideogram gif on hgTracks page.
    # cytoBandIdeo is just a replicate of the cytoBand track.
    hgsql -e "drop table cytoBandIdeo;" hg38
    hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;"

##########################################################################
# cytoBandIdeo - (DONE - 2013-12-26 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/cytoBand
    cd /hive/data/genomes/hg38/bed/cytoBand
    makeCytoBandIdeo.csh hg38

#making temporary liftover of items from hg19
liftOver /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \
      /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
      cytobands.bed unMapped

liftOver -minBlocks=0.5 /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \
      /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
      cytobands.0.5.bed unMapped0.5

###############################                    ######################
# cytoBandIdeo - (reDONE - 2014-02-25 - kuhn)

# adding centromeres to generic cytoBandIdeo table as it exists.
# (lifted track is already gone)

# get the cen values for hg38
hgsql -Ne "SELECT DISTINCT chrom FROM centromeres" hg38 | sort > hg38.chroms
rm -f hg38.cens
foreach chrom (`cat hg38.chroms`)
  set cenStart=""
  set cenEnd=""
  set cenStart=`hgsql -Ne 'SELECT MIN(chromStart) FROM centromeres WHERE chrom = "'$chrom'"' hg38`
  set cenEnd=`hgsql -Ne 'SELECT MAX(chromEnd) FROM centromeres WHERE chrom = "'$chrom'"' hg38`
  echo "$chrom $cenStart $cenEnd" >> hg38.cens
end

# Modified makeCytoBandIdeo.csh to use this file instead of looking
#   for centromeres in a gap table.
# Replaced existing cytoBandIdeo table, which was really only a copy
#   of chromInfo.

##########################################################################
# hg19 <-> hg38 difference tracks (DONE - 2013-12-28 - Hiram)
    mkdir /hive/data/genomes/hg19/bed/liftOverHg38
    cd /hive/data/genomes/hg19/bed/liftOverHg38

    #	not needed, but interesting, collect all the fragment
    #	definitions from the gold tables:
    hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg19 \
        | sort > hg19.gold.frags.tab

    hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg38 \
        | sort > hg38.gold.frags.tab

    # construct common and difference listings
    comm -12 hg19.gold.frags.tab hg38.gold.frags.tab \
	> identical.hg19.hg38.frags.tab
    comm -23 hg19.gold.frags.tab hg38.gold.frags.tab \
	> unique.hg19Only.frags.tab
    comm -13 hg19.gold.frags.tab hg38.gold.frags.tab \
	> unique.hg38Only.frags.tab

    # better yet, get full information about each fragment
    hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg19 \
        | sort -k6 > hg19.gold.tab

    hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg38 \
        | sort -k6 > hg38.gold.tab

    # construct a single key for each fragment for joining.
    # the key is frag,fragStart,fragEnd,strand
    awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n",
	$6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg19.gold.tab | sort \
	> hg19.fragKey.tab
    awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n",
	$6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg38.gold.tab | sort \
	> hg38.fragKey.tab

    # now, by joining those keys, we can get exact identicals, and
    # the only-in listings as bed files to load as tracks:
    join hg19.fragKey.tab hg38.fragKey.tab \
	| awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $4,$5,$6,$2,$3,$5,$6}' \
        | sort -k1,1 -k2,2n > hg19.hg38.identical.bed

    join hg19.fragKey.tab hg38.fragKey.tab \
	| awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $11,$12,$13,$9,$10,$12,$13}' \
        | sort -k1,1 -k2,2n > hg38.hg19.identical.bed

    join -v 1 hg19.fragKey.tab hg38.fragKey.tab \
	| awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \
        | sort -k1,1 -k2,2n > hg19.only.bed

    join -v 2 hg19.fragKey.tab hg38.fragKey.tab \
	| awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \
        | sort -k1,1 -k2,2n > hg38.only.bed

    hgLoadBed hg19 hg38ContigDiff hg19.only.bed
    hgLoadBed hg38 hg19ContigDiff hg38.only.bed

    wc -l hg??.only.bed
    #  6097 hg19.only.bed
    #  23632 hg38.only.bed

    # this leaves the outstanding question of "why" they might be in
    #	the only-in listings.  Some contigs may be different versions,
    #   sometimes different sections of the same contig are used,
    #	and contigs are dropped from hg19 to hg38, or new contigs added
    #	to hg38 to fill in gaps from hg19
    # Let's see if we can measure some of this:
    awk '{print $4}' hg19.only.bed | sort -u > hg19.only.ids.list
    awk '{print $4}' hg38.only.bed | sort -u > hg38.only.ids.list

    # Looks like 5405 identical contigs with different parts used:
    comm -12 hg19.only.ids.list hg38.only.ids.list > differentPortions.list
    wc -l differentPortions.list
    # 5405

    # and perhaps 63 = 5468-5405 of different versions of same contig:
    sed -e "s/\.[0-9]*$//" hg19.only.ids.list | sort -u \
	> hg19.noVersions.ids.list
    sed -e "s/\.[0-9]*$//" hg38.only.ids.list | sort -u \
	> hg38.noVersions.ids.list
    comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | wc -l
    #	5468
    sed -e "s/\.[0-9]*$//" differentPortions.list | sort -u \
	> differentPortions.noVersions.list
    comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | sort -u \
	> noVersions.common.list
    # indeed, 63 contigs of different versions:
    comm -23 noVersions.common.list differentPortions.noVersions.list \
	| sort -u > differentVersions.list
    wc -l differentVersions.list
    #	63

    # dividing up these items:
    cat << '_EOF_' > identifyPortions.pl
#!/usr/bin/env perl

# identifyPortions.pl - divide the hg19-only and hg38-only contig listings
# into three bed files each, according to the contig accession in column 4:
#
#   <db>.differentPortions.bed - accession (with version) is listed in
#                                differentPortions.list
#   <db>.differentVersions.bed - accession with its trailing .version removed
#                                is listed in differentVersions.list
#   hg19.dropped.bed / hg38.newTo19.bed - everything else
#
# Inputs, all read from the current directory:
#   differentVersions.list differentPortions.list hg19.only.bed hg38.only.bed
#
# Every input line is written to exactly one output file, in input order,
# so the three outputs for a database always add up to its .only.bed file.

use strict;
use warnings;

# Read a file with one identifier per line and return a reference to a
# hash that has each identifier as a key (used as a set).
sub readIdSet {
    my ($file) = @_;
    my %idSet;
    open (my $fh, "<", $file) or die "can not read $file: $!";
    while (my $line = <$fh>) {
        chomp $line;
        $idSet{$line} = 1;
    }
    close ($fh);
    return \%idSet;
}

# Split <db>.only.bed into the three category files described above.
#   $db                - database name used as the file name prefix
#   $leftOver          - middle part of the file name that receives the
#                        lines matching neither list ("dropped", "newTo19")
#   $differentPortions - hash ref, set of full accessions
#   $differentVersions - hash ref, set of accessions without version
sub splitOnlyBed {
    my ($db, $leftOver, $differentPortions, $differentVersions) = @_;
    my $onlyBed = "$db.only.bed";
    my $portionsBed = "$db.differentPortions.bed";
    my $versionsBed = "$db.differentVersions.bed";
    my $leftOverBed = "$db.$leftOver.bed";

    # open the input first so a missing input does not leave empty outputs
    open (my $in, "<", $onlyBed) or die "can not read $onlyBed: $!";
    open (my $dp, ">", $portionsBed) or die "can not write to $portionsBed: $!";
    open (my $dv, ">", $versionsBed) or die "can not write to $versionsBed: $!";
    open (my $dr, ">", $leftOverBed) or die "can not write to $leftOverBed: $!";

    while (my $line = <$in>) {
        chomp $line;
        # bed columns: chrom, start, end, accession, score, strand
        my (undef, undef, undef, $acc) = split(/\s+/, $line);
        if (exists($differentPortions->{$acc})) {
            print $dp "$line\n";
            next;
        }
        # the versions list carries accessions without the .N suffix
        my $trimAcc = $acc;
        $trimAcc =~ s/\.[0-9]+$//;
        if (exists($differentVersions->{$trimAcc})) {
            print $dv "$line\n";
        } else {
            # matches neither list
            print $dr "$line\n";
        }
    }

    close ($in);
    # buffered write errors only show up at close, so check these
    close ($dr) or die "can not close $leftOverBed: $!";
    close ($dv) or die "can not close $versionsBed: $!";
    close ($dp) or die "can not close $portionsBed: $!";
}

my $differentVersions = readIdSet("differentVersions.list");
my $differentPortions = readIdSet("differentPortions.list");

splitOnlyBed("hg19", "dropped", $differentPortions, $differentVersions);
splitOnlyBed("hg38", "newTo19", $differentPortions, $differentVersions);
'_EOF_'
    # << happy emacs
    chmod +x identifyPortions.pl
    ./identifyPortions.pl
    # make sure nothing was lost
    sort hg19.differentVersions.bed hg19.differentPortions.bed \
	hg19.dropped.bed  | sum
    #	43711   233
    sort hg19.only.bed | sum
    #	43711   233
    sort hg38.differentVersions.bed hg38.differentPortions.bed \
	hg38.newTo19.bed | sum
    #	00502   911
    sort hg38.only.bed | sum
    #	00502   911

    sort -k1,1 -k2,2n hg38.differentVersions.bed hg38.differentPortions.bed \
	hg38.newTo19.bed > hg38.itemRgb.bed
    sort -k1,1 -k2,2n hg19.differentVersions.bed hg19.differentPortions.bed \
	hg19.dropped.bed > hg19.itemRgb.bed

    hgLoadBed hg19 hg38ContigDiff hg19.itemRgb.bed
    # if you wanted to load the identicals in this track too:
    sort -k1,1 -k2,2n hg38.hg19.identical.bed hg38.itemRgb.bed \
       | hgLoadBed hg38 hg38ContigDiff stdin
    # but we don't, we deliver only the differences
    hgLoadBed hg38 hg38ContigDiff hg38.itemRgb.bed

#########################################################################
# construct ooc file to be used in blat operations
#                      DONE - 2013-12-30 - Hiram
# can be done on unmasked sequence the same result as masked:
    cd /hive/data/genomes/hg38
    time blat hg38.unmasked.2bit /dev/null /dev/null \
       -tileSize=11 -makeOoc=jkStuff/hg38.11.ooc -repMatch=1024

    # been confirmed, the 100-base non-bridged gaps are really non-bridged
    gapToLift -minGap=100 -bedFile=jkStuff/nonBridgedGaps.bed hg38 \
	jkStuff/hg38.nonBridged.lft

##############################################################################
# cpgIslands - (DONE - 2014-01-07 - Hiram)
    # run on the Hmmer + trfMask sequence
    mkdir /hive/data/genomes/hg38/bed/cpgIslands
    cd /hive/data/genomes/hg38/bed/cpgIslands
    time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
      -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
        -workhorse=hgwdev -smallClusterHub=ku hg38 > do.log 2>&1
    # real    3m31.684s
    # wc -l cpgIsland.bed -> 30456 cpgIsland.bed
    cat fb.hg38.cpgIslandExt.txt
    #  23654068 bases of 3049335806 (0.776%) in intersection

    # Previously in hg19:
    featureBits -countGaps hg19 cpgIslandExt
    # 21842742 bases of 3137161264 (0.696%) in intersection

    # when run on Hmmer and Trf masked sequence:
    # wc -l cpgIsland.bed -> 30416 cpgIsland.bed
    #   23635946 bases of 3049335806 (0.775%) in intersection

    # when run on unmasked sequence:
    # wc -l cpgIsland.bed -> 55149 cpgIsland.bed
    # 33637531 bases of 3049335806 (1.103%) in intersection
##############################################################################
# rerun cpgIslands on contig sequence (DONE - 2014-01-07 - Hiram)
    # this is a test of the contig sequence file,
    # should get a very similar answer to the above
    mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigs
    cd /hive/data/genomes/hg38/bed/cpgIslandsContigs

    # run stepwise so the lift can be done on the result before loading
    time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
      -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -stop=makeBed -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
        -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1
    # real    9m31.502s
    # fails on the bedToBigBed creation since this isn't the actual
    # hg38 sequence.
    mv cpgIsland.bed cpgIsland.beforeLift.bed
    liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \
      cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed
    bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \
       cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb
    zcat ../cpgIslands/cpgIsland.bed.gz | sort -k1,1 -k2,2n > t.bed
    # Surprisingly, a few more are detected, perhaps due to the different
    # masking since this contig run is on the final corrected cross-match rmsk
    # plus TRF, the above was on the corrupted HMMER+TRF mask:
    wc -l cpgIsland.bed t.bed
#   30477 cpgIsland.bed
#   30456 t.bed
    # 2,835 different items between the two:
    sort t.bed cpgIsland.bed | uniq -c | awk '$1 < 2' | wc -l
    # 2835
    # 29,049 identical items
    sort t.bed cpgIsland.bed | uniq -c | awk '$1 == 2' | wc -l
    # 29049
    cut -f1-3 cpgIsland.bed | sort > contigs.bed
    cut -f1-3 t.bed | sort > fullSequence.bed
    # 29,339 identical locations:
    comm -12 contigs.bed fullSequence.bed | wc -l
    # 29339

    time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
      -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -continue=load -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
        -workhorse=hgwdev -smallClusterHub=ku hg38 > load.log 2>&1
    # real    0m12.056s

    cat fb.hg38.cpgIslandExt.txt
    # 23610399 bases of 3049335806 (0.774%) in intersection

##############################################################################
# rerun cpgIslands on contig UNMASKED sequence (DONE - 2014-01-07 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked
    cd /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked

    twoBitToFa -noMask ../../hg38.contigs.2bit stdout \
      | faToTwoBit stdin hg38.contigsUnmasked.2bit

    # verify sequence is OK:
    twoBitToFa hg38.contigsUnmasked.2bit stdout | faSize stdin
# 3061688741 bases (12372958 N's 3049315783 real 3049315783 upper 0 lower)
#    in 733 sequences in 1 files
# %0.00 masked total, %0.00 masked real
    twoBitToFa hg38.contigsUnmasked.2bit stdout | faCount stdin | tail -1
# total 3061688741 898285419 623727342 626335137 900967885  12372958 30979743
    # ACGT CpG same as original hg38.2bit except for the missing N's:
# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743

    # run stepwise so the lift can be done on the result before loading
    time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
      -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -stop=makeBed -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \
        -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1
    # real    11m0.690s
    # as above, failed on the bedToBigBed step since this isn't the full hg38
    # sequence
    mv cpgIsland.bed cpgIsland.beforeLift.bed
    liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \
      cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed
    bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \
       cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb
    # a lot more here than for masked sequence:
    wc -l cpgIsland.bed ../cpgIslandsContigs/cpgIsland.bed
    # 55149 cpgIsland.bed
    # 30477 ../cpgIslandsContigs/cpgIsland.bed
    featureBits -countGaps hg38 cpgIsland.bed
    # 33637531 bases of 3209286105 (1.048%) in intersection
    featureBits -countGaps hg38 ../cpgIslandsContigs/cpgIsland.bed
    # 23610399 bases of 3209286105 (0.736%) in intersection

    # debug load step so it can be loaded into a separate table:
    $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
      -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -debug -continue=load -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \
        -workhorse=hgwdev -smallClusterHub=ku hg38

    time ./doLoadCpg.csh > load.log 2>&1
    # real    0m2.179s
    # 33637531 bases of 3049335806 (1.103%) in intersection

#########################################################################
# construct liftOver to hg19 (DONE - 2013-12-31 - Hiram)
    # it turns out it doesn't matter if the query or target 2bit files
    # are masked.  This procedure can be done on completely unmasked sequences
    # for both, same result masked or not masked
    screen -S hg38	# manage this longish running job in a screen
    # work in the directory just created (names must match)
    mkdir /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31
    cd /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31
    # this was run in manual steps as experiments were done about the masking
    # check it with -debug first to see if it is going to work:
    doSameSpeciesLiftOver.pl -stop=net -buildDir=`pwd` -bigClusterHub=ku \
      -dbHost=hgwdev -workhorse=hgwdev -debug \
        -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc hg38 hg19
    # the debug step doesn't actually construct enough files to run the
    # steps manually.  The chaining has an extra procedure that is performed
    # while not in 'debug' mode
    # the run.blat was operated manually, then chaining:
    time doSameSpeciesLiftOver.pl -continue=chain -stop=net -buildDir=`pwd` \
      -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev \
           -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \
             hg38 hg19 > chain.log 2>&1
    # real    22m31.635s
    # loading is only a few seconds:
    doSameSpeciesLiftOver.pl -continue=load -buildDir=`pwd` \
     -bigClusterHub=ku \
       -dbHost=hgwdev -workhorse=hgwdev \
          -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \
             hg38 hg19 > load.log 2>&1

    # verify this file exists:
    #	/gbdb/hg38/liftOver/hg38ToHg19.over.chain.gz
    # and try out the conversion on genome-test from hg38 to hg19
    # same file should exist for downloads:
    #  /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz

############################################################################
# marking the PAR regions: (DONE - 2014-01-09 - Hiram)
    # after much experimentation with the AGP files and the given NCBI
    # files in hg38/genbank/Primary_Assembly/pseudoautosomal_region
    # the PAR region definitions can be seen in the par_align.gff file:
# CM000685.2  10001  2781479  ->  CM000686.2 10001 2781479
# CM000685.2  155701383  156030895 -> CM000686.2 56887903 57217415
    # equivalent to:
# chrX  10001  2781479  ->  chrY 10001 2781479
# chrX  155701383  156030895 -> chrY 56887903 57217415

    # subtract one for the chromStart position:
    cat << '_EOF_' > hg38Par.bed4
chrX 10000      2781479   PAR1
chrX 155701382  156030895 PAR2
chrY 10000      2781479   PAR1
chrY 56887902   57217415  PAR2
'_EOF_'
    # << happy emacs

    hgLoadBed hg38 par hg38Par.bed4
    checkTableCoords  hg38

    # hg19 had:
+-------+------------+-----------+------+
| chrom | chromStart | chromEnd  | name |
+-------+------------+-----------+------+
| chrX  |      60000 |   2699520 | PAR1 |
| chrX  |  154931043 | 155260560 | PAR2 |
| chrY  |      10000 |   2649520 | PAR1 |
| chrY  |   59034049 |  59363566 | PAR2 |
+-------+------------+-----------+------+

    # The AGP files come close to defining the location, but not
    # precisely.  The first region uses different bits of AC006209.25:
zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\
  | grep AC006209.25
CM000685.2      2665048 2677319 56      F       AC006209.25     127483  139754 -
CM000685.2      2677869 2804801 58      F       AC006209.25     1       126933 -
zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\
  | grep AC006209.25
CM000686.2      2665048 2677319 56      F       AC006209.25     127483  139754 -
CM000686.2      2677869 2781479 58      F       AC006209.25     23323   126933 -

    # and the second region uses different bits of AJ271735.1:
zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\
  | grep AJ271735.1 | head -1
CM000685.2 155676925 155719966 3096  O AJ271735.1     44687    87728   +
zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\
  | grep AJ271735.1 | head -1
CM000686.2  56887903  56906486  356  O AJ271735.1     69145    87728   +

    # combining all the contig definitions from each will find all the
    # exact identical contig bits:
# For each of chrY and chrX: drop comment lines and gap (N) rows from the
# component AGP, then key every remaining row by contig_start_end followed
# by the chromosome accession and chromosome start,end.  Joining the two
# sorted listings on that key shows the contig pieces common to both.
zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\
  | grep -v "^#" | awk '$5 != "N"' \
    | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \
    | sort > chrY.comp.agp.txt
zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\
  | grep -v "^#" | awk '$5 != "N"' \
    | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \
    | sort > chrX.comp.agp.txt
   # $'\t' is bash quoting for a literal tab as the join field separator
   join -t$'\t' chrY.comp.agp.txt chrX.comp.agp.txt | head

CM000685.2  10001   44821   CM000686.2      10001   44821
...
CM000685.2  2677320 2677868 CM000686.2      2677320 2677868

CM000685.2 155719967  155720351       CM000686.2      56906487        56906871
...
CM000685.2 155964490  156030895       CM000686.2      57151010        57217415

############################################################################
## altLocations track (DONE - 2014-01-02 - Hiram)
    # indicate corresponding locations between haplotypes and reference
    mkdir /hive/data/genomes/hg38/bed/altLocations
    cd /hive/data/genomes/hg38/bed/altLocations

    find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \
  | while read F
do
  grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s\t%d\t%d\tchr%s_%s_alt\n", $6,$12-1,$13,$6, $4}'
done | sort -k1,1 -k2,2n > chrToAlt.bed

    # note silent hidden <tab> character in the join -t argument
    # explicit as written here

# From every alt_scaffold_placement.txt: name each alternate sequence
# chrN_<accession>v<version>_alt and pair it with its chrN:start-end
# position taken from columns 12 and 13, then join with chrom.sizes
# to make a bed item spanning the whole alternate sequence.
find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \
  | while read F
do
  grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s_%s_alt\tchr%s:%d-%d\n", $6,$4,$6,$12,$13}'
done | sort > altToChr.tab
# $'\t' is bash quoting for a literal tab as the join field separator
sort ../../chrom.sizes | join -t$'\t' - altToChr.tab \
   | awk '{printf "%s\t0\t%d\t%s\n", $1,$2,$3}' > altToChr.bed


   hgLoadBed hg38 altLocations chrToAlt.bed altToChr.bed
   featureBits -countGaps hg38 altLocations
   # 170113652 bases of 3209286105 (5.301%) in intersection

############################################################################
## genscan (DONE - 2014-01-07 - Hiram)
   mkdir /hive/data/genomes/hg38/bed/genscan
   cd /hive/data/genomes/hg38/bed/genscan

   # using the contig sequence
   # running stepwise to allow the lifting of the final result
   time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \
     -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
       -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        > do.log 2>&1
   # three jobs did not finish due to almost all N's in the sequence,
   # just a couple of bases in each piece.  Their empty result is good enough.
   time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \
     -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
       -continue=makeBed -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev \
         -workhorse=hgwdev > makeBed.log 2>&1
   # real    0m48.161s

   cd lifted
   mkdir -p gtf subopt nameFixed/gtf nameFixed/pep newNames pep
   for F in ../gtf/000/*.gtf
do
   B=`basename $F`
   liftUp gtf/${B} ../../../jkStuff/hg38.contigs.lift carry $F
   echo $B
done
   for F in ../subopt/000/*.bed
do
   B=`basename $F`
   liftUp subopt/${B} ../../../jkStuff/hg38.contigs.lift carry $F
   echo $B
done

   ls gtf/chr*_[0-9][0-9].gtf \
     | sed -e 's/_[0-9][0-9]//; s#gtf/##; s/.gtf//;' | sort -u | while read C
do
   cat ../pep/000/${C}_[0-9][0-9].pep > pep/${C}.pep
   cat gtf/${C}_[0-9][0-9].gtf | ./gtfFixId.pl ${C} > nameFixed/gtf/${C}.gtf
   ./pepNameFix.pl ${C} > nameFixed/pep/${C}.pep
done

   cat nameFixed/gtf/*.gtf > ../hg38.genscan.gtf
   ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' | while read C
do
   cat gtf/${C}
done >> ../hg38.genscan.gtf

   cat nameFixed/pep/*.pep > ../hg38.genscan.pep
   ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' \
     | sed -e 's/.gtf/.pep/' | while read C
do
   cat ../pep/000/${C}
done >> ../hg38.genscan.pep

   cd /hive/data/genomes/hg38/bed/genscan
   cat lifted/subopt/*.bed | sort -k1,1 -k2,2n > hg38.genscanSubopt.bed

   gtfToGenePred hg38.genscan.gtf hg38.genscan.gp
   genePredCheck -db=hg38 hg38.genscan.gp
   # checked: 44149 failed: 0
   genePredToBed hg38.genscan.gp hg38.genscan.bed
   bedToBigBed hg38.genscan.bed ../../chrom.sizes hg38.genscan.bb
   bedToBigBed hg38.genscanSubopt.bed ../../chrom.sizes hg38.genscanSubopt.bb
   ldHgGene -gtf hg38 genscan hg38.genscan.gtf
# Read 44149 transcripts in 339212 lines in 1 files
#  44149 groups 345 seqs 1 sources 1 feature types

    cat fb.hg38.genscan.txt
    # 58278346 bases of 3049335806 (1.911%) in intersection
    cat fb.hg38.genscanSubopt.txt
    # 55020514 bases of 3049335806 (1.804%) in intersection

    # oddly, we are getting half of what hg19 had ?
    featureBits hg19 genscan
    # 106433874 bases of 2897316137 (3.674%) in intersection

    # This is because hg19 was run on soft-masked sequence and not
    # on hard masked sequence

############################################################################
## genscan on unmasked sequence experiment (DONE - 2013-12-03 - Hiram)
   ## instead, working on unmasked sequence:
   mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun
   cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun

   mkdir liftSpecs
   split -a 3 -d -l 1 ../../../jkStuff/hg38.nonBridged.lift liftSpecs/hg38_

   mkdir fasta
for F in liftSpecs/hg38_*
do
   L=`cut -f2 $F`
   echo $L
   /cluster/home/hiram/kent/src/hg/utils/lft2BitToFa.pl \
       ../../../hg38.unmasked.2bit $F > fasta/${L}.fa
done


   # parasol template: one gsBig job per fasta file listed in part.list; the
   # {check out exists ...} clauses name the three result files of each job
   cat << '_EOF_' > template
#LOOP
./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
  # << happy emacs
   # wrapper run by each cluster job; arguments in order are the sequence
   # file, then the gtf, pep (-trans) and subopt result file names
   cat << '_EOF_' > runGsBig.bash
#!/bin/bash

set -beEu -o pipefail

export seqFile=$1
export resultGtf=$2
export resultPep=$3
export resultSubopt=$4
/cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000
'_EOF_'
  # << happy emacs
   # the template invokes ./runGsBig.bash directly, so it must be executable
   chmod +x runGsBig.bash

  ls -1S `pwd`/fasta/*.fa > part.list
  gensub2 part.list single template jobList
  para create jobList
  para push
  # several jobs crashed:
# Completed: 726 of 733 jobs
# Crashed: 7 jobs
# CPU time in finished jobs:      62501s    1041.68m    17.36h    0.72d  0.002 y
# IO & Wait Time:                  2563s      42.72m     0.71h    0.03d  0.000 y
# Average job time:                  90s       1.49m     0.02h    0.00d
# Longest finished job:            3288s      54.80m     0.91h    0.04d
# Submission to last job:          3294s      54.90m     0.92h    0.04d

  para status | grep -v -w done | awk '{print $(NF-3)}' > crashed.job.list

  mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs
  cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs
  mkdir splitBits

  for F in chr2.06 chr1.03 chr3.05 chr12.07 chr10.05 chr17.08 chr11.04
do
   faSplit -lift=${F}.lift gap ../fasta/${F}.fa 2000000 splitBits/${F}_
done

  ls -1S `pwd`/splitBits/*.fa > part.list
  cat << '_EOF_' > runGsBig.bash
#!/bin/bash

set -beEu -o pipefail

export seqFile=$1
export resultGtf=$2
export resultPep=$3
export resultSubopt=$4
/cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000
'_EOF_'
  # << happy emacs
  chmod +x runGsBig.bash

  cat << '_EOF_' > template
#LOOP
./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
  # << happy emacs

  gensub2 part.list single template jobList
  para create jobList
  para push
# Completed: 331 of 334 jobs
# Crashed: 3 jobs
# CPU time in finished jobs:      18097s     301.62m     5.03h    0.21d  0.001 y
# IO & Wait Time:                  1085s      18.08m     0.30h    0.01d  0.000 y
# Average job time:                  58s       0.97m     0.02h    0.00d
# Longest finished job:              79s       1.32m     0.02h    0.00d
# Submission to last job:           249s       4.15m     0.07h    0.00d
  # the last three completed with -window=1600000

  # lifting results:
  # fixIds.pl: renumber the gene ids of a lifted GTF stream (stdin) and
  # write the result to chrN.M.gtf in the current directory
  cat << '_EOF_' > fixIds.pl
#!/usr/bin/env perl

# usage: cat chrN.M.lifted | ./fixIds.pl chrN.M
#
# Reads GTF lines on stdin and writes them to chrN.M.gtf.  A counter is
# advanced every time the gene_id differs from the one on the previous
# line, and every id in the line that matches the chrN pattern below is
# rewritten to chrN.M.<counter>.

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: cat chrN.M.lifted | ./fixIds.pl chrN.M\n";
  exit 255;
}

# $F is the full chrN.M name, $C is the same name with the .M part removed
my $F=shift;
my $C = $F;
$C =~ s/\.[0-9][0-9]//;

my $id = 0;
my $prevId = "";
# three argument open with a lexical handle; report the system error
open (my $gt, '>', "${F}.gtf") or die "can not write to ${F}.gtf: $!";
while (my $line=<>) {
   chomp $line;
   # isolate what follows 'gene_id "chrN' up to the closing quote
   my $geneId = $line;
   $geneId =~ s/^${C}.*gene_id "${C}//;
   $geneId =~ s/";.*//;
   # a change in gene id starts the next sequential id
   $id += 1 if ( $prevId ne $geneId);
   # NOTE(review): the '.' here is an unescaped wildcard, kept as it was
   # since the exact id format produced upstream is not visible here
   $line =~ s/${C}[0-9]+.[0-9]+/${F}.$id/g;
   printf $gt "%s\n", $line;
   $prevId = $geneId;
}
# buffered write errors only surface at close, so check it
close ($gt) or die "error closing ${F}.gtf: $!";
'_EOF_'
  # << happy emacs
  chmod +x fixIds.pl
  for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05
do
  echo "${F}" 1>&2
  cut -f2 ${F}.lift | while read P
  do
     liftUp -type=.gtf stdout ${F}.lift error gtf/${P}.gtf
  done > ${F}.lifted.gtf
  cat ${F}.lifted.gtf | ./fixIds.pl ${F}
done
  # copied these results to ../gtf/ to get into the final result
# -rw-rw-r-- 1 3349959 Jan  2 15:33 chr1.03.gtf
# -rw-rw-r-- 1 2439182 Jan  2 15:33 chr10.05.gtf
# -rw-rw-r-- 1 1068097 Jan  2 15:33 chr11.04.gtf
# -rw-rw-r-- 1 2392548 Jan  2 15:33 chr12.07.gtf
# -rw-rw-r-- 1 1831336 Jan  2 15:33 chr17.08.gtf
# -rw-rw-r-- 1 3539694 Jan  2 15:33 chr2.06.gtf
# -rw-rw-r-- 1 2309903 Jan  2 15:33 chr3.05.gtf

  for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05
do
  echo "${F}" 1>&2
  cut -f2 ${F}.lift | while read P
  do
     liftUp -type=.bed stdout ${F}.lift error subopt/${P}.bed
  done > ${F}.lifted.subopt.bed
done
  # copied these results to ../subopt/ to get into the final result
# -rw-rw-r-- 1 3349959 Jan  2 15:33 chr1.03.gtf
# -rw-rw-r-- 1 2439182 Jan  2 15:33 chr10.05.gtf
# -rw-rw-r-- 1 1068097 Jan  2 15:33 chr11.04.gtf
# -rw-rw-r-- 1 2392548 Jan  2 15:33 chr12.07.gtf
# -rw-rw-r-- 1 1831336 Jan  2 15:33 chr17.08.gtf
# -rw-rw-r-- 1 3539694 Jan  2 15:33 chr2.06.gtf
# -rw-rw-r-- 1 2309903 Jan  2 15:33 chr3.05.gtf


  # pepNameFix.pl: give the peptide fasta records sequential chrN.M.<n> names
  cat << '_EOF_' > pepNameFix.pl
#!/usr/bin/env perl

use strict;
use warnings;

# BIG ASSUMPTION ! ! ! - the peptides are in the same order as
# they are in the GTF file ! ! !
#
# Fasta on stdin, fasta on stdout: every header line is replaced with
# >chrN.M.<n> where <n> counts the headers seen so far starting at 1;
# all other lines pass through untouched.

if (scalar(@ARGV) != 1) {
  printf STDERR "usage: cat chrN.M.needNameFix.pep | ./pepNameFix.pl chrN.M > chrN.M.pep\n";
  exit 255;
}

my $prefix = shift @ARGV;
my $seen = 0;

while (defined(my $record = <>)) {
  # anything that is not a header is copied as is
  if ($record !~ m/^>/) {
    print $record;
    next;
  }
  $seen += 1;
  printf ">%s.%d\n", $prefix, $seen;
}
'_EOF_'
  # << happy emacs
  chmod +x pepNameFix.pl

for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05
do
  echo "${F}" 1>&2
  cut -f2 ${F}.lift | while read P
  do
     cat pep/${P}.pep
  done > ${F}.needNameFix.pep
  cat ${F}.needNameFix.pep | ./pepNameFix.pl ${F} > ${F}.pep
done
  # copied these results to ../pep/ to get into the final result:
# -rw-rw-r-- 1 1592655 Jan  2 15:55 chr1.03.pep
# -rw-rw-r-- 1 1169168 Jan  2 15:55 chr10.05.pep
# -rw-rw-r-- 1  519106 Jan  2 15:55 chr11.04.pep
# -rw-rw-r-- 1 1152111 Jan  2 15:55 chr12.07.pep
# -rw-rw-r-- 1  775052 Jan  2 15:55 chr17.08.pep
# -rw-rw-r-- 1 1799546 Jan  2 15:55 chr2.06.pep
# -rw-rw-r-- 1 1248762 Jan  2 15:55 chr3.05.pep

  # and then, adding in all the results together

  cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun
  # gtfIdFix.pl: renumber gene ids per chromosome in the lifted GTF and
  # record the old id to new id mapping for the peptide name fixup
  cat << '_EOF_' > gtfIdFix.pl
#!/usr/bin/env perl

# usage: cat lifted/gtf/chrN.gtf | ./gtfIdFix.pl chrN
#
# Reads GTF lines on stdin.  Writes:
#   nameFixed/gtf/chrN.gtf      - same lines with ids rewritten to chrN.<n>
#   nameFixed/newNames/chrN.tab - two columns: original gene id, new id
# <n> is advanced each time the gene_id differs from the previous line.
# Both output directories must already exist.

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: cat lifted/gtf/chrN.gtf | ./gtfIdFix.pl chrN\n";
  exit 255;
}

my $C=shift;

my $id = 0;
my $prevId = "";
# three argument open with lexical handles; report the system error
open (my $nm, '>', "nameFixed/newNames/${C}.tab") or die "can not write to nameFixed/newNames/${C}.tab: $!";
open (my $gt, '>', "nameFixed/gtf/${C}.gtf") or die "can not write to nameFixed/gtf/${C}.gtf: $!";
while (my $line=<>) {
   chomp $line;
   # isolate the quoted gene_id value
   my $geneId = $line;
   $geneId =~ s/^${C}.*gene_id "//;
   $geneId =~ s/";.*//;
   if ( $prevId ne $geneId) {
     # first line of a new gene: next sequential id, remember the mapping
     $id += 1;
     printf $nm "%s\t%s.%d\n", $geneId, $C, $id;
   }
   # NOTE(review): the '.' separators are unescaped wildcards, kept as
   # they were since the upstream id format is not visible here
   $line =~ s/${C}.[0-9]+.[0-9]+/${C}.$id/g;
   printf $gt "%s\n", $line;
   $prevId = $geneId;
}
# buffered write errors only surface at close, so check both
close ($gt) or die "error closing nameFixed/gtf/${C}.gtf: $!";
close ($nm) or die "error closing nameFixed/newNames/${C}.tab: $!";
'_EOF_'
  # << happy emacs
  chmod +x gtfIdFix.pl

  # start from clean result directories; gtfIdFix.pl writes into
  # nameFixed/gtf and nameFixed/newNames and dies if they do not exist
  rm -fr lifted
  rm -fr nameFixed
  mkdir -p lifted
  mkdir -p lifted/gtf
  mkdir -p lifted/pep
  mkdir -p lifted/subopt
  mkdir -p nameFixed
  mkdir -p nameFixed/gtf
  mkdir -p nameFixed/newNames

  for F in liftSpecs/hg38_*
do
   L=`cut -f2 $F`
   C=`cut -f4 $F`
   liftUp -type=.gtf stdout ${F} error gtf/${L}.gtf >> lifted/gtf/${C}.gtf
   cat pep/${L}.pep >> lifted/pep/${C}.pep
   liftUp -type=.bed stdout ${F} error subopt/${L}.bed >> lifted/subopt/${C}.bed
done

  for F in lifted/gtf/*.gtf
do
  C=`basename $F | sed -e 's/.gtf//'`
  cat $F | ./gtfIdFix.pl $C
done

mkdir -p nameFixed/pep

  # pepNameFix.pl: rename the peptide fasta records of one chromosome
  # using the old id to new id table written by gtfIdFix.pl
  cat << '_EOF_' > pepNameFix.pl
#!/usr/bin/env perl

# usage: ./pepNameFix.pl chrN > chrN.pep
#
# Reads nameFixed/newNames/chrN.tab (two tab separated columns: name to
# fix, fixed name) and lifted/pep/chrN.pep.  Prints the peptide fasta to
# stdout with every header replaced by its fixed name; dies on a header
# that has no entry in the table.

use strict;
use warnings;

my $argc = scalar(@ARGV);
if ($argc != 1) {
  printf STDERR "usage: ./pepNameFix.pl chrN > chrN.pep\n";
  exit 255;
}

my $C = shift;
my %newName;

# load the name translation table first
open (my $nm, '<', "nameFixed/newNames/$C.tab") or die "can not read nameFixed/newNames/$C.tab: $!";
while (my $line = <$nm>) {
  chomp $line;
  my ($needFix, $fixedName) = split('\t', $line);
  $newName{$needFix} = $fixedName;
}
close ($nm);

open (my $fh, '<', "lifted/pep/$C.pep") or die "can not read lifted/pep/$C.pep: $!";
while (my $line = <$fh>) {
  if ($line =~m /^>/) {
    # header line: the whole text after '>' is the lookup key
    chomp $line;
    $line =~ s/^>//;
    die "can not find name to fix $line" if (!exists($newName{$line}));
    printf ">%s\n", $newName{$line};
  } else {
    print $line;
  }
}
close ($fh);
'_EOF_'
  # << happy emacs
  chmod +x pepNameFix.pl

  for F in lifted/pep/*.pep
do
  C=`basename $F | sed -e 's/.pep//'`
  echo $C
  ./pepNameFix.pl $C > nameFixed/pep/$C.pep
done

#############################################################################
# Mark the new centromere regions (DONE - 2014-01-09 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/centromere
    cd /hive/data/genomes/hg38/bed/centromere
    # select the AGP lines that mention "GJ"
    # NOTE(review): presumably GJ is the accession prefix of the centromere
    # model contigs - confirm against the AGP
    grep GJ ../../hg38.agp > hg38.centContigs.agp

    # AGP to bed4: columns 1,2,3 are chrom, 1-based start, end; start is
    # shifted to 0-based and column 6 is used as the item name
    awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' hg38.centContigs.agp \
      > hg38.centContigs.bed4

    # load as table 'centromeres' and verify its coordinates
    hgLoadBed hg38 centromeres hg38.centContigs.bed4
    checkTableCoords hg38 centromeres

#############################################################################
## alternate sequence/haplotype alignments (DONE - 2014-01-23 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/lastzAltSequences
    cd /hive/data/genomes/hg38/bed/lastzAltSequences

rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa
mkdir targetFa
mkdir queryFa
touch temp.lift

# For each alternate sequence location (bed4: chrom start end altName):
# cut out the corresponding chromosome region padded by 10,000 bases on
# each side (clipped to the chromosome) as the lastz target, the alternate
# sequence itself as the query, and record a lift line that places the
# target region back on its chromosome.
cat ../altLocations/chrToAlt.bed | while read L
do
  chrName=`echo $L | awk '{print $1}'`
  # exact match on the name column, independent of tab vs. space separators
  chromSize=`awk -v chrName="$chrName" '$1 == chrName {print $2}' ../../chrom.sizes`
  chrStart=`echo $L | awk '{if (($2-10000)>=0) {printf "%d", $2-10000} else {printf "0"}}'`
  chrEnd=`echo $L | awk -v chromSize=$chromSize '{if (($3+10000)<=chromSize) {printf "%d", $3+10000} else {printf "%d", chromSize}}'`
  # size of the extracted region: end minus start (the two echoed fields)
  chrSize=`echo $chrEnd $chrStart | awk '{print $1-$2}'`
  queryName=`echo $L | awk '{print $4}'`
  partName="${chrName}_${chrStart}_${chrEnd}"
  echo $chrName $chrStart $chrEnd $queryName $partName $chromSize
  # lift line: offset, part name, part size, chrom name, chrom size
  echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift
  # target file is named after the query so the pair can be matched up later
  twoBitToFa ../../hg38.unmasked.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa
  twoBitToFa ../../hg38.unmasked.2bit:$queryName queryFa/$queryName.fa
done

sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift

    # these were run serially on hgwdev, they could be a cluster run:
    ssh ku
    mkdir /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz
    cd /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz
    mkdir ../lav ../psl

    # construct the jobList
    # one ./runJob.sh line per target/query pair; the files in targetFa
    # and queryFa share the same base name
    ls ../targetFa | sed -e 's/.fa//;' | while read partName
do
   echo "./runJob.sh ${partName}"
done > jobList

    # runJob.sh <partName>: lastz the target region against the alternate
    # sequence, then convert the lav to psl and lift the target coordinates
    # back to chromosome coordinates
    cat << '_EOF_' > runJob.sh
#!/bin/sh

export partName=$1
export target="../targetFa/$partName.fa"
export query="../queryFa/$partName.fa"
export lav="../lav/$partName.lav"
export psl="../psl/$partName.psl"

/cluster/bin/penn/lastz-distrib-1.03.46/bin/lastz \
  $target $query \
  Y=15000 T=2 M=254 O=600 H=2000 O=600 E=150 K=10000 L=10000 \
  Q=/scratch/data/blastz/human_chimp.v2.q > $lav
lavToPsl $lav stdout | liftUp $psl ../hg38.haplotypes.lift error stdin
'_EOF_'
    # << happy emacs
    # both the job script and the job list are executed directly
    chmod +x runJob.sh jobList

    # these were run serially on hgwdev, they could be a cluster run:
    time ./jobList > do.log
    # real    61m35.898s

    # chaining lastz results:
    mkdir -p /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run/chain
    cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run

    ls ../../psl/*.psl | while read P
do
  B=`basename $P | sed -e 's/.psl//'`
  echo $B $P
  ls -og $P ../../targetFa/${B}.fa ../../queryFa/${B}.fa
  /cluster/home/hiram/kent/src/hg/mouseStuff/axtChain/axtChain \
    -psl -scoreScheme=/scratch/data/blastz/human_chimp.v2.q \
    -minScore=1000 -linearGap=medium $P \
    ../../../../hg38.unmasked.2bit \
    ../../../../hg38.unmasked.2bit stdout \
  | chainAntiRepeat ../../../../hg38.unmasked.2bit \
    ../../../../hg38.unmasked.2bit stdin chain/${B}.chain
done

   # real    7m54.677s

   cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain
   find ./run/chain -name "*.chain" | chainMergeSort -inputList=stdin \
       | nice gzip -c > hg38.haplotypes.all.chain.gz
   chainPreNet  hg38.haplotypes.all.chain.gz ../../../chrom.sizes \
     /hive/data/genomes/hg38/chrom.sizes stdout \
       | chainNet  stdin -minSpace=1 ../../../chrom.sizes \
          ../../../chrom.sizes stdout /dev/null \
             | netSyntenic stdin noClass.net

    # Make liftOver chains from chroms to alternates:
    netChainSubset -verbose=0 noClass.net hg38.haplotypes.all.chain.gz stdout \
      | chainStitchId stdin stdout | gzip -c > hg38.haplotypes.over.chain.gz
    # swap the alignments to get the alternates to chrom mappings:
    chainSwap hg38.haplotypes.over.chain.gz stdout \
       | gzip -c > hg38.reference.over.chain.gz
    # and put them all together so mappings go both directions
    chainMergeSort hg38.haplotypes.over.chain.gz hg38.reference.over.chain.gz \
        | gzip -c > hg38.haploReference.over.chain.gz

    hgLoadChain -tIndex hg38 chainAltSequence hg38.haploReference.over.chain.gz
    netClass -verbose=0 -noAr noClass.net hg38 hg38 hg38.hg38AltSequence.net
    netFilter -minGap=10 hg38.hg38AltSequence.net \
      | hgLoadNet -verbose=0 hg38 netAltSequence stdin

    chainToPsl hg38.haploReference.over.chain.gz ../../../chrom.sizes \
      ../../../chrom.sizes \
        /hive/data/genomes/hg38/hg38.unmasked.2bit  \
          /hive/data/genomes/hg38/hg38.unmasked.2bit  \
             hg38.beforeRecalc.haploReference.over.psl

    pslCheck -targetSizes=../../../chrom.sizes \
        -querySizes=../../../chrom.sizes \
    hg38.beforeRecalc.haploReference.over.psl 2>&1 | tail -1
    # checked: 3092 failed: 57 errors: 57

    pslRecalcMatch hg38.beforeRecalc.haploReference.over.psl \
    ../../../hg38.unmasked.2bit ../../../hg38.unmasked.2bit  \
        hg38.haploReference.over.psl

    pslCheck -targetSizes=../../../chrom.sizes \
      -querySizes=../../../chrom.sizes \
         hg38.haploReference.over.psl 2>&1 | tail -1
    # checked: 3092 failed: 0 errors: 0

    hgLoadPsl hg38 -table=altSequenceLiftOver hg38.haploReference.over.psl

#############################################################################
## construct non-bridged contig sequence (DONE - 2014-01-10 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/nonBridgedContigs
    cd /hive/data/genomes/hg38/bed/nonBridgedContigs

    # only need the actual split chroms in this lift, and the
    # _nn name is a bit more convenient than the .nn:
    gapToLift -minGap=100 hg38 stdout | sed -e 's/\./_/;' \
        | awk '$1 != 0' > hg38.contigs.lift
    # the warnings gapToLift issues are about gaps defined in the table
    # that abut each other; telomere gaps are next to contig gaps.
    # those lifts in the format of a bed file:
    awk '{printf "%s\t%d\t%d\t%s\n", $4, $1, $1+$3, $2}' hg38.contigs.lift \
        > hg38.contigs.bed
    # the negation of that is the gaps between the contigs
    #  fixup the .N to _nn with the awk:
    featureBits -not -countGaps hg38 hg38.contigs.bed -bed=stdout \
| awk '{split($4,a,"."); printf "%s\t%d\t%d\t%s_%02d\n", $1,$2,$3,a[1],a[2]}' \
             > hg38.gaps.bed
    # 268613637 bases of 3209286105 (8.370%) in intersection

    # together, those two should be exactly 100% of the genome:
    featureBits -countGaps -or hg38 hg38.contigs.bed hg38.gaps.bed
    #  3209286105 bases of 3209286105 (100.000%) in intersection

    # the list of all those other bits not in the split chroms:
    egrep "_alt|chrUn|chrM|_random" hg38.gaps.bed | cut -f1 \
       | sort > other.bits.list

    # extract those chrom pieces and the other bits from the masked sequence:
    (twoBitToFa -bed=hg38.contigs.bed ../../hg38.2bit stdout; \
      twoBitToFa -seqList=other.bits.list ../../hg38.2bit stdout) \
        | faToTwoBit stdin hg38.contigs.2bit
    twoBitInfo hg38.contigs.2bit stdout | sort -k2nr > hg38.contigs.chrom.sizes
    # verify nothing has been lost:
    twoBitToFa ../../hg38.2bit stdout | faCount stdin | tail -1
# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743
    twoBitToFa hg38.contigs.2bit stdout | faCount stdin | tail -1
# total 3061688741 898285419 623727342 626335137 900967885  12372958 30979743
    # the ACGT and CPG counts remain the same, only N's have been lost

    # make a copy of this at the top:
    cp -p hg38.contigs.2bit ../..
    cp -p hg38.contigs.lift ../../jkStuff

    # load as a track to be able to see where they are:
    egrep "chrUn|chrM|_alt|_random" hg38.contigs.chrom.sizes \
	| awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $1}' \
	> fullCoverage.hg38Contigs.bed
    cat hg38.contigs.bed >>  fullCoverage.hg38Contigs.bed
    featureBits -or -countGaps hg38 fullCoverage.hg38Contigs.bed gap
    # 3209286105 bases of 3209286105 (100.000%) in intersection

    hgLoadBed hg38 contigAlignmentSegments fullCoverage.hg38Contigs.bed

#############################################################################
## analysis of repeat elements from each RM run
## (DONE - 2014-01-10 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/repeatElementCount
    cd /hive/data/genomes/hg38/bed/repeatElementCount
    for F in ../rmsk*/hg38.class.profile.txt \
          ../repeatMaskerGenbank/hg38.class.profile.txt
do
   D=`dirname $F`
   B=`basename $D | sed -e 's/repeatMaskerGenbank/NCBI/; s/rmsk//;'`
   echo "==== $B ===="
   grep rmskClass $F | sed -e 's#rmskClass/##; s/.tab//;' \
     | awk '{printf "%s\t%d\n", $2, $1}' | sort > ${B}.tab
done

   # Hmmer does not have snRNA and tRNA ?
   echo -e "snRNA\t0" >> Hmmer.tab
   echo -e "tRNA\t0" >> Hmmer.tab
   sort Hmmer.tab > t.tab
   mv t.tab Hmmer.tab

   echo "#  Repeat Masker item counts" > table.result.txt
   echo "#  class         NCBI cross-match rmblastn HMMER" >> table.result.txt
   join NCBI.tab CM.tab  | join - Blastn.tab  | join - Hmmer.tab \
     | awk '{printf "%-15s\t%7d\t%7d\t%7d\t%7d\n", $1,$2,$3,$4,$5}' \
       | sort -k2,2nr >> table.result.txt

   cat table.result.txt
#  Repeat Masker item counts
#  class         NCBI cross-match rmblastn HMMER
SINE            1849444 1852545 1822406 1884179
LINE            1586141 1570523 1551012 1702529
LTR              759248  748597  737799  805427
DNA              502186  499108  485558  565171
Simple_repeat    433789  703682  716968  636906
Low_complexity   396378  102856  105181   95480
Satellite         10198    7962    7703   10852
LTR?               5884    5667    5068    9181
snRNA              4595    4516    4548       0
Retroposon         4163    5750    5630   11861
Unknown            2802    5622    5263    3914
DNA?               2157    3294    3018    4582
tRNA               2154    2026    1983       0
rRNA               1915    1840    1810     464
RC                 1860    1784    1706    2059
srpRNA             1784    1672    1633    1517
scRNA              1397    1420    1426    6783
RNA                 822     704     611    1484
SINE?               488      38      38     970
RC?                 445     411     374     806

total           5567850 5520017 5459735 5744165

#############################################################################
## blat server turned on (DONE - 2014-01-13 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg38", "blat4c", "17780", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("hg38", "blat4c", "17781", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## reset default position to ABO gene (DONE - 2014-01-13 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chr9:133252000-133280861"
	where name="hg38";' hgcentraltest

#########################################################################
## update grp table with new set of standard rows (DONE - 2014-01-29 - Hiram)
    # keep the existing table as grpOriginal until the new one is in place
    hgsql -e 'alter table grp rename grpOriginal;' hg38
    # grp was just renamed away, so a plain drop would fail here;
    # 'if exists' keeps this step safe to run either way
    hgsql -e 'drop table if exists grp;' hg38
    # new grp is a copy of the hg19 rows, minus groups not wanted on hg38
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg19.grp" hg38
    hgsql -e 'delete from grp where name="denisova";' hg38
    hgsql -e 'delete from grp where name="pub";' hg38
    hgsql -e 'delete from grp where name="neandertal";' hg38
    hgsql -e 'update grp set defaultIsClosed=0 where name="map";' hg38

    hgsql -e 'drop table grpOriginal;' hg38

############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2014-01-21 - Hiram)
    ssh ku
    mkdir /hive/data/genomes/hg38/bed/linSpecRep
    cd /hive/data/genomes/hg38/bed/linSpecRep
    #	create individual .out files from the master record in ../repeatMasker
    mkdir splitOut
    cat << '_EOF_' > split.csh
#!/bin/csh -fe
set C = $1
head -3 ../repeatMasker/hg38.sorted.fa.out > splitOut/${C}.out
grep "${C} " ../repeatMasker/hg38.sorted.fa.out >> splitOut/${C}.out
'_EOF_'
    # << happy emacs
    chmod +x split.csh

    # parasol template: one split.csh job per name in chrom.list, each must
    # leave a non-empty splitOut/<name>.out; the script is invoked with an
    # explicit ./ so it does not depend on . being in the PATH
    cat << '_EOF_' > template
#LOOP
./split.csh $(root1) {check out line+ splitOut/$(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # small ones first:
    cut -f1 ../../chrom.sizes | tac > chrom.list
    gensub2 chrom.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
# Completed: 93 of 93 jobs
# CPU time in finished jobs:        127s       2.12m     0.04h    0.00d  0.000 y
# IO & Wait Time:                 17154s     285.90m     4.76h    0.20d  0.001 y
# Average job time:                 186s       3.10m     0.05h    0.00d
# Longest finished job:             224s       3.73m     0.06h    0.00d
# Submission to last job:           280s       4.67m     0.08h    0.00d

    #	now, we can date and process each of those .out files
    #	constructing the humanSpecific set of repeats
    #   this means repeats found in human, and not in others
    #   using mouse here for 'others' is good enough, a variety
    #   of other species could be used (rat dog cow) where they all
    #   produce the same result
    mkdir dateRepeats
    cd dateRepeats
    cat << '_EOF_' > mkLSR
#!/bin/bash
set -beEu -o pipefail
rm -f $1.out_mus-musculus
ln -s ../splitOut/$1.out .
/scratch/data/RepeatMasker/DateRepeats $1.out -query human -comp mouse
rm $1.out
mkdir -p ../humanSpecific
/cluster/bin/scripts/extractRepeats 1 $1.out_mus-musculus \
	> ../humanSpecific/$1.out.spec
'_EOF_'
    #	<< happy emacs
    chmod +x mkLSR

    # parasol template: one mkLSR job per name in ../chrom.list, each must
    # leave a non-empty ../humanSpecific/<name>.out.spec
    cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 ../chrom.list single template jobList
    # the batch has to be created from jobList before it can be tried/pushed
    para create jobList
    para try ... check ... push ... etc...
    para time
# Completed: 455 of 455 jobs
# CPU time in finished jobs:      13985s     233.08m     3.88h    0.16d  0.000 y
# IO & Wait Time:                  1470s      24.50m     0.41h    0.02d  0.000 y
# Average job time:                  34s       0.57m     0.01h    0.00d
# Longest finished job:             111s       1.85m     0.03h    0.00d
# Submission to last job:          1427s      23.78m     0.40h    0.02d


    # We also need the nibs for blastz runs with lineage specific repeats
    mkdir /hive/data/genomes/hg38/bed/nibs
    cd /hive/data/genomes/hg38/bed/nibs
    cut -f1 ../../chrom.sizes | while read C
do
    twoBitToFa -seq=${C} ../../hg38.2bit stdout \
	| faToNib -softMask stdin ${C}.nib
    echo "${C} done"
done

    # verify nothing lost
    cat ../../chrom.sizes \
     | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \
        | sh | faSize stdin
# 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper
#  1588630985 lower) in 455 sequences in 1 files
# Total size: mean 7053376.1 sd 31548372.6
#  min 970 (chrUn_KI270394v1.nib:0-970)
#  max 248956422 (chr1.nib:0-248956422) median 161218
# %49.50 masked total, %52.10 masked real

    mkdir /hive/data/staging/data/hg38/nib
    rsync -a --progress ./ /hive/data/staging/data/hg38/nib

#############################################################################
## GRC Contigs/ctgPos2 track (DONE - 2014-12-25 - Hiram)
    # provide mapping of UCSC chrom names to GRC names
    mkdir /hive/data/genomes/hg38/bed/ctgPos2
    cd /hive/data/genomes/hg38/bed/ctgPos2
    grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \
	| awk '{printf "s/^%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt

    find ../../genbank -type f | grep "/assembled_chromosomes/AGP/" | sed -e 's/.comp//' | while read F
do
   if [ -s $F ]; then
      zcat $F | grep -v "^#"
   fi
done | sed -e "`cat accessionToUcsc.sed.txt`" > ucsc.grch38.agp

    awk '$5 != "N"' ucsc.grch38.agp \
| awk '{printf "%s\t%d\t%s\t%d\t%d\t%s\n", $6, $3-$2+1, $1, $2-1, $3, $5}' \
	| sort -u | sort -k3,3 -k4,4n > ctgPos2.tab


    export ctgSize=`awk '{print length($1)}' ctgPos2.tab | sort -n | tail -1`
    export chrSize=`awk '{print length($3)}' ctgPos2.tab | sort -n | tail -1`

    sed -e "s/20/$ctgSize/; s/16/$chrSize/;" \
	/cluster/home/hiram/kent/src/hg/lib/ctgPos2.sql > hg38.ctgPos2.sql

    hgLoadSqlTab hg38 ctgPos2 hg38.ctgPos2.sql ctgPos2.tab

############################################################################
# constructing download files (WORKING - 2014-01-15 - Hiram)
    # add hg38 to all.joiner and verify it is clean:
    joinerCheck -database=hg38 -keys all.joiner
# Checking keys on database hg38
#  hg38.ucscToINSDC.chrom - hits 455 of 455 (100.000%) ok
    # and all table coordinates are OK:
    checkTableCoords hg38

    cd /hive/data/genomes/hg38
    time $HOME/kent/src/hg/utils/automation/makeDownloads.pl \
      -workhorse=hgwdev hg38
    # makeDownloads.pl has made a preliminary set of files

    # need to fixup these names and add chromFa.tar.gz files
    cd /hive/data/genomes/hg38/goldenPath/bigZips

    mkdir chroms
    mkdir maskedChroms

    faSplit byname hg38.fa.gz chroms/
    faSplit byname hg38.fa.masked.gz maskedChroms/

    tar cvzf ./hg38.chromFa.tar.gz ./chroms/
    tar cvzf ./hg38.chromFaMasked.tar.gz ./maskedChroms/

    cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips
    ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFa.tar.gz hg38.chromFa.tar.gz
    ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFaMasked.tar.gz hg38.chromFaMasked.tar.gz

    #also added entries for above to md5sum.txt and README.txt

############################################################################
# LASTZ MOUSE Mm10 (DONE - 2014-01-23,31 - Hiram)
    # can no longer use the lineage specific repeats with the new lastz
    # use a screen to manage this longish job:
    screen -S hg38Mm10

    mkdir /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23
    cd /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23

    # best to always specify an exact path to lastz so we know which one is used
    # lastz default parameters are human-mouse parameters

    cat << '_EOF_' > DEF
# human vs mouse
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=40000000
SEQ1_LAP=10000

# QUERY: Mouse Mm10
SEQ2_DIR=/scratch/data/mm10/mm10.2bit
SEQ2_LEN=/scratch/data/mm10/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzMm10.2014-01-23
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs

    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
        -stop=net `pwd`/DEF \
        -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1
    #	real    1494m26.135s ---- busy cluster
    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
        -continue=load `pwd`/DEF \
        -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1
    #	Elapsed time: 43m11s
    cat fb.hg38.chainMm10Link.txt
    # 964465044 bases of 3049335806 (31.629%) in intersection

    #	and the swap
    mkdir /hive/data/genomes/mm10/bed/blastz.hg38.swap
    cd /hive/data/genomes/mm10/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg38/bed/lastzMm10.2014-01-23/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real    83m28.397s

    cat fb.mm10.chainHg38Link.txt
    #	937030766 bases of 2652783500 (35.323%) in intersection

#########################################################################
# LASTZ Dog CanFam3 (DONE - 2014-01-26 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26
    cd /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26

    cat << '_EOF_' > DEF
# human vs dog
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Dog CanFam3
SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs

    # establish a named screen session to control this job; -S sets the
    # session name (a bare argument would be taken as a command to run)
    screen -S hg38CanFam3
    # full lastz/chain/net pipeline driven by the DEF file written above
    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # Elapsed time: 1396m22s - busy cluster
    cat fb.hg38.chainCanFam3Link.txt
    #  1523987456 bases of 3049335806 (49.978%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/canFam3/bed/blastz.hg38.swap
    cd /hive/data/genomes/canFam3/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26/DEF \
	-syntenicNet -swap \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #	real    107m57.787s

    cat fb.canFam3.chainHg38Link.txt
    #	1437624815 bases of 2392715236 (60.083%) in intersection

#########################################################################
# LASTZ Macaca Mulatta RheMac3 (DONE - 2014-01-27,02-10 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27
    cd /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27

    # best to always specify an exact path to lastz so we know which one is used
    # lastz default parameters are human-mouse parameters

    cat << '_EOF_' > DEF
# human vs macaca mulatta
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Macaca Mulatta RheMac3
SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs
    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
        `pwd`/DEF \
        -syntenicNet -fileServer=hgwdev \
	-chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
    #   Elapsed time: 1426m43s - busy cluster
    cat fb.hg38.chainRheMac3Link.txt
    #   2431208700 bases of 3049335806 (79.729%) in intersection

    #   running the swap
    mkdir /hive/data/genomes/rheMac3/bed/blastz.hg38.swap
    cd /hive/data/genomes/rheMac3/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27/DEF \
        -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1
    #    82m32.329s
    cat fb.rheMac3.chainHg38Link.txt
    #   2288533769 bases of 2639145830 (86.715%) in intersection

#########################################################################
## construct analysis set (DONE - 2014-01-27 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/analysisSet
    cd /hive/data/genomes/hg38/bed/analysisSet
    mkdir -p splitFa

    faToTwoBit \
../../genbank/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz \
	hg38.unmasked.analysisSet.2bit

    faCount splitFa/c*.fa > splitFa.faCount.txt

    egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../rmskCM/hg38.sorted.fa.out \
	> hg38.analysisSet.out

    twoBitMask hg38.unmasked.analysisSet.2bit hg38.analysisSet.out \
	hg38.rmsk.analysisSet.2bit

    egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../simpleRepeat/trfMask.bed \
	> trfMask.analysisSet.bed

    twoBitMask hg38.rmsk.analysisSet.2bit -add trfMask.analysisSet.bed \
	hg38.analysisSet.2bit

    twoBitToFa hg38.unmasked.analysisSet.2bit stdout | faSize stdin
# 3099922541 bases (165046090 N's 2934876451 real 2934876451 upper 0 lower)
#	in 195 sequences in 1 files
# Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1)
#	max 248956422 (chr1) median 32032
# %0.00 masked total, %0.00 masked real

    twoBitToFa hg38.analysisSet.2bit stdout | faSize stdin
# 3099922541 bases (165046090 N's 2934876451 real 1409378896 upper 1525497555
#	lower) in 195 sequences in 1 files
# Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1)
#	max 248956422 (chr1) median 32032
# %49.21 masked total, %51.98 masked real

    mkdir hg38.analysisSet.chroms
    twoBitToFa hg38.analysisSet.2bit stdout \
	| faSplit byname stdin hg38.analysisSet.chroms/

    tar cvzf ./hg38.analysisSet.chroms.tar.gz ./hg38.analysisSet.chroms

    ln -s `pwd`/hg38.analysisSet.2bit \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips
    ln -s `pwd`/hg38.analysisSet.chroms.tar.gz \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips
    # add these md5 sums to md5sum.txt
    md5sum hg38.analysisSet.2bit hg38.analysisSet.chroms.tar.gz >> \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/md5sum.txt

    cp ../../genbank/README_ANALYSIS_SETS README.analysisSet.txt
    # add note at the top of README:
    ######################################################################
    UCSC copy of the file from:

    ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/README_ANALYSIS_SETS

    ln -s `pwd`/README.analysisSet.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips

#########################################################################
# the FULL analysis set (DONE - 2014-03-18 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/fullAnalysisSet
    cd /hive/data/genomes/hg38/bed/fullAnalysisSet

    mkdir hg38.fullAnalysisSet.chroms
    twoBitToFa ../analysisSet/hg38.analysisSet.2bit stdout \
       | faSplit byname stdin hg38.fullAnalysisSet.chroms/

    grep _alt ../../chrom.sizes | cut -f 1 > alt.list

    twoBitToFa -seqList=alt.list ../../hg38.2bit stdout \
       | faSplit byname stdin hg38.fullAnalysisSet.chroms/

    faCount hg38.fullAnalysisSet.chroms/chr*.fa > faCount.fullAnalysisSet.txt

    faToTwoBit hg38.fullAnalysisSet.chroms/chr*.fa hg38.fullAnalysisSet.2bit
    twoBitInfo hg38.fullAnalysisSet.2bit stdout | sort -k2nr > chrom.sizes

    tar cvzf ./hg38.fullAnalysisSet.chroms.tar.gz ./hg38.fullAnalysisSet.chroms

#########################################################################
# LASTZ Self/hg38 (DONE - 2014-01-25,02-10 - Hiram)
    # can no longer use the lineage specific repeats with the new lastz
    # use a screen to manage this longish job:
    screen -S hg38Self

    mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25
    cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25
    # construct the non-bridged contigs sequence to use:
    (twoBitToFa ../nonBridgedContigs/hg38.chroms.contigs.2bit stdout;
      twoBitToFa ../../hg38.2bit:chrM stdout) | faToTwoBit stdin hg38.self.2bit
    twoBitInfo hg38.self.2bit stdout | sort -k2nr > hg38.self.chrom.sizes

    # best to always specify an exact path to lastz so we know which one is used
    # lastz default parameters are human-mouse parameters

    cat << '_EOF_' > DEF
# human vs human with mouse defaults
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Human Hg38
SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25
TMPDIR=/dev/shm
'_EOF_'
_EOF_

    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
        -stop=net `pwd`/DEF \
        -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1
    #  real    1518m15.817s -- problems
    # there was a problem in the 'part014' batch.  running that manually:
    mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob
    cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob
    # make 100 jobs out of the 10 parts:
    mkdir -p psl
    # the copy is given an "x" prefix so that the part0* globs below do not
    # pick it up along with the pieces
    cp ../tParts/part014.lst ./xpart014.lst
    # one line of the list per output file, named part000, part001, ...
    # (-d numeric suffixes, -a 3 three digits wide)
    split -l 1 xpart014.lst -d -a 3 part
    # give every piece the .lst suffix
    for F in part0*
do
   mv $F $F.lst
done

# emit one blastz-run-ucsc job line for every target/query pair of the
# single-line lists; the escaped braces carry the
# "check out exists <file>" clause into jobList literally
for T in part0*.lst
do
  for Q in part0*.lst
  do
    mkdir -p psl/${T}
    echo /cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T} ${Q} ../../DEF \{check out exists psl/${T}/${T}.${Q}.psl\}
  done
done > jobList
    para -ram=32g create jobList
    para push
    # one last failing job:
# Completed: 99 of 100 jobs
# CPU time in finished jobs:       2836s      47.27m     0.79h    0.03d  0.000 y
# IO & Wait Time:                   279s       4.65m     0.08h    0.00d  0.000 y
# Average job time:                  31s       0.52m     0.01h    0.00d
# Longest finished job:             586s       9.77m     0.16h    0.01d
# Submission to last job:           620s      10.33m     0.17h    0.01d

    mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010
    cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010
    mkdir psl

    twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 part010.fa

    faSplit -lift=split010.lift size part010.fa 169000 split010_
TOP="/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010"

for T in split*.fa
do
  mkdir -p psl/${T}
  echo "${TOP}/${T}" > ${T}.lst
  faToTwoBit  ${T} ${T}.2bit
  for Q in split*.fa
  do
     echo "/cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T}.lst ${Q}.lst DEF {check out exists psl/${T}/${T}.${Q}.psl}"
  done
done > jobList
     para -ram=32g create jobList

# Completed: 100 of 100 jobs
# CPU time in finished jobs:     176579s    2942.99m    49.05h    2.04d  0.006 y
# IO & Wait Time:                  1239s      20.64m     0.34h    0.01d  0.000 y
# Average job time:                1778s      29.64m     0.49h    0.02d
# Longest finished job:           29343s     489.05m     8.15h    0.34d
# Submission to last job:         29348s     489.13m     8.15h    0.34d

    catDir psl/* | grep -v "^#" > raw.psl

    liftUp -type=.psl stdout split010.lift error raw.psl \
        | liftUp -pslQ -type=.psl chr16_03.psl split010.lift error stdin

    # this combination allowed psl headers to sneak in the middle,
    # had to be cleaned:
    catDir psl/* | grep -v "^#" > part014.psl
    cat split010/chr16_03.psl >> part014.psl
    cp -p part014.psl ../../psl/part014.lst/part014.lst_part014.lst.psl

    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
        -continue=cat -stop=net `pwd`/DEF \
        -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1
    # real    43m11.340s
    # failed in chaining, running manually on hgwdev
    time ./bigJobs.sh > bigJobs.log 2>&1
    #  real    468m59.648s

    time ./part014.sh > part014.log 2>&1

    # real    1319m57.911s
    # -rw-rw-r-- 1 3581498246 Feb  8 14:37 part014.lst.chain
    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
        -continue=chainMerge -stop=net `pwd`/DEF \
        -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1

    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 \
        -continue=load -stop=load `pwd`/DEF \
        -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1

    hgLoadChain -normScore -tIndex hg38 chainSelf hg38.hg38.all.chain.gz
    #  Loading 104815249 chains into hg38.chainSelf

    cat fb.hg38.chainSelfLink.txt
    #   392419010 bases of 3049335806 (12.869%) in intersection
    cd /hive/data/genomes/hg38/bed
    ln -s lastzSelf.2014-01-25 lastz.self
    ln -s lastzSelf.2014-01-25 lastz.hg38

#########################################################################
## 4-Way Multiz for UCSC Genes construction (DONE - 2014-02-11 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg38/bed/multiz4way
    cd /hive/data/genomes/hg38/bed/multiz4way

    #	extract our 4 organisms from the 44-way on hg18:
    ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh

    /cluster/bin/phast/tree_doctor \
	--prune-all-but hg19,mm10,canFam3,rheMac3 $HOME/kent/src/hg/utils/phyloTrees/120way.nh \
	| sed -e "s/hg19/hg38/" > 4way.nh

    #	this looks like:
    cat 4way.nh
(((hg38:0.033974,rheMac3:0.037601):0.109934,mm10:0.356483):0.020593,canFam3:0.165928);


    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a gif image for htdocs/images/phylo/hg38_4way.gif

    /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
    #	Use this output to create the table below: keep the lines that
    #	mention hg38, ordered numerically on column 3.
    #	(grep -i is the current spelling of the obsolete -y option)
    grep -i hg38 4way.distances.txt | sort -k3,3n
#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#
#                         featureBits chainLink measures
#                                        chainHg38Link   chain    linearGap
#    distance                      on hg38    on other   minScore
#  1  0.071575 - rhesus rheMac3 (% 79.729) (% 86.715)       5000     medium
#  2  0.330429 - dog canFam3    (% 49.978) (% 60.083)       3000     medium
#  3  0.500391 - mouse mm10     (% 31.629) (% 35.323)       3000     medium

    #	using the syntenic nets
    cd /cluster/data/hg38/bed/multiz4way
    mkdir mafLinks
    cd mafLinks
    mkdir rheMac3 canFam3 mm10

    for D in mm10 canFam3 rheMac3
do
    ln -s ../../../lastz.${D}/axtChain/hg38.${D}.synNet.maf.gz ./${D}/
done

    mkdir /hive/data/genomes/hg38/bed/multiz4way/mafSplit
    cd /hive/data/genomes/hg38/bed/multiz4way/mafSplit
    for D in mm10 canFam3 rheMac3
do
    echo "working: ${D}"
    zcat ../mafLinks/${D}/hg38.${D}.synNet.maf.gz > ${D}.maf
    mkdir -p ${D}
    mafSplit -byTarget -useFullSequenceName /dev/null ${D}/${D}_  ${D}.maf
    rm -f ${D}.maf
done

    #	determine what is the newest version of multiz and use that
    cd /hive/data/genomes/hg38/bed/multiz4way
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    # the autoMultiz cluster run
    ssh ku
    cd /hive/data/genomes/hg38/bed/multiz4way

    # create species list and stripped down tree for autoMZ
    # first pass over 4way.nh: drop any lowercase "xxx_" prefix, drop the
    # ":<digits and dots>" branch lengths, drop the ";" and delete blank lines
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	4way.nh > tmp.nh
    # the unquoted backtick expansion folds tmp.nh onto one line; then all
    # spaces are removed and each comma becomes a single space
    echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    # the same text without parentheses is the species list (tree.nh has no
    # commas left, so the second expression changes nothing here)
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir run maf
    cd run

    #	NOTE: you need to set the db and multiz dirname properly in this script
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = hg38
set c = $1
set maf = $2
set binDir = /hive/data/genomes/hg38/bed/multiz4way/penn
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/hg38/bed/multiz4way/mafSplit
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/${s}_$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg38/bed/multiz4way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 /cluster/data/hg38/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 455 jobs
    para try ... check ... push ... etc ...
# Completed: 455 of 455 jobs
# CPU time in finished jobs:      50111s     835.18m    13.92h    0.58d  0.002 y
# IO & Wait Time:                  5574s      92.91m     1.55h    0.06d  0.000 y
# Average job time:                 122s       2.04m     0.03h    0.00d
# Longest finished job:            4717s      78.62m     1.31h    0.05d
# Submission to last job:          4722s      78.70m     1.31h    0.05d

    #	combine results into a single file for loading and gbdb reference
    cd /hive/data/genomes/hg38/bed/multiz4way
    #	header: the "#" lines of one per-sequence maf, minus its "eof maf" line
    grep "^#" maf/chr19_GL949749v2_alt.maf | grep -v "eof maf" > multiz4way.maf
    #	body: every non-"#" line from all of the per-sequence maf files
    grep -h -v "^#" maf/*.maf >> multiz4way.maf
    #	trailer: the "eof maf" comment line from that same file goes last
    grep "^#" maf/chr19_GL949749v2_alt.maf | grep "eof maf" >> multiz4way.maf
    #	real    3m27.561s

    #	makes a 8.5 Gb file:
    #   -rw-rw-r-- 1 9044143788 Feb 11 12:51 multiz4way.maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/multiz4way
    mkdir /gbdb/hg38/multiz4way
    ln -s /hive/data/genomes/hg38/bed/multiz4way/multiz4way.maf \
	/gbdb/hg38/multiz4way
    #	the hgLoadMaf generates huge tmp files, locate them in /dev/shm
    cd /dev/shm
    time nice -n +19 hgLoadMaf hg38 multiz4way
    #   Loaded 6141667 mafs in 1 files from /gbdb/hg38/multiz4way
    #   real    2m2.812s

    cd /hive/data/genomes/hg38/bed/multiz4way
    time (cat /gbdb/hg38/multiz4way/*.maf \
        | hgLoadMafSummary -verbose=2 -minSize=10000 \
	-mergeGap=500 -maxSize=50000 hg38 multiz4waySummary stdin)
    # Created 1266559 summary blocks from 11780291 components and 6141667 mafs
    # real    3m0.791s
# -rw-rw-r-- 1  311246327 Feb 11 12:54 multiz4way.tab
# -rw-rw-r-- 1   58730176 Feb 11 12:58 multiz4waySummary.tab
    wc -l multiz4way*
    # 6141667 multiz4way.tab
    # 1266559 multiz4waySummary.tab
    # 7408226 total

#########################################################################
## RE-load alternate sequence for PSL display (DONE - 2016-01-15 - Hiram)
## The procedure below
##    "load alternate sequence for PSL display (DONE - 2014-02-24 - Hiram)"
## produced an illegal psl Table altSeqLiftOverPsl:
    pslCheck -db=hg38 altSeqLiftOverPsl
    #  checked: 266 failed: 264 errors: 1046

## Since then, the gff3ToPsl command has been updated to be a bit more
##  robust, so, the following sequence produces the new alignment file:
    mkdir -p /hive/data/genomes/hg38/bed/altAlignments/redo2016
    cd /hive/data/genomes/hg38/bed/altAlignments/redo2016

# output directory for the per-scaffold PSL files after lifting to UCSC names
mkdir -p ucscPsl

# ../accessionToUcsc.sed.txt holds lines of the form s/<accession>/<ucsc>/g;
# splitting those on "/" gives $2 = accession and $3 = UCSC name, so this
# writes the reverse substitution, anchored to a name at the start of a line
# and followed by a tab
awk -F'/' '{printf "s/^%s\t/%s\t/g;\n", $3,$2}' ../accessionToUcsc.sed.txt \
    > ucscToNcbi.sed.txt

# chrom.sizes with the sequence names rewritten by the reverse substitution
sed -f ucscToNcbi.sed.txt ../../../chrom.sizes > ncbi.chrom.sizes

# lift file, one line per sequence: offset 0, the name and size from
# ncbi.chrom.sizes, then the name and size from the original chrom.sizes;
# paste pairs the two files line by line
paste ncbi.chrom.sizes ../../../chrom.sizes \
  | awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1,$2,$3,$4}' \
    > ncbiToUcsc.lift

# one PSL per alt_scaffolds .gff alignment file found under genbank
find ../../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \
  | while read gff
do
  # file name up to its first underscore
  name=`basename $gff | sed -e 's/_.*//;'`
  # path of the gff directory with "alignments" replaced by the FASTA file
  fasta=`dirname $gff | sed -e 's#alignments#FASTA/alt.scaf.fa.gz#;'`
  # second column of the faCount "total" line
  size=`faCount $fasta | grep -w total | cut -f2`
  # single-line sizes file, rewritten on every pass through the loop
  printf "%s\t%d\n" "$name" "$size" > target.sizes
  gff3ToPsl ncbi.chrom.sizes target.sizes $gff $name.psl
  pslCheck ${name}.psl
  # lift twice with the same lift file: the first pass as is, the second
  # pass with -pslQ for the query side; "error" is the liftUp how-to-handle
  # argument
  liftUp -type=.psl stdout ncbiToUcsc.lift error ${name}.psl \
    | liftUp -type=.psl -pslQ ucscPsl/${name}.psl ncbiToUcsc.lift error stdin
  pslCheck ucscPsl/${name}.psl
done

  pslSort dirs altSeqLiftOverPsl.psl ./tmp ucscPsl
  pslCheck -db=hg38 altSeqLiftOverPsl.psl

  hgLoadPsl hg38 altSeqLiftOverPsl.psl
  pslCheck -db=hg38 altSeqLiftOverPsl
  #  checked: 266 failed: 0 errors: 0

#########################################################################
## load alternate sequence for PSL display (DONE - 2014-02-24 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/altAlignments/sequence
    cd /hive/data/genomes/hg38/bed/altAlignments/sequence

    rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa
    mkdir targetFa
    mkdir queryFa
    touch temp.lift

    # For every line of chrToAlt.bed (chrom, start, end, alt sequence name):
    #   - append a line to temp.lift describing the reference segment
    #   - write the reference segment to targetFa/<alt name>.fa, renamed to
    #     <chrom>_<start>_<end>
    #   - write the alt sequence itself to queryFa/<alt name>.fa
    cat ../../altLocations/chrToAlt.bed | while read L
do
  chrName=`echo $L | awk '{print $1}'`
  # exact match on the name column, independent of whether chrom.sizes is
  # delimited by a tab or by spaces
  chromSize=`awk -v chr="$chrName" '$1 == chr {print $2}' ../../../chrom.sizes`
  chrStart=`echo $L | awk '{printf "%d", $2}'`
  chrEnd=`echo $L | awk  '{printf "%d", $3}'`
  # length of the reference segment: end minus start.  Only two fields are
  # echoed here, so the subtraction has to use $2 (a reference to $3 is
  # empty and would yield chrEnd unchanged)
  chrSize=`echo $chrEnd $chrStart | awk '{print $1-$2}'`
  queryName=`echo $L | awk '{print $4}'`
  partName="${chrName}_${chrStart}_${chrEnd}"
  echo $chrName $chrStart $chrEnd $queryName $partName $chromSize
  # columns: start offset, part name, part size, chromosome, chromosome size
  echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift
  twoBitToFa ../../../hg38.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa
  twoBitToFa ../../../hg38.2bit:$queryName queryFa/$queryName.fa
done

sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift

    mkdir /gbdb/hg38/ncbiAltMappings
    cd /hive/data/genomes/hg38/bed/altAlignments/sequence/queryFa
    ln -s `pwd`/*.fa /gbdb/hg38/ncbiAltMappings
    cd /hive/data/genomes/hg38/bed/altAlignments/sequence
    hgLoadSeq -drop -seqTbl=seqNcbiAltSequence -extFileTbl=extNcbiAltSequence \
        hg38 /gbdb/hg38/ncbiAltMappings/*.fa

    pslSwap ../altAlignments.psl stdout \
      | pslRecalcMatch stdin ../../../hg38.2bit ../../../hg38.2bit \
        hg38.referenceTarget.psl

    # the table name altSeqLiftOverPsl is recognized in hgc to allow display
    # of the details of the alignments
    hgLoadPsl hg38 -table=altSeqLiftOverPsl hg38.referenceTarget.psl

#########################################################################
## alternate sequence alignments EXPERIMENT (DONE - 2014-01-17 - Hiram)
    # the lastzAltSequences.2014-01-23 alignment was used for this instead
    # of this procedure
    mkdir /hive/data/genomes/hg38/bed/altAlignments
    cd /hive/data/genomes/hg38/bed/altAlignments

    # sed program that rewrites column 1 of the acc2name file into column 3,
    # one "s/<col1>/<col3>/g;" line per non-comment line of the input
    grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \
	| awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt

    # convert each alt_scaffolds .gff file separately, after renaming with
    # the sed program above; the output of all runs is collected in
    # altAlignments.psl
    find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \
	| while read F
do
   cat $F | sed -f accessionToUcsc.sed.txt \
	| gff3ToPsl ../../chrom.sizes stdin stdout
done > altAlignments.psl
    # NOTE: two leftover lines used to follow the loop; they began with "|"
    # after the loop's own redirection, which the shell cannot parse.
    # Presumably from a single-pass variant of this step - kept for reference:
    #	| xargs cat | sed -f accessionToUcsc.sed.txt \
    #	| gff3ToPsl ../../chrom.sizes stdin altAlignments.psl

    time pslRecalcMatch altAlignments.psl ../../hg38.2bit ../../hg38.2bit \
        altRecalcMatch.psl
    # real    0m51.122s

    # just to see what they look like in different formats:
    pslToChain altRecalcMatch.psl altAlignments.chain
    chainToAxt altAlignments.chain ../../hg38.2bit ../../hg38.2bit \
	altAlignments.axt
    axtToMaf -score altAlignments.axt ../../chrom.sizes ../../chrom.sizes \
        altAlignments.maf

    mkdir mafSplits
    mafSplit /dev/null mafSplits/ altAlignments.maf
    # doesn't work:
# Can't find chrom in MAF component src: chr6_GL000250v2_alt

    mkdir splits psl
    find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \
        | while read F
do
   chrAlt=`basename $F | sed -e 's/_.*//' | sed -f accessionToUcsc.sed.txt`
   echo $chrAlt
   cat $F | sed -f accessionToUcsc.sed.txt \
        | gff3ToPsl ../../chrom.sizes stdin splits/${chrAlt}.psl
   pslRecalcMatch splits/${chrAlt}.psl ../../hg38.2bit ../../hg38.2bit \
	psl/${chrAlt}.psl
done

   mkdir swap
   mkdir swap/psl swap/chain swap/axt swap/maf swap/anno
   for F in psl/*.psl
do
  B=`basename $F | sed -e 's/.psl//'`
  echo $B
  pslSwap $F stdout | pslRecalcMatch stdin ../../hg38.2bit ../../hg38.2bit \
      swap/psl/${B}.psl
  pslToChain swap/psl/${B}.psl swap/chain/${B}.chain
  chainToAxt swap/chain/${B}.chain ../../hg38.2bit ../../hg38.2bit \
	swap/axt/${B}.axt
  axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
      | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > swap/maf/${B}.maf
  mafAddIRows -nBeds=nBeds swap/maf/${B}.maf ../../hg38.2bit swap/anno/${B}.maf
done
# axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
#      | sed -e 's/^s chr/s hg38.chr/' > swap/maf/${B}.maf

   twoBitInfo -nBed ../../hg38.2bit ../../hg38.N.bed
   ln -s  ../../hg38.N.bed hg38.bed
   ln -s ../../hg38.N.bed ref38.bed
   ln -s ../../hg38.N.bed alt38.bed
   echo hg38.bed > nBeds
   echo ref38.bed >> nBeds
   echo alt38.bed >> nBeds
   ln -s  ../../chrom.sizes hg38.len
   ln -s  ../../chrom.sizes ref38.len
   ln -s  ../../chrom.sizes alt38.len
   echo hg38.len > sizes
   echo ref38.len >> sizes
   echo alt38.len >> sizes

   mkdir chain axt maf anno
   for F in psl/*.psl
do
   B=`basename $F | sed -e 's/.psl//'`
   echo $B
   pslToChain $F chain/${B}.chain
   chainToAxt chain/${B}.chain ../../hg38.2bit ../../hg38.2bit axt/${B}.axt
  axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
      | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > maf/${B}.maf
   mafAddIRows -nBeds=nBeds maf/${B}.maf ../../hg38.2bit anno/${B}.maf
done

#   axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
#      | sed -e 's/^s chr/s hg38.chr/' > maf/${B}.maf

############################################################################
# Liftover Gencode V19 from hg19  (DONE braney 2014-02-14)

mkdir /cluster/data/hg38/bed/liftOverGencodeV19
cd /cluster/data/hg38/bed/liftOverGencodeV19

# every hg19 table matching wgEncodeGencode%19; tail drops the first output line
echo "show tables like 'wgEncodeGencode%19'" | hgsql hg19 | tail -n +2 > all.gencode.tables
# table names listed in hg19.trackDb matching wgEncodeGencode_%V19, no header line
echo " select tableName from trackDb where tableName like 'wgEncodeGencode_%V19';" | hgsql hg19 --skip-column-names > genePred.gencode.tables

# load the non-genepred table as is.   This isn't quite the right thing to do
# with exon support, but it's good enough for our purposes at the moment
# join -v 1 prints the lines of its first file that have no match in the
# second; with only the two list files above present, the glob expands to
# all.gencode.tables then genePred.gencode.tables, so each table named only in
# the first list is copied from hg19 into hg38
join -v 1 *.gencode.tables | while read t; do echo "create table $t select * from hg19.$t" | hgsql hg38; echo $t; done

# NOTE(review): genePredExt.gencode.tables is not written by any command shown
# in this section - presumably it was prepared by hand after the join above
# (a third *.gencode.tables file would change that glob); confirm
# for each listed table: save name,score,name2 sorted by line, map the gene
# models through the hg19ToHg38 chain as fake PSL, convert back to genePred,
# join the saved columns back on by name and load as genePredExt
for i in `cat genePredExt.gencode.tables`;
do
    echo "select name,score,name2 from $i" | hgsql hg19 | sort > $i.name2Score.txt;
    genePredToFakePsl hg19 $i $i.psl $i.cds;
    pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout |  sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | sort |  join /dev/stdin $i.name2Score.txt| tr ' ' '\t' | hgLoadGenePred -genePredExt hg38 $i stdin;
    echo $i;
done

# same mapping for the tables from the trackDb list, without the join step
# and loaded without -genePredExt
for i in `cat genePred.gencode.tables`;
do
    genePredToFakePsl hg19 $i $i.psl $i.cds;
    pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout |  sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout |  tr ' ' '\t' | hgLoadGenePred hg38 $i stdin;
    echo $i;
done

#####################################################################
## tRNAs track ( 2014-02-18 braney DONE)
## this is a preliminary version for UCSC build.  NOT FOR RELEASE!
ssh hgwdev
cd /hive/data/genomes/hg38/bed
mkdir tRNAs
cd tRNAs

cp  /hive/users/pchan/tRNAs/Eukaryota/hg38/hg38-tRNAs.bed .

hgLoadBed -tab hg38 tRNAs hg38-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql

## tRNAs track (2015-10-04, Chris FINISHING BUILD FOR RELEASE)
    cd /hive/data/genomes/hg38/bed/tRNAs
    # strip the stray </BLOCKQUOTE> tags from the delivered bed file.
    # sed writes straight to the output file: piping into a bare "> file"
    # leaves the file empty in bash, since a redirection with no command
    # does not read the pipe
    sed 's^</BLOCKQUOTE>^^g' /hive/users/pchan/gtrnadb2/Eukaryota/hg38/hg38-tRNAs.bed > hg38-tRNAs2.bed
    # replace the preliminary table with the cleaned data
    hgsql hg38 -e 'drop table if exists tRNAs'
    hgLoadBed -tab hg38 tRNAs hg38-tRNAs2.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
    # copy the images and link them in as RNA-img under the hg38 gbdb
    # directory and the htdocs RNA-img directory
    mkdir gif
    cp -p /hive/users/pchan/gtrnadb2/Eukaryota/hg38/images/* gif
    cd /hive/data/gbdb/hg38
    ln -s /hive/data/genomes/hg38/bed/tRNAs/gif RNA-img
    cd /usr/local/apache/htdocs-ceisenhart/RNA-img
    ln -s /gbdb/hg38/RNA-img hg38

############################################################################
# EXONIPHY , lifted from hg19 (DONE - braney 2014-02-19)
#	needed for ucscGenes building
    # exoniphyHg19.gp is prepared as follows
    mkdir /cluster/data/hg38/bed/exoniphy
    cd /cluster/data/hg38/bed/exoniphy
    hgsql hg19 -e "select * from exoniphy" -N | cut  -f 2-16 > exoniphyHg19.gp
    time nice -n +19 liftOver -genePred exoniphyHg19.gp \
	/cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz \
	    exoniphyHg38.gp unmapped
    # real    0m2.015s
    # user    0m1.894s
    # sys     0m0.076s

    wc -l *
    #   186601 exoniphyHg19.gp
    #   186533 exoniphyHg38.gp
    #      136 unmapped
    #   373270 total

    cd /cluster/data/hg38/bed/exoniphy
    nice -n +19 hgLoadGenePred -genePredExt hg38 exoniphy exoniphyHg38.gp
    nice -n +19 featureBits hg38 exoniphy
    # 28807039 bases of 3049335806 (0.945%) in intersection
    nice -n +19 featureBits hg19 exoniphy
    # 28661160 bases of 2897316137 (0.989%) in intersection

#########################################################################
# LASTZ Rat Rn5 (DONE - 2014-02-27 - Hiram)
    #	establish a screen to control this job
    screen -S hg38Rn5
    mkdir /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27
    cd /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27

    # XXX don't forget to specify the BLASTZ binary:
    cat << '_EOF_' > DEF
# human vs rat
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rat Rn5
SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit
SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1

    #   real    658m53.984s
    cat fb.hg38.chainRn5Link.txt
    # 938823407 bases of 3049335806 (30.788%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/rn5/bed/blastz.hg38.swap
    cd /hive/data/genomes/rn5/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \
	-swap \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real    66m53.095s
    cat fb.rn5.chainHg38Link.txt
    #   934256475 bases of 2572853723 (36.312%) in intersection

    # syntenic net for 14-way use 2014-04-02 - Hiram
    cd /hive/data/genomes/rn5/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \
	-continue=syntenicNet -syntenicNet -swap \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1
    #  real    16m54.489s

##############################################################################
# LASTZ Rat Rn4 (DONE - 2014-02-27 - Hiram)
    #	establish a screen to control this job
    screen -S hg38Rn4
    mkdir /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27
    cd /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27

    # XXX don't forget to specify the BLASTZ binary:
    cat << '_EOF_' > DEF
# human vs rat
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rat Rn4
SEQ2_DIR=/hive/data/genomes/rn4/rn4.2bit
SEQ2_LEN=/hive/data/genomes/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzRn4.2014-02-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    #   real    658m53.984s

    cat fb.hg38.chainRn4Link.txt
    #   913992768 bases of 3049335806 (29.974%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/rn4/bed/blastz.hg38.swap
    cd /hive/data/genomes/rn4/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg38/bed/lastzRn4.2014-02-27/DEF \
	-swap \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    73m5.666s

    cat fb.rn4.chainHg38Link.txt
    #	889613774 bases of 2571531505 (34.595%) in intersection

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2014-03-07 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg38/bed/geneid
    cd /hive/data/genomes/hg38/bed/geneid
    mkdir download
    cd download
    # fetch the geneid prediction (.gtf3) and protein (.prot) file for every
    # sequence named in chrom.sizes; --timestamping skips files that are
    # already up to date locally
    for C in `cut -f1 ../../../chrom.sizes`
    do
	echo $C
	wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.gtf3
	wget --timestamping \
http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.prot
    done

    cd ..
    # the downloads are saved under their remote names, i.e. with the .gtf3
    # suffix, so the glob has to use that suffix to find them
    cat download/*.gtf3 | ldHgGene -gtf -genePredExt hg38 geneid stdin
    #	Read 33428 transcripts in 277332 lines in 1 files
    #	33428 groups 92 seqs 1 sources 3 feature types
    #	33428 gene predictions

############################################################################
# GENEREVIEWS TRACK (DONE 2014-05-17 - Chin)
# This track depends on some tasks completed for hg19, specifically:
#
# $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql
# $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql
# $HOME/kent/src/hg/lib/geneReviewsDetail.sql
# $HOME/kent/src/hg/makeDb/trackDb/human/geneReviews.html
#
# Unlike hg19, this hg38 track is generated by the automatic geneReviews
# scripts in
# /hive/data/outside/otto/geneReviews, specifically buildGeneReviews.sh.
# Current data are fetched weekly from NCBI
# ftp://ftp.ncbi.nlm.nih.gov/pub/GeneReviews/
# to /hive/data/outside/otto/geneReviews/${DATE}.

###########################################################################
# Chimp Lastz run (DONE - 2014-05-27 - Hiram)
    screen -S hg38PanTro4      # use a screen to manage this longish running job
    mkdir /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27
    cd /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27

    # always set the BLASTZ program so we know what version was used
    cat << '_EOF_' > DEF
# human vs chimp
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
BLASTZ_O=600
BLASTZ_E=150
# maximum M allowed with lastz is only 254
BLASTZ_M=254

BLASTZ_T=2
BLASTZ_Y=15000
BLASTZ_K=4500
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
#    A    C    G    T
#    90 -330 -236 -356
#  -330  100 -318 -236
#  -236 -318  100 -330
#  -356 -236 -330   90

# TARGET: Human Hg38
SEQ1_DIR=/scratch/data/hg38/hg38.2bit
SEQ1_LEN=/scratch/data/hg38/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Chimp PanTro4
SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=200
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27
TMPDIR=/dev/shm
'_EOF_'
    # << emacs

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > do.log 2>&1
    # real    154m12.215s
    cat fb.hg38.chainPanTro4Link.txt
    # 2839294579 bases of 3049335806 (93.112%) in intersection

    # filter with doRecipBest.pl
    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        hg38 panTro4) > rbest.log 2>&1
    # real    57m55.320s

    # running the swap
    mkdir /hive/data/genomes/panTro4/bed/blastz.hg38.swap
    cd /hive/data/genomes/panTro4/bed/blastz.hg38.swap
    time (doBlastzChainNet.pl -verbose=2 \
        -swap /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27/DEF \
        -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > swap.log 2>&1
    cat fb.panTro4.chainHg38Link.txt
    # 2776497530 bases of 2902338967 (95.664%) in intersection
    # real    98m23.729s

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        panTro4 hg38) > rbest.log 2>&1
    # real    64m33.812s

#############################################################################
# Opossum Lastz run (DONE - 2014-05-27 - Hiram)
    screen -S hg38MonDom5      # use a screen to manage this longish running job
    mkdir /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27
    cd /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27

    # always set the BLASTZ program so we know what version was used
    cat << '_EOF_' > DEF
# human vs opossum
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
BLASTZ_M=50

BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
#     A    C    G    T
#    91  -90  -25 -100
#   -90  100 -100  -25
#   -25 -100  100  -90
#  -100  -25  -90  91

# TARGET: Human Hg38
SEQ1_DIR=/scratch/data/hg38/hg38.2bit
SEQ1_LEN=/scratch/data/hg38/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Opossum MonDom5
SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27
TMPDIR=/dev/shm
'_EOF_'
    # << emacs

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=5000 -chainLinearGap=loose \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > do.log 2>&1
    # real    670m13.280s
    # one failed chain run for hg19, finished manually on hgwdev, then:
    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > chainMerge.log 2>&1
    # real    164m28.822s

    cat fb.hg38.chainMonDom5Link.txt
    # 438195373 bases of 3049335806 (14.370%) in intersection

    # filter with doRecipBest.pl
    time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
        -dbHost=hgwdev -workhorse=hgwdev hg38 monDom5) > rbest.log 2>&1
    # real    130m22.825s

    # running the swap
    mkdir /hive/data/genomes/monDom5/bed/blastz.hg38.swap
    cd /hive/data/genomes/monDom5/bed/blastz.hg38.swap
    time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27/DEF \
        -swap -chainMinScore=5000 -chainLinearGap=loose \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > swap.log 2>&1
    # real    102m41.443s

    cat fb.monDom5.chainHg38Link.txt
    # 420069915 bases of 3501660299 (11.996%) in intersection
    time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
        -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1
    #  real    90m56.189s

_EOF_
#############################################################################
# LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie)
# Redmine #13359, #24285 -- otto-mate To Do #17877
# previously done 7/7/14, 9/9/16, 5/30/18
# THIS IS NOW AN OTTO JOB !!
    set today = `date +%Y_%m_%d`
    mkdir -p /hive/data/genomes/hg38/bed/lrg/$today
    cd /hive/data/genomes/hg38/bed/lrg/$today
    wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip
    unzip LRG_public_xml_files.zip

    # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts:
    # parseLrgXml.pl updated 2020-09-16 to add four new fields to the gp output
    # the four extra fields are identifiers for:
    # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein

    ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38
    genePredCheck lrgTranscriptsUnmapped.gp
#Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46
#checked: 1029 failed: 1
    # If there are complaints e.g. about exonFrame, look for inconsistencies in the
    # affected transcript's coding_region/coordinates vs. exon/intron info in xml.
    # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background
    # (missing exonFrame info doesn't affect our track representation because we end up using
    # psl).  We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon
    # portion is only the stop codon.

    # No longer necessary to filter out alt and fix patches since they have been added to hg38.

    # and we need the transcript plus gene name later:
    cut -f1,12 lrgTranscriptsUnmapped.gp | sort > transcript.gene.name.txt

    # five extra columns have been added to the genePred (2020-10-05 - Hiram)
    # extract them so they can be added to the psl:
    awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort \
       | join -t$'\t' - transcript.gene.name.txt \
         | awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s\t%s %s\n", $1,$2,$3,$4,$5,$7,$6,$7}' > lrgTransExtraFields.tsv

    # the five extra fields are identifiers for:
    # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein,
    #	Gene name

    # Load LRG regions:
    #bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \
    #-tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name
    # after ML #29689, added ncbiAcc field, Max, July 1, 2022
    # changed to:
    bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \
    -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name,ncbiAcc
    ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb
    hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb

    # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD):
    lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl
    pslCheck lrg.psl
#checked: 919 failed: 0 errors: 0
    awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes
    genePredToFakePsl -chromSize=lrg.sizes placeholder \
      lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds
    pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl
    mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \
      lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp
#Warning: no CDS for LRG_163t1
#Warning: no CDS for LRG_347t1
    # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*).
    grep -l NR_ LRG_163.xml LRG_347.xml
#LRG_163.xml
#LRG_347.xml

    cat lrgCdna.tab | sed -e 's/^/>/;' | tr '\t' '\n' > lrgCdna.fa
    # construct bigPsl with five extra fields
    pslToBigPsl -fa=lrgCdna.fa -cds=lrgTranscripts.cds \
	lrgTranscriptsHg38.psl bigPsl.txt

    # add the five extra identifiers to the bigPsl file:
    join -t$'\t' -1 4 \
       -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15\
,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,2.2,2.3,2.4,2.5,2.6,2.7 \
       <(sort -k4 bigPsl.txt) lrgTransExtraFields.tsv \
         | sort -k1,1 -k2,2n > lrgExtraTranscriptsHg38.bigPsl.bed

    bedToBigBed -as=bigPsl+6.as -type=bed12+19 -tab \
       lrgExtraTranscriptsHg38.bigPsl.bed ../../../chrom.sizes lrgBigPsl.bb
    bigBedInfo lrgBigPsl.bb
    rm -f /gbdb/hg38/bbi/lrgBigPsl.bb
    ln -sf `pwd`/lrgBigPsl.bb /gbdb/hg38/bbi
    hgBbiDbLink hg38 lrgBigPsl /gbdb/hg38/bbi/lrgBigPsl.bb


    # Load PSL, CDS and sequences.
    hgLoadPsl hg38 -table=lrgTranscriptAli lrgTranscriptsHg38.psl
    hgLoadSqlTab hg38 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds
    hgPepPred hg38 tab lrgCdna lrgCdna.tab
    hgPepPred hg38 tab lrgPep lrgPep.tab


#############################################################################
## 7-Way Multiz (DONE - 2014-06-02 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg38/bed/multiz7way
    cd /hive/data/genomes/hg38/bed/multiz7way

    # from the 130-way in the source tree, select out the 7 used here:
    /cluster/bin/phast/tree_doctor \
        --prune-all-but hg19,panTro4,rheMac3,mm10,rn5,canFam3,monDom5 \
        /cluster/home/hiram/kent/src/hg/utils/phyloTrees/130way.nh \
          | sed -e 's/hg19/hg38/' > hg38.7way.nh

    #	what that looks like:
    ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh
# (((((hg38:0.006550,
#     panTro4:0.006840):0.027424,
#    rheMac3:0.037601):0.109934,
#   (mm10:0.084509,
#   rn5:0.091589):0.271974):0.020593,
#  canFam3:0.165928):0.258392,
# monDom5:0.340786);

    # extract species list from that .nh file
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
        | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt

    # construct db to name translation list:
    cat species.list.txt | while read DB
do
hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \
        > db.to.name.txt

    # construct a common name .nh file:
    /cluster/bin/phast/tree_doctor --rename \
    "`cat db.to.name.txt`" hg38.7way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > hg38.7way.commonNames.nh

    $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh > t.nh
    $HOME/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
          > hg38.7way.scientificNames.nh
    rm -f t.nh
    cat hg38.7way.scientificNames.nh
# (((((Homo_sapiens:0.00655,
#     Pan_troglodytes:0.00684):0.027424,
#    Macaca_mulatta:0.037601):0.109934,
#   (Mus_musculus:0.084509,
#   Rattus_norvegicus:0.091589):0.271974):0.020593,
#  Canis_lupus_familiaris:0.165928):0.258392,
# Monodelphis_domestica:0.340786);

    ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.commonNames.nh
# (((((Human:0.00655,
#     Chimp:0.00684):0.027424,
#    Rhesus:0.037601):0.109934,
#   (Mouse:0.084509,
#   Rat:0.091589):0.271974):0.020593,
#  Dog:0.165928):0.258392,
# Opossum:0.340786);

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a png image for src/hg/htdocs/images/phylo/hg38_7way.png

    /cluster/bin/phast/all_dists hg38.7way.nh | grep hg38 \
        | sed -e "s/hg38.//" | sort -k2n > 7way.distances.txt
    #	Use this output to create the table below
    head 7way.distances.txt
# panTro4 0.013390
# rheMac3 0.071575
# canFam3 0.330429
# mm10    0.500391
# rn5     0.507471
# monDom5 0.763679

    cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl

# sizeStats.pl - print one table row per species listed in
# 7way.distances.txt (lines of: <db> <distance>).  Each row shows:
#   row number, distance, featureBits chainLink percent measured on hg38,
#   featureBits chainLink percent measured on the other assembly (swap),
#   organism name from hgcentraltest.dbDb, and the database name.

use strict;
use warnings;

my $distFile = "7way.distances.txt";

# Return the fifth whitespace-separated field of a featureBits result file
# with its parentheses and percent sign removed: "(93.112%)" -> "93.112".
# A file that can not be read, or that produces no output, yields 0.0.
sub fbPercent {
    my ($fbFile) = @_;
    my $measure =
        `awk '{print \$5}' ${fbFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $measure;
    $measure = 0.0 if (length($measure) < 1);
    $measure =~ s/\%//;
    return $measure;
}

open (my $fh, "<", $distFile) or die "can not read $distFile: $!";

my $count = 0;
while (my $line = <$fh>) {
    chomp $line;
    my ($D, $dist) = split('\s+', $line);
    # skip blank or incomplete lines rather than printing a bogus row
    next if (!defined($D) || !defined($dist));
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/hg38/bed/lastz.$D/fb.hg38." .
        $chain . "Link.txt";
    my $chainLinkMeasure = fbPercent($B);
    my $swapFile="/hive/data/genomes/${D}/bed/lastz.hg38/fb.${D}.chainHg38Link.txt";
    # When there is no swap result, show the text N/A (padded to the width
    # of the numeric column) instead of pushing "N/A" through %f, which
    # warns and prints a misleading 00.000.
    my $swapText = "   N/A";
    if ( -s $swapFile ) {
        $swapText = sprintf("%06.3f", fbPercent($swapFile));
    }
    my $orgName=
    `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        $orgName="N/A";
    }
    ++$count;
    printf "# %02d  %.4f (%% %06.3f) (%% %s) - %s %s\n", $count, $dist,
        $chainLinkMeasure, $swapText, $orgName, $D;
}
close ($fh);
'_EOF_'
    # << happy emacs
    chmod +x ./sizeStats.pl
    ./sizeStats.pl
#

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure

#       featureBits chainLink measures
#               chainLink
#  N distance  on hg38  on other     other species
# 01  0.0134 (% 93.112) (% 95.664) - Chimp panTro4
# 02  0.0716 (% 79.729) (% 86.715) - Rhesus rheMac3
# 03  0.3304 (% 49.978) (% 60.083) - Dog canFam3
# 04  0.5004 (% 31.629) (% 35.323) - Mouse mm10
# 05  0.5075 (% 30.788) (% 36.312) - Rat rn5
# 06  0.7637 (% 14.370) (% 11.996) - Opossum monDom5

# None of this concern for distances matters in building the first step, the
# maf files.

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh

    sed 's/[()]//g; s/,/ /g' tree.nh > species.list
    #   hg38 panTro4 rheMac3 mm10 rn5 canFam3 monDom5

    #	bash shell syntax here ...
    cd /hive/data/genomes/hg38/bed/multiz7way
    export H=/hive/data/genomes/hg38/bed
    mkdir mafLinks
    # want syntenic net for: panTro4 rheMac3 mm10 rn5 canFam3
    # and unfiltered maf net for: monDom5
    for G in panTro4 rheMac3 mm10 rn5 canFam3
    do
      mkdir mafLinks/$G
      echo ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G
      ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G
    done

    mkdir mafLinks/monDom5
    echo ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5
    ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5
    # verify the symLinks are good:
    ls -ogrtL mafLinks/*/*
#-rw-rw-r-- 1  709500062 Jan 25 12:15 mafLinks/mm10/hg38.mm10.synNet.maf.gz
#-rw-rw-r-- 1 1089643630 Jan 27 19:15 mafLinks/canFam3/hg38.canFam3.synNet.maf.gz
#-rw-rw-r-- 1 1277455681 Jan 28 21:52 mafLinks/rheMac3/hg38.rheMac3.synNet.maf.gz
#-rw-rw-r-- 1  687500679 Mar  1 12:27 mafLinks/rn5/hg38.rn5.synNet.maf.gz
#-rw-rw-r-- 1 1463969868 May 27 11:41 mafLinks/panTro4/hg38.panTro4.synNet.maf.gz
#-rw-rw-r-- 1  323347908 May 29 12:38 mafLinks/monDom5/hg38.monDom5.net.maf.gz

    # split the maf files into a set of hashed named files
    # this hash named split keeps the same chr/contig names in the same
    # named hash file.
    mkdir /hive/data/genomes/hg38/bed/multiz7way/mafSplit
    cd /hive/data/genomes/hg38/bed/multiz7way/mafSplit
    for D in `sed -e "s/hg38 //" ../species.list`
do
    echo "${D}"
    mkdir $D
    cd $D
    echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz"
    mafSplit -byTarget -useHashedName=8 /dev/null . \
	../../mafLinks/${D}/*.maf.gz
    cd ..
done

    # construct a list of all possible maf file names.
    # they do not all exist in each of the species directories
    find . -type f | wc -l
    # 641
    find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list
    wc -l maf.list
    # 118 maf.list

    mkdir /hive/data/genomes/hg38/bed/multiz7way/splitRun
    cd /hive/data/genomes/hg38/bed/multiz7way/splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
# autoMultiz.csh - run autoMZ on one hashed-name maf file.
#   $1 - name of the split maf file to process (an entry from maf.list)
#   $2 - path that receives the resulting multiple alignment maf
# All work happens in a scratch directory under /dev/shm, which is removed
# again once the result has been copied out.
set db = hg38
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/hg38/bed/multiz7way/mafSplit
# start from a clean scratch directory holding the tree and species list
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
# Stage one pairwise maf per non-reference species as $db.$s.sing.maf:
# a gzipped input is uncompressed, a plain input is symlinked, and a
# species with no input (or an empty one) gets a header-only maf file.
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
# put the run directory's penn/ first in the path - presumably so autoMZ
# picks up the multiz and maf_project binaries copied there
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
# replace any previous result with the new one, then drop the scratch area
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/hg38/bed/multiz7way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    ln -s ../../mafSplit/maf.list maf.list
    ssh ku
    cd /hive/data/genomes/hg38/bed/multiz7way/splitRun/run
    gensub2 maf.list single template stdout > jobList
    para -ram=8g create jobList
# Completed: 118 of 118 jobs
# CPU time in finished jobs:     118241s    1970.69m    32.84h    1.37d  0.004 y
# IO & Wait Time:                   682s      11.36m     0.19h    0.01d  0.000 y
# Average job time:                1008s      16.80m     0.28h    0.01d
# Longest finished job:           10068s     167.80m     2.80h    0.12d
# Submission to last job:         10076s     167.93m     2.80h    0.12d

    # combine into one file  (the 1>&2 redirect sends the echo to stderr)
    cd /hive/data/genomes/hg38/bed/multiz7way
    head -1 splitRun/maf/017.maf > multiz7way.maf
    for F in splitRun/maf/*.maf
do
    echo "${F}" 1>&2
    egrep -v "^#" ${F}
done >> multiz7way.maf
    tail -1 splitRun/maf/017.maf >> multiz7way.maf
# -rw-rw-r-- 1 15635828403 Jun  3 11:49 multiz7way.maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/multiz7way
    mkdir /gbdb/hg38/multiz7way
    ln -s `pwd`/multiz7way.maf /gbdb/hg38/multiz7way
    cd /dev/shm
    time nice -n +17 hgLoadMaf hg38 multiz7way
    # Loaded 10270624 mafs in 1 files from /gbdb/hg38/multiz7way
    # real    3m51.265s

    time nice -n +17 hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \
	/gbdb/hg38/multiz7way/multiz7way.maf
    # Created 1260918 summary blocks from 35384988 components
    # and 10270624 mafs from /gbdb/hg38/multiz7way/multiz7way.maf
    # real    5m39.388s


    wc -l multiz7way*.tab
    # 10270624 multiz7way.tab
    # 1260918 multiz7waySummary.tab
    # 11531542 total

    rm multiz7way*.tab

##############################################################################
# GAP ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2014-06-03 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	is in a single file.  Need to split up the maf file into individual
    #   maf files
    mkdir -p /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit
    cd /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit

    time mafSplit -outDirDepth=1 -byTarget -useFullSequenceName \
        /dev/null . ../../multiz7way.maf
    #   real    4m8.617s

    find . -type f | wc -l
    #   353

    # check for N.bed files everywhere:
    cd /hive/data/genomes/hg38/bed/multiz7way/anno
    for DB in `cat ../species.list`
do
    if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
        echo "MISS: ${DB}"
#        cd /hive/data/genomes/${DB}
#        twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
    else
        echo "  OK: ${DB}"
    fi
done

    cd /hive/data/genomes/hg38/bed/multiz7way/anno
    for DB in `cat ../species.list`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done
    # make sure they all are successful symLinks:
    ls -ogrtL

    screen -S hg38      # use a screen to control this longish job
    ssh ku
    cd /hive/data/genomes/hg38/bed/multiz7way/anno
    mkdir result
    for D in `ls mafSplit`
do
    echo mkdir result/${D}
    mkdir result/${D}
done
    cat << '_EOF_' > template
#LOOP
mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/hg38/hg38.2bit {check out exists+ result/$(path1)}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list
    gensub2 maf.list single template jobList
    # limit jobs on a node with the ram=32g requirement because they go fast
    para -ram=32g create jobList
    para try ... check ... push ...
# Completed: 353 of 353 jobs
# CPU time in finished jobs:        530s       8.83m     0.15h    0.01d  0.000 y
# IO & Wait Time:                  1057s      17.62m     0.29h    0.01d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              63s       1.05m     0.02h    0.00d
# Submission to last job:           220s       3.67m     0.06h    0.00d

    # verify all result files have some content, look for 0 size files:
    find ./result -type f -size 0
    # should see none
    # or in this manner:
    find ./result -type f | xargs ls -og | sort -k3nr | tail

    # combine into one file  (the 1>&2 redirect sends the echo to stderr)
    head -q -n 1 result/0/chr8.maf > hg38.7way.maf
    find ./result -type f | while read F
do
    echo "${F}" 1>&2
    grep -h -v "^#" ${F}
done >> hg38.7way.maf

    #	these maf files do not have the end marker, this does nothing:
    #	tail -q -n 1 result/0/chr8.maf >> hg38.7way.maf
    # How about an official end marker:
    echo "##eof maf" >> hg38.7way.maf
    ls -og
# -rw-rw-r--  1 17795297196 Jun  3 14:01 hg38.7way.maf

    du -hsc hg38.7way.maf
    # 17G     hg38.7way.maf

    # construct symlinks to get the individual maf files into gbdb:
    rm /gbdb/hg38/multiz7way/multiz7way.maf   # remove previous results
    ln -s `pwd`/hg38.7way.maf /gbdb/hg38/multiz7way/multiz7way.maf

    # Load into database
    cd /dev/shm
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg38/multiz7way \
        hg38 multiz7way
    # Loaded 10359242 mafs in 1 files from /gbdb/hg38/multiz7way
    # real    4m21.862s

    time hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \
        /gbdb/hg38/multiz7way/multiz7way.maf
#  Created 1260918 summary blocks from 35384988 components
#  and 10359242 mafs from /gbdb/hg38/multiz7way/multiz7way.maf
#  real    6m6.583s

# -rw-rw-r-- 1 530538267 Jun  3 14:05 multiz7way.tab
# -rw-rw-r-- 1  60616616 Jun  3 14:15 multiz7waySummary.tab

    rm multiz7way*.tab

######################################################################
# MULTIZ7WAY MAF FRAMES (DONE - 2014-06-03 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg38/bed/multiz7way/frames
    cd /hive/data/genomes/hg38/bed/multiz7way/frames
#   survey all the genomes to find out what kinds of gene tracks they have
    cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
# showGenes.csh - survey the gene tracks available for each assembly named
# in ../species.list.  For every database print the row count of each of
# the ensGene, refGene, mgcGenes, knownGene and xenoRefGene tables that
# exists, followed by the number of gbCdnaInfo rows for that organism.
foreach db (`cat ../species.list`)
    echo -n "${db}: "
    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
    foreach table ($tables)
        if ($table == "ensGene" || $table == "refGene" || \
           $table == "mgcGenes" || $table == "knownGene" || \
           $table == "xenoRefGene" ) then
           set count = `hgsql $db -N -e "select count(*) from $table"`
            echo -n "${table}: ${count}, "
        endif
    end
    # look up the scientific name, then its id in the hg19 organism table
    set orgName = `hgsql hgcentraltest -N -e \
            "select scientificName from dbDb where name='$db'"`
    set orgId = `hgsql hg19 -N -e \
            "select id from organism where name='$orgName'"`
    # $orgId must be quoted: when the query returns no row the variable is
    # empty, and an unquoted test expands to 'if ( == "")', which csh
    # rejects as an expression syntax error and (with -e) aborts the script
    if ("$orgId" == "") then
        echo "Mrnas: 0"
    else
        set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
        echo "Mrnas: ${count}"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./showGenes.csh
    time ./showGenes.csh
# hg38: knownGene: 104178, mgcGenes: 34081, refGene: 54852, xenoRefGene: 172740, Mrnas: 10723716
# panTro4: ensGene: 29160, refGene: 2622, xenoRefGene: 280516, Mrnas: 11163
# rheMac3: refGene: 6369, xenoRefGene: 275096, Mrnas: 443642
# mm10: ensGene: 94647, knownGene: 61642, mgcGenes: 26768, refGene: 33765, xenoRefGene: 161178, Mrnas: 5224613
# rn5: ensGene: 29188, mgcGenes: 6924, refGene: 18567, xenoRefGene: 175416, Mrnas: 1247500
# canFam3: ensGene: 29884, refGene: 1582, xenoRefGene: 253196, Mrnas: 387195
# monDom5: ensGene: 24882, refGene: 492, xenoRefGene: 248251,  Mrnas: 2461

    # from that summary, use these gene sets:
    # refGene - rheMac3
    # ensGene - panTro4 rn5 canFam3 monDom5
    # knownGene - hg38 mm10

    mkdir genes
    #   1. knownGene: hg38 mm10
    for DB in hg38 mm10
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > genes/${DB}.gp.gz
done
    #   2. ensGene:
    for DB in panTro4 rn5 canFam3 monDom5
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    #   3. refGene
    for DB in rheMac3
do
hgsql -N -e "select * from refGene" ${DB} | cut -f2- \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    # verify counts for genes are reasonable:
    for T in genes/*.gz
do
    echo -n "# $T: "
    zcat $T | cut -f1 | sort | uniq -c | wc -l
done
# genes/canFam3.gp.gz: 19507
# genes/hg38.gp.gz: 21887
# genes/mm10.gp.gz: 21013
# genes/monDom5.gp.gz: 21033
# genes/panTro4.gp.gz: 18657
# genes/rheMac3.gp.gz: 5614
# genes/rn5.gp.gz: 22863

    time (cat ../anno/hg38.7way.maf \
	| nice -n +19 genePredToMafFrames hg38 stdin stdout \
	    `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \
		| gzip > multiz7wayFrames.bed.gz)
    #   real    3m44.591s

    # verify there are frames on everything, should be 7 species:
    zcat multiz7wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c
# 265160 canFam3
# 208941 hg38
# 253323 mm10
# 574521 monDom5
# 200156 panTro4
#  49802 rheMac3
# 244731 rn5

    #   load the resulting file
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/multiz7way/frames
    time hgLoadMafFrames hg38 multiz7wayFrames multiz7wayFrames.bed.gz
    #   real    0m19.959s

    time featureBits -countGaps hg38 multiz7wayFrames
    #   52686177 bases of 3209286105 (1.642%) in intersection
    #   real    0m12.593s

    #   enable the trackDb entries:
# frames multiz7wayFrames
# irows on
    #   appears to work OK

#########################################################################
# Phylogenetic tree from 7-way (DONE - 2014-06-04 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/multiz7way/4d
    cd /hive/data/genomes/hg38/bed/multiz7way/4d

    # the annotated maf is:
    ../anno/hg38.7way.maf

    # using knownGene for hg38
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg38 > hg38.knownGene.gp

    genePredSingleCover hg38.knownGene.gp stdout | sort > hg38.knownGeneNR.gp
    wc -l hg38.knownGeneNR.gp
    #	21887 hg38.knownGeneNR.gp

    mkdir annoSplit
    cd annoSplit
    time mafSplit -verbose=2 -outDirDepth=1 -byTarget -useFullSequenceName \
        /dev/null . ../../anno/hg38.7way.maf
    # real    5m14.770s

    find . -type f | wc -l
    #   353
    ssh ku
    mkdir /hive/data/genomes/hg38/bed/multiz7way/4d/run
    cd /hive/data/genomes/hg38/bed/multiz7way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
# 4d.csh - run msa_view --4d on one split maf file, using the hg38
# knownGene predictions on that sequence as features.
#   $1 - maf file name; with its extension removed it is the sequence name
#   $2 - maf file path relative to 4d/annoSplit
#   $3 - output file path relative to 4d/mfa
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set r = "/hive/data/genomes/hg38/bed/multiz7way"
set c = $1:r
set infile = $r/4d/annoSplit/$2
set outDir = $r/4d/mfa/$3:h
set outfile = $r/4d/mfa/$3
/bin/mkdir -p $outDir
# temporary $c.gp and $c.ss files are written in /scratch/tmp
cd /scratch/tmp
# keep only the gene predictions on this sequence and prefix the sequence
# name in the chrom column with "hg38."
/bin/awk -v C=$c '$2 == C {print}' $r/4d/hg38.knownGeneNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
echo $NL
if ("$NL" != "0") then
    # first pass writes the --4d sites in SS format, second pass converts
    # that SS file with --tuple-size 1 into the output file
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
else
    # no genes on this sequence: write a one-line placeholder (a single
    # newline) - presumably so the 'check out line+' test in the para
    # template still finds a line in the output
    echo "" > $outfile
endif
/bin/rm -f $c.gp $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list

    cat << '_EOF_' > template
#LOOP
4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check
    para time
# Completed: 353 of 353 jobs
# CPU time in finished jobs:        836s      13.93m     0.23h    0.01d  0.000 y
# IO & Wait Time:                  1172s      19.54m     0.33h    0.01d  0.000 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest finished job:              72s       1.20m     0.02h    0.00d
# Submission to last job:            89s       1.48m     0.02h    0.00d

    # Not all results have contents, that is OK

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/multiz7way/4d
    # remove the broken empty files, size 0 and size 1 (4d.csh writes a
    # lone newline when a sequence has no gene predictions):
    find ./mfa -type f -size 0 | xargs rm -f
    # a bare 'find -size N' counts 512-byte blocks, not bytes; the 'c'
    # suffix is required to select files of exactly one byte.  The list
    # is kept in empty.list as a record of what was removed.
    find ./mfa -type f -size 1c > empty.list
    cat empty.list | xargs rm -f
    #want comma-less species.list
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.list`" mfa/*/*.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # check they are all in there:
    grep "^>" 4d.all.mfa
    #    >hg38
    #    >panTro4
    #    >rheMac3
    #    >mm10
    #    >rn5
    #    >canFam3
    #    >monDom5

    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../hg38.7way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh
    # tree_commas.nh looks like:
    #   (((((hg38,panTro4),rheMac3),(mm10,rn5)),canFam3),monDom5)
    # use phyloFit to create tree model (output is phyloFit.mod)
    time nice -n +19 \
	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree_commas.nh 4d.all.mfa
    #   real    0m6.583s


    mv phyloFit.mod all.mod

    grep TREE all.mod
# TREE: (((((hg38:0.00673596,panTro4:0.00686169):0.0248146,rheMac3:0.0357598):0.0970072,(mm10:0.081661,rn5:0.0874126):0.246527):0.0264964,canFam3:0.156769):0.303241,monDom5:0.303241);

    # compare these calculated lengths to the tree extracted from 130way:
    grep TREE all.mod | sed -e 's/TREE: //' \
      | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 | sort -k3n \
        | sed -e "s/hg38.//; s/^/    #  /"
    #  panTro4  0.013598
    #  rheMac3  0.067310
    #  canFam3  0.311823
    #  mm10     0.456746
    #  rn5      0.462497
    #  monDom5  0.761536

    # yes, somewhat similar
    /cluster/bin/phast/all_dists ../hg38.7way.nh | grep hg38 \
        | sort -k3n | sed -e "s/hg38.//; s/^/    #  /"
    #  panTro4   0.013390
    #  rheMac3   0.071575
    #  canFam3   0.330429
    #  mm10      0.500391
    #  rn5       0.507471
    #  monDom5   0.763679

#########################################################################
# phastCons 7-way (DONE - 2014-06-04 - Hiram)
    # split 7way mafs into 10M chunks and generate sufficient statistics
    # files for phastCons
    ssh ku
    mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/SS
    cd /hive/data/genomes/hg38/bed/multiz7way/cons/SS
    mkdir result done

    cat << '_EOF_' > mkSS.csh
#!/bin/csh -ef
# Split one annotated MAF into sufficient-statistics (SS) window files
# with msa_split.
#   arg 1 - sub-directory of anno/result that holds the MAF
#   arg 2 - MAF file name without the .maf suffix
#   arg 3 - marker file; the job exits at once when this file, or its
#           .running companion, already exists with content
set d = $1
set c = $2
set doneDir = done/$d
set MAF = /hive/data/genomes/hg38/bed/multiz7way/anno/result/$d/$c.maf
set WINDOWS = /hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$d/$c
# WC is the total line count of the MAF, NL the count of lines starting
# with #; they are compared below to detect a MAF holding only # lines
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $3 ) then
    exit 0
endif
if ( -s $3.running ) then
    exit 0
endif

/bin/mkdir -p $doneDir
# record the start time in the .running marker
/bin/date >> $3.running

# start from an empty output directory for this MAF
/bin/rm -fr $WINDOWS
/bin/mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
# only run msa_split when the MAF has something besides # lines
if ( $WC != $NL ) then
# -w 10000000,0 gives the 10M windows; see msa_split --help for -I and -B
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
# record the finish time in the marker file and clear the .running marker
/bin/date >> $3
/bin/rm -f $3.running
'_EOF_'
    # << happy emacs
    chmod +x mkSS.csh

    cat << '_EOF_' > template
#LOOP
mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	do the easy ones first to see some immediate results
    find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list

    gensub2 maf.list single template jobList
    para -ram=32g create jobList
    para try ... check ... etc
# Completed: 353 of 353 jobs
# CPU time in finished jobs:       1216s      20.27m     0.34h    0.01d  0.000 y
# IO & Wait Time:                  1385s      23.08m     0.38h    0.02d  0.000 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest finished job:             111s       1.85m     0.03h    0.00d
# Submission to last job:           189s       3.15m     0.05h    0.00d

    find ./result -type f | wc -l
    #	 641

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh ku
    mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons
    cd /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons

    #	This is setup for multiple runs based on subsets, but only running
    #   the 'all' subset here.
    #   It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  Running:
    #	all and vertebrates

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
# Run phastCons on one SS file, producing a .pp score file and a .bed
# file of most-conserved elements under the current working directory.
#   arg 1 - name used for --seqname, --idpref and the output file names
#   arg 2 - sub-directory of cons/SS/result holding the SS file
#   arg 3 - SS file name
#   arg 4 - value for --expected-length
#   arg 5 - value for --target-coverage
#   arg 6 - value for --rho
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set d = $2
set f = $3
set len = $4
set cov = $5
set rho = $6
# the group name is the last component of the working directory; it
# selects the model file <grp>.mod and the optional <grp>.non-inf file
set grp = $cwd:t
set cons = /hive/data/genomes/hg38/bed/multiz7way/cons
# per-job scratch directory
set tmp = $cons/tmp/${d}_${c}
mkdir -p $tmp
set ssSrc = $cons/SS/result
set useGrp = "$grp.mod"
# symlink the inputs into the scratch directory; the non-inf file is
# linked only when it exists with content
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.non-inf $tmp
  ln -s $ssSrc/$d/$f $tmp
else
  ln -s $ssSrc/$d/$f $tmp
  ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
# the two phastCons calls differ only in the --not-informative option,
# which is given the contents of the non-inf file when one was linked
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
else
  $PHASTBIN/phastCons $f $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
endif
popd > /dev/null
# result directories are relative to the directory the job was started in
mkdir -p pp/$d bed/$d
# NOTE(review): the sleep and touch are presumably there to let the
# shared filesystem catch up before the moves - confirm
sleep 4
touch pp/$d bed/$d
# replace any result left by an earlier run
rm -f pp/$d/$c.pp
rm -f bed/$d/$c.bed
mv $tmp/$c.pp pp/$d
mv $tmp/$c.bed bed/$d
# clean up the scratch directory
rm -fr $tmp
rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h
'_EOF_'
    # << happy emacs
    chmod +x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list
    wc -l ss.list
    #	641 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/hg38/bed/multiz7way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=32g create jobList
    para try ... check ...
    para push
# Completed: 641 of 641 jobs
# CPU time in finished jobs:       6557s     109.29m     1.82h    0.08d  0.000 y
# IO & Wait Time:                  4497s      74.94m     1.25h    0.05d  0.000 y
# Average job time:                  17s       0.29m     0.00h    0.00d
# Longest finished job:              33s       0.55m     0.01h    0.00d
# Submission to last job:           120s       2.00m     0.03h    0.00d

    # create Most Conserved track
    cd /hive/data/genomes/hg38/bed/multiz7way/cons/all
    cut -f1 ../../../../chrom.sizes | while read C
do
    ls -d bed/?/${C} 2> /dev/null | while read D
    do
        echo ${D}/${C}*.bed 1>&2
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed

    /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
    #    -rw-rw-r--  1 42636652 Jun  4 10:45 tmpMostConserved.bed
    #    -rw-rw-r--  1 43721828 Jun  4 10:45 mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/multiz7way/cons/all
    time nice -n +19 hgLoadBed hg38 phastConsElements7way mostConserved.bed
    #  Read 1234990 elements of size 5 from mostConserved.bed
    #  real    0m11.390s

    # on human we often try for 5% overall cov, and 70% CDS cov
    # most bets are off here for that goal, these alignments are too few
    #	and too far between
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits hg38 -enrichment knownGene:cds phastConsElements7way
    # knownGene:cds 1.266%, phastConsElements7way 4.551%,
    #    both 0.888%, cover 70.16%, enrich 15.42x

    # Create merged posterior probability file and wiggle track data files
    cd /hive/data/genomes/hg38/bed/multiz7way/cons/all
    mkdir downloads

    # the third sed fixes the chrom names, removing the partition extensions
    time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \
        | gzip -c > downloads/phastCons7way.wigFix.gz)
    #   real    37m47.242s

    # check integrity of data with wigToBigWig
    time (zcat downloads/phastCons7way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \
	    phastCons7way.bw) > bigWig.log 2>&1 &
    tail bigWig.log
    # pid=34733: VmPeak:    33106324 kB
    #   real    40m53.287s

    bigWigInfo phastCons7way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 5,675,802,079
# primaryIndexSize: 92,579,900
# zoomLevels: 10
# chromCount: 353
# basesCovered: 2,898,191,577
# mean: 0.168088
# min: 0.000000
# max: 1.000000
# std: 0.233827

    #	encode those files into wiggle data
    time (zcat downloads/phastCons7way.wigFix.gz \
	| wigEncode stdin phastCons7way.wig phastCons7way.wib)
    #   Converted stdin, upper limit 1.00, lower limit 0.00
    #   real    15m28.525s

    du -hsc *.wi?
    #  2.7G    phastCons7way.wib
    #  282M    phastCons7way.wig
    #  3.0G    total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons7way.wib /gbdb/hg38/multiz7way/phastCons7way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way \
	hg38 phastCons7way phastCons7way.wig
    #   real    0m33.502s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh hg38 phastCons7way
# db.table          min max mean       count sumData      stdDev  viewLimits
# hg38.phastCons7way 0 1 0.168088 2898191577 4.87152e+08 0.233827 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram -db=hg38 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons7way > histogram.data 2>&1
    #	real    2m40.179s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human hg38 Histogram phastCons7way track"
set xlabel " phastCons7way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# phyloP for 7-way (DONE - 2014-06-04 - Hiram)
    # run phyloP with score=LRT
    ssh ku
    # same /hive/data/genomes path that doPhyloP.csh uses for its $cons
    # directory and that the rest of this section works in
    mkdir /hive/data/genomes/hg38/bed/multiz7way/consPhyloP
    cd /hive/data/genomes/hg38/bed/multiz7way/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.556
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../cons/all/all.mod 0.556 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.222000 0.278000 0.278000 0.222000

    cat << '_EOF_' > doPhyloP.csh
#!/bin/csh -fe
# Run phyloP (LRT method, CONACC mode) on one SS file and write its
# wiggle scores to the requested output file.
#   arg 1 - SS file path relative to cons/SS/result, without the .ss suffix
#   arg 2 - output wigFix file
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
# d is the directory part of arg 1, file1 its last path component
set d = $f:h
set file1 = $f:t
set out = $2
# the last path component with its final extension removed is passed
# to phyloP as the --chrom name
set cName = $f:t:r
# the group name is the last component of the working directory; it
# selects the model file run.phyloP/<grp>.mod
set grp = $cwd:t
set cons = /hive/data/genomes/hg38/bed/multiz7way/consPhyloP
# per-job scratch directory, emptied before use
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "/hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$f"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $file1.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
# NOTE(review): the sleep and touch are presumably there to let the
# shared filesystem catch up before the move - confirm
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$file1.wigFix $out
# remove the scratch directory, then each parent directory that has
# become empty
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
'_EOF_'
    # << happy emacs

    # Create list of chunks
    find ../../cons/SS/result -type f | grep ".ss$" \
	| sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list
    # make sure the list looks good
    wc -l ss.list
    #	641 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all
    cd /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    # the -ram=32g will allow only one job per node to slow this down since
    #	it would run too fast otherwise.  Either run on one of the small
    #	klusters or use the -ram=32g on the para create
    para -ram=32g create jobList
    para try ... check ... push ... etc ...
    para time > run.time
# Completed: 641 of 641 jobs
# CPU time in finished jobs:       4755s      79.24m     1.32h    0.06d  0.000 y
# IO & Wait Time:                  4343s      72.39m     1.21h    0.05d  0.000 y
# Average job time:                  14s       0.24m     0.00h    0.00d
# Longest finished job:              27s       0.45m     0.01h    0.00d
# Submission to last job:          1152s      19.20m     0.32h    0.01d

    # make downloads
    mkdir downloads

    time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phyloP7way.wigFix.gz) &
    #   real    29m51.665s

    # check integrity of data with wigToBigWig
    time (zcat downloads/phyloP7way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \
	phyloP7way.bw) > bigWig.log 2>&1 &
    egrep "real|VmPeak" bigWig.log
    # pid=76577: VmPeak:    33106320 kB
    #  real    42m53.038s


    bigWigInfo phyloP7way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 3,759,451,708
# primaryIndexSize: 92,579,900
# zoomLevels: 10
# chromCount: 353
# basesCovered: 2,898,191,577
# mean: 0.074472
# min: -5.220000
# max: 1.062000
# std: 0.545945

    #	encode those files into wiggle data
    time (zcat downloads/phyloP7way.wigFix.gz \
	| wigEncode stdin phyloP7way.wig phyloP7way.wib) &
    #   Converted stdin, upper limit 1.06, lower limit -5.22
    #   real    16m11.861s


    du -hsc *.wi?
    #   47M     phyloP7way.wib
    #   12M     phyloP7way.wig
    #   58M     total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP7way.wib /gbdb/hg38/multiz7way/phyloP7way.wib
    nice hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way hg38 \
	phyloP7way phyloP7way.wig

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh hg38 phyloP7way
# db.table      min max mean count sumData
# hg38.phyloP7way -5.22 1.062 0.0744721 2898191577 2.15834e+08
#       stdDev viewLimits
#     0.545945 viewLimits=-2.65525:1.062

    #	that range is: 5.22+1.062 = 6.282 for hBinSize=0.006282

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.006282 -hBinCount=1000 -hMinVal=-5.22 -verbose=2 \
	    -db=hg38 phyloP7way > histogram.data 2>&1
    #   real    2m55.843s


    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin
# Q1 0.000001
# median 0.000060
# Q3 0.000656
# average 0.001022
# min 0.000000
# max 0.065461
# count 978
# total 0.999982
# standard deviation 0.004157

    #	create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human hg38 Histogram phyloP7way track"
set xlabel " phyloP7way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# construct download files for 7-way (DONE - 2014-06-05 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way
    mkdir /hive/data/genomes/hg38/bed/multiz7way/downloads
    cd /hive/data/genomes/hg38/bed/multiz7way/downloads
    mkdir multiz7way phastCons7way phyloP7way
    cd multiz7way
    time cp -p ../../anno/hg38.7way.maf .
    #   real    0m55.984s
    time gzip *.maf
    #   real    46m53.149s

    ln -s ../../hg38.7way.nh .
    ln -s ../../hg38.7way.commonNames.nh .
    time md5sum *.nh *.maf.gz > md5sum.txt
    #   real    1m55.317s
    ln -s `pwd`/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way

    du -hsc *.maf.gz ../../anno/hg38.7way.maf
    #  3.5G    hg38.7way.maf.gz
    #   17G     ../../anno/hg38.7way.maf

    #####################################################################
    cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phastCons7way

    ln -s ../../cons/all/downloads/phastCons7way.wigFix.gz \
        ./hg38.phastCons7way.wigFix.gz
    ln -s ../../cons/all/phastCons7way.bw ./hg38.phastCons7way.bw
    ln -s ../../cons/all/all.mod ./hg38.phastCons7way.mod
    time md5sum *.gz *.mod *.bw > md5sum.txt
    #   real    0m37.384s
    # obtain the README.txt from petMar2/phastCons7way and update for this
    #   situation
    ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
      /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way

    #####################################################################
    cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phyloP7way

    ln -s ../../consPhyloP/all/downloads/phyloP7way.wigFix.gz \
        ./hg38.phyloP7way.wigFix.gz
    ln -s ../../consPhyloP/run.phyloP/all.mod hg38.phyloP7way.mod
    ln -s ../../consPhyloP/all/phyloP7way.bw hg38.phyloP7way.bw

    time md5sum *.mod *.bw *.gz > md5sum.txt
    #   real    0m29.431s

    # obtain the README.txt from geoFor1/phyloP7way and update for this
    #   situation
    ln -s `pwd`/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way

    ###########################################################################
    ## create upstream knownGene maf files
    cd /hive/data/genomes/hg38/bed/multiz7way/downloads/multiz7way
    # bash script
#!/bin/sh
export geneTbl="knownGene"
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits hg38 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags hg38 multiz7way \
                stdin stdout \
                -orgs=/hive/data/genomes/hg38/bed/multiz7way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    #   real    60m16.631s

    md5sum upstream*.gz >> md5sum.txt

    # some other symlinks were already made above
    # obtain the README.txt from geoFor1/multiz7way and update for this
    #   situation
    ln -s `pwd`/upstream*.gz README.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way

#############################################################################
# hgPal downloads (DONE - 2014-06-06 - Hiram)
#   FASTA from 7-way for knownGene, refGene and knownCanonical

    ssh hgwdev
    screen -S hg38HgPal
    mkdir /hive/data/genomes/hg38/bed/multiz7way/pal
    cd /hive/data/genomes/hg38/bed/multiz7way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    # Build and run the mafGene jobs for knownGene: two jobs per chrom
    # (nucleotide exons and translated exons), each one backgrounded
    # inside the generated job file.
    export mz=multiz7way
    export gp=knownGene
    export db=hg38
    export I=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
        echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
        echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
        # after every 7 chroms, have the job file wait for the
        # outstanding background jobs before starting more
        if [ $I -gt 6 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    # the job file has no #! line and is not executable, so run it through
    # sh, the same as the refGene run below; keep it in the foreground
    # because the merge step that follows reads its output files
    time sh -x $gp.jobs > $gp.jobs.log 2>&1
    #   real    28m46.919s

    time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    #   real    0m23.798s
    time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    1m28.197s

    export mz=multiz7way
    export gp=knownGene
    export db=hg38
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    rm -rf exonAA exonNuc

    ### need other gene track alignments also
    # running up refGene
    cd /hive/data/genomes/hg38/bed/multiz7way/pal
    export mz=multiz7way
    export gp=refGene
    export db=hg38
    export I=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
        if [ $I -gt 6 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    time sh -x $gp.jobs > $gp.jobs.log 2>&1
    #   real    15m15.424s

    export mz=multiz7way
    export gp=refGene
    export db=hg38
    time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    #   real    0m23.119s
    time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    1m15.547s

    du -hsc exonAA exonNuc refGene*.fa.gz
    #  59M     exonAA
    #  101M    exonNuc
    #  59M     refGene.multiz7way.exonAA.fa.gz
    #  101M    refGene.multiz7way.exonNuc.fa.gz
    #  317M    total

    rm -rf exonAA exonNuc

    # we're only distributing exons at the moment
    export mz=multiz7way
    export gp=refGene
    export db=hg38
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    ### And knownCanonical
    cd /hive/data/genomes/hg38/bed/multiz7way/pal
    export mz=multiz7way
    export gp=knownCanonical
    export db=hg38
    mkdir exonAA exonNuc ppredAA ppredNuc knownCanonical

    cut -f1 ../../../chrom.sizes | while read C
    do
        echo $C
	hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed
    done

    ls knownCanonical/*.known.bed | while read F
    do
      if [ -s $F ]; then
         echo $F | sed -e 's#knownCanonical/##; s/.known.bed//'
      fi
    done | while read C
    do
	echo "date"
	echo "mafGene -geneBeds=knownCanonical/$C.known.bed  $db $mz knownGene order.list stdout | \
	    gzip -c > ppredAA/$C.ppredAA.fa.gz"
	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -noTrans $db $mz knownGene order.list stdout | \
	    gzip -c > ppredNuc/$C.ppredNuc.fa.gz"
	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list stdout | \
	    gzip -c > exonNuc/$C.exonNuc.fa.gz"
	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \
	    gzip -c > exonAA/$C.exonAA.fa.gz"
    done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1
    # real    72m58.133s

    # the per-chrom bed files were written into knownCanonical/ above
    rm knownCanonical/*.known.bed
    mz=multiz7way
    gp=knownCanonical
    db=hg38
    # merge the per-chrom results; the first three merges run in the
    # background while the fourth runs in the foreground
    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz &
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz &
    zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz &
    zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
    # make sure the background merges are finished before their input
    # directories are removed
    wait

    rm -rf exonAA exonNuc ppredAA ppredNuc

    mz=multiz7way
    gp=knownCanonical
    db=hg38
    pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    cd  $pd
    md5sum *.exon*.fa.gz > md5sum.txt

#############################################################################
# wiki page for 7-way (DONE - 2014-06-04 - Hiram)
    mkdir /hive/users/hiram/bigWays/hg38.7way
    cd /hive/users/hiram/bigWays
    echo "hg38" > hg38.7way/ordered.list
    awk '{print $1}' /hive/data/genomes/hg38/bed/multiz7way/7way.distances.txt \
       >> hg38.7way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They may already be done.
    ./sizeStats.sh hg38.7way/ordered.list
    # dbDb.sh constructs hg38.7way/Hg38_7-way_conservation_alignment.html
    ./dbDb.sh hg38 7way
    # sizeStats.pl constructs hg38.7way/Hg38_7-way_Genome_size_statistics.html
    ./sizeStats.pl hg38 7way

    # defCheck.pl constructs Hg38_7-way_conservation_lastz_parameters.html
    ./defCheck.pl hg38 7way

    # this constructs the html pages in hg38.7way/:
# -rw-rw-r-- 1 4153 Jun  5 11:03 Hg38_7-way_conservation_alignment.html
# -rw-rw-r-- 1 5833 Jun  5 11:04 Hg38_7-way_Genome_size_statistics.html
# -rw-rw-r-- 1 3854 Jun  5 11:04 Hg38_7-way_conservation_lastz_parameters.html

    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
#  Hg38_7-way_conservation_alignment
#  Hg38_7-way_Genome_size_statistics
#  Hg38_7-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.

#############################################################################
# GRC Incident database (DONE - 2014-06-14 - Hiram)
    # this procedure is run as a cron job in Hiram's account:

    #	33 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo

    # data comes from: ftp://ftp.ncbi.nlm.nih.gov/pub/grc/
    # processed by /hive/data/outside/grc/incidentDb/grcUpdate.sh

    # the table in the database is: grcIncidentDb
    # which holds a single row, the URL to the bb file:
    # http://genomewiki.ucsc.edu/images/7/7f/Hg38.grcIncidentDb.bb

#############################################################################
# RepeatMasker Visualization track (DONE - 2014-07-25 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/rmskJoined
    cd /hive/data/genomes/hg38/bed/rmskJoined

    ln -s ../repeatMasker/hg38.sorted.fa.out .
    ln -s ../repeatMasker/hg38.fa.align.gz .

    # working on fixing this script for the next release of RM
    /scratch/data/RepeatMasker140131/util/nextVerRmToUCSCTables.pl \
            -out hg38.sorted.fa.out -align hg38.fa.align.gz

    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \
        -renameSqlTable -verbose=4 -tab \
            -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \
                rmskJoinedBaseline hg38.sorted.fa.join.bed \
                    > loadJoined.log 2>&1

    # table definition comes from the same $HOME/kent source tree as the
    # rmskJoined.sql used by the hgLoadBed command above
    hgLoadSqlTab hg38 rmskAlignBaseline \
        $HOME/kent/src/hg/lib/rmskAlign.sql \
            hg38.fa.align.tsv > loadAlign.log 2>&1

    hgLoadOutJoined -verbose=2 hg38 hg38.sorted.fa.out > loadOut.log 2>&1

    featureBits -countGaps hg38 rmskJoinedBaseline
    #    2716777279 bases of 3209286105 (84.654%) in intersection

##############################################################################
# LASTZ Macaca Mulatta RheMac2 (DONE - 2014-07-13 - braney)
    mkdir /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11
    cd /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11

    # best to always specify an exact path to lastz so we know which one is used
    # lastz default parameters are human-mouse parameters

    cat << '_EOF_' > DEF
# human vs macaca mulatta
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Macaca Mulatta RheMac2
SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs
    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
        `pwd`/DEF \
        -syntenicNet -fileServer=hgwdev \
	-chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
    #  Elapsed time: 141m36s
    cat fb.hg38.chainRheMac2Link.txt
    # 2455106923 bases of 3049335806 (80.513%) in intersection

    #   running the swap
    mkdir /hive/data/genomes/rheMac2/bed/blastz.hg38.swap
    cd /hive/data/genomes/rheMac2/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11/DEF \
        -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1
    # 83m26.095s
    cat fb.rheMac2.chainHg38Link.txt
    # 2313950599 bases of 2646704109 (87.428%) in intersection
#

#########################################################################
# LASTZ Chlorocebus sabaeus  (DONE - 2014-07-13 - braney)
    mkdir /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11
    cd /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11

    # best to always specify an exact path to lastz so we know which one is used
    # lastz default parameters are human-mouse parameters

    cat << '_EOF_' > DEF
# human vs Chlorocebus sabaeus
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
# other parameters from panTro2 vs hg18 lastz on advice from Webb
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2


# TARGET: Human Hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY Chlorocebus sabaeus chlSab2
SEQ2_DIR=/scratch/data/chlSab2/chlSab2.2bit
SEQ2_LEN=/scratch/data/chlSab2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11
TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs
    time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
        `pwd`/DEF \
        -syntenicNet -fileServer=hgwdev \
	-chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
    # Elapsed time: 142m4s
    cat fb.hg38.chainChlSab2Link.txt
    # 2573435303 bases of 3049335806 (84.393%) in intersection

    #   running the swap
    mkdir /hive/data/genomes/chlSab2/bed/blastz.hg38.swap
    cd /hive/data/genomes/chlSab2/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11/DEF \
        -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1
    # 88m48.411s
    cat fb.chlSab2.chainHg38Link.txt
    # 2429053010 bases of 2752019208 (88.264%) in intersection

#########################################################################
# SEGMENTAL DUPLICATIONS (DONE - 2014-08-13 - Hiram)
    # redmine issue: refs #13580

    # file received in email from Archana Natarajan Raja (araja at uw.edu)
    mkdir /hive/data/genomes/hg38/bed/genomicSuperDups
    cd /hive/data/genomes/hg38/bed/genomicSuperDups
# -rw-r--r-- 1 16478617 Aug 11 16:18 GenomicSuperDup.tab

    # no longer filtering items smaller than 1,000 bases, see note
    # in redmine issue refs #13580
# While the size of the 24 alignments are less than 1000 bases , the size of
# their pairs to which they align are always >1000, you can confirm this by
# looking at the value in column 22 in your table (alignB -ucsc format), will
# always be >1000 bp . We are seeing this only now because there are lots of
# new and resolved duplications added to hg38. Hence , I would recommend not
# filtering these items and uploading the current set as is.

    # there is no chrEBV in the browser:
    grep -v chrEBV GenomicSuperDup.tab | sed -e 's/\t_\t/\t-\t/;' \
      | hgLoadBed hg38 genomicSuperDups stdin \
	-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
    #  Read 69894 elements of size 29 from stdin

    checkTableCoords  hg38 genomicSuperDups
    # <silence>  (the chrEBV was found with this check)

    featureBits -countGaps hg38 genomicSuperDups
    # 175429664 bases of 3209286105 (5.466%) in intersection

    featureBits -countGaps hg19 genomicSuperDups
    #  166092393 bases of 3137161264 (5.294%) in intersection
    featureBits -countGaps hg18 genomicSuperDups
    #  159204446 bases of 3107677273 (5.123%) in intersection

    featureBits -countGaps mm10 genomicSuperDups
    # 214917441 bases of 2730871774 (7.870%) in intersection
    featureBits -countGaps mm9 genomicSuperDups
    # 208214567 bases of 2725765481 (7.639%) in intersection

##############################################################################
# cloneEnds (DONE - 2014-08-14 - Hiram)

    mkdir /hive/data/genomes/hg38/bed/cloneEnds
    cd /hive/data/genomes/hg38/bed/cloneEnds

    # fetch the NCBI INSDC name correspondence file:
    rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001405.26.assembly.txt ./

    # fetch the clone reports
    mkdir reports
    rsync -a -P \
rsync://ftp.ncbi.nih.gov/repository/clone/reports/Homo_sapiens/*.GCF_000001405.26.106.*.gff \
       ./reports/

    # script to establish refSeq to UCSC chrom names:

    cat << '_EOF_' > refSeqNames.pl
#!/usr/bin/env perl

# Read the NCBI assembly report GCF_000001405.26.assembly.txt from the
# current directory and print to stdout one tab-separated line per sequence:
#     <refSeq name>	<UCSC chrom name>
#
# The UCSC name is constructed from the tab-separated report columns:
#     column 1 - sequence name, an underscore here marks a non-chromosome
#                sequence that gets the chrN_<contig><suffix> style name
#     column 2 - sequence type: alt-scaffold, unlocalized-scaffold,
#                unplaced-scaffold select the _alt, _random suffix or chrUn
#     column 3 - chromosome name, MT becomes M
#     column 5 - contig accession, the '.' before the version becomes 'v'
#     column 7 - refSeq name
# Comment lines (leading #) and lines missing any of columns 2, 5 or 7
# are skipped.

use strict;
use warnings;

my $report = "GCF_000001405.26.assembly.txt";
open (my $fh, "<", $report) or die "can not read $report: $!";
while (my $line = <$fh>) {
  chomp $line;
  next if ($line =~ m/^#/);
  my @a = split('\t', $line);
  my $chrN = $a[2];
  my $refSeq = $a[6];
  my $contig = $a[4];
  my $type = $a[1];
  next if (!defined $type);
  next if (!defined $refSeq);
  next if (!defined $contig);
  my $suffix = "";
  if ($type eq "alt-scaffold") {
     $suffix = "_alt";
  } elsif ($type eq "unlocalized-scaffold") {
     $suffix = "_random";
  } elsif ($type eq "unplaced-scaffold") {
     $chrN = "Un";
  }
  $chrN = "M" if ($chrN eq "MT");
  if ($a[0] =~ m/_/) {
    # only the first '.' is replaced: accession.version -> accessionvVersion
    $contig =~ s/\./v/;
    printf "%s\tchr%s_%s%s\n", $refSeq, $chrN, $contig, $suffix;
  } else {
    printf "%s\tchr%s\n", $refSeq, $chrN;
  }
}
close ($fh);
'_EOF_'
    # << happy emacs

    chmod +x refSeqNames.pl

    ./refSeqNames.pl > refSeq.ucscName.tab

    # establish full library list:
    ls reports/*.GCF_000001405.26.106.*.gff | sed -e 's#reports/##' \
       | cut -d"." -f1 | sort -u > library.list.txt

    # a script to scan the GFF files, with the refSeq.ucscName.tab
    # name correspondence to construct bed files

    cat << '_EOF_' > hg38.pl
#!/usr/bin/env perl

# Convert NCBI clone report GFF files into UCSC bed lines.
#
# usage: ./hg38.pl <report.gff> [moreReports.gff]
#
# Files read in addition to the GFF files on the command line:
#   refSeq.ucscName.tab (current directory) - two tab-separated columns:
#       refSeq name, UCSC chrom name
#   /hive/data/genomes/hg38/chrom.sizes - UCSC chrom name, chrom size
#
# Output:
#   stdout - one bed12 line, with two blocks, for each parent ID that has
#       both a clone_insert_start and a clone_insert_end recorded on the
#       same chrom as the parent
#   stderr - one bed6 line for every GFF item processed, mixed with
#       warning/error messages that all begin with '#'.  Since stderr
#       doubles as the bed6 item output, every message must end with a
#       newline so it can not run into the following bed6 line.

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc < 1) {
  printf STDERR "usage: ./hg38.pl <report.gff> [moreReports.gff]\n";
  exit 255;
}

my %refSeqToUcsc;   # key is refSeq name, value is UCSC chrom name
open (my $nameFh, "<", "refSeq.ucscName.tab") or die "can not read refSeq.ucscName.tab: $!";
while (my $line = <$nameFh>) {
  chomp $line;
  my ($refSeq, $ucsc) = split('\t', $line);
  $refSeqToUcsc{$refSeq} = $ucsc;
}
close ($nameFh);

my %chromSizes;    # key is UCSC chrom name, value is chrom size
open (my $sizeFh, "<", "/hive/data/genomes/hg38/chrom.sizes") or die "can not read hg38/chrom.sizes: $!";
while (my $line = <$sizeFh>) {
  chomp $line;
  my ($chr, $size) = split('\t', $line);
  $chromSizes{$chr} = $size;
}
close ($sizeFh);

# each GFF file is processed on its own, these hashes start empty per file
while (my $file = shift) {
my %starts;   # key is parent ID, value is start end coordinates start<tab>end
my %ends;     # key is parent ID, value is end end coordinates start<tab>end
my %parents;  # key is an ID without a '-', value is the UCSC chrom of that item
my %endNames;   # key is parent ID, value is the Name of the parent clone_insert

printf STDERR "# processing $file\n";

open (my $gffFh, "<", $file) or die "can not read $file: $!";
while (my $line = <$gffFh>) {
  chomp $line;
  next if ($line=~ m/^#/);
  my @a = split('\t', $line);
  next if (scalar(@a) < 1);
  # strip the ref| prefix and the first remaining | from the sequence name
  my $contig = $a[0];
  $contig =~ s/ref.//;
  $contig =~ s/\|//;
  my $ucscChr = $refSeqToUcsc{$contig};
  if (!defined($ucscChr)) {
    printf STDERR "# ERR: contig not in refSeqToUcsc: '$contig'\n";
    next;
  }
  next if (! exists($chromSizes{$ucscChr}));
  my $chromSize = $chromSizes{$ucscChr};
  # GFF start is 1-based, bed start is 0-based
  my $chromStart = $a[3] - 1;
  my $chromEnd = $a[4];
  # clamp coordinates that run past the end of the chrom
  if ($chromStart > $chromSize) {
    printf STDERR "# warning chromStart over size $ucscChr $chromStart $chromEnd\n";
    $chromStart = $chromSize-1;
  }
  if ($chromEnd > $chromSize) {
    my $overRun = $chromEnd - $chromSize;
    printf STDERR "# warning chromEnd over size by $overRun -> $ucscChr $chromStart $chromEnd\n";
    $chromEnd = $chromSize;
  }
  # pick ID, Parent and Name out of the tag=value;tag=value attribute column
  my $id="notFound";
  my $name="notFound";
  my $parent="notFound";
  my @b = split(';', $a[8]);
  for (my $i = 0; $i < scalar(@b); ++$i) {
     my ($tag, $value) = split('=', $b[$i]);
     if ($tag eq "ID") {
        $id = $value;
        # an ID without a '-' is recorded as a potential parent
        if ($id !~ m/-/) {
          if (exists($parents{$id})) {
            printf STDERR "# WARN: duplicate parent: $id\n";
          } else {
            $parents{$id} = $ucscChr;
          }
        }
     } elsif ($tag eq "Parent") {
        $parent = $value;
     } elsif ($tag eq "Name") {
        $name = $value;
     }
  }
  # the _start and _end patterns must be tested before plain clone_insert
  # since clone_insert matches all three
  my $type="notFound";
  my $insertType = $a[2];
  if ($insertType =~ m/clone_insert_start/) {
     $type = "start";
     if ($parent eq "notFound") {
       printf STDERR "# ERR: can not find parent for start $name Ttype $id\n";
     } else {
       if (!exists($parents{$parent})) {
         printf STDERR "# ERR: start found $name  with no parent $parent declared\n";
       } elsif (exists($starts{$parent})) {
         printf STDERR "# ERR: duplicate start for $parent\n";
       } elsif ($ucscChr eq $parents{$parent}) {
         $starts{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
       } else {
         printf STDERR "# ERR: start on different chrom $ucscChr than parent $parent $parents{$parent}\n";
       }
     }
  } elsif ($insertType =~ m/clone_insert_end/) {
     $type = "end";
     if ($parent eq "notFound") {
       printf STDERR "# ERR: can not find parent for end $name Ttype $id\n";
     } else {
       if (!exists($parents{$parent})) {
         printf STDERR "# ERR: end found $name  with no parent $parent declared\n";
       } elsif (exists($ends{$parent})) {
         printf STDERR "# ERR: duplicate end for $parent\n";
       } elsif ($ucscChr eq $parents{$parent}) {
         $ends{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
       } else {
         printf STDERR "# ERR: end on different chrom $ucscChr than parent $parent $parents{$parent}\n";
       }
     }
  } elsif ($insertType =~ m/clone_insert/) {
     $type = "insert";
     $endNames{$id} = $name;
  }
  $name =~ s/gi\|//g;
  $id =~ s/gi\|//g;
  # bed6 line for this item goes to stderr
  printf STDERR "%s\t%d\t%d\t%s_%s_%s\t0\t%s\n", $ucscChr, $chromStart, $chromEnd, $name, $type, $id, $a[6];
}	# while (my $line = <$gffFh>)

close ($gffFh);

# one bed12 line per parent that has both ends recorded
foreach my $parent (keys %parents) {
  if (! exists($starts{$parent}) ) {
    printf STDERR "# ERR: no start for $parent\n";
  } elsif (! exists($ends{$parent}) ) {
    printf STDERR "# ERR: no end for $parent\n";
  } else {
    my $strand = "+";
    my $chrStart = 0;
    my $chrEnd = 0;
    my $blockStart = 0;
    my ($sStart, $sEnd) = split('\t', $starts{$parent});
    my ($eStart, $eEnd) = split('\t', $ends{$parent});
    my $startSize = $sEnd - $sStart;
    my $endSize = $eEnd - $eStart;
    if ($eStart < $sStart) {
      # end lies before start: negative strand, the end item is block one
      $chrStart = $eStart;
      $chrEnd = $sEnd;
      $blockStart = $sStart - $chrStart;
      $strand = "-";
      $startSize = $eEnd - $eStart;
      $endSize = $sEnd - $sStart;
    } else {
      $chrStart = $sStart;
      $chrEnd = $eEnd;
      $blockStart = $eStart - $chrStart;
    }
    # the first block must not run into the start of the second block
    if ($startSize > $blockStart) {
      printf STDERR "# startSize > blockStart $endNames{$parent}\n";
    } else {
      printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t2\t%d,%d\t0,%d\n", $parents{$parent}, $chrStart, $chrEnd, $endNames{$parent}, $strand, $chrStart, $chrEnd, $startSize, $endSize, $blockStart;
    }
  }
}
}
'_EOF_'
    # << happy emacs

    chmod +x hg38.pl

    # process GFF files into bed files into separateLibs/ directory
# For each library: hg38.pl writes the bed12 clone lines on stdout and the
# per-item bed6 lines (plus '#' messages) on stderr, each is sorted into
# its own file under separateLibs/<library>/
for L in `cat library.list.txt`
do
   export destDir="separateLibs/${L}"
   # progress message goes to stderr
   echo "working: ${L}" 1>&2
   mkdir -p "${destDir}"
   ./hg38.pl reports/${L}.GCF_000001405.26.106.*.gff \
       2> ${destDir}/tmp.bed6 | sort -k1,1 -k2,2n > ${destDir}/hg38.${L}.bed
   sort -k1,1 -k2,2n ${destDir}/tmp.bed6 > ${destDir}/hg38.${L}.items.bed6
done

    # use only those libraries with more than 20,000 clone ends
    wc -l separateLibs/*/*.bed | sort -n | grep -v total | awk '$1 > 20000' \
        | sed -e 's#.*separateLibs/##; s#/.*##' > libs.over20K.list

    # note those libraries with less than 20,000 clone ends
    wc -l separateLibs/*/*.bed | grep -v total | awk '$1 < 20000' | sed -e 's#.*separateLibs/##; s#/.*##' > libs.under20K.list

    # filter out bad ends, length must be < median size times three
    # items at or over that length go to hg38.badMap.bed instead
    cat libs.over20K.list | while read D
do
   # item lengths are only computed once per library
   if [ ! -s separateLibs/${D}/lengths.txt ]; then
      awk '{print $3-$2}' separateLibs/${D}/hg38.${D}.bed \
        > separateLibs/${D}/lengths.txt
   fi
   median3X=`ave separateLibs/${D}/lengths.txt | grep median | awk '{printf "%d", $2*3}'`
   awk '($3-$2) < '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.median3X.bed
   awk '($3-$2) >= '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.badMap.bed
   before=`cat separateLibs/${D}/hg38.${D}.bed | wc -l`
   after=`cat separateLibs/${D}/hg38.median3X.bed | wc -l`
   dropped=`echo $before $after | awk '{print $1-$2}'`
   # percent of items dropped, computed from the two values piped to awk
   perCent=`echo $dropped $before | awk '{printf "%.2f", 100*$1/$2}'`
   echo "$D $before - $after = $dropped -> % $perCent dropped"
done

#  ABC20 24692 - 24474 = 218 -> % 0.88 dropped
#  RP11 86660 - 85903 = 757 -> % 0.87 dropped
#  CTD 95853 - 94941 = 912 -> % 0.95 dropped
#  CH17 105618 - 105060 = 558 -> % 0.53 dropped
#  ABC21 182154 - 180973 = 1181 -> % 0.65 dropped
#  ABC22 189939 - 188743 = 1196 -> % 0.63 dropped
#  COR02 208263 - 206782 = 1481 -> % 0.71 dropped
#  ABC18 325080 - 322904 = 2176 -> % 0.67 dropped
#  ABC27 334178 - 331822 = 2356 -> % 0.71 dropped
#  ABC24 398944 - 395776 = 3168 -> % 0.79 dropped
#  ABC23 436965 - 433896 = 3069 -> % 0.70 dropped
#  ABC16 452220 - 449101 = 3119 -> % 0.69 dropped
#  COR2A 583008 - 578578 = 4430 -> % 0.76 dropped
#  WI2 587165 - 582843 = 4322 -> % 0.74 dropped
#  ABC7 649297 - 644071 = 5226 -> % 0.80 dropped
#  ABC11 729962 - 724864 = 5098 -> % 0.70 dropped
#  ABC9 755994 - 750648 = 5346 -> % 0.71 dropped
#  ABC12 777816 - 771827 = 5989 -> % 0.77 dropped
#  ABC10 787969 - 781331 = 6638 -> % 0.84 dropped
#  ABC13 810822 - 803589 = 7233 -> % 0.89 dropped
#  ABC14 845573 - 839126 = 6447 -> % 0.76 dropped
#  ABC8 1204275 - 1192784 = 11491 -> % 0.95 dropped

   # loading the median3X files
for L in `cat libs.over20K.list`
do
    echo $L 1>&2
    hgLoadBed -type=bed12 hg38 cloneEnd${L} \
       separateLibs/${L}/hg38.median3X.bed \
        > separateLibs/loadBed.${L}.log 2>&1
done

   # loading the dropped ends:
   mkdir /hive/data/genomes/hg38/bed/cloneEnds/droppedTooBig
   # the relative ../ paths below are only valid from inside droppedTooBig
   cd /hive/data/genomes/hg38/bed/cloneEnds/droppedTooBig
   # link them to here
   cat ../libs.over20K.list | while read L
do
  ln -s ../separateLibs/${L}/hg38.badMap.bed ${L}.badMap.bed
done
  # then load
  hgLoadBed -type=bed12 hg38 cloneEndbadEnds *.badMap.bed
  # back to the cloneEnds directory, the following steps use
  # libs.over20K.list and separateLibs/ relative to it
  cd /hive/data/genomes/hg38/bed/cloneEnds

    # construct multiple mapped ends:
for L in `cat libs.over20K.list`
do
    cat separateLibs/${L}/hg38.median3X.bed
done | sort -k4 > allEnds.bed

    cut -f4 allEnds.bed | sort | uniq -c | sort -rn > allEnds.names.count.txt

    awk '$1 > 1' allEnds.names.count.txt | awk '{print $2}' \
       | sort > multiples.names.txt

    join -t'	' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" \
       -2 4 multiples.names.txt allEnds.bed | sort -k1,1 -k2,2n \
           > allEnds.multiple.locations.bed

    hgLoadBed -type=bed12 hg38 cloneEndmultipleMaps \
        allEnds.multiple.locations.bed > load.multipleMaps.log 2>&1

    awk '$6 == "+"' allEnds.bed | sort -k1,1 -k2,2n \
      | bedItemOverlapCount hg38 stdin > allEnds.forward.bedGraph

    awk '$6 == "-"' allEnds.bed | sort -k1,1 -k2,2n \
      | bedItemOverlapCount hg38 stdin > allEnds.reverse.bedGraph

    bedGraphToBigWig allEnds.forward.bedGraph \
       /hive/data/genomes/hg38/chrom.sizes \
         cloneEndcoverageForward.bw

    bedGraphToBigWig allEnds.reverse.bedGraph \
       /hive/data/genomes/hg38/chrom.sizes \
          cloneEndcoverageReverse.bw

    mkdir /gbdb/hg38/bbi/cloneEnd
    ln -s `pwd`/cloneEndcoverageForward.bw /gbdb/hg38/bbi/cloneEnd
    ln -s `pwd`/cloneEndcoverageReverse.bw /gbdb/hg38/bbi/cloneEnd

    hgBbiDbLink hg38 cloneEndcoverageForward \
        /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageForward.bw
    hgBbiDbLink hg38 cloneEndcoverageReverse \
        /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageReverse.bw

    ### Fixup the scores to indicate how many multiple mappings as mentioned
    ### in the hg19 bacEnds description page: one mapping: score = 1000
    ### multiple mappings: score = 1500/count
    ### the sort | uniq -c | awk does this score calculation with the name
    ###   in column 1
    ### The join puts the existing table together with those scores
    ### DONE - 2015-06-18 - Hiram

    mkdir /hive/data/genomes/hg38/bed/cloneEnds/addCounts
    cd /hive/data/genomes/hg38/bed/cloneEnds/addCounts
    mkdir score withScore noScore
    # per table:
    #   score/     name -> new score (1000 for one mapping, else 1500/count)
    #   noScore/   dump of the existing table, sorted for the join on name
    #   withScore/ table rows with the score column replaced by the new score
    # the table is then emptied and reloaded from the withScore file
    for table in cloneEndABC10 cloneEndABC11 cloneEndABC12 cloneEndABC13 \
cloneEndABC14 cloneEndABC16 cloneEndABC18 cloneEndABC20 cloneEndABC21 \
cloneEndABC22 cloneEndABC23 cloneEndABC24 cloneEndABC27 cloneEndABC7 \
cloneEndABC8 cloneEndABC9 cloneEndCH17 cloneEndCOR02 cloneEndCOR2A \
cloneEndCTD cloneEndRP11 cloneEndWI2 cloneEndbadEnds cloneEndmultipleMaps
do
  hgsql -N -e "select name from $table;" hg38 | sort | uniq -c |
      awk '{ if (1 == $1) {printf "%s\t1000\n", $2} else {printf "%s\t%d\n", $2, 1500/$1} }' \
         | sort > score/hg38.$table.score.tab
  hgsql -N -e "select * from $table order by name;" hg38 \
      | sort -k5 > noScore/hg38.$table.tab
  # the join separator is a single tab character (bash $'\t' quoting);
  # join output is: name, remaining table columns, new score as column 14
  join -t$'\t' -1 5 noScore/hg38.$table.tab score/hg38.$table.score.tab \
  | awk '{printf "%d\t%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n", $2,$3,$4,$5,$1,$14,$7,$8,$9,$10,$11,$12,$13}' \
    | sort -k2,2 -k3,3n > withScore/hg38.$table.withScore.tab
  hgsql -e "delete from $table;" hg38
  hgsql -e "load data local infile \"withScore/hg38.$table.withScore.tab\" into table $table;" hg38
done

##############################################################################
# SIB Transcriptome (DONE 2014-08-27 Steve)

    # Create working directory and download data from where Christian
    # Iseli (christian.iseli@unil.ch) put it, and unpack.
    mkdir -p /hive/data/genomes/hg38/bed/sibTranscriptome
    cd /hive/data/genomes/hg38/bed/sibTranscriptome
    wget --timestamping http://ludwig-sun1.unil.ch/~chris/HTr.gtf.gz
    wget --timestamping http://ludwig-sun1.unil.ch/~chris/txg.tar.gz

    tar -zxvf txg.tar.gz

    zcat HTr.gtf.gz | ldHgGene hg38 sibGene stdin
    # Reading stdin
    # Read 208508 transcripts in 2824960 lines in 1 files
    # 208508 groups 25 seqs 1 sources 2 feature types
    # 208508 gene predictions

    # Do a little data cleanup and transformation and load splice graphs
    # into database.
    sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql > sibTxGraph.sql
    cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb \
      -sqlTable=sibTxGraph.sql hg38 sibTxGraph stdin
    # Reading stdin
    # Read 47817 elements of size 18 from stdin
    # Sorted
    # Creating table definition for sibTxGraph from sql: sibTxGraph.sql
    # Saving bed.tab
    # Loading hg38

    # Create sibAltEvents track for analyzed alt-splices.
    # Not on RR for hg18 and hg19, so do not push it out
    cat txg/*.txg | txgAnalyze stdin /cluster/data/hg38/hg38.2bit sibAltEvents.bed
    awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
    hgLoadBed hg38 sibAltEvents foo.bed
    # Reading foo.bed
    # Read 452436 elements of size 6 from foo.bed
    # Sorted
    # Creating table definition for sibAltEvents, bedSize: 6
    # Saving bed.tab
    # Loading hg38

    # push sibGene and sibTxGraph for hg38

############################################################################
# Orangutan Lastz run (DONE - 2014-05-27 - Hiram)
    screen -S hg38PonAbe2      # use a screen to manage this longish running job
    mkdir /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02
    cd /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02

    # always set the BLASTZ program so we know what version was used
    cat << '_EOF_' > DEF
# human vs orangutan
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
BLASTZ_O=600
BLASTZ_E=150
# maximum M allowed with lastz is only 254
BLASTZ_M=254

BLASTZ_T=2
BLASTZ_Y=15000
BLASTZ_K=4500
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
#    A    C    G    T
#    90 -330 -236 -356
#  -330  100 -318 -236
#  -236 -318  100 -330
#  -356 -236 -330   90

# TARGET: Human Hg38
SEQ1_DIR=/scratch/data/hg38/hg38.2bit
SEQ1_LEN=/scratch/data/hg38/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Orangutan PonAbe2
SEQ2_DIR=/hive/data/genomes/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/hive/data/genomes/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=100
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02
TMPDIR=/dev/shm
'_EOF_'

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > do.log 2>&1
    # real    144m46.575s
    cat fb.hg38.chainPonAbe2Link.txt
    # 2719618310 bases of 3049335806 (89.187%) in intersection

    # filter with doRecipBest.pl
    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        hg38 ponAbe2) > rbest.log 2>&1
    # real    60m1.060s
    time (doRecipBest.pl -load -continue=load -workhorse=hgwdev \
	-buildDir=`pwd` hg38 ponAbe2) > loadRBest.log 2>&1 &
    # real    3m35.834s

    cat fb.hg38.chainRBestPonAbe2Link.txt
    # 2538296592 bases of 3049335806 (83.241%) in intersection

    # running the swap
    mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap
    cd /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap
    time (doBlastzChainNet.pl -verbose=2 \
        -swap /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02/DEF \
        -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > swap.log 2>&1
    # real    102m27.866s
    cat fb.ponAbe2.chainHg38Link.txt
    #  2773568958 bases of 3093572278 (89.656%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
        ponAbe2 hg38) > rbest.log 2>&1
    # real    78m47.312s




#############################################################################
# Add chrX alts to par (DONE 2014-10-14 angie)
# Thanks to Hiram for pointing out that intersecting chrX positions in
# altLocations and par shows whether a chrX alt overlaps a PAR.
    cd /hive/data/genomes/hg38/bed/par
    hgsql hg38 -e 'select * from altLocations where chrom = "chrX"'
#+-----+-------+------------+----------+---------------------+
#| bin | chrom | chromStart | chromEnd | name                |
#+-----+-------+------------+----------+---------------------+
#|  73 | chrX  |     319337 |   601516 | chrX_KI270880v1_alt |
#|  73 | chrX  |     326487 |   601516 | chrX_KI270913v1_alt |
#| 149 | chrX  |   79965153 | 80097082 | chrX_KI270881v1_alt |
#+-----+-------+------------+----------+---------------------+
    hgsql hg38 -e 'select * from par where chrom = "chrX"'
#+-----+-------+------------+-----------+------+
#| bin | chrom | chromStart | chromEnd  | name |
#+-----+-------+------------+-----------+------+
#|   9 | chrX  |      10000 |   2781479 | PAR1 |
#| 221 | chrX  |  155701382 | 156030895 | PAR2 |
#+-----+-------+------------+-----------+------+
    # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1;
    # chrX_KI270881v1_alt is not in either PAR.
    hgsql hg38 -e 'select chrom,size from chromInfo \
                     where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");'
#+---------------------+--------+
#| chrom               | size   |
#+---------------------+--------+
#| chrX_KI270880v1_alt | 284869 |
#| chrX_KI270913v1_alt | 274009 |
#+---------------------+--------+
    # Process that into bed4 with name=PAR1:
    hgsql hg38 -NBe 'select chrom, 0, size, "PAR1" from chromInfo \
                       where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' \
      >> hg38Par.bed4
    hgLoadBed hg38 par hg38Par.bed4
    checkTableCoords hg38 par


#############################################################################
# LASTZ Cow bosTau8 (DONE - 2014-10-15 - Steve)
    # work directory name must match the BASE setting in the DEF file below
    mkdir /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15
    cd /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15

    cat << '_EOF_' > DEF
# human vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Human hg38
SEQ1_DIR=/scratch/data/hg38/hg38.2bit
SEQ1_LEN=/scratch/data/hg38/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau8
SEQ2_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit
SEQ2_LEN=/hive/data/genomes/bosTau8/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0


BASE=/hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real    602m37.523s
    cat fb.hg38.chainBosTau8Link.txt
    # 1401921010 bases of 3049335806 (45.975%) in intersection
    # Create link
    cd /hive/data/genomes/hg38/bed
    ln -s  lastzBosTau8.2014-10-15 lastz.bosTau8

    #   running the swap
    mkdir /hive/data/genomes/bosTau8/bed/blastz.hg38.swap
    cd /hive/data/genomes/bosTau8/bed/blastz.hg38.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15/DEF \
        -swap  -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real     116m32.121s
    cat fb.bosTau8.chainHg38Link.txt
    #   1336307377 bases of 2649307237 (50.440%) in intersection
    cd /hive/data/genomes/bosTau8/bed
    ln -s blastz.hg38.swap lastz.hg38

############################################################################
# NCBI ClinVar (new version -DONE - 2014-11-08 - Max)
# see hg19.txt
#########################################################################

########################################################################
# CNV Developmental Delay track (2014-11-21 Steve)

    mkdir /hive/data/genomes/hg38/bed/cnvDevDelay
    cd /hive/data/genomes/hg38/bed/cnvDevDelay

wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd100_Coe_et_al_2014/gvf/nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz'
wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd54_Cooper_et_al_2011/gvf/nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz'

# three local copies of the gvf conversion script from the kent source tree,
# one per case/control filter
cp $HOME/kent/src/hg/utils/automation/gvfToBed8Attrs.pl .
mv gvfToBed8Attrs.pl gvfToBed8AttrsCase.pl
cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl100.pl
cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl54.pl

# made three local copies of Angie's gvf conversion script - one to include
# only case individuals from nstd100, one to include only control individuals
# from nstd100 and one to include only control individuals from nstd54

# had to add an additional elsif statement to the nstd100 scripts to filter
# based on sample_name field:

#  } elsif ($tag eq "sample_name") {
#    $sample_name = $val;
#  }

# added line 33/35 to each file:

# next if ($sample_name eq "Unknown"); # keep only "case" individuals from nstd100
# next if ($sample_name ne "Unknown"); # keep only "control" individuals from nstd100
# next if ($phenotype ne "not_reported"); # keep only "control" individuals from nstd54

zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsCase.pl > cnvDevDelayAllCase.bed
zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl100.pl > cnvDevDelayAllControl.bed
zcat nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl54.pl >> cnvDevDelayAllControl.bed

# GRCh38 data from dbVar had different naming scheme for alternate chromosomes
# (e.g., chr1|NT_187515.1 instead of chr1_KI270762v1_alt), so needed to write
# a script to substitute the correct UCSC names

    cat << '_EOF_' > chromXref.pl
#!/usr/bin/env perl

# Rewrite sequence names of the form <chr>|<accession> into UCSC names.
#
# usage: ./chromXref.pl <infile> <outfile>
#
# hg38.xref (current directory) supplies the mapping, four tab-separated
# columns: type, chrom, acc1, acc2.  For each line the replacement text
#     _<acc1 name>v<acc1 version>_<type>
# is built, where type is the part before the first '-' and 'unlocalized'
# becomes 'random'.  Every occurrence of |<acc2> in <infile> is replaced
# by that text and the result is written to <outfile>,
# e.g. chr1|NT_187515.1 -> chr1_KI270762v1_alt

use strict;
use warnings;

sub usage {
  printf STDERR "usage: ./chromXref.pl <infile> <outfile>\n";
}

my $argc = scalar(@ARGV);

if ($argc != 2) {
  usage();
  exit 255;
}

open (my $xrefFh, "<", "hg38.xref") or die "cannot read hg38.xref: $!";

my @accArray = ();   # each element: [ UCSC replacement text, accession to find ]
while (my $line = <$xrefFh>) {
  chomp($line);
  my ($type, $chr, $acc1, $acc2) = split('\t', $line);
  ($type, undef) = split('-', $type);
  ($acc1, my $version) = split('\.', $acc1);
  if ($type eq "unlocalized") {
    $type = "random";
  }
  my $ucscAcc = "_" . $acc1 . "v" . $version . "_" . $type;
  push @accArray, [ $ucscAcc, $acc2 ];
}

close ($xrefFh);

open (my $inFh, "<", $ARGV[0]) or die "cannot read $ARGV[0]: $!";
open (my $outFh, ">", $ARGV[1]) or die "cannot write $ARGV[1]: $!";

# read the whole input file as one string, $/ is only undefined
# for the duration of this read
my $fileContents = do { local $/; <$inFh> };
$fileContents = "" if (!defined $fileContents);
close ($inFh);

foreach my $pair (@accArray) {
  my ($ucscAcc, $findAcc) = @$pair;
  # \Q..\E so the '.' in the accession only matches a literal '.'
  $fileContents =~ s/\|\Q$findAcc\E/$ucscAcc/g;
}

print $outFh $fileContents;
close ($outFh) or die "cannot write $ARGV[1]: $!";
'_EOF_'
    # << happy emacs

cp /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt .

cat GCF_000001405.26.assembly.txt | grep -v '^#\|assembled\|unplaced' | awk '{print $2 "\t" $3 "\t" $5 "\t" $7}' > hg38.xref

chromXref.pl cnvDevDelayAllCase.bed cnvDevDelayAllCaseUcsc.bed
chromXref.pl cnvDevDelayAllControl.bed cnvDevDelayAllControlUcsc.bed

hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
        -allowStartEqualEnd hg38 cnvDevDelayCase cnvDevDelayAllCaseUcsc.bed

hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
        -allowStartEqualEnd hg38 cnvDevDelayControl cnvDevDelayAllControlUcsc.bed

    checkTableCoords hg38 cnvDevDelayCase
    checkTableCoords hg38 cnvDevDelayControl


#########################################################################
# RETROFINDER RETROPOSED GENES ucscRetro track VERSION 9
# (2015-01-12 - 2015-01-20, hartera, DONE)
ssh hgwdev
mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112
cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112

cat << '_EOF_' > DEF

RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 "
VERSION=9
RUNDATE="2015-01-12"
DB=hg38
SCORETHRESH=550
GENOMENAME='Homo sapiens'
GBDB=hg
DATE=20150112
RUNDIR=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE
BINDIR=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/bin
KENTDIR=/cluster/home/hartera/kent
KENTBINDIR=/cluster/home/hartera/bin/x86_64
MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION
TMPMRNA=$RUNDIR/mrnaBlastz/$DB
TMPEST=$RUNDIR/est/$DB
USEALTSEQS=0
EST=all_est
SPLICED_EST=intronEst
SPLIT_EST=0
SPLIT_SPLICED_EST=0
LASTZPROG=/cluster/bin/penn/x86_64/lastz
SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/src/pipeline
GENOME=/hive/data/genomes
RETRODIR=$GENOME/$DB/bed/retro
BASE=$RUNDIR/retro
OUTDIR=${BASE}/version${VERSION}/${DB}
RESULT=$OUTDIR/result
RESULTSPLIT=$OUTDIR/resultSplit
LOG=$OUTDIR/log
OUT=$OUTDIR/out
OVERLAPDIR=$OUTDIR/run.o
TABLE=ucscRetroInfo$VERSION
ORTHOTABLE=ucscRetroOrtho$VERSION
ALIGN=ucscRetroAli$VERSION
LOCAL=/scratch/data/$DB
TWOBIT=$GENOME/$DB/$DB.2bit
RMSK=rmsk
NET1=netMm10
NET2=netCanFam3
NET3=netRheMac3
# these two nets determine which retros are classified as ancient,
# use two farthest nets
ANCIENT1=netMm10
ANCIENT2=netCanFam3
GENE1=knownGene
GENE2=refGene
GENE3=wgEncodeGencodeCompV19
CLUSTER=ku
SPECIES="hg38 mm10"
ROOTDIR="/cluster/home/hartera/public_html/retro/hg38Jun14"
WEBROOT=$ROOTDIR/retro.$VERSION
WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu
SHUFFLEDIR=shuffle
SHUFFLEROOT=$WEBROOT/$SHUFFLEDIR
DUPDIR=dups
DUPROOT=$WEBROOT/$DUPDIR
AGEDIR=age
AGEROOT=$WEBROOT/$AGEDIR
EXPDIR=exp
GENEPFAM=knownGene
PFAM=knownToPfam
PFAMIDFIELD=name
PFAMDOMAIN=value
# left empty for this run, the sibTxGraph setting is kept commented out
ALTSPLICE=
#ALTSPLICE=sibTxGraph
SPLITBYAGE=$SCRIPT/splitRetrosByAge
PDB=proteins140122
#ARRAY=gnfAtlas2
#AFFYPROBE="affyU133A,affyGnf1h"
#ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median
#ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio
#ARRAYABS=hgFixed.gnfHumanAtlas2All
#ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps
#ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps
#ARRAYLOOKUP=knownToGnfAtlas2
#ARRAYPSLS="/hive/data/genomes/hg19/bed/geneAtlas2/affyU133A.psl /hive/data/genomes/hg19/bed/geneAtlas2/affyGnf1h.psl"
'_EOF_'
    # << happy emacs
chmod +x DEF

mkdir -p /hive/data/genomes/hg38/bed/retro
mkdir -p /hive/data/genomes/hg38/bed/mrnaBlastz.9
mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz
cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz
cp ../DEF .

# Create S1.len file
# Use chrom.sizes without random chroms or chrM, there are many alt loci also
# in hg38 that were not in hg19 so 285 chroms total.
cat /hive/data/genomes/hg38/chrom.sizes | grep -v random \
   | grep -v chrUn | grep -v chrM > S1.len
cp S1.len /hive/data/genomes/hg38/bed/mrnaBlastz.9

screen
# Run steps 1 to 5 of RetroFinder pipeline from scripts in CCDS SVN source tree:
retroFinder/branches/version2/src/pipeline/ucscStep1.sh DEF
# check cluster jobs on ku
retroFinder/branches/version2/src/pipeline/ucscStep2.sh DEF
retroFinder/branches/version2/src/pipeline/ucscStep3.sh DEF
#check cluster jobs on ku
retroFinder/branches/version2/src/pipeline/ucscStep4.sh DEF
#check cluster jobs on ku
    # Load the track
retroFinder/branches/version2/src/pipeline/ucscStep5.sh DEF
cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38
retroFinder/branches/version2/src/pipeline/filterMrna.sh
retroFinder/branches/version2/src/pipeline/filterEst.sh
# Check cluster jobs on ku
retroFinder/branches/version2/src/pipeline/analyseExpress.sh
# Check cluster jobs on ku
#added ucscRetroAli9 to kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra
# copied
# /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38/trackDb.retro
# entry to kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra and edited it to
# remove the full date and add:
# dataVersion Jan. 2015
# Scripts copied ucscRetroAli9.psl, ucscRetroInfo9.bed and ucscRetroCds9.tab
# to /hive/data/genomes/hg38/bed/retro/

##########
# Make dbVar chrom to UCSC chrom lift file
#  DONE braney 2/12/15
cd /cluster/data/hg38/jkStuff
# join(1) needs both inputs sorted on the join field, so make a sorted copy
# of chrom.sizes (UCSC name, size).
sort /cluster/data/hg38/chrom.sizes > tmpChrom
# From the assembly report (skipping comment lines and any line containing
# "assembled"), build two names per sequence:
#   1) a UCSC-style name from columns 3, 5 and 2 (column 5 is the INSDC
#      accession; columns 3 and 2 are presumably assigned-molecule and
#      sequence-role -- TODO confirm against the report header)
#   2) "chr<col3>|<col7>", where column 7 is the RefSeq accession
# The sed steps reshape name 1: drop "-scaffold", turn "unlocalized" into
# "random", drop "_unplaced", turn "chrna" into "chrUn", and change the first
# "." to "v".  Joining with tmpChrom on the UCSC name picks up
# the size, and the final awk emits 5-column lift lines:
#   0  chr<col3>|<RefSeq>  size  ucscName  size
grep -v '^#\|assembled' /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt | awk 'BEGIN {OFS="\t"} {print "chr" $3 "_" $5 "_" $2, "chr" $3 "|"$7}' | sed 's/-scaffold//' | sed 's/unlocalized/random/' | sed 's/_unplaced//' | sed 's/chrna/chrUn/g' | sed 's/\./v/'  | sort | join /dev/stdin tmpChrom | awk 'BEGIN {OFS="\t"} {print 0, $2, $3, $1, $3}'  > dbVar.lift
# Append identity entries (offset 0, name -> same name) for every sequence in
# chrom.sizes.
awk 'BEGIN {OFS="\t"} {print 0, $1, $2, $1, $2}' /cluster/data/hg38/chrom.sizes >> dbVar.lift
rm tmpChrom

#########################################################################
# UCSC to RefSeq name correspondence (DONE - 2015-04-13 - Hiram)

    mkdir /hive/data/genomes/hg38/bed/ucscToRefSeq
    cd /hive/data/genomes/hg38/bed/ucscToRefSeq

    # columns 5 and 7 are the INSDC and RefSeq names

    grep -v "^#" ../../genbank/GCF_000001405.26.assembly.txt \
      | awk -F'\t' '{printf "%s\t%s\n", $5,$7}'  | sort > insdc.refSeq.tab

    hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' hg38 \
      | sort > insdc.ucsc.tab

    join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \
       | cut -f2- > ucsc.refSeq.tab


    export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1`
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab hg38 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab

    checkTableCoords  hg38 -table=ucscToRefSeq

#########################################################################
#CREATE MICROSAT TRACK (DONE - 2015-05-22 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/hg38/bed/microsat
    cd /cluster/data/hg38/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed hg38 microsat microsat.bed

#############################################################################
# ENCODE Regulatory tracks  (Kate & Chris)

# see reg.txt
#########################################################################
# GWIPS-viz Ribo-seq - (DONE - 2016-02-05 - Steve)
# contact Audrey Michel (audreymannion@gmail.com)
# redmine #16765

# obtained bigWig file from shared Google drive
# https://drive.google.com/a/soe.ucsc.edu/folderview?id=0B_xvV_5tXzOGQ1h5NEh4bnhNTDg&usp=sharing_eid

mkdir /hive/data/genomes/hg38/bed/gwipsvizRiboseq
cp Global_RiboProElong.10_02_2016.bw /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw

mkdir /gbdb/hg38/bbi/gwipsvizRiboseq
cd /gbdb/hg38/bbi/gwipsvizRiboseq
ln -s /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw gwipsvizRiboseq.bw

hgsql hg38
create table gwipsvizRiboseq select * from gc5BaseBw;
update gwipsvizRiboseq set fileName="/gbdb/hg38/bbi/gwipsvizRiboseq/gwipsvizRiboseq.bw" where fileName="/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw";

#########################################################################
# COSMIC v81 DONE Chris Eisenhart 2017-05-11
# Make a new COSMIC track for hg38 v81
mkdir /hive/data/outside/cosmic/hg38/v81
cd /hive/data/outside/cosmic/hg38/v81

# Get the new data
sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk
# Login to SFTP server then run these commands
get /files/grch38/cosmic/v81/CosmicMutantExport.tsv.gz

# Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts.
zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv

# Use a script to convert to bed format.
cosmicToBed cosMut.tsv cosMut.bed
# This many lines were skipped, 131597 for not having genomic coordinate

# Sort and convert to big bed using the .as file.
sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed
bedToBigBed -type=bed4+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V81.bb -tab -extraIndex=name,cosmLabel

# Link it up so the outside world can see it.
cd /gbdb/hg38/cosmic/
ln -s /hive/data/outside/cosmic/hg38/v81/cosMutHg38V81.bb .
#########################################################################
# hoffmanMappability hub import (2 super tracks) DONE Chris Eisenhart 2017-05-16
mkdir /hive/data/outside/hoffmanMappability/hg38
cd /hive/data/outside/hoffmanMappability/hg38
wget https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/trackDb.txt
# Get the trackDb file
importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ --test
# Check that the commands are what we want, then run for real
importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/
# View the .ra file to make sure things are ok, here changed the groups to map,
# added the alpha tags, and removed the 'show' from 'superTrack on show'
cp hofMap.ra ~/kent/src/hg/makeDb/trackDb/human/hg38
# Include hofMap.ra in the trackDb.ra file

# the importTrackHub failed on redirection, fetch all the files manually:
# 2017-09-15 - Hiram

cd /hive/data/outside/hoffmanMappability/hg38

grep bigDataUrl trackDb.txt | awk '{print $NF}' | sed -e 's#https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/##;' | while read F
do
  echo $F
  rm -f $F
  wget --timestamping "https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/${F}"
done
    # real    29m40.429s

#########################################################################
# tcgaExpr super track Chris Eisenhart, DONE, 2017-05-17
# tcgaTranscExpr
# TCGA transcript level expression barChart track, from TOIL pipeline recompute (John Vivian)
# biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf
mkdir /hive/data/outside/tcgaBarcharts/
mkdir /hive/data/outside/tcgaBarcharts/transcripts
cd /hive/data/outside/tcgaBarcharts/transcripts

# Get all the meta data
cp ~max/projects/cirm/datasetPages/tcgaGtex/tcgaMeta.tab .
# Cut out the meta data the script wants, sample name and group.
cut -f 1,5 tcgaMeta.tab | sed 's/ /_/g' > tcgaLargeSamples.tsv

# Get and clean the matrix
cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.tpm.tab .
# Clean up the transcript names (remove the .#)
cut -f 1 tcga.tpm.tab | cut -f 1 -d "." > tcgaTranscripts.txt
cut -f 2- tcga.tpm.tab > tcgaTpmValues.tsv
paste tcgaTranscripts.txt tcgaTpmValues.tsv > tcgaMatrix.tsv

# Build a coordinate map
hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene
hgsql hg38 -e "select * from ensemblToGeneName" | sort >  ensemblToGeneName
join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed

# Use the meta data, matrix, and coordinate map to generate a barchart bed
time expMatrixToBarchartBed tcgaLargeSamples.tsv tcgaMatrix.tsv coord.bed tcgaTransExp.bed --groupOrder tcgaGroupOrder.txt

# NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb.
# The order of the labels in the barChartBars field should match the order of the labels in the
# expScores column in the bed file header.

# Sort and convert into a bigBed file.
sort -k1,1 -k2,2n tcgaTransExp.bed > sortedTcgaTransExp.bed
bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartTranscExp.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTransExp.bb

# Link the files into gbdb
cd /gbdb/hgFixed/human/expMatrix
ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaLargeSamples.tsv tcgaLargeSamples.tab
ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaMatrix.tsv tcgaMatrix.tab
ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaTransExp.bb .

###########
# Reload bigBed with a schema that will be shared with genes track, to support
# configuration as subtracks in a composite
# (2017-08-30 kate)
cd /hive/data/outside/tcgaBarcharts/transcripts
bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTranscExpr.hg38.bb
mkdir /gbdb/hg38/tcga
ln -s `pwd`/tcgaTranscExpr.hg38.bb /gbdb/hg38/tcga/tcgaTranscExpr.bb

# TCGA gene level expression barChart track, from TOIL pipeline recompute (John Vivian)
# tcgaGeneExpr
mkdir ../genes
cd ../genes

# Get the gene matrix.
cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.geneTpm.tab .

# Make a coordinate file, the genes in gtexGeneModelV6 have .# versions which are
# removed with the temp files.
hgsql hg38 -e "select * from hg38.gtexGeneModelV6" | awk '{print $3"\t"$5"\t"$6"\t"$2"\t0\t"$4"\t"$2}' > coord6+1.bed.temp
cut -f 4 coord6+1.bed.temp | cut -f 1 -d "." > foo
cut -f 1-3 coord6+1.bed.temp > foo2
paste foo2 foo > foo3
cut -f 5- coord6+1.bed.temp > foo4
paste foo3 foo4 > coord6+1.bed
# This bed file didn't have the right gene names (ENS rather than Hugo), fix it.
hgsql hg38 -e "select * From knownCanonical" > foo
wc foo
cut -f 6 foo | cut -f 1 -d "."
cut -f 6 foo | cut -f 1 -d "." > foo2
head foo
cut -f 1-3 foo > foo3
paste foo2 foo3 > foo4
cut -f 4- coord6+1.bed > foo5
join <(sort foo5) <(sort foo4) | awk '{print $5"\t"$6"\t"$7"\t"$1"\t0\t"$3"\t"$4}' > coord6+1.3.bed

# Generate the bed file, can use the same transcript file
time expMatrixToBarchartBed ../transcripts/tcgaLargeSamples.tsv tcga.geneTpm.tab coord6+1.3.bed tcgaGeneExp.bed --groupOrder=../transcripts/tcgaGroupOrder.txt

# Convert to big bed
sort -k1,1 -k2,2n tcgaGeneExp.bed > sortedTcgaGeneExp.bed
bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExp.as sortedTcgaGeneExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExp.bb

# Link to gbdb
cd /gbdb/hgFixed/human/expMatrix
ln -s /hive/data/outside/tcgaBarcharts/genes/tcgaGeneExp.bb .
ln -s /hive/data/outside/tcgaBarcharts/genes/tcga.geneTpm.tab tcgaGeneMatrix.tab

###########
# Reload bigBed with a schema that will be shared with transcript track, to support
# configuration as subtracks in a composite
# Apparently Chris actually loaded the #3 file (added gene names, adjusted end coord apparently)
# (2017-08-30 kate)
cd /hive/data/outside/tcgaBarcharts/genes
# NOTE(review): sortedTcgaGeneExp3.bed is not created by any step recorded in
# this section; presumably it is the sorted form of coord6+1.3.bed output -- confirm.
bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaGeneExp3.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExpr.hg38.bb
# -p: this directory was already created when the transcript bigBed was linked
mkdir -p /gbdb/hg38/tcga
ln -s `pwd`/tcgaGeneExpr.hg38.bb /gbdb/hg38/tcga/tcgaGeneExpr.bb

#########################################################################
# gtexTransExp Chris Eisenhart, done, 2017-05-23
# GTEx transcript level RNA-seq, from TOIL pipeline recompute (John Vivian)
# biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf
mkdir /hive/data/outside/gtex/barChartTrack
cd /hive/data/outside/gtex/barChartTrack

# Seems John included some TCGA data (CML) in the GTEx matrix and samples, the cleaning steps remove this.
# Make a clean sample file
# Column 2 of the result is the group name: tab-delimit around the sex field,
# keep the third tab field, then squeeze " - " to "-" and spaces to "_".
cat ../johnVivianRecompute/sraToSample.txt | sed 's/ male /\tmale\t/g' | sed 's/ female /\tfemale\t/g' | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' > gtexSampleGroups.txt
# Column 1 is the first space-separated word of each line (the sample/run name).
cat ../johnVivianRecompute/sraToSample.txt | cut -f 1 -d " " > gtexSampleNames.txt
# Written as .tsv so that the grep steps below read the file produced here.
paste gtexSampleNames.txt gtexSampleGroups.txt > gtexSamples.tsv
grep -v '(CML)' gtexSamples.tsv > cleanGtexSamples.tsv

# Make a clean matrix
# Strip the ".#" version suffix from the transcript ids in column 1.
cut -f 1 ../johnVivianRecompute/gtex.tpm.tab | cut -f 1 -d "." > gtexTranscripts.txt
cut -f 2- ../johnVivianRecompute/gtex.tpm.tab > gtexTpmValues.tsv
paste gtexTranscripts.txt gtexTpmValues.tsv > gtexMatrix.tsv
# Transpose so each sample is a row, keep only the non-CML samples via join,
# then transpose back.  The sed renames the sample-file header "Run_s" to
# "#transcript" so that it matches the matrix header row in the join.
rowsToCols gtexMatrix.tsv tspsdGtexMatrix.tsv
sort tspsdGtexMatrix.tsv > sortedTspsdGtexMatrix.tsv
grep -v '(CML)' gtexSamples.tsv | cut -f 1 | sed 's/Run_s/#transcript/g' | sort > sortedCleanGtexSamples.tsv
join sortedCleanGtexSamples.tsv sortedTspsdGtexMatrix.tsv > cleanTspsdGtexMatrix.tsv
# Input name must be the join output written on the previous line.
rowsToCols cleanTspsdGtexMatrix.tsv cleanGtexMatrix.tsv

# Build a coordinate map
hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene
hgsql hg38 -e "select * from ensemblToGeneName" | sort >  ensemblToGeneName
join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed
# NOTE: CHRISL10-05-2021 - the above ensGene steps weren't actually done or the files were removed,
# there was a coord.tsv which I used instead so the below re-run could work
tawk '{print $1,$2,$3,$4,0,$5,$6}' coord.tsv > coord.bed
# END CHRISL10-05-2021 NOTE)

# Get the gtex ordering
hgsql hgFixed -e "select * from gtexTissue" | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' | sed '1D' > gtexGroupOrder.txt

# Use the meta data, matrix, and coordinate map to generate a barchart bed
# NOTE: CHRISL10-05-2021 - re-ran this step to fix float parsing bug:
time expMatrixToBarchartBed cleanGtexSamples.tsv cleanGtexMatrix.tsv coord.bed gtexTransExp.bed --groupOrderFile gtexGroupOrder.txt

# NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb.
# The order of the labels in the barChartBars field should match the order of the labels in the
# expScores column in the bed file header.

# Sort and convert into a bigBed file.
sort -k1,1 -k2,2n gtexTransExp.bed > sortedGtexTransExp.bed
# NOTE: CHRISL10-05-2021 - re-ran bedToBigBed step with correct file names
bedToBigBed -as=$HOME/kent/src/hg/lib/barChartBed.as -type=bed6+5 sortedGtexTransExp.bed /hive/data/genomes/hg38/chrom.sizes gtexTranscExpr.bb

# Link the files into gbdb
# The sample and matrix files are exposed under .tab names.
# NOTE(review): the original notes spelled the second link "cleanGtexMatris.tab";
# confirm that the link on disk and the trackDb matrix URL agree on the name.
cd /gbdb/hgFixed/human/expMatrix
ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexSamples.tsv cleanGtexSamples.tab
ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexMatrix.tsv cleanGtexMatrix.tab

# (2017-08-30 kate)
cd /gbdb/hg38/gtex
ln -s /hive/data/outside/gtex/barChartTrack/gtexTranscExpr.bb .

#########################################################################
# LASTZ human/hg38 vs. Zebrafish /danRer11
#	(DONE - 2017-06-12 - Chris)

    mkdir /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12
    cd /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12

    printf '# human vs zebrafish danRer11
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_M=254

# TARGET: human hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=40000000
SEQ1_LIMIT=20
SEQ1_LAP=10000

# QUERY: zebrafish danRer11
SEQ2_DIR=/hive/data/genomes/danRer11/danRer11.2bit
SEQ2_LEN=/hive/data/genomes/danRer11/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=200
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -noDbNameCheck -syntenicNet) > do.log 2>&1
    # real    3327m39.074s

	cat fb.hg38.chainDanRer11Link.txt
    # 41036733 bases of 3049335806 (1.346%) in intersection

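    # NOTE(review): stray figure, originally uncommented; it disagrees with the
    # result above and may be left over from another alignment's notes:
    # 973293331 bases of 3049335806 (31.918%) in intersection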

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg38 danRer11) \
       > rbest.log 2>&1 &

    # and for the swap:
    mkdir /hive/data/genomes/danRer11/bed/blastz.hg38.swap
    cd /hive/data/genomes/danRer11/bed/blastz.hg38.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -noDbNameCheck -syntenicNet) > swap.log 2>&1
	#  real	39m24.916s

    cat fb.danRer11.chainHg38Link.txt
    # 47869194 bases of 1674677181 (2.858%) in intersection

    # reciprocal best for the swapped direction, run in the background with
    # its log captured in rbest.log
    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 hg38) \
       > rbest.log 2>&1 &
    # real	638m45.337s
#########################################################################
# refSeqFuncElems NCBI refSeq functional elements, REDONE 2017-11-29 Angie
# previously done 2017-08-01 by Chris E

mkdir /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29
cd /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29

# NOTE FOR NEXT TIME: instead of using interim GFF, in the future these annotations might be
# folded into the same main release GFF3 from which the ncbiRefSeq* tables are extracted by
# doNcbiRefSeq.pl.
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/GFF_interim/interim_GRCh38.p11_top_level_2017-06-27.gff3.gz

# Get mapping of RefSeq NC_* chromosome accs (and NT_*, NW_*) to hg38 chrom names
hgsql hg38 -NBe 'select alias, chrom from chromAlias where source = "refseq" order by alias' \
> refSeqToChrom.tab
cut -f 2 refSeqToChrom.tab | sed -e 's/^/^/' > chrom.tab

# Use Terence Murphy's list of feature types (and the multi-type attribute regulatory_class)
# to identify Functional Elements and swap in hg38 chrom names.
# Use subColumn -miss so it doesn't quit when it sees a patch contig that doesn't map to an
# hg38 chrom.  Use grep -f chrom.tab to filter out patch contig annotations.
zcat interim_GRCh38.p11_top_level_2017-06-27.gff3.gz \
| grep -P "(\t(CAAT_signal|GC_rich_promoter_region|TATA_box|enhancer|insulator|locus_control_region|mobile_genetic_element|origin_of_replication|promoter|protein_binding_site|recombination_feature|regulatory_region|repeat_region|sequence_feature|sequence_secondary_structure|silencer|stem_loop)\t|regulatory_class=)" \
| subColumn -miss=/dev/null 1 stdin refSeqToChrom.tab stdout \
| grep -f chrom.tab > funcElems.gff
wc -l funcElems.gff
#5756 funcElems.gff

# Transform GFF to BED+
~/kent/src/hg/utils/automation/parseRefSeqFuncElems funcElems.gff /dev/stdout \
| sort -k1,1 -k2n,2n > refSeqFuncElems.bed
wc -l refSeqFuncElems.bed
#5756 refSeqFuncElems.bed

# Make bigBed and link from /gbdb
bedToBigBed -tab -type=bed9+7 -as=$HOME/kent/src/hg/lib/refSeqFuncElems.as \
  refSeqFuncElems.bed /hive/data/genomes/hg38/chrom.sizes refSeqFuncElems.bb
rm -f /gbdb/hg38/ncbiRefSeq/refSeqFuncElems.bb
ln -s `pwd`/refSeqFuncElems.bb /gbdb/hg38/ncbiRefSeq/

###################################################################
# cosmicRegions (DONE 2017-08-03 Chris)
# Make a new COSMIC track for hg38 v82
mkdir /hive/data/outside/cosmic/hg38/v82
cd /hive/data/outside/cosmic/hg38/v82

# Get the new data
sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk
# Login to SFTP server then run these commands
get /files/grch38/cosmic/v82/CosmicMutantExport.tsv.gz

# Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts.
zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv

# Use a script to convert to bed format.
cosmicToBed cosMut.tsv cosMut.bed
# This many lines were skipped, 134601 for not having genomic coordinate

# Sort and convert to big bed using the .as file.
sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed
bedToBigBed -type=bed8+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V82.bb -tab -extraIndex=name,cosmLabel


# Link it up so the outside world can see it.
cd /gbdb/hg38/cosmic/
ln -s /hive/data/outside/cosmic/hg38/v82/cosMutHg38V82.bb .

#########################################################################
# RepeatMasker Visualization track update (DONE - 2018-05-04 - ChrisL)
    screen -S rmskJoined.2018-05-04
    mkdir /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04
    cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04

    ln -s ../repeatMasker/hg38.sorted.fa.out .
    ln -s ../repeatMasker/hg38.fa.align.gz .

    # this script points to the most recent RepeatMasker version:
    time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \
        -out hg38.sorted.fa.out -align hg38.fa.align.gz) > do.log 2>&1 &

    # no differences, forgot to remake rmsk files
    # so instead remake the rmsk track and try again
    mkdir /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04
    cd /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04

    # remake the sorted.fa.out and fa.align.gz, stop after masking
    # so rmsk table isn't overwritten
    time (doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \
       -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38) > mask.log 2>&1 &
    # RepeatMasker bug?: Undefined id, line 1440295 of input:
    #    10  26.1  0.0  0.0  chr13     114292339 114292382   (71946) C  L1P4           LINE/L1               (17) 6149   6106
    # RepeatMasker bug?: Undefined id, line 3529762 of input:
    #   992   2.3  0.5  0.0  chr3      180461254 180462048 (17833511) C  L1PA3          LINE/L1                (3) 6152   5354
    # RepeatMasker bug?: Undefined id, line 3529763 of input:
    #  1153   3.2  0.2  0.0  chr3      180462043 180463006 (17832553) +  L1PA3          LINE/L1               4392 5357  (789)
    # RepeatMasker bug?: Undefined id, line 5303571 of input:
    #   220  22.5  0.0 17.7  chr9      105798076 105799127 (32595590) C  SATR2          Satellite              (4)  866      1
    # real    643m17.617s

    # get rid of the missing id items:
    grep -v "114292339 114292382\|180461254 180462048\|180462043 180463006\|105798076 105799127" \
        hg38.fa.out > clean.hg38.fa.out
    mv clean.hg38.fa.out hg38.fa.out

    # finish the last step of doCat.csh:
    /cluster/bin/scripts/extractNestedRepeats.pl hg38.fa.out | sort -k1,1 -k2,2n > hg38.nestedRepeats.bed

    cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04

    # remove the links to the previous repeatMasker run and the tables that
    # were generated from them
    rm hg38.sorted.fa.out
    rm hg38.fa.align.gz
    rm *.tsv
    # link in the files from the new repeatMasker run; the align link must
    # carry the .gz name that is passed to -align below
    ln -s ../repeatMasker.2018-05-04/hg38.sorted.fa.out .
    ln -s ../repeatMasker.2018-05-04/hg38.fa.align.gz .

    # and then re-run
    time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \
        -out hg38.sorted.fa.out -align hg38.fa.align.gz) > rerun.log 2>&1 &
    # real    141m7.268s

    # confirm the counts are different from the previous version:
    # wc -l ../rmskJoined/hg38.fa.align.tsv ../rmskJoined/hg38.sorted.fa.join.bed ../rmskJoined/hg38.sorted.fa.out.tsv
    #  7203858 ../rmskJoined/hg38.fa.align.tsv
    #  4607727 ../rmskJoined/hg38.sorted.fa.join.bed
    #  5520118 ../rmskJoined/hg38.sorted.fa.out.tsv
    # 17331703 total
    # wc -l *.tsv
    #  7227245 hg38.fa.align.tsv
    #  4828114 hg38.sorted.fa.join.tsv
    #  5916189 hg38.sorted.fa.out.tsv
    # 17971548 total

    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \
        -renameSqlTable -verbose=4 -tab \
            -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \
                rmskJoinedCurrent hg38.sorted.fa.join.tsv \
                    > loadJoined.log 2>&1

    hgLoadSqlTab hg38 rmskAlignCurrent \
        /cluster/home/chmalee/kent/src/hg/lib/rmskAlign.sql \
            hg38.fa.align.tsv > loadAlign.log 2>&1

    hgLoadOutJoined -verbose=2 -table=rmskOutCurrent hg38 hg38.sorted.fa.out > loadOut.log 2>&1

    featureBits -countGaps hg38 rmskJoinedCurrent
    # 2796899855 bases of 3209286105 (87.150%) in intersection
#########################################################################
# Hi-C Visualization based on Krietenstein 2019 (DONE - 2019-10-07 - Jonathan)
mkdir -p /hive/data/genomes/hg38/bed/hic
cd /hive/data/genomes/hg38/bed/hic

# Files are located on 4D Nucleome (data.4dnucleome.org).  The URL for the paper on that
# site is https://data.4dnucleome.org/publications/b13590b2-a341-4e5e-ad5e-72e233b32e9d/.
# The four file IDs downloaded below are for contact matrix .hic files created for
# different cell-line/protocol combinations
wget 'https://data.4dnucleome.org/files-processed/4DNFI2TK7L2F/@@download/4DNFI2TK7L2F.hic' # H1-hESC Micro-C XL
wget 'https://data.4dnucleome.org/files-processed/4DNFIQYQWPF5/@@download/4DNFIQYQWPF5.hic' # H1-hESC in situ
wget 'https://data.4dnucleome.org/files-processed/4DNFI18Q799K/@@download/4DNFI18Q799K.hic' # HFFc6 Micro-C XL
wget 'https://data.4dnucleome.org/files-processed/4DNFIFLJLIS5/@@download/4DNFIFLJLIS5.hic' # HFFc6 in situ

# Write a README describing where the four .hic files came from and which
# cell line / protocol each one holds.
printf "All files were downloaded from the 4D Nucleome Data Portal at data.4dnucleome.org.
These are processed contact matrices from Krietenstein et al. (2019) Ultrastructural details
of mammalian chromosome architecture. (https://www.biorxiv.org/content/10.1101/639922v1).

4DNFI2TK7L2F.hic - Micro-C XL data set on H1-hESC
4DNFIQYQWPF5.hic - in situ Hi-C data set on H1-hESC
4DNFI18Q799K.hic - Micro-C  XL data set on HFFc6
4DNFIFLJLIS5.hic - in situ Hi-C data set on HFFc6" > README.txt

mkdir -p /gbdb/hg38/bbi/hic
cd /gbdb/hg38/bbi/hic
ln -s /hive/data/genomes/hg38/bed/hic/* .


#########################################################################
# LASTZ Self/hg38 (DONE 2020-02-11 - Angie)
    # RM #24695
    # Re-run with updated process to include pslDropOverlap .
    # Use "contigs" from previous run lastzSelf.2014-01-25/hg38.self.2bit

    screen -S hg38Self -t hg38Self
    mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
    cat << _EOF_ > DEF
# human vs human with mouse defaults
BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz

# TARGET: Human hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Human hg38
SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
TMPDIR=/dev/shm
_EOF_

    # NOTE FOR NEXT TIME: use -chainMinScore=10000 (at least), not 3000

    ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
        -stop=net >& do.log &
    tail -f do.log


    # After two days, 4 jobs are running, one of which (part014.lst vs itself) crashed with
    # out-of-mem error.  After 3 days, 3 jobs completed but part014.lst runs lastz out of mem.
    # Split part014.lst up into components, run on hgwdev (more mem).
    mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014
    mkdir psl
    cp /dev/null jobList
    for t in $(cat ../tParts/part014.lst); do
      tBase=$(basename $t)
      for q in $(cat ../tParts/part014.lst); do
        qBase=$(basename $q)
        echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $t $q ../../DEF {check out exists psl/${tBase}_${qBase}.psl }" >> jobList
      done
    done
    para create jobList
    para try, check, push, etc,
    # 94 of the jobs ran for 12s or less.  The other 6 are chr{X_Y}_00 vs. self & each other,
    # chr13_16 vs self and chr16_03 vs self.  All but chr16_03 vs self completed in < 6 minutes.
#Completed: 99 of 100 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:       1559s      25.98m     0.43h    0.02d  0.000 y
#IO & Wait Time:                   248s       4.14m     0.07h    0.00d  0.000 y
#Average job time:                  18s       0.30m     0.01h    0.00d
#Longest finished job:             321s       5.35m     0.09h    0.00d
#Submission to last job:         94681s    1578.02m    26.30h    1.10d

    # Dang, chr16_03 vs. self still runs out of mem even on hgwdev.
    mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03
    twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 \
      chr16_03.fa
    faSplit -lift=chr16_03.lift size chr16_03.fa 169000 chr16_03_split_
    faToTwoBit chr16_03_split_*.fa chr16_03_split.2bit
    twoBitInfo chr16_03_split.2bit stdout | sort -k2nr > chr16_03_split.sizes
    sed -re 's@CTGDIR.*@CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.2bit@;
             s@CTGLEN.*@CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.sizes@;' \
      ../../../DEF > DEF.split
    mkdir psl
    cwd=$(pwd)
    while read tBase tSize; do
      while read qBase qSize; do
        echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $cwd/chr16_03_split.2bit:$tBase:0-$tSize $cwd/chr16_03_split.2bit:$qBase:0-$qSize DEF.split {check out exists psl/${tBase}_${qBase}.psl}"
      done < chr16_03_split.sizes
    done < chr16_03_split.sizes > jobList
    para create jobList
    para try, check, push, etc,
#Completed: 100 of 100 jobs
#CPU time in finished jobs:     142614s    2376.89m    39.61h    1.65d  0.005 y
#IO & Wait Time:                   167s       2.79m     0.05h    0.00d  0.000 y
#Average job time:                1428s      23.80m     0.40h    0.02d
#Longest finished job:           22861s     381.02m     6.35h    0.26d
#Submission to last job:         22874s     381.23m     6.35h    0.26d
    # 6 hours for chr16_03_split_00 vs. itself.  ~4.5h for _09 vs _00.
    cat psl/*.psl \
    | liftUp -nohead -type=.psl stdout \
        chr16_03.lift error stdin \
    | liftUp -nohead -type=.psl -pslQ \
        ../psl/hg38.self.2bit:chr16_03:0-1689648_hg38.self.2bit:chr16_03:0-1689648.psl \
        chr16_03.lift error stdin

    cd ..
    cat psl/* > ../../psl/part014.lst/part014.lst_part014.lst.psl

    # Make run.time file or doBlastzChainNet.pl won't continue:
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz
    para time >& run.time

    # Resume doBlastzChainNet.pl:
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
    ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
        -continue=cat -stop=net >& do2.log &
    tail -f do2.log
#Batch failed after 4 tries on chain.csh part016.lst chain/part016.lst.chain
#Command failed:
#ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev nice /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/doChainRun.csh

    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run
    para problems
    # mostly these:
#errAbort re-entered due to out-of-memory condition. Exiting.
    # one job made it through errAbort:
#needLargeMem: Out of memory - request size 564838920 bytes, errno: 12
    para time
#Completed: 59 of 68 jobs
#Crashed: 9 jobs
#CPU time in finished jobs:      24727s     412.12m     6.87h    0.29d  0.001 y
#IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
#Average job time:                 409s       6.82m     0.11h    0.00d
#Longest finished job:            2350s      39.17m     0.65h    0.03d
#Submission to last job:          2462s      41.03m     0.68h    0.03d
    para crashed
#chain.csh part012.lst {check out line+ chain/part012.lst.chain}
#chain.csh part017.lst {check out line+ chain/part017.lst.chain}
#chain.csh part016.lst {check out line+ chain/part016.lst.chain}
#chain.csh part015.lst {check out line+ chain/part015.lst.chain}
#chain.csh part014.lst {check out line+ chain/part014.lst.chain}
#chain.csh hg38.self.2bit:chr1_10: {check out line+ chain/hg38.self.2bit:chr1_10:.chain}
#chain.csh hg38.self.2bit:chr10_05: {check out line+ chain/hg38.self.2bit:chr10_05:.chain}
#chain.csh hg38.self.2bit:chr7_00: {check out line+ chain/hg38.self.2bit:chr7_00:.chain}

    # Run the jobs outside of parasol (~11h):
    csh -efx chain.csh part012.lst chain/part012.lst.chain &
    csh -efx chain.csh part017.lst chain/part017.lst.chain &
    csh -efx chain.csh part016.lst chain/part016.lst.chain &
    csh -efx chain.csh part015.lst chain/part015.lst.chain &
    csh -efx chain.csh part014.lst chain/part014.lst.chain &
    csh -efx chain.csh hg38.self.2bit:chr1_10: chain/hg38.self.2bit:chr1_10:.chain &
    csh -efx chain.csh hg38.self.2bit:chr10_05: chain/hg38.self.2bit:chr10_05:.chain &
    csh -efx chain.csh hg38.self.2bit:chr7_00: chain/hg38.self.2bit:chr7_00:.chain &
    csh -efx chain.csh hg38.self.2bit:chr16_08: chain/hg38.self.2bit:chr16_08:.chain &

    # Resume doBlastzChainNet.pl again:
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
    ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
        -continue=chainMerge -stop=net >& do3.log &
    tail -f do3.log
# *** All done !  Elapsed time: 19m11s

    # Load track w/new name chainSelfRedo to compare to existing chainSelf:
    hgLoadChain -normScore -tIndex hg38 chainSelfRedo axtChain/hg38.hg38.all.chain.gz

    # No idea why but somehow the liftUp seems not to have worked for part012 and part017,
    # so the all.chain had chr22_31, chr8_01 etc.  :b  run again again.
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run
    mv chain/part012.lst.chain{,.bak}
    mv chain/part017.lst.chain{,.bak}
    csh -efx chain.csh part012.lst chain/part012.lst.chain >& part012.log &
    csh -efx chain.csh part017.lst chain/part017.lst.chain >& part017.log &
    # Those completed successfully.  Dunno why the earlier ones didn't get lifted.
    cd ..
    mv hg38.hg38.all{,.oopsPartUnlifted}.chain.gz
    # Reconstruct hg38.hg38.all.chain.gz (the chainMerge step is just this command):
    find /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/chain -name "*.chain" \
    | chainMergeSort -inputList=stdin \
    | nice gzip -c \
      > /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/hg38.hg38.all.chain.gz

    # NOTE FOR NEXT TIME: this filtering step will be unnecessary when -minScore=10000 is used
    # from the beginning.
    # Filter to minScore of 10000 (too much fluff with -minScore=3000) per Jim (see #24695)
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain
    # Keep the unfiltered chains under a new name, then write the filtered
    # set back to the original file name so downstream paths do not change.
    mv hg38.hg38.all.chain.gz hg38.hg38.all.unfiltered.chain.gz
    chainFilter hg38.hg38.all.unfiltered.chain.gz -minScore=10000 \
    | gzip -c > hg38.hg38.all.chain.gz
    # Reload the comparison table from the filtered chains and check that
    # all loaded coordinates are valid.
    hgLoadChain -normScore -tIndex hg38 chainSelfRedo hg38.hg38.all.chain.gz
    checkTableCoords hg38 chainSelfRedo

    # Rename to chainSelf and update lastz symlinks and downloads
    hgsql hg38 -e 'drop table chainSelf; drop table chainSelfLink;
                   rename table chainSelfRedo to chainSelf;
                   rename table chainSelfRedoLink to chainSelfLink;'
    cd /hive/data/genomes/hg38/bed
    rm lastz.self lastz.hg38
    ln -s lastzSelf.2020-01-27 lastz.self
    ln -s lastzSelf.2020-01-27 lastz.hg38
    cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain
    cp /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/axtChain/README.txt .
    $EDITOR README.txt
    md5sum hg38.hg38.all.chain.gz > md5sum.txt
    # Make sure that the old download dir has only symlinks, no real files, then remove and rebuild.
    ls -lR /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
    rm -r /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
    ln -s /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/{README.txt,hg38.hg38.all.chain.gz,md5sum.txt} .


#########################################################################
# NCBI ReMap alignments (DONE 2020-02-11 Angie)
# RM 24449
    mkdir /hive/data/genomes/hg38/bed/chainHg19ReMap
    cd /hive/data/genomes/hg38/bed/chainHg19ReMap
    wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.25_GRCh37.p13/GCF_000001405.39-GCF_000001405.25.gff
    # We will need to substitute all the RefSeq chrom and contig IDs with our own names.
    # The same alt contig can appear in both assemblies with the same name, so replace
    # hg19 names at the beginning of the line and hg38 names after "Target=".
    hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
    | sed -re 's/\./\\./;' \
    | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \
      > hg38.hg19.chromAlias.sed
    hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
    | sed -re 's/\./\\./;' \
    | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \
      >> hg38.hg19.chromAlias.sed

    # There are some GRCh38.p13 sequences that we have not yet imported into hg38 -- use -dropT.
    sed -f hg38.hg19.chromAlias.sed GCF_000001405.39-GCF_000001405.25.gff \
    | gff3ToPsl -dropT /hive/data/genomes/{hg19,hg38}/chrom.sizes stdin stdout \
    | pslPosTarget stdin stdout \
    | sort -k14,14 -k16n,16n > remap.hg38.hg19.psl

    # Convert to chain for browser display.  Some of the remap chains have minScore < 1000 and
    # by default would be dropped by chainScore... use -minScore=0 to prevent that.
    time pslToChain remap.hg38.hg19.psl stdout \
    | chainScore -minScore=0 stdin /hive/data/genomes/{hg38/hg38.2bit,hg19/hg19.2bit} \
        remap.hg38.hg19.chain
#real    9m31.900s
#user    9m1.624s
#sys     0m20.863s
    hgLoadChain hg38 -tIndex chainHg19ReMap remap.hg38.hg19.chain
#Loading 5315 chains into hg38.chainHg19ReMap
    time axtChain -psl -linearGap=medium -verbose=0 remap.hg38.hg19.psl \
      /hive/data/genomes/hg38/hg38.2bit /hive/data/genomes/hg19/hg19.2bit \
      remap.axtChain.hg38.hg19.chain
#real    2m26.333s
#user    2m4.237s
#sys     0m22.071s
    hgLoadChain hg38 -tIndex chainHg19ReMapAxtChain remap.axtChain.hg38.hg19.chain
#Loading 2115 chains into hg38.chainHg19ReMapAxtChain

###################################################
#Agilent SNP/CNV arrays 3/11/21
#Downloaded by web browser
cd /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto
fetchChromSizes hg38 > hg38.chrom.sizes
# Base names (without extension) of the three downloaded Agilent BED files.
arrays="hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302
hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302
hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302"
# For each array: sort the BED in place, drop adjacent duplicate lines,
# and build the bigBed from the de-duplicated file.
for array in $arrays
do
    bedSort $array.bed $array.bed
    uniq $array.bed > $array.uniq.bed
    bedToBigBed $array.uniq.bed hg38.chrom.sizes $array.bb
done
mkdir -p /gbdb/hg38/snpCnvArrays/agilent
cd /gbdb/hg38/snpCnvArrays/agilent
# Link each bigBed into /gbdb under its own file name.
for array in $arrays
do
    ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/$array.bb
done
vi ~/kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra

#########################################################################
# DECIPHER CNV & SNV - initial build (DONE 2022-04-08 Jonathan)
# RM 29130

# Work in a dated subdirectory of the DECIPHER otto directory.  This is the
# same /hive/data/outside/otto/decipher tree that the /gbdb symlink below
# points into.
cd /hive/data/outside/otto/decipher
mkdir 2022-04-05
cd 2022-04-05

# manually fetch decipher-variants-grch38-2022-04-03.bed from DECIPHER
../buildDecipher decipher-variants-grch38-2022-04-03.bed

# For every table listed in ../decipher.tables, swap the freshly built
# <table>New into place as <table>; the previous <table> becomes <table>Old
# (-dropTable3 is passed so an existing <table>Old does not block the swap).
for i in `cat ../decipher.tables`
        do
        n=$i"New"
        o=$i"Old"
        hgsqlSwapTables hg38 $n $i $o -dropTable3
        done

# Expose the bigBed built above under /gbdb.
mkdir -p /gbdb/hg38/decipher
cd /gbdb/hg38/decipher
ln -s /hive/data/outside/otto/decipher/2022-04-05/decipherCnv.bb .

#########################################################################
# COSMIC (DONE 07-11-2023)
# RM 29625

#Fetch file
cd /hive/data/outside/cosmic/hg38/v98/
wget 'https://cog.sanger.ac.uk/cosmic/GRCh38/ucsc/v98/ucsc_export.bed.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1686847188&Signature=4YV3CuFKudxIhqVdWAaCe0CMAiY%3D' -O ucsc_export.bed.gz
wget 'https://cog.sanger.ac.uk/cosmic/GRCh38/ucsc/v98/ucsc_export.bed.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1687525456&Signature=jBdJOlOOaqmMWNnOtJUyNRptVj4%3D'
mv ucsc_export.bed.gz\?AWSAccessKeyId\=KRV7P7QR9DL41J9EWGA2\&Expires\=1687525456\&Signature\=jBdJOlOOaqmMWNnOtJUyNRptVj4\= ucsc_export.bed.gz

#Reorder the columns to conform to bed 6+3
zcat ucsc_export.bed.gz | awk -F'\t' -v OFS="\t" '{ print $1, $2, $3, $7, 0, $6, $4, $5, $8 }' | sort -k1,1 -k2,2n > cosmic.bed

#Tiny bit of python to identify the broken lines in the file where chromStart > chromEnd

#for line in myFile:
#    newLine = line.split("\t")
#    if int(newLine[1]) > int(newLine[2]):
#        print(line)
#        n+=1
#print(n)

#remove those broken records from the file
cat cosmic.bed | grep -vf badRecords.bed > cosmic.fixed.bed

#Subtract 1 from chromStart for all items that have the same start and end position, to conform to BED format

# Split on tabs only (FS) as well as joining on tabs (OFS): the file is
# tab-separated, and records rebuilt in the $2 == $3 branch would otherwise
# be re-split on any whitespace, shifting columns if a field contains a
# space or is empty.
cat cosmic.fixed.bed | awk 'BEGIN {FS="\t"; OFS="\t"} {
if ($2 == $3)
        print $1,$2-1,$3,$4,$5,$6,$7,$8,$9;
else
        print $0;
}' > cosmic.fixedPos.bed

bedToBigBed -type=bed6+3 -as=/hive/data/outside/cosmic/hg38/v98/cosmic.as /hive/data/outside/cosmic/hg38/v98/cosmic.fixedPos.bed /hive/data/genomes/hg38/chrom.sizes /hive/data/outside/cosmic/hg38/v98/cosmic.bb -tab

#make symlink
ln -s /hive/data/outside/cosmic/hg38/v98/cosmic.bb /gbdb/hg38/cosmic/cosmic.bb

#This data has since been updated, see new makedoc doc/hg38/cosmicV98.txt and rm #32430

##############################################################################
# LIFTOVER TO GCA_018873775.2_hg01243.v3.0 (DONE - 2023-08-13 - Hiram)
    ssh hgwdev
    # going to need an ooc for hg38.p14.2bit
    cd /hive/data/genomes/hg38
    time blat hg38.p14.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=hg38.p14.ooc -repMatch=1024
    # Wrote 36808 overused 11-mers to hg38.p14.ooc
    # real    0m50.753s

    # and ooc for this GenArk hub
    cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0
  time blat GCA_018873775.2_hg01243.v3.0.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=GCA_018873775.2_hg01243.v3.0.ooc -repMatch=1024
# Wrote 39087 overused 11-mers to GCA_018873775.2_hg01243.v3.0.ooc
# real    0m49.426s

  mkdir /hive/data/genomes/hg38/bed/blat.GCA_018873775.2_hg01243.v3.0.2023-08-13
    cd /hive/data/genomes/hg38/bed/blat.GCA_018873775.2_hg01243.v3.0.2023-08-13

    doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
        -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -targetSizes=/hive/data/genomes/hg38/chrom.sizes \
 -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \
 -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \
        -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \
         hg38 GCA_018873775.2

    # trying -ram=6g to get full use of hgwdev kluster nodes
    time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
        -verbose=2 -buildDir=`pwd` \
        -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -targetSizes=/hive/data/genomes/hg38/chrom.sizes \
 -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \
 -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \
        -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \
         hg38 GCA_018873775.2) > doLiftOverToGCA_018873775.2.log 2>&1
    # real    12654m58.134s

    # broken after the alignment was done, with the parasol endless loop
    # error message in the log file:
    #  select failure in rudp: Invalid argument
    # killed that, cleaned the 4Tb log file, and gave up on this alignment
    # since the lastz/chain/net is much better

    # see if the liftOver menus function in the browser from hg38
    #    to GCA_018873775.2

##############################################################################
# LIFTOVER GCA_018873775.2_hg01243.v3.0 to hg38 (DONE - 2023-08-13 - Hiram)
    ssh hgwdev

    mkdir /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/trackData/blat.hg38.2023-08-13
    cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/trackData/blat.hg38.2023-08-13

    doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
        -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
 -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \
 -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \
        -query2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -querySizes=/hive/data/genomes/hg38/chrom.sizes \
        -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.ooc \
         GCA_018873775.2 hg38

    # trying -ram=6g to get full use of hgwdev kluster nodes
    time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
        -verbose=2 -buildDir=`pwd` \
        -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
 -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \
 -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \
        -query2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -querySizes=/hive/data/genomes/hg38/chrom.sizes \
        -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.ooc \
         GCA_018873775.2 hg38) > doLiftOverToHg38.log 2>&1

    # broken after the alignment was done, with the parasol endless loop
    # error message in the log file:
    #  select failure in rudp: Invalid argument
    # killed that, cleaned the 4Tb log file, and gave up on this alignment
    # since the lastz/chain/net is much better
    # real    193m24.137s

    # see if the liftOver menus function in the browser from GCA_018873775.2
    #    to hg38

##############################################################################
# LIFTOVER TO GCA_018503275.1_NA19240.pri.mat.f1_v2 (TBD - 2023-08-14 - Hiram)
    ssh hgwdev

    # ooc for this GenArk hub
    cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2
  time blat GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit /dev/null /dev/null \
      -tileSize=11 -repMatch=1024 \
      -makeOoc=GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc
  # Wrote 35866 overused 11-mers to GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc
    # real    0m32.298s

  mkdir /hive/data/genomes/hg38/bed/blat.GCA_018503275.1_NA19240.pri.mat.f1_v2.2023-08-14
  cd /hive/data/genomes/hg38/bed/blat.GCA_018503275.1_NA19240.pri.mat.f1_v2.2023-08-14

    ~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \
        -buildDir=`pwd` -ram=4g -chainRam=16g \
        -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -targetSizes=/hive/data/genomes/hg38/chrom.sizes \
 -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \
 -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \
        -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \
         hg38 GCA_018503275.1

    # trying -ram=4g to get full use of hgwdev kluster nodes
    time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
        -verbose=2 -buildDir=`pwd` -ram=4g -chainRam=16g \
        -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -targetSizes=/hive/data/genomes/hg38/chrom.sizes \
 -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \
 -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \
        -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \
         hg38 GCA_018503275.1) > doLiftOverToGCA_018503275.1.log 2>&1
    # real    11370m18.026s

    # broken after the alignment was done, with the parasol endless loop
    # error message in the log file:
    #  select failure in rudp: Invalid argument
    # killed that, cleaned the 4Tb log file, and gave up on this alignment
    # since the lastz/chain/net is much better
    # -rw-rw-r-- 1 4363949695640 Aug 22 09:16 doLiftOverToGCA_018503275.1.log

    # see if the liftOver menus function in the browser from hg38
    #    to GCA_018503275.1

##############################################################################
# LIFTOVER GCA_018503275.1_NA19240.pri.mat.f1_v2 to hg38 (DONE - 2023-08-14 - Hiram)
    ssh hgwdev

    mkdir /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/trackData/blat.hg38.2023-08-14
    cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/trackData/blat.hg38.2023-08-14

    ~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \
        -buildDir=`pwd` -ram=4g -chainRam=16g \
        -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
 -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \
 -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \
        -query2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -querySizes=/hive/data/genomes/hg38/chrom.sizes \
        -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc \
         GCA_018503275.1 hg38

    time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \
        -buildDir=`pwd` -ram=4g -chainRam=16g \
        -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \
 -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \
 -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \
        -query2Bit=/hive/data/genomes/hg38/hg38.2bit \
        -querySizes=/hive/data/genomes/hg38/chrom.sizes \
        -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc \
         GCA_018503275.1 hg38) > liftOverToHg38.log 2>&1
    # real    5082m17.500s

    # this is interesting, this alignment completed and actually has good
    # coverage:
    cat fb.GCA_018503275.1.chain.Hg38Link.txt
    # 2928654519 bases of 3032066086 (96.589%) in intersection

    # see if the liftOver menus function in the browser from GCA_018503275.1
    #    to hg38

##############################################################################
## update grp table add new row for HPRC (DONE - 2023-08-29 - Hiram)
## existing structure:

    hgsql -e 'desc grp;' hg38

+-----------------+-----------+------+-----+---------+-------+
| Field           | Type      | Null | Key | Default | Extra |
+-----------------+-----------+------+-----+---------+-------+
| name            | char(255) | NO   | PRI |         |       |
| label           | char(255) | NO   |     |         |       |
| priority        | float     | NO   |     | 0       |       |
| defaultIsClosed | int(11)   | YES  |     | NULL    |       |
+-----------------+-----------+------+-----+---------+-------+

    #  add one new row:
    hgsql hg38 \
      -e "INSERT INTO grp VALUES ('hprc', 'Human Pangenome - HPRC', 3.6, 0);"

    # resulting table:

    hgsql -e 'select * from grp order by priority;' hg38
+------------+------------------------------------+----------+-----------------+
| name       | label                              | priority | defaultIsClosed |
+------------+------------------------------------+----------+-----------------+
| user       | Custom Tracks                      |        1 |               0 |
| remc       | Reference Epigenome Mapping Center |      1.2 |               1 |
| map        | Mapping and Sequencing             |        2 |               0 |
| genes      | Genes and Gene Predictions         |        3 |               0 |
| phenDis    | Phenotype and Literature           |      3.4 |               0 |
| pub        | Literature                         |      3.5 |               0 |
| hprc       | Human Pangenome - HPRC             |      3.6 |               0 |
| covid      | COVID-19                           |      3.6 |               0 |
| singleCell | Single Cell RNA-seq                |      3.7 |               0 |
| rna        | mRNA and EST                       |        4 |               0 |
| expression | Expression                         |      4.5 |               0 |
| regulation | Regulation                         |        5 |               0 |
| compGeno   | Comparative Genomics               |        6 |               0 |
| varRep     | Variation                          |        7 |               0 |
| rep        | Repeats                            |        8 |               0 |
| x          | Experimental                       |       10 |               1 |
+------------+------------------------------------+----------+-----------------+

##############################################################################
# Affy CytoScan HD track, refs #32856  (2024-01-23 Gerardo)
cd /hive/data/genomes/hg38/bed/
mkdir genotypeArrays; cd genotypeArrays
#The user sent Gerardo a direct email with a shared folder link. Gerardo downloaded the bed files and made them available on dev.
#The user provided two bed files (https://hgwdev-gperez2.gi.ucsc.edu/~gperez2/mlq/mlq_32791/). Gerardo used the version 2 bed file for the track.
wget https://hgwdev-gperez2.gi.ucsc.edu/~gperez2/mlq/mlq_32791/CytoScanHD_Accel_Array.na36.bed.zip
unzip CytoScanHD_Accel_Array.na36.bed.zip
# Removed header and sorted the file
grep -v 'track' CytoScanHD_Accel_Array.na36.bed | bedSort stdin stdout > affyCytoScanHD.bed
bedToBigBed -tab -type=bed12 affyCytoScanHD.bed https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes affyCytoScanHD.bb 
cd /gbdb/hg38
mkdir genotypeArrays; cd genotypeArrays
# Making symlink for big file and raw bed file
ln -s /hive/data/genomes/hg38/bed/genotypeArrays/affyCytoScanHD.bb
ln -s /hive/data/genomes/hg38/bed/genotypeArrays/CytoScanHD_Accel_Array.na36.bed.zip
cd ~/kent/src/hg/makeDb/trackDb/human/hg38
vi trackDb.ra

##############################################################################
# LASTZ Human Hg38 vs. California sea lion GCF_009762305.2
#    (DONE - 2024-03-06 - jairo)

    mkdir /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06
    cd /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06

    printf '# California sea lion GCF_009762305.2 vs. Human Hg38
BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz

# TARGET: Human  hg38
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: California sea lion 2020-07-14 GCF_009762305.2_mZalCal1.pri.v2
SEQ2_DIR=/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit
SEQ2_LEN=/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06
TMPDIR=/dev/shm

' > DEF

    time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl -trackHub -noDbNameCheck -verbose=2 `pwd`/DEF -syntenicNet \
       -qAsmId GCF_009762305.2_mZalCal1.pri.v2 -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1
    grep -w real do.log | sed -e 's/^/    # /;'
    # real      1018m28.119s

    sed -e 's/^/    # /;' fb.hg38.chainGCF_009762305.2Link.txt
    # 1633315994 bases of 3299210039 (49.506%) in intersection
    sed -e 's/^/    # /;' fb.hg38.chainSynGCF_009762305.2Link.txt
    # 1564193911 bases of 3299210039 (47.411%) in intersection

    time (~/kent/src/hg/utils/automation/doRecipBest.pl -trackHub -load -workhorse=hgwdev -buildDir=`pwd` \
       \
      -query2Bit="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit" \
-querySizes="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt" \
        hg38 GCF_009762305.2) > rbest.log 2>&1

    grep -w real rbest.log | sed -e 's/^/    # /;'
    # real      303m36.739s

    sed -e 's/^/    # /;' fb.hg38.chainRBest.GCF_009762305.2.txt
    # 1461974620 bases of 3299210039 (44.313%) in intersection

    ### and for the swap

    cd /hive/data/genomes/asmHubs/allBuild/GCF/009/762/305/GCF_009762305.2_mZalCal1.pri.v2/trackData/blastz.hg38.swap

   time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl -trackHub -noDbNameCheck -swap -verbose=2 \
   -qAsmId GCF_009762305.2_mZalCal1.pri.v2 /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06/DEF -swapDir=`pwd` \
  -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
    -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1

    grep -w real swap.log | sed -e 's/^/    # /;'
    # real      103m25.220s

    sed -e 's/^/    # /;' fb.GCF_009762305.2.chainHg38Link.txt
    # 1493183463 bases of 2409685272 (61.966%) in intersection
    sed -e 's/^/    # /;' fb.GCF_009762305.2.chainSynHg38Link.txt
    # 1457122207 bases of 2409685272 (60.469%) in intersection

    # Reciprocal best for the swap direction (sea lion as target, hg38 as
    # query), run in the current (swap) directory via -buildDir=`pwd`.
    # Option spelled -target2Bit to match -query2Bit in the hg38-target run.
    time (~/kent/src/hg/utils/automation/doRecipBest.pl -trackHub -load -workhorse=hgwdev -buildDir=`pwd` \
        -target2Bit="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit" \
        -targetSizes="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt" \
        GCF_009762305.2 hg38) > rbest.log 2>&1

    grep -w real rbest.log | sed -e 's/^/    # /;'
    # real      286m31.189s

    sed -e 's/^/    # /;' fb.GCF_009762305.2.chainRBest.Hg38.txt
    # 1461710350 bases of 2409685272 (60.660%) in intersection

##############################################################################
# hg38.chromAlias.bb was incorrectly built without indexes so it will not
# work with bedToBigBed 2024-04-08 markd

cd /hive/data/genomes/hg38/goldenPath/bigZips/initial
# Keep the index-less file under a new name; it is the input for the rebuild.
mv hg38.chromAlias.bb  hg38.chromAlias.noindexes.bb
# Recover the autoSql definition and the BED rows from the old bigBed.
bigBedInfo -asOut hg38.chromAlias.noindexes.bb >hg38.chromAlias.as
bigBedToBed hg38.chromAlias.noindexes.bb  hg38.chromAlias.bed
# Rebuild under the original name, this time with an extra index on each
# alias column.  Chrom sizes are read from hg38.2bit (-sizesIs2Bit).
bedToBigBed -tab -type=bed3+ -as=hg38.chromAlias.as hg38.chromAlias.bed -sizesIs2Bit  -extraIndex=ucsc,assembly,ensembl,genbank,refseq hg38.2bit hg38.chromAlias.bb

##############################################################################

# ENCODE 4 TF rPeak Clusters - RM #34930 - Lou 12/19/24

mkdir /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks
cd /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks
hubClone -download https://users.wenglab.org/gaomingshi/TF.rpeak.test.txt
ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.rPeaks.bb /gbdb/hg38/bbi/ENCODE4/TFrPeakClusters.bb
ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.decorator.bb /gbdb/hg38/bbi/ENCODE4/TFrPeakClustersDecorator.bb
# Then just moved the files to the ENCODEv4TFrPeaks dir, moved/tweaked HTML and trackDb
#########################################################################
# COSMIC v101 (01-24-2025) Gerardo
# RM 34904

# Karen from COSMIC emailed data files links to Max. Max renamed the files to make it clear which db they belong to.
## $ ll /hive/data/outside/cosmic/ucsc_export*.bed.gz
## rw-rw-r- 1 max protein 198M Jan 14 16:15
## /hive/data/outside/cosmic/ucsc_export.v101.hg19.bed.gz
## rw-rw-r- 1 max protein 198M Jan 14 13:35
## /hive/data/outside/cosmic/ucsc_export.v101.hg38.bed.gz

mkdir -p /hive/data/outside/cosmic/{hg19,hg38}/v101

cd /hive/data/outside/cosmic/hg38/v101
# Reorder the export's columns into bed6+3 order (score column set to 0)
# and sort by chrom, then numerically by start.
# NOTE(review): the v98 build also removed chromStart > chromEnd records and
# adjusted chromStart == chromEnd items before bedToBigBed; neither step is
# repeated here -- confirm the v101 export did not need them.
zcat /hive/data/outside/cosmic/ucsc_export.v101.hg38.bed.gz  | awk -F'\t' -v OFS="\t" '{ print $1, $2, $3, $7, 0, $6, $4, $5, $8 }' | sort -k1,1 -k2,2n > cosmic.bed
# The autoSql file is reused from the v98 directory.
bedToBigBed -type=bed6+3 -as=/hive/data/outside/cosmic/hg38/v98/cosmic.as cosmic.bed /hive/data/genomes/hg38/chrom.sizes cosmic.bb -tab
cd /gbdb/hg38/cosmic/
# Keep the v98 file reachable as cosmicv98.bb, then force-replace the
# cosmic.bb link so it points at the v101 build.
ln -s /hive/data/outside/cosmic/hg38/v98/cosmic.bb cosmicv98.bb
ln -sf /hive/data/outside/cosmic/hg38/v101/cosmic.bb
#Updated human/hg38/trackDb.ra and human/hg38/cosmicMuts.html



