# for emacs: -*- mode: sh; -*-

# This file describes the browser build for felCat5
# Felis catus - Cat

#       DATE:   13-Sep-2011
#       ORGANISM:       Felis catus
#       TAXID:  9685
#       ASSEMBLY LONG NAME:     Felis_catus-6.2
#       ASSEMBLY SHORT NAME:    Felis_catus-6.2
#       ASSEMBLY SUBMITTER:     International Cat Genome Sequencing Consortium
#       ASSEMBLY TYPE:  Haploid
#       NUMBER OF ASSEMBLY-UNITS:       1
#       ASSEMBLY ACCESSION:     GCA_000181335.2

#       FTP-RELEASE DATE: 20-Dec-2011

#       rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/Felis_catus-6.2/

#	Genome ID:
#       http://www.ncbi.nlm.nih.gov/genome/78
#       http://www.ncbi.nlm.nih.gov/bioproject/16726
#       http://www.ncbi.nlm.nih.gov/nuccore/359264388
#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AANG02
#       Genome Coverage : 2x Sanger; 12x 454

#	Taxonomy:
#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9685

#	Mitochondrial sequence:
#       http://www.ncbi.nlm.nih.gov/bioproject/10762 - chrMt NC_001700.1

#	Assembly ID: 320798
#       http://www.ncbi.nlm.nih.gov/genome/assembly/320798/


#############################################################################
# fetch sequence from genbank (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/felCat5/genbank
    cd /hive/data/genomes/felCat5/genbank

    time rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/Felis_catus-6.2/ ./
    #   real    25m17.609s

    # measure sequence to be used here
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
        Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz \
        Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz
    #   2455524127 bases (91244929 N's 2364279198 real 2364279198 upper
    #   0 lower) in 5499 sequences in 39 files
    #   Total size: mean 446540.1 sd 8151278.3
    #   min 205 (gi|361624552|gb|JH413284.1|)
    #   max 239302903 (gi|362110686|gb|CM001378.1|) median 1243

#############################################################################
# fixup names for UCSC standards (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/felCat5/ucsc
    cd /hive/data/genomes/felCat5/ucsc

    ########################  Assembled Chromosomes
    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

# toUcsc.pl - convert the GenBank assembled chromosome files to UCSC names.
#
# Reads the chr2acc table (chromosome name, GenBank accession) and, for each
# chromosome N listed there, rewrites
#   ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrN.agp.gz
#   ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chrN.fa.gz
# into ./chrN.agp.gz and ./chrN.fa.gz with the accession replaced by "chrN".
# Prints one "accession chromosome" line to stdout per chromosome processed.

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/assembled_chromosomes";

# Stream the gzipped $inFile through $rewrite (called with one input line,
# returns the text to emit) into the gzipped $outFile.
# Dies when either child process fails: a missing or corrupt input file is
# only reported by close() on the pipe, never by open().
sub filterGz {
    my ($inFile, $outFile, $rewrite) = @_;
    open (my $in, "-|", "zcat", $inFile) or die "can not read $inFile: $!";
    open (my $out, "|-", "gzip -c >$outFile")
        or die "can not write to $outFile: $!";
    while (my $line = <$in>) {
        print {$out} $rewrite->($line);
    }
    close ($in) or die "zcat failed reading $inFile (status $?)";
    close ($out) or die "gzip failed writing $outFile (status $?)";
}

# accession -> chromosome name, from the chr2acc table
my %accToChr;

my $chr2acc = "$srcDir/chr2acc";
open (my $fh, "<", $chr2acc) or die "can not read $chr2acc: $!";
while (my $line = <$fh>) {
    next if ($line =~ m/^#/);
    next if ($line =~ m/^\s*$/);        # tolerate blank lines
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    die "malformed line in $chr2acc: '$line'" if (! defined $acc);
    $accToChr{$acc} = $chrN;
}
close ($fh);

# sorted so that the progress output is reproducible between runs
foreach my $acc (sort keys %accToChr) {
    my $chrN = $accToChr{$acc};
    print "$acc $chrN\n";

    # AGP: comment lines pass through, the leading accession becomes chrN.
    # \Q..\E because the accession contains a '.', which must match literally.
    filterGz("$srcDir/AGP/chr${chrN}.agp.gz", "chr${chrN}.agp.gz", sub {
        my ($line) = @_;
        $line =~ s/^\Q$acc\E/chr${chrN}/ if ($line !~ m/^#/);
        return $line;
    });

    # FASTA: every header line is replaced by ">chrN", sequence is unchanged
    filterGz("$srcDir/FASTA/chr${chrN}.fa.gz", "chr${chrN}.fa.gz", sub {
        my ($line) = @_;
        return ($line =~ m/^>/) ? ">chr${chrN}\n" : $line;
    });
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl
    time ./toUcsc.pl
    #   real    12m6.675s

    ########################  Unplaced scaffolds
    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

# unplaced.pl - convert the GenBank unplaced scaffold files to UCSC names.
#
# Rewrites unplaced.scaf.agp.gz and unplaced.scaf.fa.gz from the genbank
# directory into ./unplaced.agp.gz and ./unplaced.fa.gz, naming each
# scaffold "chrUn_<accession>" with the ".1" version suffix removed.

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# Stream the gzipped $inFile through $rewrite (called with one input line,
# returns the text to emit) into the gzipped $outFile.
# Dies when either child process fails: a missing or corrupt input file is
# only reported by close() on the pipe, never by open().
sub filterGz {
    my ($inFile, $outFile, $rewrite) = @_;
    open (my $in, "-|", "zcat", $inFile) or die "can not read $inFile: $!";
    open (my $out, "|-", "gzip -c >$outFile")
        or die "can not write to $outFile: $!";
    while (my $line = <$in>) {
        print {$out} $rewrite->($line);
    }
    close ($in) or die "zcat failed reading $inFile (status $?)";
    close ($out) or die "gzip failed writing $outFile (status $?)";
}

# AGP: comment lines pass through; other lines get the "chrUn_" prefix.
# The ".1" is removed only from the end of the first (tab-delimited) column,
# so a ".1" in a later column can never be stripped by mistake.
filterGz($agpFile, "unplaced.agp.gz", sub {
    my ($line) = @_;
    return $line if ($line =~ m/^#/);
    $line =~ s/^([^\t]+)\.1\t/$1\t/;
    return "chrUn_" . $line;
});

# FASTA: reduce each header to the accession found after "gb|", without its
# ".1" version and anything following it; sequence lines are unchanged.
filterGz($fastaFile, "unplaced.fa.gz", sub {
    my ($line) = @_;
    return $line if ($line !~ m/^>/);
    chomp $line;
    $line =~ s/.*gb\|//;
    $line =~ s/\.1\|.*//;
    return ">chrUn_$line\n";
});
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    #	real    0m3.062s
    # make sure none of the names got to be over 31 characters long:
    zcat unplaced.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
        | sort -rn | uniq -c | head
    #   5621 14

    ########################  Unlocalized scaffolds
    cat << '_EOF_' > unlocalized.pl
#!/bin/env perl

# unlocalized.pl - convert the GenBank unlocalized scaffold files to UCSC
# names.
#
# Reads unlocalized.chr2scaf (chromosome name, scaffold accession) and, for
# each chromosome N listed there, rewrites chrN.unlocalized.scaf.agp.gz and
# chrN.unlocalized.scaf.fa.gz from the genbank directory into
# ./chrN_random.agp.gz and ./chrN_random.fa.gz, naming each scaffold
# "chrN_<accession>_random" with the ".1" version suffix removed.
# Prints "chrN" to stdout for each chromosome processed.

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/unlocalized_scaffolds";

my %accToChr;   # scaffold accession -> chromosome name
my %chrNames;   # chromosome names that have unlocalized scaffolds

# Stream the gzipped $inFile through $rewrite (called with one input line,
# returns the text to emit) into the gzipped $outFile.
# Dies when either child process fails: a missing or corrupt input file is
# only reported by close() on the pipe, never by open().
sub filterGz {
    my ($inFile, $outFile, $rewrite) = @_;
    open (my $in, "-|", "zcat", $inFile) or die "can not read $inFile: $!";
    open (my $out, "|-", "gzip -c >$outFile")
        or die "can not write to $outFile: $!";
    while (my $line = <$in>) {
        print {$out} $rewrite->($line);
    }
    close ($in) or die "zcat failed reading $inFile (status $?)";
    close ($out) or die "gzip failed writing $outFile (status $?)";
}

# Return the UCSC name for scaffold $acc on chromosome $chrN.
# Dies if the accession version is not ".1", if the accession is not listed
# in unlocalized.chr2scaf, or if it is listed under a different chromosome.
sub ucscName {
    my ($acc, $chrN) = @_;
    my $accNo1 = $acc;
    $accNo1 =~ s/\.1$//;        # literal ".1" only, not "any character + 1"
    die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
    die "ERROR: acc $acc not found in unlocalized.chr2scaf"
        if (! exists $accToChr{$acc});
    die "ERROR: chrN $chrN not correct for $acc"
        if ($accToChr{$acc} ne $chrN);
    return "chr${chrN}_${accNo1}_random";
}

my $chr2scaf = "$srcDir/unlocalized.chr2scaf";
open (my $fh, "<", $chr2scaf) or die "can not read $chr2scaf: $!";
while (my $line = <$fh>) {
    next if ($line =~ m/^#/);
    next if ($line =~ m/^\s*$/);        # tolerate blank lines
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    die "malformed line in $chr2scaf: '$line'" if (! defined $acc);
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close ($fh);

# sorted so that the progress output is reproducible between runs
foreach my $chrN (sort keys %chrNames) {
    my $agpFile = "$srcDir/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile = "$srcDir/FASTA/chr$chrN.unlocalized.scaf.fa.gz";

    # AGP: comment lines pass through; on other lines the first tab-separated
    # column (the accession) is replaced by the UCSC name.
    filterGz($agpFile, "chr${chrN}_random.agp.gz", sub {
        my ($line) = @_;
        return $line if ($line =~ m/^#/);
        chomp $line;
        my @a = split('\t', $line);
        my $acc = defined $a[0] ? $a[0] : "";
        return join("\t", ucscName($acc, $chrN), @a[1 .. $#a]) . "\n";
    });

    printf "chr%s\n", $chrN;

    # FASTA: the accession is the text between "gb|" and the next "|" of
    # each header line; sequence lines are unchanged.
    filterGz($fastaFile, "chr${chrN}_random.fa.gz", sub {
        my ($line) = @_;
        return $line if ($line !~ m/^>/);
        chomp $line;
        my $acc = $line;
        $acc =~ s/.*gb\|//;
        $acc =~ s/\|.*//;
        return ">" . ucscName($acc, $chrN) . "\n";
    });
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl
    time ./unlocalized.pl
    #	real    0m6.459s

    # verify nothing lost from original:
    faSize *.fa.gz
    #   2455524127 bases (91244929 N's 2364279198 real 2364279198 upper
    #   0 lower) in 5499 sequences in 39 files
    #   Total size: mean 446540.1 sd 8151278.3 min 205 (chrUn_JH413284)
    #   max 239302903 (chrA1) median 1243

    # make sure none of the names got to be over 31 characters long:
    zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
        | sort -rn | uniq -c | head
    #   2860 21
    #   255 20
    #   5621 14
    #   1632 5
    #   187 4

#############################################################################
#  Initial database build (DONE - 2012-04-11 - Hiram)

    cd /hive/data/genomes/felCat5
    cat << '_EOF_' > felCat5.config.ra
# Config parameters for makeGenomeDb.pl:
db felCat5
clade mammal
genomeCladePriority 17
scientificName Felis catus
commonName Cat
assemblyDate Sep. 2011
assemblyLabel International Cat Genome Sequencing Consortium Felis_catus-6.2 (NCBI project 16726, accession AANG00000000.2, GCA_000181335.2)
assemblyShortLabel ICGSC Felis_catus 6.2
ncbiAssemblyName Felis_catus-6.2
ncbiAssemblyId 320798
orderKey 2418
mitoAcc NC_001700
fastaFiles /hive/data/genomes/felCat5/ucsc/*.fa.gz
agpFiles /hive/data/genomes/felCat5/ucsc/*.agp.gz
# qualFiles none
dbDbSpeciesDir cat
taxId 9685
'_EOF_'
    # << happy emacs

    # verify sequence and agp are OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp felCat5.config.ra > agp.log 2>&1
    #	real    2m49.310s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev felCat5.config.ra > db.log 2>&1
    #   real    18m37.337s

##########################################################################
# running repeat masker (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/felCat5/bed/repeatMasker
    cd /hive/data/genomes/felCat5/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek felCat5 > do.log 2>&1 &
    #   about 17h 35m

    cat faSize.rmsk.txt
    #   2455541136 bases (91244929 N's 2364296207 real 1342935452 upper
    #   1021360755 lower) in 5500 sequences in 1 files
    #   Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284)
    #   max 239302903 (chrA1) median 1243
    #   %41.59 masked total, %43.20 masked real

    egrep -i "versi|relea" do.log
#    April 26 2011 (open-3-3-0) version of RepeatMasker
# CC   RELEASE 20110920;
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    featureBits -countGaps felCat5 rmsk
    #   1023331634 bases of 2455541136 (41.674%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/felCat5/bed/simpleRepeat
    cd /hive/data/genomes/felCat5/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	felCat5 > do.log 2>&1 &
    #   about 14 minutes
    cat fb.simpleRepeat
    #   39293231 bases of 2403678276 (1.635%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/felCat5
    twoBitMask felCat5.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed felCat5.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa felCat5.2bit stdout | faSize stdin > faSize.felCat5.2bit.txt
    cat faSize.felCat5.2bit.txt
    #   2455541136 bases (91244929 N's 2364296207 real 1341962038 upper
    #   1022334169 lower) in 5500 sequences in 1 files
    #   Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284)
    #   max 239302903 (chrA1) median 1243
    #   %41.63 masked total, %43.24 masked real

    rm /gbdb/felCat5/felCat5.2bit
    ln -s `pwd`/felCat5.2bit /gbdb/felCat5/felCat5.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/felCat5/bed/gap
    cd /hive/data/genomes/felCat5/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../felCat5.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m24.662s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits felCat5 -not gap -bed=notGap.bed
    #   2403678276 bases of 2403678276 (100.000%) in intersection
    #   real    0m12.025s
    time featureBits felCat5 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   39382069 bases of 2403678276 (1.638%) in intersection
    #   real    2m51.724s

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" felCat5 | sort -n | tail -1
    #	264
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

# mkGap.pl - turn new.gaps.bed into rows for the felCat5 gap table.
#
# Reads new.gaps.bed (chrom, chromStart, chromEnd) from the current
# directory and prints to stdout one tab-separated row per line:
#   chrom, chromStart, chromEnd, ix, "N", size, "other", "yes"
# where ix continues from the highest ix now in the gap table and
# size is chromEnd - chromStart.

use strict;
use warnings;

# highest gap index currently in the database
my $ix=`hgsql -N -e "select ix from gap;" felCat5 | sort -n | tail -1`;
$ix = "" if (! defined $ix);
chomp $ix;
if ($ix eq "") {
    # an empty gap table and a failed query look the same from here,
    # so at least make the situation visible
    warn "WARNING: no existing gap index found, numbering starts at 1\n";
    $ix = 0;
}
die "ERROR: unexpected gap index from hgsql: '$ix'" if ($ix !~ m/^\d+$/);

open (my $fh, "<", "new.gaps.bed") or die "can not read new.gaps.bed: $!";
while (my $line = <$fh>) {
    next if ($line =~ m/^\s*$/);        # tolerate blank lines
    my ($chrom, $chromStart, $chromEnd) = split('\s+', $line);
    die "malformed line in new.gaps.bed: $line" if (! defined $chromEnd);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close ($fh);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps felCat5 other.bed
    #	39382069 bases of 2455541136 (1.604%) in intersection
    wc -l other.bed
    #	195055
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad felCat5 otherGap other.bed
    #   Read 195055 elements of size 8 from other.bed
    #	starting with this many
    hgsql -e "select count(*) from gap;" felCat5
    #	 2528
    hgsql felCat5 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" felCat5
    #	197583
    # == 2528 + 195055
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 felCat5 nonBridged.lift -bedFile=nonBridged.bed
    # see example in danRer7.txt
    # count bridged vs. non-bridged gaps (there are 900 non-bridged gaps here):
    hgsql -N -e "select bridge from gap;" felCat5 | sort | uniq -c
    #   900 no
    #   196683 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/felCat5/bed/windowMasker
    cd /hive/data/genomes/felCat5/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev felCat5 > do.log 2>&1 &
    #   about 2h 47m

    # Masking statistics
    twoBitToFa felCat5.wmsk.2bit stdout | faSize stdin
    #   2455541136 bases (91244929 N's 2364296207 real 1589489056 upper
    #   774807151 lower) in 5500 sequences in 1 files
    #   Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284)
    #   max 239302903 (chrA1) median 1243
    #   %31.55 masked total, %32.77 masked real

    twoBitToFa felCat5.wmsk.sdust.2bit stdout | faSize stdin
    #   2455541136 bases (91244929 N's 2364296207 real 1577768583 upper
    #   786527624 lower) in 5500 sequences in 1 files
    #   Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284)
    #   max 239302903 (chrA1) median 1243
    #   %32.03 masked total, %33.27 masked real

    hgLoadBed felCat5 windowmaskerSdust windowmasker.sdust.bed.gz
    #   Read 13534078 elements of size 3 from windowmasker.sdust.bed.gz

    featureBits -countGaps felCat5 windowmaskerSdust
    #   877772553 bases of 2455541136 (35.747%) in intersection

    #	eliminate the gaps from the masking
    featureBits felCat5 -not gap -bed=notGap.bed
    #   2364296207 bases of 2364296207 (100.000%) in intersection

    time nice -n +19 featureBits felCat5 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #   786527624 bases of 2364296207 (33.267%) in intersection
    #   real    4m29.505s

    #	reload track to get it clean
    hgLoadBed felCat5 windowmaskerSdust cleanWMask.bed.gz
    #   Read 13586894 elements of size 4 from cleanWMask.bed.gz
    featureBits -countGaps felCat5 windowmaskerSdust
    #   786527624 bases of 2455541136 (32.031%) in intersection
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../felCat5.unmasked.2bit stdin \
	    -type=.bed felCat5.cleanWMSdust.2bit
    twoBitToFa felCat5.cleanWMSdust.2bit stdout | faSize stdin \
        > felCat5.cleanWMSdust.faSize.txt
    cat felCat5.cleanWMSdust.faSize.txt
    #   2455541136 bases (91244929 N's 2364296207 real 1577768583 upper
    #   786527624 lower) in 5500 sequences in 1 files
    #   Total size: mean 446462.0 sd 8150539.1 min 205 (chrUn_JH413284)
    #   max 239302903 (chrA1) median 1243
    #   %32.03 masked total, %33.27 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps felCat5 rmsk windowmaskerSdust
    #   553445549 bases of 2455541136 (22.539%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-11 - Hiram)
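    # Not masking with WM here since RM masked well (about 41.6% masked,
    #	see the faSize results in the simple repeat section above).
    #	The commented-out commands below are only for the case where rmsk
    #	masks so very little of the genome that the clean WM mask is used
    #	on the genome sequence instead.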
#    cd /hive/data/genomes/felCat5
#    twoBitMask -add bed/windowMasker/felCat5.cleanWMSdust.2bit \
#	bed/simpleRepeat/trfMask.bed felCat5.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
#    twoBitToFa felCat5.2bit stdout | faSize stdin > faSize.felCat5.txt
#    cat faSize.felCat5.txt
    #   927696114 bases (111611440 N's 816084674 real 607935500 upper
    #   208149174 lower) in 5678 sequences in 1 files
    #   Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1)
    #   max 51042256 (chrLG7) median 2009
    #   %22.44 masked total, %25.51 masked real

    #	create symlink to gbdb
#    rm /gbdb/felCat5/felCat5.2bit
#    ln -s `pwd`/felCat5.2bit /gbdb/felCat5/felCat5.2bit

########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/felCat5/bed/cpgIslands
    cd /hive/data/genomes/felCat5/bed/cpgIslands
    time doCpgIslands.pl felCat5 > do.log 2>&1
    #   real    28m6.499s

    cat fb.felCat5.cpgIslandExt.txt
    #   39523291 bases of 2364296207 (1.672%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/felCat5/bed/genscan
    cd /hive/data/genomes/felCat5/bed/genscan
    time doGenscan.pl felCat5 > do.log 2>&1
    #   real    275m20.499s
    #   several failing jobs, as found from 'para problems' on swarm:
./runGsBig.csh chrA2 000 gtf/000/chrA2.gtf pep/000/chrA2.pep subopt/000/chrA2.bed
./runGsBig.csh chrB1 000 gtf/000/chrB1.gtf pep/000/chrB1.pep subopt/000/chrB1.bed
./runGsBig.csh chrC1 000 gtf/000/chrC1.gtf pep/000/chrC1.pep subopt/000/chrC1.bed
./runGsBig.csh chrD1 000 gtf/000/chrD1.gtf pep/000/chrD1.pep subopt/000/chrD1.bed
./runGsBig.csh chrD2 000 gtf/000/chrD2.gtf pep/000/chrD2.pep subopt/000/chrD2.bed
    # rerunning with window size of 2000000 A2 completed
    # rerunning with window size of 1500000 B1 C1 D1 D2 still failing
    #   real    202m31.369s
    # rerunning with window size of 1000000
    #   real    86m4.437s
    # rerunning with window size of 500000
    # rerunning with window size of 250000 with -tmp=/dev/shm
    #   real    62m34.760s
    # rerunning with window size of 100000 with -tmp=/dev/shm
    #   real    65m27.077s
    # rerunning with window size of 50000 with -tmp=/dev/shm
    #   real    64m44.787s

    #   This is not working.  Need to run these split
    mkdir /hive/data/genomes/felCat5/bed/genscan/splitRun
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun
    gapToLift felCat5 felCat5.nonBridged.lift -bedFile=felCat5.nonBridged.bed

    # For each chromosome where genscan failed: extract its pieces between
    # non-bridged gaps into a fasta file and split that into files for
    # separate gsBig runs.
    for C in A2 B1 C1 D1 D2
do
    echo chr${C}
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun
    # lift lines for this chromosome only (no _random entries); the sed
    # replaces the first "chr${C}" plus the one character after it with
    # "chr${C}_" (presumably turning names like chrA2.58 into chrA2_58)
    grep -w "chr${C}" felCat5.nonBridged.lift | grep -v random \
        | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift
    mkdir chr${C}
    # sequence source is the hard masked fasta from the main genscan run
    faToTwoBit ../hardMaskedFa/000/chr${C}.fa chr${C}/chr${C}.2bit
    ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \
        chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun/chr${C}
    mkdir split${C}
    # output files are written as split${C}/chr${C}_*
    faSplit sequence chr${C}.nonBridged.fa 100 split${C}/chr${C}_
done

    # For each chromosome, generate cmdList.sh: one background gsBig command
    # per file in split${C}, with a "wait" inserted after every 8 commands
    # so that no more than 8 run at the same time.
    for C in A2 B1 C1 D1 D2
do
    echo chr${C}
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun/chr${C}
    echo '#!/bin/sh' > cmdList.sh
    # NL counts the commands in the current batch, 0 through 7
    export NL=-1
    ls split${C} | while read F
  do
  NL=`echo $NL | awk '{print $1+1}'`
  # batch is full: emit a wait before starting the next batch
  if [ "${NL}" -gt 7 ]; then
    NL=0
    echo "echo waiting before $F" >> cmdList.sh
    echo wait >> cmdList.sh
  fi
  # this echo goes to the loop's stdout, which is appended to cmdList.sh
  echo "/cluster/bin/x86_64/gsBig split${C}/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 &"
  done >> cmdList.sh
    # wait for the final, possibly partial, batch
    echo "echo waiting at end" >> cmdList.sh
    echo "wait" >> cmdList.sh
    chmod +x cmdList.sh
    # start from empty result directories
    rm -fr gtf pep subopt
    mkdir gtf pep subopt
done

    # running each of these individually
# chrA2 real    18m18.896s
# chrB1 real    26m41.138s
# chrC1 real    27m18.315s
# chrD1 real    16m40.449s
# chrD2 real    9m49.652s
    # collecting the results:

# For each chromosome, combine the per-piece gsBig results into single
# ${C}.gtf, ${C}.subopt.bed and ${C}.pep files.  The gtf and bed results are
# lifted with ${C}.nonBridged.lift; the sed rewrites names of the form
# ${C}_0<digit 0-4>. into ${C}.<digit> in all three outputs.
for C in chrA2 chrB1 chrC1 chrD1 chrD2
do
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun/${C}
    cat gtf/${C}_*.gtf | liftUp -type=.gtf stdout ../${C}.nonBridged.lift error stdin \
        | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.gtf
cat subopt/${C}_*.bed | liftUp -type=.bed stdout ../${C}.nonBridged.lift error stdin \
        | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.subopt.bed
# the peptide file has no coordinates, so only the names are rewritten
cat pep/${C}_*.pep | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.pep
# list the files from the main run next to the new ones for comparison
ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
done
    # after verifying the sizes of the files seem same compared to what
    #   happened in the main run:
# List, for each chromosome, the result files of the main genscan run
# followed by the split-run replacements so their sizes can be compared.
for C in chrA2 chrB1 chrC1 chrD1 chrD2
do
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun/${C}
   ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
    ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun
done

    # this is tricky, it is counting on the file existing, empty or otherwise
# Replace the main-run gtf, pep and subopt files of each chromosome with the
# split-run results.  The destination path is found with ls, so each file
# must already exist in ../../{gtf,pep,subopt}/00?/ for the copy to land in
# the right place.
for C in chrA2 chrB1 chrC1 chrD1 chrD2
do
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun/${C}
    # D is the existing main-run file to be replaced
    D=`ls ../../gtf/00?/${C}.gtf`
    rm -f "${D}"
    cp -p ${C}.gtf "${D}"
    D=`ls ../../pep/00?/${C}.pep`
    rm -f "${D}"
    cp -p ${C}.pep "${D}"
    D=`ls ../../subopt/00?/${C}.bed`
    rm -f "${D}"
    cp -p ${C}.subopt.bed "${D}"
    cd /hive/data/genomes/felCat5/bed/genscan/splitRun
done

    # Now, we can continue
    time doGenscan.pl -continue=makeBed -workhorse=hgwdev -dbHost=hgwdev \
        felCat5 > makeBed.log 2>&1
    #   real    3m40.678s

    cat fb.felCat5.genscan.txt
    #   56450443 bases of 2364296207 (2.388%) in intersection
    cat fb.felCat5.genscanSubopt.txt
    #   50327551 bases of 2364296207 (2.129%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram)
    # Use -repMatch=850, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2364296207 / 2897316137 \) \* 1024
    #	( 2364296207 / 2897316137 ) * 1024 = 835.614480

    # round up to 850  (felCat4 was 700)

    cd /hive/data/genomes/felCat5
    time blat felCat5.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/felCat5.11.ooc -repMatch=850
    #   Wrote 24227 overused 11-mers to jkStuff/felCat5.11.ooc
    #   felCat4 had: Wrote 23033 overused 11-mers to jkStuff/felCat4.11.ooc
    #	real     1m10.025s

    # there are non-bridged gaps, create lift file needed for genbank
    hgsql -N -e "select bridge from gap;" felCat5 | sort | uniq -c
    #   900 no
    #   196683 yes

    cd /hive/data/genomes/felCat5/jkStuff
    gapToLift felCat5 felCat5.nonBridged.lift -bedFile=felCat5.nonBridged.bed
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' felCat5.nonBridged.bed | sort -nr | head
    #   22000847 chrA1  191124230       213125077       chrA1.58

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Felis catus	1081	919	354
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add:
# felCat5
felCat5.serverGenome = /hive/data/genomes/felCat5/felCat5.2bit
felCat5.clusterGenome = /hive/data/genomes/felCat5/felCat5.2bit
felCat5.ooc = /hive/data/genomes/felCat5/jkStuff/felCat5.11.ooc
felCat5.lift = /hive/data/genomes/felCat5/jkStuff/felCat5.nonBridged.lift
felCat5.perChromTables = no
felCat5.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
felCat5.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
felCat5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
felCat5.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
felCat5.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
felCat5.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
felCat5.downloadDir = felCat5
felCat5.refseq.mrna.native.load  = yes
felCat5.refseq.mrna.xeno.load = yes
felCat5.refseq.mrna.xeno.loadDesc  = yes
# felCat5.upstreamGeneTbl = refGene
# felCat5.upstreamMaf = multiz6way
# /hive/data/genomes/felCat5/bed/multiz6way/species.list

    # end of section added to etc/genbank.conf
    git commit -m "adding felCat5 rat" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S felCat5           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial felCat5 &
    #	  var/build/logs/2012.05.04-15:01:45.felCat5.initalign.log
    #   real    1299m14.421s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad felCat5 &
    #	logFile: var/dbload/hgwdev/logs/2012.05.08-09:58:52.dbload.log
    #   real    41m1.805s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add felCat5 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added felCat5." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-23 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chrA2:53225502-53231571" where name="felCat5";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-23 - Hiram)
    mkdir /hive/data/genomes/felCat5/pushQ
    cd /hive/data/genomes/felCat5/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl felCat5 2> stderr.txt | grep -v transMap > felCat5.sql
    #   real    3m42.309s

    scp -p felCat5.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < felCat5.sql

###########################################################################
# construct lift file Ensembl names to UCSC names (DONE - 2013-02-15 - Chin)
    cd /hive/data/genomes/felCat5/jkStuff

# Write one lift line per sequence in chrom.sizes:
#   0 <tab> ensName <tab> size <tab> ucscName <tab> size
# The Ensembl-style name is derived from the UCSC name by the sed below.
cat ../chrom.sizes | while read L
do
ucName=`echo "${L}" | awk '{print $1}'`
ucSize=`echo "${L}" | awk '{print $2}'`
# sed steps, in order: chrM becomes MT; a leading "chr<alphanumerics>_" is
# removed; "_random" is removed; a remaining leading "chr" is removed; a
# leading name matching [JA][HA][CZ0-9]* gets ".1" appended (presumably
# restoring the accession version of the scaffold names - TODO confirm)
ensName=`echo $L | sed -e 's/^chrM/MT/; s/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//; s/^\([JA][HA][CZ0-9]*\)/\1.1/;' | awk '{print $1}'`
# same size on both sides: only the name changes
ensSize=`echo $L | awk '{print $2}'`
echo -e "0\t$ensName\t$ensSize\t$ucName\t$ucSize"
done > ensToUcsc.lift

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/felCat5/bed/ucscToINSDC
    cd /hive/data/genomes/felCat5/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_001700.1
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/21/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab felCat5 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords felCat5 ucscToINSDC
    featureBits -countGaps felCat5 ucscToINSDC
    # 2455541136 bases of 2455541136 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# LIFTOVER TO FelCat4 (Started - 2013-12-11 - Pauline)

    mkdir /hive/data/genomes/felCat5/bed/blat.felCat4.2013-12-11
    cd /hive/data/genomes/felCat5/bed/blat.felCat4.2013-12-11
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug \
        -ooc /hive/data/genomes/felCat5/jkStuff/felCat5.11.ooc felCat5 felCat4
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -ooc /hive/data/genomes/felCat5/jkStuff/felCat5.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         felCat5 felCat4 > do.log 2>&1
    #  old time real    58m58.107s

#############################################################################
# LASTZ Human Swap (DONE - 2014-03-12 - Pauline)

   #   running the swap
    mkdir /hive/data/genomes/felCat5/bed/blastz.hg19.swap
    cd /hive/data/genomes/felCat5/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzfelCat5.2014-02-26/DEF \
        -syntenicNet -swap \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium >& swap.log

    cat fb.felCat5.chainHg19Link.txt
    # 1453639480 bases of 2364296207 (61.483%) in intersection

############################################################################
# construct liftOver to felCat8 (DONE - 2014-12-05 - Hiram)
    screen -S felCat8	# manage this longish running job in a screen
    mkdir /hive/data/genomes/felCat5/bed/blat.felCat8.2014-12-05
    cd /hive/data/genomes/felCat5/bed/blat.felCat8.2014-12-05
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/felCat5/jkStuff/felCat5.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev felCat5 felCat8
    #   real    0m1.740s
    # if that is OK, then run it:
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/felCat5/jkStuff/felCat5.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev felCat5 felCat8) > do.log 2>&1
    #   real    228m49.131s

    # verify this file exists:
    #	/gbdb/felCat5/liftOver/felCat5ToFelCat8.over.chain.gz
    # and try out the conversion on genome-test from felCat5 to felCat8

#########################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/felCat5/bed/geneid
    cd /hive/data/genomes/felCat5/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/F.catus/felCat5/geneid_v1.4/00README
    wget --timestamping \
http://genome.crg.es/genepredictions/F.catus/felCat5/geneid_v1.4/felCat5.geneid.gtf

    ldHgGene -gtf -genePredExt felCat5 geneid felCat5.geneid.gtf
    # Read 35051 transcripts in 270716 lines in 1 files
    #  35051 groups 3655 seqs 1 sources 3 feature types
    # 35051 gene predictions

    featureBits -enrichment felCat5 augustusGene:CDS geneid
# augustusGene:CDS 1.323%, geneid 1.520%, both 1.038%, cover 78.41%,
#     enrich 51.59x

############################################################################
# LIFTOVER TO felCat9 (DONE - 2018-03-19 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/felCat5/bed/blat.felCat9.2018-02-28
    cd /hive/data/genomes/felCat5/bed/blat.felCat9.2018-02-28
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/felCat5/jkStuff/felCat5.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         felCat5 felCat9) > do.log 2>&1 &
    # real    149m5.173s

    # verify the convert link on the test browser is now active from felCat5 to
    # felCat9

############################################################################
