# for emacs: -*- mode: sh; -*-

#	Rabbit
# Oryctolagus cuniculus from Broad, version oryCun2 (released Apr 2009)
#    Project website:
#    ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rabbit/oryCun2/

#	http://www.ncbi.nlm.nih.gov/bioproject/12819
#	http://www.ncbi.nlm.nih.gov/genome/316
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAGW00

############################################################################
# Download files from Broad (DONE - 2009-08-11 - Hiram)
    mkdir /hive/data/genomes/oryCun2
    cd /hive/data/genomes/oryCun2
    mkdir broad
    cd broad
    wget --timestamping \
"ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rabbit/oryCun2/*"

    #	fixup the quality scores
    qaToQac assembly.quals.gz stdout \
	| qacAgpLift Chromosomes.agp stdin chromosomes.qual.qac

############################################################################
# Build browser (DONE - 2009-08-11 - Hiram)
    cd /hive/data/genomes/oryCun2
    cat << '_EOF_' > oryCun2.contig.ra
# Config parameters for makeGenomeDb.pl:
db oryCun2
clade vertebrate
scientificName Oryctolagus cuniculus
commonName Rabbit
assemblyDate Apr. 2009
assemblyLabel Broad Institute oryCun2 (NCBI project 12819, AAGW00000000)
orderKey 189
mitoAcc NC_001913
fastaFiles /hive/data/genomes/oryCun2/broad/assembly.bases.gz
agpFiles /hive/data/genomes/oryCun2/broad/Chromosomes.agp
qualFiles /hive/data/genomes/oryCun2/broad/chromosomes.qual.qac
dbDbSpeciesDir rabbit
taxId 9986
'_EOF_'
    # << happy emacs

    #	verify sequence is OK before going on
    makeGenomeDb.pl -stop=agp oryCun2.config.ra > agp.log 2>&1
    #	continuing
    makeGenomeDb.pl -noGoldGapSplit -continue=db oryCun2.config.ra \
	> db.log 2>&1

##########################################################################
# Repeat Masker (DONE - 2009-08-11 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/repeatMasker
    cd /hive/data/genomes/oryCun2/bed/repeatMasker
    doRepeatMasker.pl -verbose=2 -workhorse=hgwdev \
	-noSplit -buildDir=`pwd` oryCun2 > do.log 2>&1 &
    cat faSize.rmsk.txt
# 2737490501 bases (133467217 N's 2604023284 real 1466215749 upper
#	1137807535 lower) in 3242 sequences in 1 files
# %41.56 masked total, %43.69 masked real

########################################################################
# Simple Repeats (DONE - 2009-08-11 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/simpleRepeat
    cd /hive/data/genomes/oryCun2/bed/simpleRepeat
    doSimpleRepeat.pl -workhorse=hgwdev \
	-buildDir=`pwd` oryCun2 > do.log 2>&1 &
    #	fails on the job for chrM, make an empty result:
    touch /hive/data/genomes/oryCun2/TrfPart/009/009.lst.bed
    doSimpleRepeat.pl -workhorse=hgwdev -continue=filter \
	-buildDir=`pwd` oryCun2 > filter.log 2>&1 &
    #	about 13 minutes
    cat fb.simpleRepeat
    #	11549259 bases of 332311746 (3.475%) in intersection

########################################################################
# Create a ctgPos2 table to show scaffolds (DONE - 2009-08-11 - Hiram)
    cd /hive/data/genomes/oryCun2/broad
    cat << '_EOF_' > mkCtgPos2.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %scafStart;  # key is scaffold name, value is scaffold start coordinate
my %scafEnd;    # key is scaffold name, value is scaffold end coordinate
my %ctgStart;   # key is scaffold name, value is ctg name where it starts
my %ctgEnd;     # key is scaffold name, value is ctg name where it ends
my @scafNames;
my $scafCount = 0;

my $prevCtg = "";
my $prevEnd = 0;
my $scafName = "";

open (FH, "<assembly.agp") or die "can not read assembly.agp";
while (my $line = <FH>) {
    my ($name, $start, $end, $id, $type, $ctg, $rest) = split('\t', $line, 7);
    if (length ($scafName) > 0) {
        if ($scafName ne $name) {
            $scafEnd{$scafName} = $prevEnd;
            $ctgEnd{$scafName} = $prevCtg;
            $scafName = $name;
            $scafStart{$scafName} = $start;
            $ctgStart{$scafName} = $ctg;
            $scafEnd{$scafName} = $end;
            $ctgEnd{$scafName} = $ctg;
            $scafNames[$scafCount++] = $scafName;
        }
    } else {
        $scafName = $name;
        $scafStart{$scafName} = $start;
        $ctgStart{$scafName} = $ctg;
        $scafEnd{$scafName} = $end;
        $ctgEnd{$scafName} = $ctg;
        $scafNames[$scafCount++] = $scafName;
    }
    $prevCtg = $ctg;
    $prevEnd = $end;
}
close (FH);
$scafEnd{$scafName} = $prevEnd;
$ctgEnd{$scafName} = $prevCtg;

printf STDERR "working with $scafCount scaffolds\n";
printf STDERR "first one: $scafNames[0], last one: $scafNames[$scafCount-1]\n";

my %ctgsChr;    # key is ctg name, value is chrom name
my %ctgsStart;  # key is ctg name, value is chrom start
my %ctgsEnd;    # key is ctg name, value is chrom end

open (FH, "<Chromosomes.agp") or die "can not read Chromosomes.agp";
while (my $line = <FH>) {
    my ($chr, $start, $end, $id, $type, $ctg, $rest) = split('\t', $line, 7);
    if ($ctg =~ m/^contig_/) {
        $ctgsChr{$ctg} = $chr;
        $ctgsStart{$ctg} = $start-1;
        $ctgsEnd{$ctg} = $end;
    }
}
close (FH);

for (my $i = 0; $i < $scafCount; ++$i) {
    my $scafName = $scafNames[$i];
    my $startCtg = $ctgStart{$scafName};
    my $endCtg = $ctgEnd{$scafName};
    my $size = $ctgsEnd{$endCtg} - $ctgsStart{$startCtg};
    die "ERROR: non matching chr name between start and end ctg"
        if ($ctgsChr{$startCtg} ne $ctgsChr{$endCtg});
    if ($size < 0) {
        $size = $ctgsEnd{$startCtg} - $ctgsStart{$endCtg};
        printf "%s\t%d\t%s\t%d\t%d\tW\n", $scafName, $size,
            $ctgsChr{$startCtg}, $ctgsStart{$endCtg}, $ctgsEnd{$startCtg};
    } else {
        printf "%s\t%d\t%s\t%d\t%d\tW\n", $scafName, $size,
            $ctgsChr{$startCtg}, $ctgsStart{$startCtg}, $ctgsEnd{$endCtg};
    }

}
'_EOF_'
    # << happy emacs
    chmod +x mkCtgPos2.pl
    ./mkCtgPos2.pl > ctgPos2.tab

    hgLoadSqlTab oryCun2 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab

########################################################################
# Checking *all* gaps - are they all mentioned in the AGP file ?
#	(DONE - 2009-08-12 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/allGaps
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../oryCun2.2bit > findMotif.txt 2>&1
    #	real    0m7.967s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits oryCun2 -not gap -bed=notGap.bed
    featureBits oryCun2 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	0 bases of 2604023284 (0.000%) in intersection
    # all N's in the sequence are marked as gap, do not need to add anything
    #	to the gap table (see also: tetNig2.txt for procedure)

########################################################################
# Add simple repeats to RM masked sequence (DONE - 2009-08-12 - Hiram)
    cd /hive/data/genomes/oryCun2
    twoBitMask oryCun2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed oryCun2.2bit
    rm /gbdb/oryCun2/oryCun2.2bit
    ln -s `pwd`/oryCun2.2bit /gbdb/oryCun2/oryCun2.2bit
    twoBitToFa  oryCun2.2bit stdout \
	| faSize stdin > oryCun2.2bit.faSize.txt 2>&1
    #	2737490501 bases (133467217 N's 2604023284 real 1465050403 upper
    #	1138972881 lower) in 3242 sequences in 1 files
    #	%41.61 masked total, %43.74 masked real

##########################################################################
## WINDOWMASKER (DONE- 2013-06-21 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/windowMasker
    cd /hive/data/genomes/oryCun2/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev oryCun2 > do.log 2>&1 &
    # real     227m23.979s

    # Masking statistics
    faSize.oryCun2.cleanWMSdust.txt
# 2737490501 bases (133467217 N's 2604023284 real 1556378311
#    upper 1047644973 lower) in 3242 sequences in 1 files
# Total size: mean 844383.3 sd 9428352.4 min 3025 (chrUn3219)
#    max 194850757 (chr1) median 15631
# %38.27 masked total, %40.23 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps oryCun2 rmsk windowmaskerSdust \
       > fb.oryCun2.rmsk.windowmaskerSdust.txt 2>&1
    #   781386731 bases of 2737490501 (28.544%) in intersection

########################################################################
# create ooc file and kluster data files (DONE - 2009-08-12 - Hiram)
    # calculate rabbit/human size ratio to determine repMatch count
	   oryCun2 real / hg19 real
920 = 1024 * 2604023284 / 2897310462
    blat oryCun2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/oryCun2.11.ooc -repMatch=920
    #	Wrote 32543 overused 11-mers to jkStuff/oryCun2.11.ooc

    #	create non-bridged contigs lift and sequence
    mkdir /hive/data/genomes/oryCun2/contigs
    cd /hive/data/genomes/oryCun2/contigs
    gapToLift -verbose=2 oryCun2 oryCun2.contigs.lift \
	-bedFile=oryCun2.contigs.bed
    ~/kent/src/hg/utils/lft2BitToFa.pl ../oryCun2.2bit oryCun2.contigs.lift \
	| gzip -c > oryCun2.contigs.fa.gz
    #	verify nothing lost
    faSize *.fa.gz > contigs.faSize.txt 2>&1
    #	2671429501 bases (67406217 N's 2604023284 real 1465050403 upper
    #	1138972881 lower) in 3319 sequences in 1 files
    #	%42.64 masked total, %43.74 masked real
    # only total and N count different, no sequence lost
    faToTwoBit oryCun2.contigs.fa.gz oryCun2.contigs.2bit
    twoBitInfo oryCun2.contigs.2bit stdout | sort -k2rn > oryCun2.contigs.sizes

    cd /hive/data/genomes/oryCun2
    mkdir /hive/data/staging/oryCun2
    cp -p jkStuff/oryCun2.11.ooc oryCun2.2bit \
	chrom.sizes contigs/oryCun2.contigs.2bit \
	contigs/oryCun2.contigs.sizes /hive/data/staging/oryCun2

########################################################################
# lastz swap from Human Hg19 (DONE - 2009-08-27 - Hiram)
    # original alignment
    cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
    cat fb.hg19.chainOryCun2Link.txt
    #	1283994337 bases of 2897316137 (44.317%) in intersection

    mkdir /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
    cd /hive/data/genomes/oryCun2/bed/blastz.hg19.swap

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap -syntenicNet > swap.log 2>&1 &
    #	real    176m35.932s
    cat fb.oryCun2.chainHg19Link.txt
    #	1260477501 bases of 2604023284 (48.405%) in intersection

##############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# GENBANK AUTO UPDATE (DONE - 2009-09-25 - Hiram)
    #	add the following to genbank/etc/genbank.conf just before Orang ponAbe2
# Rabbit
oryCun2.serverGenome = /hive/data/genomes/oryCun2/oryCun2.2bit
oryCun2.clusterGenome = /scratch/data/oryCun2/oryCun2.2bit
oryCun2.ooc = /scratch/data/oryCun2/oryCun2.11.ooc
oryCun2.lift = /hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
oryCun2.perChromTables = no
oryCun2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
oryCun2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
oryCun2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
oryCun2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
oryCun2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
oryCun2.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
oryCun2.downloadDir = oryCun2
oryCun2.refseq.mrna.native.load  = yes
oryCun2.refseq.mrna.xeno.load = yes
oryCun2.refseq.mrna.xeno.loadDesc  = yes

    # Edit src/lib/gbGenome.c to add new species.  With these two lines:
    #	static char *papHamNames[] = {"Papio hamadryas",
    #		"Papio hamadryas hamadryas", NULL};
    #	{"papHam", papHamNames},
# Oryctolagus cuniculus from Broad, version oryCun2 (released Apr 2009)

    cvs ci -m "Adding Baboon Papio hamadryas hamadryas" src/lib/gbGenome.c
    make install-server

    ssh genbank
    screen  # control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time nice -n +19 bin/gbAlignStep -initial oryCun2 &
    #	logFile: var/build/logs/2009.09.25-14:55:27.oryCun2.initalign.log
    #	real    163m41.298s
    #	something when haywire in the swarm batch run, finished
    #	that manually, then, continuing:
    time nice -n +19 bin/gbAlignStep -continue=finish -initial oryCun2 &
    #	var/build/logs/2009.09.28-10:46:25.oryCun2.initalign.log
    #	real    76m41.952s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad oryCun2
    #	var/dbload/hgwdev/logs/2009.09.28-12:15:42.dbload.log
    #	real    32m14.821s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add oryCun2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added oryCun2 - Oryctolagus cuniculus - Rabbit" \
	etc/align.dbs etc/hgwdev.dbs
    make etc-update
    #	done - 2009-09-28 - Hiram

############################################################################
#	update defaultDb and default position to RHO refSeq gene
    hgsql hgcentraltest \
    	-e 'update defaultDb set name="oryCun2" where genome="Rabbit";'
    hgsql -e \
'update dbDb set defaultPos="chr9:11026346-11032288" where name="oryCun2";' \
	hgcentraltest

############################################################################
# cavPor3 Guinea Pig alignment (DONE - 2010-01-21 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21
    cd /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21

    cat << '_EOF_' > DEF
# Rabbit vs. Guinea Pig

# TARGET: Rabbit oryCun2
SEQ1_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ1_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Guinea Pig
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    1132m45.207s
    cat fb.oryCun2.chainCavPor3Link.txt
    #	964546600 bases of 2604023284 (37.041%) in intersection

    mkdir /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap
    cd /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap
    time doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21/DEF \
	-swap -noLoadChainSplit -workhorse=hgwdev -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    186m1.649s
    cat fb.cavPor3.chainOryCun2Link.txt
    #	1003499831 bases of 2663369733 (37.678%) in intersection

############################################################################
# monDom5 Opossum alignment (DONE - 2010-01-21 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21
    cd /hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21

    cat << '_EOF_' > DEF
# Rabbit vs. Opossum

# TARGET: Rabbit oryCun2
SEQ1_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ1_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Opossum
SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ2_LEN=/scratch/data/monDom5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
    #	real    1791m17.275s
    cat fb.oryCun2.chainMonDom5Link.txt
    #	176460344 bases of 2604023284 (6.776%) in intersection

    mkdir /hive/data/genomes/monDom5/bed/blastz.oryCun2.swap
    cd /hive/data/genomes/monDom5/bed/blastz.oryCun2.swap
    time doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21/DEF \
	-swap -noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
	-bigClusterHub=swarm \
	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #	real    74m20.206s
    cat fb.monDom5.chainOryCun2Link.txt
    #	178331930 bases of 3501660299 (5.093%) in intersection

############################################################################
# all.joiner update - (DONE - 2010-04-02 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=oryCun2 -all all.joiner

    mkdir /hive/data/genomes/oryCun2/goldenPath
    cd /hive/data/genomes/oryCun2/goldenPath
    makeDownloads.pl oryCun2 > do.log 2>&1
    #	edit the README files to indicate reality and take a look
    #	to verify expected files are present

    #	now ready for pushQ entry
    mkdir /hive/data/genomes/oryCun2/pushQ
    cd /hive/data/genomes/oryCun2/pushQ
    makePushQSql.pl oryCun2 > oryCun2.pushQ.sql 2> stderr.out
    #	copy it to hgwbeta
    scp -p oryCun2.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < oryCun2.pushQ.sql
    #	in that pushQ entry walk through each entry and see if the
    #	sizes will set properly

############################################################################
# lift file to Ensembl scaffold coordinates (DONE - 2010-04-02 - Hiram)
    cd /hive/data/genomes/oryCun2/broad
    cat << '_EOF_' > ctgPos2ToEnsembl.pl
#!/usr/bin/env perl

my %chromSize;
open (FH, "<../chrom.sizes") or die "can not read ../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+',$line);
    $chromSize{$chr} = $size;
}
close (FH);

open (FH, "<ctgPos2.tab") or die "can not read ctgPos2.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($scaf, $size, $chr, $start, $end, $type) = split('\s+', $line);
    printf "%s\t%s\t%s\t%s\t%s\n", $start, $scaf, $size, $chr, $chromSize{$chr};
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ctgPos2ToEnsembl.pl
    ./ctgPos2ToEnsembl.pl | sort -k4,4 -k3,3n > ../jkStuff/ensGene.lft

############################################################################
# lastz Mouse Mm9 swap (DONE - 2009-08-26 - Hiram)
    #	original alignment
    cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
    cat fb.mm9.chainOryCun2Link.txt
# 670229789 bases of 2620346127 (25.578%) in intersection

    #	and for the swap
    mkdir /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
    cd /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-swap -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    84m6.571s
    cat fb.oryCun2.chainMm9Link.txt
    #	669602734 bases of 2604023284 (25.714%) in intersection

############################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2010-04-28)
    # bash  if not using bash shell already

    cd /cluster/data/oryCun2
    mkdir /cluster/data/oryCun2/blastDb

    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  oryCun2.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  oryCun2.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y
    rm temp.fa less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 3326

    mkdir -p /cluster/data/oryCun2/bed/tblastn.hg18KG
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 3326 query.lst


   # we want around 350000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\)

# 36727/(350000/3326) = 349.011434


   mkdir -p kgfa
   split -l 349 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
   done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   wc kg.lst
#  106  106 1378 kg.lst

   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/oryCun2/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/oryCun2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    exit

    ssh swarm
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    gensub2 query.lst kg.lst blastGsub blastSpec
    para create blastSpec
#    para try, check, push, check etc.

    para time
# Completed: 352556 of 352556 jobs
# CPU time in finished jobs:   15911919s  265198.66m  4419.98h  184.17d  0.505 y
# IO & Wait Time:               2044688s   34078.13m   567.97h   23.67d  0.065 y
# Average job time:                  51s       0.85m     0.01h    0.00d
# Longest finished job:             257s       4.28m     0.07h    0.00d
# Submission to last job:         22838s     380.63m     6.34h    0.26d

    ssh swarm
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
    para try, check, push, check etc.

# Completed: 106 of 106 jobs
# CPU time in finished jobs:     659954s   10999.23m   183.32h    7.64d  0.021 y
# IO & Wait Time:                 79322s    1322.03m    22.03h    0.92d  0.003 y
# Average job time:                6974s     116.24m     1.94h    0.08d
# Longest finished job:           34335s     572.25m     9.54h    0.40d
# Submission to last job:         34345s     572.42m     9.54h    0.40d

    cd /cluster/data/oryCun2/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
# checked: 97569 failed: 0 errors: 0

    # load table
    ssh hgwdev
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    hgLoadPsl oryCun2 blastHg18KG.psl

    # check coverage
    featureBits oryCun2 blastHg18KG
# 42826260 bases of 2604023284 (1.645%) in intersection

    featureBits oryCun2 blastHg18KG refGene  -enrichment
# blastHg18KG 1.645%, refGene 0.076%, both 0.048%, cover 2.95%, enrich 38.75x

    featureBits oryCun2 blastHg18KG xenoRefGene  -enrichment
# blastHg18KG 1.645%, xenoRefGene 1.837%, both 0.978%, cover 59.48%, enrich 32.37x

    rm -rf blastOut
#end tblastn

############################################################################
# construct ucscToEnsembl chrom name translation (2011-04-21 - Hiram)
    # the Ensembl genes v62 have new chrom names, construct translation:
    mkdir /hive/data/genomes/oryCun2/bed/ucscToEnsembl
    cd /hive/data/genomes/oryCun2/bed/ucscToEnsembl

grep "^chr" /hive/data/genomes/oryCun2/genbank/Primary_Assembly/localID2acc \
    | sort > genbank.names
cut -f1 genbank.names | sort > findChrNames

cut -f1 ../../chrom.sizes | sort > ucsc.names

join ucsc.names genbank.names | sed -e 's/\.1$//' > ens.names

comm -23 ucsc.names findChrNames | grep -v chrM | while read C
do
    ensName=${C/chr/}
    echo -e "${C}\t${ensName}"
done >> ens.names

sort ens.names > ucscToEnsembl.tab

# find length of ucsc names for key length in SQL below:
awk '{print length($1)}' ucscToEnsembl.tab | sort -u
4
5
9

    cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(9))
);
'_EOF_'

    hgsql oryCun2 < ucscToEnsembl.sql
    hgsql oryCun2 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'

############################################################################
#  oryCun2 - Rabbit - Ensembl Genes version 62  (DONE - 2011-04-21 - hiram)
    ssh hgwdev
    cd /hive/data/genomes/oryCun2/jkStuff
    # construct new lift file with new Ensembl contig names

    cat << '_EOF_' > ens62Lift.pl
#!/bin/env perl

use strict;
use warnings;

my %chrSize;

open (FH, "<../chrom.sizes") or die "can not read ../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    $chrSize{$chr} = $size;
}
close (FH);

my %ensName;
open (FH, "<../bed/ucscToEnsembl/ucscToEnsembl.tab") or
        die "can not read ../bed/ucscToEnsembl/ucscToEnsembl.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $name) = split('\s+', $line);
    $ensName{$chr} = $name;
}
close (FH);

foreach my $chr (keys %ensName) {
    printf "0\t%s\t%d\t%s\t%d\n", $ensName{$chr}, $chrSize{$chr}, $chr, $chrSize{$chr};
}
'_EOF_'
    # << happy emacs
    chmod +x ens62Lift.pl
    ./ens62Lift.pl > ens.62.lft

    # now, using that in the ensGene definition

    cd /hive/data/genomes/oryCun2
    cat << '_EOF_' > oryCun2.ensGene.ra
# required db variable
db oryCun2
# the lift file will change the chrom names, no nameTranslation needed
# nameTranslation "s/^/chr/;"
# ensembl v62 has new naming scheme based on NCBI release:
liftUp /hive/data/genomes/oryCun2/jkStuff/ens.62.lft
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl  -ensVersion=62 oryCun2.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/oryCun2/bed/ensGene.62
    featureBits oryCun2 ensGene
    # 31797207 bases of 2604023284 (1.221%) in intersection

#############################################################################
# N-SCAN gene predictions (nscanGene) - (2011-05-02 markd)

/cluster/home/jeltje/hive/rabbit/oryCun2.prot.fa
/cluster/home/jeltje/hive/rabbit/chr_gtf/chr*.fa

    # obtained NSCAN predictions from Jeltje van Baren <jeltje@mbari.org>,
    cd /hive/data/genomes/oryCun2/bed/nscan
    cp /cluster/home/jeltje/hive/rabbit/oryCun2.prot.fa  .
    cp -r /cluster/home/jeltje/hive/rabbit/chr_gtf .
    gzip oryCun2.* chr_gtf/*
    chmod a-w oryCun2.* chr_gtf/*

    # load track
    zcat chr_gtf/*.gtf.gz | gtfToGenePred -genePredExt stdin stdout | hgLoadGenePred -genePredExt oryCun2 nscanGene stdin
    hgPepPred oryCun2 generic nscanPep oryCun2.prot.fa.gz
    rm *.tab

    # rabbit/oryCun2/trackDb.ra, add:
    track nscanGene override
    informant Rabbit N-SCAN used human (hg19) as the informant for conservation.

    # verify top-level search spec, should produce no results an an error for egrep:
    hgsql -Ne 'select name from nscanGene' oryCun2 | egrep -v '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |head

############################################################################
# SWAP lastz mm10 (DONE - 2012-03-19 - Hiram)
    # original alignment to mm10
    cat /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16/fb.mm10.chainOryCun2Link.txt
    #	669778489 bases of 2652783500 (25.248%) in intersection

    # and this swap
    mkdir /hive/data/genomes/oryCun2/bed/blastz.mm10.swap
    cd /hive/data/genomes/oryCun2/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    64m40.959s
    cat fb.oryCun2.chainMm10Link.txt
    #	668643668 bases of 2604023284 (25.677%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/oryCun2/bed
    ln -s blastz.mm10.swap lastz.mm10


############################################################################
# NUMTS TRACK (DONE 2013-06-16 - Chin)
# copy and unzip
# http://193.204.182.50/files/tracks.zip and
# http://193.204.182.50/files/other%20NumtS%20tracks.zip
# to /hive/data/outside/NumtS/2012/tracks

    mkdir /hive/data/genomes/oryCun2/bed/NumtS
    cd /hive/data/genomes/oryCun2/bed/NumtS
    cp /hive/data/outside/NumtS/2012/tracks/Oryctolagus_cuniculus_2/* .
    cat tracks_numtS_nuclear_OryCun2 | grep -v "^track" > oryCun2NumtS.bed
    cat tracks_NumtS_assembled_OryCun2 | grep -v "^track" > oryCun2NumtSAssembled.bed
    cat tracks_mit_NumtS_OryCun2 | grep -v "^track" >  oryCun2NumtSMitochondrion.bed

    # load the 3 bed files to oryCun2
    hgLoadBed oryCun2  numtSAssembled  oryCun2NumtSAssembled.bed
    hgLoadBed oryCun2 numtS oryCun2NumtS.bed
    hgLoadBed oryCun2 numtSMitochondrion oryCun2NumtSMitochondrion.bed
    # Make /gbdb/ links and load bam
    mkdir /gbdb/oryCun2/NumtS
    ln -s `pwd`/Rabbit-2_NumtS_seqs.sorted.bam{,.bai} /gbdb/oryCun2/NumtS/
    hgBbiDbLink oryCun2 bamAllNumtSSorted /gbdb/oryCun2/NumtS/Rabbit-2_seqs.sorted.bam
    # setup trackDb for oryCun2


##############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/ucscToINSDC
    cd /hive/data/genomes/oryCun2/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_001913.1
    # used the join.sh from danRer7, same difficulty here
    # join.sh much abused here
    ./join.sh
    # special verify here:
    ./verifyAll.sh

    sed -e "s/21/9/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab oryCun2 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords oryCun2 ucscToINSDC
    featureBits -countGaps oryCun2 ucscToINSDC
    # 2737490501 bases of 2737490501 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# genscan - (DONE - 2013-11-12 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/genscan
    cd /hive/data/genomes/oryCun2/bed/genscan
    time doGenscan.pl oryCun2 > do.log 2>&1
XXX - running - Tue Nov 12 14:05:25 PST 2013
    #  real    12m28.640s
7 broken jobs:
./lastRunGsBig.csh chr12 000 gtf/000/chr12.gtf pep/000/chr12.pep subopt/000/chr12.bed &
./lastRunGsBig.csh chr13 000 gtf/000/chr13.gtf pep/000/chr13.pep subopt/000/chr13.bed &
./lastRunGsBig.csh chrUn0046 000 gtf/000/chrUn0046.gtf pep/000/chrUn0046.pep subopt/000/chrUn0046.bed &
wait
./lastRunGsBig.csh chr5 000 gtf/000/chr5.gtf pep/000/chr5.pep subopt/000/chr5.bed &
./lastRunGsBig.csh chr3 000 gtf/000/chr3.gtf pep/000/chr3.pep subopt/000/chr3.bed &
./lastRunGsBig.csh chr16 000 gtf/000/chr16.gtf pep/000/chr16.pep subopt/000/chr16.bed &
./lastRunGsBig.csh chr1 000 gtf/000/chr1.gtf pep/000/chr1.pep subopt/000/chr1.bed &
wait
XXX - running - Wed Nov 13 15:49:50 PST 2013
    #  real    713m7.112s
XXX some still broken, running with 1800000
    real    191m54.615s
XXX some still broken, running with 1600000 Fri Nov 15 08:07:34 PST 2013
real    110m12.295s
XXX still broken, running with 1000000 Fri Nov 15 13:49:11 PST 2013
real    85m13.525s
XXX still broken, time to split and run

    export TOP="/hive/data/genomes/oryCun2/bed/genscan/splitRun"
    cd ${TOP}
    gapToLift oryCun2 oryCun2.nonBridged.lift -bedFile=oryCun2.nonBridged.bed

for C in 1 12 13 16
do
    cd ${TOP}
    grep -w "chr${C}" oryCun2.nonBridged.lift |  sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift
    mkdir chr${C}
    faToTwoBit ../hardMaskedFa/*/chr${C}.fa chr${C}/chr${C}.2bit
    ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \
       chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa
    cd chr${C}
    mkdir split
    faSplit sequence chr${C}.nonBridged.fa 100 split/chr${C}_
done

export TOP="/hive/data/genomes/oryCun2/bed/genscan/splitRun"
cd $TOP

for C in 1 12 13 16
do
  cd chr${C}
  /usr/bin/time -p ./cmdList.sh > cmdList.log 2>&1
  cd $TOP
done

export TOP="/hive/data/genomes/oryCun2/bed/genscan/splitRun"
cd $TOP

for C in chr1 chr12 chr13 chr16
do
  cd ${C}
  cat gtf/${C}_*.gtf | liftUp -type=.gtf ${C}.lifted.gtf \
      ../${C}.nonBridged.lift error stdin
  cat subopt/${C}_*.bed | liftUp -type=.bed ${C}.lifted.subopt.bed \
      ../${C}.nonBridged.lift error stdin
  cat pep/${C}_*.pep > ${C}.needNameFix.pep
  ../fixIds.pl ${C}
  cd $TOP
done

    # run 'para time > run.time' on ku, then continuing:
    time doGenscan.pl -continue=makeBed oryCun2 > makeBed.log 2>&1
    #  real    4m24.839s

    cat fb.oryCun2.genscan.txt
    #   67825143 bases of 2604023284 (2.605%) in intersection

    cat fb.oryCun2.genscanSubopt.txt
    #   69954121 bases of 2604023284 (2.686%) in intersection

#########################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/oryCun2/bed/geneid
    cd /hive/data/genomes/oryCun2/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/O.cuniculus/oryCun2/geneid_v1.4/00README
    wget --timestamping \
http://genome.crg.es/genepredictions/O.cuniculus/oryCun2/geneid_v1.4/oryCun2.geneid.gtf

    ldHgGene -gtf -genePredExt oryCun2 geneid oryCun2.geneid.gtf
    # Read 41107 transcripts in 293112 lines in 1 files
    #  41107 groups 3038 seqs 1 sources 3 feature types
    # 41107 gene predictions

    featureBits -enrichment oryCun2 augustusGene:CDS geneid
# augustusGene:CDS 1.267%, geneid 1.539%, both 1.010%, cover 79.71%,
#     enrich 51.80x

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2014-07-16 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/oryCun2/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
	-tableName=cpgIslandExtUnmasked \
	    -maskedSeq=/hive/data/genomes/oryCun2/oryCun2.unmasked.2bit \
    -dbHost=hgwdev -smallClusterHub=ku -workhorse=hgwdev oryCun2) > do.log 2>&1
    # real    24m54.858s

    cat fb.oryCun2.cpgIslandExtUnmasked.txt
    # 63694762 bases of 2604023284 (2.446%) in intersection

#############################################################################
# cpgIslands - (DONE - 2014-07-15 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/cpgIslands
    cd /hive/data/genomes/oryCun2/bed/cpgIslands
    time (doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
     -dbHost=hgwdev -smallClusterHub=ku -workhorse=hgwdev oryCun2) > do.log 2>&1
    # real    3m31.486s

    cat fb.oryCun2.cpgIslandExt.txt
    # 47233011 bases of 2604023284 (1.814%) in intersection

##############################################################################
