# for emacs: -*- mode: sh; -*-

# This file describes browser build for the canFam3
#	Canis lupus familiaris genome: Nov. 2011

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX00
#	7X coverage via a variety of methods

#	http://www.ncbi.nlm.nih.gov/genome/85
#	http://www.ncbi.nlm.nih.gov/bioproject/13179
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX00

#       http://www.ncbi.nlm.nih.gov/genome/assembly/317138/
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX03


#############################################################################
# Fetch sequence from genbank (DONE - 2012-01-04 - Hiram)

    mkdir -p /hive/data/genomes/canFam3/genbank
    cd /hive/data/genomes/canFam3/genbank

    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Canis_lupus/CanFam3.1/*"
    #	Downloaded: 293 files, 1.8G in 24m 51s (1.25 MB/s)

    # measure sequence to be used here
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
	Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz
    #	2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0
    #	lower) in 3267 sequences in 40 files
    #	%0.00 masked total, %0.00 masked real

#############################################################################
# process into UCSC naming scheme (DONE - 2012-01-05 - Hiram)
    mkdir /hive/data/genomes/canFam3/ucsc
    cd /hive/data/genomes/canFam3/ucsc

    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;

open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close (FH);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz";
    open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            $line =~ s/^$acc/chr${chrN}/;
            print UC $line;
        }
    }
    close (FH);
    close (UC);
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
    open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            printf UC ">chr${chrN}\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    ./toUcsc.pl
    ./unplaced.pl

    gzip *.fa *.agp

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa
# 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0 lower)
#	in 3267 sequences in 40 files
# %0.00 masked total, %0.00 masked real
# 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0

#############################################################################
#   Initial browser build (DONE - 2012-01-05 - Hiram)
    cd /hive/data/genomes/canFam3
    cat << '_EOF_' > canFam3.config.ra
# Config parameters for makeGenomeDb.pl:
db canFam3
clade vertebrate
genomeCladePriority 20
scientificName Canis lupus familiaris
commonName Dog
assemblyDate Sep. 2011
assemblyLabel Broad CanFam3.1 (GCA_000002285.2)
assemblyShortLabel Broad CanFam3.1
orderKey 226
mitoAcc NC_002008
fastaFiles /hive/data/genomes/canFam3/ucsc/*.fa.gz
agpFiles /hive/data/genomes/canFam3/ucsc/*.agp.gz
dbDbSpeciesDir dog
taxId   9615
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp canFam3.config.ra > agp.log 2>&1
    # less than two minutes
    # check the end of agp.log to verify it is OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \
	-continue=db canFam3.config.ra > db.log 2>&1
    #	real    18m17.938s
    # verify the end of db.log indicates success

#############################################################################
# running repeat masker (DONE - 2012-01-05 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/repeatMasker
    cd /hive/data/genomes/canFam3/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk canFam3 > do.log 2>&1 &
    #	real    366m34.586s
    cat faSize.rmsk.txt
    #	2410976875 bases (18261639 N's 2392715236 real 1364929603 upper
    #	1027785633 lower) in 3268 sequences in 1 files
    #	%42.63 masked total, %42.95 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps canFam3 rmsk
    #	1027963174 bases of 2410976875 (42.637%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-01-05 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/simpleRepeat
    cd /hive/data/genomes/canFam3/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	canFam3 > do.log 2>&1 &
    #	real    98m15.932s

    cat fb.simpleRepeat
    #	65887025 bases of 2402709449 (2.742%) in intersection

    # adding RepeatMasker to get completed masked sequence
    cd /hive/data/genomes/canFam3
    twoBitMask canFam3.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed canFam3.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa canFam3.2bit stdout | faSize stdin > faSize.canFam3.2bit.txt
    cat faSize.canFam3.2bit.txt
    #	2410976875 bases (18261639 N's 2392715236 real 1363595724 upper
    #	1029119512 lower) in 3268 sequences in 1 files
    #	%42.68 masked total, %43.01 masked real

    #	reset the symlink:
    rm /gbdb/canFam3/canFam3.2bit
    ln -s `pwd`/canFam3.2bit /gbdb/canFam3/canFam3.2bit

    #	what would it be like to include WindowMasker:
    zcat bed/windowMasker/cleanWMask.bed.gz \
	| twoBitMask -type=.bed -add canFam3.2bit stdin canFam3.wm.trf.rmsk.2bit
    twoBitToFa canFam3.wm.trf.rmsk.2bit stdout | faSize stdin
    #	2410976875 bases (18261639 N's 2392715236 real 1119889172 upper
    #	1272826064 lower) in 3268 sequences in 1 files
    #	%52.79 masked total, %53.20 masked real
    # WM would add 243 million bases masked: 1272826064-1029119512 = 243706552

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-01-05 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/gap
    cd /hive/data/genomes/canFam3/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../canFam3.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m35.232s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits canFam3 -not gap -bed=notGap.bed
    #	2402709449 bases of 2402709449 (100.000%) in intersection
    #	real    0m18.120s

    time featureBits canFam3 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	9994213 bases of 2402709449 (0.416%) in intersection
    #	real    0m32.175s

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" canFam3 | sort -n | tail -1
    #	94
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" canFam3 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    wc -l other.bed
    #	19473
    featureBits -countGaps canFam3 other.bed
    #	9994213 bases of 2410976875 (0.415%) in intersection
    # verify no overlap:
    time featureBits -countGaps canFam3 gap other.bed
    #	0 bases of 2410976875 (0.000%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad canFam3 otherGap other.bed
    #	Loaded 19473 elements of size 8
    #	real    0m29.669s

    # verify no errors before adding to the table:
    gapToLift -minGap=1 canFam3 nonBridged.before.lift \
	-bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1
    # check for warnings in before.gapToLift.txt, should be empty:
    #	-rw-rw-r-- 1       0 Jan  6 08:17 before.gapToLift.txt
    #	starting with this many:
    hgsql -e "select count(*) from gap;" canFam3
    #	4403
    hgsql canFam3 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" canFam3
    #	23876
    # == 4403 + 19473
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 canFam3 nonBridged.lift -bedFile=nonBridged.bed
    # there should be no errors or other output, checked bridged gaps:
    hgsql -N -e "select bridge from gap;" canFam3 | sort | uniq -c
    #	80 no
    #	23796 yes

##########################################################################
## WINDOWMASKER (DONE - 2011-09-08 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/windowMasker
    cd /hive/data/genomes/canFam3/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev canFam3 > do.log 2>&1 &
    #	real    135m51.510s

    # Masking statistics
    twoBitToFa canFam3.wmsk.2bit stdout | faSize stdin
    #	2410976875 bases (18261639 N's 2392715236 real 1597393388 upper
    #	795321848 lower) in 3268 sequences in 1 files
    #	%32.99 masked total, %33.24 masked real

    twoBitToFa canFam3.wmsk.sdust.2bit stdout | faSize stdin
    #	2410976875 bases (18261639 N's 2392715236 real 1582594665 upper
    #	810120571 lower) in 3268 sequences in 1 files
    #	%33.60 masked total, %33.86 masked real

    hgLoadBed canFam3 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 13647111 elements of size 3

    featureBits -countGaps canFam3 windowmaskerSdust
    #	828382210 bases of 2410976875 (34.359%) in intersection

    #	eliminate the gaps from the masking
    featureBits canFam3 -not gap -bed=notGap.bed
    #	2392715236 bases of 2392715236 (100.000%) in intersection
    time nice -n +19 featureBits canFam3 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	810120571 bases of 2392715236 (33.858%) in intersection
    #	real    2m1.844s
    #	reload track to get it clean
    hgLoadBed canFam3 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 13644959 elements of size 4
    time featureBits -countGaps canFam3 windowmaskerSdust
    #	810120571 bases of 2410976875 (33.601%) in intersection
    #	real    1m19.909s

    # how much overlap with repeat masker:
    featureBits -countGaps canFam3 rmsk windowmaskerSdust
    #	565424408 bases of 2410976875 (23.452%) in intersection

    #	mask sequence with this clean result
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../canFam3.unmasked.2bit stdin \
	    -type=.bed canFam3.cleanWMSdust.2bit
    twoBitToFa canFam3.cleanWMSdust.2bit stdout | faSize stdin \
        > canFam3.cleanWMSdust.faSize.txt
    cat canFam3.cleanWMSdust.faSize.txt
    #	2410976875 bases (18261639 N's 2392715236 real 1582594665 upper
    #	810120571 lower) in 3268 sequences in 1 files
    #	%33.60 masked total, %33.86 masked real

#########################################################################
# NOT - MASK SEQUENCE WITH WM+TRF (NOT - 2012-01-06 - Hiram)
    #	not masking this genome with WM+TRF since RepeatMasker has
    #	masked enough
    cd /hive/data/genomes/canFam3
    twoBitMask -add bed/windowMasker/canFam3.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed canFam3.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa canFam3.2bit stdout | faSize stdin > faSize.canFam3.txt
    cat faSize.canFam3.txt

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/canFam3/canFam3.2bit
    ln -s `pwd`/canFam3.2bit /gbdb/canFam3/canFam3.2bit

    #	what happens with all masks:
    twoBitMask -add canFam3.2bit bed/repeatMasker/canFam3.sorted.fa.out \
	canFam3.wm.trf.rmsk.2bit
    twoBitToFa canFam3.wm.trf.rmsk.2bit stdout | faSize stdin

########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/cpgIslands
    cd /hive/data/genomes/canFam3/bed/cpgIslands
    time doCpgIslands.pl canFam3 > do.log 2>&1
    #   real    5m8.626s

    cat fb.canFam3.cpgIslandExt.txt
    #   39980778 bases of 2392715236 (1.671%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/genscan
    cd /hive/data/genomes/canFam3/bed/genscan
    time doGenscan.pl canFam3 > do.log 2>&1
    #   real    56m58.015s

    cat fb.canFam3.genscan.txt
    #   56089579 bases of 2392715236 (2.344%) in intersection
    cat fb.canFam3.genscanSubopt.txt
    #   49232090 bases of 2392715236 (2.058%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-01 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2392715236 / 2897316137 \) \* 1024
    #	( 2392715236 / 2897316137 ) * 1024 = 845.658632

    # round up to 900  (canFam2 used 852)

    cd /hive/data/genomes/canFam3
    time blat canFam3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/canFam3.11.ooc -repMatch=900
    #   Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc
    #	real    1m11.629s

    # there are a few non-bridged gaps, make lift file for genbank
    hgsql -N -e "select bridge from gap;" canFam3 | sort | uniq -c
    #   80 no
    #   23796 yes
    cd /hive/data/genomes/canFam3/jkStuff
    gapToLift canFam3 canFam3.nonBridged.lift -bedFile=canFam3.nonBridged.bed
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' canFam3.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-02-08 - Hiram)
    # examine the file:
    /cluster/data/genomes/genbank/data/genomes/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Mus musculus    334577  4853663 26288
    # to decide which "native" mrna or ests you want to specify in genbank.conf
    # of course, canFam3 has plenty of everything

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add canFam3 just after canFam2 and commit to GIT
# canFam3 (dog)
canFam3.serverGenome = /hive/data/genomes/canFam3/canFam3.2bit
canFam3.clusterGenome = /hive/data/genomes/canFam3/canFam3.2bit
canFam3.ooc = /hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc
canFam3.lift = /hive/data/genomes/canFam3/jkStuff/canFam3.nonBridged.lift
canFam3.align.unplacedChroms = chrUn
canFam3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
canFam3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
canFam3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
canFam3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
canFam3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
canFam3.refseq.mrna.native.load = yes
canFam3.refseq.mrna.xeno.load = yes
canFam3.genbank.mrna.xeno.load = yes
canFam3.downloadDir = canFam3
canFam3.upstreamGeneTbl = ensGene

    # end of section added to etc/genbank.conf
    git commit -m "adding definition for canFam3" genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S mm10		# long running job managed in screen
    cd /cluster/data/genbank
    # rerun this with canFam3.perChromTables = no in the genbank.conf
    #   2012-05-24,25
    time nice -n +19 ./bin/gbAlignStep -initial canFam3 &
    #	logFile: var/build/logs/2012.05.24-15:31:48.canFam3.initalign.log
    #   second time: about 32 minutes
    #   first time: real    381m29.244s

    # load data/genomesbase when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad canFam3 &
    #   logFile: var/dbload/hgwdev/logs/2012.05.25-12:52:27.dbload.log
    #   real    125m30.185s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add canFam3 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added canFam3." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
# create pushQ entry (DONE - 2012-05-23 - Hiram)
    # first make sure all.joiner is up to date and has this new organism
    # a keys check should be clean:
    cd ~/kent/src/hg/makeDb/schema
    joinerCheck -database=canFam3 -keys all.joiner

    mkdir /hive/data/genomes/canFam3/pushQ
    cd /hive/data/genomes/canFam3/pushQ
    makePushQSql.pl canFam3 > canFam3.sql 2> stderr.out
    # check stderr.out for no significant problems, it is common to see:
# WARNING: hgwdev does not have /gbdb/canFam3/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/canFam3/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/canFam3/bbi/quality.bw
# WARNING: canFam3 does not have seq
# WARNING: canFam3 does not have extFile
    # which are no real problem
    # if some tables are not identified:
# WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
# supporting and genbank tables) which tracks to assign these tables to:
#	<some table list ... >
    # put them in manually after loading the pushQ entry
    scp -p canFam3.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < canFam3.sql

#########################################################################
# LASTZ Cow BosTau7 (DONE - 2012-06-23 - Chin)
    screen -S bosTau7CanFam3
    mkdir /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23
    cd /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #   number of jobs, 50,000 to something under 100,000

    cat << '_EOF_' > DEF
# dog vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Dog canFam3
SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Cow bosTau7
SEQ2_DIR=/hive/data/genomes/bosTau7/bosTau7.2bit
SEQ2_LEN=/hive/data/genomes/bosTau7/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=2000

BASE=/hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real    1528m6.142s
    cat fb.canFam3.chainBosTau7Link.txt
    # 1381966556 bases of 2392715236 (57.757%) in intersection
    # Create link
    cd /hive/data/genomes/canFam3/bed
    ln -s  lastzBosTau7.2012-06-23 lastz.bosTau7

    #   and the swap
    mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap
    cd /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23/DEF \
        -swap -syntenicNet  \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real      121m44.022s
    cat fb.bosTau7.chainCanFam3Link.txt
    # 1456104306 bases of 2804673174 (51.917%) in intersection
    cd /hive/data/genomes/bosTau7/bed
    ln -s blastz.canFam3.swap lastz.canFam3


#########################################################################
# LASTZ Cow BosTau6 (DONE - 2012-06-24 - Chin)
    screen -S bosTau6CanFam3
    mkdir /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24
    cd /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #   number of jobs, 50,000 to something under 100,000

    cat << '_EOF_' > DEF
# dog vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Dog canFam3
SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Cow bosTau6
SEQ2_DIR=/hive/data/genomes/bosTau6/bosTau6.2bit
SEQ2_LEN=/hive/data/genomes/bosTau6/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=2000

BASE=/hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real    1392m1.959s
    cat fb.canFam3.chainBosTau6Link.txt
    # 1387159926 bases of 2392715236 (57.974%) in intersection
    # Create link
    cd /hive/data/genomes/canFam3/bed
    ln -s  lastzBosTau6.2012-06-24 lastz.bosTau6

    #   and the swap
    mkdir /hive/data/genomes/bosTau6/bed/blastz.canFam3.swap
    cd /hive/data/genomes/bosTau6/bed/blastz.canFam3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24/DEF \
        -swap -syntenicNet  \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real      119m54.003s
    cat fb.bosTau6.chainCanFam3Link.txt
    #   1399687351 bases of 2649682029 (52.825%) in intersection
    cd /hive/data/genomes/bosTau6/bed
    ln -s blastz.canFam3.swap lastz.canFam3

#########################################################################
# swap LASTZ from Human hg19 (DONE - 2012-07-04 - Hiram)
    # the original alignment
    cd /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03
    cat fb.hg19.chainCanFam3Link.txt
    #   1502192631 bases of 2897316137 (51.848%) in intersection

    #   and for the swap

    mkdir /hive/data/genomes/canFam3/bed/blastz.hg19.swap
    cd /hive/data/genomes/canFam3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    103m14.464s
    cat fb.canFam3.chainHg19Link.txt
    #   1455183825 bases of 2392715236 (60.817%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/canFam3/bed
    ln -s blastz.hg19.swap lastz.hg19

##############################################################################
# construct liftOver to canFam2 (DONE - 2012-11-27,29 - Hiram)
    screen -S canFam2	# manage this longish running job in a screen
    mkdir /hive/data/genomes/canFam3/bed/blat.canFam2.2012-11-27
    cd /hive/data/genomes/canFam3/bed/blat.canFam2.2012-11-27
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2

    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 > do.log 2>&1
    #   real    1752m51.425s
    # two jobs appear to be a problem, running manually on hgwdev in run.chain:
./job.csh part025.lst/canFam2.2bit:chrUn: chainRaw/part025.lst/canFam2.2bit:chrUn:.chain &
./job.csh part026.lst/canFam2.2bit:chrUn: chainRaw/part026.lst/canFam2.2bit:chrUn:.chain
wait
    #   real    54m24.882s
    # the problem is that those axtChain operations use a lot of memory,
    # for some reason on the kluster nodes, they just slow down and get
    #   stuck at 2 Gb
    # continuing:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
	-continue=net -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 \
        > net.log 2>&1
    # missed the output in the net.log
    #   real    117m45.220s

    # verify this file exists:
    og -L /gbdb/canFam3/liftOver/canFam3ToCanFam2.over.chain.gz
# -rw-rw-r-- 1 583607 May  5 02:16 /gbdb/canFam3/liftOver/canFam3ToCanFam2.over.chain.gz

    # and try out the conversion on genome-test from canFam3 to canFam2

############################################################################
# LASTZ Chimp PanTro4 (WORKING - 2013-04-30 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30
    cd /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30

    cat << '_EOF_' > DEF
# dog vs. chimp

# TARGET: Dog CanFam3
SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chimp PanTro4
SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-syntenicNet -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    1278m31.863s
    # no encodek available, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-continue=cat `pwd`/DEF \
	-syntenicNet -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
    #   real    11m42.544s
    # one chain job failed due to memory limits, finished manually, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-continue=chainMerge `pwd`/DEF \
	-syntenicNet -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
    #	real    114m33.002s
    cat fb.canFam3.chainPanTro4Link.txt
    #	1435182107 bases of 2392715236 (59.981%) in intersection

    #	running the swap - DONE - 2013-05-02
    mkdir /hive/data/genomes/panTro4/bed/blastz.canFam3.swap
    cd /hive/data/genomes/panTro4/bed/blastz.canFam3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30/DEF \
	-swap -syntenicNet -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    113m28.428s
    cat fb.panTro4.chainCanFam3Link.txt
    # 1490574959 bases of 2902338967 (51.358%) in intersection

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-16 - Hiram)
    mkdir /hive/data/genomes/canFam3/bed/ucscToINSDC
    cd /hive/data/genomes/canFam3/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    # verify the track link to INSDC functions

_EOF_
##############################################################################
# DBSNP B139 / SNP139 (DONE 1/24/14 angie)
    # RedMine #12490
    screen
    mkdir -p /hive/data/outside/dbSNP/139/dog
    cd /hive/data/outside/dbSNP/139/dog
    # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/
    # to find the subdir name to use as orgDir below (dog_9615 in this case).
    # Then click into that directory and look for file names like
    #    b(1[0-9][0-9])_
    # -- use the first num for build setting in config.ra
    # The buildAssembly setting in config.ra is empty because dbSNP stopped including
    # that in file names.
    cat > config.ra <<EOF
db canFam3
orgDir dog_9615
build 139
buildAssembly
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log
    # Some trial and error is always required to get the config.ra just right.
#*** You must account for all 3310 contig_acc values in config.ra,
#*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output).
#*** Check the auto-generated suggested.lft to see if it covers all
#*** 3310 contigs; if it does, add 'liftUp suggested.lft' to config.ra.
#*** Then run again with -continue=loadDbSnp .
    wc -l suggested.lft
#3310 suggested.lft
    # ok, looks like the auto-generated suggested.lft will do the trick.
    cat >> config.ra <<EOF
liftUp suggested.lft
EOF
    ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log &
    tail -f do.log
# *** All done!


##############################################################################
# FILTER SNP139 (DONE 1/24/14 angie)
    cd /hive/data/outside/dbSNP/139/dog
    zcat snp139.bed.gz \
    | ~/kent/src/hg/utils/automation/categorizeSnps.pl
#Mult:     532567
#Common:   85
#Flagged:  0
#leftover: 3041103
    # 85 SNPs that meet the threshold for Common is not enough to warrant a
    # Common SNPs track.  Add only Mult:
    foreach f ({Mult}.bed.gz)
        mv $f snp139$f
    end
    # Load tables
    foreach subset (Mult)
        hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \
          canFam3 snp139$subset -sqlTable=snp139.sql snp139$subset.bed.gz
    end


##############################################################################
# DBSNP CODING ANNOTATIONS (139) (DONE 2/27/14 angie)
    # Originally done 1/24/14, but Brian L found hgc crash caused by script bug...
    cd /hive/data/outside/dbSNP/139/dog
    # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed.
    # Note: sort -u with the keys below is too restrictive -- we need full line uniq.
    zcat ncbiFuncAnnotations.txt.gz \
    | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \
    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \
    | uniq \
      > ncbiFuncAnnotationsFixed.txt
    wc -l ncbiFuncAnnotationsFixed.txt
#49664 ncbiFuncAnnotationsFixed.txt
   # How many & what kinds of function types?
    cut -f 6 ncbiFuncAnnotationsFixed.txt \
    | sort -n | uniq -c
#  12275 3   (coding-synon)
#  24817 8   (cds-reference)
#     77 41  (nonsense)
#   9018 42  (missense)
#     11 43  (stop-loss)
#   3461 44  (frameshift)
#      5 45  (cds-indel)

    # 2/27/14: Re-ran from here after fixing bug in script:
    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
    ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \
    | liftUp snp139CodingDbSnp.bed suggested.lft warn stdin
    hgLoadBed canFam3 snp139CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
      snp139CodingDbSnp.bed
#Read 24824 elements of size 11 from snp139CodingDbSnp.bed


##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/canFam3/bed/geneid
    cd /hive/data/genomes/canFam3/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/C.familiaris/canFam3/geneid_v1.4/00README
    wget --timestamping \
http://genome.crg.es/genepredictions/C.familiaris/canFam3/geneid_v1.4/canFam3.geneid.gtf

    ldHgGene -gtf -genePredExt canFam3 geneid canFam3.geneid.gtf
    # Read 32342 transcripts in 261967 lines in 1 files
    #  32342 groups 1737 seqs 1 sources 3 feature types
    # 32342 gene predictions

    featureBits -enrichment canFam3 augustusGene:CDS geneid
# augustusGene:CDS 1.360%, geneid 1.501%, both 1.082%, cover 79.59%,
#     enrich 53.04x

##########################################################################
# LASTZ chicken galGal4 (DONE - 2015-09-25 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S canFam3GalGal4
    mkdir /hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25
    cd /hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25

    printf "%s\n" \
'# dog vs chicken
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz

# TARGET: dog CanFam3
SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: chicken GalGal4
SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25
TMPDIR=/dev/shm' > DEF

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1
    #	real    247m1.571s

    cat fb.canFam3.chainGalGal4Link.txt
    #	73335538 bases of 2392715236 (3.065%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` canFam3 galGal4) > rbest.log 2>&1 &
    # lost the end of rbest.log due to hive difficulties

    mkdir /hive/data/genomes/galGal4/bed/blastz.canFam3.swap
    cd /hive/data/genomes/galGal4/bed/blastz.canFam3.swap
    time (doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1
    # real    6m50.282s

    cat fb.galGal4.chainCanFam3Link.txt
    #	65384958 bases of 1032854810 (6.331%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/galGal4/bed
    ln -s blastz.canFam3.swap lastz.canFam3

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` galGal4 canFam3) \
      > rbest.log 2>&1
    # continuing after hive difficulties and finishing the recipBest step
    # manually
    time (doRecipBest.pl -workhorse=hgwdev -continue=download -buildDir=`pwd` \
     galGal4 canFam3) > download.log 2>&1
    # real    0m4.400s

##############################################################################
# LASTZ frog xenTro9 (DONE - 2017-04-05 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S canFam3XenTro9
    mkdir /hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05
    cd /hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05

    printf '# dog vs frog
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz

# TARGET: dog CanFam3
SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: frog XenTro9
SEQ2_DIR=/hive/data/genomes/xenTro9/xenTro9.2bit
SEQ2_LEN=/hive/data/genomes/xenTro9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1
    #	real    661m0.702s

    # ku difficulties due to /dev/shm/ being full, continuing:
    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -continue=cat -chainMinScore=5000 -chainLinearGap=loose) > cat.log 2>&1
    # real    14m3.596s

    # chain trouble, continuing:
    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose) > chainMerge.log 2>&1
    #	real    11m28.081s

    cat fb.canFam3.chainXenTro9Link.txt
    #	73335538 bases of 2392715236 (3.065%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` canFam3 xenTro9) \
       > rbest.log 2>&1 &
    # real    134m47.527s

    # and the swap

    mkdir /hive/data/genomes/xenTro9/bed/blastz.canFam3.swap
    cd /hive/data/genomes/xenTro9/bed/blastz.canFam3.swap
    time (doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	-chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1
    # real    11m49.432s

    cat fb.xenTro9.chainCanFam3Link.txt
    #	45357837 bases of 1369865365 (3.311%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` xenTro9 canFam3) \
      > rbest.log 2>&1
    # real    131m41.477s

##############################################################################
# LIFTOVER TO canFam6 (DONE - 2021-05-17 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/canFam3/bed/blat.canFam6.2021-05-17
    cd /hive/data/genomes/canFam3/bed/blat.canFam6.2021-05-17
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
         canFam3 canFam6
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
         canFam3 canFam6) > doLiftOverToCanFam6.log 2>&1
    # real    364m22.481s

    # see if the liftOver menus function in the browser from canFam3 to canFam6

##############################################################################
# LIFTOVER TO canFam5 (DONE - 2020-07-28 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/canFam3/bed/blat.canFam5.2020-07-28
    cd /hive/data/genomes/canFam3/bed/blat.canFam5.2020-07-28
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
         canFam3 canFam5
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
         canFam3 canFam5) > doLiftOverToCanFam5.log 2>&1
    # real    316m50.048s

    # see if the liftOver menus function in the browser from canFam3 to canFam5

##############################################################################
# LIFTOVER TO canFam4 (DONE - 2020-04-02 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/canFam3/bed/blat.canFam4.2020-04-02
    cd /hive/data/genomes/canFam3/bed/blat.canFam4.2020-04-02
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
         canFam3 canFam4
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \
         canFam3 canFam4) > doLiftOverToCanFam4.log 2>&1
    # real    1264m58.309s

    # see if the liftOver menus function in the browser from canFam3 to canFam4

##############################################################################
# LASTZ German shepard canFam4 (DONE - 2020-05-11 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen -S canFam3CanFam4
    mkdir /hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11
    cd /hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11

    printf '# boxer vs german shepard
BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
BLASTZ_M=254

# TARGET: boxer Tasha/canFam3
SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: German shepard Mischka/canFam4
SEQ2_DIR=/hive/data/genomes/canFam4/canFam4.2bit
SEQ2_LEN=/hive/data/genomes/canFam4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1

    cat fb.canFam3.chainCanFam4Link.txt
    #	2362925930 bases of 2392715236 (98.755%) in intersection

    cat fb.canFam3.chainSynCanFam4Link.txt
    #   2353029988 bases of 2392715236 (98.341%) in intersection

    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
	canFam3 canFam4) > rbest.log 2>&1 &
    # 2318530060 bases of 2392715236 (96.900%) in intersection
    # real    42m53.973s

    # and the swap

    mkdir /hive/data/genomes/canFam4/bed/blastz.canFam3.swap
    cd /hive/data/genomes/canFam4/bed/blastz.canFam3.swap
    time (doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
	-chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1
    # real    237m57.356s

    cat fb.canFam4.chainCanFam3Link.txt
    #	2441363671 bases of 2481941580 (98.365%) in intersection

    cat fb.canFam4.chainSynCanFam3Link.txt
    #   2422035026 bases of 2481941580 (97.586%) in intersection

    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
	canFam4 canFam3) > rbest.log 2>&1
    # real    46m50.771s

    cat fb.canFam4.chainRBest.CanFam3.txt
    # 2326711360 bases of 2481941580 (93.746%) in intersection


##############################################################################
# canFam3 EVA SNPs (European Variation Archive) (DONE 2022-02-27 - b0b)

# dbSNP no longer does non-human variants
# new EVA release yesterday !
# canFam3.1 = GCA_000002285.2

# get files (EVA Release 3)
cd /hive/data/outside/dogSnps/release3 
wget https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz

# what is .csi file???? 
# https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz.csi

# previous:
#  5655125 evaRsIDs
#  same number in this release

# from their pages:
# http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/README_rs_ids_counts.txt
#    Unique RS ID counts
#    GCA_000002285.2_current_ids.vcf.gz	5599185
#
# fewer due to multiple mapping of rsIDs

gunzip GCA_000002285.2_current_ids.vcf.gz

# fix chromNames chr01 > chr1
cat GCA_000002285.2_current_ids.vcf | sed "s/chr0/chr/" > dogSNPs.vcf
bgzip dogSNPs.vcf

# vcfToBed requires a tabix
tabix -p vcf dogSNPs.vcf.gz

# make a bed file with 3 useful flags
vcfToBed dogSNPs.vcf.gz dogSNPs3Flags -fields=VC,SID,RS_VALIDATED
# VC=Variant Class, SID=Submitter ID,  RS_VALIDATED=validated from dbSNP 

wc -l *bed
# 5655126 dogSNPs3Flags.bed
wc -l *vcf
# 5655174 GCA_000002285.2_current_ids.vcf

# sort file
bedSort dogSNPs3Flags.bed dogSNPs3Flags.bed

head -2 dogSNPs3Flags.bed
#chrom  chromStart      chromEnd        name    score   strand  thickStart      thickEnd        itemRgb ref     alt     FILTER  VC      SID     RS_VALIDATED
#chr1    111     112     rs850979046     0       .       111     112     0,0,0   A       G       .       SO:0001483      BROAD_VGB_CANINE_PON_SNP_DISCOVERY

# drop unnecessary fields:  thickStart      thickEnd        itemRgb 
cat dogSNPs3Flags.bed \
  | awk -F"\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$10"\t"$11"\t"$13"\t"$14"\t"$15}' \
  > evaSnps1.bed

head evaSnps1.bed
#chrom  chromStart      chromEnd        name    score   strand  ref     alt     VC      SID     RS_VALIDATED
# chr1    111     112     rs850979046     0       .       A       G       SO:0001483      BROAD_VGB_CANINE_PON_SNP_DISCOVERY

# add "No" to last column where needed (RS_VALIDATED)
cat evaSnps1.bed \
  | awk '{if ($11 != "Yes") { print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\tNo"; } \
              else          { print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11; }}' \
   > evaSnps2.bed 
   
head evaSnps2.bed
#chrom  chromStart      chromEnd        name    score   strand  ref     alt     VC      SID     No
# chr1    111     112     rs850979046     0       .       A       G       SO:0001483      BROAD_VGB_CANINE_PON_SNP_DISCOVERY      No

cat evaSnps2.bed | awk -F"\t" '{print $11}' | sort | uniq -c
# 4644128 No
# 1010998 Yes

# replace Sequence Ontology  "SO:" IDs with actual snp types
cat evaSnps2.bed \
  | sed "s/SO:0001483/substitution/" \
  | sed "s/SO:0000159/deletion/" \
  | sed "s/SO:0000667/insertion/" \
  | sed "s/SO:0000705/tandemRepeat/" \
  | sed "s/SO:0002007/multipleNucleotideSubstitution/" \
  | sed "s/SO:1000032/delins/" > evaSnps3.bed

cat evaSnps3.bed | awk '{print $9}' | sort | uniq -c | sort -nr
# 4713243 substitution
#  474379 deletion
#  467490 insertion
#       7 tandemRepeat
#       5 delins
#       1 multipleNucleotideSubstitution
#       1 VC

# still has header row (VC)

head evaSnps3.bed
#chrom  chromStart      chromEnd        name    score   strand  ref     alt     VC      SID     No
# chr1    111     112     rs850979046     0       .       A       G       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY      No

# make bigBed
bedToBigBed -tab -as=../evaSnp.as -type=bed6+5 -extraIndex=name evaSnps3.bed ../chromInfo.txt evaSnp.bb

# error msg:
# evaSnpsr.bed is not sorted at line 115.  Please use "sort -k1,1 -k2,2n" or bedSort and try again.
# ?? but it =looks= sorted there:

# j114 chr1    6189    6189    rs851043138     0       .       T       TAG     insertion       BROAD_VGB_CANINE_PON_SNP_DISCOVERY      No
# 115 chr1    6188    6189    rs852166058     0       .       T       G       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY      No
# 116 chr1    6193    6194    rs851187136     0       .       G       A       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY      No

bedSort evaSnps3.bed evaSnps4.bed

# trackDb/ra entry:
cd kent/src/hg/makeDb/trackDb/dog/canFam3/trackDb.ra

# track evaSnp
# shortLabel EVA SNP Release 3
# longLabel Short Genetic Variants from European Variant Archive Release 3
# type bigBed 6 +
# group varRep
# visibility pack
# bigDataUrl /gbdb/canFam3/bbi/evaSnp.bb
# url https://www.ebi.ac.uk/eva/?variant&accessionID=$$

# adding search in trackDb.ra (thanks jonathan):
# added the following to dog/canFam3/trackDb.ra

# searchTable evaSnp
# searchType bigBed
# searchMethod exact
# padding 50

# search works:

bedToBigBed -tab -as=evaSnp.as -type=bed6+5 -extraIndex=name evaSnps4.bed chromInfo.txt evaSnp.bb
# pass1 - making usageList (39 chroms): 1815 millis
# pass2 - checking and writing primary data (5655125 records, 11 fields): 19207 millis
# Sorting and writing extra index 0: 4299 millis

bigBedInfo evaSnp.bb
# version: 4
# fieldCount: 11
# hasHeaderExtension: yes
# isCompressed: yes
# isSwapped: 0
# extraIndexCount: 1
# itemCount: 5,655,125
# primaryDataSize: 78,986,590
# primaryIndexSize: 365,236
# zoomLevels: 10
# chromCount: 39
# basesCovered: 6,322,774
# meanDepth (of bases covered): 1.025209
# minDepth: 1.000000
# maxDepth: 14.000000
# std of depth: 0.217172

# copy to active /gbdb directory for track
cp evaSnp.bb /cluster/data/canFam3/bed/evaSnp/evaSnp.bb

# make gbdb symlink
cd /gbdb/canFam3/bbi
ln -s /cluster/data/canFam3/bed/evaSnp/evaSnp.bb evaSnp.bb

# submitters
cat evaSnps4.bed | awk '{print $10}' | sed -e "s/,/\n/g" | sort | uniq -c | sort -r
3492428 BROAD_VGB_CANINE_PON_SNP_DISCOVERY
2458565 BROAD_DBSNP.2005.2.4.16.57
 697053 TIGR_1.0
 234166 BROAD_VGB_LYMPHOMA_SNP_DISCOVERY
   4202 DOG_GEN_LAB_1032011
     94 AMOUCD_MAL_20+TERV_1
...

cat evaSnps4.bed | awk '{print $10}' | sed -e "s/,/\n/g" | sort | uniq -c | sort -r
# 52

# 52 different submitters.  most from a few places, mostly the Broad

##############################################################################
# canFam3 EVA SNPs (European Variation Archive) (DONE 2022-03-21 - b0b)
# NOTE:  superceded by Lou's pipeline for consistency.
#   doc'd for multiple assemblies at: evaSnp3.txt
#
# adding VAI annotations
# redoing the track after talking with Lou and standardinzing on data fields after VAI step

# canFam3.1 = GCA_000002285.2

# get files (EVA Release 3)
cd /hive/data/outside/dogSnps/release3 
wget https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz

# what is .csi file???? 
# https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz.csi

# previous:
#  5655125 evaRsIDs
#  same number in this release

# from their pages:
# http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/README_rs_ids_counts.txt
#    Unique RS ID counts
#    GCA_000002285.2_current_ids.vcf.gz	5599185
#
# fewer due to multiple mapping of rsIDs

gunzip GCA_000002285.2_current_ids.vcf.gz

# fix chromNames chr01 > chr1
cat GCA_000002285.2_current_ids.vcf | sed "s/chr0/chr/" > dogSNPs.vcf
bgzip dogSNPs.vcf

# vcfToBed requires a tabix
tabix -p vcf dogSNPs.vcf.gz

# make a bed file with 2 useful flags to match fields agreed w Lou (for his mm39)
vcfToBed dogSNPs.vcf.gz dogSNPs2Flags -fields=VC,SID

# VC=Variant Class, SID=Submitter ID

wc -l dogSNPs2F*
# 5655126 dogSNPs2Flags.bed

# from earlier:
zcat GC*vcf.gz | wc -l
# 5655174
# a few extra rows (48) in vcf due to header lines
zcat GC*vcf.gz | grep "^#" | wc -l
# 49 
# accounts for the extra lines (bed has a header line)

# sort file
sort -k1,1 -k2,2n dogSNPs2Flags.bed > dogSNPs1.bed

head dogSNPs1.bed
#chrom  chromStart      chromEnd        name    score   strand  thickStart      thickEnd        itemRgb ref     alt     FILTER  VC      SID
#chr1    111     112     rs850979046     0       .       111     112     0,0,0   A       G       .       SO:0001483      BROAD_VGB_CANINE_PON_SNP_DISCOVERY

# what is FILTER FIELD ??
cat dogSNPs1.bed | awk '{print $12}' | sort | uniq -c | sort -nr
# 5655125 .
#      1 FILTER
# will remove


#------------- ----------- ----------

# process the SO: terms
# also see stand-alone script soTest3.csh

set soTerms1=( `cat soTerms.txt | awk '{print $1}'` )
set soTerms2=( `cat soTerms.txt | awk '{print $2}'` )

# echo "bedFile = $bedFile"
# echo "soTerms1 = $soTerms1"
# echo "soTerms2 = $soTerms2"

set bedFile=dogSNPs1.bed

# find out if there are any new SO: terms in bedFile
# make list of all terms used in current file
set bedTermList=`cat $bedFile | awk '{print $13}' | sort -u | grep -v VC`

# echo "bedTermList = $bedTermList"

set newTerms=""
set newTermFlag=0
foreach term ( $bedTermList )
  # echo $term
  grep $term soTerms.txt > /dev/null
  if ( $status ) then
     set newTerms=`echo $newTerms $term`
     set newTermFlag=1
  endif
end

# echo "newTermFlag = $newTermFlag"
# echo "newTerms    = $newTerms"

# if new terms, announce and exit
if ( $newTermFlag == 1 ) then
  echo "\nfound term(s) not in soTerms.txt file:  $newTerms "
  echo "look here to add new term(s) file:"
  foreach new  ( $newTerms )
    echo "http://www.sequenceontology.org/browser/current_release/term/$new"
  end
  echo "stopping\n"
  exit
endif

# build sed file to substitute SO: terms in single pass through $bedFile
set outfile = temp$$
set numTerms=`echo $soTerms1 | wc -w`
# echo "numTerms = $numTerms" 
set i=1
set command=""
while ( $i <= $numTerms )
  set command=`echo $command "sed "\'"s/"$soTerms1[$i]"/"$soTerms2[$i]"/'"" | \\n"`
  @ i = $i + 1
end
# do something with terminal pipe (add cat)
set command=`echo $command cat`
echo $command > sedFile
chmod 700 sedFile

# use sedFile to substitute terms in big file
cat $bedFile | sedFile > $outfile
# rm -f sedFile

mv $outfile dogSNPs2.bed
cat dogSNPs2.bed | awk '{print $13}' | sort | uniq -c | sort -nr | grep -v VC

# 4713243 substitution
#  474379 deletion
#  467490 insertion
#       7 tandem_repeat
#       5 delins
#       1 multiple_nucleotide_substitution

head -2 dogSNPs2.bed

#chrom  chromStart chromEnd   name      score strand    thickStart      thickEnd        itemRgb ref     alt     FILTER  VC      SID
#chr1    111     112  rs850979046        0  .  111 112   0,0,0   A       G       .       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY

# compare vcf with new bed file
zcat GCA*vcf*.gz | grep -v "^#" | wc -l
# 5655125
zcat dog*gz | grep -v "^#" | wc -l
# 5655125

# diff = changed chr0 -> chr in the dog*vcf.gz


#------------- ----------- ----------

# get VAI info for the variants
# break up into 1M chunks

zcat dogSNPs.vcf.gz | grep "^#" > top.vcf
  
zcat dogSNPs.vcf.gz | grep -v "^#" | head -1000000 > dog.1.vcf 
zcat dogSNPs.vcf.gz | grep -v "^#" | head -2000000 | tail -1000000 > dog.2.vcf 
zcat dogSNPs.vcf.gz | grep -v "^#" | head -3000000 | tail -1000000 > dog.3.vcf 
zcat dogSNPs.vcf.gz | grep -v "^#" | head -4000000 | tail -1000000 > dog.4.vcf 
zcat dogSNPs.vcf.gz | grep -v "^#" | head -5000000 | tail -1000000 > dog.5.vcf 
zcat dogSNPs.vcf.gz | grep -v "^#" | tail -655125  > dog.6.vcf 

# confirm count was right and no rows missing in last sub-million file
# top record in .6.vcf:
zcat dogSNPs.vcf.gz | grep -a3 -b3  rs23801983
# finds: rs852500901,  bottom record in .5M.vcf as it should

# sort and add header section to all:
foreach file ( 1 2 3 4 5 6 )
  cat top.vcf dog.$file.vcf > dog.${file}M.vcf
  rm dog.$file.vcf
end

# sort, add header section to all, zip and tabix and run VAI:
foreach file ( 1 2 3 4 5 6 )
  echo dog.$file.vcf
  sort -k1,1 -k2,2n dog.$file.vcf > dog.$file.sorted.vcf
  cat top.vcf dog.$file.sorted.vcf > dog.${file}M.vcf
  bgzip dog.${file}M.vcf
  tabix -p vcf dog.${file}M.vcf.gz
  nohup vai.pl --genetrack=refGene --variantLimit=2000000 \
    --hgvsG=off --hgvsCN=off --hgvsP=off \         
    canFam3  dog.${file}M.vcf.gz > dogVAI.${file}M.tab
  rm dog.$file.vcf
  rm dog.$file.sorted.vcf
end

# dog.1.vcf
# dog.2.vcf
# Bad start cookie 58553421 freeing 1ad01458
# dog.3.vcf
# dog.4.vcf
# dog.5.vcf
# dog.6.vcf

# ???
# earlier runs complained about sorting chr9 and 10, but that stopped
# that could explain why chr9 are missing?

# check chroms
foreach Mfile ( 1M 2M 3M 4M 5M 6M )
  echo dogVAI.$Mfile.tab
  echo "total = `cat dogVAI.$Mfile.tab | grep -v "^#" | grep -v Variation | wc -l`"
  cat dogVAI.$Mfile.tab | grep -v "^#" | grep -v Variation \
   | awk '{print $2}' | awk -F":" '{print $1}' | sed "s/chr//" | uniq -c 
  echo
end

# num chrom
#
# dogVAI.1M.tab
# total = 1003545
#  278257 1
#  201889 2
#  221767 3
#  197571 4
#  104061 5
# 
# dogVAI.2M.tab
# total = 667077
#  153949 10
#  125894 5
#  195404 6
#  183370 7
#    8460 8
# 
# dogVAI.3M.tab
# total = 1003876
#   10994 10
#  155516 11
#  171019 12
#  158827 13
#  129042 14
#  148097 15
#  164806 16
#   65575 17
# 
# dogVAI.4M.tab
# total = 1004320
#   96116 17
#  155747 18
#  128985 19
#  142248 20
#  137296 21
#  133613 22
#  133426 23
#   76889 24
# 
# dogVAI.5M.tab
# total = 1003316
#   62388 24
#  142397 25
#  132475 26
#  125810 27
#  118120 28
#   99798 29
#  108964 30
#  115047 31
#   96547 32
#    1770 33
# 
# dogVAI.6M.tab
# total = 656521
#   81867 33
#  122826 34
#   87030 35
#   73781 36
#   86617 37
#   77114 38
#  127286 X

# .2M. has too few (600K) 
##   dogVAI.2M.tab
##   total = 667077
# and is missing chr8 (some), 9 and 10 (some?)

# check total number of variants by chrom on chroms in .1M., .2M. and .3M.
foreach chrom ( 5 6 7 8 9 10 )
  echo "chr$chrom  `zcat dogSNPs.vcf | grep chr$chrom | wc -l`"
  echo "      `cat dogVAI.1M.tab dogVAI.2M.tab dogVAI.3M.tab | grep chr$chrom | wc -l`"
end 

# chrN vcf
#      vai

# chr5  227782
#       229955
# chr6  195105
#       195404
# chr7  182655
#       183370
# chr8  176807
#       8460
# chr9  166162
#       0
# chr10  164704
#       164943

# so it's only chr8 and chr9 missing some

# pull out just those two chroms and process each separately
foreach chrom ( 8 9 )
  echo "chr$chrom"
  zcat dogSNPs.vcf | grep -v "^#" | grep chr$chrom > dog.chr$chrom.vcf
end

wc -l dog.chr*
# 176806 dog.chr8.vcf
# 166161 dog.chr9.vcf

# rerun sort, zip, tabix vai on chr8 and 9
foreach chrom ( chr8 chr9 )
  echo dog.$chrom.vcf
  sort -k1,1 -k2,2n dog.$chrom.vcf > dog.$chrom.sorted.vcf
  cat top.vcf dog.$chrom.sorted.vcf > dog.$chrom.vcf
  bgzip dog.$chrom.vcf
  tabix -p vcf dog.$chrom.vcf.gz
  nohup vai.pl --genetrack=refGene --variantLimit=2000000 \
    --hgvsG=off --hgvsCN=off --hgvsP=off \         
    canFam3  dog.$chrom.vcf.gz > dogVAI.$chrom.tab
  rm dog.$chrom.sorted.vcf
end

# compare original dogSNPs.vcf number with result of the chr-specific files
foreach chrom ( 8 9 )
  echo "chr$chrom  `zcat dogSNPs.vcf | grep chr$chrom | wc -l`"
  echo "      `cat dogVAI.chr8.tab dogVAI.chr9.tab | grep chr$chrom | wc -l`"
end 

# this looks good:
# 
# chr8  176807
#       178750
# chr9  166162
#       166773

# now need to trim out chr8,9 from .2M. to avoid dups

og *tab
# -rw-rw-r-- 1 69411279 Mar 15 14:57 dogVAI.1M.tab
# -rw-rw-r-- 1 46249858 Mar 15 14:57 dogVAI.2M.tab
# -rw-rw-r-- 1 69950179 Mar 15 14:58 dogVAI.3M.tab
# -rw-rw-r-- 1 70175430 Mar 15 14:58 dogVAI.4M.tab
# -rw-rw-r-- 1 70183534 Mar 15 14:58 dogVAI.5M.tab
# -rw-rw-r-- 1 45362411 Mar 15 14:59 dogVAI.6M.tab
# -rw-rw-r-- 1 12316589 Mar 15 15:46 dogVAI.chr8.tab
# -rw-rw-r-- 1 11640462 Mar 15 15:46 dogVAI.chr9.tab

# remove chr8,9 from .2M.
egrep -v "chr8|chr9" dogVAI.2M.tab > dogVAI.2M.trim.tab
mv dogVAI.2M.tab dogVAI.2M.tab.old 

# these should have everything:

og *VAI*tab
-rw-rw-r-- 1 69411279 Mar 15 14:57 dogVAI.1M.tab
-rw-rw-r-- 1 69950179 Mar 15 14:58 dogVAI.3M.tab
-rw-rw-r-- 1 70175430 Mar 15 14:58 dogVAI.4M.tab
-rw-rw-r-- 1 70183534 Mar 15 14:58 dogVAI.5M.tab
-rw-rw-r-- 1 45362411 Mar 15 14:59 dogVAI.6M.tab
-rw-rw-r-- 1 12316589 Mar 15 15:46 dogVAI.chr8.tab
-rw-rw-r-- 1 11640462 Mar 15 15:46 dogVAI.chr9.tab
-rw-rw-r-- 1 45672614 Mar 16 17:56 dogVAI.2M.trim.tab


# cat *VAI*tab | egrep -v "#|Variation" | awk '{print $2}' | awk -F":" '{print $1}' | sort | uniq -c | sort -k2,2
# counts all chroms, but not very useful right here

cat *VAI*tab | egrep -v "#|Variation" | wc -l
# 5675716
# looks good

# combine all the files more or less in order and then sort on second field (chr)
cat dogVAI.1M.tab dogVAI.2M.trim.tab dogVAI.chr8.tab dogVAI.chr9.tab dogVAI.3M.tab \
  dogVAI.4M.tab dogVAI.5M.tab dogVAI.6M.tab | egrep -v "^#|Varia" | sort -k2,2n > dogVAI.tab
wc -l dogVAI.tab
# 5675716 dogVAI.tab

# which fields to keep ?
grep missense dogVAI.tab | head | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$11}'
# rs1152388402    chr1:54192143 C missense_variant  I/M
# rs1152388406    chr3:69456869 T missense_variant  C/F
# rs1152388411    chr5:42186445 G missense_variant  H/R
# rs1152388412    chr6:741429   T missense_variant  R/H

# second and third will not be in final bed but might be useful for troubleshooting

#------------- ----------- ----------

# get ready to join bed and tab (vai) files

# sort the dogSNPs2.bed on field 4
sort -k 4 dogSNPs2.bed > dogSNPs2.rsSorted.bed
head -3 dogSNPs2.rsSorted.bed
#chrom  chromStart chromEnd   name      score strand    thickStart      thickEnd       itemRgb  ref     alt     FILTER  VC      SID
#chr1    54192142   54192143   rs1152388402 0  . 54192142        54192143        0,0,0  G        C       .       substitution    LDG_DI_UGENT_BE_DOG
#chr2    58622672   58622675   rs1152388403 0  . 58622672        58622675        0,0,0  ACT      CTAGCTAC        .       delins  LDG_DI_UGENT_BE_DOG
tail -2 dogSNPs2.rsSorted.bed
#chr37   14758761   14758762   rs9256889 0  .  14758761  14758762        0,0,0   G      A        .       substitution    TIGR_1.0,BROAD_DBSNP.2005.2.4.16.57
#chr37   14760548   14760548   rs9256890 0  .  14760548  14760548        0,0,0   A      AA       .       insertion       TIGR_1.0

# sort is ok

# big join using VAI.sm (with only fields of interest) 
# and sorted on rsID
cat dogVAI.tab | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$11}' | sort > dogVAI.sm.rsSorted.tab
head -3 dogVAI.sm.rsSorted.tab
# rs1152388402    chr1:54192143 C missense_variant  I/M
# rs1152388403    chr2:58622673-58622675  CTAGCTAC  frameshift_variant    YYWAV...KEAE*(415
tail -3 dogVAI.sm.rsSorted.tab
# rs9256888       chr37:14758839 G        intergenic_variant      -
# rs9256889       chr37:14758762 A        intergenic_variant      -

# sort is ok

# join on rsID field (4 in bed file, 1 in vai file)
#   and export 

#  man join:
# -e"." flag replaces missing field with . -- can't get it to work
# -a1 flag retains items from file 1 even if unpaired
# -t"\t" flag sets delimeter as tab (never worked)
# found online
#  -e : tells what to substitute in case of missing fields. does not work without -o.

# keep 1-11,13,14 from bed (12 is emply FILTER field)
# keep 2-5 from vai (could have used full VAI and saved the shortening step)
#    could keep 2,7,11 from orig VAI
# two fields of VAI*.tab will not be in final bed but might be useful for troubleshooting
join -1 4 -2 1 -a1 -e "-" -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 \
  1.10 1.11 1.13 1.14 2.2 2.3 2.4 2.5 dogSNPs2.rsSorted.bed dogVAI.sm.rsSorted.tab \ 
  > join.full.out

# repeat with all fields included -- remove FILTER field and chrN:nnnn and vaiAllele fields later
join -1 4 -2 1 -a1 -e "-" -o auto dogSNPs2.rsSorted.bed dogVAI.sm.rsSorted.tab \
  > join.full.out


# many alleles do not match btw eva and VAI
cat join.full.out | awk '{print $11, $16}' | sort | uniq -c | sort -n
#       1 A AGGCCAAGTCCCCAGTGAAGGAGG
#       1 A GGC
#       1 A GTGAAG
# ...
#   13258 GA A
#   14152 CA A
#   14451 TC C
#   14880 TA A
#   15647 AC C
#   16785 CT T
#   17796 AG G
#   18448 TG G
#  102410 G -
#  134016 T -
#  138818 A -
#  148690 C -
# 1146149 C C
# 1148246 G G
# 1216747 T T
# 1217747 A A

# rows in that output:
cat join.full.out | awk '{print $11, $16}' | sort | uniq -c | sort -n | wc -l
# 31612
 
# will carry on - there are some diffs btw the way EVA and VAI show alleles
# make a bed file with just the fields in the evaSnp.as

cat join.full.out | awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$13"\t"$14"\t"$17"\t"$18}' \
  | sort -k1,1 -k2,2n > dogSNPs.wVAI.bed

# it did not like breaking the line with a \ in the mid of print string

head -2 dogSNPs.wVAI.bed
# chr1    111     112  rs850979046        0  .  111 112   0,0,0   A       G       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY      intergenic_variant      -
# chr1    131     132  rs851217143        0  .  131 132   0,0,0   C       A       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY      intergenic_variant      -

tail -2 dogSNPs.wVAI.bed
# chrX    123869059  123869060  rs852151786  0  . 123869059       123869060       0,0,0  G        A       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY      intergenic_variant      -
# chrX    123869065  123869066  rs850877859  0  . 123869065       123869066       0,0,0  C        A       substitution    BROAD_VGB_CANINE_PON_SNP_DISCOVERY      intergenic_variant      -

cat dogSNPs.wVAI.bed | grep miss | head -2
# chr1    13734813   13734813   rs852746064  0  . 13734813        13734813        0,0,0  C        CG      insertion       BROAD_VGB_CANINE_PON_SNP_DISCOVERY      missense_variantD/Y
# chr1    13734814   13734814   rs852366799  0  . 13734814        13734814        0,0,0  G        GC      insertion       BROAD_VGB_CANINE_PON_SNP_DISCOVERY      missense_variantD/Y

cat dogSNPs.wVAI.bed | awk '{print $(NF-1)}' | sort | uniq -c | sort -n
#       1 stop_retained_variant
#       2 stop_lost
#       5 initiator_codon_variant
#      12 coding_sequence_variant
#      13 complex_transcript_variant
#      45 splice_donor_variant
#      55 splice_acceptor_variant
#      76 stop_gained
#      80 inframe_deletion
#      83 inframe_insertion
#     194 non_coding_transcript_exon_variant
#     396 frameshift_variant
#     525 5_prime_UTR_variant
#     784 NMD_transcript_variant
#    1305 splice_region_variant
#    1349 no_sequence_alteration
#    2225 3_prime_UTR_variant
#    3386 missense_variant
#    4577 synonymous_variant
#   35932 upstream_gene_variant
#   37856 downstream_gene_variant
#  146403 intron_variant
# 5560422 intergenic_variant

#------------- ----------- ----------

# substitute itemRgb values

# get variant colors
set bedFile=dogSNPs.wVAI.bed

# build sed file to substitute colors in single pass through $bedFile
set varClass=`cat variantColors | egrep "255|128" | awk '{print $1}'`
set varColor=`cat variantColors | egrep "255|128" | awk '{print $2}'`
set numTerms=`echo $varClass | wc -w`
echo "numTerms = $numTerms" 

# iterate through var classes and get rgb color for substitution
set i=1
while ( $i <= $numTerms )
  echo "perl -pe "\'"s/(0,0,0)(.*)($varClass[$i])/$varColor[$i]\2\3/"\'" | \\" >> varsedFile
  @ i = $i + 1
end
# terminate last pipe
echo "cat" >> varsedFile

# substitute itemRgb
chmod 700 varsedFile
cat $bedFile | varsedFile > dogSNPs.fin.bed

# check substitution
cat dogSNPs.fin.bed | awk '{print $9,"\t", $14}' | sort | uniq -c | sort -nr
# 5560422 0,0,0    intergenic_variant
#  146403 0,0,0    intron_variant
#   37856 0,0,0    downstream_gene_variant
#   35932 0,0,0    upstream_gene_variant
#    4577 0,128,0          synonymous_variant
#    3386 255,0,0          missense_variant
#    2225 0,0,255          3_prime_UTR_variant
#    1349 0,0,0    no_sequence_alteration
#    1305 255,0,0          splice_region_variant
#     784 0,0,0    NMD_transcript_variant
#     525 0,0,255          5_prime_UTR_variant
#     396 255,0,0          frameshift_variant
#     194 0,0,255          non_coding_transcript_exon_variant
#      83 255,0,0          inframe_insertion
#      80 255,0,0          inframe_deletion
#      76 255,0,0          stop_gained
#      55 255,0,0          splice_acceptor_variant
#      45 255,0,0          splice_donor_variant
#      13 0,0,255          complex_transcript_variant
#      12 255,0,0          coding_sequence_variant
#       5 255,0,0          initiator_codon_variant
#       2 255,0,0          stop_lost
#       1 0,128,0          stop_retained_variant

# check concordance btw EVA and VAI classes
cat dogSNPs.fin.bed | awk '{print $12,"\t", $14}' | sort | uniq -c | sort -nr
# 4564905 substitution     intergenic_variant
#  499731 deletion         intergenic_variant
#  495782 insertion        intergenic_variant
#  115427 substitution     intron_variant
#   30498 substitution     downstream_gene_variant
#   28906 substitution     upstream_gene_variant
#   15795 deletion         intron_variant
#   15176 insertion        intron_variant
#    4571 substitution     synonymous_variant
#    3778 deletion         upstream_gene_variant
#    3713 insertion        downstream_gene_variant
#    3645 deletion         downstream_gene_variant
#    3366 substitution     missense_variant
#    3248 insertion        upstream_gene_variant
#    1682 substitution     3_prime_UTR_variant
#    1258 substitution     no_sequence_alteration
#     860 substitution     splice_region_variant
#     604 substitution     NMD_transcript_variant
#     384 substitution     5_prime_UTR_variant
#     282 insertion        3_prime_UTR_variant
#     261 deletion         3_prime_UTR_variant
#     255 deletion         splice_region_variant
#     196 insertion        frameshift_variant
#     189 insertion        splice_region_variant
#     173 deletion         frameshift_variant
#     168 substitution     non_coding_transcript_exon_variant
#     117 deletion         NMD_transcript_variant
#      73 deletion         5_prime_UTR_variant
#      70 deletion         inframe_deletion
#      63 insertion        NMD_transcript_variant
#      63 insertion        5_prime_UTR_variant
#      62 substitution     stop_gained
#      59 insertion        inframe_insertion
#      52 insertion        no_sequence_alteration
#      38 deletion         no_sequence_alteration
#      31 substitution     splice_acceptor_variant
#      27 substitution     splice_donor_variant
#      26 substitution     frameshift_variant
#      20 deletion         splice_acceptor_variant
#      14 substitution     inframe_insertion
#      14 deletion         non_coding_transcript_exon_variant
#      13 insertion        missense_variant
#      12 insertion        non_coding_transcript_exon_variant
#      12 deletion         splice_donor_variant
#      10 insertion        stop_gained
#      10 deletion         inframe_insertion
#       8 insertion        complex_transcript_variant
#       6 insertion        splice_donor_variant
#       6 insertion        inframe_deletion
#       6 insertion        coding_sequence_variant
#       6 deletion         missense_variant
#       6 deletion         coding_sequence_variant
#       5 tandem_repeat    intron_variant
#       5 delins   5_prime_UTR_variant
#       5 deletion         complex_transcript_variant
#       4 substitution     initiator_codon_variant
#       4 substitution     inframe_deletion
#       4 insertion        splice_acceptor_variant
#       4 deletion         stop_gained
#       3 tandem_repeat    intergenic_variant
#       3 insertion        synonymous_variant
#       3 deletion         synonymous_variant
#       2 substitution     stop_lost
#       1 tandem_repeat    splice_region_variant
#       1 substitution     stop_retained_variant
#       1 multiple_nucleotide_substitution         no_sequence_alteration
#       1 multiple_nucleotide_substitution         missense_variant
#       1 delins   intergenic_variant
#       1 delins   frameshift_variant
#       1 deletion         initiator_codon_varia

# intersting.  some classes seem odd:
#       3 insertion        synonymous_variant
#       3 deletion         synonymous_variant
#       6 insertion        inframe_deletion
#      10 deletion         inframe_insertion
#      13 insertion        missense_variant
#      14 substitution     inframe_insertion
#      26 substitution     frameshift_variant
#      38 deletion         no_sequence_alteration
#      52 insertion        no_sequence_alteration

#------------- ----------- ----------

# make bigBed
bedToBigBed -tab -as=evaSnp.as -type=bed9+6 -extraIndex=name dogSNPs.fin.bed chromInfo.txt evaSnp.bb
# pass1 - making usageList (39 chroms): 3107 millis
# pass2 - checking and writing primary data (5795726 records, 15 fields): 30701 millis
# Sorting and writing extra index 0: 4152 millis

bigBedInfo evaSnp.bb
# version: 4
# fieldCount: 15
# hasHeaderExtension: yes
# isCompressed: yes
# isSwapped: 0
# extraIndexCount: 1
# itemCount: 5,795,726
# primaryDataSize: 119,152,623
# primaryIndexSize: 373,584
# zoomLevels: 10
# chromCount: 39
# basesCovered: 6,374,175
# meanDepth (of bases covered): 1.066073
# minDepth: 1.000000
# maxDepth: 45.000000
# std of depth: 0.527133

# move new file to /gbdb
cp evaSnp.bb /cluster/data/canFam3/bed/evaSnp/evaSnp.bb

# trackDb/ra entry:
cd kent/src/hg/makeDb/trackDb/dog/canFam3

# add trackDb rows
# itemRgb on
# mouseOver $ucscClass $aaChange

# change trackDb rows
# type bigBed 9 + 

# change search from padding 50
# padding 20

# set filters
# list of ucscClass:
cat variantColors | grep . | awk '{print $1","}'
# rearranging to useful order (red first, then blue, then green, then black)
#
# missense_variant,
# frameshift_variant,
# inframe_deletion,
# inframe_insertion,
# initiator_codon_variant,
# stop_gained,
# stop_lost,
# splice_acceptor_variant,
# splice_donor_variant,
# splice_region_variant,
# exon_loss_variant,
# coding_sequence_variant,
# 5_prime_UTR_variant,
# 3_prime_UTR_variant,
# NMD_transcript_variant,
# synonymous_variant,
# stop_retained_variant,
# complex_transcript_variant,
# intron_variant,
# non_coding_transcript_exon_variant,
# upstream_gene_variant,
# downstream_gene_variant,
# intergenic_variant,
# no_sequence_alteration,

# add new filter lines to trackDb
# filterValues.varClass (new name for same field -- orig EVA class
# filterValues.ucscClass missense_variant,frameshift_variant,inframe_deletion,inframe_insertion,initiator_codon_variant,stop_gained,stop_lost,splice_acceptor_variant,splice_donor_variant,splice_region_variant,exon_loss_variant,coding_sequence_variant,5_prime_UTR_variant,3_prime_UTR_variant,NMD_transcript_variant,synonymous_variant,stop_retained_variant,complex_transcript_variant,non_coding_transcript_exon_variant,intron_variant,upstream_gene_variant,downstream_gene_variant,intergenic_variant,no_sequence_alteration,  

# drop old filter lines (no longer using RS_VALIDATED field
filterValues.valid No,Yes

##############################################################################
# FANTOM5 refs #21605 (2023-06-09 Gerardo)
cd /hive/data/outside/
mkdir fantom5
cd fantom5
hubClone -download https://fantom.gsc.riken.jp/5/datahub/hub.txt
cd /gbdb/canFam3
mkdir fantom5
cd fantom5
# Making symlinks for big files
for file in $(ls /hive/data/outside/fantom5/riken_f5/canFam3/*.b*) ; do ln -s $file; done
cd /hive/data/outside/fantom5/riken_f5/canFam3/
cp trackDb.txt fantom5.ra
vi fantom5.ra
# Indented subtracks
# Changing bigDataUrl
# Removing non-alpha characters
cd ~/kent/src/hg/makeDb/trackDb/human/canFam3
cp /hive/data/outside/fantom5/riken_f5/canFam3/fantom5.ra .
vi trackDb.ra
#include fantom5.ra alpha
##############################################################################
