# for emacs: -*- mode: sh; -*-

# Bos Taurus -- Baylor Release 4.2 (Apr 23 2009)
##########################################################################
# Creating minimal build in order make liftover files between Btau4.2 and Btau4
# DOWNLOAD SEQUENCE (working - 2010-05-18 - Chin)

    # set up main genome directory

    ssh hgwdev
    cd /hive/data/genomes
    mkdir -p /hive/data/genomes/bosTau4P2/baylor/Contigs
    cd /hive/data/genomes/bosTau4P2/baylor/Contigs

    for F in README.Btau20080815.txt Contigs/*
do
    wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20080815/${F}"
done
    mv README.Btau20080815.txt ../

    mkdir LinearizedChromosomes
    cd LinearizedChromosomes
    wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20080815/LinearizedChromosomes/*"



    #   Looking up chrM in Entrez nucleotide search,
    #   from bosTau3 note: NC_006853 GI:60101824
    #   search for Bos taurus mitochondrion complete genome
    #     found it, and it had been update 
    #   Note, their Contigs files are from their 2006 release.
    #   Their chrom files are new to this release.

    #   fixup the Chr case names in the agp file:
    cd /hive/data/genomes/bosTau4P2/baylor
    sed -e "s/^Chr/chr/" Contigs/Btau4.2.agp > bosTau4P2.agp

    #   fixup the contig names in the contigs.fa and qual files, utilize
    #   the following perl scripts:
    #   Change ">gi|112123023|gb|AAFC03055291.1| Bos taurus Ctg58.CH240-395G16,
    #   whole genome shotgun sequence" line 
    #    to
    #   ">AAFC03055291"


    cat << '_EOF_' > fixContigNames.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

if (1 != $argc) {
    print "usage ./fixContigNames.pl Btau20060815.contigs.fa.gz \\\n",
        " | gzip > bosTau4P2.contigs.fa.gz\n";
    exit 255;
}

my $fName = shift;

open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        my @a = split('\|', $line);
        $a[3] =~ s/\.[0-9]+//;
        print ">$a[3]\n";
    } else {
        print "$line";
    }
}
close (FH)
'_EOF_'
    # << happy emacs
    cat << '_EOF_' > fixQuals.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

if (1 != $argc) {
    print "usage ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \\\n",
        " | gzip > bosTau4P2.qual.gz\n";
    exit 255;
}

my $fName = shift;

open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        $line =~ s/\.[0-9]+.*//;
    }
    print "$line";
}
close (FH)
'_EOF_'
    # << happy emacs

    #   There are extra records in the qual and contigs fa files.
    #   Only the sequences in the agp file should be used:
    grep "^chr" bosTau4P2.agp | egrep -v "clone|fragment" | cut -f6 \
        | sort > agp.names
    #   run those scripts, extract only the used records, and lift the
    #   qual scores via the AGP file to a qac file.
    chmod +x fixContigNames.pl fixQuals.pl
XXXX 05-19
    ./fixContigNames.pl LinearizedChromosomes/Btau_4.2_chr.fa.gz(Btau20060815.contigs.fa.gz??) \
        | faSomeRecords stdin agp.names stdout | gzip > bosTau4P2.contigs.fa.gz
    ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \
        | faSomeRecords stdin agp.names stdout \
        | qaToQac stdin stdout \
        | qacAgpLift bosTau4P2.agp stdin bosTau4P2.qual.qac
Read 0 qacs from stdin
Got 11893 chroms in bosTau4P2.agp
    chr1 size=161108518
missing data: no quality scores for AAFC03067444.1


#############################################################################
#  running makeGenomeDb.pl (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    cd /hive/data/genomes/bosTau4P2
    cat << '_EOF_' > bosTau4P2.config.ra
db bosTau4P2
clade mammal
scientificName Bos Taurus
assemblyDate Oct. 2007
assemblyLabel Baylor Release 4.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc NC_006853
agpFiles /hive/data/genomes/bosTau4P2/baylor/bosTau4P2.agp
fastaFiles /hive/data/genomes/bosTau4P2/baylor/bosTau4P2.contigs.fa.gz
qualFiles /hive/data/genomes/bosTau4P2/baylor/bosTau4P2.qual.qac
commonName Cow
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl bosTau4P2.config.ra > makeGenomeDb.log 2>&1

#########################################################################
# REPEATMASKER (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/bosTau4P2/bed/repeatMasker
    cd /hive/data/genomes/bosTau4P2/bed/repeatMasker
    # 
    #   This was going too slow on memk, on the order of 8 days.
    #   chilled that memk batch, then switched it to finish on pk
    doRepeatMasker.pl -buildDir=/hive/data/genomes/bosTau4P2/bed/repeatMasker \
        -bigClusterHub=memk bosTau4P2 > do.log 2>&1 &
    #   continuing
    doRepeatMasker.pl -buildDir=/hive/data/genomes/bosTau4P2/bed/repeatMasker \
        -continue=cat -bigClusterHub=pk bosTau4P2 > cat.log 2>&1 &

    time nice -n +19 featureBits bosTau4P2 rmsk > fb.bosTau4P2.rmsk.txt 2>&1 &
    #   1280927549 bases of 2731830700 (46.889%) in intersection




    # RepeatMasker and lib version from do.log:
    #   RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $
    #   Jan 11 2008 (open-3-1-9) version of RepeatMasker
    #   CC   RELEASE 20071204;

    # Compare coverage to previous assembly:
    featureBits bosTau3 rmsk
    # 1200525422 bases of 2731807384 (43.946%) in intersection
    featureBits bosTau2 rmsk
    # 1291561904 bases of 2812203870 (45.927%) in intersection

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/bosTau4P2/bed/simpleRepeat
    cd /hive/data/genomes/bosTau4P2/bed/simpleRepeat
    # 
    doSimpleRepeat.pl -buildDir=/hive/data/genomes/bosTau4P2/bed/simpleRepeat \
        bosTau4P2 > do.log 2>&1 &
    #   after RM run is done, add this mask:
    cd /hive/data/genomes/bosTau4P2
    twoBitMask bosTau4P2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4P2.2bit

    twoBitToFa bosTau4P2.2bit stdout | faSize stdin
    #   2917974530 bases (186161294 N's 2731813236 real
    #   1450855409 upper 1280957827 lower) in 11900 sequences in 1 files
    #   %43.90 masked total, %46.89 masked real

    twoBitToFa bosTau4P2.rmsk.2bit stdout | faSize stdin
    #   2917974530 bases (186161294 N's 2731813236 real
    #   1451369861 upper 1280443375 lower) in 11900 sequences in 1 files
    #   %43.88 masked total, %46.87 masked real

    
#########################################################################
# Create OOC file for genbank runs (DONE - 2008-03-10 - Hiram)
# use same repMatch value as bosTau2
    ssh kkstore06
    cd /hive/data/genomes/bosTau3
    blat bosTau4P2.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=bosTau4P2.11.ooc -repMatch=1005
    
    ssh kkr1u00
    mkdir /iscratch/i/bosTau4P2
    cd /iscratch/i/bosTau4P2
    cp -p /hive/data/genomes/bosTau4P2/bosTau4P2.2bit .
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau4P2/
done

#########################################################################




----------- 3Md stuff -------------------------------------------------------
# get sequence from UMD

wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.agp'
wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.fa.gz'

# fixup ids for agp
#  change ">Chr" 
#    to   ">chr"
mv bos_taurus.agp bos_taurus.orig.agp
cat bos_taurus.orig.agp | sed 's/\(^Chr\)/chr/' > bos_taurus.agp


# back to the main directory
cd /hive/data/genomes/bosTau4P2

# Run automation to make the basic genome

    cat << '_EOF_' > bosTau4P2.config.ra
# Config parameters for makeGenomeDb.pl:
db bosTau4P2
clade mammal
scientificName Bos Taurus
assemblyDate Aug. 2009
assemblyLabel Univ. Maryland Release 3.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc 60101824
commonName Cow
taxId 9913
fastaFiles /hive/data/genomes/bosTau4P2/download/bos_taurus.fa.gz
agpFiles /hive/data/genomes/bosTau4P2/download/bos_taurus.agp
subsetLittleIds Y
'_EOF_'
    # << happy emacs


time makeGenomeDb.pl bosTau4P2.config.ra > & makeGenomeDb.pl.out &
# took 11 minutes

#note: I added subsetLittleIds option to config.ra 
# because I had checked the .agp which was ok,
# but the script complained that there were some extra
# sequences in the fasta file.  This option allows
# the system to ignore those, as long as everything
# in column 6 of agp is found in the fasta file(s).

featureBits -countGaps bosTau4P2 gap
#20737263 bases of 2660922743 (0.779%) in intersection

cat chrom.sizes | awk '{sum+=$2;print sum,$0}'
#2660922743
# similar total

# TODO
# Organism Image
wget -O /usr/local/apache/htdocs/images/Aplysia_californica.jpg \
 'http://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Aplysia_californica.jpg/250px-Aplysia_californica.jpg'

# TODO
# Edit and check-in templates for description.html, gold.html, gap.html, bosTau4P2/trackDb.ra

# repeat mask
# for next time, put this under the bed/ dir
mkdir repeatMasker
cd repeatMasker
time doRepeatMasker.pl bosTau4P2 -buildDir=`pwd` > & doRepeatMasker.pl.out &


cat faSize.rmsk.txt 
#%47.84 masked total, %48.22 masked real

featureBits -countGaps bosTau4P2 rmsk
#1273525727 bases of 2660922743 (47.860%) in intersection

# simple repeat masker trf
cd /hive/data/genomes/bosTau4P2/bed
mkdir simpleRepeat
cd simpleRepeat
time doSimpleRepeat.pl bosTau4P2 -buildDir=`pwd` > & doSimpleRepeat.pl.out &


# failed only on chrM which doesn't matter
# perhaps someone will look into why bigTrf fails on chrM
ssh swarm
cd /hive/data/genomes/bosTau4P2/bed/simpleRepeat/run.cluster
/parasol/bin/para time > run.time

time doSimpleRepeat.pl -continue filter -buildDir `pwd` bosTau4P2 \
  >> & doSimpleRepeat.pl.out &


featureBits -countGaps bosTau4P2 simpleRepeat
#29450984 bases of 2660922743 (1.107%) in intersection

# make final masked .2bit
cd /hive/data/genomes/bosTau4P2
twoBitMask bosTau4P2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4P2.2bit
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
#might contain block coordinates, but this program uses only the first three
#fields
# (the entire span -- no support for blocks).
# this seems to be generic output we always see.

############################################################################
#	prepare cluster data (DONE - 2009-11-20 - Galt)

ssh hgwdev
cd /hive/data/genomes/bosTau4P2

# create gbdb symlink
ln -s `pwd`/bosTau4P2.2bit /gbdb/bosTau4P2/

time blat bosTau4P2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
#Wrote 27298 overused 11-mers to 11.ooc
#151.299u 3.434s 3:06.07 83.1%   0+0k 0+0io 3pf+0w

mkdir /hive/data/staging/data/bosTau4P2
cp -p bosTau4P2.2bit /hive/data/staging/data/bosTau4P2
cp -p 11.ooc /hive/data/staging/data/bosTau4P2
cp -p chrom.sizes /hive/data/staging/data/bosTau4P2

# ask admin to sync this directory: /hive/data/staging/data/bosTau4P2/
#	to the kluster nodes /scratch/data/bosTau4P2/

#############################################################################
# LIFTOVER TO bosTau4 (DONE - 2009-11-23 - Galt )
    mkdir /hive/data/genomes/bosTau4P2/bed/blat.bosTau4.2009-11-23
    cd /hive/data/genomes/bosTau4P2/bed/blat.bosTau4.2009-11-23
    time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
	 bosTau4P2 bosTau4 >& do.log
    # it actually ran out of space on /scratch/tmp (used over 16GB temporarily)
    # and I moved the temp dir somewhere else and
    # finished the net step.  Then I did the load and cleanup steps.


###################################################

#Change bosTau4P2 in hgcentraltest.dbDb to active=0:

hgsql hgcentraltest

update dbDb set active=0 where name='bosTau4P2';
# changed because liftover is confusing otherwise
# when people are expecting successive versions
# of the same assembly rather than alternate assemblies.
update dbDb set description='Aug. 2009 (UMD3)' where name='bosTau4P2';
