# for emacs: -*- mode: sh; -*-

# armadillo ( Dasypus novemcinctus )

#	http://www.ncbi.nlm.nih.gov/bioproject/12594
#	http://www.ncbi.nlm.nih.gov/genome/235
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAGV00

#########################################################################
# DOWNLOAD SEQUENCE (working braney)
    # Create the build directory under /hive and make /cluster/data/dasNov2
    # a symlink to it; the steps below use the /cluster/data path.
    ssh kolossus
    mkdir /hive/data/genomes/dasNov2
    # remove any existing /cluster/data/dasNov2 entry so the link can be made
    rm /cluster/data/dasNov2
    ln -s /hive/data/genomes/dasNov2 /cluster/data
    mkdir /cluster/data/dasNov2/broad
    cd /cluster/data/dasNov2/broad

    # Fetch the AGP, bases and quality files from the Broad FTP site.
    # --timestamping lets wget skip files whose local copy is already current.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/assembly.quals.gz 
    # Record checksums of the downloaded assembly.* files.
    # NOTE(review): the ass* glob also matches assembly.md5sum itself if this
    # command is re-run in the same directory.
    md5sum ass* > assembly.md5sum


    # Pipe the quality scores through qaToQac and qacAgpLift (with the AGP)
    # to write dasNov2.qual.qac, which dasNov2.config.ra lists under qualFiles.
    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin dasNov2.qual.qac

    # Also try to fetch the assembly statistics file from the same directory.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/armadillo/dasNov2/BasicStats.out

# no BasicStats.out

    # Count scaffolds: collapse adjacent repeats of column 1 of the AGP
    # (uniq) and count the remaining lines.
   cut -f 1 assembly.agp | uniq -c | wc -l 
# Number of scaffolds: 292141


#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-09-22)
    ssh kolossus
    cd /cluster/data/dasNov2
    # Write the makeGenomeDb.pl configuration file.  Everything between the
    # cat line and _EOF_ is file content (including the lines starting with
    # '#'); it contains no shell expansions, so it is written as-is.
    # fastaFiles/agpFiles/qualFiles point at the files prepared in broad/.
cat << _EOF_ >dasNov2.config.ra
# Config parameters for makeGenomeDb.pl:
db dasNov2
clade mammal
genomeCladePriority 35
scientificName  Dasypus novemcinctus 
commonName Armadillo
assemblyDate Jul. 2008
assemblyLabel Broad Institute dasNov2 
orderKey 346.1
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/dasNov2/broad/assembly.bases.gz
agpFiles /cluster/data/dasNov2/broad/assembly.agp
qualFiles /cluster/data/dasNov2/broad/dasNov2.qual.qac
dbDbSpeciesDir armadillo
_EOF_

    # Run the build in the background; stdout and stderr both go to
    # makeGenomeDb.out.
    makeGenomeDb.pl -workhorse=kolossus -verbose=2 dasNov2.config.ra > makeGenomeDb.out 2>&1 &

    # when done
    # Summarize the sequence sizes (column 2 of chrom.sizes) with ave;
    # the recorded output follows.
    cut -f 2 chrom.sizes | ave stdin

# Q1 2152.000000
# median 4946.000000
# Q3 10566.000000
# average 16477.740413
# min 600.000000
# max 985764.000000
# count 292141
# total 4813823562.000000
# standard deviation 32801.578215

#########################################################################
# REPEATMASKER (DONE braney  2008-10-07)
    ssh kolossus
    screen # use a screen to manage this job
    mkdir /cluster/data/dasNov2/bed/repeatMasker
    cd /cluster/data/dasNov2/bed/repeatMasker
    # Start the RepeatMasker pipeline in the background; stdout and stderr
    # are logged to do.log.
    doRepeatMasker.pl -buildDir=/cluster/data/dasNov2/bed/repeatMasker \
        dasNov2 > do.log 2>&1 &

    # Recorded job statistics for the run:
# Completed: 10555 of 10555 jobs
# CPU time in finished jobs:   22325655s  372094.25m  6201.57h  258.40d  0.708 y
# IO & Wait Time:                 86043s    1434.05m    23.90h    1.00d  0.003 y
# Average job time:                2123s      35.39m     0.59h    0.02d
# Longest finished job:            7949s     132.48m     2.21h    0.09d
# Submission to last job:         69062s    1151.03m    19.18h    0.80d


    # Second invocation: resume the pipeline at its "cat" step
    # (-continue cat), logging to do2.log so do.log is not overwritten.
    doRepeatMasker.pl -buildDir=/cluster/data/dasNov2/bed/repeatMasker \
        -continue cat dasNov2 > do2.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh hgwdev
    cd /cluster/data/dasNov2/bed/repeatMasker

    # Measure rmsk coverage at low priority in the background; the
    # featureBits output is written to fb.dasNov2.rmsk.txt.  Recorded result:
    time nice -n +19 featureBits dasNov2 rmsk > fb.dasNov2.rmsk.txt 2>&1 &
# 888874669 bases of 2371493872 (37.482%) in intersection


#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-10-07)
    ssh kolossus
    screen # use a screen to manage this job
    mkdir /cluster/data/dasNov2/bed/simpleRepeat
    cd /cluster/data/dasNov2/bed/simpleRepeat
    # Start the simple-repeat pipeline in the background, logging to do.log.
    doSimpleRepeat.pl -buildDir=/cluster/data/dasNov2/bed/simpleRepeat \
	dasNov2 > do.log 2>&1 &

    #### When done
    # Report the batch statistics from the cluster host; recorded output
    # follows.
    ssh pk
    para time

# Completed: 97 of 97 jobs
# CPU time in finished jobs:      61803s    1030.05m    17.17h    0.72d  0.002 y
# IO & Wait Time:                  3111s      51.85m     0.86h    0.04d  0.000 y
# Average job time:                 669s      11.15m     0.19h    0.01d
# Longest finished job:            7949s     132.48m     2.21h    0.09d
# Submission to last job:          7957s     132.62m     2.21h    0.09d

    # Coverage of the simpleRepeat table; recorded result follows.
    featureBits dasNov2 simpleRepeat
# 26484090 bases of 2371493872 (1.117%) in intersection

    #	after RM run is done, add this mask:
    # Reads the RepeatMasker-masked dasNov2.rmsk.2bit, adds (-add) the
    # intervals in bed/simpleRepeat/trfMask.bed, and writes dasNov2.2bit.
    cd /cluster/data/dasNov2
    twoBitMask dasNov2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dasNov2.2bit

    # Size/masking summary of the final 2bit (RepeatMasker + TRF mask):
    twoBitToFa dasNov2.2bit stdout | faSize stdin

# 4813823562 bases (2442329690 N's 2371493872 real 1488699780 upper 882794092
# lower) in 292141 sequences in 1 files
# Total size: mean 16477.7 sd 32801.6 min 600 (scaffold_292140) max 985764
# (scaffold_0) median 4946
# N count: mean 8360.1 sd 19087.4
# U count: mean 5095.8 sd 10731.3
# L count: mean 3021.8 sd 5626.4
# %18.34 masked total, %37.23 masked real

    # Same summary for the RepeatMasker-only 2bit, for comparison: the
    # lower-case count is 882272322 here versus 882794092 after adding TRF.
    twoBitToFa dasNov2.rmsk.2bit stdout | faSize stdin
# 4813823562 bases (2442329690 N's 2371493872 real 1489221550 upper 882272322
# lower) in 292141 sequences in 1 files
# Total size: mean 16477.7 sd 32801.6 min 600 (scaffold_292140) max 985764
# (scaffold_0) median 4946
# N count: mean 8360.1 sd 19087.4
# U count: mean 5097.6 sd 10734.5
# L count: mean 3020.0 sd 5623.4
# %18.33 masked total, %37.20 masked real

    # Link to it from /gbdb
    # Remove any existing link first (-f: no error if it is absent), then
    # point /gbdb/dasNov2/dasNov2.2bit at the newly masked file.
    ssh hgwdev
    rm -f  /gbdb/dasNov2/dasNov2.2bit
    ln -s /cluster/data/dasNov2/dasNov2.2bit /gbdb/dasNov2/dasNov2.2bit

#########################################################################
# prepare for kluster runs (DONE - 2008-10-22 - Hiram)
    # Scale blat's -repMatch from the value 1024 by the ratio of real
    # (non-N) bases in this assembly to those in hg18:
    #	hg18: 2881421696
    #	dasNov2: 2371493872
    # thus: 1024 * 2371493872/2881421696 = 842 (truncated from about 842.78)
    #	rounding up to 900 for a bit more conservative masking
    cd /hive/data/genomes/dasNov2
    # Run blat with empty query/output (/dev/null) purely to generate the
    # overused 11-mer file dasNov2.11.ooc (-makeOoc) at the chosen repMatch.
    time blat dasNov2.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=dasNov2.11.ooc -repMatch=900
    #	Wrote 30934 overused 11-mers to dasNov2.11.ooc
    #	real    2m49.196s

    #	and staging data for push to kluster nodes
    # cp -p preserves the files' timestamps and modes in the staging copy.
    mkdir /hive/data/staging/data/dasNov2
    cp -p dasNov2.2bit chrom.sizes dasNov2.11.ooc \
	/hive/data/staging/data/dasNov2
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-22 - Hiram)
    # Set dbDb.sourceName for the dasNov2 row in the hgcentraltest database.
    # The project number matches the bioproject URL at the top of this file.
    # NOTE(review): WGS master accessions are normally 12 characters
    # (4 letters + 8 digits, e.g. AAGV02000000); the value recorded below has
    # 13 characters -- confirm against NCBI before reusing this string.
    hgsql -e 'update dbDb set
sourceName="Broad Institute dasNov2 (NCBI project 12594, AAGV020000000)" where name="dasNov2";' hgcentraltest
###########################################################################
