# for emacs: -*- mode: sh; -*-


# Ciona Intestinalis V2.0 from JGI

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | refGene     | genePred refPep refMrna         |
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  +-------------+---------------------------------+


# DOWNLOAD SEQUENCE
#   Fetch the unmasked JGI v2.0 assembly, make one .fa and one .nib per
#   scaffold, and cut the assembly into chunks for the cluster runs.

    ssh kkstore02
    mkdir /cluster/store11/ci2
    ln -s  /cluster/store11/ci2 /cluster/data
    cd /cluster/data/ci2
    # The faSplit commands below read the assembly from fromJGI/*fasta,
    # so the download has to live in that directory.
    mkdir fromJGI
    cd fromJGI
    wget "ftp://ftp.jgi-psf.org/pub/JGI_data/Ciona/v2.0/ciona050324.unmasked.fasta.gz"
    gunzip ciona050324.unmasked.fasta.gz
    cd ..

    mkdir chunks500k jkStuff scaffolds nib

    cd scaffolds
    # one FASTA file per sequence, named after the sequence
    faSplit byname ../fromJGI/*fasta .
    # convert every scaffold .fa into ../nib/<scaffold>.nib
    for i in *.fa
    do
	faToNib $i ../nib/`basename $i .fa`.nib
    done

    cd ..

    # Split into chunks500k/x* pieces of about 500000 bases, breaking at
    # gaps; the lift file written to jkStuff/chunks500k.lft is used later
    # to lift chunk results back to scaffold coordinates.
    faSplit gap fromJGI/*fasta 500000 chunks500k/x  -lift=jkStuff/chunks500k.lft -minGapSize=100

# REPEAT MASKING
#   Run RepeatMasker on every chunks500k/*.fa file as a "para" cluster job.
    #- Make the run directory and job list:
    cd /cluster/data/ci2
    tcsh

    # RMCiona <dir> <file>: cd to <dir>, copy <file> to /tmp/ci2/<file>/,
    # run RepeatMasker -ali -s on it there, copy the .out (plus .align,
    # .tbl and .cat when they exist) back to <dir>, then clean up /tmp/ci2.
    cat << '_EOF_' > jkStuff/RMCiona
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/ci2/$2
/bin/cp $2 /tmp/ci2/$2/
cd /tmp/ci2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/ci2/$2/$2.out ./
if (-e /tmp/ci2/$2/$2.align) /bin/cp /tmp/ci2/$2/$2.align ./
if (-e /tmp/ci2/$2/$2.tbl) /bin/cp /tmp/ci2/$2/$2.tbl ./
if (-e /tmp/ci2/$2/$2.cat) /bin/cp /tmp/ci2/$2/$2.cat ./
/bin/rm -fr /tmp/ci2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/ci2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/ci2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMCiona
    # leave the tcsh started above; the loop below uses sh syntax
    exit
    mkdir RMRun
    # Write one job line per chunk into RMRun/RMJobs.  Each line names
    # the chunk's .out file in a {check out line+ ...} clause.
    for i in chunks500k/*.fa
    do
	d=`dirname $i`
	f=`basename $i`
    echo "/cluster/data/ci2/jkStuff/RMCiona /cluster/data/ci2/$d $f {check out line+ /cluster/data/ci2/$d/$f.out} "
    done > RMRun/RMJobs

    #- Do the run
    ssh kk
    cd /cluster/data/ci2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
# Completed: 4617 of 4617 jobs
# CPU time in finished jobs:     133300s    2221.66m    37.03h    1.54d  0.004 y
# IO & Wait Time:                703794s   11729.90m   195.50h    8.15d  0.022 y
# Average job time:                 181s       3.02m     0.05h    0.00d
# Longest finished job:             651s      10.85m     0.18h    0.01d
# Submission to last job:          2948s      49.13m     0.82h    0.03d

    # NOTE(review): the command that loaded the rmsk table is not
    # recorded in this section -- confirm how it was loaded.
    featureBits ci2 rmsk
# 22881665 bases of 141233565 (16.201%) in intersection

# SIMPLE REPEATS (TRF)
#   Run trfBig on every chunks500k/*.fa file, lift the per-chunk bed
#   files to scaffold coordinates and load them as the simpleRepeat table.
    ssh kkstore02
    mkdir -p /cluster/data/ci2/bed/simpleRepeat
    cd /cluster/data/ci2/bed/simpleRepeat
    mkdir trf
    tcsh
    # build jobs.csh with one trfBig command per chunk
    cp /dev/null jobs.csh
    foreach d (/cluster/data/ci2/chunks500k)
      foreach f ($d/*.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh
      end
    end
    screen
    # run in the background and capture the output in jobs.log, which is
    # the file the "tail -f" check below watches
    csh -ef jobs.csh >& jobs.log &

    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # When job is done do:
    # (the trf/*.bed files are in chunk coordinates; chunks500k.lft is the
    #  lift file written by "faSplit gap" when the chunks were made)
    liftUp simpleRepeat.bed /cluster/data/ci2/jkStuff/chunks500k.lft error  trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed ci2 simpleRepeat /cluster/data/ci2/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits ci2 simpleRepeat
# 6047914 bases of 141233565 (4.282%) in intersection


# Create the database.
#   Creates the ci2 database and registers the assembly in hgcentraltest
#   (blatServers and dbDb).
    ssh hgwdev
    echo 'create database ci2' | hgsql ''
# CREATING GRP TABLE FOR TRACK GROUPING
    # copy the track group definitions from hg17
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" | hgsql ci2

    # register the two blat server ports for ci2
    echo 'insert into blatServers values("ci2", "blat10", "17780", "1"); \
          insert into blatServers values("ci2", "blat10", "17781", "0");' \
      | hgsql -h genome-testdb hgcentraltest

    # Add dbDb and defaultDb entries:
    # (scientificName spelling corrected: "Ciona intestinalis")
    echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk)  values("ci2", "March 2005", "/gbdb/ci2/nib", "Ciona", "", 1, \
          44, "C. intestinalis", "Ciona intestinalis", "/gbdb/ci2/html/description.html", 0);' \
    | hgsql -h genome-testdb hgcentraltest

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION
#   Expose the nib files under /gbdb/ci2, load their paths into the ci2
#   database, then build the gcPercent table (dropped again later, see
#   the note at the end of this section).
    # Make symbolic links in /gbdb/ci2/nib to the real nibs.
    ssh hgwdev
    mkdir /gbdb/ci2
    ln -s /cluster/data/ci2/nib /gbdb/ci2/nib
    # Load /gbdb/ci2/nib paths into database
    cd /cluster/data/ci2
    hgsql ci2  < ~kent/src/hg/lib/chromInfo.sql
    # -preMadeNib: the nib files already exist under /gbdb/ci2/nib
    hgNibSeq -preMadeNib ci2 /gbdb/ci2/nib scaffolds/*.fa

    # make gcPercent table
    ssh hgwdev
    mkdir -p /cluster/data/ci2/bed/gcPercent
    cd /cluster/data/ci2/bed/gcPercent
    hgsql ci2  < ~/kent/src/hg/lib/gcPercent.sql
    # ../../nib resolves to /cluster/data/ci2/nib from this directory
    hgGcPercent ci2 ../../nib
#dropped gcPercent table (DONE braney 2005-08-19)

### AUTO UPDATE GENBANK MRNA RUN  (DONE markd 2005-08-17)
#   Add ci2 to the genbank pipeline configuration, align on kkstore02,
#   then load the results on hgwdev.
    # genbank done with revised alignment procedure
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvs update etc

    # Edit etc/genbank.conf and add these lines:
    # NOTE: braney created a ooc and this was added after the initial alignment
    # (the ci2.* lines below are genbank.conf content, not shell commands)
# ci2 (ciona intestinalis)
ci2.serverGenome = /cluster/data/ci2/ci2.2bit
ci2.clusterGenome = /iscratch/i/ci2/ci2.2bit
ci2.ooc = no
ci2.maxIntron = 20000
ci2.lift = no
ci2.refseq.mrna.native.load = yes
ci2.refseq.mrna.xeno.load = yes
ci2.refseq.mrna.xeno.pslCDnaFilter = -minCover=0.25 -coverNearTop=0.005 -minId=0.15 -idNearTop=0.005 -maxRepMatch=0.4 -bestOverlap
ci2.genbank.mrna.xeno.load = yes
ci2.genbank.est.xeno.load = no
ci2.downloadDir = ci2
ci2.perChromTables = no

    cvs ci etc/genbank.conf
    # Install to /cluster/data/genbank:
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    # do mrnas and ests in one run
    nice bin/gbAlignStep -initial ci2 &

    # Load results:
    # (presumably run only after the background gbAlignStep above has
    #  finished -- confirm)
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad ci2

### gap and repeats tables
#   Build and load the gap table, rework the rmsk indexes, re-run TRF on
#   whole scaffolds, then use the RepeatMasker and TRF results to build
#   soft-masked sequence, nib files and ci2.2bit.
    ssh hgwdev
    mkdir -p /cluster/data/ci2/bed/gapRmsk
    cd /cluster/data/ci2/bed/gapRmsk
    simpleGap /cluster/data/ci2/nib gap.bed repeats.bed
    # recreate the gap table from gap.sql, load it, then add the indexes
    echo "drop table gap;" | hgsql ci2
    hgsql ci2 < ~/kent/src/hg/lib/gap.sql
    hgLoadBed -oldTable ci2 gap gap.bed
    echo "create index chrom on gap (chrom(13), bin) ;" | hgsql ci2
    echo "create index chrom_2 on gap  (chrom(13), chromStart);" | hgsql ci2
    echo "create index chrom_3 on gap  (chrom(13), chromEnd);" | hgsql ci2

    # do RepeatMasking
    # (only the rmsk indexes are changed here)
    cd /cluster/data/ci2
    echo "drop index bin on rmsk;" | hgsql ci2
    echo "drop index genoStart on rmsk;" | hgsql ci2
    echo "drop index genoEnd on rmsk;" | hgsql ci2
    echo "create index chrom_2 on rmsk  (genoName(13), genoStart);" | hgsql ci2
    echo "create index chrom_3 on rmsk  (genoName(13), genoEnd);" | hgsql ci2

    # run trfBig on each whole scaffold; output is already in scaffold
    # coordinates, so the bed files are simply concatenated
    ssh kkstore02
    mkdir -p /cluster/data/ci2/bed/simpleRepeat
    cd /cluster/data/ci2/bed/simpleRepeat
    mkdir trf
    for i in ../../scaffolds/*.fa
    do
    	trfBig $i /dev/null -bedAt=trf/`basename $i .fa`.bed > /dev/null 2>&1 ; echo $i;
    done
    cat trf/* > simpleRepeat.bed

    ssh hgwdev
    hgLoadBed ci2 simpleRepeat /cluster/data/ci2/bed/simpleRepeat/simpleRepeat.bed \
    	-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

    ssh kkstore02
    cd /cluster/data/ci2/bed/simpleRepeat
    mkdir -p trfMask
    cd trf
    # keep only repeats whose 5th bed column (period) is <= 12 for masking
    for i in *.bed
    do
      awk '{if ($5 <= 12) print;}' $i > ../trfMask/$i
    done

    cd ../trfMask
    liftUp ../../../all.trfMask.bed ../../../jkStuff/chunks500k.lft error *.bed

    # use RepeatMasker and simpleRepeat to build masked fa's
    cd /cluster/data/ci2
    mkdir maskedScaffolds
    # NOTE(review): all.out (the combined RepeatMasker output) is not
    # created by any command recorded in this doc; presumably it was
    # lifted from the chunks500k/*.fa.out files -- confirm.
    cat scaffolds/*.fa |  maskOutFa -soft stdin all.out all.tmp.fa
    maskOutFa -softAdd all.tmp.fa all.trfMask.bed all.masked.fa

    # Rebuild the nib files, using the soft masking in the fa:
    # NOTE(review): no recorded command puts per-scaffold .fa files into
    # maskedScaffolds; presumably all.masked.fa was split into it (e.g.
    # with faSplit byname) before this loop -- confirm.
    mkdir -p /cluster/data/ci2/nib
    cd /cluster/data/ci2/maskedScaffolds

    for i in *.fa
    do
      faToNib -softMask $i ../nib/`basename $i .fa`.nib
    done

    # Make one big 2bit file as well, and make a link to it in /gbdb/ci2:
    faToTwoBit *.fa ../ci2.2bit
    ln -s /cluster/data/ci2/ci2.2bit /gbdb/ci2/

### SNAP GENE PREDICTIONS FROM COLIN DEWEY
#   Download the SNAP predictions (GFF) and load them as the snapGene table.
    ssh hgwdev
    mkdir /cluster/data/ci2/bed/snap
    cd /cluster/data/ci2/bed/snap
    # contact: Colin Dewey <cdewey@eecs.berkeley.edu>
    wget "http://hanuman.math.berkeley.edu/~cdewey/tracks/SNAP.CioInt_2.gff.gz"
    gunzip SNAP.CioInt_2.gff.gz
    ldHgGene -gtf -frame -id -geneName ci2 snapGene SNAP.CioInt_2.gff
    # NOTE(review): the two lines below look like featureBits output (same
    # format as the featureBits results earlier in this doc); the command
    # that produced them is not recorded here.
# 31491546 bases of 141233565 (22.297%) in intersection
# in ci1 28699953 bases of 113192845 (25.355%) in intersection

# MAKE DOWNLOADABLE SEQUENCE FILES  (re-DONE braney 2008-09-17)
#   Zip the RepeatMasker output, the masked scaffold sequence and the TRF
#   results, verify the archives, and copy them to the goldenPath
#   download area on hgwdev.
    ssh kkstore02
    cd /cluster/data/ci2
    #- Build the .zip files
    # (the foreach loop and "set" below are csh syntax, hence the csh)
    csh
    # NOTE(review): zipAll.csh reads ../hardMaskedScaffolds, which is not
    # created by any command recorded in this doc -- confirm where it
    # came from.
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/Scaffold.out.zip all.out
cd maskedScaffolds
zip -j ../zip/ScaffoldFa.zip *.fa
cd ../hardMaskedScaffolds
zip -j ../zip/ScaffoldFaMasked.zip *.fa
cd ../bed/simpleRepeat
zip -j ../../zip/ScaffoldTrf.zip simpleRepeat.bed
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x ./jkStuff/zipAll.csh
    ./jkStuff/zipAll.csh  |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.
    #- Check zip file integrity:
    # print the last line of each "unzip -t" report
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test
    rm *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/ci2/zip
    set gp = /usr/local/apache/htdocs/goldenPath/ci2
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    # (the per-chromosome zips are disabled for this assembly: the
    #  commands that would build them are left commented out below)
    # mkdir -p $gp/scaffolds
    # foreach f ( ../*/chr*.fa )
      # zip -j $gp/chromosomes/$f:t.zip $f
    # end

    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    # cd $gp/chromosomes
    # md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's

# MAKE 11.OOC FILE FOR BLAT
#   Write the list of overused 11-mers for ci2 and copy it to
#   /iscratch/i/ci2/, the location that ci2.ooc in genbank.conf refers to.
    mkdir /cluster/bluearc/ci2
    # /dev/null is given as both query and output: this run only writes
    # the -makeOoc file
    blat /cluster/data/ci2/ci2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/ci2/11.ooc -repMatch=100
# Wrote 10830 overused 11-mers to /cluster/bluearc/ci2/11.ooc
    cp -p /cluster/bluearc/ci2/*.ooc /iscratch/i/ci2/
    iSync

#######################################################################
# GC5BASE (DONE - 2005-08-19 - Hiram)
#   Compute GC percent in 5-base windows, encode it as wiggle data and
#   load it as the gc5Base table.
    ssh kkstore02
    mkdir /cluster/data/ci2/bed/gc5Base
    cd /cluster/data/ci2/bed/gc5Base
    # hgGcPercent writes to stdout; wigEncode turns that stream into
    # gc5Base.wig and gc5Base.wib
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 ci2 \
        /cluster/data/ci2 | wigEncode stdin gc5Base.wig gc5Base.wib

    ssh hgwdev
    cd /cluster/data/ci2/bed/gc5Base
    # link the .wib file under /gbdb/ci2/wib, then load the .wig table
    mkdir /gbdb/ci2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/ci2/wib
    hgLoadWiggle ci2 gc5Base gc5Base.wig


#######################################################################
### AUTO UPDATE GENBANK MRNA RUN  (DONE markd 2005-08-31)
#   Second genbank run: same steps as the 2005-08-17 run, now with the
#   11.ooc file and the lowCover pslCDnaFilter settings.
    # redo genbank revised alignment procedure once again to
    # pickup local near best pslCDnaFilter

    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvs update etc

    # Edit etc/genbank.conf and add these lines:
    # (the ci2.* lines below are genbank.conf content, not shell commands)
# ci2 (ciona intestinalis)
ci2.serverGenome = /cluster/data/ci2/ci2.2bit
ci2.clusterGenome = /iscratch/i/ci2/ci2.2bit
ci2.ooc = /iscratch/i/ci2/11.ooc
ci2.maxIntron = 20000
ci2.lift = no
ci2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
ci2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
ci2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
ci2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
ci2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
ci2.genbank.est.xeno.pslCDnaFilter    = ${lowCover.genbank.est.xeno.pslCDnaFilter}
ci2.refseq.mrna.native.load = yes
ci2.refseq.mrna.xeno.load = yes
ci2.genbank.mrna.xeno.load = yes
ci2.genbank.est.xeno.load = no
ci2.downloadDir = ci2
ci2.perChromTables = no

    cvs ci etc/genbank.conf
    # Install to /cluster/data/genbank:
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    # do mrnas and ests in one run
    nice bin/gbAlignStep -initial ci2 &

    # Load results:
    # (presumably run only after the background gbAlignStep above has
    #  finished -- confirm)
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad ci2

    # Note: download sequences are made as part of the first
    # genbank update on hgdownload.

############################################################################
#  Adding Ensembl Genes (DONE - 2008-02-22 - Hiram)
    ssh kkstore02
    cd /cluster/data/ci2
    # write the config file that is handed to doEnsGeneUpdate.pl below
    cat << '_EOF_' > ci2.ensGene.ra
# required db and ensVersion variables
db ci2
ensVersion 48
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9][pq]\)/chr0\1/; s/^\([0-9][0-9][pq]\)/chr\1/; "
'_EOF_'
    # << happy emacs
    # run the update with the ci2 config written above (this line used to
    # name oryCun1.ensGene.ra, a file this doc never creates)
    doEnsGeneUpdate.pl ci2.ensGene.ra

##########################################################################pubStart
# Publications track (DONE - 04-27-12 - Max)
#   Two identical passes of the pubBlat pipeline, one in runs/pmcBlat and
#   one in runs/elsBlat: init, run the steps on swarm, then load.

# article download and conversion is run every night on hgwdev:
# 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
# the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
# then converts them to text into /hive/data/outside/literature/{pmc,elsevier}

# all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py

# data processing was run manually like this
export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
# pmc
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
# the steps are run on swarm; "exit" leaves that ssh session, so
# "pubBlat load" runs back on the starting host
ssh swarm
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat steps:annot-tables
exit
pubBlat load

# elsevier
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
# same sequence as the pmc pass above: steps on swarm, load after "exit"
ssh swarm
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat steps:annot-tables
exit
pubBlat load
#--pubEnd

#############################################################################
# LIFTOVER TO ci3 (DONE - 2015-08-26 - Hiram )
#   Measure the genome, rebuild the 11.ooc file under jkStuff, and run
#   doSameSpeciesLiftOver.pl from ci2 to ci3.
    cd /hive/data/genomes/ci2
    twoBitToFa ci2.2bit stdout | faSize stdin > faSize.ci2.2bit.txt
    head -1 faSize.ci2.2bit.txt
# 173499994 bases (32266429 N's 141233565 real 118279813 upper 22953752 lower) in 4390 sequences in 1 files

    # Scale a repMatch of 1024 by the ratio of this genome's 'real' size
    # to 2861349177 (presumably the reference genome size that the 1024
    # value belongs to -- confirm).
    #       vvvvvvvvv  use 'real' number from the faSize measurement:
    calc \( 141233565 / 2861349177 \) \* 1024
    # ( 141233565 / 2861349177 ) * 1024 = 50.543699

    # ==> use -repMatch=100 since the smaller '50' number would count too many
    blat ci2.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=jkStuff/ci2.11.ooc -repMatch=100
    # Wrote 10830 overused 11-mers to jkStuff/ci2.11.ooc

    #  ci3 had: Wrote 6681 overused 11-mers to jkStuff/ci3.11.ooc

    mkdir /hive/data/genomes/ci2/bed/blat.ci3.2015-08-26
    cd /hive/data/genomes/ci2/bed/blat.ci3.2015-08-26
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
	-ooc=/hive/data/genomes/ci2/jkStuff/ci2.11.ooc -debug ci2 ci3
    # Real run:
    # (same arguments without -debug; all output goes to do.log)
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
       -ooc=/hive/data/genomes/ci2/jkStuff/ci2.11.ooc ci2 ci3) > do.log 2>&1
    #  real    18m32.003s
    # verify it works on genome-test

#############################################################################
# Crispr track. See ../crisprTrack/README.txt (2016-09-15 max)
# Command: doCrispr.sh ci2 ensGene
##############################################################################
