#!/bin/csh 
exit;

############################################################
# get metadata.txt file and non-accessioned sequences
# from Elliott Margulies
# /cluster/bluearc/encode/metadata.txt
############################################################

# Check to make sure there are no redundant lines in the metadata file
# These should return the same number
cut -f1-2,16 metadata.txt | sort    | wc -l
cut -f1-2,16 metadata.txt | sort -u | wc -l

# make sure that the metadata.txt file has a header:
head -2 metadata.txt
#niscOrganism    encodeRegion    freezeDate      ncbiTaxonId     assemblyProvider        assemblyDate    assemblyId      chromosome      chrom_start     chrom_end       chromLength     strand  accessionVersion   numBases        numN    thisContigNum   totalNumContigs comment
#armadillo       ENm001  JAN-2006        9361    NISC    08-NOV-2005     .       .       0       0       0       +       NT_086518.3     1930245 100000  1       1       .


# Check to verify that coverage is as expected, that no species or
# targets were omitted:

cut -f1,2,5 metadata.txt | tail +2 | sort -u | cut -f1,3 | sort | uniq -c
#      1 armadillo       Broad Assisted Assembly v 1.0
#     43 armadillo       NISC
#     44 baboon  NISC
#     41 cat     Mullikin Phusion Assembly of Agencourt Traces
#      3 cat     NISC
#     43 chicken CGSC Feb. 2004
#      1 chicken NCBI
#      1 chimp   NCBI
#     43 chimp   NCBI Build 1 v1
#      1 colobus_monkey  NISC
#     40 cow     BCM
#      1 cow     NCBI
#     43 dog     Broad Institute v. 2.0
#      1 dog     NCBI
#      1 dusky_titi      NISC
#     44 eehedgehog      Mullikin Phusion Assembly of Broad Traces
#      2 elephant        Broad Assisted Assembly v 1.0
#     42 elephant        NISC
#     36 fugu    IMCB/JGI
#     44 galago  NISC
#      8 hedgehog        NISC
#     44 human   NCBI Build 35
#     44 macaque BCM
#     44 marmoset        NISC
#     43 monodelphis     Broad Institute
#      1 monodelphis     NCBI
#     44 mouse   NCBI
#      3 mouse_lemur     NISC
#      1 owl_monkey      NISC
#     29 platypus        Mullikin Phusion Assembly of WUGSC Traces
#     15 platypus        NISC
#     16 rabbit  Broad Assisted Assembly v 1.0
#     28 rabbit  NISC
#     43 rat     Baylor HGSC v3.1
#      1 rat     NCBI
#     44 rfbat   NISC
#     24 shrew   Mullikin Phusion Assembly of Broad Traces
#     20 shrew   NISC
#     44 tenrec  Mullikin Phusion Assembly of Broad Traces
#     42 tetraodon       Genoscope V7
#      1 tetraodon       NCBI
#     39 xenopus JGI
#     43 zebrafish       Sanger Zv4


cd /cluster/bluearc/encode
cat >! makeLists.pl << '_EOF_'
#!/usr/bin/perl -W

%speciesOptions =
    (
     "armadillo"      => "mammal ",
     "baboon"         => "human ",
     "cat"            => "cat ",
     "chicken"        => "chicken ",
     "chimp"          => "human ",
     "cow"            => "cow ",
     "colobus_monkey" => "human ",
     "dog"            => "dog ",
     "dusky_titi"     => "cow ",
     "elephant"       => "mammal ",
     "fugu"           => "fugu ",
     "galago"         => "human ",
     "eehedgehog"     => "mammal ",
     "hedgehog"       => "mammal ",
     "human"          => "human ",
     "macaque"        => "macaca ",
     "marmoset"       => "human ",
     "monodelphis"    => "mammal ",
     "mouse"          => "mouse ",
     "mouse_lemur"    => "human ",
     "owl_monkey"     => "human ",
     "pig"            => "pig ",
     "platypus"       => "mammal ",
     "rat"            => "rattus ",
     "rabbit"         => "mammal ",
     "rfbat"          => "mammal ",
     "shrew"          => "mammal ",
     "tenrec"         => "mammal ",
     "tetraodon"      => "fugu ",
     "xenopus"        => "xenops ",
     "zebrafish"      => "danio ",
     );

$encodeLibDir = "/cluster/data/encode/ortho/reconLibs";
$reconLibrary = "supplemental.lib";

%libOptions =
    (
     "armadillo"      => "$encodeLibDir/armadillo/$reconLibrary ",
     "baboon"         => "$encodeLibDir/baboon/$reconLibrary ",
     "elephant"       => "$encodeLibDir/elephant/$reconLibrary ",
     "galago"         => "$encodeLibDir/galago/$reconLibrary ",
     "marmoset"       => "$encodeLibDir/marmoset/$reconLibrary ",
     "mouse_lemur"    => "$encodeLibDir/mouse_lemur/$reconLibrary ",
     "platypus"       => "$encodeLibDir/platypus/$reconLibrary ",
     "rabbit"         => "$encodeLibDir/rabbit/$reconLibrary ",
     "rfbat"          => "$encodeLibDir/rfbat/$reconLibrary ",
     "tenrec"         => "$encodeLibDir/tenrec/$reconLibrary ",
     );
$efetchURL  = "http://www.ncbi.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&id=";
$bluearc    = "/cluster/bluearc/encode";
$metadata   = "$bluearc/metadata.txt";
$wgetList   = "$bluearc/wgetList";
$maskList   = "$bluearc/para/maskList";

system ("mkdir -p $bluearc/trf $bluearc/RepeatMasker $bluearc/masked $bluearc/para");

open(METADATA, " < $metadata") or die "can't open $metadata for reading.\n";
open(WGETLIST, " > $wgetList") or die "can't open $wgetList";
open(MASKLIST, " > $maskList") or die "can't open $maskList";

print(WGETLIST "cd $bluearc\n");

while (<METADATA>) 
{
    if (/niscOrganism/) { next; }
    @fields     = split /\t/;
    $species    = $fields[0];
    $region     = $fields[1];
    $acc        = $fields[12];
    $index      = $fields[15];

    $how = "none";
    if (defined $speciesOptions{lc($species)}) 
    {
	$spcOpt="$speciesOptions{lc($species)}";
	$how="speciesDefault";
    }
    else { $spcOpt=""; }
    if (defined     $libOptions{lc($species)}) 
    {
	$libOpt=    "$libOptions{lc($species)}";
	if ($spcOpt eq "") { $how="speciesDefault"; }
	else { $how="defaultAndCustom";	}
    }
    else { $libOpt=""; }
    if ($acc =~ /^\w{2}\_\d+/) { print(WGETLIST "wget -O $bluearc/original/$acc \"${efetchURL}$acc\"\n"); }
    if ($acc !~ /NT_/) { $acc="$species.$region.$index.fa"; }
    print(MASKLIST "/cluster/bin/scripts/encodeMask.csh $how $acc $spcOpt $libOpt\n");
}

'_EOF_'

#

    #
    
set freezeDate = "JAN-2006"
cd /cluster/bluearc/encode
mkdir -p original para masked trf RepeatMasker
chmod ug+x makeLists.pl
./makeLists.pl
chmod ug+x wgetList

./wgetList
# check for network error
grep -i error original/NT_*
# should return nothing


############################################################
# mask sequences

cat >! /cluster/bin/scripts/encodeMask.csh << '_EOF_'
#!/bin/csh
if ( $#argv < 2 ) then
    echo " usage: $0 <how>            <seqFileName> [<species>] [<customLib>] "
    echo "        $0 none             <seqFileName> "
    echo "        $0 speciesDefault   <seqFileName> <species> "
    echo "        $0 custom           <seqFileName> <customLibPath> "
    echo "        $0 defaultAndCustom <seqFileName> <species> <customLibPath> " 
    exit 1
endif

set rm  = /cluster/bluearc/RepeatMasker/RepeatMasker
set trf = /cluster/bin/i386/trfBig
set mof = /cluster/bin/i386/maskOutFa
set opt = $1
set seqFileName = $2

cd /tmp
/bin/rm -f $seqFileName*
/bin/cp -f /cluster/bluearc/encode/original/$seqFileName .
$trf $seqFileName $seqFileName.TRF -bed -trf=/cluster/bin/i386/trf
$mof $seqFileName $seqFileName.bed $seqFileName.trf -soft

if ( $opt == "none" ) then
    $rm  -nocut $seqFileName
    $mof $seqFileName.trf $seqFileName.out $seqFileName.fin -softAdd
else if ( $opt == "speciesDefault" ) then
    $rm  -nocut -species $3 $seqFileName
    $mof $seqFileName.trf $seqFileName.out $seqFileName.fin -softAdd
else if ( $opt == "custom" ) then
    $rm  -nocut -lib $3 $seqFileName
    $mof $seqFileName.trf $seqFileName.out $seqFileName.fin -softAdd
else if ( $opt == "defaultAndCustom" ) then
    $rm  -nocut -species $3 $seqFileName
    $mof $seqFileName.trf $seqFileName.out $seqFileName.default -softAdd
    mv $seqFileName.out $seqFileName.$3.out
    $rm  -nocut -lib $4 $seqFileName
    $mof $seqFileName.default $seqFileName.out $seqFileName.fin -softAdd
    mv $seqFileName.out $seqFileName.custom.out
else
    echo " usage: $0 <how>            <seqFileName> [<species>] [<customLib>] "
    echo "        $0 none             <seqFileName> "
    echo "        $0 speciesDefault   <seqFileName> <species> "
    echo "        $0 custom           <seqFileName> <customLibPath> "
    echo "        $0 defaultAndCustom <seqFileName> <species> <customLibPath> " 
    rm -rf $acc*
    exit 1
endif
    
/bin/mv -f $seqFileName.default /cluster/bluearc/encode/default/$seqFileName
/bin/mv -f $seqFileName.fin /cluster/bluearc/encode/masked/$seqFileName
/bin/mv -f $seqFileName.bed /cluster/bluearc/encode/trf/
/bin/mv -f $seqFileName.*out /cluster/bluearc/encode/RepeatMasker/
/bin/rm -rf $seqFileName*
'_EOF_'

#
chmod ug+x /cluster/bin/scripts/encodeMask.csh

## check that all of the sequences are present in the original
## directory to avoid crashed jobs
> wc -l /cluster/bluearc/encode/para/maskList 
#  10709 /cluster/bluearc/encode/para/maskList
> ls -1 /cluster/bluearc/encode/original/| wc -l
#  10709

ssh kk
cd /cluster/bluearc/encode/para
para create maskList
para push
para check

para time
#10709 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 10709 of 10709 jobs
#CPU time in finished jobs:    3111337s   51855.62m   864.26h   36.01d  0.099 y
#IO & Wait Time:                 41641s     694.01m    11.57h    0.48d  0.001 y
#Average job time:                 294s       4.91m     0.08h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:           16313s     271.88m     4.53h    0.19d
#Submission to last job:         17875s     297.92m     4.97h    0.21d

############################################################
# rewrite files with new headers

cat >! writeEncodeFasta.pl << '_EOF_'
#!/usr/bin/perl -W

$releaseDate = shift || die "usage: writeEncodeFasta.pl releaseDate [metadata.txt]";
$metadata    = shift || "metadata.txt";

($sec,$min,$hour,$mday,$mon,$year)=localtime(time);
printf "%4d-%02d-%02d %02d:%02d:%02d Started\n",$year+1900,$mon+1,$mday,$hour,$min,$sec;

$outDir  = "/cluster/bluearc/encode/$releaseDate";
system "mkdir -p $outDir";

if (-e "$outDir") { system "mv -i $outDir/ $outDir.old/"; }
system "mkdir -p $outDir/";
open(METADATA," < $metadata") or die "Can't open $metadata";
while (<METADATA>) 
{
    chomp();
    $org=$region=$acc=$source=$outFile=".";
    if ( /niscOrganism/ ) { next; } # ignore header
    @row = split /\t/;
    if ( $#row != 17 ) { die "Bad input format: $_"; }
    $header        = ">";
    $header       .= join "|", @row;
    $org           = $row[0];
    $region        = $row[1];
    $acc           = $row[12];
    $thisContigNum = $row[15];
    $outFile = "$outDir/$region/$org.$region.fa";
    system ("mkdir -p $outDir/$region");
    system ("touch $outFile");
    open (DEST, " >> $outFile") or die "Can't open $outFile";
    print DEST "$header\n";
    $source = "/cluster/bluearc/encode/masked/$acc";
    unless ( -e $source && $acc ne '.') { $source = "/cluster/bluearc/encode/masked/$org.$region.$thisContigNum.fa";}
    open (SOURCE, " < $source") or die ("Can't open $source");
    <SOURCE>; # ignore header
    while (<SOURCE>) { print DEST; }
    close(SOURCE);
    close(DEST);
}

($sec,$min,$hour,$mday,$mon,$year)=localtime(time);
printf "%4d-%02d-%02d %02d:%02d:%02d Finished writing files\n",$year+1900,$mon+1,$mday,$hour,$min,$sec;

system "cd $outDir/..; chmod o+x $metadata; cp -L $metadata $outDir/; tar cfz $releaseDate.tgz $metadata $releaseDate/EN*/*";

($sec,$min,$hour,$mday,$mon,$year)=localtime(time);
printf "%4d-%02d-%02d %02d:%02d:%02d Finished creating tarball\n",$year+1900,$mon+1,$mday,$hour,$min,$sec;

system "mkdir -p /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate";
system "chmod o+rx /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate";
system "chmod -R o+r  $metadata $outDir/ $outDir/../$releaseDate.tgz";
system "chmod o+rx $outDir $outDir/EN*";
system "rsync -a $outDir/                    /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate/sequences";
system "rsync -a $outDir/../$metadata        /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate/";
system "rsync -a $outDir/../$releaseDate.tgz /usr/local/apache/htdocs/encode/downloads/msa/$releaseDate/";

($sec,$min,$hour,$mday,$mon,$year)=localtime(time);
printf "%4d-%02d-%02d %02d:%02d:%02d End\n",$year+1900,$mon+1,$mday,$hour,$min,$sec;

'_EOF_'

#

chmod ug+x writeEncodeFasta.pl
set releaseDate = JAN-2006
./writeEncodeFasta.pl $releaseDate metadata.txt
# 2006-01-21 08:11:36 Started
# 2006-01-21 08:16:06 Finished writing files
# 2006-01-21 08:25:06 Finished creating tarball
# 2006-01-21 08:26:12 End

