#!/usr/bin/perl
#
# gbMakeDownload [options] database ...
#
# create download sequences
#
# In order to create upstream sequences, the file
# etc/.hg.mkdownload.conf must contain the host/password
# for the database.
#
#
# $Id: gbMakeDownload,v 1.22 2010/04/11 07:36:05 markd Exp $
#
use strict;
use warnings;
use File::Basename;
use FindBin;
use lib "$FindBin::Bin/../lib";
use gbCommon;
setupServerPath();

# Usage text; appended to the gbError() message when an option is invalid
# or no database is given on the command line.
my $usage =
    " gbMakeDownload [options] database ...\n"
    . "\n"
    . "create download files\n"
    . "\n"
    . " Options:\n"
    . "   -getDownloadSeqs=days - create sequence gzipped fasta files and save in\n"
    . "    \$gbRoot/data/ftp if ftp/\$db/download.time doesn't exist or time is\n"
    . "    more than \$days days old. If not specified, they will be recreated only\n"
    . "    if they don't exist. Specify 0 to force recreation.\n"
    . "   -downloadRootDir=dir - save download files in this directory, default is:\n"
    . "       ./data/ftp\n"
    . "   -maxDbs=n - only build files for this many outdated genome databases and stop.\n"
    . "    Since build can take a long time, this helps prevent bogus stale lock messages.\n"
    . "   -maxHours=hours - only build files for this many hours and stop. Avoids bogus stale\n"
    . "    lock messages and too frequent success messages.\n"
    . "   -verbose=n\n";

# Command-line state; the option values are filled in by the option loop
# near the bottom of the file.
my $verboseArg;                    # "-verbose=n" string passed on to gbGetSeqs, if given
my $getDownloadSeqsDays;           # -getDownloadSeqs value, undef when not specified
my $downloadRootDir = "data/ftp";  # -downloadRootDir
my $maxBlatTargetDbs = 3;          # number of targetDbs to keep
my $maxDbs = scalar(@ARGV) + 1;    # init to more than number specified
my $maxHours;                      # -maxHours value, undef means no time limit
my $startTime = time();            # reference point for the -maxHours check

# Get the modification time of a file (seconds since the epoch, as returned
# in element 9 of stat()), or undef if the file doesn't exist.  Dies if the
# file exists but can't be stat-ed.
sub getFileMTime($) {
    my($file) = @_;
    if (! -e $file) {
        # explicit undef: callers assign the result to a scalar
        return undef;
    }
    my @st = stat($file);
    # stat() returns an empty list on failure, so test for emptiness
    # rather than comparing the last index against 0.
    if (!@st) {
        die("can't stat $file: $!");
    }
    return $st[9];
}

# Decide if a download file needs to be (re)built.  A file that doesn't
# exist is always out of date.  An existing file is out of date only when
# -getDownloadSeqs was specified and the file is more than that many days
# old.
sub fileOutOfDate($) {
    my($file) = @_;
    my $modTime = getFileMTime($file);
    my $missing = !defined($modTime);
    if ($missing || !defined($getDownloadSeqsDays)) {
        # no age limit to apply (or no file): only existence matters
        return $missing;
    }
    my $secsPerDay = 60 * 60 * 24;
    my $ageDays = (time() - $modTime) / $secsPerDay;
    return ($ageDays > $getDownloadSeqsDays);
}

# Is $file older than $other?  Also returns true if either of the two
# files doesn't exist.
sub fileOlder($$) {
    my($file, $other) = @_;
    my $fileTime = getFileMTime($file);
    my $otherTime = getFileMTime($other);
    return 1 unless (defined($fileTime) && defined($otherTime));
    return $fileTime < $otherTime;
}

# Is loading enabled for a partition?  Looks up the
# "$srcDb.$type.$orgCat.load" setting for the database via getDbConfNo().
sub isLoadEnabled($$$$) {
    my($db, $srcDb, $type, $orgCat) = @_;
    my $confKey = join(".", $srcDb, $type, $orgCat, "load");
    return getDbConfNo($db, $confKey);
}

# Is a blat targetDb enabled?  Looks up the "$srcDb.mrna.blatTargetDb"
# setting for the database via getDbConfNoUndef().
sub isBlatTargetDbEnabled($$) {
    my($db, $srcDb) = @_;
    my $confKey = $srcDb . ".mrna.blatTargetDb";
    return getDbConfNoUndef($db, $confKey);
}

# Build the dot-separated key (srcDb.type.orgCat) used to look up the
# base name of a sequence fa file.
sub mkSeqBaseKey($$$) {
    my($srcDb, $type, $orgCat) = @_;
    return join(".", $srcDb, $type, $orgCat);
}

# Table mapping a sequence key (see mkSeqBaseKey) to the base name of the
# corresponding download fa file.
my %baseNameTbl = (
    mkSeqBaseKey("genbank", "mrna", "native") => "mrna",
    mkSeqBaseKey("genbank", "mrna", "xeno")   => "xenoMrna",
    mkSeqBaseKey("genbank", "est", "native")  => "est",
    mkSeqBaseKey("refseq", "mrna", "native")  => "refMrna",
    mkSeqBaseKey("refseq", "mrna", "xeno")    => "xenoRefMrna",
);

# Get the path of the gzipped fasta download file for a partition:
# the database's sequence download directory plus the base name found in
# %baseNameTbl.
sub getSeqFaPath($$$$) {
    my($db, $srcDb, $type, $orgCat) = @_;
    my $key = mkSeqBaseKey($srcDb, $type, $orgCat);
    my $faBase = $baseNameTbl{$key};
    my $dir = getSeqDownloadDir($downloadRootDir, $db);
    return $dir . "/" . $faBase . ".fa.gz";
}

# Create an md5sum file for $file.  The name recorded in $md5file is $name
# rather than the path that was checksummed, so the sum can be computed on
# a temporary file but refer to the final file name.  Dies if the checksum
# can't be obtained or the md5 file can't be written.
sub createMd5($$$) {
    my($file, $name, $md5file) = @_;
    # presumably callProg returns the program's stdout -- confirm in gbCommon
    my $sumstr = callProg("md5sum $file");
    chomp($sumstr);
    # md5sum output is "<sum>  <path>"; only the sum is wanted
    my($sum) = split(/\s+/, $sumstr);
    if (!defined($sum) || ($sum eq "")) {
        die("no checksum returned by md5sum for $file");
    }
    open(my $md5Fh, ">", $md5file) || die("can't open $md5file: $!");
    print {$md5Fh} "$sum  $name\n" or die("can't write $md5file: $!");
    # buffered write errors only show up at close, so check it
    close($md5Fh) || die("can't close $md5file: $!");
}

# Build a sequence fa file and its md5sum file.  Both are first written
# under temporary names and then renamed into place.
sub makeSeqFa($$$$$) {
    my($db, $srcDb, $type, $orgCat, $seqFa) = @_;
    prMsg("creating $seqFa");
    makeFileDir($seqFa);

    my $md5 = $seqFa . ".md5";
    my $tmpFa = $seqFa . ".tmp.gz";
    my $tmpMd5 = $md5 . ".tmp";

    # assemble the gbGetSeqs command line, passing on -verbose if it was given
    my @cmdWords = ("gbGetSeqs", "-get=seq", "-db=$db", "-$orgCat");
    if (defined($verboseArg)) {
        push(@cmdWords, $verboseArg);
    }
    push(@cmdWords, $srcDb, $type, $tmpFa);
    runProg(join(" ", @cmdWords));

    # the checksum is computed on the tmp file but records the final name
    createMd5($tmpFa, basename($seqFa), $tmpMd5);
    renameFile($tmpFa, $seqFa);
    renameFile($tmpMd5, $md5);
}

# Create one sequence FASTA file if loading is enabled for the partition
# in this genome and the file is out of date or doesn't exist.  With
# $justCheck, nothing is built.  Returns 1 if the file needs (or needed)
# updating, otherwise 0.
sub maybeMakeSeqFa($$$$$) {
    my($justCheck, $db, $srcDb, $type, $orgCat) = @_;
    if (!isLoadEnabled($db, $srcDb, $type, $orgCat)) {
        return 0;
    }
    my $seqFa = getSeqFaPath($db, $srcDb, $type, $orgCat);
    if (!fileOutOfDate($seqFa)) {
        return 0;
    }
    makeSeqFa($db, $srcDb, $type, $orgCat, $seqFa) unless $justCheck;
    return 1;
}

# Get the base name for a BLAT targetDb given the srcDb: "refMrna" for
# refseq, "mrna" for anything else.
sub getBlatTargetDbBaseName($) {
    my($srcDb) = @_;
    if ($srcDb eq "refseq") {
        return "refMrna";
    }
    return "mrna";
}


# Get the sorted list of existing blat targetDb 2bit files for a database
# and srcDb, found by globbing "<baseName>.*T*.2bit" in the targetDb
# directory.
sub getBlatTargetDbs($$) {
    my($db, $srcDb) = @_;
    my $dir = getBlatTargetDbDir($db);
    my $baseName = getBlatTargetDbBaseName($srcDb);
    my $pattern = $dir . "/" . $baseName . ".*T*.2bit";
    my @twoBits = glob($pattern);
    return sort(@twoBits);
}

# Generate the path for a new blat targetDb 2bit file, with the current
# isoTimeStamp() embedded in the file name.
sub mkBlatTargetDbName($$) {
    my($db, $srcDb) = @_;
    my $dir = getBlatTargetDbDir($db);
    my $baseName = getBlatTargetDbBaseName($srcDb);
    my $stamp = isoTimeStamp();
    return "$dir/$baseName.$stamp.2bit";
}

# Build a new blat targetDb 2bit file from a sequence fa file.  The 2bit
# is written under a temporary name and renamed into place when
# faToTwoBit is done.
sub makeBlatTargetDb($$$) {
    my($db, $srcDb, $seqFa) = @_;
    my $twoBit = mkBlatTargetDbName($db, $srcDb);
    my $tmpTwoBit = "$twoBit.tmp";
    prMsg("creating $twoBit");
    makeFileDir($twoBit);
    runProg("faToTwoBit -ignoreDups $seqFa $tmpTwoBit");
    renameFile($tmpTwoBit, $twoBit);
}

# Create a blat targetDb 2bit file if native loading and the blat targetDb
# are both enabled and the newest existing 2bit is out of date with
# respect to the native fasta (or the caller says the fasta is stale, or
# no 2bit exists yet).  With $justCheck nothing is built.  Returns 1 if an
# update is (or was) needed, otherwise 0.
sub maybeMakeBlatTargetDb($$$$$) {
    my($justCheck, $db, $srcDb, $type, $srcFaStale) = @_;
    if (!isLoadEnabled($db, $srcDb, $type, "native")) {
        return 0;
    }
    if (!isBlatTargetDbEnabled($db, $srcDb)) {
        return 0;
    }

    # sorted list, so the last entry is taken as the most recent one
    my @existing = getBlatTargetDbs($db, $srcDb);
    my $seqFa = getSeqFaPath($db, $srcDb, $type, "native");
    my $stale = $srcFaStale || (scalar(@existing) == 0)
        || fileOlder($existing[-1], $seqFa);
    if (!$stale) {
        return 0;
    }

    if (!$justCheck) {
        makeBlatTargetDb($db, $srcDb, $seqFa);
        # remove targetDbs over max: count the one just built, and drop
        # enough from the front of the list to get back to the maximum
        my $numExtra = scalar(@existing) + 1 - $maxBlatTargetDbs;
        if ($numExtra >= 1) {
            unlinkFiles(@existing[0..($numExtra-1)]);
        }
    }
    return 1;
}

# Make the sequence fasta download files and blat targetDbs for a
# database, or with $justCheck only determine if any need updating.
# Every file is checked, in a fixed order; returns 1 if any needed
# updating, otherwise 0.
sub maybeMakeSeqFaFiles($$) {
    my($justCheck, $db) = @_;
    my $anyStale = 0;

    # genbank native mRNAs, followed by the targetDb built from them
    my $gbMrnaStale = maybeMakeSeqFa($justCheck, $db, "genbank", "mrna", "native") ? 1 : 0;
    $anyStale = 1 if $gbMrnaStale;
    $anyStale = 1 if maybeMakeBlatTargetDb($justCheck, $db, "genbank", "mrna", $gbMrnaStale);

    # partitions that have no targetDb
    foreach my $part (["genbank", "mrna", "xeno"],
                      ["genbank", "est", "native"],
                      ["refseq", "mrna", "xeno"]) {
        my($srcDb, $type, $orgCat) = @$part;
        $anyStale = 1 if maybeMakeSeqFa($justCheck, $db, $srcDb, $type, $orgCat);
    }

    # refseq native mRNAs, followed by the targetDb built from them
    my $refMrnaStale = maybeMakeSeqFa($justCheck, $db, "refseq", "mrna", "native") ? 1 : 0;
    $anyStale = 1 if $refMrnaStale;
    $anyStale = 1 if maybeMakeBlatTargetDb($justCheck, $db, "refseq", "mrna", $refMrnaStale);

    return $anyStale;
}

# Make a gzipped upstream FASTA file of the given size for a gene table,
# along with its md5sum file.  Both are written under temporary names and
# renamed into place.
sub makeUpstreamFaSize($$$$) {
    my($db, $geneTbl, $size, $fa) = @_;
    my $md5 = $fa . ".md5";
    my($tmpFa, $tmpMd5) = ("$fa.tmp.gz", "$md5.tmp");
    makeDir(dirname($fa));
    my $pipeline = join(" | ",
                        "featureBits -fa=stdout $db ${geneTbl}:upstream:${size}",
                        "gzip -c > $tmpFa");
    runPipe($pipeline);
    # the checksum is computed on the tmp file but records the final name
    createMd5($tmpFa, basename($fa), $tmpMd5);
    renameFile($tmpFa, $fa);
    renameFile($tmpMd5, $md5);
}

# Get the gene table to use for creating upstream files: the database's
# "upstreamGeneTbl" conf value, or undef if there is none.
sub getUpstreamGeneTbl($) {
    my($db) = @_;
    # evaluate in scalar context so the result is always a single value
    return scalar(getDbConfUndef($db, "upstreamGeneTbl"));
}

# Get the pairs of upstream MAF tables and organism files to use for
# creating upstream MAFs, from the database's "upstreamMaf" conf value.
# Returns a flat list (mafTbl1, orgs1, mafTbl2, orgs2, ...).  If the value
# isn't specified, returns an empty list (undef in scalar context).  Calls
# gbError() if the value doesn't hold an even number of items.
sub getUpstreamMafTblsOrgs($) {
    my($db) = @_;
    my $spec = getDbConfUndef($db, "upstreamMaf");
    if (!defined($spec)) {
        # bare return: "return undef" would hand a list-context caller
        # the one-element list (undef), which looks like half a pair
        return;
    }
    my @parts = splitSpaceList($spec);
    if ((scalar(@parts) % 2) != 0) {
        gbError("expected pairs of arguments to $db.upstreamMaf, got: $spec");
    }
    return @parts;
}

# Create the upstream FASTA file of one size for a database if an upstream
# gene table is configured and the file doesn't exist or is out of date.
# With $justCheck nothing is built.  Returns 1 if the file needs (or
# needed) updating, otherwise 0.
sub maybeMakeUpstreamFaSize($$$) {
    my($justCheck, $db, $size) = @_;
    my $fa = getSeqDownloadDir($downloadRootDir, $db) . "/upstream${size}.fa.gz";
    my $geneTbl = getUpstreamGeneTbl($db);
    if (!defined($geneTbl) || !fileOutOfDate($fa)) {
        return 0;
    }
    makeUpstreamFaSize($db, $geneTbl, $size, $fa) unless $justCheck;
    return 1;
}

# Create the upstream FASTA files (1000, 2000 and 5000 bases) for a
# database if they don't exist or are out of date.  All sizes are
# processed; returns 1 if any needed updating, otherwise 0.
sub maybeMakeUpstreamFa($$) {
    my($justCheck, $db) = @_;
    my $staleCnt = 0;
    foreach my $size (1000, 2000, 5000) {
        $staleCnt++ if maybeMakeUpstreamFaSize($justCheck, $db, $size);
    }
    return ($staleCnt > 0) ? 1 : 0;
}

# Make a gzipped upstream MAF file of the given size, along with its
# md5sum file.  Both are written under temporary names and renamed into
# place.
sub makeUpstreamMafSize($$$$$$) {
    my($db, $geneTbl, $mafTbl, $mafOrgs, $size, $maf) = @_;
    prMsg("creating $maf");
    makeFileDir($maf);

    my $md5 = $maf . ".md5";
    my $tmpMaf = $maf . ".tmp.gz";
    my $tmpMd5 = $md5 . ".tmp";

    my @stages = (
        # WARNING: -fa=/dev/null is needed, as it changes the bed format
        "featureBits -bed=stdout -fa=/dev/null -verbose=0 $db ${geneTbl}:upstream:${size}",
        # sed cleans up the nasty names generated by featureBits
        "sed -re 's/_up_[^\\t]+//'",
        "sort -k1,1 -k2,2n",
        "mafFrags -txStarts -meFirst -orgs=${mafOrgs} ${db} ${mafTbl} stdin stdout",
        "gzip -c > $tmpMaf",
    );
    runPipe(join(" | ", @stages));

    # the checksum is computed on the tmp file but records the final name
    createMd5($tmpMaf, basename($maf), $tmpMd5);
    renameFile($tmpMaf, $maf);
    renameFile($tmpMd5, $md5);
}

# Create the upstream MAF file of one size for a database and MAF table if
# it doesn't exist or is out of date.  Does nothing unless both an
# upstream gene table and a MAF table are defined.  With $justCheck
# nothing is built.  Returns 1 if the file needs (or needed) updating,
# otherwise 0.
sub maybeMakeUpstreamMafSize($$$$$) {
    my($justCheck, $db, $mafTbl, $mafOrgs, $size) = @_;
    my $geneTbl = getUpstreamGeneTbl($db);
    if (!defined($geneTbl) || !defined($mafTbl)) {
        return 0;
    }
    my $maf = getMafDownloadDir($downloadRootDir, $db, $mafTbl) . "/upstream${size}.maf.gz";
    if (!fileOutOfDate($maf)) {
        return 0;
    }
    makeUpstreamMafSize($db, $geneTbl, $mafTbl, $mafOrgs, $size, $maf) unless $justCheck;
    return 1;
}

# Create the upstream MAF download files (1000, 2000 and 5000 bases) for a
# database and MAF table if they don't exist or are out of date.  All
# sizes are processed; returns 1 if any needed updating, otherwise 0.
sub maybeMakeUpstreamMaf($$$$) {
    my($justCheck, $db, $mafTbl, $mafOrgs) = @_;
    my $staleCnt = 0;
    foreach my $size (1000, 2000, 5000) {
        $staleCnt++ if maybeMakeUpstreamMafSize($justCheck, $db, $mafTbl, $mafOrgs, $size);
    }
    return ($staleCnt > 0) ? 1 : 0;
}

# Create the upstream MAF download files for every (MAF table, organisms)
# pair configured for a database, if they don't exist or are out of date.
# Returns 1 if any needed updating, otherwise 0.
sub maybeMakeUpstreamMafs($$) {
    my($justCheck, $db) = @_;
    my @remaining = getUpstreamMafTblsOrgs($db);
    my $anyStale = 0;
    # consume the flat list two items at a time: table, then organisms
    while (scalar(@remaining) > 0) {
        my($mafTbl, $mafOrgs) = splice(@remaining, 0, 2);
        if (maybeMakeUpstreamMaf($justCheck, $db, $mafTbl, $mafOrgs)) {
            $anyStale = 1;
        }
    }
    return $anyStale;
}

# Make all download files for a database (sequence fasta files and
# targetDbs, upstream FASTAs, upstream MAFs), or with $justCheck only
# determine if any need updating.  All three groups are always processed;
# returns 1 if any needed updating, otherwise 0.
sub updateDownloadFiles($$) {
    my($justCheck, $db) = @_;
    prMsg("making download sequences for $db") unless $justCheck;
    my $seqStale = maybeMakeSeqFaFiles($justCheck, $db);
    my $upFaStale = maybeMakeUpstreamFa($justCheck, $db);
    my $upMafStale = maybeMakeUpstreamMafs($justCheck, $db);
    return ($seqStale || $upFaStale || $upMafStale) ? 1 : 0;
}

# Test if we should keep processing databases.  Returns 0 once $upCnt
# databases have reached the -maxDbs limit, or once -maxHours hours have
# passed since the program started; otherwise returns 1.
sub keepGoing($) {
    my($upCnt) = @_;
    return 0 if ($upCnt >= $maxDbs);
    if (defined($maxHours)) {
        my $elapsedHours = (time() - $startTime) / (60 * 60);
        return 0 if ($elapsedHours >= $maxHours);
    }
    return 1;
}

# Entry: consume the leading -options; whatever is left in @ARGV is the
# list of databases.
while (@ARGV && ($ARGV[0] =~ /^-/)) {
    my $opt = shift(@ARGV);
    if ($opt =~ /^-verbose=/) {
        # the level is used here and also passed on to gbGetSeqs
        my $level = parseOptEq($opt);
        $verboseArg = "-verbose=" . $level;
        $gbCommon::verbose = $level;
    } elsif ($opt =~ /^-getDownloadSeqs(=|$)/) {
        $getDownloadSeqsDays = parseOptEq($opt);
    } elsif ($opt =~ /^-downloadRootDir(=|$)/) {
        $downloadRootDir = parseOptEq($opt);
    } elsif ($opt =~ /^-maxDbs(=|$)/) {
        $maxDbs = parseOptEq($opt);
    } elsif ($opt =~ /^-maxHours(=|$)/) {
        $maxHours = parseOptEq($opt);
    } else {
        gbError("invalid option \"$opt\"\n$usage");
    }
}
gbError("wrong # args: $usage") unless @ARGV;
my @databases = @ARGV;
# database connection settings are taken from this conf file
$ENV{"HGDB_CONF"} = "etc/.hg.mkdownload.conf";

# Before locking, do a check-only pass to find the databases that have
# something to build.  If there are none, exit quietly; this avoids a
# log and e-mail message when there is nothing to do.
my @outdatedDbs;
foreach my $db (@databases) {
    push(@outdatedDbs, $db) if updateDownloadFiles(1, $db);
}
exit(0) unless @outdatedDbs;
             
# Build the download files for the outdated databases, until the -maxDbs
# or -maxHours limit is reached; the remaining databases are skipped.
beginTask("mkdownload", "mkdownload");
prMsg("outdated: " . join(" ", @outdatedDbs));
my $upCnt = 0;
foreach my $db (@outdatedDbs) {
    next unless keepGoing($upCnt);
    $upCnt++ if updateDownloadFiles(0, $db);
}
endTask();

