#!/usr/bin/env python
"""webSync - mirror a directory tree from an https server.

The server publishes a ``files.txt`` listing (output of ``du -ab``). This
script compares that listing with the files in the current directory and
downloads everything that is missing or smaller here, using aria2c.
All state and logs are kept in the ``webSyncLog`` directory.

The code runs on both Python 2.7 and Python 3.
"""

import atexit
import logging
import optparse
import os
import string
import sys
import time
from collections import defaultdict
from os.path import join, basename, dirname, isfile, isdir

try:
    from shlex import quote as shellQuote
except ImportError:  # Python 2 has no shlex.quote
    from pipes import quote as shellQuote

logDir = "webSyncLog"
flagFname = join(logDir, "isRunning.flag")

usageText = """usage: %prog [options] - download from https server, using files.txt on their end to get the list of files

To create files.txt on the remote end, this simple command can be used to
create a list of files:
    du -ab > files.txt
But the above command is slow, includes directories (will lead to warnings)
and does not follow symlinks, so rather use this command:
    find -L . -type f -print0 | du -Lab --files0-from=- > files.txt

Then run this in the download directory:
    webSync https://there.org/

This will create a "webSyncLog" directory in the current directory, compare
https://there.org/files.txt with the files in the current directory,
transfer the missing files and write the changes to webSyncLog/transfer.log.

The URL will be saved after the first run and is not necessary from then on.
You can add
    cd xxx && webSync
to your crontab. It will not start if it's already running (flagfile).

Status files after a run:
- webSyncLog/biggerHere.txt - list of files that are bigger here. These could be errors or OK.
- webSyncLog/files.here.txt - the list of files here
- webSyncLog/files.there.txt - the list of files there, current copy of https://there.org/files.txt
- webSyncLog/missingThere.txt - the list of files not on https://there.org anymore but here
- webSyncLog/transfer.log - big transfer log, each run, date and size of transferred file is noted here.
"""

# ==== functions =====


def parseArgs():
    """Set up logging and parse the command line.

    Prints the help page and exits with status 1 when no URL was given and
    no URL was saved by a previous run. More than one positional argument is
    rejected, as only a single URL makes sense.

    Returns:
        tuple (args, options): positional arguments and the optparse options.
    """
    parser = optparse.OptionParser(usageText)

    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="show debug messages")
    # type="int": the value is formatted with %d into the aria2c command line,
    # a string would raise a TypeError there.
    parser.add_option("-x", "--connections", dest="connections", action="store",
                      type="int",
                      help="Maximum number of parallel connections to the server, default %default",
                      default=10)
    parser.add_option("-s", "--skipScan", dest="skipScan", action="store_true",
                      help="Do not scan local file sizes again, in case you know it is up to date")
    (options, args) = parser.parse_args()

    if len(args) > 1:
        parser.error("expected at most one URL argument, got %d" % len(args))

    if args == [] and not isfile(join(logDir, "url.txt")):
        parser.print_help()
        sys.exit(1)

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    return args, options


def run(cmd, mustRun=True):
    """Run a shell command through os.system.

    Args:
        cmd: complete shell command line (already quoted by the caller).
        mustRun: if true, a non-zero exit status terminates the program.

    Returns:
        The raw os.system() status of the command.

    Raises:
        SystemExit: with status 1 if the command failed and mustRun is true.
    """
    logging.info("Running %s" % cmd)
    ret = os.system(cmd)
    if ret != 0 and mustRun:
        logging.error("Could not run command %s" % cmd)
        # exit non-zero so cron and calling scripts can detect the failure
        sys.exit(1)
    return ret


def delFlag():
    """Remove the "is running" flag file. Registered with atexit."""
    if isfile(flagFname):
        logging.debug("Removing flag file")
        os.remove(flagFname)


def parseFileList(fname):
    """Read 'du -ab' output and return the file sizes and directories in it.

    Every line has the format "<size><whitespace><path>". A leading "./" is
    removed from the path. Names that are the parent directory of another
    entry are treated as directories and removed from the size dictionary.
    Blank lines, malformed lines, the "." total line written by plain
    'du -ab' and paths with a ".." component are skipped.

    Args:
        fname: name of the listing file.

    Returns:
        tuple (sizes, dirs): dict filename -> size in bytes, set of directory names.
    """
    logging.info("Reading %s" % fname)
    sizes = dict()
    dirs = set()
    with open(fname) as ifh:
        for lineNo, line in enumerate(ifh, 1):
            line = line.rstrip("\n")
            if not line.strip():
                continue

            fields = line.split(None, 1)
            if len(fields) != 2 or not fields[0].isdigit():
                logging.warning("%s:%d: skipping malformed line %r" % (fname, lineNo, line))
                continue

            size = int(fields[0])
            path = fields[1].strip()
            # strip only the "./" prefix: stripping all leading dots would
            # also eat the dot of hidden files like ".bashrc"
            if path.startswith("./"):
                path = path[2:]
            path = path.lstrip("/")
            if path in ("", "."):
                # the grand total line of 'du -ab', not a file
                continue
            if ".." in path.split("/"):
                # the listing comes from a remote server: never leave the download directory
                logging.warning("%s:%d: skipping path with '..': %r" % (fname, lineNo, path))
                continue

            parentDir = dirname(path)
            if parentDir != "":
                dirs.add(parentDir)
            sizes[path] = size

    for d in dirs:
        if d in sizes:
            del sizes[d]

    logging.info("Read %d filenames, %d directories" % (len(sizes), len(dirs)))
    return sizes, dirs


def writeDiffUrls(thereFiles, hereFiles, url, outFname, biggerHereFname, missingThereFname, logFh):
    """Write the aria2c input file with all files that have to be downloaded.

    A file is downloaded if it is missing here or smaller here than there.
    Files that are bigger here are listed in biggerHereFname, files that
    exist only here are listed in missingThereFname. The remote "files.txt"
    itself is never downloaded. Names are processed in sorted order so that
    the output files are reproducible.

    Args:
        thereFiles: dict filename -> size on the server.
        hereFiles: dict filename -> size in the local directory.
        url: base URL, prefixed to every filename.
        outFname: aria2c input file to write.
        biggerHereFname: output file, one "name sizeHere sizeThere diff" line per file.
        missingThereFname: output file, one filename per line.
        logFh: open transfer log; a start header is written if there is work.

    Returns:
        False if there is nothing to download, otherwise True.
    """
    logging.debug("Writing missing files to %s" % outFname)
    missingCount = 0
    smallerHereCount = 0
    smallerThereCount = 0
    count = 0
    totalSizeToGet = 0
    missingThereCount = 0

    with open(outFname, "w") as ofh, \
            open(biggerHereFname, "w") as biggerHereFh, \
            open(missingThereFname, "w") as missingThereFh:
        for fn, sizeThere in sorted(thereFiles.items()):
            if fn == "files.txt":
                continue
            sizeHere = hereFiles.get(fn)
            doGet = False
            if sizeHere is None:
                missingCount += 1
                doGet = True
            elif sizeHere > sizeThere:
                errMsg = "File %s is bigger here than there: here %d bytes, there %d bytes, diff %d" % \
                    (fn, sizeHere, sizeThere, (sizeHere - sizeThere))
                logging.debug(errMsg)
                shortErr = "%s %d %d %d\n" % (fn, sizeHere, sizeThere, (sizeHere - sizeThere))
                biggerHereFh.write(shortErr)
                smallerThereCount += 1
            elif sizeHere < sizeThere:
                smallerHereCount += 1
                doGet = True

            if doGet:
                totalSizeToGet += sizeThere
                ofh.write(join(url, fn))
                ofh.write("\n")
                # aria2c input file syntax: indented lines are options of the URL above
                ofh.write(" dir=%s\n" % dirname(fn))
                ofh.write(" out=%s\n" % basename(fn))
                count += 1

        for fn in sorted(hereFiles):
            if fn not in thereFiles:
                missingThereFh.write("%s\n" % fn)
                missingThereCount += 1

    if missingThereCount != 0:
        logging.info("Found %d files that are missing there (removed by upstream, see webSyncLog/missingThere.txt)" % missingThereCount)
    if smallerThereCount != 0:
        logging.info("Found %d files that are bigger here (errors? see webSyncLog/biggerHere.txt)" % smallerThereCount)
    if missingCount != 0:
        logging.info("Found %d files that are missing here" % missingCount)
    if smallerHereCount != 0:
        logging.info("Found %d files that are smaller here" % smallerHereCount)

    if count == 0:
        logging.info("Nothing to download")
        return False

    logging.info("downloading %d files now (missing here or smaller here), %d bytes" % (count, totalSizeToGet))

    logFh.write("========== webSync start %s ===========\n" % time.strftime("%Y-%m-%d %H:%M"))
    logFh.write("missingHere=%d smallerHere=%d biggerHere=%d\n" % (missingCount, smallerHereCount, smallerThereCount))
    logFh.flush()
    return True


def runAria2c(fname, ariaLog, logFh, connCount):
    """Run aria2c with fname as its input file and redirect its output to ariaLog.

    Args:
        fname: aria2c input file, as written by writeDiffUrls.
        ariaLog: file that receives the stdout of aria2c.
        logFh: open transfer log, a start timestamp is written to it.
        connCount: maximum number of connections per server (aria2c -x).

    Raises:
        SystemExit: via run() if aria2c exits with a non-zero status.
    """
    # XX is continue=true a good idea?
    # XX check-certificate=false is definitely a bad idea
    logFh.write("aria2c start %s \n" % time.strftime("%Y-%m-%d %H:%M"))
    logFh.flush()
    cmd = ("aria2c -x %d -Z -i %s --summary-interval=0 --continue=true "
           "--enable-color=false --check-certificate=false "
           "--auto-file-renaming=false --file-allocation=none "
           "--allow-overwrite=true > %s") % \
        (int(connCount), shellQuote(fname), shellQuote(ariaLog))
    run(cmd)


def _lookupSize(fileSizes, fname, url):
    """Return the size of fname, a relative path or a full URL below url, or -1 if unknown."""
    if fname in fileSizes:
        return fileSizes[fname]
    if url and fname.startswith(url):
        return fileSizes.get(fname[len(url):].lstrip("/"), -1)
    return -1


def rewriteAriaLog(ariaLogFname, logFh, fileSizes, url=None):
    """Copy the per-file result lines of an aria2c log to the transfer log.

    Result lines of aria2c look like this:
      660a86|OK  |       0B/s|DCM_FrazerGroup/07.26.2017/RSEM_out/b0e17437.RSEMLog
      ff071f|ERR |       0B/s|https://cirmtransfer.salk.edu/DCM_FrazerGroup/02.16.2017/STAR_out/2
      1ac78e|INPR|        n/a|https://cirmtransfer.salk.edu/DCM_BruneauGroup/QC_figures/x.jpg
    Failed or unfinished downloads are reported with their URL instead of the
    local path, so their size cannot always be looked up by name.

    Args:
        ariaLogFname: stdout of aria2c, as written by runAria2c.
        logFh: open transfer log.
        fileSizes: dict filename -> size, used to add the size to each line.
        url: optional base URL, stripped from URL entries to find their size.
            Entries whose size cannot be determined are logged with size -1.
    """
    logFh.write("aria2c transfer log parsing %s\n" % time.strftime("%Y-%m-%d %H:%M"))
    with open(ariaLogFname) as ifh:
        for line in ifh:
            if line.count("|") != 3:
                continue
            if line.startswith("gid"):
                # header line of the result table
                continue
            gid, status, speed, fname = [field.strip() for field in line.split("|")]
            fSize = _lookupSize(fileSizes, fname, url)
            logFh.write("*%s\t%s\t%s\t%d\n" % (status, speed, fname, fSize))
    logFh.write("aria2c log parse done %s\n" % time.strftime("%Y-%m-%d %H:%M"))


def webSync(url, options):
    """Download all missing or incomplete files below url, using files.txt and aria2c.

    Args:
        url: base URL of the server directory that contains files.txt.
        options: optparse options, the attributes skipScan and connections are used.

    Raises:
        SystemExit: if the flag file of another instance exists or an
            external command (wget, find/du, aria2c) fails.
    """
    if not isdir(logDir):
        os.makedirs(logDir)

    if isfile(flagFname):
        logging.error("%s exists. It looks like another instance of webSync is already running." % flagFname)
        sys.exit(1)

    # register the cleanup only now: the flag of another instance must not be removed
    atexit.register(delFlag)
    with open(flagFname, "w"):
        pass

    filesThereName = join(logDir, "files.there.txt")
    filesHereName = join(logDir, "files.here.txt")

    if isfile(filesThereName) and not options.skipScan:
        os.remove(filesThereName)
    if isfile(filesHereName) and not options.skipScan:
        os.remove(filesHereName)

    if not isfile(filesThereName):
        fileUrl = join(url, "files.txt")
        logging.debug("Downloading %s" % fileUrl)
        # quote the URL: characters like & or ? must not be interpreted by the shell
        cmd = "wget -q %s --no-check-certificate -O %s" % (shellQuote(fileUrl), shellQuote(filesThereName))
        run(cmd)

    if not isfile(filesHereName):
        cmd = "find -L . -type f -print0 | du -Lab --files0-from=- > %s" % shellQuote(filesHereName)
        run(cmd)

    hereFiles, hereDirs = parseFileList(filesHereName)
    thereFiles, thereDirs = parseFileList(filesThereName)

    logging.debug("checking %d directories, e.g. %s" % (len(thereDirs), list(thereDirs)[:3]))
    for d in thereDirs:
        if not isdir(d):
            os.makedirs(d)

    biggerHereFname = join(logDir, "biggerHere.txt")
    missingThereFname = join(logDir, "missingThere.txt")
    ariaCmdFname = join(logDir, "aria2c.in.tmp")
    logFname = join(logDir, "transfer.log")

    logFh = open(logFname, "a")
    try:
        doDownload = writeDiffUrls(thereFiles, hereFiles, url, ariaCmdFname,
                                   biggerHereFname, missingThereFname, logFh)
        if doDownload:
            ariaLogFname = join(logDir, "aria2c.out.tmp")
            runAria2c(ariaCmdFname, ariaLogFname, logFh, options.connections)
            rewriteAriaLog(ariaLogFname, logFh, thereFiles, url)
            logFh.write("========== webSync end %s ===========\n" % time.strftime("%Y-%m-%d %H:%M"))
    finally:
        logFh.close()
    logging.info("websync done")


# ----------- main --------------
def main():
    """Entry point: get the URL from the command line or from the saved url.txt and sync."""
    args, options = parseArgs()

    urlFname = join(logDir, "url.txt")
    if len(args) == 1:
        url = args[0]
        # on the very first run the log directory does not exist yet
        if not isdir(logDir):
            os.makedirs(logDir)
        with open(urlFname, "w") as ofh:
            ofh.write(url)
    else:
        with open(urlFname) as ifh:
            url = ifh.read().strip()

    webSync(url, options)


if __name__ == "__main__":
    main()