#!/usr/bin/env python3

import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re
import shlex
from pathlib import Path
from collections import defaultdict, OrderedDict
from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir, normpath
#import pyyaml   # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml()

# ==== functions =====
# allowed file types by hubtools up
# copied from the javascript file hgHubConnect.js
fileTypeExtensions = {
    "bigBed": [ ".bb", ".bigbed" ],
    "bam": [ ".bam" ],
    "vcf": [ ".vcf" ],
    "vcfTabix": [ ".vcf.gz", "vcf.bgz" ],
    "bigWig": [ ".bw", ".bigwig" ],
    "hic": [ ".hic" ],
    "cram": [ ".cram" ],
    "bigBarChart": [ ".bigbarchart" ],
    "bigGenePred": [ ".bgp", ".biggenepred" ],
    "bigMaf": [ ".bigmaf" ],
    "bigInteract": [ ".biginteract" ],
    "bigPsl": [ ".bigpsl" ],
    "bigChain": [ ".bigchain" ],
    "bamIndex": [ ".bam.bai", ".bai" ],
    "tabixIndex": [ ".vcf.gz.tbi", "vcf.bgz.tbi" ]
}

asHead = """table bed
"Browser extensible data (<=12 fields) "
    (
"""

asLines = """
    string chrom;      "Chromosome (or contig, scaffold, etc.)"
    uint   chromStart; "Start position in chromosome"
    uint   chromEnd;   "End position in chromosome"
    string name;       "Name of item"
    uint   score;      "Score from 0-1000"
    char[1] strand;    "+ or -"
    uint thickStart;   "Start of where display should be thick (start codon)"
    uint thickEnd;     "End of where display should be thick (stop codon)"
    uint reserved;     "Used as itemRgb as of 2004-11-22"
    int blockCount;    "Number of blocks"
    int[blockCount] blockSizes; "Comma separated list of block sizes"
    int[blockCount] chromStarts; "Start positions relative to chromStart"
""".split("\n")

def parseArgs():
    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
    parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs

    hubtools make <assemblyCode e.g. hg38>: create a track hub for all bigBed/bigWig files under a directory.
    Creates single-file hub.txt, and tries to guess reasonable settings from the file names:
    - bigBed/bigWig files in the current directory will be top level tracks
    - big* files in subdirectories become composites
    - for every filename, the part before the first dot becomes the track base name
    - if a directory has more than 80% of track base names with both a bigBed
      and bigWig file, views are activated for this composite
    - track attributes can be changed using tracks.ra, tracks.tsv, tracks.json or tracks.yaml files, in
      each top or subdirectory
    - The first column of tracks.tsv must be 'track'.
    - tracks.json and tracks.yaml must have objects at the top level, the attributes are <trackName> or
      the special attribute "hub".

    hubtools up <hubName>: upload files to hubSpace
    - needs ~/.hubtools.conf with a line apiKey="xxxx". Create a key by going to
      My Data > Track Hubs > Track Development on https://genome.ucsc.edu
    - uploads all files from the -i directory or the current dir if not specified.

    hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt.
    - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/
    - <db> is assembly identifier

    hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc.
    - <fname> is the input filename. "hub" and "genome" stanzas are skipped.
    - output goes to stdout

    hubtools conv -i myTsvDir
    - convert .tsv files in inDir to .bigBed files in current directory

    hubtools archive -i xxx/archive -o outDir
    - Create a hub from an extracted xxx.tar.gz file "archive" directory. You can download a .tar.gz from
      in the Genome Browser under My Data > My Session. The archive contains all your custom tracks.
      It is usually easier to use the 'ct' command, as this will automatically download an archive
      an convert it, given a full genome browser URL with an "hgsid" parameter on the URL.

    hubtools ct <hgTracksUrl>
    - Download all custom tracks from a Genome Browser URL, either as a /s/ stable shortlink or 
      the temporary hgsid=xxxx URL

    Examples:
    hubtools conv -i myTsvs/
    hubtools make hg38
    hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3
    hubtools tab hub.txt > tracks.tsv
    hubtools session

    tar xvfz SC_20230723_backup.tar.gz
    hubtools archive -i archive -o hub
    hubtools ct 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr7%3A155798978%2D155810671&hgsid=345826202_8d3Mpumjaw9IXYI2RDaSt5xMoUhH'

    For the "hubtools make" step, you can specify additional options for your tracks via tracks.json/.yaml or tracks.tsv:

    tracks.json can look like this, can have more keys, one per track, or the special key "hub":
    {
        "hub" :      { "hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons" },
        "myTrack" :  { "shortLabel" : "My nice track" }
    }
    tracks.tsv should look like this, other columns can be added:

        #track<tab>shortLabel
        myTrack<tab>My nice track

    For a list of all possible fields/columns see https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html:
    """)


    parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory")
    parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.")

    #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages")
    parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace")
    (options, args) = parser.parse_args()

    if len(args)==0:
        parser.print_help()
        exit(1)

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    return args, options

def errAbort(msg):
    " print and abort) "
    logging.error(msg)
    sys.exit(1)

def makedirs(d):
    if not isdir(d):
        os.makedirs(d)

def parseConf(fname):
    " parse a hg.conf style file, return as dict key -> value (all strings) "
    logging.debug("Parsing "+fname)
    conf = {}
    for line in open(fname):
        line = line.strip()
        if line.startswith("#"):
            continue
        elif line.startswith("include "):
            inclFname = line.split()[1]
            absFname = normpath(join(dirname(fname), inclFname))
            if os.path.isfile(absFname):
                inclDict = parseConf(absFname)
                conf.update(inclDict)
        elif "=" in line: # string search for "="
            key, value = line.split("=",1)
            conf[key] = value
    return conf

# cache of hg.conf contents
hgConf = None

def parseHgConf():
    """ return hg.conf as dict key:value. """
    global hgConf
    if hgConf is not None:
        return hgConf

    hgConf = dict() # python dict = hash table

    fname = os.path.expanduser("~/.hubtools.conf")
    if isfile(fname):
        hgConf = parseConf(fname)
    else:
        fname = os.path.expanduser("~/.hg.conf")
        if isfile(fname):
            hgConf = parseConf(fname)

def cfgOption(name, default=None):
    " return hg.conf option or default "
    global hgConf

    if not hgConf:
        parseHgConf()

    return hgConf.get(name, default)

def parseMetaRa(fname):
    """parse tracks.ra or tracks.txt and return as a dict of trackName -> dict of key->val """
    logging.debug("Reading %s as .ra" % fname)
    trackName = None
    stanzaData = {}
    ret = {}
    for line in open(fname):
        line = line.strip()
        if line.startswith("#"):
            continue
        if line=="":
            if len(stanzaData)==0:
                # double newline
                continue
            if trackName is None:
                errAbort("File %s has a stanza without a track name" % fname)
            if trackName in ret:
                errAbort("File %s contains two stanzas with the same track name '%s' " % trackName)

            ret[trackName] = stanzaData
            stanzaData = {}
            trackName = None
            continue

        key, val = line.split(" ", maxsplit=1)
        if key == "track":
            trackName = val
            continue
        else:
            stanzaData[key] = val

    if len(stanzaData)!=0:
        ret[trackName] = stanzaData
    logging.debug("Got %s from .ra" % str(ret))
    return ret

def parseMetaTsv(fname):
    " parse a tracks.tsv file and return as a dict of trackName -> dict of key->val "
    headers = None
    meta = {}
    logging.debug("Parsing track meta data from %s in tsv format" % fname)
    for line in open(fname):
        row = line.rstrip("\r\n").split("\t")
        if headers is None:
            assert(line.startswith("track\t") or line.startswith("#track"))
            row[0] = row[0].lstrip("#")
            headers = row
            continue

        assert(len(row)==len(headers))
        key = row[0]

        rowDict = {}
        for header, val in zip(headers[1:], row[1:]):
            rowDict[header] = val
        #row = {k:v for k,v in zip(headers, fs)}
        meta[key] = rowDict
    return meta

def parseMetaJson(fname):
    " parse a json file and merge it into meta and return "
    logging.debug("Reading %s as json" % fname)
    newMeta = json.load(open(fname))
    return newMeta

def parseMetaYaml(fname):
    " parse yaml file "
    import yaml # if this doesn't work, run 'pip install pyyaml'
    with open(fname) as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)

def parseMeta(inDir):
    " parse a tab-sep file with headers and return an ordered dict firstField -> dictionary "
    fname = join(inDir, "tracks.tsv")
    meta = OrderedDict()
    if isfile(fname):
        tsvMeta = parseMetaTsv(fname)
        meta = allMetaOverride(meta, tsvMeta)

    fname = join(inDir, "tracks.json")
    if isfile(fname):
        jsonMeta = parseMetaJson(fname)
        meta = allMetaOverride(meta, jsonMeta)

    fname = join(inDir, "tracks.ra")
    if isfile(fname):
        raMeta = parseMetaRa(fname)
        meta = allMetaOverride(meta, raMeta)

    fname = join(inDir, "tracks.yaml")
    if isfile(fname):
        yamlMeta = parseMetaYaml(fname)
        meta = allMetaOverride(meta, yamlMeta)

    logging.debug("Got meta from %s: %s" % (inDir, str(meta)))
    return meta

def writeHubGenome(ofh, db, inMeta):
    " create a hub.txt and genomes.txt file, hub.txt is just a template "
    meta = inMeta.get("hub", {})
    ofh.write("hub autoHub\n")
    ofh.write("shortLabel %s\n" % meta.get("shortLabel", "Auto-generated hub"))
    ofh.write("longLabel %s\n" % meta.get("longLabel", "Auto-generated hub"))
    #ofh.write("genomesFile genomes.txt\n")
    if "descriptionUrl" in meta:
        ofh.write("descriptionUrl %s\n" % meta["descriptionUrl"])
    ofh.write("email %s\n" % meta.get("email", "yourEmail@example.com"))
    ofh.write("useOneFile on\n\n")

    ofh.write("genome %s\n\n" % db)
    return ofh

def readSubdirs(inDir, subDirs):
    " given a list of dirs, find those that are composite dirs (not supporting supertracks for now) "
    compDicts, superDicts = {}, {}
    for subDir in subDirs:
        subPath = join(inDir, subDir)
        subSubDirs, subDict = readFnames(subPath)
        if len(subDict)==0:
            # no files in this dir
            continue
        if len(subSubDirs)==0:
            compDicts[subDir] = subDict
        #else:
            #superDicts[subDir] = subDict

    return compDicts, superDicts

def reorderDirs(compDirs, meta):
    " order the directories in compDirs in the order that they appear in the meta data -> will mean that composites have the right order "
    if len(meta)==0:
        logging.debug("Not reordering these subdirectories: %s" % compDirs.keys())
        return compDirs

    # first use the names in the meta data, and put in the right order
    newCompDirs = OrderedDict()
    for dirName in meta.keys():
        if dirName in compDirs:
            newCompDirs[dirName] = compDirs[dirName]

    # then add everything else at the end
    for dirName in compDirs:
        if dirName not in newCompDirs:
            newCompDirs[dirName] = compDirs[dirName]

    logging.debug("Reordered input directories based on meta data. New order is: %s" % newCompDirs.keys())
    return newCompDirs


def readDirs(inDir, meta):
    " recurse down into directories and return containerType -> parentName -> fileBase -> type -> list of absPath "
    ret = {}
    subDirs, topFiles = readFnames(inDir)
    ret["top"] = { None : topFiles } # top-level track files have None as the parent

    compDirs, superDirs = readSubdirs(inDir, subDirs)
    compDirs = reorderDirs(compDirs, meta)
    # superDirs not used yet, no time

    ret["comps"] = compDirs
    ret["supers"] = superDirs

    return ret

def readFnames(inDir):
    " return dict with basename -> fileType -> filePath "
    fnameDict = defaultdict(dict)
    #tdbDir = abspath(dirname(trackDbPath))
    subDirs = []
    for fname in os.listdir(inDir):
        filePath = join(inDir, fname)
        if isdir(filePath):
            subDirs.append(fname)
            continue
        baseName, ext = splitext(basename(fname))
        ext = ext.lower()

        # actually, use the part before the first dot, not the one before the extension, as the track name
        # this means that a.scaled.bigBed and a.raw.bigWig get paired correctly
        fileBase = basename(fname).split(".")[0]

        if ext==".bw" or ext==".bigwig":
            fileType = "bigWig"
        elif ext==".bb" or ext==".bigbed":
            fileType = "bigBed"
        else:
            logging.debug("file %s is not bigBed nor bigWig, skipping" % fname)
            continue

        absFname = abspath(filePath)
        #relFname = relFname(absFname, tdbDir)
        fnameDict[fileBase].setdefault(fileType, [])
        #fnameDict[baseName][fileType].setdefault([])
        fnameDict[fileBase][fileType].append(absFname)
    return subDirs, fnameDict

def mostFilesArePaired(fnameDict):
    " check if 80% of the tracks have a pair bigBed+bigWig"
    pairCount = 0
    for baseName, typeDict in fnameDict.items():
        if "bigBed" in typeDict and "bigWig" in typeDict:
            pairCount += 1

    pairShare = pairCount / len(fnameDict)
    return ( pairShare > 0.8 )

def writeLn(ofh, spaceCount, line):
    "write line to ofh, with spaceCount before it "
    ofh.write("".join([" "]*spaceCount))
    ofh.write(line)
    ofh.write("\n")

def writeStanza(ofh, indent, tdb):
    " write a stanza given a tdb key-val dict "
    track = tdb["track"]

    shortLabel = tdb.get("shortLabel", track.replace("_", " "))
    visibility = tdb.get("visibility", "pack")
    longLabel  = tdb.get("longLabel", shortLabel)

    if not "type" in tdb:
        errAbort("Track info has no type attribute: %s" % tdb)
    trackType = tdb["type"]

    writeLn(ofh, indent, "track %s" % track)
    writeLn(ofh, indent, "shortLabel %s" % shortLabel)
    if longLabel:
        writeLn(ofh, indent, "longLabel %s" % longLabel)
    if "parent" in tdb:
        writeLn(ofh, indent, "parent %s" % tdb["parent"])
    writeLn(ofh, indent, "type %s" % trackType)
    writeLn(ofh, indent, "visibility %s" % visibility)
    if "bigDataUrl" in tdb:
        writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"])

    for key, val in tdb.items():
        if key in ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"]:
            continue
        writeLn(ofh, indent, "%s %s" % (key, val))

    ofh.write("\n")

def metaOverride(tdb, meta):
    " override track info for one single track, from meta into tdb "
    trackName = tdb["track"]

    if trackName not in meta and "__" in trackName:
        logging.debug("Using only basename of track %s" % trackName)
        trackName = trackName.split("__")[1]

    if trackName not in meta:
        logging.debug("No meta info for track %s" % tdb["track"])
        return

    trackMeta = meta[trackName]

    for key, val in trackMeta.items():
        if val!="":
            tdb[key] = trackMeta[key]

def allMetaOverride(tdb, meta):
    " override track info for all tracks, from meta into tdb "
    if meta is None:
        return tdb

    for trackName in meta:
        trackMeta = meta[trackName]
        if trackName not in tdb:
            tdb[trackName] = {}

        trackTdb = tdb[trackName]

        for key, val in trackMeta.items():
            trackTdb[key] = val

    return tdb

def reorderTracks(fileDict, meta):
    " given an unsorted dictionary of files and ordered metadata, try to sort the files according to the metadata"
    if len(meta)==0:
        return fileDict # no meta data -> no ordering necessary

    trackOrder = []
    # meta is an OrderedDict, so the keys are also ordered
    for trackName in meta.keys():
        if "__" in trackName: # in composite mode, the tracknames contain the parent and the track type
            trackName = trackName.split("__")[1]
        trackOrder.append( trackName )

    trackOrder = list(meta.keys())

    newFiles = OrderedDict()
    doneTracks = set()
    # first add the tracks in the order of the meta data
    for trackBase in trackOrder:
        # the tsv file can have the track names either as basenames or as full tracknames
        if trackBase not in fileDict and "__" in trackBase:
            trackBase = trackBase.split("__")[1]

        if trackBase in fileDict:
            newFiles[trackBase] = fileDict[trackBase]
            doneTracks.add(trackBase)

    logging.debug("Track ordering from meta data used: %s" % newFiles.keys())

    # then add all other tracks
    for trackBase, fileData in fileDict.items():
        if trackBase not in doneTracks:
            newFiles[trackBase] = fileDict[trackBase]
            logging.debug("Not specified in meta, so adding at the end: %s" % trackBase)

    logging.debug("Final track order is: %s" % newFiles.keys())

    assert(len(newFiles)==len(fileDict))
    return newFiles

def writeTdb(inDir, dirDict, dirType, tdbDir, ofh):
    " given a dict with basename -> type -> filenames, write track entries to ofh "
    # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files
    # either this needs better comments or maybe a separate code path for this rare use case
    global compCount

    fnameDict = dirDict[dirType]

    for parentName, typeDict in fnameDict.items():
        if parentName is None: # top level tracks: use top tracks.tsv
            subDir = inDir
        else: # container tracks -> use tracks.tsv in the subdirectory
            subDir = join(inDir, parentName)

        parentMeta = parseMeta(subDir)

        indent = 0
        parentHasViews = False

        groupMeta = {}

        if dirType=="comps":
            tdb = {
                    "track" : parentName,
                    "shortLabel": parentName,
                    "visibility" : "dense",
                    "compositeTrack" : "on",
                    "autoScale" : "group",
                    "type" : "bed 4"
                  }
            metaOverride(tdb, parentMeta)
            groupMeta = parentMeta

            parentHasViews = mostFilesArePaired(typeDict)

            if parentHasViews:
                tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals"
                logging.info("Container track %s has >80%% of paired files, activating views" % parentName)

            writeStanza(ofh, indent, tdb)
            indent = 4

            if parentHasViews:
                # we have composites with paired files? -> write the track stanzas for the two views
                groupMeta = parseMeta(subDir)
                tdbViewPeaks = {
                        "track" : parentName+"ViewPeaks",
                        "shortLabel" : parentName+" Peaks",
                        "parent" : parentName,
                        "view" : "PK",
                        "visibility" : "dense",
                        "type" : "bigBed",
                        "scoreFilter" : "off",
                        "viewUi" : "on"
                        }
                metaOverride(tdbViewPeaks, parentMeta)
                writeStanza(ofh, indent, tdbViewPeaks)

                tdbViewSig = {
                        "track" : parentName+"ViewSignal",
                        "shortLabel" : parentName+" Signal",
                        "parent" : parentName,
                        "view" : "SIG",
                        "visibility" : "dense",
                        "type" : "bigWig",
                        "viewUi" : "on"
                        }
                metaOverride(tdbViewSig, parentMeta)
                writeStanza(ofh, indent, tdbViewSig)
        else:
            # no composites
            groupMeta = parseMeta(subDir)

        typeDict = reorderTracks(typeDict, groupMeta)

        for trackBase, typeFnames in typeDict.items():
            for fileType, absFnames in typeFnames.items():
                assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type
                absFname = absFnames[0]
                fileBase = basename(absFname)
                relFname = relpath(absFname, tdbDir)

                labelSuff = ""
                if parentHasViews:
                    if fileType=="bigWig":
                        labelSuff = " Signal"
                    elif fileType=="bigBed":
                        labelSuff = " Peaks"
                    else:
                        assert(False) # views and non-bigWig/Bed are not supported yet?

                if parentName is not None:
                    parentPrefix = parentName+"__"
                else:
                    parentPrefix = ""

                trackName = parentPrefix+trackBase+"__"+fileType
                tdb = {
                        "track"      :  trackName,
                        "shortLabel" :  trackBase+labelSuff,
                        "longLabel"  :  trackBase+labelSuff,
                        "visibility" :  "dense",
                        "type"       :  fileType,
                        "bigDataUrl" :  relFname,
                      }

                if parentName:
                    tdb["parent"] = parentName

                if parentHasViews:
                    onOff = "on"
                    if trackName in groupMeta and "visibility" in groupMeta[trackName]:
                        vis = groupMeta[trackName]["visibility"]
                        if vis=="hide":
                            onOff = "off"
                        del tdb["visibility"]

                    if fileType=="bigBed":
                        tdb["parent"] = parentName+"ViewPeaks"+" "+onOff
                        tdb["subGroups"] = "view=PK"
                    else:
                        tdb["parent"] = parentName+"ViewSignal"+" "+onOff
                        tdb["subGroups"] = "view=SIG"

                metaOverride(tdb, groupMeta)

                if trackName in groupMeta and "visibility" in groupMeta[trackName]:
                    del tdb["visibility"]

                writeStanza(ofh, indent, tdb)

def importJbrowse(baseUrl, db, outDir):
    " import an IGV trackList.json hierarchy "
    import requests
    outFn = join(outDir, "hub.txt")
    ofh = open(outFn, "w")
    writeHubGenome(ofh, db, {})

    trackListUrl = baseUrl+"/data/trackList.json"
    logging.info("Loading %s" % trackListUrl)
    trackList = requests.get(trackListUrl).json()
    tdbs = []
    for tl in trackList["tracks"]:
        if "type" in tl and tl["type"]=="SequenceTrack":
            logging.info("Genome is: "+tl["label"])
            continue
        if tl["storeClass"]=="JBrowse/Store/SeqFeature/NCList":
            logging.info("NCList found: ",tl)
            continue
        tdb = {}
        tdb["track"] = tl["label"]
        tdb["shortLabel"] = tl["key"]
        if "data_file" in tl:
            tdb["bigDataUrl"] = baseUrl+"/data/"+tl["data_file"]
        else:
            tdb["bigDataUrl"] = baseUrl+"/data/"+tl["urlTemplate"]
        if tl["storeClass"] == "JBrowse/Store/SeqFeature/BigWig":
            tdb["type"] = "bigWig"

        dispMode = tl.get("display_mode")
        if dispMode:
            if dispMode=="normal":
                tdb["visibility"] = "pack"
            elif dispMode=="compact":
                tdb["visibility"] = "dense"
            else:
                tdb["visibility"] = "pack"
        else:
            tdb["visibility"] = "pack"
    
        writeStanza(ofh, 0, tdb)

def installModule(package):
    " install a package "
    logging.info("Could not find Python module '%s', trying to install with pip" % package)
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

def cacheLoad(fname):
    " load file cache from json file "
    if not isfile(fname):
        logging.debug("No upload cache present")
        return {}

    logging.debug("Loading "+fname)
    return json.load(open(fname))

def cacheWrite(uploadCache, fname):
    logging.debug("Writing "+fname)
    json.dump(uploadCache, open(fname, "w"), indent=4)

def getFileType(fbase):
    " return the file type defined in the hubspace system, given a base file name "
    ret = "NA"
    for fileType, fileExts in fileTypeExtensions.items():
        for fileExt in fileExts:
            if fbase.endswith(fileExt):
                ret = fileType
                break
        if ret!="NA":
            break

    logging.debug("file type for %s is %s" % (fbase, ret))
    return ret

def uploadFiles(tdbDir, hubName):
    "upload files to hubspace. Server name and token can come from ~/.hubtools.conf "
    try:
        from tusclient import client
    except ModuleNotFoundError:
        installModule("tuspy")
        from tusclient import client

    serverUrl = cfgOption("tusUrl", "https://hubspace.gi.ucsc.edu/files")

    cookies = {}
    cookieNameUser = cfgOption("wiki.userNameCookie", "wikidb_mw1_UserName")
    cookieNameId = cfgOption("wiki.loggedInCookie", "wikidb_mw1_UserID")

    apiKey = cfgOption("apiKey")
    if apiKey is None:
        errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx').\n"
                "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to create a new  apiKey. Then run \n"
                "   echo 'apiKey=\"xxxx\"' >> ~/.hubtools.conf \n"
                "and run the 'hubtools up' command again.")

    logging.info(f"TUS server URL: {serverUrl}")
    my_client = client.TusClient(serverUrl)

    cacheFname = join(tdbDir, ".hubtools.files.json")
    uploadCache = cacheLoad(cacheFname)

    logging.debug("trackDb directory is %s" % tdbDir)
    for rootDir, _, files in os.walk(tdbDir):
        for fbase in files:
            if fbase.startswith("."):
                continue
            localPath = normpath(join(rootDir, fbase))
            logging.debug("localPath: %s" % localPath)
            localMtime = os.stat(localPath).st_mtime

            # skip files that have not changed their mtime since last upload
            if localPath in uploadCache:
                cacheMtime = uploadCache[localPath]["mtime"]
                if localMtime == cacheMtime:
                    logging.info("%s: file mtime unchanged, not uploading again" % localPath)
                    continue
                else:
                    logging.debug("file %s: mtime is %f, cache mtime is %f, need to re-upload" %
                            (localPath, localMtime, cacheMtime))
            else:
                logging.debug("file %s not in upload cache" % localPath)

            fileType = getFileType(fbase)

            fileAbsPath = abspath(localPath)
            remoteRelPath = relpath(fileAbsPath, tdbDir)
            remoteDir = dirname(remoteRelPath)

            meta = {
                    "apiKey" : apiKey,
                    "parentDir" : remoteDir,
                    "genome":"NA",
                    "fileName" : fbase,
                    "hubName" : hubName,
                    "hubtools" : "true",
                    "fileType": fileType,
                    "lastModified" : str(int(localMtime)*1000),
            }

            logging.info(f"Uploading {localPath}, meta {meta}")
            uploader = my_client.uploader(localPath, metadata=meta)
            uploader.upload()

            # note that this file was uploaded
            cache = {}
            mtime = os.stat(localPath).st_mtime
            cache["mtime"] = mtime
            cache["size"] = os.stat(localPath).st_size
            uploadCache[localPath] = cache

    cacheWrite(uploadCache, cacheFname)

def iterRaStanzas(fname):
    " parse an ra-style (trackDb) file and yield dictionaries "
    data = dict()
    logging.debug("Parsing %s in trackDb format" % fname)
    with open(fname, "rt") as ifh:
        for l in ifh:
            l = l.lstrip(" ").rstrip("\r\n")
            if len(l)==0:
                yield data
                data = dict()
            else:
                if " " not in l:
                    continue
                key, val = l.split(" ", maxsplit=1)
                data[key] = val

    if len(data)!=0:
        yield data

def raToTab(fname):
    " convert .ra file to .tsv "
    stanzas = []
    allFields = set()
    for stanza in iterRaStanzas(fname):
        if "hub" in stanza or "genome" in stanza:
            continue
        allFields.update(stanza.keys())
        stanzas.append(stanza)

    if "track" in allFields:
        allFields.remove("track")
    if "shortLabel" in allFields:
        allFields.remove("shortLabel")

    hasLongLabel = False
    if "longLabel" in allFields:
        allFields.remove("longLabel")
        hasLongLabel = True

    sortedFields = sorted(list(allFields))
    # make sure that track shortLabel and longLabel come first and always there, handy for manual edits
    if hasLongLabel:
        sortedFields.insert(0, "longLabel")
    sortedFields.insert(0, "shortLabel")
    sortedFields.insert(0, "track")

    ofh = sys.stdout
    ofh.write("#")
    ofh.write("\t".join(sortedFields))
    ofh.write("\n")

    for s in stanzas:
        row = []
        for fieldName in sortedFields:
            row.append(s.get(fieldName, ""))
        ofh.write("\t".join(row))
        ofh.write("\n")

def guessFieldDesc(fieldNames):
    "given a list of field names, try to guess to which fields in bed12 they correspond "
    logging.info("No field description specified, guessing fields from TSV headers: %s" % fieldNames)
    fieldDesc = {}
    skipFields = set()

    for fieldIdx, fieldName in enumerate(fieldNames):
        caseField = fieldName.lower()
        if caseField in ["chrom", "chromosome"]:
            name = "chrom"
        elif caseField.endswith(" id") or caseField.endswith("accession"):
            name = "name"
        elif caseField in ["start", "chromstart", "position"]:
            name = "start"
        elif caseField in ["Reference"]:
            name = "refAllele"
        elif caseField in ["strand"]:
            name = "strand"
        elif caseField in ["score"]:
            name = "score"
        else:
            continue
        fieldDesc[name] = fieldIdx
        skipFields.add(fieldIdx)
    logging.info("TSV <-> BED correspondance: %s" % fieldDesc)
    return fieldDesc, skipFields

def parseFieldDesc(fieldDescStr, fieldNames):
    " given a string chrom=1,start=2,end=3,... return dict {chrom:1,...} "
    if not fieldDescStr:
        return guessFieldDesc(fieldNames)

    fieldDesc, skipFields = guessFieldDesc(fieldNames)

    if fieldDescStr:
        for part in fieldDescStr.split(","):
            fieldName, fieldIdx = part.split("=")
            fieldIdx = int(fieldIdx)
            fieldDesc[fieldName] = fieldIdx
            skipFields.add(fieldIdx)

    return fieldDesc, skipFields

def makeBedRow(row, fieldDesc, skipFields, isOneBased):
    " given a row of a tsv file and a fieldDesc with BedFieldName -> field-index, return a bed12+ row with extra fields "
    bedRow = []

    # first construct the bed 12 fields
    for fieldName in ["chrom", "start", "end", "name", "score", "strand",
            "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "chromStarts"]:

        fieldIdx = fieldDesc.get(fieldName)
        if fieldIdx is not None:
            if fieldName=="start":
                chromStart = int(row[fieldDesc["start"]])
                if isOneBased:
                    chromStart = chromStart-1
                val = str(chromStart)
            elif fieldName=="end":
                chromEnd = int(val)
            else:
                val = row[fieldIdx]
        else:
            if fieldName=="end":
                chromEnd = chromStart+1
                val = str(chromEnd)
            elif fieldName=="score":
                val = "0"
            elif fieldName=="strand":
                val = "."
            elif fieldName=="thickStart":
                val = str(chromStart)
            elif fieldName=="thickEnd":
                val = str(chromEnd)
            elif fieldName=="itemRgb":
                val = "0,0,0"
            elif fieldName=="blockCount":
                val = "1"
            elif fieldName=="blockSizes":
                val = str(chromEnd-chromStart)
            elif fieldName=="chromStarts":
                #val = str(chromStart)
                val = "0"
            else:
                logging.error("Cannot find a field for %s" % fieldName)
                sys.exit(1)

        bedRow.append(val)

    # now handle all other fields
    for fieldIdx, val in enumerate(row):
        if not fieldIdx in skipFields:
            bedRow.append( row[fieldIdx] )

    return bedRow

def fetchChromSizes(db, outputFileName):
    " find on local disk or download a <db>.sizes text file "
    # Construct the URL based on the database name
    url = f"https://hgdownload.cse.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz"
    # Send a request to download the file
    response = requests.get(url, stream=True)
    # Check if the request was successful
    if response.status_code == 200:
        # Open the output gzip file for writing
        with gzip.open(outputFileName, 'wt') as outFile:
            # Open the response content as a gzip file in text mode
            with gzip.GzipFile(fileobj=response.raw, mode='r') as inFile:
                # Read the content using csv reader to handle tab-separated values
                reader = csv.reader(inFile, delimiter='\t')
                writer = csv.writer(outFile, delimiter='\t', lineterminator='\n')
                # Iterate through each row, and retain only the first two fields
                for row in reader:
                    writer.writerow(row[:2])  # Write only the first two fields
    else:
        raise Exception(f"Failed to download file from {url}, status code: {response.status_code}")
    logging.info("Downloaded %s to %s" % (db, outputFileName))

def getChromSizesFname(db):
    " return fname of chrom sizes, download into ~/.local/ucscData/ if not found "
    fname = "/hive/data/genomes/%s/chrom.sizes" % db
    if isfile(fname):
        return fname

    dataDir = "~/.local/hubtools"
    fname = join(dataDir, "%s.sizes" % db)
    fname = os.path.expanduser(fname)
    if not isfile(fname):
        makedirs(dataDir)
        fetchChromSizes(db, fname)
    return fname

def convTsv(db, tsvFname, outBedFname, outAsFname, outBbFname):
    " convert tsv files in inDir to outDir, assume that they all have one column for chrom, start and end. Try to guess these or fail. "
    #tsvFname, outBedFname, outAsFname = args

    # join and output merged bed
    bigCols = set() # col names of columns with > 255 chars

    #bedFh = open(bedFname, "w")
    unsortedBedFh = tempfile.NamedTemporaryFile(suffix=".bed", dir=dirname(outBedFname), mode="wt")
    fieldNames = None
    #isOneBased = options.oneBased
    isOneBased = True
    bedFieldsDesc = None # in the future, the user may want to input a string like name=0,start=1, but switch this off for now

    for line in open(tsvFname):
        row = line.rstrip("\r\n").split("\t")
        if fieldNames is None:
            fieldNames = row
            fieldDesc, notExtraFields = parseFieldDesc(bedFieldsDesc, fieldNames)
            continue

        # note fields with data > 255 chars.
        for colName, colData in zip(fieldNames, row):
            if len(colData)>255:
                bigCols.add(colName)

        bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased)

        chrom = bedRow[0]
        if chrom.isdigit() or chrom in ["X", "Y"]:
            bedRow[0] = "chr"+bedRow[0]

        unsortedBedFh.write( ("\t".join(bedRow)))
        unsortedBedFh.write("\n")
    unsortedBedFh.flush()

    cmd = "sort -k1,1 -k2,2n %s > %s" % (unsortedBedFh.name, outBedFname)
    assert(os.system(cmd)==0)
    unsortedBedFh.close() # removes the temp file

    # generate autosql
    # BED fields
    #bedColCount = int(options.type.replace("bed", "").replace("+" , ""))
    bedColCount = 12
    asFh = open(outAsFname, "w")
    asFh.write(asHead)
    asFh.write("\n".join(asLines[:bedColCount+1]))
    asFh.write("\n")

    # extra fields
    #fieldNames = fieldNames[bedColCount:]
    for fieldIdx, field in enumerate(fieldNames):
        if fieldIdx in notExtraFields:
            continue
        name = field.replace(" ","")
        name = field.replace("%","perc_")
        name = re.sub("[^a-zA-Z0-9]", "", name)
        name = name[0].lower()+name[1:]
        fType = "string"
        if field in bigCols:
            fType = "lstring"
        asFh.write('      %s %s; "%s" \n' % (fType, name, field))
    asFh.write(")\n")
    asFh.close()

    chromSizesFname = getChromSizesFname(db)

    cmd = ["bedToBigBed", outBedFname, chromSizesFname, outBbFname, "-tab", "-type=bed%d+" % bedColCount, "-as=%s" % outAsFname]
    subprocess.check_call(cmd)

def convTsvDir(inDir, db, outDir):
    " find tsv files under inDir and convert them all to .bb "
    ext = "tsv"
    pat = join(inDir, '**/*.'+ext)
    logging.debug("Finding files under %s (%s), writing output to %s" % (inDir, pat, outDir))

    for fname in glob.glob(pat, recursive=True):
        logging.debug("Found %s" % fname)
        absFname = abspath(fname)
        relFname = relpath(absFname, inDir)
        outPath = Path(join(outDir, relFname))

        if not outPath.parents[0].is_dir():
            logging.debug("mkdir -p %s" % outPath.parents[0])
            makedirs(outPath.parents[0])

        bedFname = outPath.with_suffix(".bed")
        asFname = outPath.with_suffix(".as")
        bbFname = outPath.with_suffix(".bb")

        convTsv(db, fname, bedFname, asFname, bbFname)

def parseTrackLine(s):
    " Use shlex to split the string respecting quotes, written by chatGPT "
    lexer = shlex.shlex(s, posix=True)
    lexer.whitespace_split = True
    lexer.wordchars += '='  # Allow '=' as part of words

    tokens = list(lexer)

    # Convert the tokens into a dictionary
    it = iter(tokens)
    result = {}
    for token in it:
        if '=' in token:
            key, value = token.split('=', 1)
            result[key] = value.strip('"')  # Remove surrounding quotes if present
        else:
            result[token] = next(it).strip('"')  # Handle cases like name="...".

    return result

def readTrackLines(fnames):
    " read the first line and convert to a dict of all fnames. "
    logging.debug("Reading track lines from %s" % fnames)
    ret = {}
    for fn in fnames:
        line1 = open(fn).readline().rstrip("\n")
        notTrack = line1.replace("track ", "", 1)
        tdb = parseTrackLine(notTrack)
        ret[fn] = tdb
    return ret

def stripFirstLine(inputFilename, outputFilename):
    " written by chatGpt: copies all lines to output, except the first line "
    with open(inputFilename, 'r') as infile, open(outputFilename, 'w') as outfile:
        # Skip the first line
        next(infile)
        # Write the rest of the lines to the output file
        for line in infile:
            outfile.write(line)

def bedToBigBed(inFname, db, outFname):
    " convert bed to bigbed file, handles chrom.sizes download "
    chromSizesFname = getChromSizesFname(db)
    cmd = ["bedToBigBed", inFname, chromSizesFname, outFname]
    logging.debug("Running %s" % cmd)
    logging.info(f'Converting {inFname} to {outFname}. (chromSizes: {chromSizesFname}')
    subprocess.check_call(cmd)

def convCtDb(db, inDir, outDir):
    " convert one db part of a track archive to an output directory "
    findGlob = join(inDir, "*.ct")
    inFnames = glob.glob(findGlob)
    if len(inFnames)==0:
        logging.info("No *.ct files found in %s" % findGlob)
        return

    tdbData = readTrackLines(inFnames)

    makedirs(outDir)
    hubTxtFname = join(outDir, "hub.txt")
    ofh = open(hubTxtFname, "wt")

    meta = parseMeta(inDir)
    writeHubGenome(ofh, db, meta)

    tdbIdx = 0
    for fname, tdb in tdbData.items():
        tdbIdx += 1
        # custom track names can include spaces, spec characters, etc. Strip all those
        # append a number to make sure that the result is unique
        track = tdb["name"]
        track = re.sub('[^A-Za-z0-9]+', '', track)+"_"+str(tdbIdx)
        tdb["track"] = track

        tdb["shortLabel"] = tdb["name"]
        del tdb["name"]
        tdb["longLabel"] = tdb["description"]
        del tdb["description"]

        if "bigDataUrl" not in tdb:
            bedFname = join(outDir, tdb["track"]+".bed")
            bbFname = join(outDir, tdb["track"]+".bb")
            stripFirstLine(fname, bedFname)
            bedToBigBed(bedFname, db, bbFname)
            os.remove(bedFname)
            tdb["bigDataUrl"] = tdb["track"]+".bb"
            tdb["type"] = "bigBed"

        writeStanza(ofh, 0, tdb)

    logging.info("Wrote %s" % hubTxtFname)
    ofh.close()

def convArchDir(inDir, outDir):
    " convert a directory created from the .tar.gz file downloaded via our track archive feature "
    dbContent = os.listdir(inDir)
    dbDirs = []
    for db in dbContent:
        subDir = join(inDir, db)
        if isdir(subDir):
            dbDirs.append((db, subDir))

    if len(dbDirs)==0:
        errAbort("No directories found under %s. Extract the tarfile and point this program at the 'archive' directory." % inDir)

    for db, inSubDir in dbDirs:
        logging.debug("Processing %s, db=%s" % (inSubDir, db))
        outSubDir = join(outDir, db)
        convCtDb(db, inSubDir, outSubDir)

def hgsidFromUrl(url):
    " return hgsid given a URL. written by chatGPT "
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)
    server_name = f"{parsed_url.scheme}://{parsed_url.netloc}"
    query_params = urllib.parse.parse_qs(parsed_url.query)
    hgsid = query_params.get('hgsid')
    # Return the hgsid value if it exists
    if hgsid:
        return server_name, hgsid[0]  # Extracting the first value from the list
    errAbort("hgsid parameter not found in "+url)

def convCtUrl(url, outDir):
    " given an hgTracks URL with an hgsid on it, get all custom track lines and create a hub file for it. Does not download custom tracks. "
    serverUrl, hgsid = hgsidFromUrl(url)
    cartDumpUrl = serverUrl+"/cgi-bin/cartDump?hgsid="+hgsid


def hubtools(args, options):
    """ find files under dirName and create a trackDb.txt for them"""

    cmd = args[0]

    inDir = "."
    if options.inDir:
        inDir = options.inDir

    outDir = "."
    if options.outDir:
        outDir = options.outDir

    if cmd=="up":
        if len(args)<2:
            errAbort("The 'up' command requires one argument, the name of the hub. You can specify any name, "
                "ideally a short, meaningful string, e.g. atacseq, muscle-rna or yamamoto2022. Avoid special characters.")
        hubName = args[1]
        uploadFiles(inDir, hubName)
        return

    tdbDir = inDir
    if options.outDir:
        tdbDir = options.outDir

    if cmd=="jbrowse":
        importJbrowse(args[1], args[2], tdbDir)
    elif cmd == "tab":
        raToTab(args[1])
    elif cmd == "make":
        db = args[1]

        meta = parseMeta(inDir)
        dirFiles = readDirs(inDir, meta)

        hubFname = join(tdbDir, "hub.txt")
        logging.info("Writing %s" % hubFname)
        ofh = open(hubFname, "w")

        meta = parseMeta(inDir)
        writeHubGenome(ofh, db, meta)

        writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
        writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)

        ofh.close()

    elif cmd=="conv":
        db = args[1]

        convTsvDir(inDir, db, outDir)

    elif cmd=="archive":
        convArchDir(inDir, outDir)

    elif cmd=="ct":
        url = args[1]
        convCtUrl(url, outDir)

    else:
        logging.error("Unknown command: '%s'" % args[1])


# ----------- main --------------
def main():
    args, options = parseArgs()

    hubtools(args, options)


main()
