#!/usr/bin/awk -f
# aarToAxt [aar]
#
# Convert Aligned Ancient Repeat (AAR) format file (which is AXT-like), to an
# AXT file.
#
# Format:
#   - aar file: a series of multi-line records, each record seperated by
#     a blank line
#   - record: an AXT alignment, followed by one or more AAR alignments.
#   - AXT alignment: 3 lines:
#        - header
#        - two alignment lines
#   - AAR alignment: 4 lines:
#        - repeat info
#        - target and query coordinates
#        - two alignment lines
# 
#   
BEGIN {
    inAarAlign = 0;
    aarCount = 0;
}

# Record seperator
/^$/ {
    inAarAlign = 0;
    next;  # don't check any more states
}

# AXT alignment part of record
!inAarAlign {
    # First AXT line, save sequence names
    tName = $2;
    qName = $5;
    qStrand = $8;

    # skip two alignment lines
    getline;
    getline;

    # Move on two aar
    getline;
    inAarAlign = 1;
}

# AAR part of record
inAarAlign {
    aarCount++;

    # Don't care about 1st line

    # 2nd line has coords
    getline;
    tStart = $1;
    tEnd = $2;
    qStart = $3;
    qEnd = $4;

    # 3rd and 4th are alignment
    getline;
    tAlign = $0;
    getline;
    qAlign = $0;

    # Validate that the alignment sequences are the same length
    if (length(tAlign) != length(qAlign)) {
        printf "Error: alignment sequences are different lengths for %s:%d-%d\n",
            tName, tStart, tEnd  > "/dev/stderr";
        exit(1);
    }

    # Check if target sequence length matches range
    tLen = (tEnd - tStart) + 1;
    tSeq = tAlign;
    gsub("-+", "", tSeq);
    if (length(tSeq) != tLen) {
        printf "Error: target sequence length does not match target range length %s:%d-%d\n",
            tName, tStart, tEnd  > "/dev/stderr";
        exit(1);
    }

    # Check if query sequence length matches range
    # There is a bug in the AAR program that causes qStart to be
    # incorrect.  This can be detected and corrected by checking 
    # the length of the sequence.  Once this is fixed, change this
    # to an error check.
    qLen = (qEnd - qStart) + 1;
    qSeq = qAlign;
    gsub("-+", "", qSeq);
    if (length(qSeq) != qLen) {
        printf "Warning: query sequence length does not match query range length for\n" > "/dev/stderr";
        printf "  %s:%d-%d, adjusting size to compensate for known bug\n",
            qName, qStart, qEnd > "/dev/stderr";
        qStart = qStart+(qLen-length(qSeq));
    }
    

    # output the AXT record
    print aarCount,tName,tStart,tEnd,qName,qStart,qEnd,qStrand,0;
    print tAlign;
    print qAlign;
    print ""; # end-of-record
}
