#!/bin/csh
exit;

# start with Mercator data from Colin Dewey, and copy it to working directory: 
#    /cluster/data/encode/ortho3/mercatorInput/encodeMaps20050304.tar.gz
#
#-----Original Message-----
#From: Colin Dewey [mailto:cdewey@eecs.berkeley.edu] 
#Sent: Friday, March 04, 2005 1:09 PM
#To: Daryl Thomas
#Subject: Re: hg17 ENCODE region coordinates
#
#Hi Daryl,
#
#Thanks for the coordinates.  I should have realized that I could grab 
#these from the table browser, sorry!  The table browser is very nice 
#now... I was happily surprised that I could do table joins!
#
#Here are our new orthology mappings.  Everything is in the same form as 
#last time, so hopefully there will be no problems.  But do let me know 
#if you run into anything.
#
#Also, it seems that our group has yet to get alignments to you in MAF 
#format.  So I've decided to take on this task.  When I'm done, perhaps 
#I can send you our alignments from the last freeze just to make sure 
#that I have the formats right?
#
#Thanks,
#
#Colin

# ${HOME}/kent/src/hg/lib/encodeRegionMercator.sql
#create table encodeRegionsMercator (
#    bin        smallint(5)  unsigned not null,
#    chrom      varchar(255)          not null,
#    chromStart int(10)      unsigned not null,
#    chromEnd   int(10)      unsigned not null,
#    name       varchar(255)          not null,
#    score      int(10)      unsigned not null,
#    strand     char(1)               not null
#)

###
::::::::::::::
mercatorInput/makeBed5.pl
::::::::::::::
#!/usr/bin/perl -W

while (<>) {
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN\D\d\d\d)\.(\d+)_of.*/) {
        printf("$1\t$2\t$3\t$4\t$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###

set dir = /cluster/data/encode/ortho3/mercatorInput
mkdir -p $dir
cd $dir

tar xfz encodeMaps20050304.tar.gz
chmod g+w $dir/*
foreach db (bosTau1 canFam1 danRer2 galGal2 mm5 monDom1 panTro1 rn3 tetNig1)
    echo ========= $db ==========
    rm -f $db.log $db.unmerged.bed5 $db.original.bed
    mv -f $db.bed $db.original.bed
    makeBed5.pl < $db.original.bed > $db.unmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 $db.unmerged.bed5 $db.bed5 >& $db.log
    hgLoadBed -tab -sqlTable=${HOME}/kent/src/hg/lib/encodeRegionsMercator.sql $db encodeRegionsMercator $db.original.bed >>& $db.log
    hgLoadBed -tab $db encodeRegionsMercatorMerged $db.bed5 >>& $db.log
end
rm -f bed.tab

########################################################################
#############  liftOver orthologous regions  ###########################
########################################################################
set dir = /cluster/data/encode/ortho3/liftOverRegions
mkdir -p $dir
cd $dir
hgsql hg17 -e "SELECT * FROM encodeRegions ORDER BY name" | tail +2 > hg17.bed
hgsql -h genome-testdb hgcentraltest -e "select toDb, path from liftOverChain " | grep /gbdb/hg17 | grep -v hg16 > hg17.over.chain.files

########################################################################
# CHIMP
    # THESE NOTES ARE FROM ORTHO2
    # used chains from reciprocal best net -- these were more
    # chopped up, but regions in these don't span unbridged gaps
    # minMatch .7 leaves ENr324 and ENm011 unmapped
    # Lowered match threshold until region coverage looked good (to .4)
    # (except m11, for some reason),  with minimum of fragmentation
    # Same at .3 and .2 -- at the point where regions look reasonable,
    # lowering the threshold in this way doesn't change them.
    # NOTE: reducing minSizeQ to 10000, due to fragmented panTro1 assembly
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=10000 -multiple -chainTable=hg17.chainPanTro1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain \
    panTro1.unmerged.bed5 panTro1.unmapped.bed -verbose=2 >& panTro1.log
liftOverMerge -mergeGap=20000 -verbose=2 panTro1.unmerged.bed5 panTro1.bed5 >>& panTro1.log
wc -l panTro1.*bed*
    #     81 panTro1.bed5
    #      0 panTro1.unmapped.bed
    #    107 panTro1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' panTro1.bed5 > panTro1.bed
hgLoadBed -noBin panTro1 encodeRegionsLiftOver panTro1.bed >>& panTro1.log

########################################################################
# RAT
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=20000 -multiple -chainTable=hg17.chainRn3 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain \
    rn3.unmerged.bed5 rn3.unmapped.bed -verbose=2 >& rn3.log
liftOverMerge -mergeGap=20000 -verbose=2 rn3.unmerged.bed5 rn3.bed5 >>& rn3.log
wc -l rn3.*bed*
    #     53 rn3.bed5
    #      0 rn3.unmapped.bed
    #     65 rn3.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' rn3.bed5 > rn3.bed
hgLoadBed -noBin rn3 encodeRegionsLiftOver rn3.bed >>& rn3.log

########################################################################
# CHICKEN
    # allow smaller chain sizes in query (only 1K, vs. 20K in close species)
liftOver -minMatch=.01 minSizeT=4000 \
    -multiple -minSizeQ=1000 -chainTable=hg17.chainGalGal2 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain \
    galGal2.unmerged.bed5 galGal2.unmapped.bed -verbose=2 >& galGal2.log
liftOverMerge -mergeGap=20000 -verbose=2 galGal2.unmerged.bed5 galGal2.bed5 >>& galGal2.log
wc -l galGal2.*bed*
    #     94 galGal2.bed5
    #      4 galGal2.unmapped.bed
    #    131 galGal2.unmerged.bed5
    ##Partially deleted in new
    #chr13   29418015        29918015        ENr111
    #chr5    141880150       142380150       ENr212
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' galGal2.bed5 > galGal2.bed
hgLoadBed -noBin galGal2 encodeRegionsLiftOver galGal2.bed >>& galGal2.log

########################################################################
# MOUSE MM5
liftOver -minMatch=.01 -minSizeT=4000 -minSizeQ=20000 -multiple \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain \
    mm5.unmerged.bed5 mm5.unmapped.bed -verbose=2 >& mm5.log
liftOverMerge -mergeGap=20000 -verbose=2 mm5.unmerged.bed5 mm5.bed5 >>& mm5.log
wc -l mm5.*bed*
    #     54 mm5.bed5
    #      0 mm5.unmapped.bed
    #     60 mm5.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' mm5.bed5 > mm5.bed
hgLoadBed -noBin mm5 encodeRegionsLiftOver mm5.bed >>& mm5.log

########################################################################
# DOG
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=20000 -multiple -chainTable=hg17.chainCanFam1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain \
    canFam1.unmerged.bed5 canFam1.unmapped.bed -verbose=2 >& canFam1.log
liftOverMerge -mergeGap=20000 -verbose=2 canFam1.unmerged.bed5 canFam1.bed5 >>& canFam1.log
wc -l canFam1.*bed*
    #     51 canFam1.bed5
    #      0 canFam1.unmapped.bed
    #     58 canFam1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' canFam1.bed5 > canFam1.bed
hgLoadBed -noBin canFam1 encodeRegionsLiftOver canFam1.bed >>& canFam1.log

########################################################################
# OPOSSUM
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=5000 -multiple -chainTable=hg17.chainMonDom1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain \
    monDom1.unmerged.bed5 monDom1.unmapped.bed -verbose=2 >& monDom1.log
liftOverMerge -mergeGap=20000 -verbose=2 monDom1.unmerged.bed5 monDom1.bed5 >>& monDom1.log
wc -l monDom1.*bed*
    #    143 monDom1.bed5
    #      0 monDom1.unmapped.bed
    #    191 monDom1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' monDom1.bed5 > monDom1.bed
hgLoadBed -noBin monDom1 encodeRegionsLiftOver monDom1.bed >>& monDom1.log

########################################################################
# TETRA
rm -f tetNig1.log tetNig1.*bed*
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=1000 -multiple -chainTable=hg17.chainTetNig1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain \
    tetNig1.unmerged.bed5 tetNig1.unmapped.bed -verbose=2 >& tetNig1.log
liftOverMerge -mergeGap=20000 -verbose=2 tetNig1.unmerged.bed5 tetNig1.bed5 >>& tetNig1.log
wc -l tetNig1.*bed*
    #    197 tetNig1.bed5
    #     12 tetNig1.unmapped.bed
    #    225 tetNig1.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' tetNig1.bed5 > tetNig1.bed
hgLoadBed -noBin tetNig1 encodeRegionsLiftOver tetNig1.bed >>& tetNig1.log

########################################################################
# ZEBRAFISH
rm -f danRer2.log danRer2.*bed*
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=1000 -multiple -chainTable=hg17.chainDanRer2 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain \
    danRer2.unmerged.bed5 danRer2.unmapped.bed -verbose=2 >& danRer2.log
liftOverMerge -mergeGap=20000 -verbose=2 danRer2.unmerged.bed5 danRer2.bed5 >>& danRer2.log
wc -l danRer2.*bed*
    #    238 danRer2.bed5
    #      2 danRer2.unmapped.bed
    #    278 danRer2.unmerged.bed5
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' danRer2.bed5 > danRer2.bed
hgLoadBed -noBin danRer2 encodeRegionsLiftOver danRer2.bed >>& danRer2.log

########################################################################
# COW
liftOver -minMatch=.01 -minSizeT=4000 \
    -minSizeQ=20000 -multiple -chainTable=hg17.chainBosTau1 \
    hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain \
                    /gbdb/hg17/liftOver/hg17ToBosTau1.over.chain
    bosTau1.unmerged.bed5 bosTau1.unmapped.bed -verbose=2 >& bosTau1.log
liftOverMerge -mergeGap=20000 -verbose=2 bosTau1.unmerged.bed5 bosTau1.bed5 >>& bosTau1.log
wc -l bosTau1.*bed*
    #
awk '{printf "%s\t%s\t%s\t%s_%s\n", $1, $2, $3, $4, $5}' bosTau1.bed5 > bosTau1.bed
hgLoadBed -noBin bosTau1 encodeRegionsLiftOver bosTau1.bed >>& bosTau1.log

rm -f bed.tab


########################################################################
#############       Create consensus regions     #######################
#############  (Union of liftOver and Mercator)  #######################
########################################################################

###
::::::::::::::
consensus/makeBed5.pl
::::::::::::::
#!/usr/bin/perl -W
#assumes that input is sorted by name (region)
$region="";
$regionCounter=0;

while (<>) {
    chomp();
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d)/)
    {
        if ($region eq $4) { $regionCounter++; } else { $region=$4; $regionCounter=1; }
        printf("$1\t$2\t$3\t$4\t$regionCounter\n");
    }
    elsif (/^(.*)\s+(\d+)\s+(\d+)\s+(MEN[mr]\d\d\d)/)
    {
    # regions with names that begin with 'M' have been manually marked for 
    # removal and can be ignored.
    next;
    }
    else
    {
        die "Bad input line: $_";
    }
}
###

# create consensus regions and load them
set dir = /cluster/data/encode/ortho3/consensus
mkdir -p $dir
cd $dir

foreach db (panTro1 rn3 galGal2 mm5 canFam1 monDom1 tetNig1 danRer2)
    rm -f $db.log $db.*.bed* $db.encodeRegionsConsensus*
    #           db table                  db table                        out file                               err file
    regionOrtho $db.encodeRegionsLiftOver $db.encodeRegionsMercatorMerged $db.encodeRegionsConsensusUnmerged.bed $db.regionOrtho.err >& $db.log
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensusUnmerged.bed | makeBed5.pl > $db.encodeRegionsConsensusUnmerged.bed5
    liftOverMerge -mergeGap=20000 -verbose=2 $db.encodeRegionsConsensusUnmerged.bed5 $db.encodeRegionsConsensus1.bed5 >>& $db.log
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' $db.encodeRegionsConsensus1.bed5 > $db.encodeRegionsConsensus1.bed
    hgLoadBed $db encodeRegionsConsensus -tab $db.encodeRegionsConsensus1.bed >>& $db.log
    # remove the regions that are smaller than 2 kb and update the local file from the table
    hgsql $db -e "select * from encodeRegionsConsensus where chromEnd-chromStart<2000" > $db.smallRegions.bed
    hgsql $db -e "delete from encodeRegionsConsensus where chromEnd-chromStart<2000"
    hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" > $db.encodeRegionsConsensus2.bed
    sort -k4 -k1.4n -k2n $db.encodeRegionsConsensus2.bed | makeBed5.pl > $db.encodeRegionsConsensus.bed5
    awk '{printf "%s\t%d\t%d\t%s_%s\n", $1, $2, $3, $4, $5}' $db.encodeRegionsConsensus.bed5 > $db.encodeRegionsConsensus.bed
    hgLoadBed $db encodeRegionsConsensus -tab $db.encodeRegionsConsensus.bed >>& $db.log
end


########################################################################
#############  Make html frames pages for analysis  ####################
########################################################################

set dir = /cluster/data/encode/ortho3/html
mkdir -p $dir
cd $dir

rm -f positionFile
hgsql -e "select chrom, chromStart+1, chromEnd, name from encodeRegions order by name" hg17 | tail +2 | awk '{printf "%s:%d-%d\t%s\n",$1,$2,$3,$4}' > positionFile

cp -f ${HOME}/kent/src/hg/encode/regionOrtho/mkOrthologAllFrame.pl /cluster/bin/scripts/mkOrthologAllFrame.pl
chmod ug+x /cluster/bin/scripts/mkOrthologAllFrame.pl

# 20-30 seconds each - lots of nib lookups
foreach db (panTro1 rn3 galGal2 mm5 canFam1 monDom1 tetNig1 danRer2)
    echo `date` $db
    rm -f $db.html
    mkOrthologAllFrame.pl descriptionFile positionFile headerFile hg17 \
	$db encodeRegionsConsensus encodeRegionsLiftOver encodeRegionsMercator \
	> $db.html
end

set outDir = /usr/local/apache/htdocs/encode/ortho3
cp -f *.html $outDir
chmod o+r $outDir/*.html

## prune excessively large mercator regions

set dir = /cluster/data/encode/ortho3/mercatorPrune
mkdir -p $dir
cd $dir
rm -f updateDatabase.sql ; grep -P "\d" *Regions.bed | awk '{printf "%s\t%d\t%d\t%s\t%d\n",$1,$2,$3,$4,$3-$2}' | updateDatabase.pl > updateDatabase.sql
hgsql hg17 < updateDatabase.sql

## recompute consensus, and remake html files by copying the and executing code from those sections above


########################################################################
###############  Write AGP files from consensus regions  ###############
########################################################################

#!/bin/csh -f
# "Make doc" for creating AGP files for ENCODE ortholog regions based on
#  coordinates in bed files in the "consensus" directory
## based on makeAgp2.doc

rm -f *.contig.tab */*.packing.list >& /dev/null

# TODO: contigAcc files should go in $outDir (neater this way)

set orthoDir     = /cluster/data/encode/ortho3
set agpDir       = $orthoDir/agp
set consensusDir = $orthoDir/consensus
set downloadDir  = /usr/local/apache/htdocs/encode/ortho3/agp
set regionAgp    = regionAgp
mkdir -p $agpDir $downloadDir
chmod o+rx $downloadDir
rm -rf $agpDir/* $downloadDir/*
cd $agpDir

###
::::::::::::::
agp/makeBed5.pl
::::::::::::::
#!/usr/bin/perl -W
#assumes that input is already numbered
while (<>) {
    chomp();
    if (/chromStart/) {
	continue;
    }
    if (/^(.*)\s+(\d+)\s+(\d+)\s+(EN[mr]\d\d\d).(\d+)/) {
        printf("$1\t$2\t$3\t$4_$5\n");
    } else {
        die "Bad input line: $_";
    }
}
###

########################################################################
# create script to run AGP-maker on an assembly
rm -rf genomeAgps.csh >& /dev/null
cat > genomeAgps.csh << 'EOF'
# create AGP's for a genome assembly
# NOTE: should have freeze dir (ortho3) as an arg
# WARNING: this script uses $consensusDir/*.bed5, but instructions
# below create a separate version, from the encodeRegionsConsensus table
# Suggest next time using just the $consensusDir everywhere

if ($#argv != 3) then
    echo "usage: $0 <db> <org> <outdir>"
    exit 1
endif
echo
echo command line: $0 $1 $2 $3
echo working dir:  `pwd`

set consensusDir = /cluster/data/encode/ortho3/consensus
set db           = $1
set org          = $2
set outDir       = $3
set buildDir     = /cluster/data/$db
set bedFile      = $consensusDir/$db.encodeRegionsConsensus.bed5
set regionAgp    = regionAgp
mkdir -p $outDir

if (-f $org.contig.tab) then
    set contigArg = "-contigFile=$org.contig.tab"
else
    set contigArg = ""
endif

set runRegionAgp = "$regionAgp $contigArg -namePrefix=${org}_ $bedFile stdin -dir $outDir"

if (-f $buildDir/chrom.lst) then
    cat $buildDir/chrom.lst | \
    xargs -iX cat $buildDir/X/chr{X,X_random}.agp | $runRegionAgp
else
    cat $buildDir/?{,?}/*.agp | $runRegionAgp
endif
'EOF'
# << this line makes emacs coloring happy

########################################################################
# create test scripts

cd /cluster/data/encode/ortho3/agp/tests
cat > getSeqFromAcc.pl <<EOF
#!/usr/bin/perl -W
# download contig sequences from Genbank to $db.contig.fa directory
$URL = "http://www.ncbi.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&id=";
$db=shift;
$dir = "/cluster/data/encode/ortho3/agp/tests/$db.contig.fa";
system("mkdir -p $dir");
while (<>)
{
    chomp();
    system("wget -O $dir/$_ \"$URL$_\"");
}
EOF
#
chmod ug+x getSeqFromAcc.pl

cd /cluster/data/encode/ortho3/agp/tests
cat > testFa.csh << 'EOF'
set db = $1
set org = $2
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/ortho3/consensus/$db.bed`
    set chr = $coords:r:r
    nibFrag /cluster/data/$db/nib/${chr}.nib \
        $coords:r:e $coords:e + $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << for emacs

########################################################################

# MOUSE AGP'S (DONE 2005-03-17 kate)
#!/bin/csh -fe

cd /cluster/data/encode/ortho3/agp
set db  = mm5
set org = mouse

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 56

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test 
cd tests
rm -fr $db.test $db.nibTest 
mkdir $db.test $db.nibTest
# note: tab required in grep (retype when pasting)
# watch out for hard-coded freeze names in scripts
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
# download contigs -- not needed if already run
# NOTE: preferable to extract these from build dir
./getSeqFromAcc.pl $db < $db.accessions >& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa 
faSplit byname $db.fa $db.test/
csh testFa.csh $db $org ortho3 >&! testFa.$org.log &
head testFa.$org.log
# verify that all regions are the same
grep -v same testFa.$org.log
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $encodeRegionPackingList $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Mus musculus" 10090  C57BL/6J MAY-2004 $db "NCBI Build 33" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 56

# Copy  to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/$db.*
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 56

######################################################################a
# RAT AGP'S (DONE 2005-03-16 kate)

cd /cluster/data/encode/ortho3/agp
set db  = rn3
set org = rat
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

#create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5

# Get contig to accession map
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test
cd tests
mkdir -p $db.test $db.nibTest
# note: tab required in grep (retype when pasting)
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
# download contig sequences to $db.contig.fa directory
# note: this doesn't handle errors well -- better to extract from 
#  build dir contig fasta files
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log
# verify all contigs were transferred
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
#./accHeader.pl $db.contig.fa $db >>& $db.log
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ortho3 >&! testFa.$org.log
grep -v same testFa.$org.log
cd ..

# Create packing list
$encodeRegionPackingList $db/$db.encodeRegionsConsensus.bed5 $db $org "Rattus norvegicus" 10116 BN/SsNHsdMCW JUN-2003 $db "Baylor HGSC v3.1" > $db/$db.packing.list

# Copy  to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
chmod o+r $downloadDir/*

hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l


########################################################################
# CHICKEN AGP'S (DONE 2005-03-16 kate)

cd /cluster/data/encode/ortho3/agp
set db  = galGal2
set org = chicken

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 92

# Get contig to accession mapping (documented in makeGalGal2.doc) 
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
# errors here: missing /cluster/data/galGal2/chr*_random.agp and chrM.agp files
# but there are no ENCODE orthologous sequences on these chicken chromosomes
csh genomeAgps.csh $db $org $db

# Test 
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions 
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ortho3 >&! testFa.$org.log &
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org \
    "Gallus gallus" 9031 N/A FEB-2004 $db "CGSC Feb. 2004" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 92

# Copy  to downloads area
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 92 

########################################################################
#  DOG AGP'S (DONE 2006-03-16 kate)
set db  = canFam1
set org = dog

cd /cluster/data/encode/ortho3/agp
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
csh genomeAgps.csh $db $org $db
# ignore error messages for random chrom's and chrM (not in dog assembly)

# Tests
cd tests
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 \
    $db $org "Canis Familiaris" 9615 N/A JUL-2004 $db \
    "Broad Institute v. 1.0" > $db/$db.packing.list

# Copy  to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp

hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
# 52

########################################################################
# CHIMP AGP'S (DONE 2005-03-16 kate)
set db  = panTro1
set org = chimp

cd /cluster/data/encode/ortho3/agp
set downloadDir = /usr/local/apache/htdocs/encode/ortho3/agp
set encodeRegionPackingList = /cluster/data/encode/bin/scripts/encodeRegionPackingList

# create consensus list
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
# NOTE: next time, put chimpChromContigs.agp into $outDir
cat /cluster/data/$db/?{,?}/*.agp | \
   chimpChromContigAgp stdin /cluster/data/$db/assembly.agp chimpChromContigs.agp
regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \
    $db/$db.encodeRegionsConsensus.bed5 chimpChromContigs.agp -dir $db

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 \
    $db  $org "Pan troglodytes" 9598 N/A NOV-2003 $db \
    "NCBI Build 1 v1" > $db/$db.packing.list

# Test
cd tests
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Copy  to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp

set downloadDir  = /usr/local/apache/htdocs/encode/ortho3/agp
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l
# 81

########################################################################
# OPOSSUM AGP'S (DONE 2005-03-17 kate)
set db  = monDom1
set org = opossum

cd /cluster/data/encode/ortho3/agp

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 146

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
gunzip -c /cluster/data/monDom1/broad.mit.edu/assembly.agp.gz > $db.assembly.agp
regionAgp -contigFile=$org.contig.tab -namePrefix=${org}_ \
        $db/$db.encodeRegionsConsensus.bed5 $db.assembly.agp -dir $db
rm $db.assembly.agp

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/

cat > testScaffoldFa.csh << 'EOF'
set db = $1
set org = $2
set freeze = $3
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/$freeze/html/$db.encodeRegionsConsensus.bed`
    set chr = $coords:r:r
    twoBitToFa /cluster/data/$db/monDom1.2bit \
        -seq=$chr -start=$coords:r:e -end=$coords:e $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << make emacs happy

csh testScaffoldFa.csh  $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Monodelphis domestica" 13616 N/A \
    OCT-2004 $db "Broad Institute" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 146

# Copy  to downloads area
set downloadDir  = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
#  146


########################################################################
# TETRA AGP'S (DONE 2005-03-17 kate)
set db  = tetNig1
set org = tetra

cd /cluster/data/encode/ortho3/agp

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 185

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Tetraodon nigroviridis" 99883 N/A \
    FEB-2004 $db "Genoscope V7" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 185

# Copy  to downloads area
set downloadDir  = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l

########################################################################
# ZEBRAFISH AGP'S (DONE 2005-03-17 kate)
set db  = danRer2
set org = zebrafish

cd /cluster/data/encode/ortho3/agp

#create consensus list from database, verify it's the same as consensus dir
mkdir $db
hgsql $db -e "select chrom, chromStart, chromEnd, name from encodeRegionsConsensus" | makeBed5.pl > $db/$db.encodeRegionsConsensus.bed5
sort $db/$db.encodeRegionsConsensus.bed5 > temp
sort ../consensus/$db.encodeRegionsConsensus.bed5 | diff - temp
# should be empty
hgsql -e "select count(*) from $db.encodeRegionsConsensus"
# 247

# get contig to accession mapping
hgsql $db -s -e "select * from contigAcc" > $org.contig.tab

# Create AGP's
csh genomeAgps.csh $db $org $db

# Test
cd tests
rm -fr $db.test $db.nibTest
mkdir -p $db.test $db.nibTest
# type the literal tab in
cat ../$db/${org}_EN*.agp | grep -v "	N" | cut -f6 > $db.accessions
head $db.accessions
./getSeqFromAcc.pl $db ortho3 < $db.accessions >& $db.log &
# verify download worked
ls -l $db.contig.fa | grep ' 0 ' | awk '{print $9}' > $db.redo.acc
wc -l $db.redo.acc
# download if any left
awk -F\| '{if (/^>/) printf (">%s\n", $4); else print;}' $db.contig.fa/* > $db.contigfile.fa
cat ../$db/*.agp > $db.contig.agp
agpAllToFaFile $db.contig.agp $db.contigfile.fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org ortho3 >&! testFa.$org.log
head testFa.$org.log
grep -v same testFa.$org.log
# no output -> passed test
cd ..

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $db/$db.encodeRegionsConsensus.bed5 $db $org "Danio rerio" 7955 N/A \
    JUN-2004 $db "Sanger Zv4" > $db/$db.packing.list
grep Region $db/$db.packing.list | wc -l
# 247

# Copy  to downloads area
set downloadDir  = /usr/local/apache/htdocs/encode/ortho3/agp
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
# 247


########################################################################
# cleanup

chmod -R o+r $downloadDir/* >& /dev/null
exit

########################################################################

# print summary stats

cat > stats.csh << 'EOF'
hgsql hg16 -N -e "select name from encodeRegions" | sort > allRegions
foreach f ($1/*.list)
    set db = $f:t:r:r
    set agps = `grep AgpFile $f | wc -l`
    set len = `awk '/SeqLength/ {len += $2} END {print len}' $f`
    set missing = `grep AgpFile $f | sed 's/.*\(EN.*\)_.*/\1/' | sort|uniq | diff - allRegions`
    printf "%-8s %5s %10s %s\n" $db $agps $len $missing
end 
rm allRegions
'EOF'

csh stats.csh $downloadDir

