BedFlanking

From genomewiki
Jump to navigationJump to search

A recurring question on the genome mailing list is: "How can I annotate flanking features with the table browser?". The answers are usually elusive, as far as I know, you cannot do this directly with the table browser. I couldn't find a tool for it in the source tree either. It's possible with an SQL query but very very slow (SQL is not very "linear"). I therefore hacked something together in C using Jim's lib. The way it's written is a bit weird, I would love to see some comments here in the wiki. I hope it doesn't exist yet in the source tree.

You can use it like this:

 bedFlanking myRegions.bed knownGenes.bed result.bed

To compile it, you need the normal makefile for all of Jim's programs, like this here and, of course, the UCSC source tree. Install and compile the source tree as described in the documentation (it's worth the time). Copy/paste the following source to a file, say "bedFlanking.c", copy/paste the makefile to the file "makefile" and then run "makefile PROG=bedFlanking KENT=(path where you installed the source tree)".

/* bedFlanking - add flanking genes/featurenames to bed file regions */
/* Max Haeussler, 06/2006, you can do what you want with this code but if you modify it, please add your corrections to this page!

#include "common.h"
#include "bed.h"
#include "bed.h"
#include "options.h"
#include "common.h"
#include "fa.h"
#include "dnaseq.h"
#include "errabort.h"
#include "verbose.h"

#define TAG_END 0x70000000
#define TAG_START 0

int concat;
int dropOverlap;

void usage()
/* Explain usage and exit. */
{
errAbort(
  "bedFlanking - add flanking genes/featurenames to bed file regions \n"
  "     given a list of regions of interest, we add the featurenames \n"
  "     from the two flanking features (usually genes) to it.\n"
  "     both bed files have to sorted! (-> bedSort) \n"
  "     If regions overlap the exons, they are trimmed, however, only the \n"
  "     LAST part of the overlapping region is considered \n"
  "usage:\n"
  "   bedFlanking [options] regionsInterest.bed genes.bed outfile (stdout ok)\n"
  "options:\n"
  "     -upstream    output only upstream genes\n"
  "     -minLen=x      after trimming, drop feature if shorter than x bp\n"
  "     -concat      do not overwrite old feature names but concat info to it\n"
  "     -dropOverlap   if a region contains a gene, drop the region.\n"
  "                    default is to ignore the gene\n");
}


boolean overlap(struct bed* g1, struct bed* g2) {
    return (rangeIntersection(g1->chromStart, g1->chromEnd, g2->chromStart, g2->chromEnd)>0); 
}

int length(struct bed* g1) {
    return g1->chromEnd - g1->chromStart; 
}

void bedLoadVerbose(char* fname, struct bed** list) {
    verbose(1,"Loading %s \n",fname);
    *list = bedLoadAll(fname);
    verbose(1, "Loaded %d elements \n", slCount(*list));
}

/** search chrom changes in bedlist and insert dummy features to mark chrom changes */
void insertChromEndDummies(struct bed* list) {
    while (list!=NULL) {
            if (list->next==NULL || strcmp(list->next->chrom, list->chrom)!=0 ) {
                struct bed* dummy = needMem(sizeof(struct bed));
                dummy->chromStart=TAG_END;
                dummy->chromEnd=TAG_END;
                dummy->chrom = list->chrom;
                dummy->name="Gap";
                dummy->chrom=list->chrom;

                dummy->next = list->next;
                list->next = dummy;
                list = dummy->next;
            }
        if (list==NULL)
            return;
        list = list->next;
    }
}

/** print feature to output */
void printBedLine(FILE* out, struct bed* region, struct bed* leftGene, struct bed* rightGene, boolean upstream) {
    if (upstream) {
        int printed=0;
        char name[511];
        name[0] = 0;
        if (rightGene->strand[0]=='+') {
            strcat(name, rightGene->name);
            printed++;
        }
        if (leftGene->strand[0]=='-') {
            if (printed) {
                strcat(name, "-");
            }
            strcat(name, leftGene->name);
            printed++;
        }
        if (printed==0) {
            strcat(name, leftGene->name);
            strcat(name, "-");
            strcat(name, rightGene->name);
        }
        fprintf(out,"%s\t%d\t%d\t%s\n", region->chrom, region->chromStart, region->chromEnd, name);
    }
    else {
        if (concat)  
        fprintf(out,"%s\t%d\t%d\t%s|%s|%s\t%d\t%s\n", region->chrom, region->chromStart, region->chromEnd, region->name, leftGene->name, rightGene->name, region->score, region->strand);
        else
        fprintf(out,"%s\t%d\t%d\t%s|%s\t%d\t%s\n", region->chrom, region->chromStart, region->chromEnd, leftGene->name, rightGene->name, region->score, region->strand);
    }

}

/** main subroutine */ 
void bedFlanking(char* regionName, char* geneName, FILE* outf, boolean upstream, int minLen) {

    struct bed *region = NULL;
    struct bed *gene = NULL;

    //    if (bedSizeRegion < 3 ) {
    //        errAbort("%s must have a at least 3 cols\n", regionName);
    //    }
    //    if (bedSizeGene < 3 ) {
    //        errAbort("%s must have a at least 3 cols\n", geneName);
    //    }
    //
    bedLoadVerbose(regionName, &region);
    bedLoadVerbose(geneName, &gene);

    /* prep dummy element */
    struct bed* dummy = needMem(sizeof(struct bed));
    dummy->next = gene;
    dummy->chromStart=0;
    dummy->chromEnd=0;
    dummy->chrom=gene->chrom;
    dummy->name="(ScaffoldEnd)";
    boolean chromChange = FALSE;
    //char* lastChrom = "xxxxxx";

    struct bed* leftGene = dummy;
    struct bed* rightGene = gene;

    /* iterate over chromosomes */
    insertChromEndDummies(gene); 
    
    //struct bed* b = gene;
    //while (b !=NULL) {
        //bedOutputN(b, 3, outf, '\t', '\n');
        //b = b->next;
    //}

    /* iterate over regions */
    while ( rightGene != NULL && region != NULL ) {

        //verbose(1, "\nRegion %s:%d-%d\n", region->chrom, region->chromStart, region->chromEnd);
        //verbose(1, "LeftGene at %s:%d-%d (%s) \nRightGene at %s:%d-%d (%s)\n", leftGene->chrom, leftGene->chromStart, leftGene->chromEnd, leftGene->name, rightGene->chrom, rightGene->chromStart, rightGene->chromEnd, rightGene->name);
        //verbose(1, "\n");

        /* if needed: fast forward to next gene with same chrom */
        while (rightGene!=NULL && strcmp(region->chrom,rightGene->chrom)!=0) {
            //verbose(1, "skipping %s, %d\n", leftGene->chrom, leftGene->chromStart);
            leftGene = rightGene;
            rightGene = rightGene->next;
            if (rightGene==NULL) 
                return;
            if (leftGene->chromStart==TAG_END) { // Adjust dummy values to make "<" compare work
                leftGene->chromStart=TAG_START;
                leftGene->chromEnd=TAG_START;
                leftGene->chrom=rightGene->chrom;
            }
        }

        /* if next gene is completely INSIDE the region, we ignore either the gene or the region */
        while (region->chromStart <= rightGene->chromStart && region->chromEnd >= rightGene->chromEnd) {
            if (!dropOverlap) {
                leftGene->next = rightGene->next;
                rightGene = rightGene->next;
            }
            else
                region = region->next;
        }

        /* region ends somewhere before next exon */
        if (region->chromEnd <= rightGene->chromEnd ) {
            /* region overlaps gene: trim region */
            if (overlap(region, leftGene)) {
                region->chromStart = leftGene->chromEnd;
            }
            if (overlap(region, rightGene)) {
                region->chromEnd = rightGene->chromStart-1;
            }

            /* print if not too short */
            if (length(region) > minLen) {
                printBedLine(outf, region, leftGene, rightGene, upstream);
                //lastChrom = region->chrom;
                region = region->next;
            }
            else {
                /* otherwise goto next region and bail out */
                //lastChrom = region->chrom;
                region = region->next;
            }
        } 
        else 
            /* get next exons */
            if (rightGene->next!=NULL) {
                leftGene = rightGene;
                rightGene = rightGene->next;
                if (leftGene->chromStart==TAG_END) {
                    leftGene->chromStart=0;
                    leftGene->chromEnd=0;
                }
            }

    } // while rightGene!=NULL
} // func

/* == MAIN ========================= */
int main(int argc, char *argv[])
/* Process command line. */
{
//int* xy = 0;
//printf ("%d", *xy);
static struct optionSpec optionSpecs[] = {
     {"verbosity", OPTION_INT} ,
     {"minLen", OPTION_INT} ,
     {"upstream", OPTION_BOOLEAN} ,
     {"concat", OPTION_BOOLEAN} , 
     {"dropOverlap", OPTION_BOOLEAN} 
    };
char *regionFile, *geneFile, *outfileName;
FILE *outf;
int verbosity=1;

if (argc < 3) usage();
optionInit(&argc, argv, optionSpecs);

verbosity = optionInt("verbosity", verbosity);
verboseSetLogFile("stderr");
verboseSetLevel(verbosity);

regionFile  = argv[1];
geneFile = argv[2];
outfileName = argv[3];
boolean upstream = optionExists("upstream");
int minLen = 10;
minLen = optionInt("minLen", minLen);
concat = optionExists("concat");
dropOverlap = optionExists("dropOverlap"); 

if (outfileName==NULL) {
    errAbort("error: No outfile specfified.\n");
}
if (strcmp(outfileName,"stdout")) {
    outf = mustOpen(outfileName,  "w");
}
else {
    outf = stdout;
}
bedFlanking(regionFile, geneFile, outf, upstream, minLen);

fclose(outf);
fprintf(stderr, "%s created.\n", outfileName);
return 0;
}