#!/usr/bin/Rscript # Genomik und Bioinformatik I # Projekt 3 (Greedy contig assembly) # written by Andreas Loibl (enrollment no. 1524148) # THIS IS JUST A DRAFT IMPLEMENTATION! library("Biostrings") find_matches <- function(pool, read) { matches <- whichPDict(pool, read) if(sum(positions_used[matches])) cat("°") # the circle (°) shows occourrences of repetitions matches <- matches[which(!positions_used[matches])] return(matches) } extend <- function(contig) { cat("extending:", as.character(contig)) read <- subseq(contig, -nchar(pool[[1]])) while(TRUE) { match_positions <- list() for(base in DNA_ALPHABET[1:4]) { read_to_search_for <- c(subseq(read,2),DNAString(base)) matches <- c(find_matches(pool, read_to_search_for), -find_matches(pool, reverseComplement(read_to_search_for))) if(length(matches)) match_positions[[base]] <- matches } if(!length(match_positions)) { cat("\nno read found.\n") return(contig) } else if(length(match_positions)>1) { cat("\nmismatch.\n") return(contig) } else { for(position in match_positions[[1]]) positions_used[abs(position)] <<- TRUE match <- reads[[abs(match_positions[[1]][1])]] if(match_positions[[1]][1]<0) match <- reverseComplement(match) cat(as.character(subseq(match, -1))) contig <- c(contig, subseq(match, -1)) read <- match } } } dump_contigs <- function(contigs) { cc <- c(as.character(contigs), as.character(reverseComplement(contigs))) cc <- unique(sort(cc)) cat(cc, sep="\n") } resample <- function(x, ...) x[sample.int(length(x), ...)] # taken from example(sample) ## For R >= 2.11.0 only percent <- 100 reads <- read.DNAStringSet("errorfreeReads.fna", format="fasta") reads <- resample(reads, percent/100 * length(reads)) reads <- unique(reads) pool <- PDict(reads) positions_used <- logical(length=length(pool)) contigs <- DNAStringSet() while(length(which(!positions_used))) { random_position <- resample(which(!positions_used), 1) positions_used[random_position] <- TRUE contig <- reads[[random_position]] cat("processing contig", length(contigs)+1, "-", length(which(!positions_used)), "unprocessed reads left...\n") contigs <- append(contigs, DNAStringSet(reverseComplement(extend(reverseComplement(extend(contig)))))) } dump_contigs(contigs)