#!/usr/bin/Rscript # Genomik und Bioinformatik I # Projekt 2 (GC-skew in bacterial genomes) # written by Andreas Loibl (enrollment no. 1524148) # these libraries are recommended, but for most of the functions optional. # see the comment headers of the functions for more information whether it is needed or not. require("XML") require("Biobase") require("Biostrings") # URL where the Entrez Programming Utilities API can be accessed # see the "e*" functions or http://eutils.ncbi.nlm.nih.gov/ for more information urlbase <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" # comment out the following line to disable the use of the annotations cache; # see function "eAnnotations" for more information if(!exists("annotations_cache")) annotations_cache <<- list(NULL) # eSummary ########### # downloads the document summary of a given primary ID (i.e. GI number for nucleotide database) # from NCBI (using Entrez Programming Utilities API) # # dependencies: # library(XML) [required] eSummary <- function(id) { if(!existsFunction("xmlInternalTreeParse")) stop("library(XML) is required for this function!") url <- paste(urlbase, "esummary.fcgi?db=nuccore&retmode=xml&id=", id, sep="") xml <- xmlInternalTreeParse(readLines(url)) return(xml) } # eFetch ########## # downloads the sequence of a given primary ID (i.e. GI number for nucleotide database) # in FASTA format from NCBI (using Entrez Programming Utilities API) # # dependencies: # library(Biostrings) [optional, recommended] eFetch <- function(id) { url <- paste(urlbase, "efetch.fcgi?db=nuccore&rettype=fasta&id=", id, sep="") cat("downloading...\n") if(existsFunction("readFASTA")) { return(readFASTA(url, strip.descs=TRUE)) } else { # assuming the url only contains one FASTA block... # (this is just a fallback if Biostrings is not available) lines <- readLines(url) return(list(list(desc = gsub("^>", "", lines[1]), seq = paste(lines[-1], collapse="")))) } } # eAnnotations ############### # downloads the sequence of a given primary ID (i.e. GI number for nucleotide database) # in "Feature Table report" (text) format from NCBI (using Entrez Programming Utilities API) # if the annotations cache is enabled it is stored there for faster access on later use # eAnnotations <- function(id) { if(typeof(id) != "character") id <- as.character(id) url <- paste(urlbase, "efetch.fcgi?db=nuccore&rettype=ft&id=", id, sep="") if(exists("annotations_cache")) { if(is.null(annotations_cache[[id]])) annotations_cache[[id]] <<- readLines(url) return(annotations_cache[[id]]) } return(readLines(url)) } # retrieve_genome ################## # searches the NCBI nucleotide database for the supplied search term and downloads the matching # result using eFetch and returns it. # # * if more than one match is found the user will be prompted to choose the desired one from a list # * if more than 20 matches are found then no list is shown, only an error message claiming that there # are too many matches (so the user has to use more precise search terms) # * if there are no matches an error message will be returned # # as search term any string that can identify a genome (e.g. a string like "e.coli k12 dh10b" or a # identifier like the EMBL ID or the GI number) may be used. # # dependencies: # library(XML) [required] retrieve_genome <- function(term) { if(!existsFunction("xmlInternalTreeParse")) stop("library(XML) is required for this function!") urlencode <- function(data) return(paste("%", charToRaw(data), collapse="", sep="")) url <- paste(urlbase, "esearch.fcgi?db=nuccore&term=", urlencode(term), sep="") xml <- xmlInternalTreeParse(readLines(url)) results.count <- xmlValue(getNodeSet(xml, "/eSearchResult/Count")[[1]]) if(results.count > 20) { stop("too many (", results.count, ") results!\n") } else if(results.count > 1) { IDs <- list(NULL); IDs[[1]] <- NULL for(id in xmlApply(getNodeSet(xml, "/eSearchResult/IdList")[[1]], xmlValue)) { n <- length(IDs)+1 IDs[[n]] <- id eSum <- eSummary(id) title <- xmlValue(getNodeSet(eSum, "/eSummaryResult/DocSum/Item[@Name='Title']")[[1]]) caption <- xmlValue(getNodeSet(eSum, "/eSummaryResult/DocSum/Item[@Name='Caption']")[[1]]) size <- round(as.numeric(xmlValue(getNodeSet(eSum, "/eSummaryResult/DocSum/Item[@Name='Length']")[[1]]))/1024/1024, 2) cat(" ", n, "\t", title, " [", caption, "] ", size, "MB \n") } choice <- readline("Choice: ") choice.id <- IDs[[as.numeric(choice)]] if(choice.id > 0) { return(eFetch(choice.id)) } else { return(FALSE) } } else if(results.count ==1) { id <- xmlValue(getNodeSet(xml, "/eSearchResult/IdList/Id")[[1]]) return(eFetch(id)) } else { stop("genome not found!") } } # get_annotation ################# # fetches the annotation at a given position inside a sequence (and its neighborhood) # # * "data" needs to be a FASTA list that has a GI number in its description, # because this identifier is required for eAnnotations to fetch the annotations from NCBI # * if "pos" is not supplied the whole annotation will be returned (as list of strings) # * if "variance" is zero only exact matches will be shown, otherwise all matches within this # given variance are listed # get_annotation <- function(data, pos=0, variance=1000) { if(typeof(data) == "list") { # get ID from FASTA Header and fetch annotations from NCBI data <- eAnnotations(strsplit(data[[1]]$desc, "\\|")[[1]][2]) } else { stop("data needs to be a FASTA list that has a GI number in its description") } data <- strsplit(strsplit(gsub("\r\t\t\t", "", paste(data, collapse="\n\r")),"\r")[[1]],"\t") data[[1]] <- NULL # remove header pos <- round(pos) # values like 0.2bp make no sense if(pos==0) return(data) cat("Exact matches:\n") for(section in data) { base.span <- as.numeric(gsub("[^0-9]","",section)[1:2]) if(pos %in% seq(base.span[1], base.span[2])) cat(sub("\n","\n\t",section),"\n") } if(variance==0) return(0) cat("Surrounding area:\n") for(section in data) { base.span <- as.numeric(gsub("[^0-9]","",section)[1:2]) if(abs(base.span[1]-pos) < variance || abs(base.span[2]-pos) < variance) cat(sub("\n","\n\t",section),"\n") } } # unFASTA ########## # returns the supplied sequence as a character vector (i.e. "unlist" FASTA lists) # unFASTA <- function(seq) { if(typeof(seq) == "list") seq <- seq[[1]]$seq return(seq) } # skew_characters ################## # returns a list containing the frequencies of A, T, C and G nucleotides in the supplied sequence # # if the Biobase-library is present this function will be a lot faster so it is recommended to have this library installed. # # dependencies: # library(Biobase) [optional, recommended] skew_characters <- function(seq) { if(existsFunction("alphabetFrequency")) { freq <- alphabetFrequency(DNAString(seq)) return(list(A=freq["A"], T=freq["T"], C=freq["C"], G=freq["G"])) } else { freq <- t(data.frame(table(factor(strsplit(seq,"")[[1]],levels=c("A","T","C","G"))), row.names=1)) return(list(A=freq[,"A"], T=freq[,"T"], C=freq[,"C"], G=freq[,"G"])) } } # GC_skew ########## # calculates the GC skew of the supplied sequence and therefore returns a numeric value between -1 and 1 # GC_skew <- function(seq) { seq <- unFASTA(seq) freq <- skew_characters(seq) if(freq$G+freq$C == 0) return(0) return(as.numeric((freq$G-freq$C)/(freq$G+freq$C))) } # cumulated_GC_skew #################### # returns a vector of GC skews that are cumulated for each position supplied in "n" # # * "n" is the number of nucleotides of which the cumulated GC skew should be calculated. # - if not supplied the GC skew will be calculated for the whole length of the sequence # - "n" may be a vector consisting of several nucleotide positions in the sequence, # so a vector of the cumulated GC skew of the first n nucleotides for each position # of the vector will be returned # * "step" is a shortcut for "seq(1, n, step)" as vector "n" # # for better performance this function doesn't count the GC skew from the start of the sequence # for each position in the vector "n", but it sums up the differences from the last one to the # next one (using diff/diffinv): # # NOT: BUT: # 3 ### 3 ### # 5 ##### 5 ## # 9 ######### 9 #### # cumulated_GC_skew <- function(seq, n=nchar(seq), step=NULL) { seq <- unFASTA(seq) pos <- 1 if(!is.null(step)) n <- seq(0, n, step) if(length(n)>1) n <- diff(n) # differenciate \ diffinv(sapply(n, function(x) { # integrate --> see function header comment (performance) freq <- skew_characters(substr(seq,pos,pos+x-1)) pos <<- pos+x return(freq$G-freq$C) }))[-1] # first element of diffinv is always 0 -> skip that } # sliding_GC_skew ################## # returns a vector of GC skews that are calculated for each window of sequence # for the given sliding parameters "winsize" and "stepsize" # sliding_GC_skew <- function(seq, winsize, stepsize) { seq <- unFASTA(seq) return(mapply(function(shiftx) {GC_skew(substr(seq,shiftx,shiftx+winsize-1))}, seq(1, (nchar(seq)-winsize+1), stepsize))) } # get_extremum ############### # returns the (numeric) position of the extremum of the cumulated GC skew of a given sequence # the parameter FUN should therefore be either which.min (for minimas) or which.max (for maximas) # # this function is recursive and approximates the extremum better in every step. # get_extremum <- function(seq, FUN) { seq <- unFASTA(seq) l <- nchar(seq)*2/100 s <- cumulated_GC_skew(seq, step=l) pos <- FUN(s)*l if(l<=5) return(pos) lb <- max(c(pos-l*2,1)) ub <- min(c(pos+l*2,nchar(seq))) return(lb+get_extremum(substr(seq,lb,ub), FUN)-1) } # get_name_of_sequence ####################### # returns the "name" (5th column in the description of a FASTA list) of a sequence. # get_name_of_sequence <- function(data) { if(typeof(data) == "list") return(strsplit(data[[1]]$desc, "\\|")[[1]][5]) else return(" sequence") } # plot_cumulated_GC_skew ####################### # cumulates the GC skew along the given sequence, locates and outputs the positions of its minimum # and maximum and plots these extremas into a graph of the cumulated GC skew # # (invisibly) returns the positions of minimum and maximum (may be useful for further processing, # e.g. for get_annotations) # plot_cumulated_GC_skew <- function(seq, step=0) { if(step==0) step <- nchar(seq)/100 data <- cumulated_GC_skew(seq, step=step) # calculate exact positions of minimum and maximum minimum <- round(get_extremum(seq, which.min)) maximum <- round(get_extremum(seq, which.max)) plot(seq(step/2, length(data)*step, step), data, type="l", ylab="GC skew", xlab="chromosome position", main=paste("Cumulated GC skew of", get_name_of_sequence(seq), sep="") ) abline(v=c(minimum, maximum), lty=3) text(x=0, y=min(data), adj=c(0,0), labels=paste("Minimum: ", minimum, "bp\nMaximum: ", maximum, "bp", sep="")) return(invisible(list(min = minimum, max = maximum))) } # plot_sliding_GC_skew ####################### # slides a window of given window- and step-size along the given sequence, calculates the GC skew # for each window and plots it into a graph # plot_sliding_GC_skew <- function(seq, winsize, stepsize) { data <- sliding_GC_skew(seq, winsize, stepsize) plot(data, type="l", ylab="GC skew", xlab="chromosome position", main=paste("Sliding GC skew of", get_name_of_sequence(seq), sep="") ) text(x=1, y=min(data), adj=c(0,0), labels=paste("Window size: ", winsize, "bp\nStep size: ", stepsize, "bp", sep="")) } # example_run ############## # this function is just an example that shows how the other functions can be used and how they work together. # example_run <- function() { cat("Example run:\n") cat("(01) Fetch data from NCBI - Searching for \"e.coli k12 dh10b\"...\n") ecoli <- retrieve_genome("e.coli k12 dh10b") cat("(02) Sliding GC skew (Plot, winsize=50000, stepsize=10000)\n") par(mfrow=c(2,2)) # split window into 4 (2x2) plots plot_sliding_GC_skew(ecoli, winsize=50000, stepsize=10000) cat("(03) Sliding GC skew (Plot, winsize=20000, stepsize=5000)\n") plot_sliding_GC_skew(ecoli, winsize=20000, stepsize=5000) cat("(04) Cumulated GC skew (Plot, step=80000)\n") plot_cumulated_GC_skew(ecoli, step=80000) cat("(05) Cumulated GC skew (Plot, step=10000, store minimum)\n") minimum <- plot_cumulated_GC_skew(ecoli, step=10000)$min cat("(06) Minimum of cumulated GC skew is at", minimum, "- looking up that position in annotations database...\n") get_annotation(ecoli, minimum, variance=0) # variance=0 because I expect a "perfect" match for the gidA gene at the minimum cat("(07) Step 06 should have located the gidA gene at the minimum of the cumulated GC skew...\n", " The gidA gene is located next to the origin of replication, so we have (approximatly) located the oriC :-)\n") cat("(08) Fetch data from NCBI - Searching for \"AY394850\" (corona virus related to SARS)...\n") sars <- retrieve_genome("AY394850") cat("(09) GC_skew:", GC_skew(sars), "\n") cat("(10) cumulated_GC_skew (n=100):", cumulated_GC_skew(sars, 100), "\n") }