#!/usr/bin/Rscript
# Genomik und Bioinformatik I
# Projekt 2 (GC-skew in bacterial genomes)
# written by Andreas Loibl (enrollment no. 1524148)

# these libraries are recommended, but for most of the functions optional.
# see the comment headers of the functions for more information whether it is needed or not.
require("XML")
require("Biobase")
require("Biostrings")

# URL where the Entrez Programming Utilities API can be accessed
# see the "e*" functions or http://eutils.ncbi.nlm.nih.gov/ for more information
urlbase <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# comment out the following line to disable the use of the annotations cache;
# see function "eAnnotations" for more information
if(!exists("annotations_cache")) annotations_cache <<- list(NULL)

# eSummary
###########
# downloads the document summary of a given primary ID (i.e. GI number for nucleotide database)
# from NCBI (using Entrez Programming Utilities API)
# 
# dependencies:
# 	library(XML)			[required]
eSummary <- function(id)
{
        if(!existsFunction("xmlInternalTreeParse")) stop("library(XML) is required for this function!")	
        url <- paste(urlbase, "esummary.fcgi?db=nuccore&retmode=xml&id=", id, sep="")
        xml <- xmlInternalTreeParse(readLines(url))
	return(xml)
}

# eFetch
##########
# downloads the sequence of a given primary ID (i.e. GI number for nucleotide database)
# in FASTA format from NCBI (using Entrez Programming Utilities API)
# 
# dependencies:
# 	library(Biostrings)		[optional, recommended]
eFetch <- function(id)
{
        url <- paste(urlbase, "efetch.fcgi?db=nuccore&rettype=fasta&id=", id, sep="")
	cat("downloading...\n")
	if(existsFunction("readFASTA"))
	{
	        return(readFASTA(url, strip.descs=TRUE))
	}
	else
	{
		# assuming the url only contains one FASTA block...
		# (this is just a fallback if Biostrings is not available)
		lines <- readLines(url)
		return(list(list(desc = gsub("^>", "", lines[1]), seq = paste(lines[-1], collapse=""))))
	}
}

# eAnnotations
###############
# downloads the sequence of a given primary ID (i.e. GI number for nucleotide database)
# in "Feature Table report" (text) format from NCBI (using Entrez Programming Utilities API)
# if the annotations cache is enabled it is stored there for faster access on later use
#
eAnnotations <- function(id)
{
	if(typeof(id) != "character") id <- as.character(id)
        url <- paste(urlbase, "efetch.fcgi?db=nuccore&rettype=ft&id=", id, sep="")
	if(exists("annotations_cache"))
	{
		if(is.null(annotations_cache[[id]])) annotations_cache[[id]] <<- readLines(url)
		return(annotations_cache[[id]])
	}
        return(readLines(url))
}

# retrieve_genome
##################
# searches the NCBI nucleotide database for the supplied search term and downloads the matching
# result using eFetch and returns it.
# 
# * if more than one match is found the user will be prompted to choose the desired one from a list
# * if more than 20 matches are found then no list is shown, only an error message claiming that there
#   are too many matches (so the user has to use more precise search terms)
# * if there are no matches an error message will be returned
# 
# as search term any string that can identify a genome (e.g. a string like "e.coli k12 dh10b" or a
# identifier like the EMBL ID or the GI number) may be used.
# 
# dependencies:
# 	library(XML)			[required]
retrieve_genome <- function(term)
{
	if(!existsFunction("xmlInternalTreeParse")) stop("library(XML) is required for this function!")
	urlencode <- function(data) return(paste("%", charToRaw(data), collapse="", sep=""))
	url <- paste(urlbase, "esearch.fcgi?db=nuccore&term=", urlencode(term), sep="")
	xml <- xmlInternalTreeParse(readLines(url))
	results.count <- xmlValue(getNodeSet(xml, "/eSearchResult/Count")[[1]])
	if(results.count > 20)
	{
		stop("too many (", results.count, ") results!\n")
	}
	else if(results.count > 1)
	{
		IDs <- list(NULL); IDs[[1]] <- NULL
		for(id in xmlApply(getNodeSet(xml, "/eSearchResult/IdList")[[1]], xmlValue))
		{
			n <- length(IDs)+1
			IDs[[n]] <- id
			eSum <- eSummary(id)
			title <- xmlValue(getNodeSet(eSum, "/eSummaryResult/DocSum/Item[@Name='Title']")[[1]])
			caption <- xmlValue(getNodeSet(eSum, "/eSummaryResult/DocSum/Item[@Name='Caption']")[[1]])
			size <- round(as.numeric(xmlValue(getNodeSet(eSum, "/eSummaryResult/DocSum/Item[@Name='Length']")[[1]]))/1024/1024, 2)
			cat("   ", n, "\t", title, " [", caption, "] ", size, "MB \n")
		}
		choice <- readline("Choice: ")
		choice.id <- IDs[[as.numeric(choice)]]
		if(choice.id > 0)
		{
			return(eFetch(choice.id))
		}
		else
		{
			return(FALSE)
		}
	}
	else if(results.count ==1)
	{
		id <- xmlValue(getNodeSet(xml, "/eSearchResult/IdList/Id")[[1]])
		return(eFetch(id))
	}
	else
	{
		stop("genome not found!")
	}
}

# get_annotation
#################
# fetches the annotation at a given position inside a sequence (and its neighborhood)
# 
# * "data" needs to be a FASTA list that has a GI number in its description,
#   because this identifier is required for eAnnotations to fetch the annotations from NCBI
# * if "pos" is not supplied the whole annotation will be returned (as list of strings)
# * if "variance" is zero only exact matches will be shown, otherwise all matches within this
#   given variance are listed
#
get_annotation <- function(data, pos=0, variance=1000)
{
	if(typeof(data) == "list")
	{
		# get ID from FASTA Header and fetch annotations from NCBI
		data <- eAnnotations(strsplit(data[[1]]$desc, "\\|")[[1]][2])
	}
	else
	{
		stop("data needs to be a FASTA list that has a GI number in its description")
	}
	data <- strsplit(strsplit(gsub("\r\t\t\t", "", paste(data, collapse="\n\r")),"\r")[[1]],"\t")
	data[[1]] <- NULL # remove header
	pos <- round(pos) # values like 0.2bp make no sense
	if(pos==0) return(data)
	cat("Exact matches:\n")
	for(section in data)
	{
		base.span <- as.numeric(gsub("[^0-9]","",section)[1:2])
		if(pos %in% seq(base.span[1], base.span[2])) cat(sub("\n","\n\t",section),"\n")
	}
	if(variance==0) return(0)
	cat("Surrounding area:\n")
	for(section in data)
	{
		base.span <- as.numeric(gsub("[^0-9]","",section)[1:2])
		if(abs(base.span[1]-pos) < variance || abs(base.span[2]-pos) < variance) cat(sub("\n","\n\t",section),"\n")
	}
}

# unFASTA
##########
# returns the supplied sequence as a character vector (i.e. "unlist" FASTA lists)
# 
unFASTA <- function(seq)
{
	if(typeof(seq) == "list") seq <- seq[[1]]$seq
	return(seq)
}

# skew_characters
##################
# returns a list containing the frequencies of A, T, C and G nucleotides in the supplied sequence
# 
# if the Biobase-library is present this function will be a lot faster so it is recommended to have this library installed.
# 
# dependencies:
# 	library(Biobase)		[optional, recommended]
skew_characters <- function(seq)
{
	if(existsFunction("alphabetFrequency"))
	{
		freq <- alphabetFrequency(DNAString(seq))
		return(list(A=freq["A"], T=freq["T"], C=freq["C"], G=freq["G"]))
	}
	else
	{
		freq <- t(data.frame(table(factor(strsplit(seq,"")[[1]],levels=c("A","T","C","G"))), row.names=1))
		return(list(A=freq[,"A"], T=freq[,"T"], C=freq[,"C"], G=freq[,"G"]))
	}
}

# GC_skew
##########
# calculates the GC skew of the supplied sequence and therefore returns a numeric value between -1 and 1
# 
GC_skew <- function(seq)
{
	seq <- unFASTA(seq)
	freq <- skew_characters(seq)
	if(freq$G+freq$C == 0) return(0)
	return(as.numeric((freq$G-freq$C)/(freq$G+freq$C)))
}

# cumulated_GC_skew
####################
# returns a vector of GC skews that are cumulated for each position supplied in "n"
# 
# * "n" is the number of nucleotides of which the cumulated GC skew should be calculated.
#   - if not supplied the GC skew will be calculated for the whole length of the sequence
#   - "n" may be a vector consisting of several nucleotide positions in the sequence,
#     so a vector of the cumulated GC skew of the first n nucleotides for each position
#     of the vector will be returned
# * "step" is a shortcut for "seq(1, n, step)" as vector "n"
# 
# for better performance this function doesn't count the GC skew from the start of the sequence
# for each position in the vector "n", but it sums up the differences from the last one to the
# next one (using diff/diffinv):
# 
#  NOT:               BUT:
#  3 ###              3 ###
#  5 #####            5    ##
#  9 #########        9      ####
#
cumulated_GC_skew <- function(seq, n=nchar(seq), step=NULL)
{
	seq <- unFASTA(seq)
	pos <- 1
	if(!is.null(step)) n <- seq(0, n, step)
	if(length(n)>1) n <- diff(n)    # differenciate \
	diffinv(sapply(n, function(x) {     # integrate --> see function header comment (performance)
		freq <- skew_characters(substr(seq,pos,pos+x-1))
		pos <<- pos+x
		return(freq$G-freq$C)
	}))[-1] # first element of diffinv is always 0 -> skip that
}

# sliding_GC_skew
##################
# returns a vector of GC skews that are calculated for each window of sequence
# for the given sliding parameters "winsize" and "stepsize"
# 
sliding_GC_skew <- function(seq, winsize, stepsize)
{
	seq <- unFASTA(seq)
        return(mapply(function(shiftx) {GC_skew(substr(seq,shiftx,shiftx+winsize-1))},
                seq(1, (nchar(seq)-winsize+1), stepsize)))
}

# get_extremum
###############
# returns the (numeric) position of the extremum of the cumulated GC skew of a given sequence
# the parameter FUN should therefore be either which.min (for minimas) or which.max (for maximas)
# 
# this function is recursive and approximates the extremum better in every step.
# 
get_extremum <- function(seq, FUN)
{
	seq <- unFASTA(seq)
	l <- nchar(seq)*2/100
	s <- cumulated_GC_skew(seq, step=l)
	pos <- FUN(s)*l
	if(l<=5) return(pos)
	lb <- max(c(pos-l*2,1))
	ub <- min(c(pos+l*2,nchar(seq)))
        return(lb+get_extremum(substr(seq,lb,ub), FUN)-1)
}

# get_name_of_sequence
#######################
# returns the "name" (5th column in the description of a FASTA list) of a sequence.
# 
get_name_of_sequence <- function(data)
{
        if(typeof(data) == "list")
                return(strsplit(data[[1]]$desc, "\\|")[[1]][5])
	else
		return(" sequence")
}

# plot_cumulated_GC_skew
#######################
# cumulates the GC skew along the given sequence, locates and outputs the positions of its minimum
# and maximum and plots these extremas into a graph of the cumulated GC skew
# 
# (invisibly) returns the positions of minimum and maximum (may be useful for further processing,
# e.g. for get_annotations)
# 
plot_cumulated_GC_skew <- function(seq, step=0)
{
	if(step==0) step <- nchar(seq)/100
	data <- cumulated_GC_skew(seq, step=step)

	# calculate exact positions of minimum and maximum
	minimum <- round(get_extremum(seq, which.min))
	maximum <- round(get_extremum(seq, which.max))

	plot(seq(step/2, length(data)*step, step), data, type="l",
		ylab="GC skew",
		xlab="chromosome position",
		main=paste("Cumulated GC skew of", get_name_of_sequence(seq), sep="")
		)
	abline(v=c(minimum, maximum), lty=3)
	text(x=0, y=min(data), adj=c(0,0), labels=paste("Minimum: ", minimum, "bp\nMaximum: ", maximum, "bp", sep=""))
	return(invisible(list(min = minimum, max = maximum)))
}

# plot_sliding_GC_skew
#######################
# slides a window of given window- and step-size along the given sequence, calculates the GC skew
# for each window and plots it into a graph
# 
plot_sliding_GC_skew <- function(seq, winsize, stepsize)
{
	data <- sliding_GC_skew(seq, winsize, stepsize)
	plot(data, type="l",
		ylab="GC skew",
		xlab="chromosome position",
		main=paste("Sliding GC skew of", get_name_of_sequence(seq), sep="")
		)
	text(x=1, y=min(data), adj=c(0,0), labels=paste("Window size: ", winsize, "bp\nStep size: ", stepsize, "bp", sep=""))
}

# example_run
##############
# this function is just an example that shows how the other functions can be used and how they work together.
# 
example_run <- function()
{
	cat("Example run:\n")
	cat("(01) Fetch data from NCBI - Searching for \"e.coli k12 dh10b\"...\n")
	ecoli <- retrieve_genome("e.coli k12 dh10b")
	cat("(02) Sliding GC skew (Plot, winsize=50000, stepsize=10000)\n")
	par(mfrow=c(2,2)) # split window into 4 (2x2) plots
	plot_sliding_GC_skew(ecoli, winsize=50000, stepsize=10000)
	cat("(03) Sliding GC skew (Plot, winsize=20000, stepsize=5000)\n")
	plot_sliding_GC_skew(ecoli, winsize=20000, stepsize=5000)
	cat("(04) Cumulated GC skew (Plot, step=80000)\n")
	plot_cumulated_GC_skew(ecoli, step=80000)
	cat("(05) Cumulated GC skew (Plot, step=10000, store minimum)\n")
	minimum <- plot_cumulated_GC_skew(ecoli, step=10000)$min
	cat("(06) Minimum of cumulated GC skew is at", minimum, "- looking up that position in annotations database...\n")
	get_annotation(ecoli, minimum, variance=0) # variance=0 because I expect a "perfect" match for the gidA gene at the minimum
	cat("(07) Step 06 should have located the gidA gene at the minimum of the cumulated GC skew...\n",
	     "    The gidA gene is located next to the origin of replication, so we have (approximatly) located the oriC :-)\n")
	cat("(08) Fetch data from NCBI - Searching for \"AY394850\" (corona virus related to SARS)...\n")
	sars <- retrieve_genome("AY394850")
	cat("(09) GC_skew:", GC_skew(sars), "\n")
	cat("(10) cumulated_GC_skew (n=100):", cumulated_GC_skew(sars, 100), "\n")
}