#!/usr/bin/Rscript # Genomik und Bioinformatik II # Projekt 2 (Tumor classification by SVM) # written by Andreas Loibl and Corina Stemmer library(Biobase) library(ALL) library(e1071) library(multtest) library(MCRestimate) data(ALL) # 1.1 samples_b <- which(substr(ALL$BT,1,1)=="B") samples_bcr.abl.neg <- which(ALL$mol.biol %in% c("BCR/ABL", "NEG")) subset <- intersect(samples_b, samples_bcr.abl.neg) subset.data <- ALL[,subset] # 1.2 subset.data$label <- factor(subset.data$mol.biol) # 1.3 test <- sample(1:length(subset),6) test.data <- subset.data[,test] train.data <- subset.data[,-test] cat("Q: How many patients are in the training set?\n") cat("A:", dim(train.data)["Samples"], "\n") cat("Q: How many are labelled BCR/ABL\n") cat("A:", sum(train.data$mol.biol=="BCR/ABL"), "\n") cat("Q: and how many are labelled NEG?\n") cat("A:", sum(train.data$mol.biol=="NEG"), "\n") # 2 train.e <- t(exprs(train.data)) train.var <- apply(train.e,2,var) dev.new() hist(train.var) train.top1k <- names(tail(sort(train.var), 1000)) train.top1k.data <- train.data[train.top1k,] train.top1k.e <- t(exprs(train.top1k.data)) # 2.1 model <- svm(train.top1k.e, train.top1k.data$label, kernel="linear") print(summary(model)) train.pred <- predict(model, train.top1k.e) print(table(train.pred, train.top1k.data$label)) model <- svm(train.top1k.e, train.top1k.data$label, kernel="linear", cross=10) print(summary(model)) # 2.2 test.top1k.data <- test.data[train.top1k,] test.top1k.e <- t(exprs(test.top1k.data)) test.pred <- predict(model, test.top1k.e) print(table(test.pred, test.top1k.data$label)) # 2.3 rand.data <- train.top1k.data print(rand.data$label) rand.data$label <- sample(train.top1k.data$label) print(rand.data$label) rand.e <- t(exprs(rand.data)) model <- svm(rand.e, rand.data$label, kernel="linear", cross=10) print(summary(model)) # predict rand rand.pred <- predict(model, rand.e) print(table(rand.pred, rand.data$label)) # predict test rand.pred <- predict(model, test.top1k.e) print(table(rand.pred, test.top1k.data$label)) # 2.4 tmp <- mt.teststat(t(train.e), as.numeric(train.data$label == "NEG"), test="t") train.top100 <- which(order(tmp, decreasing=TRUE) <= 100) train.top100.data <- train.data[train.top100,] train.top100.e <- t(exprs(train.top100.data)) informative_genes <- function(data, labels, nrGenes) { which(order(mt.teststat(data, as.numeric(factor(labels))-1, test="t"), decreasing=TRUE) <= nrGenes) } top100 <- informative_genes(exprs(subset.data), subset.data$label, 100) top100.data <- subset.data[top100,] accuracy <- matrix(0,10,4,,list(c(1, 2, 10, 50, 75, 100, 150, 200, 500, 1000), c("polynomial", "sigmoid", "linear", "radial"))) for(genecount in rownames(accuracy)) { sdata <- informative_genes(exprs(train.data), train.data$label, as.numeric(genecount)) sdata.data <- train.data[sdata,] sdata.e <- t(exprs(sdata.data)) for(kernel in colnames(accuracy)) { model <- svm(sdata.e, sdata.data$label, kernel=kernel, cross=10) accuracy[genecount,kernel] <- model$tot.accuracy } } dev.new() plot(x=rep(1:10, 4), y=c(accuracy), col=rep(1:4, each=10), axes=FALSE, xlab="Gene count", ylab="Accuracy") for(i in 1:4) lines(accuracy[,i], col=i) axis(1, at=1:10, lab=rownames(accuracy)) axis(2) legend(8, min(accuracy)+8, colnames(accuracy), col=1:4, pch=19) # 3.1 cross_validate <- function(data, labels, steps=10, nrGenes) { e <- data x <- sample(1:dim(data)[1]) ids <- split(x,matrix(1:steps,steps,length(x))[1:length(x)]) single <- numeric(steps) for(i in 1:steps) { train.e <- e[-unlist(ids[i]),] train.labels <- labels[-unlist(ids[i])] test.labels <- labels[unlist(ids[i])] test.e <- e[unlist(ids[i]),] tmp <- mt.teststat(t(train.e), as.numeric(train.labels)-1, test="t") train.subset <- which(order(tmp, decreasing=TRUE) <= nrGenes) train.subset.e <- train.e[,train.subset] test.subset.e <- test.e[,train.subset] model <- svm(train.subset.e, train.labels, kernel="linear") pred <- predict(model, test.subset.e) single[i] <- sum(pred == test.labels)/length(test.labels)*100 } return(list(accuracies=single, tot.accuracy=mean(single))) } out_loop <- replicate(100, svm(rand.e, rand.data$label, kernel="linear", cross=10)$tot.accuracy) in_loop <- replicate(100, cross_validate(rand.e, rand.data$label, 10, 100)$tot.accuracy) t.test(in_loop, out_loop) dev.new() boxplot(data.frame(in_loop, out_loop)) # 3.2 nl_cross_validate <- function(known.patients, class.column, classification.fun, poss.parameters = list(), cross.outer = 10, cross.repeat = 3, cross.inner = cross.outer) { result <- MCRestimate(eset=known.patients, class.column=class.column, classification.fun=classification.fun, poss.parameters=poss.parameters, cross.outer=cross.outer, cross.repeat=cross.repeat, cross.inner=cross.inner) dev.new() plot(result) } nl_cross_validate(top100.data, "label", "SVM.wrap")