#!/usr/bin/Rscript
data <- read.table("ex29_hierarchical_clustering.txt", header=T, row.names=1)
data <- log2(data)
good <- as.matrix(data[,1:8])
poor <- as.matrix(data[,9:16])
genes.count <- dim(data)[1]

# top 50 variant genes
genes.top50 <- names(tail(sort(apply(data, 1, var)),50))
data.top50 <- data[genes.top50,]

# top 50 variant genes (good vs. poor)
tmp <- sapply(1:genes.count, function(row) var(good[row,], poor[row,]))
data.top50 <- as.matrix(data[which(order(tmp, decreasing=TRUE) <= 50),])

# euclid
d.euclid <- dist(data.top50)
plot(density(d.euclid))

# pearson
d.pearson <- as.dist(1 - cor(t(data.top50)))
plot(density(d.pearson))

# hierarchical clustering
h.euclid <- hclust(d.euclid)
h.pearson <- hclust(d.pearson)

par(mfrow=c(2,3))
plot(hclust(d.euclid, "single"))
plot(hclust(d.euclid, "average"))
plot(hclust(d.euclid, "complete"))
plot(hclust(d.pearson, "single"))
plot(hclust(d.pearson, "average"))
plot(hclust(d.pearson, "complete"))

# use samples for clustering
data.top50 <- t(data.top50)

# euclid
d.euclid <- dist(data.top50)
plot(density(d.euclid))

# pearson
d.pearson <- as.dist(1 - cor(t(data.top50)))
plot(density(d.pearson))

# hierarchical clustering
h.euclid <- hclust(d.euclid)
h.pearson <- hclust(d.pearson)

par(mfrow=c(2,3))
plot(hclust(d.euclid, "single"))
plot(hclust(d.euclid, "average"))
plot(hclust(d.euclid, "complete"))
plot(hclust(d.pearson, "single"))
plot(hclust(d.pearson, "average"))
plot(hclust(d.pearson, "complete"))

# kmeans clustering
cl <- kmeans(t(data.top50), 2)
plot(data.top50, col=cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex=2)