3
votes

I have been clustering some of my values and then grouping them. I then plot some density plots using ggplot2 and overlay the clusters. An example image is below: enter image description here

For each group in the cluster I plot a density plot and overlay them. The colouring in the density plots corresponds to the groupings in the clustering.

My problem is, I have split the data manually based on the groupings and put them in their own individual text table (see code below). This is very inefficient and can be come very tedious for large data sets. How can I dynamically plot the density plots in ggplot2 without separating the clusters into their own individual text tables?

The original input tables looks like this before it was split:

scores <- read.table(textConnection("
file        max        min        avg               lowest
132         5112.0     6520.0     5728.0            5699.0
133         4720.0     6064.0     5299.0            5277.0
5           4617.0     5936.0     5185.0            5165.0
1           4384.0     5613.0     4917.0            4895.0
1010        5008.0     6291.0     5591.0            5545.0
104         4329.0     5554.0     4858.0            4838.0
105         4636.0     5905.0     5193.0            5165.0
35          4304.0     5578.0     4842.0            4831.0
36          4360.0     5580.0     4891.0            4867.0
37          4444.0     5663.0     4979.0            4952.0
31          4328.0     5559.0     4858.0            4839.0
39          4486.0     5736.0     5031.0            5006.0
32          4334.0     5558.0     4864.0            4843.0
"), header=TRUE)

The code I used to generate the plot: Please note combining the base graphics with grid is still not working correctly

library(ggplot2)
library(grid)

layout(matrix(c(1,2,3,1,4,5), 2, 3, byrow = TRUE))

# define function to create multi-plot setup (nrow, ncol)
vp.setup <- function(x,y){
grid.newpage()
pushViewport(viewport(layout = grid.layout(x,y)))
}

# define function to easily access layout (row, col)
vp.layout <- function(x,y){
viewport(layout.pos.row=x, layout.pos.col=y)
}

vp.setup(2,3)

file_vals <- read.table(textConnection("
file        avg_vals
133         1.5923
132         1.6351
1010        1.6532
104         1.6824
105         1.6087
39          1.8694
32          1.9934
31          1.9919
37          1.8638
36          1.9691
35          1.9802
1           1.7283
5           1.7637
"), header=TRUE)

red <- read.table(textConnection("
file        max        min        avg               lowest
31          4328.0     5559.0     4858.0            4839.0
32          4334.0     5558.0     4864.0            4843.0
36          4360.0     5580.0     4891.0            4867.0
35          4304.0     5578.0     4842.0            4831.0
"), header=TRUE)

blue <- read.table(textConnection("
file        max        min        avg               lowest
133         4720.0     6064.0     5299.0            5277.0
105         4636.0     5905.0     5193.0            5165.0
104         4329.0     5554.0     4858.0            4838.0
132         5112.0     6520.0     5728.0            5699.0
1010        5008.0     6291.0     5591.0            5545.0
"), header=TRUE)

green <- read.table(textConnection("
file        max        min        avg               lowest
39          4486.0     5736.0     5031.0            5006.0
37          4444.0     5663.0     4979.0            4952.0
5           4617.0     5936.0     5185.0            5165.0
1           4384.0     5613.0     4917.0            4895.0
"), header=TRUE)


# Perform Cluster
d <- dist(file_vals$avg_vals, method = "euclidean")
fit <- hclust(d, method="ward")
plot(fit, labels=file_vals$file)
groups <- cutree(fit, k=3)

cols = c('red', 'blue', 'green', 'purple', 'orange', 'magenta', 'brown', 'chartreuse4','darkgray','cyan1')
rect.hclust(fit, k=3, border=cols)


# Desnity plots
dat = rbind(data.frame(Cluster='Red', max_vals = red$max), data.frame(Cluster='Blue', max_vals = blue$max), data.frame(Cluster='Green', max_vals = green$max))
max = (ggplot(dat,aes(x=max_vals)))
max = max + geom_density(aes(fill=factor(Cluster)), alpha=.3) + xlim(c(3500, 5500)) + scale_fill_manual(values=c("red",'blue',"green"))
max = max + labs(fill = 'Clusters')
print(max, vp=vp.layout(1,2))

dat = rbind(data.frame(Cluster='Red', min_vals = red$min), data.frame(Cluster='Blue', min_vals = blue$min), data.frame(Cluster='Green', min_vals = green$min))
min = (ggplot(dat,aes(x=min_vals)))
min = min + geom_density(aes(fill=factor(Cluster)), alpha=.3) + xlim(c(5000, 7000)) + scale_fill_manual(values=c("red",'blue',"green"))
min = min + labs(fill = 'Clusters')
print(min, vp=vp.layout(1,3))

dat = rbind(data.frame(Cluster='Red', avg_vals = red$avg), data.frame(Cluster='Blue', avg_vals = blue$avg), data.frame(Cluster='Green', avg_vals = green$avg))
avg = (ggplot(dat,aes(x=avg_vals)))
avg = avg + geom_density(aes(fill=factor(Cluster)), alpha=.3) + xlim(c(4000, 6000)) + scale_fill_manual(values=c("red",'blue',"green"))
avg = avg + labs(fill = 'Clusters')
print(avg, vp=vp.layout(2,2))

dat = rbind(data.frame(Cluster='Red', lowest_vals = red$lowest), data.frame(Cluster='Blue', lowest_vals = blue$lowest), data.frame(Cluster='Green', lowest_vals = green$lowest))
lowest = (ggplot(dat,aes(x=lowest_vals)))
lowest = lowest + geom_density(aes(fill=factor(Cluster)), alpha=.3) + xlim(c(4000, 6000)) + scale_fill_manual(values=c("red",'blue',"green"))
lowest = lowest + labs(fill = 'Clusters')
print(lowest, vp=vp.layout(2,3))
1

1 Answers

1
votes

In this way you can automatically create your desired plot with 4 panels.

First, the data:

scores <- read.table(textConnection("
file        max        min        avg               lowest
132         5112.0     6520.0     5728.0            5699.0
133         4720.0     6064.0     5299.0            5277.0
5           4617.0     5936.0     5185.0            5165.0
1           4384.0     5613.0     4917.0            4895.0
1010        5008.0     6291.0     5591.0            5545.0
104         4329.0     5554.0     4858.0            4838.0
105         4636.0     5905.0     5193.0            5165.0
35          4304.0     5578.0     4842.0            4831.0
36          4360.0     5580.0     4891.0            4867.0
37          4444.0     5663.0     4979.0            4952.0
31          4328.0     5559.0     4858.0            4839.0
39          4486.0     5736.0     5031.0            5006.0
32          4334.0     5558.0     4864.0            4843.0
"), header=TRUE)

file_vals <- read.table(textConnection("
file        avg_vals
                                   133         1.5923
                                   132         1.6351
                                   1010        1.6532
                                   104         1.6824
                                   105         1.6087
                                   39          1.8694
                                   32          1.9934
                                   31          1.9919
                                   37          1.8638
                                   36          1.9691
                                   35          1.9802
                                   1           1.7283
                                   5           1.7637
                                   "), header=TRUE)

Both data frames can be merged into a single one:

dat <- merge(scores, file_vals, by = "file")

Fit:

d <- dist(dat$avg_vals, method = "euclidean")
fit <- hclust(d, method="ward")
groups <- cutree(fit, k=3)
cols <- c('red', 'blue', 'green', 'purple', 'orange', 'magenta', 'brown', 'chartreuse4','darkgray','cyan1')

Add a column with the colour names (based on the fit):

dat$group <- cols[groups]

Reshape data from wide to long format:

dat_re <- reshape(dat, varying = c("max", "min", "avg", "lowest"), direction = "long", drop = c("file", "avg_vals"), v.names = "value", idvar = "group", times = c("max", "min", "avg", "lowest"), new.row.names = seq(nrow(scores) * 4))

Plot:

p <- (ggplot(dat_re ,aes(x = value))) +
geom_density(aes(fill = group), alpha=.3) +
scale_fill_manual(values=cols) +
labs(fill = 'Clusters') +
facet_wrap( ~ time)

print(p)

enter image description here