Merge columns of a dataframe by two conditions using aggregate

5

votes

I have a matrix like this

  P   A   B  C 
  1   2   0  5
  2   1   1  3
  3   0   4  7
  1   1   1  0
  3   1   1  0
  3   0   2  1
  2   3   3  4

I want to merge/sort the rows by P and by each of the columns. So that each P value is for each column once and the value for each P in each column is summed up. The result should be:

I tried already aggregate but it only helps me to sum up every P value for all columns so that I have just one row for each P.

r aggregate

because each p row will appear for each column. – Miguel123

it is something like mixture of merge and split – Miguel123

4

votes

One idea is to split your data frame on P and apply a custom function(fun1) which creates a matrix with 0 and replaces the diagonal with the sum of the columns. i.e.

fun1 <- function(x){
m1 <- matrix(0, ncol = ncol(x) - 1, nrow = ncol(x) - 1)
diag(m1) <- sapply(x[-1], sum)
return(m1)
       }

l1 <- split(df, df$P)
do.call(rbind, lapply(l1, fun1))

#       [,1] [,2] [,3]
# [1,]    3    0    0
# [2,]    0    1    0
# [3,]    0    0    5
# [4,]    4    0    0
# [5,]    0    4    0
# [6,]    0    0    7
# [7,]    1    0    0
# [8,]    0    7    0
# [9,]    0    0    8

Or to get it to your desired output, then

final_df <- as.data.frame(cbind(rep(names(l1), each = ncol(df)-1), 
                                             do.call(rbind, lapply(l1, fun1))))
names(final_df) <- names(df)

final_df
#  P A B C
#1 1 3 0 0
#2 1 0 1 0
#3 1 0 0 5
#4 2 4 0 0
#5 2 0 4 0
#6 2 0 0 7
#7 3 1 0 0
#8 3 0 7 0
#9 3 0 0 8

3

votes

Another idea is to use diag function itself to create a matrix. Then you can rbind these matrices together.

xx=aggregate(. ~ P, df, sum)
yy=xx[,-1]
yy=as.data.frame(t(yy))
cbind(rep(1:ncol(yy),nrow(yy)),do.call("rbind", lapply(yy, function(xx) diag(xx, nrow = nrow(yy), ncol = nrow(yy)))))

      [,1] [,2] [,3] [,4]
 [1,]    1    3    0    0
 [2,]    2    0    1    0
 [3,]    3    0    0    5
 [4,]    1    4    0    0
 [5,]    2    0    4    0
 [6,]    3    0    0    7
 [7,]    1    1    0    0
 [8,]    2    0    7    0
 [9,]    3    0    0    8

3

votes

We get the maximum value of frequency count from 'P' column ('i1'), aggregate the columns grouped by 'P' to get the sum ('df2'), replicate the rows of 'df2' by the 'i1', split the dataset by 'P' and change the non-diagonal elements in other columns to 0 and return it as data.frame, order and change the row names to NULL.

i1 <- max(table(df1$P))
df2 <- aggregate(.~P, df1, sum)
df3 <-  df2[rep(1:nrow(df2), i1)]
res <- unsplit(lapply(split(df3, df3$P), function(x) {
         x[-1] <- diag(3)*x[-1]
         x}), df3$P)
res1 <- res[order(res$P),]
row.names(res1) <- NULL
res1
#  P A B C
#1 1 3 0 0
#2 1 0 1 0
#3 1 0 0 5
#4 2 4 0 0
#5 2 0 4 0
#6 2 0 0 7
#7 3 1 0 0
#8 3 0 7 0
#9 3 0 0 8

Or using data.table, convert the 'data.frame' to 'data.table' (setDT(df1)), loop through the Subset of Data.table (.SD), get the sum, grouped by 'P', replicate the rows of the summarised dataset and change the non-diagonal elements to 0 (as discussed in the first solution).

library(data.table)
setDT(df1)[, lapply(.SD, sum), by = P
           ][rep(1:.N, i1)
            ][, .SD*diag(ncol(df1)-1), by = P]
#   P A B C
#1: 1 3 0 0
#2: 1 0 1 0
#3: 1 0 0 5
#4: 2 4 0 0
#5: 2 0 4 0
#6: 2 0 0 7
#7: 3 1 0 0
#8: 3 0 7 0
#9: 3 0 0 8

Or using dplyr

library(dplyr)
library(purrr)
d1 <- as.data.frame(diag(i1))
df2 <-  df1 %>% 
             group_by(P) %>% 
             summarise_each(funs(sum)) %>% 
             replicate(i1, ., simplify = FALSE) %>%
             bind_rows() %>% 
             arrange(P)
df2[-1] <- map2(df2[-1], d1, ~.x * .y)
df2
# A tibble: 9 × 4
#      P     A     B     C
#   <int> <dbl> <dbl> <dbl>
#1     1     3     0     0
#2     1     0     1     0
#3     1     0     0     5
#4     2     4     0     0
#5     2     0     4     0
#6     2     0     0     7
#7     3     1     0     0
#8     3     0     7     0
#9     3     0     0     8

1

votes

Unless I'm missing something, the following looks valid too. Start by computing the sums per "P":

s = as.matrix(rowsum(dat[-1], dat$P))

Create the final matrix:

k = s[rep(1:nrow(s), each = ncol(s)), ]

Compute indices to replace with "0"s:

k[col(k) != (row(k) - 1) %% ncol(k) + 1] = 0
k
#  A B C
#1 3 0 0
#1 0 1 0
#1 0 0 5
#2 4 0 0
#2 0 4 0
#2 0 0 7
#3 1 0 0
#3 0 7 0
#3 0 0 8

data:

dat = structure(list(P = c(1L, 2L, 3L, 1L, 3L, 3L, 2L), A = c(2L, 1L, 
0L, 1L, 1L, 0L, 3L), B = c(0L, 1L, 4L, 1L, 1L, 2L, 3L), C = c(5L, 
3L, 7L, 0L, 0L, 1L, 4L)), .Names = c("P", "A", "B", "C"), class = "data.frame", row.names = c(NA, 
-7L))

Having computed s, user20650 's more straightforward alternative:

matrix(diag(ncol(s)), nrow(s) * ncol(s), ncol(s), byrow = TRUE) * c(t(s))

or, also, messing with other interesting alternatives on the same idea:

kronecker(rep_len(1, nrow(s)), diag(ncol(s))) * c(t(s))

diag(ncol(s))[rep(1:ncol(s), nrow(s)), ] * s[rep(1:nrow(s), each = ncol(s)), ]

Merge columns of a dataframe by two conditions using aggregate

4 Answers