2
votes

I have loaded a table of integer data with 2,200 columns. What I'd like to do is condense the data down by averaging the values in every 5 columns and placing that in a new column in a new table.

For example, if I had:

Col1 | Col2 | Col3 | Col4 | Col5 | Col6 | Col7 | Col8 | Col9 | Col10
  2      4      6      8     10     12     14     16     18     20

I would get:

Col1 | Col2
  6     16 

Which is just the average of the values in columns 1-5 from the original table in Col1 and the average of the values in columns 6-10 in Col2.

I haven't quite wrapped my head around R syntax, so any help would be appreciated.

5

5 Answers

3
votes

Here's one approach that's applicable if the number of elements to be grouped is divisible by n (5, in your case):

x <- 1:100
n <- 5
tapply(x, rep(seq(1, length(x), n), each=n), mean)

 # 1  6 11 16 21 26 31 36 41 46 51 56 61 66 71 76 81 86 91 96 
 # 3  8 13 18 23 28 33 38 43 48 53 58 63 68 73 78 83 88 93 98 

The first row of output contains element names, and the second row contains means of successive groups of n elements.

To apply this to all rows of a matrix or data.frame, you can do, e.g.:

m <- matrix(1:1000, ncol=100)
apply(m, 1, function(x) tapply(x, rep(seq(1, length(x), n), each=n), mean))

EDIT

This alternative approach will give you some performance gains due to vectorisation with rowMeans:

t(mapply(function(x, y) rowMeans(m[, x:y]),
         seq(1, ncol(m), n), seq(n, ncol(m), n)))
3
votes

Oops, I see this is the comment of @user20650 in @jbaums answer. The rowsum function splits rows of a matrix by a factor, and sums the columns of each split. So for

m <- matrix(1:1000, ncol=100)
n <- 5

we have

rowsum(t(m), rep(seq_len(ncol(m) / n), each=n)) / n

This is fast, if that's important

library(microbenchmark)
f0 = function(m, n) rowsum(t(m), rep(seq_len(ncol(m) / n), each=n)) / n
f1 = function(m, n) 
    apply(m, 1, function(x) tapply(x, rep(seq(1, length(x), n), each=n), mean))
f2 = function(m, n)
    t(mapply(function(x, y) rowMeans(m[, x:y]),
        seq(1, ncol(m), n), seq(n, ncol(m), n)))

all.equal(f0(m, n), f1(m, n), check.attributes=FALSE)
## [1] TRUE

all.equal(f0(m, n), f2(m, n), check.attributes=FALSE)
## [1] TRUE    

microbenchmark(f0(m, n), f1(m, n), f2(m, n))
## Unit: microseconds
##      expr      min        lq   median        uq      max neval
##  f0(m, n)  164.351  170.1675  176.730  187.8570  237.419   100
##  f1(m, n) 8060.639 8513.3035 8696.742 8908.5190 9771.019   100
##  f2(m, n)  540.894  588.3820  603.787  634.1615  732.209   100
1
votes

Here's another approach using a loop and rowMeans instead, in case you prefer a loop in this case. Will work for matrices, but needs adjustment for vectors.

# example data
dat <- as.data.frame( matrix(1:20,ncol=10,byrow=TRUE) )
# pick range
range <- 5

ind <- seq(1,ncol(dat),range)
newdat <- NULL

for(i in ind){
  newcol <- rowMeans(dat[,i:(i+range-1)])
  newdat <- cbind(newdat, newcol)
}

Will result in:

> newdat
     newcol newcol
[1,]      3      8
[2,]     13     18
1
votes

@jbaums answer looks pretty good. Since I had already started this answer, I thought I would post my solution as well.

#Make some fake data
require(data.table)
data <- data.table(t(iris[,1:4]))

#Transpose since rows are easier to deal with than columns
data <- data.table(t(data))
data[ , row := .I]

#Sum by every 5 rows
data <- data[ , lapply(.SD,sum), by=cut(row,seq(0,nrow(data),5))]

#Transpose back to original results
result <- data.table(t(data))
1
votes

If you wanted to get the means of the elements from col1-col5, col6-col10, etc.

m1 <- matrix(c(rep(1:100, 2), 1:20), ncol=22)
n <- 5
p1 <- prod(dim(m1))
n1 <- nrow(m1)*n
n2 <- p1-p1%%n1

c(rowMeans(matrix(m1[1:n2], nrow=p1%/%n1, byrow=TRUE)), mean(m1[(n2+1):p1]))
#[1]  25.5 75.5 25.5 75.5 10.5

Or

 sapply(seq(1,ncol(m1), by=n), function(i) mean(m1[,i:(min(c(i+n-1), ncol(m1)))]) )
 #[1] 25.5 75.5 25.5 75.5 10.5

With some labels

 indx <- seq(1,n2/nrow(m1), by=n)
 indx1 <- paste("Col",paste(indx, indx+4, sep="-"),sep="_")
 indx2 <- paste("Col", paste(seq(p1%%n1+1, ncol(m1)),collapse="-"), sep="_")
 c(rowMeans(matrix(m1[1:n2], nrow=p1%/%n1, byrow=TRUE, dimnames=list(indx1, NULL))), setNames(mean(m1[(n2+1):p1]), indx2))
 # Col_1-5  Col_6-10 Col_11-15 Col_16-20 Col_21-22 
 #  25.5      75.5      25.5      75.5      10.5 

Update

I realized that you wanted the rowMeans by splitting up columns 1:5, 6:10, 11:15 etc. If that is the case:

 res1 <- cbind( colMeans(aperm(array(m1[1:n2], dim=c(nrow(m1), n, p1%/%n1)), c(2,1,3))),
                 rowMeans(m1[,(ncol(m1)-ncol(m1)%%n+1):ncol(m1)]))

which is equal to manual splitting the columns

 res2 <- cbind(rowMeans(m1[,1:5]), rowMeans(m1[,6:10]), rowMeans(m1[,11:15]), 
              rowMeans(m1[,16:20]), rowMeans(m1[,21:22]))
   identical(res1,res2)
  #[1] TRUE

  colnames(res1) <- c(indx1,indx2)
  res1
  #     Col_1-5 Col_6-10 Col_11-15 Col_16-20 Col_21-22
  #[1,]      21       71        21        71         6
  #[2,]      22       72        22        72         7
  #[3,]      23       73        23        73         8
  #[4,]      24       74        24        74         9
  #[5,]      25       75        25        75        10
  #[6,]      26       76        26        76        11
  #[7,]      27       77        27        77        12
  #[8,]      28       78        28        78        13
  #[9,]      29       79        29        79        14
 #[10,]      30       80        30        80        15