9
votes

I have a data set that has the following information:

Subject    Value1    Value2    Value3      UniqueNumber
001        1         0         1           3
002        0         1         1           2
003        1         1         1           1

If the value of UniqueNumber > 0, I would like to sum the values with dplyr for each subject from rows 1 through UniqueNumber and calculate the mean. So for Subject 001, sum = 2 and mean = .67.

total = 0;
average = 0;
for(i in 1:length(Data$Subject)){
   for(j in 1:ncols(Data)){
   if(Data$UniqueNumber[i] > 0){
    total[i] = sum(Data[i,1:j])
    average[i] = mean(Data[i,1:j])
   }
}

Edit: I am only looking to sum through the number of columns listed in the 'UniqueNumber' column. So this is looping through every row and stopping at column listed in 'UniqueNumber'. Example: Row 2 with Subject 002 should sum up the values in columns 'Value1' and 'Value2', while Row 3 with Subject 003 should only sum the value in column 'Value1'.

6
You can try df %>% mutate(sum = ifelse(UniqueNumber > 0, rowSums(.[, 2:(length(.)-1)]), NA), mean = ifelse(UniqueNumber > 0, rowMeans(.[, 2:(length(.)-1)]), NA)). - tmfmnk
@tmfmnk I don't think your code will iterate through on the length of the UniqueNumber. It looks like my results are summing through the whole column and not stopping at the value of the column UniqueValue. - statsguyz

6 Answers

11
votes

Not a tidyverse fan/expert, but I would try this using long format. Then, just filter by row index per group and then run any functions you want on a single column (much easier this way).

library(tidyr)
library(dplyr)

Data %>% 
  gather(variable, value, -Subject, -UniqueNumber) %>% # long format
  group_by(Subject) %>% # group by Subject in order to get row counts
  filter(row_number() <= UniqueNumber) %>% # filter by row index
  summarise(Mean = mean(value), Total = sum(value)) %>% # do the calculations
  ungroup() 

## A tibble: 3 x 3
#  Subject  Mean Total
#     <int> <dbl> <int>
# 1       1 0.667     2
# 2       2 0.5       1
# 3       3 1         1

A very similar way to achieve this could be filtering by the integers in the column names. The filter step comes before the group_by so it could potentially increase performance (or not?) but it is less robust as I'm assuming that the cols of interest are called "Value#"

Data %>% 
  gather(variable, value, -Subject, -UniqueNumber) %>% #long format
  filter(as.numeric(gsub("Value", "", variable, fixed = TRUE)) <= UniqueNumber) %>% #filter
  group_by(Subject) %>% # group by Subject
  summarise(Mean = mean(value), Total = sum(value)) %>% # do the calculations
  ungroup()

## A tibble: 3 x 3
#  Subject  Mean Total
#     <int> <dbl> <int>
# 1       1 0.667     2
# 2       2 0.5       1
# 3       3 1         1

Just for fun, adding a data.table solution

library(data.table)

data.table(Data) %>% 
  melt(id = c("Subject", "UniqueNumber")) %>%
  .[as.numeric(gsub("Value", "", variable, fixed = TRUE)) <= UniqueNumber,
    .(Mean = round(mean(value), 3), Total = sum(value)),
    by = Subject]

#    Subject  Mean Total
# 1:       1 0.667     2
# 2:       2 0.500     1
# 3:       3 1.000     1
3
votes

Here is another method that uses tidyr::nest to collect the Values columns into a list so that we can iterate through the table with map2. In each row, we select the correct values from the Values list-col and take the sum or mean respectively.

library(tidyverse)
tbl <- read_table2(
"Subject    Value1    Value2    Value3      UniqueNumber
001        1         0         1           3
002        0         1         1           2
003        1         1         1           1"
)
tbl %>%
  filter(UniqueNumber > 0) %>%
  nest(starts_with("Value"), .key = "Values") %>%
  mutate(
    sum = map2_dbl(UniqueNumber, Values, ~ sum(.y[1:.x], na.rm = TRUE)),
    mean = map2_dbl(UniqueNumber, Values, ~ mean(as.numeric(.y[1:.x], na.rm = TRUE))),
  )
#> # A tibble: 3 x 5
#>   Subject UniqueNumber Values             sum  mean
#>   <chr>          <dbl> <list>           <dbl> <dbl>
#> 1 001                3 <tibble [1 × 3]>     2 0.667
#> 2 002                2 <tibble [1 × 3]>     1 0.5  
#> 3 003                1 <tibble [1 × 3]>     1 1

Created on 2019-02-14 by the reprex package (v0.2.1)

2
votes

Check this solution:

df %>%
  gather(key, val, Value1:Value3) %>%
  group_by(Subject) %>%
  mutate(
    Sum = sum(val[c(1:(UniqueNumber[1]))]),
    Mean = mean(val[c(1:(UniqueNumber[1]))]),
  ) %>%
  spread(key, val)

Output:

 Subject UniqueNumber   Sum  Mean Value1 Value2 Value3
  <chr>          <int> <dbl> <dbl>  <dbl>  <dbl>  <dbl>
1 001                3     2 0.667      1      0      1
2 002                2     1 0.5        0      1      1
3 003                1     1 1          1      1      1
2
votes

OP might be interested only for dplyr solution but for comparison purposes and for future readers a base R option using mapply

cols <- grep("^Value", names(df))

cbind(df, t(mapply(function(x, y) {
      if (y > 0) {
        vals = as.numeric(df[x, cols[1:y]])
        c(Sum = sum(vals, na.rm = TRUE), Mean = mean(vals, na.rm = TRUE))
       }
       else 
        c(0, 0)
},1:nrow(df), df$UniqueNumber)))

#  Subject Value1 Value2 Value3 UniqueNumber Sum  Mean
#1       1      1      0      1            3   2 0.667
#2       2      0      1      1            2   1 0.500
#3       3      1      1      1            1   1 1.000

Here we subset each row based on its respective UniqueNumber and then calculate it's sum and mean if the UniqueNumber value is greater than 0 or else return only 0.

1
votes

A solution that uses purrr::map_df(which is from the same author as dplyr).

library(dplyr)
library(purrr)
l_dat <- split(dat, dat$Subject) # first we need to split in a list

map_df(l_dat, function(x) {
  n_cols <- x$UniqueNumber # finds the number of columns
  x <- as.numeric(x[2:(n_cols+1)]) # subsets x and converts to numeric
  mean(x, na.rm=T) # mean to be returned
})
# output:
# # A tibble: 1 x 3
#     `1`   `2`   `3`
#   <dbl> <dbl> <dbl>
# 1 0.667   0.5     1

Another option (output format closer to a dplyr solution):

map_df(l_dat, function(x) {
  n_cols <- x$UniqueNumber
  id <- x$Subject
  x <- as.numeric(x[2:(n_cols+1)])
  tibble(id=id, mean_values=mean(x, na.rm=T))
})
# # A tibble: 3 x 2
# id mean_values
# <int>       <dbl>
# 1     1       0.667
# 2     2       0.5  
# 3     3       1   

Just as an example I added a sum() then divided by length(x)-1:

map_df(l_dat, function(x) {
  n_cols <- x$UniqueNumber
  id <- x$Subject
  x <- as.numeric(x[2:(n_cols+1)])
  tibble(id=id, 
                mean_values=sum(x, na.rm=T)/(length(x)-1)) # change here
})
# # A tibble: 3 x 2
# id mean_values
# <int>       <dbl>
# 1     1          1.
# 2     2          1.
# 3     3        Inf  #beware of this case where you end up dividing by 0

Data:

tt <- "Subject    Value1    Value2    Value3      UniqueNumber
001        1         0         1           3
002        0         1         1           2
003        1         1         1           1"

dat <- read.table(text=tt, header=T)
1
votes

I think the easiest way is to set to NA the zeros that really should be NA, then use rowSums and rowMeans on the appropriate subset of columns.

Data[2:4][(col(dat[2:4])>dat[[5]])] <- NA
Data
#   Subject Value1 Value2 Value3 UniqueNumber
# 1       1      1      0      1            3
# 2       2      0      1     NA            2
# 3       3      1     NA     NA            1

library(dplyr)
Data%>%
  mutate(sum  =  rowSums(.[2:4], na.rm = TRUE),
         mean = rowMeans(.[2:4], na.rm = TRUE))

#   Subject Value1 Value2 Value3 UniqueNumber sum      mean
# 1       1      1      0      1            3   2 0.6666667
# 2       2      0      1     NA            2   1 0.5000000
# 3       3      1     NA     NA            1   1 1.0000000

or transform(Data, sum = rowSums(Data[2:4],na.rm = TRUE), mean = rowMeans(Data[2:4],na.rm = TRUE)) to stay in base R.

data

Data <- structure(
  list(Subject = 1:3, 
       Value1 = c(1L, 0L, 1L), 
       Value2 = c(0L, 1L, NA), 
       Value3 = c(1L, NA, NA), 
       UniqueNumber = c(3L, 2L, 1L)), 
  .Names = c("Subject","Value1", "Value2", "Value3", "UniqueNumber"),
  row.names = c(NA, 3L), class = "data.frame")