0
votes

My df contains series of columns with similar names that are grouped every three columns, similar to this:

>df<-data.frame(c(0,1,4,5),c(0,1,3,3),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,8,1,9),c(6,1,1,1),c(5,1,3,4))

 >names(df)<-c("AA1","AA2","AA3","BB1","BB2","BB3","CC1","CC2","CC3")

> df

AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3

1 0 0 0 0 0 0 0 3 3

2 1 1 1 1 1 1 8 1 1

3 4 6 1 1 1 1 1 1 3

4 5 5 1 1 1 1 9 1 4

This essentially shows 3 different measurements (1,2,3) per examination type(AA,BB,CC) for 4 patients. In reality I have a huge dataset with 3 measurements for 10 different examinations on 2,000 patients. I would like to add a new column with classification of disease as follows: If the score for at least one measurement per examination (XX1,XX2,XX2 where XX=AA or BB or CC) is >4 then the patient has the disease. So the new dataset would look like that:

>

AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3 DISEASE

1 0 0 0 0 0 0 0 3 3 0

2 1 1 1 1 1 1 8 1 1 1

3 4 6 1 1 1 1 1 1 3 1

4 5 5 1 1 1 1 9 1 4 1

2
You should tidy up your data.Roland
@user3495945. I updated the code with explanationakrun

2 Answers

1
votes
df<-data.frame(c(0,1,4,5),c(0,1,3,3),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,8,1,9),c(6,1,1,1),c(5,1,3,4))

names(df)<-c("AA1","AA2","AA3","BB1","BB2","BB3","CC1","CC2","CC3")

A solution with your data format:

rowSums(df > 4) > 0
#[1]  TRUE  TRUE FALSE  TRUE

This uses the fact that logical values get coerced to 0 and 1 when calculating their sum.

But tidy data is usually preferable:

df$id <- rownames(df)
library(reshape2)
DF <- melt(df, id.var="id")
DF$exam <- gsub("[[:digit:]+]", "", DF$variable)
DF$meas <- as.numeric(gsub("[[:alpha:]+]", "", DF$variable))

head(DF)
#  id variable value exam meas
#1  1      AA1     0   AA    1
#2  2      AA1     1   AA    1
#3  3      AA1     4   AA    1
#4  4      AA1     5   AA    1
#5  1      AA2     0   AA    2
#6  2      AA2     1   AA    2


#Is patient diseased?
library(plyr)
ddply(DF, .(id), summarize, disease = any(value > 4))
#  id disease
#1  1    TRUE
#2  2    TRUE
#3  3   FALSE
#4  4    TRUE

#Which exam was positive?
ddply(DF, .(id, exam), summarize, disease = any(value > 4))
#   id exam disease
#1   1   AA   FALSE
#2   1   BB   FALSE
#3   1   CC    TRUE
#4   2   AA   FALSE
#5   2   BB   FALSE
#6   2   CC    TRUE
#7   3   AA   FALSE
#8   3   BB   FALSE
#9   3   CC   FALSE
#10  4   AA    TRUE
#11  4   BB   FALSE
#12  4   CC    TRUE
1
votes
df <- structure(list(AA1 = c(0, 1, 4, 5), AA2 = c(0, 1, 3, 3), AA3 = c(0, 
1, 1, 1), BB1 = c(0, 1, 1, 1), BB2 = c(0, 1, 1, 1), BB3 = c(0, 
1, 1, 1), CC1 = c(0, 8, 1, 9), CC2 = c(3, 1, 1, 1), CC3 = c(3, 
1, 3, 4)), .Names = c("AA1", "AA2", "AA3", "BB1", "BB2", "BB3", 
"CC1", "CC2", "CC3"), row.names = c(NA, -4L), class = "data.frame")

indx <- gsub("\\d+","", colnames(df)) #deletes the numberrs from colnames
lst <- split(seq_len(ncol(df)), indx) # seq_len(ncol(df)) #gives the sequence of column and split it by indx

 lapply(lst, function(i) df[,i]) #subset the data by column number

 lapply(lst, function(i) do.call(`pmax`, df[,i])>=4) #gives maximum value for each row and we test if that values is `>=4`

 rowSums(sapply(lst, function(i) do.call(`pmax`, df[,i])>=4))
 #[1] 0 1 1 2
 !!rowSums(sapply(lst, function(i) do.call(`pmax`, df[,i])>=4)) #double negation 
 #[1] FALSE  TRUE  TRUE  TRUE
 (!!rowSums(sapply(lst, function(i) do.call(`pmax`, df[,i])>=4))) +0
 #[1] 0 1 1 1

Putting the above code all at once

df$DISEASE <- (!!rowSums(sapply(split(seq_len(ncol(df)),
                  gsub("\\d+","", colnames(df))), function(i)
                    ( do.call(`pmax`, df[,i])>=4))))+0


 df
#   AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3 DISEASE
# 1   0   0   0   0   0   0   0   3   3       0
# 2   1   1   1   1   1   1   8   1   1       1
# 3   4   3   1   1   1   1   1   1   3       1
# 4   5   3   1   1   1   1   9   1   4       1

Or

colnames(df) <- gsub("([A-Za-z]+)(\\d+)", "\\1_\\2", colnames(df)) #created `-` between alphabets and numbers for the colnames
df$id <- 1:nrow(df) #created an id column

df1 <- reshape(df, idvar="id", varying=grep("[A-Z]", colnames(df)), direction="long", sep="_") #used reshape to get the columns starting with `AA` in one column, similarly for `BB` and `CC`

#Split by the reshaped dataset by id and look for any values that are >=4 in each list element
df$DISEASE <- sapply(split(df1[,-(1:2)], df$id), function(x) any(x >=4)) +0 
df[,-10]
#   AA_1 AA_2 AA_3 BB_1 BB_2 BB_3 CC_1 CC_2 CC_3 DISEASE
# 1    0    0    0    0    0    0    0    3    3       0
# 2    1    1    1    1    1    1    8    1    1       1
# 3    4    3    1    1    1    1    1    1    3       1
# 4    5    3    1    1    1    1    9    1    4       1