extract information from a data frame

Question

I have a data frame like below

# Example input: two factor columns whose cells have the form "<prefix>-<suffix>".
s1_cells <- c("3-4", "4-1", "5-4")
s2_cells <- c("2-4", "3-15", "7-16")
df <- data.frame(
  s1 = factor(s1_cells, levels = s1_cells),
  s2 = factor(s2_cells, levels = s2_cells)
)

Looks like below

> df
#   s1   s2
#1 3-4  2-4
#2 4-1 3-15
#3 5-4 7-16

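What I want to do is first find the values after the hyphen that occur more than once across the cells. For example, here 4 appears in the first row of s1, the first row of s2 and the third row of s1. The output should have one row per value found after the hyphen, where the first column is the value itself and: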

-The second column indicates how many times those values were found

-The third column shows how many of them are from first column of df

-The fourth column shows how many of them are from second column of df

-The fifth is which strings are from the first columns

-The sixth is which strings are from the second column

the output looks like this

Value    repeated     s1N   s2N   ss1    ss2
4           3         2      1    3,5     2
1           1         1      -     4      -
15          1         -      1     -      3
16          1         -      1     -      7

Output is unclear. Is your output exhaustive given your input dataset? It appears that there should be rows for Values of {3, 2, 5, 7} — Brandon Loudermilk
@BrandonLoudermilk I saw your edit, which was not correct. Let me explain. Please look at the data: what do you see after the hyphen? In the first column, named s1, I see 4, 1 and 4. In the second column I see 4, 15 and 16. So the output counts how many times each of these is repeated: 4 is repeated 3 times, while 1, 15 and 16 each occur only once. Is it clear now? — nik
@Mol I have rewritten the codes based on your real data, it should work now. — fhlgood

Jonas Coussement Jonas Coussement · Accepted Answer · 2016-03-01T16:40:13

First thing you will need to do is extract the numbers from your strings. Running:

# Split every cell at the hyphen; each cell yields two pieces, so the result is
# a character array indexed as [piece, row, column].
newdfstring <- apply(df, c(1, 2), function(cell) strsplit(cell, "-")[[1]])
# Same array, with every piece converted from text to a number.
newdf <- array(
  as.numeric(newdfstring),
  dim = dim(newdfstring),
  dimnames = dimnames(newdfstring)
)

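The first line splits the strings at the hyphen, and the second converts the pieces to numeric values. The result is a 3-dimensional array indexed as [piece, row, column], where piece 1 is the number before the hyphen and piece 2 is the number after it. You can use it to extract your values.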

First create a new dataframe:

# Number of rows in the output = number of distinct values after the hyphen.
dflength <- length(unique(as.vector(newdf[2, , ])))
# Zero-filled placeholder with one row per distinct value; the columns are
# filled in afterwards.
dfout <- as.data.frame(
  matrix(
    0,
    nrow = dflength,
    ncol = 6,
    dimnames = list(NULL, c("Value", "repeated", "s1N", "s2N", "ss1", "ss2"))
  )
)

The most obvious way (yet maybe not the most efficient) would then be to loop and match whatever it is you need:

# Fill the summary table, one row per distinct suffix (the number after the
# hyphen). newdf is indexed as [piece, row, column]: piece 1 is the number
# before the hyphen, piece 2 the number after it.
#
# The counts are computed by matching each value against each column directly.
# Indexing a per-column table() with a position taken from the table of all
# values is wrong, because the two tables do not share the same levels.

# Distinct suffixes, in order of first appearance (column s1, then column s2).
dfout$Value <- unique(as.vector(newdf[2, , ]))

# Number of cells in column `col` of df whose suffix equals `value`.
count_suffix <- function(value, col) {
  sum(newdf[2, , col] %in% value)
}

# Comma-separated prefixes of the cells in column `col` whose suffix equals
# `value`, or "0" when the column has no such cell.
list_prefixes <- function(value, col) {
  hits <- newdf[2, , col] %in% value
  if (any(hits)) toString(newdf[1, hits, col]) else "0"
}

# Total number of occurrences over all columns.
dfout$repeated <- vapply(
  dfout$Value,
  function(value) sum(newdf[2, , ] %in% value),
  numeric(1)
)
dfout$s1N <- vapply(dfout$Value, count_suffix, numeric(1), col = 1)
dfout$s2N <- vapply(dfout$Value, count_suffix, numeric(1), col = 2)
dfout$ss1 <- vapply(dfout$Value, list_prefixes, character(1), col = 1)
dfout$ss2 <- vapply(dfout$Value, list_prefixes, character(1), col = 2)
dfout
#  Value repeated s1N s2N  ss1 ss2
#1     4        3   2   1 3, 5   2
#2     1        1   1   0    4   0
#3    15        1   0   1    0   3
#4    16        1   0   1    0   7

EDIT: a generalized version that loops over any number of s columns

# Generalized summary for any number of columns in df.
#
# Every cell of df has the form "<prefix>-<suffix>". The result has one row per
# distinct suffix, with columns:
#   Value     the suffix
#   repeated  how many cells (over all columns) carry that suffix
#   <col>N    how many cells of column <col> carry that suffix
#   s<col>    comma-separated prefixes of those cells, or "0" if there are none
#
# Counts are obtained by matching each value against each column directly.
# Indexing a per-column table() with a position taken from the table of all
# values is wrong, because the two tables do not share the same levels.

# Split every cell at the hyphen and convert the pieces to numbers.
# newdf is indexed as [piece, row, column].
newdfstring <- apply(df, 1:2, function(v) unlist(strsplit(v, "-")))
newdf <- apply(newdfstring, 1:3, as.numeric)
# find the number of s variables (columns of df)
slength <- dim(newdf)[3]
# Re-shape both pieces as row-by-column matrices so that indexing by column
# stays valid even when df has a single row or a single column.
prefixes <- matrix(newdf[1, , ], ncol = slength)
suffixes <- matrix(newdf[2, , ], ncol = slength)
# distinct suffixes, in order of first appearance (column by column)
values <- unique(as.vector(suffixes))
dflength <- length(values)

# Start with the Value and repeated columns; counts stay numeric.
dfout <- data.frame(
  Value = values,
  repeated = vapply(values, function(v) sum(suffixes %in% v), numeric(1))
)
# one count column per s variable: <name>N
for (j in seq_len(slength)) {
  dfout[[paste0(names(df)[j], "N")]] <- vapply(
    values,
    function(v) sum(suffixes[, j] %in% v),
    numeric(1)
  )
}
# one prefix-list column per s variable: s<name>
for (j in seq_len(slength)) {
  dfout[[paste0("s", names(df)[j])]] <- vapply(
    values,
    function(v) {
      hits <- suffixes[, j] %in% v
      if (any(hits)) toString(prefixes[hits, j]) else "0"
    },
    character(1)
  )
}
dfout
#  Value repeated s1N s2N  ss1 ss2
#1     4        3   2   1 3, 5   2
#2     1        1   1   0    4   0
#3    15        1   0   1    0   3
#4    16        1   0   1    0   7

extract information from a data frame

4 Answers