2
votes

Given a data frame with columns:

  • "length1" integer as characters
  • "length2" each element is a string of numbers

I would like to get the percentage of the length2 column with respect to the length1 column. So something like df$length2 / df$lenght1 *100. Pls see the following minimal example:

> df=data.frame(length1=c("10","12","14"))
> df$length2=list("2,3,4","4,5,3","3,2,6")
> df

length1 length2
1      10   2,3,4
2      12   4,5,3
3      14   3,2,6

> dfresult=df
> dfresult$resultInPercent=list("20,30,40","33,41,25","21,14,42")
> dfresult

  length1 length2 resultInPercent
1      10   2,3,4        20,30,40
2      12   4,5,3        33,41,25
3      14   3,2,6        21,14,42

I cant get it to work, my approach was:

dfresult=apply(df, 1, function(x) 
{

  lapply(lapply(lapply(x$length2,strsplit,split=","),as.numeric),function(y)
     {
        round(as.numeric(y)/as.numeric(x$length1)*100)
     }

  )
 } 
)

Error in lapply(lapply(x$length2, strsplit, split = ","), as.numeric) : (list) object cannot be coerced to type 'double'

I gave up here and got the feeling what I do is way to complicated.

3

3 Answers

7
votes

Another idea:

library(dplyr)
library(tidyr)

df %>%
  separate_rows(length2) %>% 
  mutate_all(funs(as.numeric(as.character(.)))) %>%
  group_by(length1) %>%
  summarise(l2 = list(length2), 
            l3 = list(round(100 * length2 / length1))) 

Which gives:

## A tibble: 3 x 3
#  length1        l2        l3
#    <dbl>    <list>    <list>
#1      10 <dbl [3]> <dbl [3]>
#2      12 <dbl [3]> <dbl [3]>
#3      14 <dbl [3]> <dbl [3]>

This store the results in lists making it easily accessible for further operations:

#Observations: 3
#Variables: 3
#$ length1 <dbl> 10, 12, 14
#$ l2      <list> [<2, 3, 4>, <4, 5, 3>, <3, 2, 6>]
#$ l3      <list> [<20, 30, 40>, <33, 42, 25>, <21, 14, 43>]
5
votes

Here's somewhat vectorized solution using data.table

library(data.table)
temp <- round(setDT(df)[, tstrsplit(length2, ",", fixed = TRUE, type.convert = TRUE)] /
              as.numeric(levels(df$length1))[df$length1] * 100)
df[, resultInPercent := do.call(paste, c(temp, sep = ","))]
df
#    length1 length2 resultInPercent
# 1:      10   2,3,4        20,30,40
# 2:      12   4,5,3        33,42,25
# 3:      14   3,2,6        21,14,43

Some benchmarks

library(data.table)
library(microbenchmark)
library(dplyr)
library(tidyr)

set.seed(123)
bigdf <- data.frame(length1 = sample(1e4),
                    length2 = I(replicate(1e4, "2,3,4", simplify = FALSE)))
bigdf2 <- copy(bigdf)

Steve <- function(df){ # changed `list` to `toStirng` so all outputs match
  df %>%
    separate_rows(length2) %>% 
    mutate_all(funs(as.numeric(as.character(.)))) %>%
    group_by(length1) %>%
    summarise(res = toString(round(100 * length2 / length1)))
}

David <- function(df) {
  temp <- round(setDT(df)[, tstrsplit(length2, ",", fixed = TRUE, type.convert = TRUE)] /
                as.numeric(levels(df$length1))[df$length1] * 100)
  df[, resultInPercent := do.call(paste, c(temp, sep = ","))]
  df
}

akrun <- function(df) {
  df["resultInPercent "] <- 
  mapply(function(x,y) toString(round(x/y)), 
       lapply(strsplit(as.character(df$length2), ","), as.numeric),
       as.numeric(as.character(df$length1))/100)
  df
}

microbenchmark(Steve(bigdf), David(bigdf2), akrun(bigdf))    
#          expr       min        lq      mean    median       uq      max neval cld
#  Steve(bigdf) 475.62891 488.96441 501.77668 497.47626 507.9581 571.5748   100   c
# David(bigdf2)  17.78974  18.16284  18.77208  18.36107  18.6625  29.8744   100 a  
#  akrun(bigdf) 145.98749 149.93839 154.36653 151.82216 154.4384 218.4145   100  b 
2
votes

As the columns are factor class, we split the 'length2' after converting to character class by the delimiter ,, convert the elements in list to numeric, use mapply to divide the elements of the list with the corresponding vector elements of 'length1', round the output and convert to a single string (toString is a wrapper for paste(., collapse=", "))

mapply(function(x,y) toString(round(x/y)), 
    lapply(strsplit(as.character(df$length2), ","), as.numeric),
      as.numeric(as.character(df$length1))/100)
#[1] "20, 30, 40" "33, 42, 25" "21, 14, 43"