0
votes

I have a wide-format data.table like so:

library(data.table)
dt_wide <- data.table(
  "id" = seq(1:10),
  "yw_1001" = trunc( runif(10,0,100) ),
  "yw_1002" = trunc( runif(10,0,100) ),
  "yw_1003" = trunc( runif(10,0,100) ),
  "yw_1004" = trunc( runif(10,0,100) ),
  "yw_1005" = trunc( runif(10,0,100) ),
  "yw_1006" = trunc( runif(10,0,100) ),
  "yw_1007" = trunc( runif(10,0,100) ),
  "yw_1008" = trunc( runif(10,0,100) ),
  "yw_1009" = trunc( runif(10,0,100) ),
  "yw_1010" = trunc( runif(10,0,100) ),
  "yw_1011" = trunc( runif(10,0,100) ),
  "yw_1012" = trunc( runif(10,0,100) ),
  "yw_1013" = trunc( runif(10,0,100) ),
  "yw_1014" = trunc( runif(10,0,100) ),
  "yw_1015" = trunc( runif(10,0,100) ),
  "yw_1016" = trunc( runif(10,0,100) ),
  "yw_1017" = trunc( runif(10,0,100) ),
  "yw_1018" = trunc( runif(10,0,100) ),
  "yw_1019" = trunc( runif(10,0,100) ),
  "yw_1020" = trunc( runif(10,0,100) ),
  "yw_1021" = trunc( runif(10,0,100) ),
  "yw_1022" = trunc( runif(10,0,100) ),
  "yw_1023" = trunc( runif(10,0,100) ),
  "yw_1024" = trunc( runif(10,0,100) ),
  "yw_1025" = trunc( runif(10,0,100) ),
  "yw_1026" = trunc( runif(10,0,100) ),
  "yw_1027" = trunc( runif(10,0,100) ),
  "yw_1028" = trunc( runif(10,0,100) ),
  "yw_1029" = trunc( runif(10,0,100) ),
  "yw_1030" = trunc( runif(10,0,100) ),
  "yw_1031" = trunc( runif(10,0,100) ),
  "yw_1032" = trunc( runif(10,0,100) ),
  "yw_1033" = trunc( runif(10,0,100) ),
  "yw_1034" = trunc( runif(10,0,100) ),
  "yw_1035" = trunc( runif(10,0,100) ),
  "yw_1036" = trunc( runif(10,0,100) ),
  "yw_1037" = trunc( runif(10,0,100) ),
  "yw_1038" = trunc( runif(10,0,100) ),
  "yw_1039" = trunc( runif(10,0,100) ),
  "yw_1040" = trunc( runif(10,0,100) ),
  "yw_1041" = trunc( runif(10,0,100) ),
  "yw_1042" = trunc( runif(10,0,100) ),
  "yw_1043" = trunc( runif(10,0,100) ),
  "yw_1044" = trunc( runif(10,0,100) ),
  "yw_1045" = trunc( runif(10,0,100) ),
  "yw_1046" = trunc( runif(10,0,100) ),
  "yw_1047" = trunc( runif(10,0,100) ),
  "yw_1048" = trunc( runif(10,0,100) ),
  "yw_1049" = trunc( runif(10,0,100) ),
  "yw_1050" = trunc( runif(10,0,100) ),
  "yw_1051" = trunc( runif(10,0,100) ),
  "yw_1052" = trunc( runif(10,0,100) )
  )

The cols correspond to year (first two digits) and week number (last to digits).

In my actual dataset (nrow=5,500,000, ncol=1400), I cannot data.table::melt because it would create a data.table over the row limit.

In reality, I only need values from certain week numbers. Take the following data.table

dt2 <- data.table(
  "id" = seq(1:10),
  "date" = sample(seq(as.Date('2010/01/01'), as.Date('2010/12/31'), by="day"), 10)
)

I need to keep the values in dt_wide that are 5, 10, and 15 weeks after the date in in dt2 for each unique id. Ideally, want to reduce col size in dt_wide so I can melt to long format.

Any suggestions?

2
Please use set.seed while generating random data. Also show expected output for the data shared. It is not clear to me how dt2 is related to dt_wide. Also how column name yw_1001 corresponds to year and week number.Ronak Shah

2 Answers

0
votes

Here is an easy way that you can do with some mutations capitalizing on dplyr, tidyr, stringr and lubridate packages:

# Calling required libraries
library(data.table)
library(dplyr)

# Creating dataframe
dt_wide <- data.table(
  "id" = seq(1:10),
  "yw_1001" = trunc( runif(10,0,100) ),
  "yw_1002" = trunc( runif(10,0,100) ),
  "yw_1003" = trunc( runif(10,0,100) ),
  "yw_1004" = trunc( runif(10,0,100) ),
  "yw_1005" = trunc( runif(10,0,100) ),
  "yw_1006" = trunc( runif(10,0,100) ),
  "yw_1007" = trunc( runif(10,0,100) ),
  "yw_1008" = trunc( runif(10,0,100) ),
  "yw_1009" = trunc( runif(10,0,100) ),
  "yw_1010" = trunc( runif(10,0,100) ),
  "yw_1011" = trunc( runif(10,0,100) ),
  "yw_1012" = trunc( runif(10,0,100) ),
  "yw_1013" = trunc( runif(10,0,100) ),
  "yw_1014" = trunc( runif(10,0,100) ),
  "yw_1015" = trunc( runif(10,0,100) ),
  "yw_1016" = trunc( runif(10,0,100) ),
  "yw_1017" = trunc( runif(10,0,100) ),
  "yw_1018" = trunc( runif(10,0,100) ),
  "yw_1019" = trunc( runif(10,0,100) ),
  "yw_1020" = trunc( runif(10,0,100) ),
  "yw_1021" = trunc( runif(10,0,100) ),
  "yw_1022" = trunc( runif(10,0,100) ),
  "yw_1023" = trunc( runif(10,0,100) ),
  "yw_1024" = trunc( runif(10,0,100) ),
  "yw_1025" = trunc( runif(10,0,100) ),
  "yw_1026" = trunc( runif(10,0,100) ),
  "yw_1027" = trunc( runif(10,0,100) ),
  "yw_1028" = trunc( runif(10,0,100) ),
  "yw_1029" = trunc( runif(10,0,100) ),
  "yw_1030" = trunc( runif(10,0,100) ),
  "yw_1031" = trunc( runif(10,0,100) ),
  "yw_1032" = trunc( runif(10,0,100) ),
  "yw_1033" = trunc( runif(10,0,100) ),
  "yw_1034" = trunc( runif(10,0,100) ),
  "yw_1035" = trunc( runif(10,0,100) ),
  "yw_1036" = trunc( runif(10,0,100) ),
  "yw_1037" = trunc( runif(10,0,100) ),
  "yw_1038" = trunc( runif(10,0,100) ),
  "yw_1039" = trunc( runif(10,0,100) ),
  "yw_1040" = trunc( runif(10,0,100) ),
  "yw_1041" = trunc( runif(10,0,100) ),
  "yw_1042" = trunc( runif(10,0,100) ),
  "yw_1043" = trunc( runif(10,0,100) ),
  "yw_1044" = trunc( runif(10,0,100) ),
  "yw_1045" = trunc( runif(10,0,100) ),
  "yw_1046" = trunc( runif(10,0,100) ),
  "yw_1047" = trunc( runif(10,0,100) ),
  "yw_1048" = trunc( runif(10,0,100) ),
  "yw_1049" = trunc( runif(10,0,100) ),
  "yw_1050" = trunc( runif(10,0,100) ),
  "yw_1051" = trunc( runif(10,0,100) ),
  "yw_1052" = trunc( runif(10,0,100) )
)

# Creating dataframe with point of interest
dt2 <- data.table(
  "id" = seq(1:10),
  "date" = sample(seq(as.Date('2010/01/01'), as.Date('2010/12/31'), by="day"), 10)
)

# Mutating data to get only required columns
columns_to_select <-
  dt2 %>%
  # Getting dates after 5/10/15 weeks
  mutate(after5 = date + (7 * 5),
         after10 = date + (7 * 10),
         after15 = date + (7 * 15)) %>%
  # Converting dates from wide format to long format
  tidyr::gather(key = "key", value = req_date, -c(id, date)) %>%
  # Converting date into respective column name in dt_wide dataframe
  mutate(year = format(as.Date(req_date), "%y"),
         week = stringr::str_pad(lubridate::week(req_date), 2, "left", "0"),
         select_date = paste0("yw_", year, week)) %>%
  # Selecting only required column into a vector
  select(select_date) %>%
  pull()

# Choosing from the wide dataframe only required columns
dt_wide %>%
  select(id, contains(columns_to_select))

# id yw_1024 yw_1044 yw_1017 yw_1014 yw_1045 yw_1031 yw_1035 yw_1029 yw_1049 yw_1022 yw_1019 yw_1050 yw_1036 yw_1040 yw_1034 yw_1027 yw_1041
# 1:  1      59       7      11       7      93      19      83      48      75      94      19       9      93      41       6      26      18
# 2:  2      84      22      18      70      29      53      63      26      23      12      93      84      17      57      96      93      98
# 3:  3       4      72      56      35      65      73      58      91      27      65      58       5      62      13      36      79      26
# 4:  4      36       5      26      56      34      27      60      64      79      27      40      64      32       0      96      56      19
# 5:  5      44      82      78      23      71      78      36      43      63      95      91      37      21      87      63      73      25
# 6:  6      46      45      81      89      59       0      85       3      68      23      90      82      93      42      28      67      32
# 7:  7      56      32       7      26      49      31      79      93      14      45      25      79      39      64      64      86      91
# 8:  8      82      99      46      79      81      56      39      10      20      27      83      29      30      30      35      96      24
# 9:  9      10      87      28      40      51      41      95      43      62      59      44      19      72      76      27      65      36
# 10: 10      81      19      44      55      22      53      98      54      16      29      30      28      20       2       5      39      23
0
votes

Here's a simple solution using data.table (and the lubridate package for dates). Since your issue appears to build from memory constraints, simply call melt() for each row in your (small) ID table using only the necessary columns. Note, too, that we have to check to see if a given date exists in dt_wide because some dates are not present.

rbindlist(lapply(1:nrow(dt2), function(x) {
  rowid <- dt2[x, id]
  rowdate <- dt2[x, date]
  
  dates <- rowdate + lubridate::weeks(c(5, 10, 15))
  cols <- paste0("yw_", strftime(dates, "%y%W"))
  
  # Because some dates aren't in the table
  cols <- intersect(cols, colnames(dt_wide))
  if (!length(cols)) return(NULL)
  
  melt(dt_wide[id == rowid, c("id", cols), with = F], id.vars = "id")
}))