MAJOR EDIT to clarify as answers are wrong
I have a data.table with group columns (split_by), key columns (key_by) and trait ids columns (intersect_by)
I want in each group of split_by, keep only the rows where the trait ids are shared by all the present keys in the group.
For example:
dt <- data.table(id = 1:6, key1 = 1, key2 = c(1:2, 2), group_id1= 1, group_id2= c(1:2, 2:1, 1:2), trait_id1 = 1, trait_id2 = 2:1)
setkey(dt, group_id1, group_id2, trait_id1, trait_id2)
dt
id key1 key2 group_id1 group_id2 trait_id1 trait_id2
1: 4 1 1 1 1 1 1
2: 1 1 1 1 1 1 2
3: 5 1 2 1 1 1 2
4: 2 1 2 1 2 1 1
5: 6 1 2 1 2 1 1
6: 3 1 2 1 2 1 2
res <- intersect_this_by(dt,
key_by = c("key1"),
split_by = c("group_id1", "group_id2"),
intersect_by = c("trait_id1", "trait_id2"))
I want res to be like this:
> res[]
id key1 key2 group_id1 group_id2 trait_id1 trait_id2
1: 1 1 1 1 1 1 2
2: 5 1 2 1 1 1 2
3: 2 1 2 1 2 1 1
4: 6 1 2 1 2 1 1
5: 3 1 2 1 2 1 2
We see id 4 has been dropped as in group_id1 = 1 and group_id2 = 1 combination group (the group which id 4 belongs) there is only one combination of keys (1,1) which has these traits (1,1) whereas there are two keys combinations in this group: (1,1) and (1,2) so the traits (1,1) are not shared by all keys in this group so we drop this trait from this group, hence drop id 4. On the contrary, id 1 and 5 have same traits but different keys and they represent all the keys ( (1,1) and (1,2)) in this group so traits of id 1 and 5 are kept.
A function to achieve this is given there:
intersect_this_by2 <- function(dt,
key_by = NULL,
split_by = NULL,
intersect_by = NULL){
dtc <- as.data.table(dt)
# compute number of keys in the group
dtc[, n_keys := uniqueN(.SD), by = split_by, .SDcols = key_by]
# compute number of keys represented by each trait in each group
# and keep row only if they represent all keys from the group
dtc[, keep := n_keys == uniqueN(.SD), by = c(intersect_by, split_by), .SDcols = key_by]
dtc <- dtc[keep == TRUE][, c("n_keys", "keep") := NULL]
return(dtc)
}
But it gets quite slow for big datasets or complex traits/keys/groups... the real data.table has got 10 millions rows and the traits have 30 levels... Is there any way to improve it? Any obvious pitfalls? Thanks for the help
FINAL EDIT: Uwe proposed a concise solution which is 40% faster than my initial code (which I deleted here because it was confusing) The final function looks like this:
intersect_this_by_uwe <- function(dt,
key_by = c("key1"),
split_by = c("group_id1", "group_id2"),
intersect_by = c("trait_id1", "trait_id2")){
dti <- copy(dt)
dti[, original_order_id__ := 1:.N]
setkeyv(dti, c(split_by, intersect_by, key_by))
uni <- unique(dti, by = c(split_by, intersect_by, key_by))
unique_keys_by_group <-
unique(uni, by = c(split_by, key_by))[, .N, by = c(split_by)]
unique_keys_by_group_and_trait <-
uni[, .N, by = c(split_by, intersect_by)]
# 1st join to pick group/traits combinations with equal number of unique keys
selected_groups_and_traits <-
unique_keys_by_group_and_trait[unique_keys_by_group,
on = c(split_by, "N"), nomatch = 0L]
# 2nd join to pick records of valid subsets
dti[selected_groups_and_traits, on = c(split_by, intersect_by)][
order(original_order_id__), -c("original_order_id__","N")]
}
And for the records the benchmarks on the 10M rows dataset:
> microbenchmark::microbenchmark(old_way = {res <- intersect_this_by(dt,
+ key_by = c("key1"),
+ split_by = c("group_id1", "group_id2"),
+ intersect_by = c("trait_id1", "trait_id2"))},
+ new_way = {res <- intersect_this_by2(dt,
+ key_by = c("key1"),
+ split_by = c("group_id1", "group_id2"),
+ intersect_by = c("trait_id1", "trait_id2"))},
+ new_way_uwe = {res <- intersect_this_by_uwe(dt,
+ key_by = c("key1"),
+ split_by = c("group_id1", "group_id2"),
+ intersect_by = c("trait_id1", "trait_id2"))},
+ times = 10)
Unit: seconds
expr min lq mean median uq max neval cld
old_way 3.145468 3.530898 3.514020 3.544661 3.577814 3.623707 10 b
new_way 15.670487 15.792249 15.948385 15.988003 16.097436 16.206044 10 c
new_way_uwe 1.982503 2.350001 2.320591 2.394206 2.412751 2.436381 10 a
dt[id %in% dt[, .SD[, if (.N > 1) id, by=.(trait_id1, trait_id2)], by=.(group_id1, group_id2)]$V1]
– chinsoon12