A base
alternative.
1. Find and count pairs
Because you only have numerical values, we coerce data to matrix. This will make subsequent calculations considerably faster. Create lag and lead versions (column-wise) of the data, i.e. remove the last column (m[ , -ncol(m)]
) and first column (m[ , -ncol(m)]
) respectively.
Coerce the lag and lead data to 'from' and 'to' vectors, and count pairs (table
). Convert table to matrix. Select first pair with max frequency.
m <- as.matrix(d)
tt <- table(from = as.vector(m[ , -ncol(m)]), to = as.vector(m[ , -1]))
m2 <- cbind(from = as.integer(dimnames(tt)[[1]]),
to = rep(as.integer(dimnames(tt)[[2]]), each = dim(tt)[1]),
freq = as.vector(tt))
m3 <- m2[which.max(m2[ , "freq"]), ]
# from to freq
# 3 0 3
If you want all pairs with maximum frequency, use m2[m2[ , "freq"] == max(m2[ , "freq"]), ]
instead.
2. Replace values of most frequent pair and set rest to zero
Make a copy of the original data. Fill it with zero. Grab the 'from' and 'to' values of the 'max pair'. Get indexes of matches in lag and lead data, which correspond to 'from' columns. rbind
with indexes of 'to' columns. At the indexes, replace zeros with 2.
m_bin <- m
m_bin[] <- 0
ix <- which(m[ , -ncol(m)] == m3["from"] &
m[ , -1] == m3["to"],
arr.ind = TRUE)
m_bin[rbind(ix, cbind(ix[ , "row"], ix[ , "col"] + 1))] <- 2
m_bin
# var1 var2 var3 var4 var5
# [1,] 0 0 2 2 0
# [2,] 0 0 2 2 0
# [3,] 2 2 0 0 0
# [4,] 0 0 0 0 0
3. Benchmark
I use data of a size somewhat similar to that mentioned by OP in comment: a data frame with 10000 rows, 100 columns, and sampling from 100 different values.
I compare the code above (f_m()
) with the zoo
answer (f_zoo()
; functions below). To compare the output, I add dimnames
to the zoo
result.
The result shows that f_m
is considerably faster.
set.seed(1)
nr <- 10000
nc <- 100
d <- as.data.frame(matrix(sample(1:100, nr * nc, replace = TRUE),
nrow = nr, ncol = nc))
res_f_m <- f_m(d)
res_f_zoo <- f_zoo(d)
dimnames(res_f_zoo) <- dimnames(res_f_m)
all.equal(res_f_m, res_f_zoo)
# [1] TRUE
system.time(f_m(d))
# user system elapsed
# 0.12 0.01 0.14
system.time(f_zoo(d))
# user system elapsed
# 61.58 26.72 88.45
f_m <- function(d){
m <- as.matrix(d)
tt <- table(from = as.vector(m[ , -ncol(m)]),
to = as.vector(m[ , -1]))
m2 <- cbind(from = as.integer(dimnames(tt)[[1]]),
to = rep(as.integer(dimnames(tt)[[2]]),
each = dim(tt)[1]),
freq = as.vector(tt))
m3 <- m2[which.max(m2[ , "freq"]), ]
m_bin <- m
m_bin[] <- 0
ix <- which(m[ , -ncol(m)] == m3["from"] &
m[ , -1] == m3["to"],
arr.ind = TRUE)
m_bin[rbind(ix, cbind(ix[ , "row"], ix[ , "col"] + 1))] <- 2
return(m_bin)
}
f_zoo <- function(d){
pairs <- sort(table(c(rollapply(t(d), 2, toString))))
top <- scan(text = names(tail(pairs, 1)), sep = ",", what = 0L, quiet = TRUE)
right <- rollapplyr(unname(t(d)), 2, identical, top, fill = FALSE)
left <- rbind(right[-1, ], FALSE)
t(2 * (left | right))
}