2 other options:
1) rolling join:
DT[is.na(dist), dist := {
x0y0 <- DT[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
DT
2) another close variant of smingerson answer using nafill
DT[, dist := {
y0 <- nafill(dist, "locf")
x0 <- nafill(replace(time, is.na(dist), NA), "locf")
y1 <- nafill(dist, "nocb")
x1 <- nafill(replace(time, is.na(dist), NA), "nocb")
fifelse(is.na(dist), (y1 - y0) / (x1 - x0) * (time - x0) + y0, dist)
}]
timing code:
library(data.table)
set.seed(0L)
# df=data.frame(time=seq(7173,7195,1),dist=c(31091.33,NA,31100.00,31103.27,NA,NA,NA,NA,31124.98,NA,31132.81,NA,NA,NA,NA,31154.19,NA,31161.47,NA,NA,NA,NA,31182.97))
# DT=data.table(df)
nr <- 1e7
nNA <- nr/2
DT <- data.table(time=1:nr, dist=replace(rnorm(nr), sample(1:nr, nNA), NA_real_))
DT00 <- copy(DT)
DT01 <- copy(DT)
DT1 <- copy(DT)
DT20 <- copy(DT)
DT201 <- copy(DT)
DT202 <- copy(DT)
DT21 <- copy(DT)
mtd00 <- function() {
DT00[, g := rleid(is.na(dist))]
DT00[is.na(dist), dist := {
i <- .I[c(1, .N)] + c(-1, 1)
DT00[i[1]:i[2], approx(dist, n = .N)$y[-c(1, .N)]]
}, by = g]
}
mtd01 <- function() {
DT01[, g := rleid(is.na(dist))]
DT01[is.na(dist), dist := {
i <- .I[c(1, .N)] + c(-1, 1)
DT01[i[1]:i[2], dist[1] + 1:(.N - 2)*(dist[.N] - dist[1])/(.N - 1)]
}, by = g]
}
mtd1 <- function() {
DT1[,dist_before := nafill(dist, "locf")]
DT1[,dist_after := nafill(dist, "nocb")]
DT1[, rle := rleid(dist)][,missings := max(.N + 1 , 2), by = rle][]
DT1[is.na(dist), dist_before + .SD[,.I] *
(dist_after - dist_before)/(missings), by = rle]
}
mtd20 <- function() {
DT20[is.na(dist), {
x0y0 <- DT20[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT20[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd201 <- function() {
i <- DT201[, is.na(dist)]
DT201[(i), {
x0y0 <- DT201[(!i)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT201[(!i)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd202 <- function() {
i <- DT201[is.na(dist), which=TRUE]
DT201[i, {
x0y0 <- DT201[-i][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT201[-i][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd21 <- function() {
DT21[, {
y0 <- nafill(dist, "locf")
x0 <- nafill(replace(time, is.na(dist), NA), "locf")
y1 <- nafill(dist, "nocb")
x1 <- nafill(replace(time, is.na(dist), NA), "nocb")
fifelse(is.na(dist), (y1 - y0) / (x1 - x0) * (time - x0) + y0, dist)
}]
}
bench::mark(
#mtd00(), mtd01(),
#mtd1(),
mtd20(), mtd201(), mtd202(),
mtd21(), check=FALSE)
timings:
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd20() 1.19s 1.19s 0.838 1.01GB 1.68 1 2 1.19s <dbl [5,000,000]> <df[,3] [292 x 3~ <bch:t~ <tibble [1 x ~
2 mtd201() 1.12s 1.12s 0.894 954.06MB 0.894 1 1 1.12s <dbl [5,000,000]> <df[,3] [341 x 3~ <bch:t~ <tibble [1 x ~
3 mtd202() 1.16s 1.16s 0.864 858.66MB 1.73 1 2 1.16s <dbl [5,000,000]> <df[,3] [392 x 3~ <bch:t~ <tibble [1 x ~
4 mtd21() 729.93ms 729.93ms 1.37 763.11MB 1.37 1 1 729.93ms <dbl [10,000,000~ <df[,3] [215 x 3~ <bch:t~ <tibble [1 x ~
edit: to address comment on using is.na(dist)
multiple times:
set.seed(0L)
nr <- 1e7
nNA <- nr/2
DT <- data.table(time=1:nr, dist=replace(rnorm(nr), sample(1:nr, nNA), NA_real_))
DT20 <- copy(DT)
DT201 <- copy(DT)
DT202 <- copy(DT)
mtd20 <- function() {
DT20[is.na(dist), dist := {
x0y0 <- DT20[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT20[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd201 <- function() {
i <- DT201[, is.na(dist)]
DT201[(i), dist := {
x0y0 <- DT201[(!i)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT201[(!i)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd202 <- function() {
i <- DT201[is.na(dist), which=TRUE]
DT201[i, dist := {
x0y0 <- DT201[-i][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT201[-i][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
timings:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd20() 24.1ms 25.8ms 37.5 1.01GB 13.6 11 4 294ms <df[,2] [10,000,000 x 2]> <df[,3] [310 x 3]> <bch:tm> <tibble [15 x 3]>
2 mtd201() 24.8ms 25.6ms 38.2 954.07MB 8.19 14 3 366ms <df[,2] [10,000,000 x 2]> <df[,3] [398 x 3]> <bch:tm> <tibble [17 x 3]>
3 mtd202() 24ms 25.6ms 38.3 76.39MB 8.22 14 3 365ms <df[,2] [10,000,000 x 2]> <df[,3] [241 x 3]> <bch:tm> <tibble [17 x 3]>
Not seeing much diff in timings when reduce the number of is.na(dist)
calls