0
votes

Background

I have a data file which consists of Sea Levels near the Dutch Storm Barrier over time, for various days. The goal is to fit a linear model that describes this evolution of the sea level, and given a certain time frame, make a 5 minute ahead-prediction of the sea level (forecasting).

Approach

Given a certain day (chosen on forehand), I chose a time frame on which I want to fit\train the linear model. After some technical adjustments (see below for actual code), I fitted the model. Then, the linear model object and 5 minutes time range are used in the command 'predict()' for a prediction, and the 'forecast' along with a confidence interval is graphed, just behind the fitted model in the first time frame, all in one plot (see below for an example plot).

Problem

The forecast of the model always over- or under predicts hugely. In terms of magnitude, the forecast is a factor 10^10 (or, equivalently, e+10) off. At the same time, the R^2 and R_adj^2 are 'quite high', (0,972 and 0,9334, respectively) and the model diagnostics (leverages, fitted vs residuals, normal qq) look 'reasonably good'. Hence, my problem/question is: How can the model that fits the data so well, predict/forecast so badly? My only explanation is mistake in the code, but I can't spot it.

The data set

More specifically, the dataset is a data frame, which consists (apart from a index column) of 3 columns: 'date' ( "yyyy-mm-dd" format), 'time' ( "hh:mm:ss" format) and 'water' (integer between approx -150 and 350, sea level in cm). Here's a small slice of the data which already gives rise to the above problem:

> SeaLvlAug30[fitRngAug, ]
        date       time       water
1574161 2010-08-30 04:40:00   253
1574162 2010-08-30 04:40:10   254
1574163 2010-08-30 04:40:20   253
1574164 2010-08-30 04:40:30   250
1574165 2010-08-30 04:40:40   250
1574166 2010-08-30 04:40:50   252
1574167 2010-08-30 04:41:00   250
1574168 2010-08-30 04:41:10   247
1574169 2010-08-30 04:41:20   246
1574170 2010-08-30 04:41:30   245
1574171 2010-08-30 04:41:40   242
1574172 2010-08-30 04:41:50   241
1574173 2010-08-30 04:42:00   242
1574174 2010-08-30 04:42:10   244
1574175 2010-08-30 04:42:20   245
1574176 2010-08-30 04:42:30   247
1574177 2010-08-30 04:42:40   247
1574178 2010-08-30 04:42:50   249
1574179 2010-08-30 04:43:00   250
1574180 2010-08-30 04:43:10   250

Minimal runnable R code

# Construct a time frame of a day with steps of 10 seconds
SeaLvlDayTm <- c(1:8640)*10 
# Construct the desired fit Range and prediction Range
ftRng <- c(1:20)
predRng <- c(21:50)
# Construct the desired columns for the data frame
date <- rep("2010-08-30", length(c(ftRng,predRng))) 
time <- c("04:40:00", "04:40:10", "04:40:20", "04:40:30", "04:40:40", "04:40:50", "04:41:00", 
      "04:41:10", "04:41:20", "04:41:30", "04:41:40", "04:41:50", "04:42:00", "04:42:10", 
      "04:42:20", "04:42:30", "04:42:40", "04:42:50", "04:43:00", "04:43:10", "04:43:20", 
      "04:43:30", "04:43:40", "04:43:50", "04:44:00", "04:44:10", "04:44:20", "04:44:30",
      "04:44:40", "04:44:50", "04:45:00", "04:45:10", "04:45:20", "04:45:30", "04:45:40", 
      "04:45:50", "04:46:00", "04:46:10", "04:46:20", "04:46:30", "04:46:40", "04:46:50",
      "04:47:00", "04:47:10", "04:47:20", "04:47:30", "04:47:40", "04:47:50", "04:48:00", 
      "04:48:10")
timeSec <- c(1681:1730)*10
water <- c(253, 254, 253, 250, 250, 252, 250, 247, 246, 245, 242, 241, 242, 244, 245, 247, 
       247, 249, 250, 250, 249, 249, 250, 249, 246, 246, 248, 248, 245, 247, 251, 250, 
       251, 255, 256, 256, 257, 259, 257, 256, 260, 260, 257, 260, 261, 258, 256, 256,
       258, 258)
# Construct the data frame
SeaLvlAugStp2 <- data.frame(date, time, timeSec, water)
# Change the index set of the data frame to correspond that of a year
rownames(SeaLvlAugStp2) <- c(1574161:1574210)
#Use a seperate variable for the time (because of a weird error)
SeaLvlAugFtTm <- SeaLvlAugStp2$timeSec[ftRng]
# Fit the linear model
lmObjAug <- lm(SeaLvlAugStp2$water[ftRng] ~ sin((2*pi)/44700 * SeaLvlAugFtTm)
           + cos((2*pi)/44700 * SeaLvlAugFtTm) + poly(SeaLvlAugFtTm, 3)
           + cos((2*pi)/545 * SeaLvlAugFtTm) + sin((2*pi)/545 * SeaLvlAugFtTm)
           + cos((2*pi)/205 * SeaLvlAugFtTm) + sin((2*pi)/205 * SeaLvlAugFtTm)
           + cos((2*pi)/85 * SeaLvlAugFtTm) + sin((2*pi)/85 * SeaLvlAugFtTm), 
           data = SeaLvlAug30Stp2[ftRng, ]) 
# Get information about the linear model fit
summary(lmObjAug)
plot(lmObjAug)
#Compute time range prediction and fit
prdtRngTm <- timeSec[prdtRng]
ftRngTm <- timeSec[ftRng]
#Compute prediction/forecast based on fitted data linear model
prdtAug <- predict(lmObjAug, newdata=data.frame(SeaLvlAugFtTm = prdtRngTm), interval="prediction", level=0.95)
#Calculate lower and upper bound confidence interval prediction 
lwrAug <- prdtAug[, 2]
uprAug <- prdtAug[, 3]
#Calculate minimum and maximum y axis plot
yminAug <- min(SeaLvlAug30$water[fitRngAug], SeaLvlAug30$water[prdtRngAug], lwrAug)
ymaxAug <- max(SeaLvlAug30$water[fitRngAug], SeaLvlAug30$water[prdtRngAug], uprAug)
#Make the plot
plot((timeSec/10)[ftRng], SeaLvlAugStp2$water[ftRng], xlim = c(min(timeSec/10), max(prdtRngAug30)), ylim = c(yminAug, ymaxAug), col = 'green', pch = 19, main = "Sea Level high water & prediction August 30 ", xlab = "Time (seconds)", ylab = "Sea Level (cm)")
polygon(c(sort(prdtRngTm/10), rev(sort(prdtRngTm/10))), c(uprAug, rev(lwrAug)), col = "gray", border = "gray")
points(prdtRngTm/10, SeaLvlAug30$water[prdtRngTm/10], col = 'green', pch = 19)
lines(ftRngTm/10, fitted(lmObjAug), col = 'blue', lwd = 2)
lines(prdtRngTm/10, prdtAug[, 1], col = 'blue', lwd = 2)
legend("topleft", legend = c("Observ.", "Predicted", "Conf. Int."), lwd = 2, col=c("green", "blue", "gray"), lty = c(NA, 1, 1), pch = c(19, NA, NA))

Example plot

Sea Lvl High Water & prediction August 30

1
It's easier to help you if you include a simple reproducible example with sample input and desired output that can be used to test and verify possible solutions. Without the value for SeaLvlAug30 we can't copy/paste the code to run it. If the code is only for SeaLvlDayTm[fitRngAug], then just edit your example to use that smaller dataset and make your example complete.MrFlick
I have seen something similar to the modeling results you have described when I have overfit my data, so I have a suggestion for a test to try. As a test, first fit the data in your post to a straight line, then - one at a time - add in the additional terms in your model. The straight line alone should of course give bad results, which should improve as each term is added without the large over- and under-predicting. If overfitting occurs you should then see see the over- and under-fitting behavior. Easy test.James Phillips
Thanks for your responses. @MrFlick I added a (arguably simple) reproducible example with sample input that resembles the real data I have.Joshwa Michels

1 Answers

0
votes

Until you post a question with code that we can run we won't be able to help you more but in the meantime here is a quick and dirty forecast from Rob J Hyndman forecast package:

string_data <- "row date       time       water
1574161 2010-08-30 04:40:00   253
1574162 2010-08-30 04:40:10   254
1574163 2010-08-30 04:40:20   253
1574164 2010-08-30 04:40:30   250
1574165 2010-08-30 04:40:40   250
1574166 2010-08-30 04:40:50   252
1574167 2010-08-30 04:41:00   250
1574168 2010-08-30 04:41:10   247
1574169 2010-08-30 04:41:20   246
1574170 2010-08-30 04:41:30   245
1574171 2010-08-30 04:41:40   242
1574172 2010-08-30 04:41:50   241
1574173 2010-08-30 04:42:00   242
1574174 2010-08-30 04:42:10   244
1574175 2010-08-30 04:42:20   245
1574176 2010-08-30 04:42:30   247
1574177 2010-08-30 04:42:40   247
1574178 2010-08-30 04:42:50   249
1574179 2010-08-30 04:43:00   250
1574180 2010-08-30 04:43:10   250"

SeaLvlAug30 <- read.table(textConnection(string_data), header=TRUE)

library(forecast)

fit <- auto.arima(SeaLvlAug30$water)
summary(fit)

preds <- forecast(fit, h = 25)
preds
# Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
# 21       249.7563 247.7314 251.7812 246.6595 252.8531
# 22       249.4394 246.1177 252.7611 244.3593 254.5195
# 23       249.1388 244.9831 253.2945 242.7832 255.4944
# 24       248.8930 244.2626 253.5234 241.8114 255.9746
# 25       248.7110 243.8397 253.5822 241.2610 256.1609
# 26       248.5867 243.6073 253.5661 240.9713 256.2021
# 27       248.5085 243.4867 253.5302 240.8284 256.1885
# 28       248.4636 243.4280 253.4991 240.7624 256.1647
# 29       248.4410 243.4020 253.4800 240.7345 256.1475
# 30       248.4322 243.3927 253.4718 240.7249 256.1396
# 31       248.4311 243.3916 253.4707 240.7238 256.1385
# 32       248.4337 243.3941 253.4733 240.7263 256.1411
# 33       248.4376 243.3979 253.4773 240.7300 256.1452
# 34       248.4414 243.4016 253.4812 240.7337 256.1492
# 35       248.4447 243.4048 253.4845 240.7368 256.1525
# 36       248.4471 243.4072 253.4870 240.7392 256.1550
# 37       248.4488 243.4089 253.4887 240.7409 256.1567
# 38       248.4499 243.4100 253.4898 240.7420 256.1578
# 39       248.4505 243.4106 253.4905 240.7426 256.1585
# 40       248.4509 243.4109 253.4908 240.7429 256.1588
# 41       248.4510 243.4111 253.4910 240.7431 256.1589
# 42       248.4510 243.4111 253.4910 240.7431 256.1590
# 43       248.4510 243.4111 253.4910 240.7431 256.1590
# 44       248.4510 243.4110 253.4909 240.7430 256.1589
# 45       248.4509 243.4110 253.4909 240.7430 256.1589

plot(preds)

forecast