2
votes

I have the following time series data:

    Date  duration Volume
1  1-Jul  12am-2am   0.80
2  1-Jul   2am-4am   0.80
3  1-Jul   4am-6am   0.80
4  1-Jul   6am-8am   5.00
5  1-Jul  8am-10am  14.00
6  1-Jul 10am-12pm   3.40
7  1-Jul  12pm-2pm   0.80
8  1-Jul   2pm-4pm   0.80
9  1-Jul   4pm-6pm   2.40
10 1-Jul   6pm-8pm  12.00
11 1-Jul  8pm-10pm  14.00
12 1-Jul 10pm-12am   3.40
13 1-Jul  12am-2am   0.60
14 1-Jul   2am-4am   0.60
15 1-Jul   4am-6am   0.60
16 1-Jul   6am-8am   5.50
17 1-Jul  8am-10am  14.00
18 1-Jul 10am-12pm   4.00
19 1-Jul  12pm-2pm   0.80
20 1-Jul   2pm-4pm   0.65
21 1-Jul   4pm-6pm   6.30
22 1-Jul   6pm-8pm  19.50
23 1-Jul  8pm-10pm  19.45
24 1-Jul 10pm-12am   9.00

I would like to convert 'Date' & 'duration' column into R date format. Also, is it possible to combine these two columns to a single 'date_time' column to make it easy to forecast using auto.arima().

Thanks

1
For a single column, look at intervals (in the stringr package) which allow you to describe a specific time between two date-times, which seems to be what you want here.crazybilly

1 Answers

2
votes

Wanted to challenge myself and stumbled upon this question, this is my solution to it.

#first lets create the sample data

date <- c("1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","1-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul","2-Jul")
duration <- c("12am-2am","2am-4am","4am-6am","6am-8am","8am-10am","10am-12pm","12pm-2pm","2pm-4pm","4pm-6pm","6pm-8pm","8pm-10pm","10pm-12am","12am-2am","2am-4am","4am-6am","6am-8am","8am-10am","10am-12pm","12pm-2pm","2pm-4pm","4pm-6pm","6pm-8pm","8pm-10pm","10pm-12am")
volume <- c("0.80","0.80","0.80","5.00","14.00","3.40","0.80","0.80","2.40","12.00","14.00","3.40","0.60","0.60","0.60","5.50","14.00","4.00","0.80","0.65","6.30","19.50","19.45","9.00")

df <- data.frame(date,duration,volume, stringsAsFactors = F)

bla <- t(as.data.frame(sapply(df$duration, strsplit, "-")))
rownames(bla) <- c(1:nrow(bla))

default_year <- "2020"

#seperate the timestamps in start and end times
df <- cbind(df,bla)

#add current year to the date and make it a as.Date
df$date <- as.Date(paste0(default_year,"-",df$date),format='%Y-%d-%b')

#convert "am" and "pm" to 24h mode
df$`1` <- gsub("12am","00:00",df$`1`)
df$`1` <- gsub("am",":00",df$`1`)
df$`1`[grep("pm",df$`1`)] <- paste0(ifelse(as.numeric(gsub("pm","",df$`1`[grep("pm",df$`1`)]))==12,12,as.numeric(gsub("pm","",df$`1`[grep("pm",df$`1`)]))+12),":00")

df$`2` <- gsub("12am","00:00",df$`2`)
df$`2` <- gsub("am",":00",df$`2`)
df$`2`[grep("pm",df$`2`)] <- paste0(ifelse(as.numeric(gsub("pm","",df$`2`[grep("pm",df$`2`)]))==12,12,as.numeric(gsub("pm","",df$`2`[grep("pm",df$`2`)]))+12),":00")

#paste date and time vectors together
df$t_start <- paste0(df$date,"-",df$`1`)
df$t_end <- paste0(df$date,"-",df$`2`)

#make them posix
df$t_start <- as.POSIXct(df$t_start, format='%Y-%d-%m-%H:%M')
df$t_end <- as.POSIXct(df$t_end, format='%Y-%d-%m-%H:%M')-1

#save it
new_df <- data.frame(df$t_start,df$t_end,df$volume)

new_df
            df.t_start            df.t_end df.volume
1  2020-01-07 00:00:00 2020-01-07 01:59:59      0.80
2  2020-01-07 02:00:00 2020-01-07 03:59:59      0.80
3  2020-01-07 04:00:00 2020-01-07 05:59:59      0.80
4  2020-01-07 06:00:00 2020-01-07 07:59:59      5.00
5  2020-01-07 08:00:00 2020-01-07 09:59:59     14.00
6  2020-01-07 10:00:00 2020-01-07 11:59:59      3.40
7  2020-01-07 12:00:00 2020-01-07 13:59:59      0.80
8  2020-01-07 14:00:00 2020-01-07 15:59:59      0.80
9  2020-01-07 16:00:00 2020-01-07 17:59:59      2.40
10 2020-01-07 18:00:00 2020-01-07 19:59:59     12.00
11 2020-01-07 20:00:00 2020-01-07 21:59:59     14.00
12 2020-01-07 22:00:00 2020-01-06 23:59:59      3.40
13 2020-02-07 00:00:00 2020-02-07 01:59:59      0.60
14 2020-02-07 02:00:00 2020-02-07 03:59:59      0.60
15 2020-02-07 04:00:00 2020-02-07 05:59:59      0.60
16 2020-02-07 06:00:00 2020-02-07 07:59:59      5.50
17 2020-02-07 08:00:00 2020-02-07 09:59:59     14.00
18 2020-02-07 10:00:00 2020-02-07 11:59:59      4.00
19 2020-02-07 12:00:00 2020-02-07 13:59:59      0.80
20 2020-02-07 14:00:00 2020-02-07 15:59:59      0.65
21 2020-02-07 16:00:00 2020-02-07 17:59:59      6.30
22 2020-02-07 18:00:00 2020-02-07 19:59:59     19.50
23 2020-02-07 20:00:00 2020-02-07 21:59:59     19.45
24 2020-02-07 22:00:00 2020-02-06 23:59:59      9.00