1
votes

I have a quarter of a million events like this:

                 Slot Anzahl Nutzung TimeSlotNr WochenSlots Tag
1 2011-01-01 00:00:00      2   Firma          1         242   1
2 2011-01-01 00:00:00     50  Privat          1         242   1
3 2011-01-01 00:30:00      1   Firma          2         243   1
4 2011-01-01 00:30:00     49  Privat          2         243   1
5 2011-01-01 01:00:00      1   Firma          3         244   1
6 2011-01-01 01:00:00     48  Privat          3         244   1

A slot represents 30 minutes (half an hour), "Anzahl" is the number of events in a slot, and the first slot starts at 2011-01-01 00:00:00. "WochenSlots" is the TimeSlotNr %% 336 and starts on a Saturday 00:00:00. So I want to see the distribution in a week.

You see the development of bookings since 2011 by customer type. The peaks and holes are errors.

And the same data aggregated to weeks (Monday 00:00 - Sunday 23:00)

What I want to do now is:

  • show the dates in x-scale (Monday 00:00 - Sunday 24:00)
  • show lines (envelopes) which show the distribution for x% of the events.

And I have no idea how to do that.

  # Scatter of event counts per weekly slot, coloured by customer type and
  # split into one panel per type (panels laid out in two rows, each with its
  # own y-axis range).
  # NOTE(review): scales::percent formats values as percentages (multiplying
  # by 100 by default), so this presumably expects Anzahl to be a proportion
  # rather than a raw count -- confirm.
  ggplot(
    data = PB2,
    mapping = aes(x = WochenSlots, y = Anzahl, colour = Nutzung)
  ) +
    geom_point(alpha = 0.6) +
    scale_y_continuous(labels = scales::percent) +
    facet_wrap(
      facets = ~ Nutzung,
      nrow = 2,
      scales = "free_y",
      shrink = TRUE
    )

dput(PB2[1:100, ]) structure(list(Slot = structure(c(1293840000, 1293840000, 1293841800, 1293841800, 1293843600, 1293843600, 1293845400, 1293845400, 1293847200, 1293847200, 1293849000, 1293849000, 1293850800, 1293850800, 1293852600, 1293852600, 1293854400, 1293854400, 1293856200, 1293856200, 1293858000, 1293858000, 1293859800, 1293859800, 1293861600, 1293861600, 1293863400, 1293863400, 1293865200, 1293865200, 1293867000, 1293867000, 1293868800, 1293868800, 1293870600, 1293870600, 1293872400, 1293872400, 1293874200, 1293874200, 1293876000, 1293876000, 1293877800, 1293877800, 1293879600, 1293879600, 1293881400, 1293881400, 1293883200, 1293883200, 1293885000, 1293885000, 1293886800, 1293886800, 1293888600, 1293888600, 1293890400, 1293890400, 1293892200, 1293892200, 1293894000, 1293894000, 1293895800, 1293895800, 1293897600, 1293897600, 1293899400, 1293899400, 1293901200, 1293901200, 1293903000, 1293903000, 1293904800, 1293904800, 1293906600, 1293906600, 1293908400, 1293908400, 1293910200, 1293910200, 1293912000, 1293912000, 1293913800, 1293913800, 1293915600, 1293915600, 1293917400, 1293917400, 1293919200, 1293919200, 1293921000, 1293921000, 1293922800, 1293922800, 1293924600, 1293924600, 1293926400, 1293926400, 1293928200, 1293928200), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Anzahl = c(2L, 50L, 1L, 49L, 1L, 48L, 1L, 43L, 1L, 43L, 1L, 30L, 1L, 27L, 0L, 22L, 0L, 19L, 0L, 20L, 0L, 18L, 0L, 17L, 0L, 17L, 0L, 17L, 0L, 17L, 0L, 18L, 0L, 19L, 2L, 19L, 2L, 19L, 2L, 20L, 2L, 21L, 2L, 21L, 2L, 20L, 2L, 18L, 2L, 22L, 2L, 24L, 3L, 25L, 1L, 28L, 1L, 30L, 1L, 33L, 1L, 32L, 1L, 28L, 2L, 24L, 2L, 25L, 2L, 25L, 2L, 22L, 2L, 20L, 1L, 15L, 2L, 14L, 1L, 13L, 1L, 11L, 1L, 12L, 1L, 11L, 1L, 9L, 1L, 8L, 1L, 7L, 1L, 5L, 1L, 4L, 1L, 3L, 0L, 3L), Nutzung = c("Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", 
"Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat", "Firma", "Privat"), TimeSlotNr = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L, 43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L, 49L, 50L, 50L), WochenSlots = c(242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255, 256, 256, 257, 257, 258, 258, 259, 259, 260, 260, 261, 261, 262, 262, 263, 263, 264, 264, 265, 265, 266, 266, 267, 267, 268, 268, 269, 269, 270, 270, 271, 271, 272, 272, 273, 273, 274, 274, 275, 275, 276, 276, 277, 277, 278, 278, 279, 279, 280, 280, 281, 281, 282, 282, 283, 283, 284, 284, 285, 285, 286, 286, 287, 287, 288, 288, 289, 289, 290, 290, 291, 291), Tag = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L)), .Names = c("Slot", "Anzahl", "Nutzung", "TimeSlotNr", "WochenSlots", "Tag"), row.names = c(NA, 100L), class = "data.frame")

1
For the first bullet, ensure that Slot is of class POSIXct (PB2$Slot = as.POSIXct(PB2$Slot)) and then use Slot as the x mapping. For the second, do you mean you want a curve that shows, for each x-value, the y-value above which X% of the events occur? – eipi10
"For the second, do you mean you want a curve that shows, for each x-value, the y-value above which X% of the events occur?" Yes. – Rüdiger Kladt
Can you provide a data sample using dput? For example, paste into your question the output of dput(PB2[1:100, ]) to provide the first 100 rows of data. – eipi10
I provided a sample as you mentioned. – Rüdiger Kladt

1 Answer

1
votes

It looks like quantile regression might be what you need. The data sample you posted only has one observation at each time point, so I've created some fake data for illustration. In the plot below, we use a flexible spline function for the regression function and we draw regression lines at the 25th and 75th percentiles of the data. Let me know if this is what you had in mind.

library(ggplot2)
library(quantreg)
library(splines)

# Simulated data for illustration: 10,000 x-values drawn uniformly from
# [0, 20]; y is a cosine wave shifted up by 10 plus normal noise with mean 2
# (standard deviation left at its default of 1).
set.seed(2)
dat <- data.frame(x = runif(1e4, min = 0, max = 20))
dat$y <- cos(dat$x) + 10 + rnorm(1e4, mean = 2)

# Faint points for the raw data, overlaid with quantile-regression curves at
# the 25th and 75th percentiles, fitted on a natural cubic spline basis in x
# with 10 degrees of freedom.
ggplot(data = dat, mapping = aes(x = x, y = y)) +
  geom_point(colour = "blue", size = 0.5, alpha = 0.1) +
  geom_quantile(
    formula = y ~ ns(x, df = 10),
    quantiles = c(0.25, 0.75),
    colour = "red",
    size = 1
  ) +
  theme_classic()

enter image description here