I thought about your question, and I originally wrote a routine that would map the date intervals into a 1D binary array, where each entry in the array is a day, and consecutive days are consecutive entries. With this data structure, you can perform dilation and erosion to fill in small gaps, thus merging the intervals, and then map the consolidated intervals back into date ranges. Thus we use standard raster connected components logic to solve your problem, as per your idea (a graph-based connected components could work as well...)
This works fine, and I can post the code if you are really interested, but then I wondered what the advantages are of the former apporach over the simple routine of just iterating through the (pre-sorted) date ranges and merging the next into the current if the gap is small.
Here is the code for the simple routine, and it takes about 120 micro seconds to run using the sample data. If you expand the sample data by repeating it 10,000 times, this routine takes about 1 sec on my computer.
When I timed the morphology based solution, it was about 2x slower. It might work better under certain circumstances, but I would suggest we try simple first, and see if there's a real problem that requires a different algorithmic approach.
from datetime import datetime
from datetime import timedelta
import numpy as np
The sample data provided in the question:
SAMPLE_DATA = [['1', '08/01/2011', '08/31/2011'],
['1', '09/01/2011', '09/30/2011'],
['1', '11/01/2011', '11/30/2011'],
['1', '12/01/2011', '12/31/2011'],
['1', '01/01/2012', '01/31/2012'],
['1', '03/01/2012', '03/31/2012'],
['2', '11/11/2011', '11/30/2011'],
['2', '12/11/2011', '12/31/2011'],
['2', '01/11/2014', '01/31/2014'],
['2', '03/11/2014', '03/31/2014']]
The simple approach:
def simple_method(in_data=SAMPLE_DATA, person='1', fill_gap_days=31, printit=False):
date_format_str = "%m/%d/%Y"
dat = np.array(in_data)
dat = dat[dat[:, 0] == person, 1:] # just this person's data
# assume date intervals are already sorted by start date
new_intervals = []
cur_start = None
cur_end = None
gap_days = timedelta(days=fill_gap_days)
for (s_str, e_str) in dat:
dt_start = datetime.strptime(s_str, date_format_str)
dt_end = datetime.strptime(e_str, date_format_str)
if cur_end is None:
cur_start = dt_start
cur_end = dt_end
continue
else:
if cur_end + gap_days >= dt_start:
# merge, keep existing cur_start, extend cur_end
cur_end = dt_end
else:
# new interval, save previous and reset current to this
new_intervals.append((cur_start, cur_end))
cur_start = dt_start
cur_end = dt_end
# make sure final interval is saved
new_intervals.append((cur_start, cur_end))
if printit:
print_it(person, new_intervals, date_format_str)
return new_intervals
And here's the simple pretty printing function to print the ranges.
def print_it(person, consolidated_ranges, fmt):
for (s, e) in consolidated_ranges:
print(person, s.strftime(fmt), e.strftime(fmt))
Running in ipython as follows. Note that printing the result can be turned off for timing the computation.
In [10]: _ = simple_method(printit=True)
1 08/01/2011 09/30/2011
1 11/01/2011 03/31/2012
Running in ipython with %timeit macro:
In [8]: %timeit simple_method(in_data=SAMPLE_DATA)
10000 loops, best of 3: 118 µs per loop
In [9]: %timeit simple_method(in_data=SAMPLE_DATA*10000)
1 loops, best of 3: 1.06 s per loop
[EDIT 8 Feb 2016: To make a long answer longer...]
As I prefaced in my response, I did create a morphological / 1D connected components version and in my timing it was about 2x slower. But for the sake of completeness, I'll show the morphological method, and maybe others will have insight on if there's a big area for speed-up left somewhere in it.
#using same imports as previous code with one more
import calendar as cal
def make_occupancy_array(start_year, end_year):
"""
Represents the time between the start and end years, inclusively, as a 1-D array
of 'pixels', where each pixel corresponds to a day. Consecutive days are thus
mapped to consecutive pixels. We can perform morphology on this 1D array to
close small gaps between date ranges.
"""
years_days = [(yr, 366 if cal.isleap(yr) else 365) for yr in range(start_year, end_year+1)]
YD = np.array(years_days) # like [ (2011, 365), (2012, 366), ... ] in ndarray form
total_num_days = YD[:, 1].sum()
occupancy = np.zeros((total_num_days,), dtype='int')
return YD, occupancy
With the occupancy array to represent the time intervals, we need two functions to map from dates to positions in the array and the inverse.
def map_date_to_position(dt, YD):
"""
Maps the datetime value to a position in the occupancy array
"""
# the start position is the offset to day 1 in the dt1,year,
# plus the day of year - 1 for dt1 (day of year is 1-based indexed)
yr = dt.year
assert yr in YD[:, 0] # guard...YD should include all years for this person's dates
position = YD[YD[:, 0] < yr, 1].sum() # the sum of the days in year before this year
position += dt.timetuple().tm_yday - 1
return position
def map_position_to_date(pos, YD):
"""
Inverse of map_date_to_position, this maps a position in the
occupancy array back to a datetime value
"""
yr_offsets = np.cumsum(YD[:, 1])
day_offsets = yr_offsets - pos
idx = np.flatnonzero(day_offsets > 0)[0]
year = YD[idx, 0]
day_of_year = pos if idx == 0 else pos - yr_offsets[idx-1]
# construct datetime as first of year plus day offset in year
dt = datetime.strptime(str(year), "%Y")
dt += timedelta(days=int(day_of_year)+1)
return dt
The following function fills the relevant part of the occupancy array given start and end dates (inclusive) and optionally extends the end of the range by a gap-filling margin (like 1-sided dilation).
def set_occupancy(dt1, dt2, YD, occupancy, fill_gap_days=0):
"""
For a date range starting dt1 and ending, inclusively, dt2,
sets the corresponding 'pixels' in occupancy vector to 1.
If fill_gap_days > 0, then the end 'pixel' is extended
(dilated) by this many positions, so that we can fill
the gaps between intervals that are close to each other.
"""
pos1 = map_date_to_position(dt1, YD)
pos2 = map_date_to_position(dt2, YD) + fill_gap_days
occupancy[pos1:pos2] = 1
Once we have the consolidated intervals in the occupancy array, we need to read them back out into date intervals, optionally performing 1-sided erosion if we'd previously done gap filling.
def get_occupancy_intervals(OCC, fill_gap_days=0):
"""
Find the runs in the OCC array corresponding
to the 'dilated' consecutive positions, and then
'erode' back to the correct end dates by subtracting
the fill_gap_days.
"""
starts = np.flatnonzero(np.diff(OCC) > 0) # where runs of nonzeros start
ends = np.flatnonzero(np.diff(OCC) < 0) # where runs of nonzeros end
ends -= fill_gap_days # erode back to original length prior to dilation
return [(s, e) for (s, e) in zip(starts, ends)]
Putting it all together...
def morphology_method(in_data=SAMPLE_DATA, person='1', fill_gap_days=31, printit=False):
date_format_str = "%m/%d/%Y"
dat = np.array(in_data)
dat = dat[dat[:, 0] == person, 1:] # just this person's data
# for the intervals of this person, get starting and ending years
# we assume the data is already sorted
#start_year = datetime.strptime(dat[0, 0], date_format_str)
#end_year = datetime.strptime(dat[-1, 1], date_format_str)
start_times = [datetime.strptime(d, date_format_str) for d in dat[:, 0]]
end_times = [datetime.strptime(d, date_format_str) for d in dat[:, 1]]
start_year = start_times[0].year
end_year = end_times[-1].year
# create the occupancy array, dilated so that each interval
# is extended by fill_gap_days to 'fill in' the small gaps
# between intervals
YD, OCC = make_occupancy_array(start_year, end_year)
for (s, e) in zip(start_times, end_times):
set_occupancy(s, e, YD, OCC, fill_gap_days)
# return the intervals from OCC after having filled gaps,
# and trim end dates back to original position.
consolidated_pos = get_occupancy_intervals(OCC, fill_gap_days)
# map positions back to date-times
consolidated_ranges = [(map_position_to_date(s, YD), map_position_to_date(e, YD)) for
(s, e) in consolidated_pos]
if printit:
print_it(person, consolidated_ranges, date_format_str)
return consolidated_ranges