I've split the problem into sub problems... The test to check it is all working as expected:
# These are the two lists I want to pair
a = [ "apples"
, "oranges"
, "bananas"
, "blueberries" ]
b = [ "apples"
, "blueberries"
, "oranges"
, "bananas" ]
# This is the expected result
expected = [ (0, 0)
, (None, 1)
, (1, 2)
, (2, 3)
, (3, None) ]
# Test the function gets the correct result
assert expected == get_indexes_for_best_pairing(a, b)
print("Tests pass!")
1. Create a list of all of the pairs
Create a map of the values from list A with the list of the associated indices...
def map_list(list):
map = {}
for i in range(0, len(list)):
# Each element could be contained multiple times in each
# list, therefore we need to create a sub array of indices
if not list[i] in map:
map[list[i]] = []
# Add the index onto this sub array
map[list[i]].append(i)
return map
The map
would look like...
{ "apples": [0]
, "oranges": [1]
, "bananas": [2]
, "blueberries": [3] }
Find all of the pairs by cross referencing list B...
def get_pairs(a, b):
map = map_list(a)
pairs = []
for i in range(0, len(b)):
v = b[i]
if v in map:
for j in range(0, len(map[v])):
pairs.append((map[v][j], i))
return pairs
The pairs
are as follows...
[ (0, 0)
, (3, 1)
, (1, 2)
, (2, 3) ]
2. Get the scores for each pair
Simply loop through the pairs and look up the value in the original list:
def get_pairs_scores(pairs, a):
return [len(a[i]) for i, _ in pairs]
3. Create a list of pairs that each pair excludes
For each pair find the other pairs it excludes...
def get_pairs_excluded_by_pair(pairs, i):
# Check if the context pair excludes the pair, if both of the
# pairs indexes are greater or less than the other pair, then
# the pairs are inclusive and we will have a positive number,
# otherwise it will be negative
return [j for j in range(0, len(pairs))
# If the current context pair is also the pair we are comparing
# skip to the next pair
if i != j
and ((pairs[i][0] - pairs[j][0]) * (pairs[i][1] - pairs[j][1]) < 0)]
def get_pairs_excluded_by_pairs(pairs):
excludes = []
for i in range(0, len(pairs)):
excludes.append(get_pairs_excluded_by_pair(pairs, i))
return excludes
The pairs_excludes
method will return...
[ []
, [2, 3]
, [1]
, [1] ]
4. Calculate the total cumulative score for each pair minus the pairs it excludes
Plus the score of the pairs that are excluded by the pairs it excludes... etc, etc.
Use a depth first algorithm to traverse a acyclic graph of excludes to get the cumulative score for each pair... This is the meat of what we need to do...
def get_cumulative_scores_for_pairs(pairs, excludes, scores):
cumulative = []
# For each pair referenced in the excludes structure we create a new
# graph which starting from that pair. This graph tells us the total
# cumulative score for that pair
for i in range(0, len(pairs)):
score = 0
# Keep a reference of the nodes that have already been checked by
# in this graph using a set. This makes the graph acyclic
checked = set()
checked.add(i)
# We keep a note of where we are in the graph using this trail
# The pairs relate to the index in the pair_excludes. if pair
# first is x and pair second is y it refers to pair_excludes[x][y]
trail = []
# We start the current x, y to be the first exclude of the current
# start node
current = [i, 0]
# Sorry, tree traversal... Might not very readable could
# be done with recursion if that is your flavour
while True:
# Get the referenced excluded node
if len(excludes[current[0]]) > current[1]:
j = excludes[current[0]][current[1]]
# We do not want to calculate the same pair twice
if not j in checked:
# It has not been checked so we move our focus to
# this pair so we can examine its excludes
trail.append(current)
# We mark the pair as checked so that we do
# not try and focus on it if it turns up again
checked.add(j)
current = [j, 0]
# We perform a trick here, where when we traverse
# down or up a layer we flip the sign on the score.
# We do this because the score for pairs that we
# exclude need to be subtracted from the score whereas
# scores for pairs that we now can include because of
# that exclude need to be added to the score.
score = -score
# It the pair has already been checked, check its
# next sibling next time around
else:
current[1] += 1
# There are no more nodes to check at this level
else:
# We subtract the cumulative score from the score of the
# pair we are leaving. We do this when we traverse back up
# to the parent or as the last step of each graph finally
# subtracting the total cumulative score from the start node
# score.
score = scores[current[0]] - score
if len(trail):
# Pop the next item on the trail to become our context
# for the next iteration
current = trail.pop()
# Exit criteria... The trail went cold
else:
break
# Add the score to the array
cumulative.append(score)
return cumulative
This method should return an array that looks like...
[ 6
, -3
, 3
, 3 ]
5. Pick only the best pairs
We need to then store the index with the score so that we can sort on score without losing the index.
Sort the cumulative scores in order we create a list of the indices indices
...
# Sort pairs by score retaining the index to the pair
arr = sorted([(i, cumulative[i])
for i in range(0, len(cumulative))],
key=lambda item: item[1])
It looks like...
[ (1, -3)
, (2, 3)
, (3, 3)
, (0, 6) ]
Pick the top scoring items deleting the excluded items as we go, this way we retain the best pairs and discard the worst...
def get_best_pairs(a, b):
pairs = get_pairs(a, b)
excludes = get_pairs_excluded_by_pairs(pairs)
scores = get_pairs_scores(pairs, a)
cumulative = get_cumulative_scores_for_pairs(pairs, excludes, scores)
# Sort pairs by score retaining the index to the pair
arr = sorted([(i, cumulative[i])
for i in range(0, len(cumulative))],
key=lambda item: item[1])
# Work through in order of scores to find the best pair combination
top = []
while len(arr):
topitem = arr.pop()
top.append(topitem[0])
# Remove the indices that are excluded by this one
arr = [(i, score)
for i, score in arr
if i not in excludes[topitem[0]]]
# Sort the resulting pairs by index
return sorted([pairs[i] for i in top], key=lambda item: item[0])
Our top
list would look like, where the pair with the index 1
has been dropped because it was low scoring and excluded by higher scoring pairs...
[ (0, 0)
, (1, 2)
, (2, 3) ]
6. Build the result
Sort the selected pairs and build the result by incrementing each index to the next pair. When we run out of pairs increment until we reach the end of each list...
def get_indexes_for_best_pairing(a, b):
pairs = get_best_pairs(a, b)
result = [];
i = 0
j = 0
next = None
pair = None
while True:
# This is the first loop or we just dropped a pair into the result
# vector so we need to get the next one
if next == None:
# Get the next pair and we will increment the index up to this
if len(pairs):
next = pairs.pop(0)
pair = next
# No more pairs increment the index to the end of both lists
else:
next = (len(a), len(b))
pair = None
# We increment the index of the first list first
if i < next[0]:
result.append((i, None))
i += 1
# We increment the index of the second list when first has reached
# the next pair
elif j < next[1]:
result.append((None, j))
j += 1
# If both indexes are fully incremented up to the next pair and we
# have a pair to add we add it to the result and increment both
# clearing the next parameter so we get a new one next time around
elif pair != None:
result.append((pair[0], pair[1]));
i += 1
j += 1
next = None
# We reached the end
else:
break
return result
And finally our result would look like...
[ (0, 0)
, (None, 1)
, (1, 2)
, (2, 3)
, (3, None) ]