0
votes

Let us have a sequence of numbers. Say {1, 2, 3, 4, 5}. And a set of subsequence substitution with weights. For example:
{1} -> {3} : 0.2 - witch means 1 could be substituted by 3 with weight 0.1
{2, 3, 4} -> {4, 3} : 0.3
{5} -> {2} : 0.4

I need to find all the sequence we can get using substitutions with a restriction to weight.

The restriction works like this: the sum of substitutions weights of any n (3) items (in a row) should less given number e<= 0.5.
For our example the result is:
{1, 2, 3, 4, 5} : substitution weights: {0, 0, 0, 0, 0}. Sum of any 3 items in a row less than 0.5
{3, 2, 3, 4, 5} : substitution weights: {0.2, 0, 0, 0, 0}
{1, 4, 3, 5} : substitution weights: {0, 0.3/3, 0.3/3, 0.3/3, 0} the sequence of 3 symbols so /3
{3, 4, 3, 5} substitution weights: {0.2, 0.3/3, 0.3/3, 0.3/3, 0} {1, 2, 3, 4, 2} substitution weights: {0, 0, 0, 0, 0.4}
{3, 2, 3, 4, 2} substitution weights: {0.2, 0, 0, 0, 0.4}

We do not allow {1, 4, 3, 2} because substitution weights {0, 0.3/3, 0.3/3, 0.3/3, 0.4} have 3 last items with sum weight = 0.6.

In real example subsequence substitution set is big. Almost any short subsequence has a replacement. It's obvious that the task could be done using brute force. But I'm looking for a way to do it fast. Any help would be appreciated.

UPDATED
Actually, I deal with strings instead of sequence of number. So far I came up with the following agorithm (implemented in C#):

public class Substitution
{
    public string SubstituteWhat { get; private set; }
    public string SubstituteTo { get; private set; }
    public double  TotalWeight { get; private set; }
    public double  SymbolWeight { get; private set; }

    public Substitution(string substituteWhat, string substituteTo, double totalWeight)
    {
        SubstituteWhat = substituteWhat;
        SubstituteTo = substituteTo;
        TotalWeight = totalWeight;
        SymbolWeight = TotalWeight/SubstituteWhat.Length;
    }

    public override string ToString()
    {
        return string.Format("SubstituteWhat: {0}, SubstituteTo: {1}, TotalWeight: {2}", SubstituteWhat, SubstituteTo, TotalWeight);
    }
}

class SubstitutedStringWindowImpl
{
    public string OriginalPhrase { get; set; }
    public string SubstitutedPhrase { get; set; }

    private double[] weightVector;
    private double windowSum;
    private int addItemPointer;
    private int substructItemPoiner;

    public SubstitutedStringWindowImpl(string phrase, int windowLength)
    {
        this.OriginalPhrase = phrase;
        SubstitutedPhrase = String.Empty;
        weightVector = new double[phrase.Length + windowLength ];
        windowSum = 0;
        substructItemPoiner = 0;
        addItemPointer = windowLength;
    }

    public bool PerformSubstitution(
        Substitution substitution, 
        double maxWeight,
        out SubstitutedStringWindowImpl substitutedString)
    {
        substitutedString = MemberwiseClone() as SubstitutedStringWindowImpl;
        substitutedString.weightVector = weightVector.ToArray();

        for (int i = 0; i < substitution.SubstituteWhat.Length; i++)
        {
            substitutedString.weightVector[substitutedString.addItemPointer] = substitution.SymbolWeight;
            substitutedString.windowSum = substitutedString.windowSum -
                                          substitutedString.weightVector[substitutedString.substructItemPoiner] +
                                          substitutedString.weightVector[substitutedString.addItemPointer];

            substitutedString.substructItemPoiner++;
            substitutedString.addItemPointer++;

            if (substitutedString.windowSum > maxWeight)
                return false;

            if (substitutedString.addItemPointer == substitutedString.weightVector.Length)
                break;
        }

        substitutedString.SubstitutedPhrase = SubstitutedPhrase + substitution.SubstituteTo;

        return true;
    }
}
internal class SubstitutionManagerWindowImpl
{

    private readonly Dictionary<string, List<Substitution>> substitutionsDict;
    private readonly double maxWeight;
    private readonly int maxSubstitutionLength;
    private readonly int windowLength;

    public SubstitutionManagerWindowImpl(
        List<Substitution> substitutions,
        double maxWeight,
        int windowLength)
    {
        this.substitutionsDict = substitutions.GroupBy(x => x.SubstituteWhat)
            .ToDictionary(x => x.Key, x => x.ToList());
        this.maxWeight = maxWeight;
        this.windowLength = windowLength;
        maxSubstitutionLength = substitutions.Max(x => x.SubstituteWhat.Length);
    }


    private List<SubstitutedStringWindowImpl> GetAllSubstitutionsPrivate(
        SubstitutedStringWindowImpl stringToHandle, int symbolCount)
    {
        if (stringToHandle.OriginalPhrase.Length == symbolCount)
            return new List<SubstitutedStringWindowImpl> {stringToHandle};

        var result = new List<SubstitutedStringWindowImpl>();

        for (int i = 1;
            i <= Math.Min(maxSubstitutionLength, stringToHandle.OriginalPhrase.Length - symbolCount);
            i++)
        {
            var subphraseToSubstitute = stringToHandle.OriginalPhrase.Substring(symbolCount, i);

            List<Substitution> appropriateSubstitutions;
            if (!substitutionsDict.TryGetValue(subphraseToSubstitute, out appropriateSubstitutions))
                continue;

            foreach (var substitution in appropriateSubstitutions)
            {
                SubstitutedStringWindowImpl handledString;

                if (!stringToHandle.PerformSubstitution(substitution, maxWeight, out handledString))
                    continue;

                result.AddRange(GetAllSubstitutionsPrivate(handledString,
                    symbolCount + substitution.SubstituteWhat.Length));
            }
        }

        return result;
    }

    // this is the entry function
    public List<string> GetAllSubstitutions(string phrase)
    {
        var res = GetAllSubstitutionsPrivate(new SubstitutedStringWindowImpl(phrase,windowLength), 0);
        return res.Select(x => x.SubstitutedPhrase).ToList();

    }
}

But it seems that it doesn't do the work fast enoung. Any suggestions how to improve it?

3
You did not show 12342 which should be a valid substitution. Can you explain, please? - George Polevoy
Did you find any way to do it? What about the different options in the answers? - samy

3 Answers

2
votes

It doesn't seem as though it would take much sophistication to be able to enumerate with polynomial delay. Define a recursive procedure

all_substitutions(substitutions, first_half, last_n_weights, second_half)

where first_half is a substituted prefix of the input and second_half is the unsubstituted suffix of the input such that the two halves make up the whole input. The body of all_substitutions should test whether second_half is empty. If so, yield first_half and return. Otherwise, for each feasible substitution at the beginning of second_half, make it and recurse appropriately (where we count substituting a singleton for itself as a feasible substitution with weight 0).

If a procedure along these lines is not fast enough, then it would be helpful to see actual code, because further improvements will be data-structural.

0
votes

I've tried to have a look at this, and was bothered by the fact that I couldn't seem to be able to cache the results, which is usually a very efficient optimisation. After some tweaking I found a way.

The code substitutes in a sequence as soon as possible and then recursively substitutes the remaining sequence on the right. It doesn't bother passing to the subsequence being substituted the previous cost of substitutions, since the subsequence will be reanalyzed at a later time without the first substitution, hence changing the previous cost value.

However, cost is taken into account when the parent sequence receives the result of substitutions in the sub-sequences: it only keeps those that satisfy the fact that their substitution cost won't blow the cost limit. Of course as in every cache mechanism you trade memory for speed, but the cache is very useful here. (Ideally anyway, the caching should be independent from the CappedCostSubstitutionEnumerator; using AOP you could inject when needed)

We first define the Sequence and the Substitution objects

public class Sequence<T>: List<T>
{
    public double Cost { get; set; }

    public override string ToString() // for the cache key
    {
        return string.Join("|", this.Select(t => t.ToString()));
    }

    public Sequence(IEnumerable<T> sequence)
    {
        this.AddRange(sequence);
        Cost = 0;
    }
}

public class Substitution<T>
{
    public IEnumerable<T> OriginalSequence { get; set; }
    public IEnumerable<T> SwappedSequence { get; set; }
    public double Cost { get; set; }
}

Now we can define the substitutions enumerator

public class CappedCostSubstitutionEnumerator<T>
{
    private static Dictionary<string, List<Sequence<T>>> Cache { get; set; }

    static CappedCostSubstitutionEnumerator()
    {
        Cache = new Dictionary<string, List<Sequence<T>>>();
    }

    public double AuthorizedCost { get; set; }
    public List<Substitution<T>> Substitutions { get; set; }

    public CappedCostSubstitutionEnumerator(double maxAuthorizedCost)
    {
        Substitutions = new List<Substitution<T>>();
        AuthorizedCost = maxAuthorizedCost;
    }

    public List<List<T>> EnumerateSequenceSubstitutions(List<T> sequence)
    {
        var values = EnumerateSequenceSubstitutions(new Sequence<T>(sequence));
        return values.Select(s => s as List<T>).ToList();
    }

    private List<Sequence<T>> EnumerateSequenceSubstitutions(Sequence<T> sourceSequence)
    {
        var sourceSequenceKey = sourceSequence.ToString();
        if (Cache.ContainsKey(sourceSequenceKey))
        {
            return Cache[sourceSequenceKey];
        }
        else
        {
            var sequenceSubstitutionsResults = new List<Sequence<T>> { sourceSequence };

            foreach (var substitution in Substitutions.Where(substitution => substitution.Cost <= AuthorizedCost))
            {
                SubstituteWhereOriginalSubstitutionSequenceIsFound(sourceSequence, substitution, sequenceSubstitutionsResults);
            }

            Cache.Add(sourceSequenceKey, sequenceSubstitutionsResults);

            return sequenceSubstitutionsResults;
        }
    }

    private void SubstituteWhereOriginalSubstitutionSequenceIsFound(Sequence<T> sourceSequence, Substitution<T> substitution,
        List<Sequence<T>> sequenceSubstitutionsResults)
    {
        var indexInSequence = 0;
        var originalSequenceLength = substitution.OriginalSequence.Count();
        var upperIndexInSequence = sourceSequence.Count() - originalSequenceLength;
        // we are going to look for the substitution pattern at each possible place in the source sequence
        while (indexInSequence <= upperIndexInSequence)
        {
            var evaluatedMatch = sourceSequence.Skip(indexInSequence).Take(originalSequenceLength);
            if (evaluatedMatch.SequenceEqual<T>(substitution.OriginalSequence))
            {
                var start = sourceSequence.Take(indexInSequence);
                var substituted = substitution.SwappedSequence;
                var remain = sourceSequence.Skip(indexInSequence + originalSequenceLength);
                // now we want the subsequences without taking cost into account
                var subSequences = EnumerateSequenceSubstitutions(new Sequence<T>(remain));
                // we filter the subsequences on the total cost here
                foreach (
                    var subSequence in
                        subSequences.Where(
                            subSeq => (subSeq.Cost + substitution.Cost) <= (AuthorizedCost - sourceSequence.Cost)))
                {
                    sequenceSubstitutionsResults.Add(
                        new Sequence<T>(start.Concat(substituted).Concat(subSequence))
                        {
                            Cost = substitution.Cost + subSequence.Cost
                        });
                }
            }
            indexInSequence++;
        }
    }
}

Some points:

  • I use lists but it may not be the most efficient data holder; in fact I'm pretty sure that memory fragmentation could be an issue. Using a memory pool could be a good idea if it causes problems
  • Walking the sequence one element at a time is not the most efficient, there are some better ways of finding sequence occurences
  • The cache really helps, for the example below it is hit 1796 times, for 53 insertions

To use it do something like this:

private static void Main(string[] args)
{
    var p = new CappedCostSubstitutionEnumerator<int>(0.5);
    p.Substitutions.Add(new Substitution<int>() {OriginalSequence = new int[] {1}, SwappedSequence = new int[] {3}, Cost = 0.2});
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3,4 }, SwappedSequence = new int[] { 4,3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5 }, SwappedSequence = new int[] { 2 }, Cost = 0.4 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5,1 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.1 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] {  4 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });

    var results = p.EnumerateSequenceSubstitutions(new List<int>() { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5 });
    // results contains 5390 values
}
0
votes

Here is my solution, along with some tests. Substitutions for a sequence of 30 tokens is computed in under one second on my computer.

It works as a binary tree, and caches intermediate results in a context object, so all the branches share the cache. The problem is it's not easy to split the source to two subtrees, because there are various token lengths. It's solved with repeating splitting at different points based on every possible token length, so every token length get's it's chance in both subtrees.

There are a lot of optimizations involved, including pre-filtering replacement tokens by their 'budgets' slots (it's impossible to compute for every budget).

Note that original item is not included in the results as an optimization for recursive function. Also, i have replaced double with decimal to avoid the rounding errors, because it substracts costs from budgets. Actually, i'd recommend replacing costs with integers in different range.

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace SubstitutionsWithBudget
{
    [TestClass]
    public class SubstitutionTest
    {
        private static readonly List<Substitution> Substitutions = new List<Substitution>
            {
                new Substitution("1", "3", 0.2m),
                new Substitution("234", "43", 0.3m),
                new Substitution("5", "2", 0.4m)
            };

        private readonly SubstitutionsDefinition substitutionsDefinition = new SubstitutionsDefinition(Substitutions);

        [TestMethod]
        public void OriginalQuestion()
        {
            string source = "12345";

            var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, 0.6m);

            foreach (var variant in variants)
            {
                Console.WriteLine(variant);
            }

            Assert.AreEqual(5, variants.Count());
        }

        [TestMethod]
        public void MultiplicityTest()
        {
            const int multiplicity = 6;
            string source = string.Join("", Enumerable.Repeat("12345", multiplicity));

            var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, multiplicity * 0.6m).ToList();

            foreach (var variant in variants.Take(10))
            {
                Console.WriteLine(variant);
            }
        }

        [TestMethod]
        public void SomeUsefulApplication()
        {
            var substitutions = new List<Substitution>
            {
                new Substitution("monkey", "elephant", 2m),
                new Substitution("monkey", "shark", 3m),
                new Substitution("banana", "apple", 1m),
                new Substitution("feed", "kill", 4m),
            };
            var resultingSubstitutions = EnumerateSubstitutions(
                new Context(),
                new SubstitutionsDefinition(substitutions),
                "feed monkey with banana",
                4)
                .OrderBy(s => s.Item2).ToList();
            foreach (var substitution in resultingSubstitutions)
            {
                Console.WriteLine(substitution);
            }

            Assert.IsTrue(resultingSubstitutions.Any(
                s => s.Item1 == "feed elephant with banana"));
            Assert.IsFalse(resultingSubstitutions.Any(
                s => s.Item1 == "kill shark with banana"));
        }

        IEnumerable<Tuple<string, decimal>> EnumerateSubstitutions(Context context, SubstitutionsDefinition substitutionsDefinition, string source, decimal maxCost)
        {
            if (source.Length == 0)
            {
                yield break;
            }
            var possibleSubstitutions = substitutionsDefinition.GetSubstitutions(source, maxCost).ToList();

            // find substitutions of whole string
            foreach (var possibleSubstitution in possibleSubstitutions)
            {
                yield return Tuple.Create(possibleSubstitution.Destination, possibleSubstitution.Weight);
            }

            // Variable split boundary to accomodate tokens of different length
            var middle = source.Length / 2;
            var window = substitutionsDefinition.MaxLength - 1;

            if (window > source.Length - 2)
            {
                window = source.Length - 2;
            }
            var min = middle - window / 2;

            var returned = new HashSet<Tuple<string, decimal>> { Tuple.Create(source, 0m) };

            for (var i = 0; i <= window; i++)
            {
                var divideAt = min + i;
                var left = source.Substring(0, divideAt);
                var right = source.Substring(divideAt, source.Length - divideAt);
                var q =
                    from leftSubstitution in context.GetCachedResult(Tuple.Create(left, maxCost),
                        () => EnumerateSubstitutions(context, substitutionsDefinition, left, maxCost)).Concat(new[] { Tuple.Create(left, 0m) })
                    let leftCost = leftSubstitution.Item2
                    from rightSubstitution in context.GetCachedResult(Tuple.Create(right, maxCost - leftCost),
                    () => EnumerateSubstitutions(context, substitutionsDefinition, right, maxCost - leftCost)).Concat(new[] { Tuple.Create(right, 0m) })
                    where leftCost + rightSubstitution.Item2 <= maxCost
                    select new { leftSubstitution, rightSubstitution };
                q = q.ToList();

                foreach (var item in q.Select(pair => Tuple.Create(pair.leftSubstitution.Item1 + pair.rightSubstitution.Item1,
                    pair.leftSubstitution.Item2 + pair.rightSubstitution.Item2)).Where(item => !returned.Contains(item)))
                {
                    yield return item;
                    returned.Add(item);
                }
            }
        }
    }
    public struct Substitution
    {
        public readonly string Souce;
        public readonly string Destination;
        public readonly decimal Weight;

        public Substitution(string souce, string destination, decimal weight)
        {
            Souce = souce;
            Destination = destination;
            Weight = weight;
        }
    }

    public class Context
    {
        private readonly Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>> cache = new Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>>();

        public IEnumerable<Tuple<string, decimal>> GetCachedResult(Tuple<string, decimal> key, Func<IEnumerable<Tuple<string, decimal>>> create)
        {
            List<Tuple<string, decimal>> result;
            cache.TryGetValue(key, out result);
            if (result != null) return result;
            result = create().ToList();
            cache.Add(key, result);
            return result;
        }

        public void AddToCache(Tuple<string, decimal> key, List<Tuple<string, decimal>> result)
        {
            cache.Add(key, result);
        }
    }

    public class SubstitutionsDefinition
    {
        private readonly decimal maxCost;
        private const int Granularity = 10;

        /// <summary>
        /// Holds substitutions lookups based on budget slots.
        /// A slot with higher budget also holds values of all lower-budget slots.
        /// </summary>
        private readonly ILookup<int, ILookup<string, Substitution>> byCost;

        private readonly int maxLength;

        private readonly ILookup<string, Substitution> simple;

        private bool simpleMode;

        public int MaxLength { get { return maxLength; } }

        // Really helpful if there are a lot of substitutions
        public IEnumerable<Substitution> GetSubstitutions(string token, decimal budget)
        {
            return simpleMode
                ? GetSubstitutionsSmallSet(token, budget)
                : GetSubstitutionsLargeSet(token, budget);
        }

        private IEnumerable<Substitution> GetSubstitutionsLargeSet(string token, decimal budget)
        {
            return byCost[GetSlot(budget)].SelectMany(i => i[token]).Where(s => s.Weight <= budget);
        }

        public IEnumerable<Substitution> GetSubstitutionsSmallSet(string token, decimal budget)
        {
            return simple[token].Where(s => s.Weight <= budget);
        }

        public SubstitutionsDefinition(IEnumerable<Substitution> substitutions)
        {
            var list = substitutions.ToList();
            simpleMode = list.Count < 20; // Need to be found experimentally.
            simple = list.ToLookup(i => i.Souce);
            maxCost = list.Max(s => s.Weight);
            maxLength = list.Max(s => s.Souce.Length);
            var byLength = list.ToLookup(s => s.Souce.Length);
            var q =
                from length in list.Select(i => i.Souce.Length).Distinct()
                from budgetSlot in Enumerable.Range(0, Granularity + 1)
                from item in byLength[length]
                where item.Weight <= GetCost(budgetSlot)
                group item by budgetSlot;
            byCost = q.ToLookup(i => i.Key, i => i.ToLookup(j => j.Souce));
        }

        private decimal GetCost(int slot)
        {
            var d = maxCost * slot;
            return d / Granularity;
        }

        private int GetSlot(decimal weight)
        {
            var maxWeight = Math.Min(weight, maxCost);
            return (int)(Granularity * maxWeight / maxCost);
        }
    }
}