Answering my own question: I have created a pull request that adds breakSubstring
to Data.ByteString.Lazy
(adapted from the strict version).
Until the pull request is merged, the following code can be used:
{-# LANGUAGE BangPatterns
module Lib (breakSubstring) where
import Data.Bits (finiteBitSize, shiftL, (.|.), (.&.))
import Data.Word (Word32)
import Prelude
import qualified Data.ByteString.Lazy as BSL
breakSubstring
:: BSL.ByteString
-> BSL.ByteString
-> (BSL.ByteString, BSL.ByteString)
breakSubstring pat =
case lp of
0 -> \src -> (BSL.empty, src)
1 -> BSL.break (== BSL.head pat)
_ -> if lp * 8 <= fromIntegral (finiteBitSize (0 :: Word))
then shift
else karpRabin
where
lp = BSL.length pat
karpRabin :: BSL.ByteString -> (BSL.ByteString, BSL.ByteString)
karpRabin src
| BSL.length src < lp = (src, BSL.empty)
| otherwise = search (rollingHash $ BSL.take lp src) lp
where
k = 2891336453 :: Word32
rollingHash = BSL.foldl' (\h b -> h * k + fromIntegral b) 0
hp = rollingHash pat
m = k ^ lp
get = fromIntegral . BSL.index src
search !hs !i
| hp == hs && pat == BSL.take lp b = u
| BSL.length src <= i = (src, BSL.empty)
| otherwise = search hs' (i + 1)
where
u@(_, b) = BSL.splitAt (i - lp) src
hs' = hs * k +
get i -
m * get (i - lp)
{-# INLINE karpRabin
shift :: BSL.ByteString -> (BSL.ByteString, BSL.ByteString)
shift !src
| BSL.length src < lp = (src, BSL.empty)
| otherwise = search (intoWord $ BSL.take lp src) lp
where
intoWord :: BSL.ByteString -> Word
intoWord = BSL.foldl' (\w b -> (w `shiftL` 8) .|. fromIntegral b) 0
wp = intoWord pat
mask = (1 `shiftL` fromIntegral (8 * lp)) - 1
search !w !i
| w == wp = BSL.splitAt (i - lp) src
| BSL.length src <= i = (src, BSL.empty)
| otherwise = search w' (i + 1)
where
b = fromIntegral (BSL.index src i)
w' = mask .&. ((w `shiftL` 8) .|. b)
{-# INLINE shift
breakSubstring
(it usestake
andisPrefixOf
which are available for Lazy bytestrings) a suitable solution? – user2407038unsafeX
functions that are not available for lazy bytestrings. Maybe I can use the normal functions. – Martijn Rijkeboer