My solution does not use try
, but is relatively complicated: I used
your question as an excuse to learn how to create a lexer in
Parsec without using
makeTokenParser
:D I avoid try
because the only look ahead happens in the lexer (tokenize
), where the various bracket pairs are identified.
The high level idea is that we treat {{
, }}
, [[
, and ]]
as
special tokens and parse the input into an AST. You didn't specify
the grammar precisely, so I chose a simple one that generates your
examples:
node ::= '{{' node* '}}'
| '[[' node* ']]'
| string
string ::= <non-empty string without '{{', '}}', '[[', or ']]'>
I parse an input string into a list of nodes. The first top-level
link ([[
) node, if any, is the link you're looking for.
The approach I've taken should be relatively robust to grammar
changes. E.g., if you want to allow only strings inside links, then
change '[[' node* ']]'
to '[[' string ']]'
. (In the code
link = L <$> between llink rlink nodes
becomes
link = L <$> between llink rlink string
).
The code is fairly long, but mostly straightforward. Most of it
concerns creating the token stream (lexing) and parsing the individual
tokens. After this, the actual Node
parsing is very simple.
Here it is:
{-# LANGUAGE TupleSections #-}
import Text.Parsec hiding (char , string)
import Text.Parsec.Pos (updatePosString , updatePosChar)
import Control.Applicative ((<$>) , (<*) , (<*>))
import Control.Monad (forM_)
import Data.List (find)
----------------------------------------------------------------------
-- Lexing.
-- Character or punctuation.
data Token = C Char | P String deriving Eq
instance Show Token where
show (C c) = [c]
show (P s) = s
tokenize :: String -> [Token]
tokenize [] = []
tokenize [c] = [C c]
tokenize (c1:c2:cs) = case [c1,c2] of
"[[" -> ts
"]]" -> ts
"{{" -> ts
"}}" -> ts
_ -> C c1 : tokenize (c2:cs)
where
ts = P [c1,c2] : tokenize cs
----------------------------------------------------------------------
-- Token parsers.
-- We update the 'sourcePos' while parsing the tokens. Alternatively,
-- we could have annotated the tokens with positions above in
-- 'tokenize', and then here we would use 'token' instead of
-- 'tokenPrim'.
llink , rlink , lbrace , rbrace :: Parsec [Token] u Token
[llink , rlink , lbrace , rbrace] =
map (t . P) ["[[" , "]]" , "{{" , "}}"]
where
t x = tokenPrim show update match where
match y = if x == y then Just x else Nothing
update pos (P s) _ = updatePosString pos s
char :: Parsec [Token] u Char
char = tokenPrim show update match where
match (C c) = Just c
match (P _) = Nothing
update pos (C c) _ = updatePosChar pos c
----------------------------------------------------------------------
-- Node parsers.
-- Link, braces, or string.
data Node = L [Node] | B [Node] | S String deriving Show
nodes :: Parsec [Token] u [Node]
nodes = many node
node :: Parsec [Token] u Node
node = link <|> braces <|> string
link , braces , string :: Parsec [Token] u Node
link = L <$> between llink (rlink <?> "]]") nodes
braces = B <$> between lbrace (rbrace <?> "}}") nodes
string = S <$> many1 char
----------------------------------------------------------------------
parseNodes :: String -> Either ParseError [Node]
parseNodes = parse (nodes <* eof) "<no file>" . tokenize
----------------------------------------------------------------------
-- Tests.
getLink :: [Node] -> Maybe Node
getLink = find isLink where
isLink (L _) = True
isLink _ = False
parseLink :: String -> Either ParseError (Maybe Node)
parseLink = either Left (Right . getLink) . parseNodes
testList = [ " [[rightLink]] "
, " {{ [[Wrong_Link]] }} [[rightLink]]"
, " {{ {{ }} [[Wrong_Link]] }} [[rightLink]]"
, " [[{{[[someLink]]}}]] {{}} {{[[asdf]]}}"
-- Pathalogical example from comments.
, "{{ab}cd}}"
-- A more pathalogical example.
, "{ [ { {asf{[[[asdfa]]]}aasdff ] ] ] {{[[asdf]]}}asdf"
-- No top level link.
, "{{[[Wrong_Link]]asdf[[WRong_Link]]{{}}}}{{[[[[Wrong]]]]}}"
-- Too many '{{'.
, "{{ {{ {{ [[ asdf ]] }} }}"
-- Too many '}}'.
, "{{ {{ [[ asdf ]] }} }} }}"
-- Too many '[['.
, "[[ {{ [[{{[[asdf]]}}]]}}"
]
main =
forM_ testList $ \ t -> do
putStrLn $ "Test: ^" ++ t ++ "$"
let parses = ( , ) <$> parseNodes t <*> parseLink t
printParses (n , l) = do
putStrLn $ "Nodes: " ++ show n
putStrLn $ "Link: " ++ show l
printError = putStrLn . show
either printError printParses parses
putStrLn ""
The output from main
is:
Test: ^ [[rightLink]] $
Nodes: [S " ",L [S "rightLink"],S " "]
Link: Just (L [S "rightLink"])
Test: ^ {{ [[Wrong_Link]] }} [[rightLink]]$
Nodes: [S " ",B [S " ",L [S "Wrong_Link"],S " "],S " ",L [S "rightLink"]]
Link: Just (L [S "rightLink"])
Test: ^ {{ {{ }} [[Wrong_Link]] }} [[rightLink]]$
Nodes: [S " ",B [S " ",B [S " "],S " ",L [S "Wrong_Link"],S " "],S " ",L [S "rightLink"]]
Link: Just (L [S "rightLink"])
Test: ^ [[{{[[someLink]]}}]] {{}} {{[[asdf]]}}$
Nodes: [S " ",L [B [L [S "someLink"]]],S " ",B [],S " ",B [L [S "asdf"]]]
Link: Just (L [B [L [S "someLink"]]])
Test: ^{{ab}cd}}$
Nodes: [B [S "ab}cd"]]
Link: Nothing
Test: ^{ [ { {asf{[[[asdfa]]]}aasdff ] ] ] {{[[asdf]]}}asdf$
Nodes: [S "{ [ { {asf{",L [S "[asdfa"],S "]}aasdff ] ] ] ",B [L [S "asdf"]],S "asdf"]
Link: Just (L [S "[asdfa"])
Test: ^{{[[Wrong_Link]]asdf[[WRong_Link]]{{}}}}{{[[[[Wrong]]]]}}$
Nodes: [B [L [S "Wrong_Link"],S "asdf",L [S "WRong_Link"],B []],B [L [L [S "Wrong"]]]]
Link: Nothing
Test: ^{{ {{ {{ [[ asdf ]] }} }}$
"<no file>" (line 1, column 26):
unexpected end of input
expecting }}
Test: ^{{ {{ [[ asdf ]] }} }} }}$
"<no file>" (line 1, column 24):
unexpected }}
expecting end of input
Test: ^[[ {{ [[{{[[asdf]]}}]]}}$
"<no file>" (line 1, column 25):
unexpected end of input
expecting ]]
"{{ab}cd}}"
parsed? – Karolis Juodelėab}cd
. – John GnoneOf "}"
will stop afterab
. – Karolis Juodelėmany (noneOf "}" <|> notFollowedByIteslf '}')
– John G