1
votes

I am trying to parse an XML dump of Wikipedia to find certain links on each page using the Haskell Parsec library. Links are denoted by double brackets: texttext[[link]]texttext. To simplify the scenario as much as possible, let's say I am looking for the first link not enclosed in double curly braces (which can be nested): {{ {{ [[Wrong Link]] }} [[Wrong Link]] }} [[Right Link]]. I have written a parser to discard links which are enclosed in non-nested double braces:

import Text.Parsec

getLink :: String -> Either ParseError String
getLink = parse linkParser "Links"

linkParser = do
    beforeLink
    link <- many $ noneOf "]"
    string "]]"
    return link

beforeLink = manyTill (many notLink) (try $ string "[[")

notLink = try doubleCurlyBrac <|> (many1 normalText)

normalText = noneOf "[{"
           <|> notFollowedByItself '['
           <|> notFollowedByItself '{'

notFollowedByItself c = try ( do x <- char c
                                 notFollowedBy $ char c
                                 return x)

doubleCurlyBrac = between (string "{{") (string "}}") (many $ noneOf "}")

getLinkTest = fmap getLink testList
    where testList = ["   [[rightLink]]   "                            --Correct link is found
                     , "  {{    [[Wrong_Link]]    }}  [[rightLink]]"   --Correct link is found
                     , "  {{  {{ }} [[Wrong_Link]] }} [[rightLink]]" ] --Wrong link is found 

I have tried making the doubleCurlyBrac parser recursive to also discard links in nested curly braces, without success:

doubleCurlyBrac = between (string "{{") (string "}}") betweenBraces
        where betweenBraces = doubleCurlyBrac <|> (many $ try $ noneOf "}")

This parser stops consuming input after the first }}, rather than the final one, in a nested example. Is there an elegant way to write a recursive parser to (in this case) correctly ignore links in nested double curly braces? Also, can it be done without using try? I have found that since try does not consume input, it often causes the parser to hang on unexpected, ill-formed input.

2
How do you want "{{ab}cd}}" parsed?Karolis Juodelė
A more detailed grammar description would be helpful.ntc2
@KarolisJuodelė In that example, the parser should pick out ab}cd.John G
@JohnG it should, but the noneOf "}" will stop after ab.Karolis Juodelė
@KarolisJuodelė I was trying to simplify as much as possible for the sake of clarity in the question. My actual code was something like many (noneOf "}" <|> notFollowedByIteslf '}')John G

2 Answers

2
votes

Here's a more direct version that doesn't use a custom lexer. It does use try though, and I don't see how to avoid it here. The problem is that it seems we need a non-committing look ahead to distinguish double brackets from single brackets; try is for non-committing look ahead.

The high level approach is that same as in my first answer. I've been careful to make the three node parsers commute -- making the code more robust to change -- by using both try and notFollowedBy:

{-# LANGUAGE TupleSections #-}
import Text.Parsec hiding (string)
import qualified Text.Parsec
import Control.Applicative ((<$>) , (<*) , (<*>))
import Control.Monad (forM_)
import Data.List (find)

import Debug.Trace

----------------------------------------------------------------------
-- Token parsers.

llink , rlink , lbrace , rbrace :: Parsec String u String
[llink , rlink , lbrace , rbrace] = reserved
reserved = map (try . Text.Parsec.string) ["[[" , "]]" , "{{" , "}}"]

----------------------------------------------------------------------
-- Node parsers.

-- Link, braces, or string.
data Node = L [Node] | B [Node] | S String deriving Show

nodes :: Parsec String u [Node]
nodes = many node

node :: Parsec String u Node
node = link <|> braces <|> string

link , braces , string :: Parsec String u Node
link   = L <$> between llink  rlink  nodes
braces = B <$> between lbrace rbrace nodes
string = S <$> many1 (notFollowedBy (choice reserved) >> anyChar)

----------------------------------------------------------------------

parseNodes :: String -> Either ParseError [Node]
parseNodes = parse (nodes <* eof) "<no file>"

----------------------------------------------------------------------
-- Tests.

getLink :: [Node] -> Maybe Node
getLink = find isLink where
  isLink (L _) = True
  isLink _     = False

parseLink :: String -> Either ParseError (Maybe Node)
parseLink = either Left (Right . getLink) . parseNodes

testList = [ "   [[rightLink]]   "
           , "  {{    [[Wrong_Link]]    }}  [[rightLink]]"
           , "  {{  {{ }} [[Wrong_Link]] }} [[rightLink]]"
           , " [[{{[[someLink]]}}]] {{}} {{[[asdf]]}}"
           -- Pathalogical example from comments.
           , "{{ab}cd}}"
           -- A more pathalogical example.
           , "{ [ { {asf{[[[asdfa]]]}aasdff ] ] ] {{[[asdf]]}}asdf" 
           -- No top level link.
           , "{{[[Wrong_Link]]asdf[[WRong_Link]]{{}}}}{{[[[[Wrong]]]]}}"
           -- Too many '{{'.
           , "{{ {{ {{ [[ asdf ]] }} }}"
           -- Too many '}}'.
           , "{{ {{ [[ asdf ]] }} }} }}"
           -- Too many '[['.
           , "[[ {{ [[{{[[asdf]]}}]]}}"
           ]

main =
  forM_ testList $ \ t -> do
  putStrLn $ "Test: ^" ++ t ++ "$"
  let parses = ( , ) <$> parseNodes t <*> parseLink t
      printParses (n , l) = do
        putStrLn $ "Nodes: " ++ show n
        putStrLn $ "Link: " ++ show l
      printError = putStrLn . show
  either printError printParses parses
  putStrLn ""

The output is the same in the non-error cases:

Test: ^   [[rightLink]]   $
Nodes: [S "   ",L [S "rightLink"],S "   "]
Link: Just (L [S "rightLink"])

Test: ^  {{    [[Wrong_Link]]    }}  [[rightLink]]$
Nodes: [S "  ",B [S "    ",L [S "Wrong_Link"],S "    "],S "  ",L [S "rightLink"]]
Link: Just (L [S "rightLink"])

Test: ^  {{  {{ }} [[Wrong_Link]] }} [[rightLink]]$
Nodes: [S "  ",B [S "  ",B [S " "],S " ",L [S "Wrong_Link"],S " "],S " ",L [S "rightLink"]]
Link: Just (L [S "rightLink"])

Test: ^ [[{{[[someLink]]}}]] {{}} {{[[asdf]]}}$
Nodes: [S " ",L [B [L [S "someLink"]]],S " ",B [],S " ",B [L [S "asdf"]]]
Link: Just (L [B [L [S "someLink"]]])

Test: ^{{ab}cd}}$
Nodes: [B [S "ab}cd"]]
Link: Nothing

Test: ^{ [ { {asf{[[[asdfa]]]}aasdff ] ] ] {{[[asdf]]}}asdf$
Nodes: [S "{ [ { {asf{",L [S "[asdfa"],S "]}aasdff ] ] ] ",B [L [S "asdf"]],S "asdf"]
Link: Just (L [S "[asdfa"])

Test: ^{{[[Wrong_Link]]asdf[[WRong_Link]]{{}}}}{{[[[[Wrong]]]]}}$
Nodes: [B [L [S "Wrong_Link"],S "asdf",L [S "WRong_Link"],B []],B [L [L [S "Wrong"]]]]
Link: Nothing

but the parse error messages are not as informative in the cases of unmatched openings:

Test: ^{{ {{ {{ [[ asdf ]] }} }}$
"<no file>" (line 1, column 26):
unexpected end of input
expecting "[[", "{{", "]]" or "}}"

Test: ^{{ {{ [[ asdf ]] }} }} }}$
"<no file>" (line 1, column 26):
unexpected "}}"

Test: ^[[ {{ [[{{[[asdf]]}}]]}}$
"<no file>" (line 1, column 25):
unexpected end of input
expecting "[[", "{{", "]]" or "}}"

and I couldn't figure out how to fix them.

1
votes

My solution does not use try, but is relatively complicated: I used your question as an excuse to learn how to create a lexer in Parsec without using makeTokenParser :D I avoid try because the only look ahead happens in the lexer (tokenize), where the various bracket pairs are identified.

The high level idea is that we treat {{, }}, [[, and ]] as special tokens and parse the input into an AST. You didn't specify the grammar precisely, so I chose a simple one that generates your examples:

node ::= '{{' node* '}}'
       | '[[' node* ']]'
       | string
string ::= <non-empty string without '{{', '}}', '[[', or ']]'>

I parse an input string into a list of nodes. The first top-level link ([[) node, if any, is the link you're looking for.

The approach I've taken should be relatively robust to grammar changes. E.g., if you want to allow only strings inside links, then change '[[' node* ']]' to '[[' string ']]'. (In the code

link = L <$> between llink  rlink  nodes

becomes

link = L <$> between llink  rlink  string

).

The code is fairly long, but mostly straightforward. Most of it concerns creating the token stream (lexing) and parsing the individual tokens. After this, the actual Node parsing is very simple.

Here it is:

{-# LANGUAGE TupleSections #-}
import Text.Parsec hiding (char , string)
import Text.Parsec.Pos (updatePosString , updatePosChar)
import Control.Applicative ((<$>) , (<*) , (<*>))
import Control.Monad (forM_)
import Data.List (find)

----------------------------------------------------------------------
-- Lexing.

-- Character or punctuation.
data Token = C Char | P String deriving Eq

instance Show Token where
  show (C c) = [c]
  show (P s) = s

tokenize :: String -> [Token]
tokenize [] = []
tokenize [c] = [C c]
tokenize (c1:c2:cs) = case [c1,c2] of
  "[[" -> ts
  "]]" -> ts
  "{{" -> ts
  "}}" -> ts
  _    -> C c1 : tokenize (c2:cs)
  where
    ts = P [c1,c2] : tokenize cs

----------------------------------------------------------------------
-- Token parsers.

-- We update the 'sourcePos' while parsing the tokens.  Alternatively,
-- we could have annotated the tokens with positions above in
-- 'tokenize', and then here we would use 'token' instead of
-- 'tokenPrim'.
llink , rlink , lbrace , rbrace :: Parsec [Token] u Token
[llink , rlink , lbrace , rbrace] =
  map (t . P) ["[[" , "]]" , "{{" , "}}"]
  where
    t x = tokenPrim show update match where
      match y = if x == y then Just x else Nothing
      update pos (P s) _ = updatePosString pos s

char :: Parsec [Token] u Char
char = tokenPrim show update match where
  match (C c) = Just c
  match (P _) = Nothing
  update pos (C c) _ = updatePosChar pos c

----------------------------------------------------------------------
-- Node parsers.

-- Link, braces, or string.
data Node = L [Node] | B [Node] | S String deriving Show

nodes :: Parsec [Token] u [Node]
nodes = many node

node :: Parsec [Token] u Node
node = link <|> braces <|> string

link , braces , string :: Parsec [Token] u Node
link   = L <$> between llink  (rlink <?> "]]")  nodes
braces = B <$> between lbrace (rbrace <?> "}}") nodes
string = S <$> many1 char

----------------------------------------------------------------------

parseNodes :: String -> Either ParseError [Node]
parseNodes = parse (nodes <* eof) "<no file>" . tokenize

----------------------------------------------------------------------
-- Tests.

getLink :: [Node] -> Maybe Node
getLink = find isLink where
  isLink (L _) = True
  isLink _     = False

parseLink :: String -> Either ParseError (Maybe Node)
parseLink = either Left (Right . getLink) . parseNodes

testList = [ "   [[rightLink]]   "
           , "  {{    [[Wrong_Link]]    }}  [[rightLink]]"
           , "  {{  {{ }} [[Wrong_Link]] }} [[rightLink]]"
           , " [[{{[[someLink]]}}]] {{}} {{[[asdf]]}}"
           -- Pathalogical example from comments.
           , "{{ab}cd}}"
           -- A more pathalogical example.
           , "{ [ { {asf{[[[asdfa]]]}aasdff ] ] ] {{[[asdf]]}}asdf" 
           -- No top level link.
           , "{{[[Wrong_Link]]asdf[[WRong_Link]]{{}}}}{{[[[[Wrong]]]]}}"
           -- Too many '{{'.
           , "{{ {{ {{ [[ asdf ]] }} }}"
           -- Too many '}}'.
           , "{{ {{ [[ asdf ]] }} }} }}"
           -- Too many '[['.
           , "[[ {{ [[{{[[asdf]]}}]]}}"
           ]

main =
  forM_ testList $ \ t -> do
  putStrLn $ "Test: ^" ++ t ++ "$"
  let parses = ( , ) <$> parseNodes t <*> parseLink t
      printParses (n , l) = do
        putStrLn $ "Nodes: " ++ show n
        putStrLn $ "Link: " ++ show l
      printError = putStrLn . show
  either printError printParses parses
  putStrLn ""

The output from main is:

Test: ^   [[rightLink]]   $
Nodes: [S "   ",L [S "rightLink"],S "   "]
Link: Just (L [S "rightLink"])

Test: ^  {{    [[Wrong_Link]]    }}  [[rightLink]]$
Nodes: [S "  ",B [S "    ",L [S "Wrong_Link"],S "    "],S "  ",L [S "rightLink"]]
Link: Just (L [S "rightLink"])

Test: ^  {{  {{ }} [[Wrong_Link]] }} [[rightLink]]$
Nodes: [S "  ",B [S "  ",B [S " "],S " ",L [S "Wrong_Link"],S " "],S " ",L [S "rightLink"]]
Link: Just (L [S "rightLink"])

Test: ^ [[{{[[someLink]]}}]] {{}} {{[[asdf]]}}$
Nodes: [S " ",L [B [L [S "someLink"]]],S " ",B [],S " ",B [L [S "asdf"]]]
Link: Just (L [B [L [S "someLink"]]])

Test: ^{{ab}cd}}$
Nodes: [B [S "ab}cd"]]
Link: Nothing

Test: ^{ [ { {asf{[[[asdfa]]]}aasdff ] ] ] {{[[asdf]]}}asdf$
Nodes: [S "{ [ { {asf{",L [S "[asdfa"],S "]}aasdff ] ] ] ",B [L [S "asdf"]],S "asdf"]
Link: Just (L [S "[asdfa"])

Test: ^{{[[Wrong_Link]]asdf[[WRong_Link]]{{}}}}{{[[[[Wrong]]]]}}$
Nodes: [B [L [S "Wrong_Link"],S "asdf",L [S "WRong_Link"],B []],B [L [L [S "Wrong"]]]]
Link: Nothing

Test: ^{{ {{ {{ [[ asdf ]] }} }}$
"<no file>" (line 1, column 26):
unexpected end of input
expecting }}

Test: ^{{ {{ [[ asdf ]] }} }} }}$
"<no file>" (line 1, column 24):
unexpected }}
expecting end of input

Test: ^[[ {{ [[{{[[asdf]]}}]]}}$
"<no file>" (line 1, column 25):
unexpected end of input
expecting ]]