Recently, I asked a question here: Boost Spirit Segfault In Parser
In that post it was pointed out that the grammar I was working with was left recursive, and that Spirit generates PEG parsers, meaning left recursion is not supported.
I converted the grammar to a non-left recursive grammar, using the rule from the section dealing with left recursion in the Dragon Book.
Given a left recursive grammar
A -> A >> alpha | beta
It can be converted to the equivalent right recursive grammar by doing the following:
A -> beta >> A'
A' -> alpha >> A' | epsilon
Here is the resulting parser, with what I believe to be the non-left-recursive productions:
// Qi grammar for the toy language, consuming tokens produced by the lexertl
// lexer below (matched by id via qi::token(k_*)).  Every "...Prime" rule is
// the A' tail of the standard left-recursion-elimination rewrite
// (A -> beta A'; A' -> alpha A' | epsilon) and therefore ends in qi::eps.
// NOTE(review): all rules are declared qi::rule<Iterator, Skipper> with no
// attribute, so %= behaves like plain = here; nothing is synthesized.
namespace interpreter {
namespace qi = boost::spirit::qi;
template <typename Iterator, typename Skipper>
struct InterpreterGrammar : qi::grammar<Iterator, Skipper>
{
// TokenDef is the lexer type; it is accepted but not used directly because
// tokens are referenced by their numeric ids (the Tokens enum).
template <typename TokenDef>
InterpreterGrammar(TokenDef const& tok)
: InterpreterGrammar::base_type(start)
{
using boost::phoenix::ref;
// A program: function definitions, then top-level statements, then EOI.
start %= functionList >> endList >> qi::eoi;
// Expression "beta" alternatives (the non-recursive heads).
// NOTE(review): a bare identifier is only accepted when followed by
// '==' identifier (first alternative); an identifier used alone as an
// operand -- e.g. the x in "y = x" -- matches nothing here, which would
// explain the parse failures described below.  TODO confirm.
exp %=
qi::token(k_alphaTerminal) >> qi::token(k_equalTo) >> qi::token(k_alphaTerminal) >> expPrime
|
qi::token(k_numericTerminal) >> expPrime
|
qi::token(k_trueTok) >> expPrime
|
qi::token(k_falseTok) >> expPrime;
// Expression tail (A'): a binary operator followed by another expression.
// Ends in qi::eps, so expPrime always succeeds, possibly consuming nothing.
// NOTE(review): because alternatives are ordered (PEG) and expPrime can
// derive epsilon, "k_minus >> exp >> expPrime" (tried earlier) succeeds
// whenever "k_minus >> exp" would; the latter alternative is unreachable.
expPrime %=
qi::token(k_equalTo) >> exp >> expPrime
|
qi::token(k_notEq) >> exp >> expPrime
|
qi::token(k_less) >> exp >> expPrime
|
qi::token(k_lessEq) >> exp >> expPrime
|
qi::token(k_greater) >> exp >> expPrime
|
qi::token(k_greaterEq) >> exp >> expPrime
|
qi::token(k_andTok) >> exp >> expPrime
|
qi::token(k_orTok) >> exp >> expPrime
|
qi::token(k_notTok) >> exp
|
qi::token(k_plues) >> exp >> expPrime
|
qi::token(k_minus) >> exp >> expPrime
|
qi::token(k_mult) >> exp >> expPrime
|
qi::token(k_minus) >> exp
|
qi::token(k_leftParen) >> exp >> qi::token(k_rightParen)
|
qi::token(k_alphaTerminal) >> qi::token(k_leftBracket) >> exp >> qi::token(k_rightBracket)
|
qi::token(k_alphaTerminal) >> qi::token(k_leftParen) >> qi::token(k_rightParen)
|
qi::token(k_alphaTerminal) >> qi::token(k_leftParen) >> exp >> qi::token(k_rightParen)
|
qi::eps;
// Comma-separated argument list: exp (, exp)*
paramList %= exp >> paramListPrime;
paramListPrime %= qi::token(k_comma) >> exp >> paramListPrime
|
qi::eps;
// return [exp] -- the bare-return alternative is tried second.
returnStatement %= qi::token(k_returnTok) >> exp
|
qi::token(k_returnTok);
// Function call: name() or name(args)
callStatement %= qi::token(k_alphaTerminal) >> qi::token(k_leftParen) >> qi::token(k_rightParen)
|
qi::token(k_alphaTerminal) >> qi::token(k_leftParen) >> paramList >> qi::token(k_rightParen);
// Assignment to a scalar (name = exp) or an array element (name[exp] = exp).
assignmentStatement %= qi::token(k_alphaTerminal) >> qi::token(k_assign) >> exp
|
qi::token(k_alphaTerminal) >> qi::token(k_leftBracket) >> exp
>> qi::token(k_rightBracket) >> qi::token(k_assign) >> exp;
// Comma-separated list of integer literals.
intList %= qi::token(k_numericTerminal) >> intListPrime;
intListPrime %=
qi::token(k_comma) >> qi::token(k_numericTerminal) >> intListPrime
|
qi::eps;
// print exp
printStatement %= qi::token(k_print) >> exp;
// name input
inputStatement %= qi::token(k_alphaTerminal) >> qi::token(k_input);
// if exp : statements [else : statements]
conditionStatement %= qi::token(k_ifTok) >> exp >> qi::token(k_colon) >> statements >> optionalElse;
// conditions have an optional else clause
optionalElse %= qi::token(k_elseTok) >> qi::token(k_colon) >> statements
|
qi::eps;
// while exp : statements elihw
whileStatement %= qi::token(k_whileTok) >> exp >> qi::token(k_colon) >> statements >> qi::token(k_elihw);
// One or more top-level ("global space") statements: end (end)*
endList %= end >> endListPrime;
endListPrime %= end >> endListPrime
|
qi::eps;
// Statements allowed in the global space.
end %= callStatement
|
printStatement
|
qi::token(k_alphaTerminal) >> qi::token(k_assign) >> qi::token(k_input)
|
qi::token(k_alphaTerminal) >> qi::token(k_assign) >> exp
|
qi::token(k_alphaTerminal) >> qi::token(k_assign) >> qi::token(k_leftBracket) >> intList
>> qi::token(k_rightBracket)
|
qi::token(k_alphaTerminal) >> qi::token(k_leftBracket) >> exp >> qi::token(k_rightBracket)
>> qi::token(k_assign) >> exp;
// Function parameter: name or name[] (array), followed by the comma tail.
param %=
qi::token(k_alphaTerminal) >> paramPrime
|
qi::token(k_alphaTerminal) >> qi::token(k_leftBracket) >> qi::token(k_rightBracket)
>> paramPrime;
// Tail of the parameter list (left recursion eliminated).
// NOTE(review): the tail only accepts ", name" -- an array parameter
// ("name[]") after a comma has no alternative here.  TODO confirm intent.
paramPrime %=
qi::token(k_comma) >> qi::token(k_alphaTerminal) >> paramPrime
|
qi::eps;
// A statement is an assignment, print, input, condition, while, call, or
// return.  Ordered choice: assignmentStatement is tried before
// callStatement, etc.
statement %=
assignmentStatement
|
printStatement
|
inputStatement
|
conditionStatement
|
whileStatement
|
callStatement
|
returnStatement;
// One or more statements (left recursion eliminated).
statements %= statement >> statementsPrime;
statementsPrime %= statement >> statementsPrime
|
qi::eps;
// def name(params): statements fed | def name(): statements fed | epsilon
// NOTE(review): the trailing qi::eps makes functionList succeed without
// consuming anything, and it matches at most ONE function definition --
// there is no recursion/repetition here, so a second "def" would not be
// consumed.  TODO confirm intent.
functionList %= qi::token(k_def) >> qi::token(k_alphaTerminal) >> qi::token(k_leftParen)
>> param >> qi::token(k_rightParen) >> qi::token(k_colon)
>> statements >> qi::token(k_fed)
|
qi::token(k_def) >> qi::token(k_alphaTerminal) >> qi::token(k_leftParen)
>> qi::token(k_rightParen) >> qi::token(k_colon) >> statements >> qi::token(k_fed)
| qi::eps;
// Only these two rules are given debug names; the other rules will not
// appear in BOOST_SPIRIT_DEBUG output unless added here as well.
BOOST_SPIRIT_DEBUG_NODES((start)(functionList));
debug(start);
}
qi::rule<Iterator, Skipper> start;
qi::rule<Iterator, Skipper> functionList;
qi::rule<Iterator, Skipper> endList;
qi::rule<Iterator, Skipper> endListPrime;
qi::rule<Iterator, Skipper> param;
qi::rule<Iterator, Skipper> paramPrime;
qi::rule<Iterator, Skipper> paramList;
qi::rule<Iterator, Skipper> paramListPrime;
qi::rule<Iterator, Skipper> statements;
qi::rule<Iterator, Skipper> statementsPrime;
qi::rule<Iterator, Skipper> statement;
qi::rule<Iterator, Skipper> assignmentStatement;
qi::rule<Iterator, Skipper> printStatement;
qi::rule<Iterator, Skipper> inputStatement;
qi::rule<Iterator, Skipper> conditionStatement;
qi::rule<Iterator, Skipper> whileStatement;
qi::rule<Iterator, Skipper> callStatement;
qi::rule<Iterator, Skipper> returnStatement;
qi::rule<Iterator, Skipper> exp;
qi::rule<Iterator, Skipper> expPrime;
qi::rule<Iterator, Skipper> intList;
qi::rule<Iterator, Skipper> intListPrime;
qi::rule<Iterator, Skipper> optionalElse;
qi::rule<Iterator, Skipper> end;
};
}
Here is the lexer
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>
#include <iostream>
#include <fstream>
#include <streambuf>
#include <boost/bind.hpp>
#include <boost/ref.hpp>
namespace interpreter
{
namespace lex = boost::spirit::lex;
// Numeric token ids, shared between the lexer (token_def registration via
// this->self.add(def, id)) and the parser (qi::token(id)).  Ids start at 1
// and must stay stable, since the grammar references them by value.
enum Tokens
{
k_andTok = 1,
k_def = 2,
k_elihw = 3,
k_elseTok = 4,
k_falseTok = 5,
k_fed = 6,
k_fi = 7,
k_ifTok = 8,
k_input = 9,
k_notTok = 10,
k_orTok = 11,
k_print = 12,
k_returnTok = 13,
k_trueTok = 14,
k_whileTok = 15,
// NOTE(review): "plues" is a typo for "plus"; the grammar references the
// name as-is, so it is kept for compatibility.
k_plues = 16,
k_minus = 17,
k_mult = 18,
k_div = 19,
k_bang = 20,
k_equalTo = 21,
k_greaterEq = 22,
k_lessEq = 23,
k_notEq = 24,
k_less = 25,
k_greater = 26,
k_assign = 27,
k_comma = 28,
k_colon = 29,
k_leftParen = 30,
k_rightParen = 31,
k_leftBracket = 32,
k_rightBracket = 33,
k_alphaTerminal = 34,
k_numericTerminal = 35
};
// lexertl token definitions for the toy language.
//
// Keyword patterns are registered before alphaTerminal, so keywords win over
// the identifier pattern when both match.  Whitespace is routed to its own
// "WHITESPACE" lexer state so the parser can skip it with
// qi::in_state("WHITESPACE")[lexer.self].
template <typename Lexer>
struct LexerTokens : lex::lexer<Lexer>
{
    LexerTokens() :
        whiteSpace("[ \\t\\n]"),
        andTok("and"),
        def("def"),
        elihw("elihw"),
        elseTok("else"),
        falseTok("false"),
        fed("fed"),
        fi("fi"),
        ifTok("if"),
        input("input"),
        notTok("not"),
        orTok("or"),
        print("print"),
        returnTok("return"),
        trueTok("true"),
        whileTok("while"),
        plus("\\+"),
        minus("\\-"),
        mult("\\*"),
        div("\\/"),
        bang("\\!"),
        equalTo("=="),
        greaterEq(">="),
        lessEq("<="),
        notEq("!="),
        less("<"),
        greater(">"),
        assign("="),
        comma(","),
        colon(":"),
        leftParen("\\("),
        rightParen("\\)"),
        leftBracket("\\["),
        // BUG FIX: was "\\[" -- identical to leftBracket, so ']' could never
        // be tokenized and any input containing ']' failed to lex.
        rightBracket("\\]"),
        alphaTerminal("[a-z][a-zA-Z0-9]*"),
        // BUG FIX: was "[0-9]*", which can match the empty string; a numeric
        // literal must be one or more digits.
        numericTerminal("[0-9]+")
    {
        // Whitespace lives in its own state used as the skipper.
        this->self("WHITESPACE") = whiteSpace;
        // Register every token with its id from the Tokens enum.  Order
        // matters: keywords are added before alphaTerminal.
        // (A duplicate (andTok, k_andTok) registration was removed here.)
        this->self.add
            (andTok, k_andTok)
            (def, k_def)
            (elihw, k_elihw)
            (elseTok, k_elseTok)
            (falseTok, k_falseTok)
            (fed, k_fed)
            (fi, k_fi)
            (ifTok, k_ifTok)
            (input, k_input)
            (notTok, k_notTok)
            (orTok, k_orTok)
            (print, k_print)
            (returnTok, k_returnTok)
            (trueTok, k_trueTok)
            (whileTok, k_whileTok)
            (plus, k_plues)
            (minus, k_minus)
            (mult, k_mult)
            (div, k_div)
            (bang, k_bang)
            (equalTo, k_equalTo)
            (greaterEq, k_greaterEq)
            (lessEq, k_lessEq)
            (notEq, k_notEq)
            (less, k_less)
            (greater, k_greater)
            (assign, k_assign)
            (comma, k_comma)
            (colon, k_colon)
            (leftParen, k_leftParen)
            (rightParen, k_rightParen)
            (leftBracket, k_leftBracket)
            (rightBracket, k_rightBracket)
            (alphaTerminal, k_alphaTerminal)
            (numericTerminal, k_numericTerminal);
    }

    lex::token_def<lex::omit> whiteSpace;   // skipped, value omitted
    lex::token_def<std::string> andTok;
    lex::token_def<std::string> def;
    lex::token_def<std::string> elihw;
    lex::token_def<std::string> elseTok;
    lex::token_def<std::string> falseTok;
    lex::token_def<std::string> fed;
    lex::token_def<std::string> fi;
    lex::token_def<std::string> ifTok;
    lex::token_def<std::string> input;
    lex::token_def<std::string> notTok;
    lex::token_def<std::string> orTok;
    lex::token_def<std::string> print;
    lex::token_def<std::string> returnTok;
    lex::token_def<std::string> trueTok;
    lex::token_def<std::string> whileTok;
    lex::token_def<std::string> plus;
    lex::token_def<std::string> minus;
    lex::token_def<std::string> mult;
    lex::token_def<std::string> div;
    lex::token_def<std::string> bang;
    lex::token_def<std::string> equalTo;
    lex::token_def<std::string> greaterEq;
    lex::token_def<std::string> lessEq;
    lex::token_def<std::string> notEq;
    lex::token_def<std::string> less;
    lex::token_def<std::string> greater;
    lex::token_def<std::string> assign;
    lex::token_def<std::string> comma;
    lex::token_def<std::string> colon;
    lex::token_def<std::string> leftParen;
    lex::token_def<std::string> rightParen;
    lex::token_def<std::string> leftBracket;
    lex::token_def<std::string> rightBracket;
    lex::token_def<std::string> alphaTerminal;
    lex::token_def<std::string> numericTerminal;
};
And here is my example test program
int main(int argc, char** argv)
{
namespace lex = boost::spirit::lex;
namespace qi = boost::spirit::qi;
typedef lex::lexertl::token< char const*, lex::omit, boost::mpl::true_ > token_type;
typedef lex::lexertl::lexer<token_type> lexer_type;
typedef interpreter::LexerTokens<lexer_type>::iterator_type iterator_type;
typedef qi::in_state_skipper<interpreter::LexerTokens<lexer_type>::lexer_def> skipper_type;
interpreter::LexerTokens< lexer_type > lexer;
interpreter::InterpreterGrammar< iterator_type, skipper_type > parser(lexer);
std::string sourceCode("def print_it(x, y): print 3*x + y return fed print_it(8,1) x = 3 print_it(x, x)");
char const* first = sourceCode.c_str();
char const* last = &first[sourceCode.size()];
bool r = lex::tokenize_and_phrase_parse(first, last, lexer, parser, qi::in_state("WHITESPACE")[lexer.self]);
std::cout << "Remaining " << std::string(first,last) << std::endl;
std::cout << "R is " << r << std::endl;
}
These revisions have given rise to a few questions. First, looking at the grammar informally, without constructing the full LL(1) parsing table, I don't believe this grammar is LL(1). I still need to verify this, but I was wondering: will Spirit be able to parse this grammar? I know PEGs typically use the / operator for ordered choice with backtracking; does Spirit do this currently? I read in another post that Spirit may not.
Second, this grammar fails, I notice that when, in the start production, I simplify the start production and make it:
start %= functionList;
and then change the input to be:
def print_it(x, y): print 3*x + y return fed
the grammar debug statement states that the parse was successful. However, I see the string remaining is:
print_it(x, y): print 3*x + y return fed
So only the first token was actually parsed. After a bit of debugging I am unsure why the parse is successful and why only a single symbol is consumed, could this be an issue with the lexer?
Additionally, I see similar results when I change the start production to be:
start %= endList;
and using the input
y = x
This however, fails to parse and only consumes the character y.
Finally, the output of my debug statement is not very helpful, when running with the debug statement the output that is produced is:
<start>
<try>[][][][][][][][][][][][][][][][][][][][]</try>
<fail/>
</start>
Remaining print_it(x, y): print 3*x + y return fed print_it(8,1) x = 3 print_it(x, x)
R is 0
Which I assume to mean twenty productions are attempted in the grammar, from the twenty empty [], is that a correct assumption? Also why are the [] empty? I typically see them with some text that is useful in debugging. Is it because the grammar is automatically a success if the regular expression is matched? If that is the case, is there a way to get the debug statement to print helpful output when using the token enum token as opposed to adding expressions using macros?
Any help or points in the correct direction is appreciated, thank you.
start %= functionList;
should really not be valid. I suppose it might work because of %=
, but to assign rules to other rules without implying a parser expression, use functionList.alias()
– sehe