0
votes

I am writing a compiler for a formatting language and I am writing the bison file. My grammar is correct I think but when I added a recursion rule it and then read the test source file it says that it accepts the rule for the ending tag but that the token is unexpected... The thing is though is that before I added the recursion rule (for some tags inbetween the start and end tag) it worked fine... Here are some details

This is the source file

\begin{document}

\title{test}
\author{test}
\date{21/02/1985}
\pagesetup{35, 80}

\end{document}

This is the bison file

%{
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    extern int  yylex();
    extern int  yyparse();
    extern FILE *yyin;
    extern FILE *yyout;
    extern int  yylineno;

    void yyerror(const char*);
    int header_status(int,int,int,int,int);

    // counters to check nubmer or document properties used, must all become 1
    int title   = 0;
    int author  = 0;
    int date    = 0;
    int pgsetup = 0;
    int tabsz   = 0;
%}

%union{
    int     iVal;
    char*   sVal;
}

%error-verbose

%start source
%token <sVal> SLASH
%token <sVal> BLOCK_S BLOCK_E
%token <sVal> DOC LIST ENUM
%token <sVal> TITLE AUTHOR DATE PG_SETUP TAB_SZ SECTION PARAGRAPH ITEM LINE
%token <sVal> LBRACE RBRACE LPAREN RPAREN
%token <sVal> DOCUMENT DIMENSIONS DATE_VAL STRING
%token <iVal> NUMBER
%token <sVal> ERROR_UN ERROR_IL WORD

%%

source
    : /* empty */  
    | entry_point doc_properties txt_properties exit_point
        { 
            if ( header_status(title, author, date, pgsetup, tabsz) == 0 )
                printf("\nfail\n"); //YYABORT;
        }
    ;

entry_point
    : SLASH BLOCK_S LBRACE DOC RBRACE 
    ;

doc_properties
    : /* empty */  
    | doc_properties header_properties
    ;

header_properties
    : title_property    { title++; }
    | author_property   { author++; }
    | date_property     { date++; }
    | pg_setup_property { pgsetup++; }
    | tab_sz_property   { tabsz++; }
    ;

txt_properties
    : /* empty */  
    ;

title_property
    : SLASH TITLE LBRACE STRING RBRACE
    ;

author_property
    : SLASH AUTHOR LBRACE STRING RBRACE
    ;

date_property
    : SLASH DATE LBRACE DATE_VAL RBRACE
    ;

pg_setup_property
    : SLASH PG_SETUP LBRACE DIMENSIONS RBRACE
    ;

tab_sz_property
    : SLASH TAB_SZ LPAREN NUMBER RPAREN
    ;

exit_point
    : SLASH BLOCK_E LBRACE DOC RBRACE
    ;


%%

int main (int argc, char* argv[])
{
    if ( argc < 2 || argc > 3)
    {
        fprintf(stdout, "%s: fatal error: needs one or two arguments\n\n\t%s inputFileName [outputFileName]\n\n", argv[0], argv[0]);
    }
    else if ( argc == 2 )
    {
        char* fn = (char *)calloc(strlen(argv[1])+12, sizeof(char));
        strcpy(fn, argv[1]);
        strcat(fn, ".output.txt");
        fprintf(stderr, "%s: using default output naming: <%s>\n\n", argv[0], fn);

        yyin = fopen(argv[1], "r");
        yyout = fopen(fn, "w");     
        yyparse();
        fclose(yyin);
        fclose(yyout);
    }
    else if ( argc == 3 )
    {
        yyin = fopen(argv[1], "r");
        yyout = fopen(argv[2], "w");        
        yyparse();
        fclose(yyin);
        fclose(yyout);
    }
    return 0;
}

void yyerror(const char* str) 
{
    fprintf(stderr,"syntax error[%d]: %s\n",yylineno, str);
}

int header_status(int title, int author, int date, int pgsetup, int tabsz)
{
    if ( title == 1 && author == 1 && date == 1 && pgsetup == 1 && tabsz == 1 )
    {
        return 1;
    }
    else
    {
        if ( title > 1 ) fprintf(stderr,"syntax error: title property was declared more than once\n");
        else if ( title < 1 ) fprintf(stderr,"syntax error: title property was not declared (all document properties must be present)\n");

        if ( author > 1 ) fprintf(stderr,"syntax error: author property was declared more than once\n");
        else if ( author < 1 ) fprintf(stderr,"syntax error: author property was not declared (all document properties must be present)\n");

        if ( date > 1 ) fprintf(stderr,"syntax error: date property was declared more than once\n");
        else if ( date < 1 ) fprintf(stderr,"syntax error: date property was not declared (all document properties must be present)\n");

        if ( pgsetup > 1 ) fprintf(stderr,"syntax error: pagesetup property was declared more than once\n");
        else if ( pgsetup < 1 ) fprintf(stderr,"syntax error: pagesetup property was not declared (all document properties must be present)\n");

        if ( tabsz > 1 ) fprintf(stderr,"syntax error: title tabsize was declared more than once\n");
        else if ( tabsz < 1 ) fprintf(stderr,"syntax error: title tabsize was not declared (all document properties must be present)\n");

        return 0;
    }
}

My problem I think is in

doc_properties
: /* empty */  
| doc_properties header_properties
;

When I had it empty and just

\begin{document}
\end{document}

for the source file it was fine. Specifically the tokens would be

SLASH BLOCK_S LBRACE DOC RBRACE 
SLASH BLOCK_E LBRACE DOC RBRACE

When I added the rule with the recursion though when it reached at the 'end' the trace would say that it accepted the rule (lexical) and then it generated a syntax error "unexpected BLOCK_E". The only thing I can think of is that it is expecting some other tag but in the recursion I have the empty as an alternative so why...

Also when I added the final tag

\begin{document}

\title{test}
\author{test}
\date{21/02/1985}
\pagesetup{35, 80}
\tabsize(4)

\end{document}

when it reached at 4 it says that is accepts the rule in the lex file and that rule

return NUMBER;

but it says unexpected $undefined, expecting NUMBER when it just said that it accepts the rule and frankly I don't think it could read anything else...

My question is for the first part though...

If it is any help this is the flex file

%{
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "UnicTextLang.y.tab.h"

    #define SAVE_S yylval.sVal = strdup(yytext)
    #define SAVE_I yylval.iVal = atoi(yytext)   
%}

WS      [ \t\n\r]
TAG     [a-zA-Z_][a-zA-Z0-9\-_]+
WORD    [a-zA-Z0-9`~!@#$%\^&*()\-_=+[\]{}\\|;:'",<.>/?]
NUMBER  ([1-9])|([1-9][0-9])|([1-3][0-9][0-9])
DIMEN   {NUMBER}{WS}*,{WS}*{NUMBER}
DAY     (0[1-9])|([12][0-9])|(3[01])
MONTH   (0[1-9])|(1[0-2])
YEAR    (19|20)[0-9]{2}
DATE    {DAY}\/{MONTH}\/{YEAR}

%option yylineno
%option noyywrap
%option noinput
%option nounput
%option debug

%x PROPERTY
%x VALUE
%x BLOCK
%x NUMBER

%%

^\\|{WS}\\                  { BEGIN(PROPERTY); /* fprintf(stdout, "FLEX> BEGINING PROPERTY [%d]: %s|\n", yylineno, yytext); */ SAVE_S; return SLASH; }
{WS}?\{                     { BEGIN(VALUE); /* fprintf(stdout, "FLEX> READING PROPERTY VALUE [%d]: %s|\n", yylineno, yytext); */ SAVE_S; return LBRACE; }
{WS}?\(                     { BEGIN(NUMBER); /* fprintf(stdout, "FLEX> READING NUMBER VALUE [%d]: %s|\n", yylineno, yytext); */ SAVE_S; return LPAREN; }
{WS}                        { /* fprintf(stdout, "FLEX> EATING WHITESPACE(i)\n"); */ }
[^ \t\n\r\{(\\][^ \t\n\r]+  { fprintf(stderr, "lexical error[%d]: hingeless word: %s\n", yylineno, yytext); SAVE_S; return WORD; }
.                           { fprintf(stderr, "lexical error[%d]: illegal character detected: %s\n", yylineno, yytext); SAVE_S; return ERROR_IL; }

<PROPERTY>begin             { BEGIN(BLOCK); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return BLOCK_S; }
<PROPERTY>end               { BEGIN(BLOCK); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return BLOCK_E; }

<PROPERTY>title             { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return TITLE; }
<PROPERTY>author            { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return AUTHOR; }
<PROPERTY>date              { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return DATE; }
<PROPERTY>pagesetup         { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return PG_SETUP; }
<PROPERTY>tabsize           { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return TAB_SZ; }

<PROPERTY>section           { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return SECTION; }
<PROPERTY>paragraph         { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return PARAGRAPH; }
<PROPERTY>item              { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return ITEM; }
<PROPERTY>newline           { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> \n\t%s\n\n", yytext); */ SAVE_S; return LINE; }

<PROPERTY>{TAG}             { BEGIN(INITIAL); fprintf(stderr, "lexical error[%d]: |%s| undefined property: expecting property\n", yylineno, yytext); SAVE_S; return ERROR_UN; }
<PROPERTY>{WS}              { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> EATING WHITESPACE(p)\n"); */ }
<PROPERTY>[^ \t\n\r\{(]+    { BEGIN(INITIAL); fprintf(stderr, "lexical error[%d]: |%s| undefined property: illegal character detected\n", yylineno, yytext); SAVE_S; return ERROR_IL; }
<PROPERTY>.                 { fprintf(stderr, "lexical error[%d]: illegal character detected: %s\n", yylineno, yytext); SAVE_S; return ERROR_IL; }

<VALUE>{WS}*{DIMEN}{WS}*    { /* fprintf(stdout, "FLEX> \n\tdims: %s\n\n", yytext); */ SAVE_S; return DIMENSIONS; }
<VALUE>{WS}*{DATE}{WS}*     { /* fprintf(stdout, "FLEX> \n\tdate: %s\n\n", yytext); */ SAVE_S; return DATE_VAL; }
<VALUE>[^}]*                { /* fprintf(stdout, "FLEX> \n\tstrg: %s\n\n", yytext); */ SAVE_S; return STRING; }
<VALUE>\}                   { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> FINISHED READING PROPERTY VALUE [%d]: %s|\n", yylineno, yytext); */ SAVE_S; return RBRACE; }
<VALUE>.                    { fprintf(stderr, "lexical error[%d]: illegal character detected: %s\n", yylineno, yytext); SAVE_S; return ERROR_IL; }

<NUMBER>{WS}*{NUMBER}{WS}*  { /* fprintf(stdout, "FLEX> \n\tnumb: %s\n\n", yytext); */ SAVE_I; return NUMBER; }
<NUMBER>[^)]*               { fprintf(stderr, "lexical error[%d]: |%s| illegal value: expecting number(1-399)\n", yylineno, yytext); SAVE_S; return STRING; }
<NUMBER>\)                  { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> FINISHED READING NUMBER VALUE [%d]: %s|\n", yylineno, yytext); */ SAVE_S; return RPAREN; }
<NUMBER>.                   { fprintf(stderr, "lexical error[%d]: illegal character detected: %s\n", yylineno, yytext); SAVE_S; return ERROR_IL; }

<BLOCK>{WS}?\{              { /* fprintf(stdout, "FLEX> READING BLOCK TYPE [%d]: %s|\n", yylineno, yytext); */ SAVE_S; return LBRACE; }
<BLOCK>{WS}*document{WS}*   { /* fprintf(stdout, "FLEX> \n\tresv: %s\n\n", yytext); */ SAVE_S; return DOC; }
<BLOCK>{WS}*itemize{WS}*    { /* fprintf(stdout, "FLEX> \n\tresv: %s\n\n", yytext); */ SAVE_S; return LIST; }
<BLOCK>{WS}*enumerate{WS}*  { /* fprintf(stdout, "FLEX> \n\tresv: %s\n\n", yytext); */ SAVE_S; return ENUM; }
<BLOCK>[^{}]*               { fprintf(stderr, "lexical error[%d]: |%s| undefined block type: expecting block type\n", yylineno, yytext); SAVE_S; return ERROR_UN; }
<BLOCK>\}                   { BEGIN(INITIAL); /* fprintf(stdout, "FLEX> FINISHED READING BLOCK TYPE [%d]: %s|\n", yylineno, yytext); */ SAVE_S; return RBRACE;}
<BLOCK>.                    { fprintf(stderr, "lexical error[%d]: illegal character detected: %s\n", yylineno, yytext); SAVE_S; return ERROR_IL; }

%%
1

1 Answers

3
votes

The basic problem you have is that parser needs two-token lookahead to determine where the doc_properties end. This is because you recognize '\' as a separate token from the property string, so after seeing the input SLASH BLOCK_S with the next input token being SLASH, it doesn't know if it should reduce an empty txt_properties (anticipating a BLOCK_E after the SLASH), or shift into the header_properties rule in anticipation of matching a header property.

There are a number of what to deal with this. Perhaps the simplest would be to get rid of the SLASH token altogether, as it just serves to tell the lexer when to look for a property string. Get rid of the return SLASH; statement in the first lex action (so it doesn't return a token, but instead keeps looking for the property after the \ to return that token), and delete SLASH everywhere it appears in your grammar.

Another possibility is to unfactor the grammar to get rid of the epsilon rules (as they are what necessitate the early reductions that lead to shift/reduce conflicts). With no epsilon rules the parser can shift into a composite state where it is simultaneously recognizing multiple rules with identical prefixes on the RHS (this ability is the advantage of LR parsing over LL). To do this, you would have rules like:

source: /* empty */
      | entry_point exit_point
      | entry_point doc_properties exit_point
      | entry_point txt_properties exit_point
      | entry_point doc_properties txt_properties exit_point
      ;

and would change doc_properties and txt_properties to recognize 1-or-more rather than 0-or-more:

doc_properties: header_property
              | doc_properties header_property
              ;