BISON + FLEX grammar - why tokens are being concatenated together

Question

I would like to understand why Bison appears to concatenate two tokens in the following rule:

stmt:
  declaration                 { ... }
  | assignment                { ... }
  | exp                       { ... }
  | ID ';'  <-- this rule     { ...       
                                fprintf(stderr, "\n my id is '%s'", $1);
                                ...

If you check the output below, you will see what I mean. I run my parser and type the characters ab; as input. According to my Bison grammar this should be parsed as an ID followed by a ;, and to some extent that is what happens.

However, when I use the $1 value inside the action of the rule ID ';', the program prints ab; instead of ab.

running the program

ab;                                   <-- this my input to the program

#TOKEN 294[ID] yytext -> ab
Next token is token "identifier" (1.1: )
Shifting token "identifier" (1.1: )
Entering state 5
Reading a token:
#TOKEN 59[] yytext -> ;
Next token is token ';' (1.1: )
Shifting token ';' (1.1: )
Entering state 16
Reducing stack by rule 6 (line 133):
   $1 = token "identifier" (1.1: )     <-- first token which is 'ab'
   $2 = token ';' (1.1: )              <-- second token which is ';'

[stmt] 4:
 my id is 'ab;'                        <-- the issue! This should be 'ab' not 'ab;'   
ERROR: No such ID ab; found
-> $$ = nterm stmt (1.1: )
Stack now 0 1
Entering state 10
Reducing stack by rule 2 (line 126):
   $1 = nterm prog (1.1: )
   $2 = nterm stmt (1.1: )
-> $$ = nterm prog (1.1: )
Stack now 0
Entering state 1
Reading a token:

grammar

%{
#include <stdio.h>
#include <string>
#include <map>
#include <math.h>
#include "noname-parse.h"
#include "noname-types.h"

extern int yylex(void);
extern void yyerror(const char *error_msg);
extern void division_by_zero(YYLTYPE &yylloc);


std::map<std::string, symrec*> symbol_table;
std::map<std::string, symrec*>::iterator symbol_table_it;
%}

//////////////////////////////////////////////////
///////////* Bison declarations.  *///////////////
//////////////////////////////////////////////////

%union {

  char* id_v;
  double double_v;
  long long_v;

  symrecv symrecv;
  char* error_msg;
};

%{

  // Reports whether `key` names an entry in symbol_table.
  // Side effect: the file-level iterator symbol_table_it is left holding the
  // lookup result (the matching entry, or symbol_table.end() when absent).
  bool symbol_exist(const char* key) {
    symbol_table_it = symbol_table.find(std::string(key));
    const bool found = !(symbol_table_it == symbol_table.end());
    return found;
  }

  // Registers `symrecv` in symbol_table under `key`, replacing whatever record
  // was previously stored under that name.
  void symbol_insert(const char* key, symrecv symrecv) {
    const std::string name(key);
    symbol_table[name] = symrecv;
  }

  // Looks up the record stored under `key` in symbol_table.
  //
  // Returns the stored record, or a value-initialized symrecv when the key is
  // unknown (a null pointer, presuming symrecv is a typedef for symrec* as the
  // table's mapped type suggests).
  //
  // The lookup uses find() rather than operator[] on purpose: operator[] would
  // silently insert an empty entry for an unknown key, after which
  // symbol_exist() would report the key as defined.
  symrecv symbol_retrieve(const char* key) {
    std::map<std::string, symrec*>::const_iterator found =
        symbol_table.find(std::string(key));
    if (found == symbol_table.end()) {
      return symrecv();
    }
    return found->second;
  }

  // Writes the value held by `sym` to stderr, choosing the format from
  // sym->type: TYPE_LONG prints value.intv, TYPE_DOUBLE prints value.doublev,
  // and any other type prints a "not implemented" notice with the type code.
  void print_stmt(symrecv sym) {
    if (sym->type == TYPE_LONG) {
      fprintf(stderr, "%d", sym->value.intv);
      return;
    }

    if (sym->type == TYPE_DOUBLE) {
      fprintf(stderr, "%lf", sym->value.doublev);
      return;
    }

    fprintf(stderr, "print not implemented for type %d", sym->type);
  }
%}

%token LINE_BREAK            "line_break"             
// %token ';'              "stmt_sep"           
%token LETTER                "letter"         
%token DIGIT                 "digit"         
%token DIGITS                "digits"         
%token DARROW                "darrow"         
%token ELSE                  "else"       
%token FALSE                 "false"         
%token IF                    "if"     
%token IN                    "in"     
%token LET                   "let"       
%token LOOP                  "loop"       
%token THEN                  "then"       
%token WHILE                 "while"         
%token BREAK                 "break"         
%token CASE                  "case"       
%token NEW                   "new"       
%token NOT                   "not"       
%token TRUE                  "true"       
%token NEWLINE               "newline"           
%token NOTNEWLINE            "notnewline"             
%token WHITESPACE            "whitespace"             
%token LE                    "le"     
%token ASSIGN                "assign"         
%token NULLCH                "nullch"         
%token BACKSLASH             "backslash"             
%token STAR                  "star"       
%token NOTSTAR               "notstar"           
%token LEFTPAREN             "leftparen"             
%token NOTLEFTPAREN          "notleftparen"               
%token RIGHTPAREN            "rightparen"             
%token NOTRIGHTPAREN         "notrightparen"                 
%token LINE_COMMENT          "line_comment"               
%token START_COMMENT         "start_comment"                 
%token END_COMMENT           "end_comment"               
%token QUOTES                "quotes"         
%token ERROR                 "error"

%token <id_v> ID             "identifier"
%token <double_v> DOUBLE     "double"
%token <long_v> LONG         "long"
%type  <symrecv> assignment  "assignment"
%type  <symrecv> declaration "declaration"
%type  <symrecv> exp         "expression"
%type  <symrecv> stmt        "statement"

%left '-' '+'
%left '*' '/'
%left LET ID 
%right '^'        /* exponentiation */
%precedence NEG   /* negation--unary minus */

%start prog

%% 

//////////////////////////////////////////////////
///////////* The grammar follows. *///////////////
//////////////////////////////////////////////////

prog:
  %empty
  | prog stmt
;

// A statement is a declaration, an assignment, a bare expression, an
// identifier followed by ';', or an error-recovery production. Every
// alternative writes a "[stmt] N" trace tag to stderr before doing its work.
stmt:
  declaration        { fprintf(stderr, "\n[stmt] 2: "); print_stmt($1); }
  | assignment       { fprintf(stderr, "\n[stmt] 3: "); print_stmt($1); }
  | exp              { fprintf(stderr, "\n[stmt] 1: "); print_stmt($1); }
  | ID ';'           { fprintf(stderr, "\n[stmt] 4: "); 

    // The ID value is the char* the lexer placed in yylval.id_v. The lexer
    // stores yytext itself there, so this pointer refers to the scanner's
    // buffer rather than to a private copy of the identifier.
    fprintf(stderr, "\n my id is '%s'", $1);

    // Fresh record for the result; malloc leaves its fields uninitialized.
    $$ = (symrec *) malloc (sizeof (symrec));

    if (!symbol_exist($1)) {

      // Unknown identifier: report it through yyerror. The result record is
      // left with uninitialized fields on this path.
      char buf[1024];
      sprintf(buf, "No such ID %s found", $1);
      yyerror(buf);

    } else {

      // Known identifier: copy its name and double value into the result.
      // The type field is not assigned in this action.
      $$->name = $1;
      $$->value.doublev = symbol_retrieve($1)->value.doublev;
      printf("\nID %s -> %lf", $1, $$->value.doublev);
    }
  }
  // Error recovery: print the column span of the offending input.
  | error            { printf("%d:%d", @1.first_column, @1.last_column); }
;

// Two forms of assignment:
//   ID ASSIGN exp ';'      updates a symbol that must already exist;
//   LET ID ASSIGN exp ';'  defines a symbol that must not exist yet.
// Both allocate a new symrec as the rule's value. On the error path that
// record is returned with uninitialized fields, and the stmt rule still
// passes it to print_stmt, which reads its type field.
assignment:
  ID ASSIGN exp ';' {

    $$ = (symrec *) malloc (sizeof (symrec));

    if (!symbol_exist($1)) {

      // Assigning to an undeclared identifier is reported via yyerror.
      char buf[1024];
      sprintf(buf, "No such ID %s found", $1);
      yyerror(buf);

    } else {

      // Copy the expression's type and double value, then store the new
      // record in the symbol table under the identifier, replacing the
      // previous entry.
      $$->name = $1;
      $$->type = $3->type;
      $$->value.doublev = $3->value.doublev;
      symbol_insert($1, $$);
      // printf("\nID %s -> %lf", $1, $$->value.doublev);
      printf("\n[assignment]");
    }
  }
  | LET ID ASSIGN exp ';' {

    $$ = (symrec *) malloc (sizeof (symrec));

    if (symbol_exist($2)) {

      // A LET on an identifier that is already in the table is an error.
      char buf[1024];
      sprintf(buf, "Cannot redefine ID %s", $2);
      yyerror(buf);

    } else {

      // Define the symbol from the expression's type and double value.
      $$->name = $2;
      $$->type = $4->type;
      $$->value.doublev = $4->value.doublev;
      symbol_insert($2, $$);
      // printf("\nID %s -> %lf", $1, $$->value.doublev);
      printf("\n[assignment]");
    }
  }
;

// LET ID ';' declares a symbol without giving it a value. The identifier must
// not already be in the symbol table.
// NOTE(review): neither the type nor the value of the new record is assigned
// here (the type assignment is commented out), yet the stmt rule passes the
// result to print_stmt, which reads the type field.
declaration:
  LET ID ';' {

    $$ = (symrec *) malloc (sizeof (symrec));

    if (symbol_exist($2)) {

      // Redeclaring an existing identifier is reported via yyerror.
      char buf[1024];
      sprintf(buf, "Cannot redefine ID %s", $2);
      yyerror(buf);

    } else {

      // Only the name is filled in before the record is stored in the table.
      $$->name = $2;
      // $$->type = $1->type == TYPE_DOUBLE || $3->type == TYPE_DOUBLE ? TYPE_DOUBLE : $1->type;
      symbol_insert($2, $$);
      // $$->value.doublev = symbol_table_it->second->value.doublev;
      // printf("\nID %s -> %lf", $1, $$->value.doublev);
      printf("\n[declaration]");
    }
  }
;

// Arithmetic expressions. Every alternative except the error one allocates a
// new symrec named "__annon" to hold its result and prints a trace line to
// stdout. Binary operators yield TYPE_DOUBLE if either operand is TYPE_DOUBLE
// and otherwise the left operand's type; '^' always takes the left operand's
// type.
// NOTE(review): a LONG literal is stored in value.intv, while every operator
// reads and writes value.doublev. Whether those two members share storage
// depends on the symrec definition in noname-types.h, which is not shown
// here -- confirm that long operands are handled as intended.
exp:
  LONG {
    // Integer literal: stored in the intv member.
    $$ = (symrec *) malloc (sizeof (symrec));
    $$->name = (char*) "__annon";
    $$->type = TYPE_LONG;
    $$->value.intv = $1;
    printf("\nexp %ld", $1);
  }
  | DOUBLE {
    // Floating-point literal: stored in the doublev member.
    $$ = (symrec *) malloc (sizeof (symrec));
    $$->name = (char*) "__annon";
    $$->type = TYPE_DOUBLE;
    $$->value.doublev = $1;
    printf("\nexp %lf", $1);
  }
  | exp '+' exp        {
      // $$ = $1 + $3;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = $1->type == TYPE_DOUBLE || $3->type == TYPE_DOUBLE ? TYPE_DOUBLE : $1->type;
      $$->value.doublev = $1->value.doublev + $3->value.doublev;
      printf("\nexp + exp %lf %lf", $1->value.doublev, $3->value.doublev);
    }
  | exp '-' exp        {
      // $$ = $1 - $3;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = $1->type == TYPE_DOUBLE || $3->type == TYPE_DOUBLE ? TYPE_DOUBLE : $1->type;
      $$->value.doublev = $1->value.doublev - $3->value.doublev;
      printf("\nexp - exp %lf %lf", $1->value.doublev, $3->value.doublev);
    }
  | exp '*' exp        {
      // $$ = $1 * $3;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = $1->type == TYPE_DOUBLE || $3->type == TYPE_DOUBLE ? TYPE_DOUBLE : $1->type;
      $$->value.doublev = $1->value.doublev * $3->value.doublev;
      printf("\nexp * exp %lf %lf", $1->value.doublev, $3->value.doublev);
    }
  | exp '/' exp {
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = $1->type == TYPE_DOUBLE || $3->type == TYPE_DOUBLE ? TYPE_DOUBLE : $1->type;

      // A zero divisor does not abort the parse: the result falls back to the
      // left operand and the problem is reported with the divisor's location.
      if ($3->value.doublev) {
        // $$ = $1 / $3;
        $$->value.doublev = $1->value.doublev / $3->value.doublev;
      } else {
        // $$ = $1;
        $$->value.doublev = $1->value.doublev;
        division_by_zero(@3);
      }
      printf("\nexp / exp %lf %lf", $1->value.doublev, $3->value.doublev);
    }
  | '-' exp  %prec NEG {
      /**
        * The %prec modifier gives this unary-minus rule the precedence of
        * NEG instead of that of the binary '-' token. NEG is the last
        * precedence declaration in this grammar, so it binds tightest.
        */
      // $$ = -($2->value.doublev);
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = $2->type;
      $$->value.doublev = -$2->value.doublev;
      // NOTE(review): the trace text below says "exp ^ exp" although this is
      // the negation rule.
      printf("\nexp ^ exp %lf", $2->value.doublev);
    }
  | exp '^' exp        {
      //$$ = pow($1->value.doublev, $3->value.doublev);
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = $1->type;
      $$->value.doublev = pow($1->value.doublev, $3->value.doublev);
      printf("\nexp ^ exp %lf %lf", $1->value.doublev, $3->value.doublev);
    }
  | '(' exp ')'        {
      // Parentheses only group: the inner value is copied into a new record.
      // $$ = $2->value.doublev;
      $$ = (symrec *) malloc (sizeof (symrec));
      $$->name = (char*) "__annon";
      $$->type = $2->type;
      $$->value.doublev = $2->value.doublev;
      printf("\n(exp) %lf", $2->value.doublev);
    }
  // Error recovery inside an expression: only a message is printed; no result
  // record is assigned in this action.
  | error                 { printf("\nERROR on exp rule"); }
  ;
%%

lexer

%{
  #include "stdio.h"
  #include "stdlib.h"
  #include "lexer-utilities.h"
  #include "noname-parse.h"
  #include "noname-types.h"

  int num_lines = 0, num_chars = 0;
  extern YYSTYPE yylval;
  extern void yyerror(char const *s);

  extern int curr_lineno;
  extern int verbose_flag;

  unsigned int comment = 0;
%}

%option noyywrap 
  // %option noyywrap nounput batch debug yylineno
  // %option warn noyywrap nodefault yylineno reentrant bison-bridge 

%x COMMENT
%x STRING

LINE_BREAK      \n
LETTER          [a-zA-Z]
ALPHA           [a-zA-Z$_]
DIGIT           [0-9]
DIGITS          {DIGIT}+
LONG            {DIGIT}+
DOUBLE          {DIGIT}+(\.{DIGIT}+)?
ID              {ALPHA}({ALPHA}|{DIGIT})*

ELSE            [eE][lL][sS][eE]
FALSE           f[aA][lL][sS][eE]
IF              [iI][fF]
IN              [iI][nN]
LET             [lL][eE][tT]
LOOP            [lL][oO][oO][pP]
THEN            [tT][hH][eE][nN]
WHILE           [wW][hH][iI][lL][eE]
BREAK           [bB][rR][eE][aA][kK]
CASE            [cC][aA][sS][eE]
NEW             [nN][eE][wW]
NOT             [nN][oO][tT]
TRUE            t[rR][uU][eE]
NEWLINE         [\n]
NOTNEWLINE      [^\n]
WHITESPACE      [ \t\r\f\v]+
ASSIGN          =
LE              <=
DARROW          =>
NULLCH          [\0]
BACKSLASH       [\\]
STAR            [*]
NOTSTAR         [^*]
LEFTPAREN       [(]
NOTLEFTPAREN    [^(]
RIGHTPAREN      [)]
NOTRIGHTPAREN   [^)]

LINE_COMMENT    "--"
START_COMMENT   "/*"
END_COMMENT     "*/"

QUOTES          \"


%%

{LINE_BREAK}                    {
                                  ++num_chars;
                                  ++num_lines;
                                }

{START_COMMENT} {
  comment++;
  BEGIN(COMMENT);
}

<COMMENT><<EOF>> {
  yylval.error_msg = "EOF in comment";
  BEGIN(INITIAL);
  return (ERROR);
}

<COMMENT>{BACKSLASH}(.|{NEWLINE}) {
  backslash_common();
};

<COMMENT>{BACKSLASH}               ;

<COMMENT>{START_COMMENT} {
  comment++;
}

<COMMENT>{END_COMMENT} {
  comment--;
  if (comment == 0) {
    BEGIN(INITIAL);
  }
}

<COMMENT>.                      { ++num_chars; }

<INITIAL>{END_COMMENT} {
  yylval.error_msg = "Unmatched */";
  return (ERROR);
}

<*>{WHITESPACE}                  { ++num_chars; }
<INITIAL>{ASSIGN}                { return (ASSIGN); }
<INITIAL>{ELSE}                  { return (ELSE); }
<INITIAL>{IF}                    { return (IF); }
<INITIAL>{IN}                    { return (IN); }
<INITIAL>{LET}                   { return (LET); }
<INITIAL>{THEN}                  { return (THEN); }
<INITIAL>{WHILE}                 { return (WHILE); }
<INITIAL>{CASE}                  { return (CASE); }
<INITIAL>{NEW}                   { return (NEW); }
<INITIAL>{NOT}                   { return (NOT); }
<INITIAL>{ID}      {
  /* Hands the identifier text to the parser through yylval.id_v.
   * NOTE(review): this stores the yytext pointer itself, not a copy, so the
   * parser's semantic value refers to the scanner's own buffer. */
  yylval.id_v = yytext;
  return (ID); }
<INITIAL>{LONG}     {
  yylval.long_v = atoi(yytext);
  return (LONG); }
<INITIAL>{DOUBLE}  {
  yylval.double_v = atof(yytext);
  return (DOUBLE); }

<INITIAL>","                     { return int(','); }
<INITIAL>":"                     { return int(':'); }
<INITIAL>"{"                     { return int('{'); }
<INITIAL>"}"                     { return int('}'); }
<INITIAL>"+"                     { return int('+'); }
<INITIAL>"-"                     { return int('-'); }
<INITIAL>"*"                     { return int('*'); }
<INITIAL>"/"                     { return int('/'); }
<INITIAL>"<"                     { return int('<'); }
<INITIAL>"~"                     { return int('~'); }
<INITIAL>"."                     { return int('.'); }
<INITIAL>"@"                     { return int('@'); }
<INITIAL>"("                     { return int('('); }
<INITIAL>")"                     { return int(')'); }
<INITIAL>"&"                     { return int('&'); }
<INITIAL>";"                     { return int(';'); }

<INITIAL>. {
    printf("lexer error '%s'", yytext);
    yylval.error_msg = yytext; return 0; 
  }

%%

rici rici · Accepted Answer · 2017-04-16T22:51:23

This flex action is incorrect:

  yylval.id_v = yytext;

yytext points into an internal work buffer. Its contents will change every time the scanner is called. So if you want to keep the string which makes up the token, you must copy the string into your own storage, for example using strdup. (Don't forget to free the allocated storage when you are finished with it.)

BISON + FLEX grammar - why tokens are being concatenated together

1 Answer