0
votes

I know that one can reject a token in lex as in:

    .*    {
             // Copy the text of the whole matched line into the semantic
             // value, then REJECT this match. The intent is for the line's
             // contents to be re-scanned by the other rules.
             yylval->strval = strdup(yytext);
             REJECT;
          };

I am doing this to get the content of the whole line for error reporting later. So I want to preserve this line and have access to it in the bison code. After rejecting the entire line, tokens in the line are matched to other lex rules and expected to be parsed and match parsing rules in .y. (Or am I wrong?)

One way to preserve the rejected line is to assign it to a global variable, which is, of course, not elegant and, for other reasons, not correct either (this is a reentrant parser/lexer).

Any suggestions?

1

1 Answer

2
votes

First, don't use REJECT for this. REJECT is a hideous hack which massively degrades your lexer, and has very few practical use cases in a parser.

Also, it won't work as you have written it. After you reject a match to .*, the next match returned will be the next shorter .* match -- i.e. all but the last character in the line -- rather than a match for another pattern (unless all but the last character in the line happens to match another pattern with higher priority).

If you want to capture and temporarily store every line read, the easiest way to do it is to provide your own YY_INPUT which reads the input one line at a time. (I'd suggest using the POSIX 2008 getline() interface if your C library supports it.) Just save this line (and its length).

Now, to your actual question: where to save things in a reentrant lexer so that the parser can see them.

Fortunately, flex provides a way for you to augment the yyscan_t context object with your own arbitrary "extra" data, yyextra. (Outside of the scanner code itself, you need to use the accessor yyget_extra(scanner) to get at this value.) By default, yyextra has type void *, but you can override that using %option extra-type. It's usual for yyextra to be a pointer to your own context object, although it may be possible to make yyextra a small structure if you don't need much context. (yyget_extra and yyset_extra pass yyextra by value, which is not appropriate for many context objects.)

Here's a (somewhat) simple example of all this. Most of the boilerplate below was taken from the skeleton in this answer; refer to that link for an explanation of most of the oddities.

File sample.l

%option noyywrap 8bit noinput nounput nodefault
%option reentrant bison-bridge bison-locations yylineno
%option extra-type="struct ScanExtra*"
%{
  #include "sample.tab.h"

  /* Feed the scanner exactly one input line per refill.
   *
   * getline() reads the next line of yyin into yyextra->linebuf (growing it
   * as needed) and its return value is kept in yyextra->linebuf_len, so the
   * most recently read line stays available through the scanner's extra
   * data, e.g. for error messages.
   *
   *   - getline() returns -1 (end of input or read error): report YY_NULL.
   *   - the line fits in max_size bytes: copy it into flex's buffer.
   *   - the line is longer than max_size: abort with a fatal scanner error.
   *     Leaving `result` unassigned here would make the scanner consume an
   *     indeterminate number of bytes. Delivering a long line in several
   *     pieces would need a read offset stored alongside the line buffer.
   */
  #define YY_INPUT(buf,result,max_size) do {                       \
    yyextra->linebuf_len = getline(&yyextra->linebuf,              \
                                   &yyextra->linebuf_size, yyin);  \
    if (yyextra->linebuf_len == -1) result = YY_NULL;              \
    else if (yyextra->linebuf_len <= max_size) {                   \
      memcpy(buf, yyextra->linebuf, yyextra->linebuf_len);         \
      result = yyextra->linebuf_len;                               \
    }                                                              \
    else YY_FATAL_ERROR("input line too long for scanner buffer"); \
  } while(0)
%}

%%
[[:space:]]      ; /* Ignore whitespace */
#.*              ; /* Ignore comments: '#' through the end of the line */
[[:alnum:]]+     {
                   /* Hand the parser a heap copy of the word; the grammar's
                    * WORD action is responsible for freeing it. */
                   yylval->str = strdup(yytext);
                   if (yylval->str == NULL)
                     YY_FATAL_ERROR("out of memory while copying a WORD");
                   return WORD;
                 }
.                {
                   /* Any other character is its own token. Convert through
                    * unsigned char so that bytes >= 0x80 cannot become a
                    * negative token number, which the parser would treat
                    * as end of input. */
                   return (unsigned char)*yytext;
                 }
%%
/* These functions are here for simplicity. Normally, I'd put them in a
 * separate parse_utils.c file.
 * They're declared in sample.h (except for yyerror).
 */

/* Creates a scanner with an initialised (zero-filled) ScanExtra attached.
 *
 * Returns the new scanner handle, or NULL if either the ScanExtra or the
 * scanner itself could not be allocated. On failure nothing is leaked.
 * Release the result with myscanner_destroy().
 */
yyscan_t myscanner_create(void) {
  struct ScanExtra* extra = calloc(1, sizeof *extra);
  if (extra == NULL) return NULL;

  yyscan_t scanner = NULL;
  /* yylex_init_extra() returns non-zero on failure; the scanner does not
   * take ownership of `extra` in that case, so free it here. */
  if (yylex_init_extra(extra, &scanner) != 0) {
    free(extra);
    return NULL;
  }
  return scanner;
}

/* Destroys a scanner together with its ScanExtra.
 *
 * The line buffer inside ScanExtra is allocated by getline() in YY_INPUT,
 * so it has to be released here as well; freeing only the struct would leak
 * it. Passing NULL is a no-op.
 */
void myscanner_destroy(yyscan_t scanner) {
  if (scanner == NULL) return;

  struct ScanExtra* extra = yyget_extra(scanner);
  if (extra != NULL) {
    free(extra->linebuf);
    free(extra);
  }
  yylex_destroy(scanner);
}

#include <ctype.h>
/* Reports a syntax error on stderr, quoting the current token, the current
 * line number and the text of the most recently read input line.
 *
 * yyllocp  location of the error (not used by this implementation)
 * scanner  scanner whose extra data holds the saved input line
 * msg      parser-supplied message (not used; a fixed wording is printed)
 */
void yyerror(YYLTYPE* yyllocp, yyscan_t scanner, const char* msg) {
  const struct ScanExtra* extra = yyget_extra(scanner);
  const char* buf = extra->linebuf;
  int len = 0;

  /* linebuf_len is -1 once getline() has hit end of input, and linebuf is
   * NULL if nothing was ever read. Only use the saved line when both are
   * valid, otherwise print an empty line instead of reading out of bounds. */
  if (buf != NULL && extra->linebuf_len > 0)
    len = (int)extra->linebuf_len;
  else
    buf = "";

  /* Get rid of trailing whitespace in the current line. The cast keeps
   * isspace() defined for bytes >= 0x80 where char is signed. */
  while (len > 0 && isspace((unsigned char)buf[len - 1])) --len;

  fprintf(stderr, "Syntax error near '%s' in line %d: '%.*s'\n",
          yyget_text(scanner),
          yyget_lineno(scanner),
          len, buf);
}

File sample.y

/* Pure parser with locations; the scanner handle is passed as an extra
 * argument to both yyparse and yylex. */
%define api.pure full
%locations
%param { yyscan_t scanner }

%code top {
  #include <stdio.h>
  #include <stdlib.h>  /* free(), called from the grammar actions */
}
%code requires {
  /* These includes make the generated header self-contained: it uses
   * size_t and ssize_t below, so it must not rely on the including file
   * having pulled in a header that happens to define them. */
  #include <stddef.h>     /* size_t */
  #include <sys/types.h>  /* ssize_t (POSIX) */

  typedef void* yyscan_t;

  /* I define ScanExtra here so that it goes into the generated header file */
  struct ScanExtra {
    char*   linebuf;       /* most recently read input line; allocated and
                            * resized by getline() */
    size_t  linebuf_size;  /* allocated size of linebuf, kept by getline() */
    ssize_t linebuf_len;   /* getline() return value: length of the line,
                            * or -1 at end of input / on error */
  };
}
%code {
  /* Prototypes for the two functions the parser calls. Their parameter
   * lists reflect the directives above: the semantic value pointer from the
   * pure API, the location pointer from %locations, and the scanner handle
   * from %param. yyerror is defined in the user-code section of sample.l. */
  int yylex(YYSTYPE* yylvalp, YYLTYPE* yyllocp, yyscan_t scanner);
  void yyerror(YYLTYPE* yyllocp, yyscan_t scanner, const char* msg);
}

%union {
  char* str;  /* heap-allocated token text (strdup'ed by the scanner) */
}
%token <str> WORD
/* Free the text of any <str> symbol that the parser discards without
 * running an action on it (e.g. during a syntax error), so that it does
 * not leak. */
%destructor { free($$); } <str>
%%
/* An item is either a bare word or a parenthesised, possibly empty, list.
 * The word's text was heap-allocated by the scanner and is not kept, so it
 * is released as soon as the rule is reduced. */
item
    : WORD                      { free($1); }
    | '(' opt_item_list ')'
    ;

/* One or more items separated by commas. */
item_list
    : item
    | item_list ',' item
    ;

/* Zero or more items. */
opt_item_list
    : %empty
    | item_list
    ;

File sample.h

#ifndef SAMPLE_H
#define SAMPLE_H

/* Public interface of the sample scanner/parser pair: pulls in the
 * bison- and flex-generated headers and declares the scanner helpers. */
#include "sample.tab.h"
#include "sample.lex.h"

/* Creates a scanner with a zero-initialised ScanExtra attached to it. */
yyscan_t myscanner_create(void);

/* Destroys a scanner created by myscanner_create(), including its
 * ScanExtra. */
void myscanner_destroy(yyscan_t scanner);

#endif /* SAMPLE_H */

File Makefile

# `all` and `clean` are commands, not files: mark them phony so that a stray
# file with either name cannot stop the rule from running.
# Recipe lines below are indented with a TAB, as make requires.
.PHONY: all clean

all: sample

# flex writes the scanner source and, via --header-file, sample.lex.h.
sample.lex.c: sample.l
	flex -o $@ --header-file=$(patsubst %.c,%.h,$@) --debug $<

# bison writes the parser source and, via --defines, sample.tab.h.
sample.tab.c: sample.y
	bison -o $@ --defines=$(patsubst %.c,%.h,$@) --debug $<

# sample.h is a prerequisite only; $(filter %.c,$^) keeps it off the compile
# line. -D_XOPEN_SOURCE=700 exposes getline() and strdup() under --std=c11.
sample: main.c sample.tab.c sample.lex.c sample.h
	$(CC) -o $@ -Wall --std=c11 -ggdb -D_XOPEN_SOURCE=700 $(filter %.c,$^)

clean:
	rm -f sample.tab.c sample.lex.c sample.tab.h sample.lex.h sample