Let's assume I have a simple grammar of positive integers and alphabetic strings separated by commas. I want to parse this grammar using Flex and Bison, and I want to use multiple input buffers with Flex, for whatever reasons (maybe data is arriving over the network or a serial line or whatever). The problem I'm seeing is that when a string or an integer (which are both variable length tokens) are split between the end of one buffer and the start of the next, the lexer reports two tokens when there should only be one.
In the example below, the chunks are 10,
asdf
and g,
. If that was all in one buffer it would yield the tokens INT(10)
COMMA
STR(asdfg)
COMMA
. But with the 'g' in a different buffer from the 'asdf', the lexer actually produces INT(10)
COMMA
STR(asdf)
STR(g)
COMMA
. It appears the logic upon reaching the end of the buffer is (1) check if the input matches a token, (2) refill the buffer. I feel it should be the other way around: (2) refill the buffer, (1) check if the input matches a token.
I want to make sure I'm not doing something silly with the way I'm changing the buffers.
stdout/stderr:
read_more_input: Setting up buffer containing: 10,
--accepting rule at line 48 ("10")
Starting parse
Entering state 0
Reading a token: Next token is token INT_TERM ()
Shifting token INT_TERM ()
Entering state 1
Return for a new token:
--accepting rule at line 50 (",")
Reading a token: Next token is token COMMA ()
Shifting token COMMA ()
Entering state 4
Reducing stack by rule 2 (line 67):
$1 = token INT_TERM ()
$2 = token COMMA ()
-> $$ = nterm int_non_term ()
Stack now 0
Entering state 3
Return for a new token:
--(end of buffer or a NUL)
--EOF (start condition 0)
read_more_input: Setting up buffer containing: asdf
--(end of buffer or a NUL)
--accepting rule at line 49 ("asdf")
Reading a token: Next token is token STR_TERM ()
Shifting token STR_TERM ()
Entering state 6
Return for a new token:
--(end of buffer or a NUL)
--EOF (start condition 0)
read_more_input: Setting up buffer containing: g,
--accepting rule at line 49 ("g")
Reading a token: Next token is token STR_TERM ()
syntax errorError: popping token STR_TERM ()
Stack now 0 3
Error: popping nterm int_non_term ()
Stack now 0
Cleanup: discarding lookahead token STR_TERM ()
Stack now 0
Lex file:
%{
#include <stdbool.h>
#include "yacc.h"
bool read_more_input(yyscan_t scanner);
%}
%option reentrant bison-bridge
%%
[0-9]+ { yylval->int_value = atoi(yytext); return INT_TERM; }
[a-zA-Z]+ { yylval->str_value = strdup(yytext); return STR_TERM; }
, { return COMMA; }
<<EOF>> {
if (!read_more_input(yyscanner)) {
yyterminate();
}
}
Yacc file:
%{
// This appears to be a bug. This typedef breaks a dependency cycle between the headers.
// See https://stackguides.com/questions/44103798/cyclic-dependency-in-reentrant-flex-bison-headers-with-union-yystype
typedef void * yyscan_t;
#include <stdbool.h>
#include "yacc.h"
#include "lex.h"
%}
%define api.pure full
%lex-param {yyscan_t scanner}
%parse-param {yyscan_t scanner}
%define api.push-pull push
%union {
int int_value;
char * str_value;
}
%token <int_value> INT_TERM
%type <int_value> int_non_term
%token <str_value> STR_TERM
%type <str_value> str_non_term
%token COMMA
%%
complete : int_non_term str_non_term { printf(" === %d === %s === \n", $1, $2); }
int_non_term : INT_TERM COMMA { $$ = $1; }
str_non_term : STR_TERM COMMA { $$ = $1; }
%%
char * packets[]= {"10,", "asdf", "g,"};
int current_packet = 0;
bool read_more_input(yyscan_t scanner) {
if (current_packet >= 3) {
fprintf(stderr, "read_more_input: No more input\n");
return false;
}
fprintf(stderr, "read_more_input: Setting up buffer containing: %s\n", packets[current_packet]);
size_t buffer_size = strlen(packets[current_packet]) + 2;
char * buffer = (char *) calloc(buffer_size, sizeof(char));
memcpy(buffer, packets[current_packet], buffer_size - 2);
yy_scan_buffer(buffer, buffer_size, scanner);
current_packet++;
return true;
}
int main(int argc, char** argv) {
yyscan_t scanner;
yylex_init(&scanner) ;
read_more_input(scanner);
yyset_debug(1, scanner);
yydebug = 1;
int status;
yypstate *ps = yypstate_new ();
YYSTYPE pushed_value;
do {
status = yypush_parse(ps, yylex(&pushed_value, scanner), &pushed_value, scanner);
} while(status == YYPUSH_MORE);
yypstate_delete (ps);
yylex_destroy (scanner) ;
return 0;
}