0
votes

I am trying to print an AST and printing an Identifier's actual name in the tree. I am using lex and yacc.

For some reason yacc reads ahead all the tokens in a single row until ';' and this prevents me from using yytext to access an identifier's text value.

This is the lex file:

%{

%}



%%
"&&"      {return OP_AND;}
\"|"\"|"    {return OP_OR;} 
!=      {return OP_NE;}
==      {return OP_EQ;}
>       {return OP_GT;}
>=      {return OP_GE;}
\<      {return OP_LT;}
\<=     {return OP_LE;}
!       {return OP_NOT;}
=       {return ASSIGN;}
\+      {return PLUS;}
\-      {return MINUS;}
\*      {return MULT;}
\/      {return DIV;}
\&      {return ADRS;}
\^      {return PTR_VAL;}
\;  {return SEMI;}
,   {return COMMA;}
\{  {return CURLY_O;}
\}  {return CURLY_C;}
\(  {return PAREN_O;}
\)  {return PAREN_C;}
\|      {return BAR;}
\[  {return SQR_O;}
\]  {return SQR_C;}
else      {return ELSE;}
if        {return IF;}
do      {return DO;}
for        {return FOR;}
bool   {return BOOL;}
void   {return VOID;}
int { return INT;}
charp   {return CHARP;}
intp   {return INTP;}
string   {return STRING;}
while       {return WHILE;}
true      {return TRUE_;}
false     {return FALSE_;}
return    {return RETURN;}
null        {return NULL_;}



0|[1-9][0-9]*           {return CONST_INT;}

[a-zA-Z][a-zA-Z0-9]*           { return IDENT;}

\'.\'       {return CONST_CHAR; }
\".*\"  {return CONST_STRING; }

\[\$([^\$]*\$+[^\]])*[^\$]*\$+\]    { /*comment*/}

[ \n\t]   { /* skip whitespace */}

.     {return ILLEAGAL;}


%%

This is the yacc file: * note that some of the rules are not completely ready but are not used by this input file and are not relevant to this case.

%{
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

extern char *  yytext;

typedef struct node
{
    char * token;
    struct node *b1;
    struct node *b2;
    struct node *b3;
    struct node *b4;
}node;
int yylex();
int yyerror();

struct node* mknode2(char* token,node* b1,node* b2);
struct node* mknode3(char* token,node*b1,node*b2,node*b3);
struct node* mknode4(char* token,node*b1,node*b2,node*b3,node*b4);
int printtree(node* tree);
#define YYSTYPE node*
%}


%token SEMI COMMA CURLY_O CURLY_C PAREN_O PAREN_C SQR_O SQR_C BAR
%token BOOL INT CHAR STRING INTP CHARP VOID
%token IF DO WHILE FOR RETURN
%token CONST_INT CONST_CHAR CONST_STRING NULL_ IDENT TRUE_ FALSE_
%token ILLEAGAL
%right ASSIGN 
%left  OP_OR OP_AND OP_NE OP_EQ OP_GT OP_GE OP_LT OP_LE 
%left  PLUS MINUS
%left  MULT DIV
%right OP_NOT
%right ADRS
%right PTR_VAL
%nonassoc LOWER_THAN_ELSE
%nonassoc ELSE
%start program

%%

program    : functions {printtree($1);}
           ;

functions  : functions function {$$ = mknode2("FUNC" ,$2,$1);}  
           |    {$$ = NULL;}                    
           ;

function   : type id PAREN_O f_params PAREN_C CURLY_O body CURLY_C { $$ = mknode3($1,$2,$4,$7);}
           ;

body       : var_decls stmts ret_stmt {$$ = mknode3("BODY" ,$1 , $2 , $3);}
           ;

nest_block : stmts {$$=$1;} 
       ;

f_params   : f_params_ {$$ = $1;}    
           | {$$ = NULL;}             
           ;

f_params_  : f_params_ COMMA param {$$ = mknode2("," , $3 , $1);}
           | param {$$ = $1;}
           ;

param      : type id {$$ = mknode2($1 , NULL , $2 );}     
           ;

var_decls  : var_decls var_decl { $$ = mknode2("DECL" , $2 , $1);}
          | var_decls function {$$ = mknode2("DECL" , $2 , $1);}
           | {$$ = NULL;}                       
           ;

var_decl   : type var_list SEMI { $$ = mknode2($1 , NULL , $2); }
           ;

var_list   : var_list COMMA dec_assign {$$ = mknode2(",", $3 , $1);}
           | dec_assign { $$ = $1 ;}
           ;

dec_assign : id ASSIGN expr { $$ = mknode2("=", $1 , $3);}
           | id {$$ = $1; }                      
           ;


type       : BOOL {$$ = yytexy;}        
           | INT { $$ = yytext;}        
           | CHAR {$$ = yytext;}        
           | INTP {$$ = yytext;}        
           | CHARP {$$ = yytext;}       
           | STRING {$$ = yytext;}      
           | VOID {$$ = yytext;}    
           ;



stmts      : stmts stmt { $$ = mknode2("STMT" , $2 , $1 );}     
           | { $$ = NULL ;}         
           ;

stmt       : assignment SEMI {$$ = $1 ;}                
           | fct_call   SEMI    {$$ = $1; }     
           | IF PAREN_O expr PAREN_C opt_nest   %prec LOWER_THAN_ELSE {$$ = mknode2("if" , $3 , $5 );}
           | IF PAREN_O expr PAREN_C opt_nest ELSE opt_nest {$$ = mknode3("if" , $3 ,  $5 , $7 );}          
           | WHILE PAREN_O expr PAREN_C opt_nest {$$ = mknode2("while" , $3 , $5);}
           | DO CURLY_O nest_block CURLY_C WHILE PAREN_O expr PAREN_C SEMI {$$ = mknode2("do" , $3 , $7);}
           | FOR PAREN_O assignment SEMI expr SEMI assignment PAREN_C opt_nest {$$ = mknode4("for" , $3 , $5 , $7 , $9 );}
           ;


opt_nest   : CURLY_O nest_block CURLY_C {$$ = $2;}      
           | stmt {$$ = $1;}    
           ;


ret_stmt   : RETURN expr SEMI { $$ = mknode2("return" , NULL , $2);}
        | { $$ = NULL; }    
           ;

assignment : id ASSIGN expr {$$ = mknode2("=" , $1, $3);}               
          | id SQR_O expr SQR_C ASSIGN expr {$$ = mknode3("=" , $1 , $3, $6 );}     
           ;

fct_call   : id PAREN_O expr_list PAREN_C {$$ = mknode2("FUNCALL" , $1 ,$3 );}
           ;

expr_list  : expr_list_ expr {$$ = mknode2("AGRS" , $2 , $1);}
       | {$$ = NULL;}       
       ;

expr_list_ : expr_list_ expr COMMA { $$ = mknode2("," , $2 , $1);}
       | {$$ = NULL;}       
       ;

id         : IDENT { $$ = mknode2(strdup(yytext) , NULL , NULL);}
           ;



expr       : expr PLUS expr { $$ = mknode2("+" , $1 , $3);} 
           | expr MINUS expr { $$ = mknode2("-" , $1 , $3);}
           | expr MULT expr { $$ = mknode2("*" , $1 , $3);} 
           | expr DIV expr { $$ = mknode2("/" , $1 , $3);}  
           | expr OP_AND expr { $$ = mknode2("&&" , $1 , $3);}  
           | expr OP_OR expr { $$ = mknode2("||" , $1 , $3);}
           | expr OP_NE expr { $$ = mknode2("!=" , $1 , $3);}
           | expr OP_EQ expr {$$=mknode2("==",$1,$3);}
           | expr OP_GT expr {$$=mknode2(">",$1,$3);}   
           | expr OP_GE expr {$$=mknode2(">=",$1,$3);}
           | expr OP_LT expr {$$=mknode2("<",$1,$3);}
           | expr OP_LE expr {$$=mknode2("<=",$1,$3);}
           | OP_NOT expr {$$=mknode2("!",NULL,$2);}
           | PTR_VAL expr {$$=mknode2("^",NULL,$2);}
           | ADRS expr  {$$=mknode2("&",NULL,$2);}
           | MINUS expr {$$=mknode2("-",NULL,$2);}      
           | literal {$$=mknode2($1,NULL,NULL);}
           | fct_call {$$= $1 ;}
           | id {$$= $1;}           
           | PAREN_O expr PAREN_C {$$=mknode2("(_)",NULL,$2);}  
           | BAR expr BAR {$$=mknode2("|_|",NULL,$2);}
           ;

literal    : CONST_INT {$$ = "const_int";}
           | TRUE_ { $$ = "true" ;}
           | FALSE_ { $$ = "false" ;}
           | CONST_CHAR { $$ = "const_char" ;}
           | CONST_STRING { $$ = "const_string" ;}
           | NULL_ { $$ = "null" ;}
           ;

%%
#include "lex.yy.c"
void main(){
    return yyparse();
}


int tabCount =0;

struct node* mknode2(char* token,node*b1,node*b2)
{

    node* newnode=(node*)malloc(sizeof(node));
    char* newstr;

    if( token ){
        printf("token: %s \n" , token);
        newstr=(char*)malloc(sizeof(token)+1);
        newstr[sizeof(token)]='\0';
        strcpy(newstr,token);
    }
    else{

        newstr=(char*)malloc(3);
        strcpy(newstr,"AA");
    }

    newnode->b1=b1;
    newnode->b2=b2;
    newnode->b3=NULL;
    newnode->b4=NULL;
    newnode->token=newstr;

    return newnode;
}

struct node* mknode3(char* token,node*b1,node*b2,node*b3)
{
    node* newnode= mknode2(token,b1,b2);
    newnode->b3=b3;
    return newnode;
}

struct node* mknode4(char* token,node*b1,node*b2,node*b3,node*b4)
{
    node* newnode= mknode3(token,b1,b2,b3);
    newnode->b4=b4;
    return newnode;
}



void printTabs(){
    int i;
    for(i=0;i<tabCount;i++){
        printf("\t");   
    }
}


int printtree(node* tree)
{


    tabCount++;
    printTabs();
    printf("%s\n",tree->token);

    if(tree->b1)printtree(tree->b1);
    if(tree->b2)printtree(tree->b2);
    if(tree->b3)printtree(tree->b3);
    if(tree->b4)printtree(tree->b4);

    tabCount--;
    return 1;
}


int yyerror(){
    printf("ERROR!\n");
    return 0;
}

this is the input file:

void main(int x){
    int y,w,r; int z;
}

this is the output AST in preorder:

FUNC
    void
        main
        int x
            x
        BODY
            DECL
                int z;
                    ;
                DECL
                    int y,w,r;
                        ,
                            ;
                            ,
                                ,
                                ,

the output i expected:

FUNC
    void
        main
        int
            x
        BODY
            DECL
                int
                    z
                DECL
                    int 
                        ,
                            y
                            ,
                                w
                                r

As can be seen when I get to the rule: type in my yacc file yytext contains the whole sentence ("int y,w,r;" instead of just "int") and when I get to the rule: id it only contain one token, that happens to be one token forward then the expected token.
- Note that in the function name and the function arguments yytext works properly.
- I tried printing yytext from the lex file and the tokens seems to contain right values.

1

1 Answers

1
votes

yytext points to the last lexeme scanned. Any other previously-saved pointer into the scanner's intrrnal buffer (such as a previous value of yytext) is invalid, and xannot be assumed to point at anything useful.

In a parser action, you have really very little idea what the last token scanned is. Yacc/bison generate LALR(1) parsers, where the 1 indicates "one lookahead token". That means that the parser (can) always examine the next token, and if it does so that is the token yytext will point at. Some implementations always read ahead; others, like bison, sometimes do not read ahead if the lookahead won't change parser behaviour. So which token yytext points at during a parser action is unspecified and may change in future versions or with different implementations.

Bottom line: Never use yytext in a parser action. And never pass the pointer value of yytext from the scanner to the parser. If you will need the string value of the token in the parser, make a copy of the string and pass the address of the copy to the parser through yylval. If you use strdup or similar, don't forget to free the copy when you no longer need it.