File r36/lalr.red artifact 82b0c25669 on branch master



symbolic;
spool "lalr.log";
on comp, backtrace;
unglobal '(!*raise);
fluid '(!*raise);

%=============================================================================
% The file provides parser support for Lisp. The file contains three
% major sections.  The first is a lexical analyser, which has been
% coded to support RLISP and Standard Lisp. Its interface is modelled after
% the one used with the lex/yacc combination commonly used with Unix.
%
% The second section is the generic part of an LR parser. It requires
% tables to tell it what actions to take, and calls the lexical analyser to
% obtain tokens.
%
% The final part is an LALR(1) parser generator, which can take the
% specification of a grammar and construct tables that direct the
% generic parser skeleton.
%
%=============================================================================

%
% This is a lexical anaylser for use with RLISP. Its interface is
% styles after the one needed by yacc, in that it exports a function
% called yylex() that returns as a value a numeric category code, but
% sets a variable yylval to hold further information about the token
% just parsed. Single character objects are coded as their (ASCII?) code
% [leaving this code non-portable to machines with other encodings?].
% Other things must have been given 'lex_code properties indicate the
% associated category code.  This lexer handles ' and ` as special prefix
% characters that introduce Lisp-stype s-expressions. and it knows about
% RLISP-style comments and a few diphthongs. It also supports some
% simple preprocessor directives.
%
%                                              Arthur Norman.  April 1995


fluid '(!*raise !*lower !*echo);
global '(lex_char yylval last64 last64p which_line);

% I keep a circular buffer with the last 64 characters that have been
% read. Initially the buffer contains NILs rather than characters, so I can
% tell when it is only partially filled. 

smacro procedure yyreadch();
 << last64p := last64p + 1;
    if last64p = 64 then last64p := 0;
    lex_char := readch();
    if lex_char = !$eol!$ then which_line := which_line + 1;
    putv(last64, last64p, lex_char);
    lex_char >>;

symbolic procedure yyerror msg;
  begin
    scalar c;
    terpri();
    princ "+++++ Parse error at line "; prin which_line; princ ":";
    if atom msg then msg := list msg;
    for each s in msg do << princ " "; princ s >>;
    terpri();
    for i := 1:64 do <<
       last64p := last64p + 1;
       if last64p = 64 then last64p := 0;
       c := getv(last64, last64p);
       if c = !$eof!$ then princ "<EOF>"
       else if not (c = nil) then princ c >>;
    if not (c = !$eol!$) then terpri()
  end;

% Before a succession of calls to yylex() it is necessary to
% ensure that lex_char is set suitably and that the circular buffer
% used to store characters for error messages is ready for use.

symbolic procedure start_parser();
 << last64 := mkvect 64;
    last64p := 0;
    which_line := 1;
    yyreadch() >>;

%
% The following version of YYLEX provides RLISP with a facility for
% conditional compilation.  The protocol is that text is included or
% excluded at the level of tokens.  Control by use of new reserved
% tokens !#if, !#else and !#endif.  These are used in the form:
%    !#if (some Lisp expression for use as a condition)
%    ... RLISP input ...
%    !#else
%    ... alternative RLISP input ...
%    !#endif
%
% The form
%    !#if C1 ... !#elif C2 ... !#elif C3 ... !#else ... !#endif
% is also supported.
%
% Conditional compilation can be nested.  If the Lisp expression used to
% guard a condition causes an error it is taken to be a FALSE condition.
% It is not necessary to have an !#else before !#endif if no alternative
% text is needed. Although the examples here put !#if etc at the start of
% lines this is not necessary (though it may count as good style?). Since
% the condtion will be read using RLISPs own list-reader there could be
% condtional compilation guarding parts of it - the exploitation of that
% possibility is to be discouraged!
%
% Making the condition a raw Lisp expression makes sure that parsing it
% is easy. It makes it possible to express arbitrary conditions, but it is
% hoped that most conditions will not be very elaborate - things like
%    !#if (not (member 'csl lispsystem!*))
%         error();
%    !#else
%         magic();
%    !#endif
% or
%    !#if debugging_mode   % NB if variable is unset that counts as nil
%    print "message";      % so care should be taken to select the most
%    !#endif               % useful default sense for such tests
% should be about as complicated as reasonable people need.
% 
% Two further facilities are provided:
%    !#eval (any lisp expression)
% causes that expression to be evaluated at parse time.  Apart from any
% side-effects in the evaluation the text involved is all ignored. It is
% expected that this will only be needed in rather curious cases, for instance
% to set system-specific options for a compiler.
%
%    !#define symbol value
% where the value should be another symbol, a string or a number, causes
% the first symbol to be mapped onto the second value wherever it occurs in
% subsequent input.  No special facility for undoing the effect of a
% !#define is provided, but the general-purpose !#eval could be used to
% remove the '!#define property that is involved.
%
% NOTE: The special symbols !#if etc are NOT recognised within Lisp
%       quoted expressions, so test like the following will be
%       ineffective:
%            a := '(
%                P
%            !#if q_is_wanted
%                Q
%            !#endif
%                Q);
%       but on the other hand code like
%            if sym = '!#if then ...
%       behaves the way that had probably been wanted. Unlike the C
%       preprocessor, this system recognizes directives within rather than
%       just at the start of lines.


symbolic procedure yylex();
  begin
    scalar w, done;
% I take a rather robust view here - words that are intended to be used as
% keywords may not be written with included escape characters. Thus for
% instance this lexer will view "be!gin" or "!begin" as being a simple
% symbol and NOT being the keyword "begin".
    w := lex_basic_token();
% The "while not done" loop is so that I can restart the scan after seeing
% a pre-processor directive such as !#if.
    while not done do <<
% The word "COMMENT" introduces a comment that terminates at the next ";"
% or "$".
        while yylval = 'comment and
              w = '!:symbol_or_keyword do <<
            while not (lex_char = '!; or lex_char = '!$) do
                yyreadch();
            yyreadch();
            w := lex_basic_token() >>;
% If a word was spelt out directly (without any escape characters in it) it
% may be a keyword - if it is, then convert it here.
        if w = '!:symbol_or_keyword then <<
            if w := get(yylval, '!#define) then <<
                yylval := cdr w; 
                w := car w >>
            else <<
                if done := get(yylval, 'lex_code) then w := done
                else if flagp(yylval, 'rlis) then
                    w := get('rlistat, 'lex_code)
                else if flagp(yylval, 'endstat) then
                    w := get('endstat, 'lex_code)
                else w := get('!:symbol, 'lex_code);
                done := t >> >>
% A word with escapes in might be a pre-processor directive.
        else if w = '!:symbol then <<
            if yylval eq '!#if then <<
                read_s_expression();
                w := lex_conditional yylval >>
            else if yylval eq '!#else or
                    yylval eq '!#elif then <<
                yylval := nil;
                w := lex_skipping(w, nil) >>
            else if yylval eq '!#endif then w := lex_basic_token()
            else if yylval eq '!#eval then << 
                read_s_expression();
                errorset(yylval, nil, nil);
                w := lex_basic_token() >>
            else if yylval eq '!#define then <<
                read_s_expression();
                w := yylval;    % Ought to be a symbol
                done := read_s_expression();
                if idp w then put(w, '!#define, done . yylval);
                w := lex_basic_token();
                done := nil >>
            else <<
                w := get('!:symbol, 'lex_code);
                done := t >> >>
        else if numberp w then <<
% Now gobble up extra characters for multi-character operators (eg ">=").
% Note that I only look one character ahead here.
            while done := atsoc(lex_char, get(yylval, 'lex_dipthong)) do <<
                w := cdr done;
                yylval := cdr w;
                w := get(car w, 'lex_code);
                yyreadch() >>;
            if done := get(yylval, '!#define) then <<
                yylval := cdr done;
                w := car done;
                done := nil >>
            else done := t >>
        else <<
            done := t;
            w := get(w, 'lex_code) >> >>;
    return w
  end;



% If, when reading ordinary text, I come across the token !#if I read
% the expression following. If that evaluates to TRUE I just keep on
% on reading. So the sequence "!#if t" is in effect ignored. Then
% if later on I just ignore an "!#endif" all will be well.  If on the other
% hand the expression evaluates to NIL (or if evaluation fails), I will
% call lex_skipping() to discard more tokens (up to and including
% the next "!#else", "!#elif t" or "!endif").

symbolic procedure lex_conditional x;
  begin
    scalar w;
    w := lex_basic_token();
    x := errorset(x, nil, nil);
    if errorp x or null car x then return lex_skipping(w, nil)
    else return w
  end;

% I call lex_skipping when I find "!#if nil" or "!#else" or "!#elif"
% that is processed. When a top-level "!#else" or "!#elif" is found it
% is discarded before calling lex_skipping, since it must follow a
% successful "!#if" and hence introduce material to be thrown away.

symbolic procedure lex_skipping(w, x);
  begin
    scalar done;
% In this code x keep track of the depth of testing of "!#if" constructions
    while not done do <<
        if w = 0 then done := t   % End of file
        else <<
            if w = '!:symbol then <<
                if yylval = '!#endif then <<
                    if null x then done := t
                    else x := cdr x >>
                else if yylval = '!#if then x := nil . x
                else if yylval = '!#else and null x then done := t
                else if yylval = '!#elif and null x then <<
                    read_s_expression();
                    done := errorset(yylval, nil, nil);
                    if errorp done or null car done then done := nil >> >>;
            w := lex_basic_token() >> >>;
    return w
  end;



% In some cases RLISP operators are made up out of two (or more) characters.
% I map '**' onto '^', and >=, <= onto GEQ and LEQ.
% ":=" becomes SETQ.   I turn << and >> onto symbols that can not be
% read directly (!:lsect and !:rsect).
% This means that the system that sets up lex_code properties had really
% better make sure that it gives setq, geq, leq, !:rsect and !:lsect values.

put('!*, 'lex_dipthong, '((!* !^ . !^)));
put('!:, 'lex_dipthong, '((!= setq . setq)));
put('!>, 'lex_dipthong, '((!= geq . geq),
                          (!> !:rsect . !:rsect)));
put('!<, 'lex_dipthong, '((!= leq . leq),
                          (!< !:lsect . !:lsect)));

put('!^, 'lex_code, char!-code '!^);


% lex_basic_token() will read the next token from the current input stream and
% leave a value in yylval to show what was found. It does not handle the
% word "comment", nor does it consolidate things like ':' followed by '=' into
% ':='. Those steps are left to yylex(). But lex_basic_token() does recognize 
% the quote prefix, as in '(lisp expression).  The return value is numeric
% for single-character tokens, but otherwise a descriptive symbol.

% Some people would consider the Lisp dialect that I am using here to be
% significantly flawed, in that I need to build symbols, numbers and
% strings up as lists, and then use COMPRESS to make the real objects. The
% CONS operations involved can be seen as an overhead, and going back to
% something like the VERY old-fashioned clearbuff/pack/boffo world might
% avoid that.

symbolic procedure lex_basic_token();
  begin
    scalar r, w;
% First skip over whitespace. Note that at some stage in the future RLISP
% may want to make newlines significant and partially equivalent to
% semicolons, but that is not supported at present.
    while lex_char = '!  or lex_char = !$eol!$ or
          (lex_char = '!% and <<
             while not (lex_char = !$eol!$ or lex_char = !$eof!$) do
                yyreadch();
             t >>) do yyreadch();
% Symbols start with a letter or an escaped character and continue with
% letters, digits, underscores and escapes.
    if liter lex_char or
       (lex_char = '!! and begin
          scalar !*raise, !*lower;  % Rebind !*raise & !*lower to avoid..
          r := lex_char . r;      % case folding when the next character..
          yyreadch();   % is read.
          return (w := t) end) then <<
      r := lex_char. r;
      while liter(yyreadch()) or
            digit lex_char or
            lex_char = '!_ or
            (lex_char = '!! and begin
               scalar !*raise, !*lower;
               r := lex_char . r;
               yyreadch();
               return (w := t) end) do
        r := lex_char . r;
% If there was a '!' in the word I will never treat it as a keyword.
      yylval := compress reversip r;
       return if w then '!:symbol else '!:symbol_or_keyword >>
% Numbers are either integers or floats. A floating point number is
% indicated by either a point "." or an exponent marker "e". In the code
% here I keep a flag (in w) to indicate if I had a floating or integer
% value, but in the end I ignore this and hand back the lexical category
% :number in both cases.
    else if digit lex_char then <<
      r := list lex_char;
      while digit (yyreadch()) do r := lex_char . r;
      if lex_char = '!. then <<
        w := t;       % Flag to indicate floating point
        r := lex_char . r;
        while digit (yyreadch()) do r := lex_char . r >>;
% I permit the user to write the exponent marker in either case.
      if lex_char = '!e or lex_char = '!E then <<
% If the input as 1234E56 I expand it as 1234.0E56
        if not w then r := '!0 . '!. . r;
        w := t;
        r := '!e . r;
        yyreadch();
        if lex_char = '!+ or lex_char = '!- then <<
          r := lex_char . r;
          yyreadch() >>;
% If there is no digit written after "E" I insert a zero. Thus overall the
% input 17E gets treated as 17.0E0
        if not digit lex_char then r := '!0 . r
        else <<
          r := lex_char . r;
          while digit (yyreadch()) do r := lex_char . r >> >>;
      yylval := compress reversip r;
      return '!:number >>
% Strings are enclosed in double-quotes, and "abc""def" is a string with
% a double-quote mark within it. Note no case folding on characters in a
% string.
    else if lex_char = '!" then <<
      begin
        scalar !*raise, !*lower;
        repeat <<
          r := lex_char . r;
          while not ((yyreadch()) = '!") do r := lex_char . r;
          r := lex_char . r;
          yyreadch() >> until not (lex_char = '!");
      end;
      yylval := compress reversip r;
      return '!:string >>
% "'" and "`" introduce Lisp syntax S-expressions
    else if lex_char = '!' then <<
      yyreadch();
      read_s_expression();
      yylval := list('quote, yylval);
      return '!:list >>
    else if lex_char = '!` then <<
      yyreadch();
      read_s_expression();
      yylval := list('backquote, yylval);
      return '!:list >>
    else <<
      yylval := lex_char;
% I take special notice of end of file, since it is fairly drastic.
% In particular I do not attempt to advance lex_char beyond it. So I do
% TWO things here: I avoid advancing the input, and I return the code 0
% as an end-of-file indication.
      if yylval = !$eof!$ then return 0
      else <<
        yyreadch();
        return char!-code yylval >> >>
  end;

%
% I use a hand-written recursive descent parser for Lisp S-expressions
% mainly because the syntax involved is so VERY simple. A rough sketch of
% the syntax required is given here, but in reality (in part because I do
% not want to have to report syntax errors) I implement a more liberal
% syntax, especially as it relates to dotted-pair notation. This of course
% is one of the natural dangers in using recursive descent... the syntax
% actually parsed is only properly defined by direct reference to the code.
%

% s_tail      =   ")" |
%                 "." s_expr ")" |
%                 s_expr s_tail;
% 
% s_vectail   =   "]" |
%                 s_expr s_vectail;
% 
% s_expr      =   symbol |
%                 number |
%                 string |
%                 "(" s_tail |
%                 "[" s_vectail |
%                 "'" s_expr |
%                 "`" s_expr |
%                 "," s_expr |
%                 ",@" s_expr;

global '(dot_char rpar_char rsquare_char);

dot_char     := char!-code '!.;
rpar_char    := char!-code '!);
rsquare_char := char!-code '!];

symbolic procedure read_s_expression();
 <<
% At the start of an S-expression I want to check for the characters
% "(", "[" and ",". Thus I need to skip whitespace.
    while lex_char = '!  or lex_char = '!$eol!$ do yyreadch();
    if lex_char = '!( then begin
      scalar r, w, w1;
      yyreadch();
      w := read_s_expression();
      while not (w = rpar_char or w = dot_char or w = 0) do <<
        r := yylval . r;
% Note that at the end of the list read_s_expression() will read the ")"
% as a token.
        w := read_s_expression() >>;
      if not w = dot_char then yylval := reversip r
      else <<
        read_s_expression();  % Thing after the "."
        w := yylval;
% Reverse the list putting a dotted item on the end.
        while r do <<
          w1 := cdr r;
          rplacd(r, w);
          w := r;
          r := w1 >>;
        yylval := w;
% I will be somewhat liberal about syntactic problems with dotted pair
% notation, since it is unclear how I can usefully report or repair errors.
        while lex_char = '!  or lex_char = '!$eol!$ do
            yyreadch();
% When I find a ")" I do not read beyond it immediately, but reset lex_char
% to whitespace. This may help prevent unwanted hangups in interactive use.
        if lex_char = '!) then lex_char := '!  >>;
      return '!:list end
% "[" introduces a simple vector.
    else if lex_char = '![ then begin
      scalar r, w, w1;
      yyreadch();
      w := read_s_expression();
      w1 := -1;
      while not (w = rsquare_char or w = 0) do <<
        r := yylval . r;
        w1 := w1 + 1;
        w := read_s_expression() >>;
% Create a vector of the correct size and copy information into it.
      w := mkvect w1;
      r := reversip r;
      w1 := 0;
      while r do <<
        putv(w, w1, car r);
        w1 := w1 + 1;
        r := cdr r >>;
      yylval := w;
      return '!:list end
% I spot "," and ",@" here, and should wonder if I should (a) police that
% they are only expected to make sense within the scope of a "`" and (b)
% whether I ought to expand them in terms of LIST, CONS, APPEND etc here.
% For now I just hand back markers that show where they occured.
    else if lex_char = '!, then <<
      yyreadch();
      if lex_char = '!@ then <<
        yyreadch();
        read_s_expression();
        yylval := list('!,!@, yylval) >>
      else <<
        read_s_expresssion();
        yylval := list('!,, yylval) >>;
      'list >>
% Care with ")" and "]" not to read ahead further than is essential.
    else if lex_char = '!) or lex_char = '!] or lex_char = '!. then <<
      yylval := lex_char;
      lex_char := '! ;
      char!-code yylval >>      
% In most cases (including "'" and "`") I just hand down to read a token.
% This covers the cases of symbols, numbers and strings.
    else lex_basic_token() >>;



%=============================================================================

% Here I have a general-purpose LR(1) parser skeleton.  This needs to
% have a source of tokens, and some tables that will direct its actions.

% The format of the tables required by this code will be a little curious,
% mainly because they represent some attempt to compact the information that
% is needed. Note that the CSL functions mkvect16, putv16, getv16 are
% somewhat similar to the regular mkvect, putv and getv, but may be used
% if the vector contents will always be 16-bit fixnums.

global '(!*verbose);

!*verbose := t;      % How much will the parset-generator print?

global '(goto_index goto_old_state goto_new_state);

% For each terminal I have a pointer (stored in goto_index) into
% a pair of vectors, goto_old_state and goto_new_state. The first of these
% holds states that I might be in, and the second holds the ones I must
% move into after a reduction has been performed. In the goto_old_state
% table the value "-1" is taken to match any state that I am attempting
% to look up. Thus it can be used to terminate a segment of the table. I am
% entitled to let undefined locations in the goto table respond with
% any value that just happens.

smacro procedure get_goto(state, non_terminal);
 << w1 := getv16(goto_index, non_terminal);
    while not (((w2 := getv16(goto_old_state, w1)) = -1) or
               w2 = state) do w1 := w1 + 1;
    getv16(goto_new_state, w1) >>;

global '(action_index, action_terminal action_result);

% In a rather similar way, actions are found via an association-list
% like look-up in a table. In this table a strictly positive number n stands
% for (SHIFT n) [observe that if I reserve zero as the number of the
% initial state of my augmented grammar I can never need to shift into
% state 0].  The value 0 in the table represents ACCEPT. This leaves
% nagative values to cover reductions and error cases.

smacro procedure get_action(state, terminal);
 << w1 := getv16(action_index, state);
    while not (((w2 := getv16(action_terminal, w1)) = -1) or
               w2 = terminal) do w1 := w1 + 1;
    getv(action_result, w1) >>;

global '(action_first_error action_error_messages);

global '(action_fn action_A action_n);

symbolic procedure yyparse();
  begin
    scalar sym_stack, state_stack, next_input, w, w1, w2;
    state_stack := list 0;  % Note that state 0 must be the initial one.
    start_parser();
    next_input := yylex();
    while not (w := get_action(car state_stack, next_input)) = 0 do
        if w > 0 then <<
            sym_stack := next_input . sym_stack;
            state_stack := cadr w . state_stack;
            next_input := yylex() >>
        else begin   
            scalar A, n, action;
            w := - (w + 1);
            if w < action_first_error then <<
                action := getv(action_fn, w);
                n := getv8(action_n, w);
                A := getv16(action_A, w);
% I am now reducing by "A -> beta { action() }" where beta has n items
                w := nil;
                for i := 1:n do <<
                    w := car sym_stack . w;
                    sym_stack := cdr sym_stack;
                    state_stack := cdr state_stack >>;
                w := reversip w;
                if action then w := apply1(action, w)
                else w := A . w;
                sym_stack := w . sym_stack;
                state_stack := get_goto(car state_stack, A) >>
            else <<
                w := w - action_first_error;
                yyerror getv(action_error_messages, w);
% The next activity must result in the loop ending...
                state_stack := list 0;
                sym_stack := '(error);
                next_input := 0 >>
        end;
    return car sym_stack
  end;

%=============================================================================
%
% A grammar is represented as a list of rules. A rule
%      sym : x y z { p q r }
%          | x y z { p q r }
%          ;
% maps onto the list
%      (sym   ((x y z) p q r)
%             ((x y z) p q r))
% and items on the right hand side can be symbols or strings. Strings
% stand for terminals. Symbols that are mentioned on a left hand side of
% a production are non-terminals, others are considered to be terminals
% supported by the lexer.
%
% ***** I am still working out what to do with the "semantic actions".

global '(terminals non_terminals symbols goto_cache action_map);

smacro procedure lalr_productions x;
    get(x, 'produces);

smacro procedure lalr_set_productions(x, y);
    put(x, 'produces, y);

symbolic procedure lalr_prin_symbol x;
    if x = 0 then princ "$"
    else if x = nil then princ "<empty>"
    else if x = '!. then princ "."
    else if numberp x and rassoc(x, terminals) then
        prin car rassoc(x, terminals)
    else if stringp x then prin x
    else for each c in explode2uc x do princ c;

symbolic procedure lalr_display_symbols();
  begin
    princ "Terminal symbols are:"; terpri();
    for each x in terminals do <<
        princ " "; prin car x;
        princ ":"; prin cdr x >>;
    terpri();
    princ "Non-terminal symbols are:"; terpri();
    for each x in non_terminals do begin
       scalar w;
       princ "["; prin get(x, 'non_terminal_code); princ "]";
       lalr_prin_symbol x;
       w := ":";
       for each y in lalr_productions x do <<
           ttab 20; princ w; w := "|";
           for each z in car y do << princ " "; lalr_prin_symbol z >>;
           if posn() > 48 then terpri();
           ttab 48;
           princ "{";
           for each z in cdr y do << princ " "; prin z >>;
           princ " }";
           terpri() >>;
       ttab 20;
       princ ";";
       terpri() end;
    terpri();
  end;

symbolic procedure lalr_print_action_map();
  begin
    princ "Action map:"; terpri();
    for each x in action_map do <<
        prin cdr x; princ ":"; ttab 12; prin car x; terpri() >>
  end;

symbolic procedure lalr_set_grammar g;
  begin
    scalar name, vals, tnum, w;
    terminals := non_terminals := symbols := nil;
    tnum := 0;
% I will start by augmenting the grammar with an initial production...
    g := list('s!', list list caar g) . g;
    for each x in g do <<
        name := car x; vals := cdr x;
        if name member non_terminals then
            vals := append(vals, lalr_productions name)
        else non_terminals := name . non_terminals;
        for each vv in cdr x do
          for each v in car vv do <<
            if stringp v or numberp v then <<
                if not assoc(v, terminals) then
                    terminals := (v . (tnum := tnum+1)) . terminals >>
            else if not (v member symbols) then symbols := v . symbols >>;
        lalr_set_productions(name, vals) >>;
    for each name in non_terminals do symbols := delete(name, symbols);
    for each v in symbols do terminals := (v . (tnum := tnum+1)) . terminals;
% I reverse the list of non-terminals here so that the starting symbol
% remains as the first item.
    non_terminals := reversip non_terminals;
    tnum := -1;
    for each v in non_terminals do
        put(v, 'non_terminal_code, tnum := tnum+1);
    symbols := append(non_terminals, for each x in terminals collect cdr x);
    goto_cache := mkhash(length non_terminals, 1, 1.5);
    if !*verbose then lalr_display_symbols();
% Map all terminals onto numeric codes.
    for each x in non_terminals do
       lalr_set_productions(x, 
           for each y in lalr_productions x collect
               sublis(terminals, car y) . cdr y);
% Map all actions onto numeric codes, such that identical actions all have the
% same code
    action_map := nil;
    tnum := -1;
    for each x in non_terminals do
        for each a in lalr_productions x do <<
            w := assoc(cdr a, action_map);
            if null w then <<
                w := cdr a . (tnum := tnum + 1);
                action_map := w . action_map >>;
            rplacd(a, list cdr w) >>;
    action_map := reversip action_map;
    if !*verbose then lalr_print_action_map();
    lalr_calculate_first non_terminals;
  end;

symbolic procedure lalr_clean_up();
  begin
    for each x in terminals do <<
        remprop(x, 'produces);
        remprop(x, 'lalr_first);
        remprop(x, 'non_terminal_code) >>;
    terminals := non_terminals := symbols := nil;
    goto_cache := action_map := nil
  end;

symbolic procedure lalr_action(lhs, rhs);
    cdr assoc(rhs, lalr_productions lhs);

symbolic procedure lalr_print_firsts g;
  begin
    princ "FIRST sets for each non-terminal:"; terpri();
    for each x in g do <<
        lalr_prin_symbol x;
        princ ": ";
        ttab 15;
        for each y in get(x, 'lalr_first) do <<
            princ " "; lalr_prin_symbol y >>;
        terpri() >>
  end;

symbolic procedure lalr_calculate_first g;
  begin
    scalar w, y, z, done;
    for each x in g do
        if assoc(nil, lalr_productions x) then put(x, 'lalr_first, '(nil));
    repeat <<
        done := nil;
        for each x in g do <<
            z := get(x, 'lalr_first);
            for each y1 in lalr_productions x do <<
                y := car y1;
                while y and
                      not numberp y
                      and (nil member (w := get(car y, 'lalr_first))) do <<
                    z := union(w, z);
                    y := cdr y >>;
                if null y then nil
                else if numberp car y then z := union(list car y, z)
                else z := union(get(car y, 'lalr_first), z) >>;
            if not (z = get(x, 'lalr_first)) then done := t;
            put(x, 'lalr_first, z) >>
    >> until not done;
    if !*verbose then lalr_print_firsts g;
    return nil
  end;

symbolic procedure lalr_first l;
  begin
    scalar r, w;
    while l and
          not numberp car l and
          (nil member (w := get(car l, 'lalr_first))) do <<
        r := union(delete(nil, w), r);
        l := cdr l >>;
    if null l then r := nil . r
    else if numberp car l then r := union(list car l, r)
    else r := union(w, r);
    return r
  end;


% The next few procedures are as documented in Figure 4.38 of Red Dragon

symbolic procedure lalr_print_items(heading, cc);
  begin
    princ heading;
    terpri();
    for each y in cc do <<
        princ "Item number "; prin cdr y; terpri();
        for each x in sort(car y, function orderp) do <<
            lalr_prin_symbol caar x; princ " ->";
            for each y in cdar x do << princ " "; lalr_prin_symbol y >>;
            princ "  :  ";
            lalr_prin_symbol cadr x;
            terpri() >>;
        for each x in hashcontents goto_cache do
          for each xx in cdr x do
            if car xx = cdr y then <<
                ttab 10; lalr_prin_symbol car x;
                princ " GOTO state "; prin cdr xx; terpri() >> >>
  end;

symbolic procedure lalr_items g;
  begin
    scalar c, val, done, w, w1, w2, n;
    val := lalr_productions 's!';
    if cdr val then error(0, "Starting state must only reduce to one thing")
    else val := caar val;
    n := 0;
    c := list (lalr_closure list list(('s!' . '!. . val), 0) . n);
    repeat <<
        done := nil;
        for each i in c do
            for each x in symbols do
                if w := lalr_goto(car i, x) then <<
                     w1 := assoc(w, c);
                     if w1 then <<
                         w2 := gethash(x, goto_cache);
                         if not assoc(cdr i, w2) then
                             puthash(x, goto_cache, (cdr i . cdr w1) . w2) >>
                     else <<
                         c := (w . (n := n + 1)) . c;
                         puthash(x, goto_cache,
                                    (cdr i . n) . gethash(x, goto_cache));
                         done := t >> >>
    >> until not done;
    c := reversip c;   % So that item numbers come out in nicer order.
    if !*verbose then lalr_print_items("LR(1) Items:", c);
    return c
  end;

symbolic procedure lalr_closure i;
  begin
    scalar pending, a, rule, tail, done, ff, w;
    pending := i;
    while pending do <<
        ff := car pending;  % [(A -> alpha . B beta), a]
        pending := cdr pending;
        rule := car ff; a := cadr ff; tail := cdr ('!. member rule);
        if tail and not numberp car tail then <<
            ff := lalr_first append(cdr tail, list a);
            for each p in lalr_productions car tail do
                for each b in ff do <<
                    w := list(car tail . '!. . car p, b);
% It might be better to store items as hash tables, since then the
% member-check here would be much faster.
                    if not (w member i) then <<
                        i := w . i;
                        pending := w . pending >> >> >> >>;
    return i
  end;

symbolic procedure lalr_move_dot(z, x);
  begin
    scalar r;
    while not (car z = '!.) do <<
        r := car z . r;
        z := cdr z >>;
    z := cdr z;
    if not (z and car z = x) then return nil;
    z := car z . '!. . cdr z;
    while r do <<
        z := car r . z;
        r := cdr r >>;
    return z
  end;

symbolic procedure lalr_goto(i, x);
  begin
    scalar j, w;
    for each z in i do <<
        w := lalr_move_dot(car z, x);
        if w then j := list(w, cadr z) . j >>;
    return lalr_closure j
  end;

symbolic procedure lalr_cached_goto(i, x);
    cdr assoc(i, gethash(x, goto_cache));

% Next part of Algorithm 4.11 from the Red Dragon

symbolic procedure lalr_remove_duplicates x;
  begin
    scalar r;
    if null x then return nil;
    x := sort(x, function orderp);
    r := list car x;
    x := cdr x;
    while x do <<
        if not (car x = car r) then r := car x . r;
        x := cdr x >>;
    return r
  end;

symbolic procedure lalr_core i;
    lalr_remove_duplicates for each x in car i collect car x;

symbolic procedure lalr_same_core(i1, i2);
    lalr_core i1 = lalr_core i2;

% cc is a list of items, while i is a single item. If cc already contains
% an item with the same core as I then merge i into that, and adjust any
% goto records either out of or into i to refer now to the thing merged
% with.

fluid '(renamings);

symbolic procedure lalr_insert_core(i, cc);
   if null cc then list i
   else if lalr_same_core(i, car cc) then <<
       renamings := (i . cdar cc) . renamings;
       (union(car i, caar cc) . cdar cc) . cdr cc >>
   else car cc . lalr_insert_core(i, cdr cc);

symbolic procedure lalr_rename_gotos();
  begin
    scalar w;
    for each x in non_terminals do <<
        w := sublis(renamings, gethash(x, goto_cache));
        puthash(x, goto_cache, lalr_remove_duplicates w) >>
  end;

% Part of Algorithm 4.10 of the Red Dragon

symbolic procedure lalr_print_actions action_table;
  begin
    scalar w;
    princ "Actions:"; terpri();
    for each x in action_table do
      for each xx in cdr x do <<
        prin car x;  ttab 20;
        lalr_prin_symbol car xx; ttab 40;
        w := cadr xx;
        if eqcar(w, 'reduce) then <<
            princ "reduce ";
            lalr_prin_symbol caadr w;
            princ " ->";
            for each v in cdadr w do << princ " "; lalr_prin_symbol v >>;
            princ " {";
            for each v in caddr w do << princ " "; prin v >>;
            princ " }";
            terpri() >>
        else << prin w; terpri() >> >>
  end;

symbolic procedure lalr_make_actions c;
  begin
    scalar action_table, aa, j, w;
    for each i in c do <<
        aa := nil;
        for each r in car i do <<
            w := cdr ('!. member cdar r);
            if w and numberp car w then <<
                j := lalr_cached_goto(cdr i, car w);
                aa := list(car w, list('shift, j)) . aa >>
            else if null w and not (caar r = 's!') then <<
                w := reverse cdr reverse car r;
                aa :=
                    list(cadr r, list('reduce, w, lalr_action(car w, cdr w))) .
                    aa >>
            else if null w and caar r = 's!' then
                aa := list(0, 'accept) . aa >>;
        action_table := (cdr i . lalr_remove_duplicates aa) . action_table >>;
    action_index := mkvect16 caar action_table;
    action_table := reversip action_table;
    if !*verbose then lalr_print_actions action_table;
    j := 0; w := nil;
    for each x in action_table do <<
        putv16(action_index, car x, j);
        aa := lalr_lay_out_actions cdr x;
        while aa do <<
            w := (0 . 0) . w;
            j := j + 1 >> >>;
    action_terminal := mkvect16 j;
    action_result := mkvect16 j;
    while j > 0 do <<
        j := j - 1;
        putv16(action_terminal, j, caar w);
        putv16(action_result, j, cdar w);
        w := cdr w >>
  end;

symbolic procedure lalr_most_common_dest p;
  begin
    scalar r, w;
    for each x in p do
        if (w := assoc(cdr x, r)) then rplacd(w, cdr w + 1)
        else r := (cdr x . 1) . r;
    w := car r;
    for each x in cdr r do if cdr x > cdr w then w := x;
    return car w
  end;

symbolic procedure lalr_make_gotos();
  begin
    scalar p, r1, w, r;
    p := 0;
    for each x in hashcontents goto_cache do
        if not numberp car x then <<
            if !*verbose then
                for each xx in cdr x do <<
                    prin car xx; ttab 10; lalr_prin_symbol car x;
                    princ " GOTO state "; prin cdr xx; terpri() >>;
            r1 := (get(car x, 'non_terminal_code) . p) . r1;
            if cdr x then <<
                w := lalr_most_common_dest cdr x;
                for each xx in cdr x do if not (cdr xx = w) then <<
                    r := xx . r;
                    p := p + 1 >>;
                r := ((-1) . w) . r;
                p := p + 1 >> >>;
    goto_index := mkvect16 length non_terminals;
    goto_old_state := mkvect16 p;
    goto_new_state := mkvect16 p;
    for each x in r1 do putv16(goto_index, car x, cdr x);
    while p > 0 do <<
        p := p - 1;
        putv16(goto_old_state, p, caar r);
        putv16(goto_new_state, p, cdar r);
        r := cdr r >>;
    princ "goto_index: ";     print goto_index;
    princ "goto_old_state: "; print goto_old_state;
    princ "goto_new_state: "; print goto_new_state
  end;

% A main driver function that performs all the steps involved
% in building parse tables for a given grammar.

symbolic procedure lalr_construct_parser g;
  begin
    scalar c, cc, renamings;
    lalr_set_grammar g;
    c := lalr_items non_terminals;
    renamings := nil;
    for each i in c do cc := lalr_insert_core(i, cc);
    lalr_rename_gotos();
    if !*verbose then lalr_print_items("Merged Items:", cc);
    lalr_make_actions cc;
    lalr_make_gotos();
    lalr_clean_up()
  end;

%=============================================================================

% Now some test cases

on time;

% Here I set up a sample grammar
%    S' -> S
%    S  -> CC      { A1 }
%    C  -> cC      { A2 }
%        | d       { A3 }
% (example 4.42 from Aho, Sethi and Ullman's Red Dragon book, with
%  some dummy semantic actions added. Note that I do not need to insert
% the production S' -> S for myself since the analysis code will
% augment my grammar with it for me anyway.

grammar := '((S  ((C C) A1))
             (C  (("c" C) A2)
                 (("d") A3))
            );

lalr_construct_parser grammar;

% Example 4.46 from the Red Dragon

g4_46 := '((S   ((L "=" R) a1)
                ((R)       a2))
           (L   (("*" R)   a3)
                ((id)      a4))
           (R   ((L)       a5)));

lalr_construct_parser g4_46;


% Now a much more complicated grammar - one that recognizes the syntax of
% RLISP.

rlisp_grammar := '(

(command         ((  cmnd sep ) action)
                 ((  end sep ) action)
                 ((  command cmnd sep ) action)
                 ((  command end sep ) action)
)


(sep             ((  ";" ) action)
                 ((  "$" ) action)
)


(proc_type       ((  symbolic ) action)
                 ((  algebraic ) action)
)


(proc_qual       ((  expr ) action)
                 ((  macro ) action)
                 ((  smacro ) action)
)


(sym_list        ((  ")" ) action)
                 ((  "," symbol sym_list ) action)
)


(infix           ((  setq ) action)
                 ((  or ) action)
                 ((  and ) action)
                 ((  member ) action)
                 ((  memq ) action)
                 ((  "=" ) action)
                 ((  neq ) action)
                 ((  eq ) action)
                 ((  geq ) action)
                 ((  ">" ) action)
                 ((  leq ) action)
                 ((  "<" ) action)
                 ((  freeof ) action)
                 ((  "+" ) action)
                 ((  "-" ) action)
                 ((  "*" ) action)
                 ((  "/" ) action)
                 ((  "^" ) action)
                 ((  "." ) action)
)

(prefix          ((  not ) action)
                 ((  "+" ) action)
                 ((  "-" ) action)
)


(proc_head       ((  symbol ) action)
                 ((  symbol symbol ) action)
                 ((  symbol "(" ")" ) action)
                 ((  symbol "(" symbol sym_list ) action)
                 ((  prefix symbol ) action)
                 ((  symbol infix symbol ) action)
)


(proc_def        ((  procedure proc_head sep cmnd ) action)
                 ((  proc_type procedure proc_head sep cmnd ) action)
                 ((  proc_qual procedure proc_head sep cmnd ) action)
                 ((  proc_type proc_qual procedure proc_head sep cmnd ) action)
)


(rlistat         ((  rlistat ) action)
                 ((  in ) action)
                 ((  on ) action)
)


(rltail          ((  expr ) action)
                 ((  expr "," rltail ) action)
)


(cmnd            ((  expr ) action)
                 ((  rlistat rltail ) action)
)


(if_stmt         ((  if expr then cmnd else cmnd ) action)
                 ((  if expr then cmnd ) action)
)


(for_update      ((  ":" expr ) action)
                 ((  step expr until expr ) action)
)


(for_action      ((  do ) action)
                 ((  sum ) action)
                 ((  collect ) action)
)


(for_inon        ((  in ) action)
                 ((  on ) action)
)


(for_stmt        ((  for symbol setq expr for_update for_action cmnd ) action)
                 ((  for each symbol for_inon expr for_action cmnd ) action)
                 ((  foreach symbol for_inon expr for_action cmnd ) action)
)


(while_stmt      ((  while expr do cmnd ) action)
)


(repeat_stmt     ((  repeat cmnd until expr ) action)
)


(return_stmt     ((  return ) action)
                 ((  return expr ) action)
)


(goto_stmt       ((  goto symbol ) action)
                 ((  go symbol ) action)
                 ((  go to symbol ) action)
)


(group_tail      ((  rsect ) action)
                 ((  sep rsect ) action)
                 ((  sep cmnd group_tail ) action)
)


(group_expr      ((  lsect cmnd group_tail ) action)
)


(scalar_tail     ((  sep ) action)
                 ((  "," symbol scalar_tail ) action)
                 ((  "," integer scalar_tail ) action)
)


(scalar_def      ((  scalar symbol scalar_tail ) action)
                 ((  integer symbol scalar_tail ) action)
)


(scalar_defs     ((  scalar_def ) action)
                 ((  scalar_defs scalar_def ) action)
)


(block_tail      ((  end ) action)
                 ((  cmnd end ) action)
                 ((  symbol ":" block_tail ) action)
                 ((  cmnd sep block_tail ) action)
                 ((  sep block_tail ) action)
)

(block_expr      ((  begin scalar_defs block_tail ) action)
                 ((  begin block_tail ) action)
)


(lambda_vars     ((  sep ) action)
                 ((  "," symbol lambda_vars ) action)
)


(lambda_expr     ((  lambda symbol lambda_vars cmnd ) action)
                 ((  lambda "(" ")" sep cmnd ) action)
                 ((  lambda "(" symbol sym_list sep cmnd ) action)
)


(expr            ((  rx0 ) action)
                 ((  lx0 ) action)
)


(rx0             ((  lx0 where symbol "=" rx1 ) action)
                 ((  rx1 ) action)
)


(lx0             ((  lx0 where symbol "=" lx1 ) action)
                 ((  lx1 ) action)
)


(rx1             ((  lx2 setq rx1 ) action)
                 ((  rx2 ) action)
)


(lx1             ((  lx2 setq lx1 ) action)
                 ((  lx2 ) action)
)


(rx2tail         ((  rx3 ) action)
                 ((  lx3 or rx2tail ) action)
)

(rx2             ((  lx3 or rx2tail ) action)
                 ((  rx3 ) action)
)


(lx2tail         ((  lx3 ) action)
                 ((  lx3 or lx2tail ) action)
)

(lx2             ((  lx3 or lx2tail ) action)
                 ((  lx3 ) action)
)


(rx3tail         ((  rx4 ) action)
                 ((  lx4 and rx3tail ) action)
)

(rx3             ((  lx4 and rx3tail ) action)
                 ((  rx4 ) action)
)


(lx3tail         ((  lx4 ) action)
                 ((  lx4 and lx3tail ) action)
)

(lx3             ((  lx4 and lx3tail ) action)
                 ((  lx4 ) action)
)


(rx4             ((  not rx4 ) action)
                 ((  rx5 ) action)
)


(lx4             ((  not lx4 ) action)
                 ((  lx5 ) action)
)


(rx5             ((  lx6 member ry6 ) action)
                 ((  lx6 memq ry6 ) action)
                 ((  lx6 "=" ry6 ) action)
                 ((  lx6 neq ry6 ) action)
                 ((  lx6 eq ry6 ) action)
                 ((  lx6 geq ry6 ) action)
                 ((  lx6 ">" ry6 ) action)
                 ((  lx6 leq ry6 ) action)
                 ((  lx6 "<" ry6 ) action)
                 ((  lx6 freeof ry6 ) action)
                 ((  rx6 ) action)
)


(lx5             ((  lx6 member ly6 ) action)
                 ((  lx6 memq ly6 ) action)
                 ((  lx6 "=" ly6 ) action)
                 ((  lx6 neq ly6 ) action)
                 ((  lx6 eq ly6 ) action)
                 ((  lx6 geq ly6 ) action)
                 ((  lx6 ">" ly6 ) action)
                 ((  lx6 leq ly6 ) action)
                 ((  lx6 "<" ly6 ) action)
                 ((  lx6 freeof ly6 ) action)
                 ((  lx6 ) action)
)


(ry6             ((  not ry6 ) action)
                 ((  rx6 ) action)
)


(ly6             ((  not ly6 ) action)
                 ((  lx6 ) action)
)


(rx6tail         ((  ry6a ) action)
                 ((  ly6a "+" rx6tail ) action)
)

(rx6             ((  lx6a "+" rx6tail ) action)
                 ((  rx6a ) action)
)


(lx6tail         ((  ly6a ) action)
                 ((  ly6a "+" lx6tail ) action)
)

(lx6             ((  lx6a "+" lx6tail ) action)
                 ((  lx6a ) action)
)


(ry6a            ((  not ry6a ) action)
                 ((  rx6a ) action)
)


(rx6a            ((  lx6a "-" ry7 ) action)
                 ((  rx7 ) action)
)


(ly6a            ((  not ly6a ) action)
                 ((  lx6a ) action)
)


(lx6a            ((  lx6a "-" ly7 ) action)
                 ((  lx7 ) action)
)


(ry7             ((  not ry7 ) action)
                 ((  rx7 ) action)
)


(rx7             ((  "+" ry7 ) action)
                 ((  "-" ry7 ) action)
                 ((  rx8 ) action)
)


(ly7             ((  not ly7 ) action)
                 ((  lx7 ) action)
)


(lx7             ((  "+" ly7 ) action)
                 ((  "-" ly7 ) action)
                 ((  lx8 ) action)
)


(rx8tail         ((  ry9 ) action)
                 ((  ly9 "*" rx8tail ) action)
)

(rx8             ((  lx9 "*" rx8tail ) action)
                 ((  rx9 ) action)
)


(lx8tail         ((  ly9 ) action)
                 ((  ly9 "*" lx8tail ) action)
)

(lx8             ((  lx9 "*" lx8tail ) action)
                 ((  lx9 ) action)
)


(ry9             ((  not ry9 ) action)
                 ((  "+" ry9 ) action)
                 ((  "-" ry9 ) action)
                 ((  rx9 ) action)
)


(rx9             ((  lx9 "/" ry10 ) action)
                 ((  rx10 ) action)
)


(ly9             ((  not ly9 ) action)
                 ((  "+" ly9 ) action)
                 ((  "-" ly9 ) action)
                 ((  lx9 ) action)
)


(lx9             ((  lx9 "/" ly10 ) action)
                 ((  lx10 ) action)
)


(ly10            ((  not ly10 ) action)
                 ((  "+" ly10 ) action)
                 ((  "-" ly10 ) action)
                 ((  lx10 ) action)
)


(lx10            ((  lx11 "^" ly10 ) action)
                 ((  lx11 ) action)
)


(ry10            ((  not ry10 ) action)
                 ((  "+" ry10 ) action)
                 ((  "-" ry10 ) action)
                 ((  rx10 ) action)
)


(rx10            ((  lx11 "^" ry10 ) action)
                 ((  rx11 ) action)
)


(ry11            ((  not ry11 ) action)
                 ((  "+" ry11 ) action)
                 ((  "-" ry11 ) action)
                 ((  rx11 ) action)
)


(rx11            ((  x12 "." ry11 ) action)
                 ((  if_stmt ) action)
                 ((  for_stmt ) action)
                 ((  while_stmt ) action)
                 ((  repeat_stmt ) action)
                 ((  return_stmt ) action)
                 ((  goto_stmt ) action)
                 ((  lambda_expr ) action)
                 ((  proc_type ) action)
                 ((  proc_def ) action)
                 ((  endstat ) action)
)


(ly11            ((  not ly11 ) action)
                 ((  "+" ly11 ) action)
                 ((  "-" ly11 ) action)
                 ((  lx11 ) action)
)


(lx11            ((  x12 "." ly11 ) action)
                 ((  x12 ) action)
)


(arg_list        ((  expr ")" ) action)
                 ((  expr "," arg_list ) action)
)


(x12             ((  x13 "[" expr "]" ) action)
                 ((  x13 "(" ")" ) action)
                 ((  x13 "(" expr "," arg_list ) action)
                 ((  x13 x12 ) action)
                 ((  x13 ) action)
)


(x13             ((  symbol ) action)
                 ((  number ) action)
                 ((  string ) action)
                 ((  quoted ) action)
                 ((  backquoted ) action)
                 ((  group_expr ) action)
                 ((  block_expr ) action)
                 ((  "(" expr ")" ) action)
)
)$


% lalr_construct_parser rlisp_grammar;


end;



REDUCE Historical
REDUCE Sourceforge Project | Historical SVN Repository | GitHub Mirror | SourceHut Mirror | NotABug Mirror | Chisel Mirror | Chisel RSS ]