Artifact 82b0c25669ca364aa8a770c734e307b36aeafd4598f15338206d582d4532d04a:
- File
r36/lalr.red
— part of check-in
[152fb3bdbb]
at
2011-10-17 17:58:33
on branch master
— svn:eol-style, svn:executable and line endings for files
in historical/r36 treegit-svn-id: https://svn.code.sf.net/p/reduce-algebra/code/trunk/historical@1480 2bfe0521-f11c-4a00-b80e-6202646ff360 (user: schoepf@users.sourceforge.net, size: 50504) [annotate] [blame] [check-ins using] [more...]
symbolic; spool "lalr.log"; on comp, backtrace; unglobal '(!*raise); fluid '(!*raise); %============================================================================= % The file provides parser support for Lisp. The file contains three % major sections. The first is a lexical analyser, which has been % coded to support RLISP and Standard Lisp. Its interface is modelled after % the one used with the lex/yacc combination commonly used with Unix. % % The second section is the generic part of an LR parser. It requires % tables to tell it what actions to take, and calls the lexical analyser to % obtain tokens. % % The final part is an LALR(1) parser generator, which can take the % specification of a grammar and construct tables that direct the % generic parser skeleton. % %============================================================================= % % This is a lexical anaylser for use with RLISP. Its interface is % styles after the one needed by yacc, in that it exports a function % called yylex() that returns as a value a numeric category code, but % sets a variable yylval to hold further information about the token % just parsed. Single character objects are coded as their (ASCII?) code % [leaving this code non-portable to machines with other encodings?]. % Other things must have been given 'lex_code properties indicate the % associated category code. This lexer handles ' and ` as special prefix % characters that introduce Lisp-stype s-expressions. and it knows about % RLISP-style comments and a few diphthongs. It also supports some % simple preprocessor directives. % % Arthur Norman. April 1995 fluid '(!*raise !*lower !*echo); global '(lex_char yylval last64 last64p which_line); % I keep a circular buffer with the last 64 characters that have been % read. Initially the buffer contains NILs rather than characters, so I can % tell when it is only partially filled. smacro procedure yyreadch(); << last64p := last64p + 1; if last64p = 64 then last64p := 0; lex_char := readch(); if lex_char = !$eol!$ then which_line := which_line + 1; putv(last64, last64p, lex_char); lex_char >>; symbolic procedure yyerror msg; begin scalar c; terpri(); princ "+++++ Parse error at line "; prin which_line; princ ":"; if atom msg then msg := list msg; for each s in msg do << princ " "; princ s >>; terpri(); for i := 1:64 do << last64p := last64p + 1; if last64p = 64 then last64p := 0; c := getv(last64, last64p); if c = !$eof!$ then princ "<EOF>" else if not (c = nil) then princ c >>; if not (c = !$eol!$) then terpri() end; % Before a succession of calls to yylex() it is necessary to % ensure that lex_char is set suitably and that the circular buffer % used to store characters for error messages is ready for use. symbolic procedure start_parser(); << last64 := mkvect 64; last64p := 0; which_line := 1; yyreadch() >>; % % The following version of YYLEX provides RLISP with a facility for % conditional compilation. The protocol is that text is included or % excluded at the level of tokens. Control by use of new reserved % tokens !#if, !#else and !#endif. These are used in the form: % !#if (some Lisp expression for use as a condition) % ... RLISP input ... % !#else % ... alternative RLISP input ... % !#endif % % The form % !#if C1 ... !#elif C2 ... !#elif C3 ... !#else ... !#endif % is also supported. % % Conditional compilation can be nested. If the Lisp expression used to % guard a condition causes an error it is taken to be a FALSE condition. % It is not necessary to have an !#else before !#endif if no alternative % text is needed. Although the examples here put !#if etc at the start of % lines this is not necessary (though it may count as good style?). Since % the condtion will be read using RLISPs own list-reader there could be % condtional compilation guarding parts of it - the exploitation of that % possibility is to be discouraged! % % Making the condition a raw Lisp expression makes sure that parsing it % is easy. It makes it possible to express arbitrary conditions, but it is % hoped that most conditions will not be very elaborate - things like % !#if (not (member 'csl lispsystem!*)) % error(); % !#else % magic(); % !#endif % or % !#if debugging_mode % NB if variable is unset that counts as nil % print "message"; % so care should be taken to select the most % !#endif % useful default sense for such tests % should be about as complicated as reasonable people need. % % Two further facilities are provided: % !#eval (any lisp expression) % causes that expression to be evaluated at parse time. Apart from any % side-effects in the evaluation the text involved is all ignored. It is % expected that this will only be needed in rather curious cases, for instance % to set system-specific options for a compiler. % % !#define symbol value % where the value should be another symbol, a string or a number, causes % the first symbol to be mapped onto the second value wherever it occurs in % subsequent input. No special facility for undoing the effect of a % !#define is provided, but the general-purpose !#eval could be used to % remove the '!#define property that is involved. % % NOTE: The special symbols !#if etc are NOT recognised within Lisp % quoted expressions, so test like the following will be % ineffective: % a := '( % P % !#if q_is_wanted % Q % !#endif % Q); % but on the other hand code like % if sym = '!#if then ... % behaves the way that had probably been wanted. Unlike the C % preprocessor, this system recognizes directives within rather than % just at the start of lines. symbolic procedure yylex(); begin scalar w, done; % I take a rather robust view here - words that are intended to be used as % keywords may not be written with included escape characters. Thus for % instance this lexer will view "be!gin" or "!begin" as being a simple % symbol and NOT being the keyword "begin". w := lex_basic_token(); % The "while not done" loop is so that I can restart the scan after seeing % a pre-processor directive such as !#if. while not done do << % The word "COMMENT" introduces a comment that terminates at the next ";" % or "$". while yylval = 'comment and w = '!:symbol_or_keyword do << while not (lex_char = '!; or lex_char = '!$) do yyreadch(); yyreadch(); w := lex_basic_token() >>; % If a word was spelt out directly (without any escape characters in it) it % may be a keyword - if it is, then convert it here. if w = '!:symbol_or_keyword then << if w := get(yylval, '!#define) then << yylval := cdr w; w := car w >> else << if done := get(yylval, 'lex_code) then w := done else if flagp(yylval, 'rlis) then w := get('rlistat, 'lex_code) else if flagp(yylval, 'endstat) then w := get('endstat, 'lex_code) else w := get('!:symbol, 'lex_code); done := t >> >> % A word with escapes in might be a pre-processor directive. else if w = '!:symbol then << if yylval eq '!#if then << read_s_expression(); w := lex_conditional yylval >> else if yylval eq '!#else or yylval eq '!#elif then << yylval := nil; w := lex_skipping(w, nil) >> else if yylval eq '!#endif then w := lex_basic_token() else if yylval eq '!#eval then << read_s_expression(); errorset(yylval, nil, nil); w := lex_basic_token() >> else if yylval eq '!#define then << read_s_expression(); w := yylval; % Ought to be a symbol done := read_s_expression(); if idp w then put(w, '!#define, done . yylval); w := lex_basic_token(); done := nil >> else << w := get('!:symbol, 'lex_code); done := t >> >> else if numberp w then << % Now gobble up extra characters for multi-character operators (eg ">="). % Note that I only look one character ahead here. while done := atsoc(lex_char, get(yylval, 'lex_dipthong)) do << w := cdr done; yylval := cdr w; w := get(car w, 'lex_code); yyreadch() >>; if done := get(yylval, '!#define) then << yylval := cdr done; w := car done; done := nil >> else done := t >> else << done := t; w := get(w, 'lex_code) >> >>; return w end; % If, when reading ordinary text, I come across the token !#if I read % the expression following. If that evaluates to TRUE I just keep on % on reading. So the sequence "!#if t" is in effect ignored. Then % if later on I just ignore an "!#endif" all will be well. If on the other % hand the expression evaluates to NIL (or if evaluation fails), I will % call lex_skipping() to discard more tokens (up to and including % the next "!#else", "!#elif t" or "!endif"). symbolic procedure lex_conditional x; begin scalar w; w := lex_basic_token(); x := errorset(x, nil, nil); if errorp x or null car x then return lex_skipping(w, nil) else return w end; % I call lex_skipping when I find "!#if nil" or "!#else" or "!#elif" % that is processed. When a top-level "!#else" or "!#elif" is found it % is discarded before calling lex_skipping, since it must follow a % successful "!#if" and hence introduce material to be thrown away. symbolic procedure lex_skipping(w, x); begin scalar done; % In this code x keep track of the depth of testing of "!#if" constructions while not done do << if w = 0 then done := t % End of file else << if w = '!:symbol then << if yylval = '!#endif then << if null x then done := t else x := cdr x >> else if yylval = '!#if then x := nil . x else if yylval = '!#else and null x then done := t else if yylval = '!#elif and null x then << read_s_expression(); done := errorset(yylval, nil, nil); if errorp done or null car done then done := nil >> >>; w := lex_basic_token() >> >>; return w end; % In some cases RLISP operators are made up out of two (or more) characters. % I map '**' onto '^', and >=, <= onto GEQ and LEQ. % ":=" becomes SETQ. I turn << and >> onto symbols that can not be % read directly (!:lsect and !:rsect). % This means that the system that sets up lex_code properties had really % better make sure that it gives setq, geq, leq, !:rsect and !:lsect values. put('!*, 'lex_dipthong, '((!* !^ . !^))); put('!:, 'lex_dipthong, '((!= setq . setq))); put('!>, 'lex_dipthong, '((!= geq . geq), (!> !:rsect . !:rsect))); put('!<, 'lex_dipthong, '((!= leq . leq), (!< !:lsect . !:lsect))); put('!^, 'lex_code, char!-code '!^); % lex_basic_token() will read the next token from the current input stream and % leave a value in yylval to show what was found. It does not handle the % word "comment", nor does it consolidate things like ':' followed by '=' into % ':='. Those steps are left to yylex(). But lex_basic_token() does recognize % the quote prefix, as in '(lisp expression). The return value is numeric % for single-character tokens, but otherwise a descriptive symbol. % Some people would consider the Lisp dialect that I am using here to be % significantly flawed, in that I need to build symbols, numbers and % strings up as lists, and then use COMPRESS to make the real objects. The % CONS operations involved can be seen as an overhead, and going back to % something like the VERY old-fashioned clearbuff/pack/boffo world might % avoid that. symbolic procedure lex_basic_token(); begin scalar r, w; % First skip over whitespace. Note that at some stage in the future RLISP % may want to make newlines significant and partially equivalent to % semicolons, but that is not supported at present. while lex_char = '! or lex_char = !$eol!$ or (lex_char = '!% and << while not (lex_char = !$eol!$ or lex_char = !$eof!$) do yyreadch(); t >>) do yyreadch(); % Symbols start with a letter or an escaped character and continue with % letters, digits, underscores and escapes. if liter lex_char or (lex_char = '!! and begin scalar !*raise, !*lower; % Rebind !*raise & !*lower to avoid.. r := lex_char . r; % case folding when the next character.. yyreadch(); % is read. return (w := t) end) then << r := lex_char. r; while liter(yyreadch()) or digit lex_char or lex_char = '!_ or (lex_char = '!! and begin scalar !*raise, !*lower; r := lex_char . r; yyreadch(); return (w := t) end) do r := lex_char . r; % If there was a '!' in the word I will never treat it as a keyword. yylval := compress reversip r; return if w then '!:symbol else '!:symbol_or_keyword >> % Numbers are either integers or floats. A floating point number is % indicated by either a point "." or an exponent marker "e". In the code % here I keep a flag (in w) to indicate if I had a floating or integer % value, but in the end I ignore this and hand back the lexical category % :number in both cases. else if digit lex_char then << r := list lex_char; while digit (yyreadch()) do r := lex_char . r; if lex_char = '!. then << w := t; % Flag to indicate floating point r := lex_char . r; while digit (yyreadch()) do r := lex_char . r >>; % I permit the user to write the exponent marker in either case. if lex_char = '!e or lex_char = '!E then << % If the input as 1234E56 I expand it as 1234.0E56 if not w then r := '!0 . '!. . r; w := t; r := '!e . r; yyreadch(); if lex_char = '!+ or lex_char = '!- then << r := lex_char . r; yyreadch() >>; % If there is no digit written after "E" I insert a zero. Thus overall the % input 17E gets treated as 17.0E0 if not digit lex_char then r := '!0 . r else << r := lex_char . r; while digit (yyreadch()) do r := lex_char . r >> >>; yylval := compress reversip r; return '!:number >> % Strings are enclosed in double-quotes, and "abc""def" is a string with % a double-quote mark within it. Note no case folding on characters in a % string. else if lex_char = '!" then << begin scalar !*raise, !*lower; repeat << r := lex_char . r; while not ((yyreadch()) = '!") do r := lex_char . r; r := lex_char . r; yyreadch() >> until not (lex_char = '!"); end; yylval := compress reversip r; return '!:string >> % "'" and "`" introduce Lisp syntax S-expressions else if lex_char = '!' then << yyreadch(); read_s_expression(); yylval := list('quote, yylval); return '!:list >> else if lex_char = '!` then << yyreadch(); read_s_expression(); yylval := list('backquote, yylval); return '!:list >> else << yylval := lex_char; % I take special notice of end of file, since it is fairly drastic. % In particular I do not attempt to advance lex_char beyond it. So I do % TWO things here: I avoid advancing the input, and I return the code 0 % as an end-of-file indication. if yylval = !$eof!$ then return 0 else << yyreadch(); return char!-code yylval >> >> end; % % I use a hand-written recursive descent parser for Lisp S-expressions % mainly because the syntax involved is so VERY simple. A rough sketch of % the syntax required is given here, but in reality (in part because I do % not want to have to report syntax errors) I implement a more liberal % syntax, especially as it relates to dotted-pair notation. This of course % is one of the natural dangers in using recursive descent... the syntax % actually parsed is only properly defined by direct reference to the code. % % s_tail = ")" | % "." s_expr ")" | % s_expr s_tail; % % s_vectail = "]" | % s_expr s_vectail; % % s_expr = symbol | % number | % string | % "(" s_tail | % "[" s_vectail | % "'" s_expr | % "`" s_expr | % "," s_expr | % ",@" s_expr; global '(dot_char rpar_char rsquare_char); dot_char := char!-code '!.; rpar_char := char!-code '!); rsquare_char := char!-code '!]; symbolic procedure read_s_expression(); << % At the start of an S-expression I want to check for the characters % "(", "[" and ",". Thus I need to skip whitespace. while lex_char = '! or lex_char = '!$eol!$ do yyreadch(); if lex_char = '!( then begin scalar r, w, w1; yyreadch(); w := read_s_expression(); while not (w = rpar_char or w = dot_char or w = 0) do << r := yylval . r; % Note that at the end of the list read_s_expression() will read the ")" % as a token. w := read_s_expression() >>; if not w = dot_char then yylval := reversip r else << read_s_expression(); % Thing after the "." w := yylval; % Reverse the list putting a dotted item on the end. while r do << w1 := cdr r; rplacd(r, w); w := r; r := w1 >>; yylval := w; % I will be somewhat liberal about syntactic problems with dotted pair % notation, since it is unclear how I can usefully report or repair errors. while lex_char = '! or lex_char = '!$eol!$ do yyreadch(); % When I find a ")" I do not read beyond it immediately, but reset lex_char % to whitespace. This may help prevent unwanted hangups in interactive use. if lex_char = '!) then lex_char := '! >>; return '!:list end % "[" introduces a simple vector. else if lex_char = '![ then begin scalar r, w, w1; yyreadch(); w := read_s_expression(); w1 := -1; while not (w = rsquare_char or w = 0) do << r := yylval . r; w1 := w1 + 1; w := read_s_expression() >>; % Create a vector of the correct size and copy information into it. w := mkvect w1; r := reversip r; w1 := 0; while r do << putv(w, w1, car r); w1 := w1 + 1; r := cdr r >>; yylval := w; return '!:list end % I spot "," and ",@" here, and should wonder if I should (a) police that % they are only expected to make sense within the scope of a "`" and (b) % whether I ought to expand them in terms of LIST, CONS, APPEND etc here. % For now I just hand back markers that show where they occured. else if lex_char = '!, then << yyreadch(); if lex_char = '!@ then << yyreadch(); read_s_expression(); yylval := list('!,!@, yylval) >> else << read_s_expresssion(); yylval := list('!,, yylval) >>; 'list >> % Care with ")" and "]" not to read ahead further than is essential. else if lex_char = '!) or lex_char = '!] or lex_char = '!. then << yylval := lex_char; lex_char := '! ; char!-code yylval >> % In most cases (including "'" and "`") I just hand down to read a token. % This covers the cases of symbols, numbers and strings. else lex_basic_token() >>; %============================================================================= % Here I have a general-purpose LR(1) parser skeleton. This needs to % have a source of tokens, and some tables that will direct its actions. % The format of the tables required by this code will be a little curious, % mainly because they represent some attempt to compact the information that % is needed. Note that the CSL functions mkvect16, putv16, getv16 are % somewhat similar to the regular mkvect, putv and getv, but may be used % if the vector contents will always be 16-bit fixnums. global '(!*verbose); !*verbose := t; % How much will the parset-generator print? global '(goto_index goto_old_state goto_new_state); % For each terminal I have a pointer (stored in goto_index) into % a pair of vectors, goto_old_state and goto_new_state. The first of these % holds states that I might be in, and the second holds the ones I must % move into after a reduction has been performed. In the goto_old_state % table the value "-1" is taken to match any state that I am attempting % to look up. Thus it can be used to terminate a segment of the table. I am % entitled to let undefined locations in the goto table respond with % any value that just happens. smacro procedure get_goto(state, non_terminal); << w1 := getv16(goto_index, non_terminal); while not (((w2 := getv16(goto_old_state, w1)) = -1) or w2 = state) do w1 := w1 + 1; getv16(goto_new_state, w1) >>; global '(action_index, action_terminal action_result); % In a rather similar way, actions are found via an association-list % like look-up in a table. In this table a strictly positive number n stands % for (SHIFT n) [observe that if I reserve zero as the number of the % initial state of my augmented grammar I can never need to shift into % state 0]. The value 0 in the table represents ACCEPT. This leaves % nagative values to cover reductions and error cases. smacro procedure get_action(state, terminal); << w1 := getv16(action_index, state); while not (((w2 := getv16(action_terminal, w1)) = -1) or w2 = terminal) do w1 := w1 + 1; getv(action_result, w1) >>; global '(action_first_error action_error_messages); global '(action_fn action_A action_n); symbolic procedure yyparse(); begin scalar sym_stack, state_stack, next_input, w, w1, w2; state_stack := list 0; % Note that state 0 must be the initial one. start_parser(); next_input := yylex(); while not (w := get_action(car state_stack, next_input)) = 0 do if w > 0 then << sym_stack := next_input . sym_stack; state_stack := cadr w . state_stack; next_input := yylex() >> else begin scalar A, n, action; w := - (w + 1); if w < action_first_error then << action := getv(action_fn, w); n := getv8(action_n, w); A := getv16(action_A, w); % I am now reducing by "A -> beta { action() }" where beta has n items w := nil; for i := 1:n do << w := car sym_stack . w; sym_stack := cdr sym_stack; state_stack := cdr state_stack >>; w := reversip w; if action then w := apply1(action, w) else w := A . w; sym_stack := w . sym_stack; state_stack := get_goto(car state_stack, A) >> else << w := w - action_first_error; yyerror getv(action_error_messages, w); % The next activity must result in the loop ending... state_stack := list 0; sym_stack := '(error); next_input := 0 >> end; return car sym_stack end; %============================================================================= % % A grammar is represented as a list of rules. A rule % sym : x y z { p q r } % | x y z { p q r } % ; % maps onto the list % (sym ((x y z) p q r) % ((x y z) p q r)) % and items on the right hand side can be symbols or strings. Strings % stand for terminals. Symbols that are mentioned on a left hand side of % a production are non-terminals, others are considered to be terminals % supported by the lexer. % % ***** I am still working out what to do with the "semantic actions". global '(terminals non_terminals symbols goto_cache action_map); smacro procedure lalr_productions x; get(x, 'produces); smacro procedure lalr_set_productions(x, y); put(x, 'produces, y); symbolic procedure lalr_prin_symbol x; if x = 0 then princ "$" else if x = nil then princ "<empty>" else if x = '!. then princ "." else if numberp x and rassoc(x, terminals) then prin car rassoc(x, terminals) else if stringp x then prin x else for each c in explode2uc x do princ c; symbolic procedure lalr_display_symbols(); begin princ "Terminal symbols are:"; terpri(); for each x in terminals do << princ " "; prin car x; princ ":"; prin cdr x >>; terpri(); princ "Non-terminal symbols are:"; terpri(); for each x in non_terminals do begin scalar w; princ "["; prin get(x, 'non_terminal_code); princ "]"; lalr_prin_symbol x; w := ":"; for each y in lalr_productions x do << ttab 20; princ w; w := "|"; for each z in car y do << princ " "; lalr_prin_symbol z >>; if posn() > 48 then terpri(); ttab 48; princ "{"; for each z in cdr y do << princ " "; prin z >>; princ " }"; terpri() >>; ttab 20; princ ";"; terpri() end; terpri(); end; symbolic procedure lalr_print_action_map(); begin princ "Action map:"; terpri(); for each x in action_map do << prin cdr x; princ ":"; ttab 12; prin car x; terpri() >> end; symbolic procedure lalr_set_grammar g; begin scalar name, vals, tnum, w; terminals := non_terminals := symbols := nil; tnum := 0; % I will start by augmenting the grammar with an initial production... g := list('s!', list list caar g) . g; for each x in g do << name := car x; vals := cdr x; if name member non_terminals then vals := append(vals, lalr_productions name) else non_terminals := name . non_terminals; for each vv in cdr x do for each v in car vv do << if stringp v or numberp v then << if not assoc(v, terminals) then terminals := (v . (tnum := tnum+1)) . terminals >> else if not (v member symbols) then symbols := v . symbols >>; lalr_set_productions(name, vals) >>; for each name in non_terminals do symbols := delete(name, symbols); for each v in symbols do terminals := (v . (tnum := tnum+1)) . terminals; % I reverse the list of non-terminals here so that the starting symbol % remains as the first item. non_terminals := reversip non_terminals; tnum := -1; for each v in non_terminals do put(v, 'non_terminal_code, tnum := tnum+1); symbols := append(non_terminals, for each x in terminals collect cdr x); goto_cache := mkhash(length non_terminals, 1, 1.5); if !*verbose then lalr_display_symbols(); % Map all terminals onto numeric codes. for each x in non_terminals do lalr_set_productions(x, for each y in lalr_productions x collect sublis(terminals, car y) . cdr y); % Map all actions onto numeric codes, such that identical actions all have the % same code action_map := nil; tnum := -1; for each x in non_terminals do for each a in lalr_productions x do << w := assoc(cdr a, action_map); if null w then << w := cdr a . (tnum := tnum + 1); action_map := w . action_map >>; rplacd(a, list cdr w) >>; action_map := reversip action_map; if !*verbose then lalr_print_action_map(); lalr_calculate_first non_terminals; end; symbolic procedure lalr_clean_up(); begin for each x in terminals do << remprop(x, 'produces); remprop(x, 'lalr_first); remprop(x, 'non_terminal_code) >>; terminals := non_terminals := symbols := nil; goto_cache := action_map := nil end; symbolic procedure lalr_action(lhs, rhs); cdr assoc(rhs, lalr_productions lhs); symbolic procedure lalr_print_firsts g; begin princ "FIRST sets for each non-terminal:"; terpri(); for each x in g do << lalr_prin_symbol x; princ ": "; ttab 15; for each y in get(x, 'lalr_first) do << princ " "; lalr_prin_symbol y >>; terpri() >> end; symbolic procedure lalr_calculate_first g; begin scalar w, y, z, done; for each x in g do if assoc(nil, lalr_productions x) then put(x, 'lalr_first, '(nil)); repeat << done := nil; for each x in g do << z := get(x, 'lalr_first); for each y1 in lalr_productions x do << y := car y1; while y and not numberp y and (nil member (w := get(car y, 'lalr_first))) do << z := union(w, z); y := cdr y >>; if null y then nil else if numberp car y then z := union(list car y, z) else z := union(get(car y, 'lalr_first), z) >>; if not (z = get(x, 'lalr_first)) then done := t; put(x, 'lalr_first, z) >> >> until not done; if !*verbose then lalr_print_firsts g; return nil end; symbolic procedure lalr_first l; begin scalar r, w; while l and not numberp car l and (nil member (w := get(car l, 'lalr_first))) do << r := union(delete(nil, w), r); l := cdr l >>; if null l then r := nil . r else if numberp car l then r := union(list car l, r) else r := union(w, r); return r end; % The next few procedures are as documented in Figure 4.38 of Red Dragon symbolic procedure lalr_print_items(heading, cc); begin princ heading; terpri(); for each y in cc do << princ "Item number "; prin cdr y; terpri(); for each x in sort(car y, function orderp) do << lalr_prin_symbol caar x; princ " ->"; for each y in cdar x do << princ " "; lalr_prin_symbol y >>; princ " : "; lalr_prin_symbol cadr x; terpri() >>; for each x in hashcontents goto_cache do for each xx in cdr x do if car xx = cdr y then << ttab 10; lalr_prin_symbol car x; princ " GOTO state "; prin cdr xx; terpri() >> >> end; symbolic procedure lalr_items g; begin scalar c, val, done, w, w1, w2, n; val := lalr_productions 's!'; if cdr val then error(0, "Starting state must only reduce to one thing") else val := caar val; n := 0; c := list (lalr_closure list list(('s!' . '!. . val), 0) . n); repeat << done := nil; for each i in c do for each x in symbols do if w := lalr_goto(car i, x) then << w1 := assoc(w, c); if w1 then << w2 := gethash(x, goto_cache); if not assoc(cdr i, w2) then puthash(x, goto_cache, (cdr i . cdr w1) . w2) >> else << c := (w . (n := n + 1)) . c; puthash(x, goto_cache, (cdr i . n) . gethash(x, goto_cache)); done := t >> >> >> until not done; c := reversip c; % So that item numbers come out in nicer order. if !*verbose then lalr_print_items("LR(1) Items:", c); return c end; symbolic procedure lalr_closure i; begin scalar pending, a, rule, tail, done, ff, w; pending := i; while pending do << ff := car pending; % [(A -> alpha . B beta), a] pending := cdr pending; rule := car ff; a := cadr ff; tail := cdr ('!. member rule); if tail and not numberp car tail then << ff := lalr_first append(cdr tail, list a); for each p in lalr_productions car tail do for each b in ff do << w := list(car tail . '!. . car p, b); % It might be better to store items as hash tables, since then the % member-check here would be much faster. if not (w member i) then << i := w . i; pending := w . pending >> >> >> >>; return i end; symbolic procedure lalr_move_dot(z, x); begin scalar r; while not (car z = '!.) do << r := car z . r; z := cdr z >>; z := cdr z; if not (z and car z = x) then return nil; z := car z . '!. . cdr z; while r do << z := car r . z; r := cdr r >>; return z end; symbolic procedure lalr_goto(i, x); begin scalar j, w; for each z in i do << w := lalr_move_dot(car z, x); if w then j := list(w, cadr z) . j >>; return lalr_closure j end; symbolic procedure lalr_cached_goto(i, x); cdr assoc(i, gethash(x, goto_cache)); % Next part of Algorithm 4.11 from the Red Dragon symbolic procedure lalr_remove_duplicates x; begin scalar r; if null x then return nil; x := sort(x, function orderp); r := list car x; x := cdr x; while x do << if not (car x = car r) then r := car x . r; x := cdr x >>; return r end; symbolic procedure lalr_core i; lalr_remove_duplicates for each x in car i collect car x; symbolic procedure lalr_same_core(i1, i2); lalr_core i1 = lalr_core i2; % cc is a list of items, while i is a single item. If cc already contains % an item with the same core as I then merge i into that, and adjust any % goto records either out of or into i to refer now to the thing merged % with. fluid '(renamings); symbolic procedure lalr_insert_core(i, cc); if null cc then list i else if lalr_same_core(i, car cc) then << renamings := (i . cdar cc) . renamings; (union(car i, caar cc) . cdar cc) . cdr cc >> else car cc . lalr_insert_core(i, cdr cc); symbolic procedure lalr_rename_gotos(); begin scalar w; for each x in non_terminals do << w := sublis(renamings, gethash(x, goto_cache)); puthash(x, goto_cache, lalr_remove_duplicates w) >> end; % Part of Algorithm 4.10 of the Red Dragon symbolic procedure lalr_print_actions action_table; begin scalar w; princ "Actions:"; terpri(); for each x in action_table do for each xx in cdr x do << prin car x; ttab 20; lalr_prin_symbol car xx; ttab 40; w := cadr xx; if eqcar(w, 'reduce) then << princ "reduce "; lalr_prin_symbol caadr w; princ " ->"; for each v in cdadr w do << princ " "; lalr_prin_symbol v >>; princ " {"; for each v in caddr w do << princ " "; prin v >>; princ " }"; terpri() >> else << prin w; terpri() >> >> end; symbolic procedure lalr_make_actions c; begin scalar action_table, aa, j, w; for each i in c do << aa := nil; for each r in car i do << w := cdr ('!. member cdar r); if w and numberp car w then << j := lalr_cached_goto(cdr i, car w); aa := list(car w, list('shift, j)) . aa >> else if null w and not (caar r = 's!') then << w := reverse cdr reverse car r; aa := list(cadr r, list('reduce, w, lalr_action(car w, cdr w))) . aa >> else if null w and caar r = 's!' then aa := list(0, 'accept) . aa >>; action_table := (cdr i . lalr_remove_duplicates aa) . action_table >>; action_index := mkvect16 caar action_table; action_table := reversip action_table; if !*verbose then lalr_print_actions action_table; j := 0; w := nil; for each x in action_table do << putv16(action_index, car x, j); aa := lalr_lay_out_actions cdr x; while aa do << w := (0 . 0) . w; j := j + 1 >> >>; action_terminal := mkvect16 j; action_result := mkvect16 j; while j > 0 do << j := j - 1; putv16(action_terminal, j, caar w); putv16(action_result, j, cdar w); w := cdr w >> end; symbolic procedure lalr_most_common_dest p; begin scalar r, w; for each x in p do if (w := assoc(cdr x, r)) then rplacd(w, cdr w + 1) else r := (cdr x . 1) . r; w := car r; for each x in cdr r do if cdr x > cdr w then w := x; return car w end; symbolic procedure lalr_make_gotos(); begin scalar p, r1, w, r; p := 0; for each x in hashcontents goto_cache do if not numberp car x then << if !*verbose then for each xx in cdr x do << prin car xx; ttab 10; lalr_prin_symbol car x; princ " GOTO state "; prin cdr xx; terpri() >>; r1 := (get(car x, 'non_terminal_code) . p) . r1; if cdr x then << w := lalr_most_common_dest cdr x; for each xx in cdr x do if not (cdr xx = w) then << r := xx . r; p := p + 1 >>; r := ((-1) . w) . r; p := p + 1 >> >>; goto_index := mkvect16 length non_terminals; goto_old_state := mkvect16 p; goto_new_state := mkvect16 p; for each x in r1 do putv16(goto_index, car x, cdr x); while p > 0 do << p := p - 1; putv16(goto_old_state, p, caar r); putv16(goto_new_state, p, cdar r); r := cdr r >>; princ "goto_index: "; print goto_index; princ "goto_old_state: "; print goto_old_state; princ "goto_new_state: "; print goto_new_state end; % A main driver function that performs all the steps involved % in building parse tables for a given grammar. symbolic procedure lalr_construct_parser g; begin scalar c, cc, renamings; lalr_set_grammar g; c := lalr_items non_terminals; renamings := nil; for each i in c do cc := lalr_insert_core(i, cc); lalr_rename_gotos(); if !*verbose then lalr_print_items("Merged Items:", cc); lalr_make_actions cc; lalr_make_gotos(); lalr_clean_up() end; %============================================================================= % Now some test cases on time; % Here I set up a sample grammar % S' -> S % S -> CC { A1 } % C -> cC { A2 } % | d { A3 } % (example 4.42 from Aho, Sethi and Ullman's Red Dragon book, with % some dummy semantic actions added. Note that I do not need to insert % the production S' -> S for myself since the analysis code will % augment my grammar with it for me anyway. grammar := '((S ((C C) A1)) (C (("c" C) A2) (("d") A3)) ); lalr_construct_parser grammar; % Example 4.46 from the Red Dragon g4_46 := '((S ((L "=" R) a1) ((R) a2)) (L (("*" R) a3) ((id) a4)) (R ((L) a5))); lalr_construct_parser g4_46; % Now a much more complicated grammar - one that recognizes the syntax of % RLISP. rlisp_grammar := '( (command (( cmnd sep ) action) (( end sep ) action) (( command cmnd sep ) action) (( command end sep ) action) ) (sep (( ";" ) action) (( "$" ) action) ) (proc_type (( symbolic ) action) (( algebraic ) action) ) (proc_qual (( expr ) action) (( macro ) action) (( smacro ) action) ) (sym_list (( ")" ) action) (( "," symbol sym_list ) action) ) (infix (( setq ) action) (( or ) action) (( and ) action) (( member ) action) (( memq ) action) (( "=" ) action) (( neq ) action) (( eq ) action) (( geq ) action) (( ">" ) action) (( leq ) action) (( "<" ) action) (( freeof ) action) (( "+" ) action) (( "-" ) action) (( "*" ) action) (( "/" ) action) (( "^" ) action) (( "." ) action) ) (prefix (( not ) action) (( "+" ) action) (( "-" ) action) ) (proc_head (( symbol ) action) (( symbol symbol ) action) (( symbol "(" ")" ) action) (( symbol "(" symbol sym_list ) action) (( prefix symbol ) action) (( symbol infix symbol ) action) ) (proc_def (( procedure proc_head sep cmnd ) action) (( proc_type procedure proc_head sep cmnd ) action) (( proc_qual procedure proc_head sep cmnd ) action) (( proc_type proc_qual procedure proc_head sep cmnd ) action) ) (rlistat (( rlistat ) action) (( in ) action) (( on ) action) ) (rltail (( expr ) action) (( expr "," rltail ) action) ) (cmnd (( expr ) action) (( rlistat rltail ) action) ) (if_stmt (( if expr then cmnd else cmnd ) action) (( if expr then cmnd ) action) ) (for_update (( ":" expr ) action) (( step expr until expr ) action) ) (for_action (( do ) action) (( sum ) action) (( collect ) action) ) (for_inon (( in ) action) (( on ) action) ) (for_stmt (( for symbol setq expr for_update for_action cmnd ) action) (( for each symbol for_inon expr for_action cmnd ) action) (( foreach symbol for_inon expr for_action cmnd ) action) ) (while_stmt (( while expr do cmnd ) action) ) (repeat_stmt (( repeat cmnd until expr ) action) ) (return_stmt (( return ) action) (( return expr ) action) ) (goto_stmt (( goto symbol ) action) (( go symbol ) action) (( go to symbol ) action) ) (group_tail (( rsect ) action) (( sep rsect ) action) (( sep cmnd group_tail ) action) ) (group_expr (( lsect cmnd group_tail ) action) ) (scalar_tail (( sep ) action) (( "," symbol scalar_tail ) action) (( "," integer scalar_tail ) action) ) (scalar_def (( scalar symbol scalar_tail ) action) (( integer symbol scalar_tail ) action) ) (scalar_defs (( scalar_def ) action) (( scalar_defs scalar_def ) action) ) (block_tail (( end ) action) (( cmnd end ) action) (( symbol ":" block_tail ) action) (( cmnd sep block_tail ) action) (( sep block_tail ) action) ) (block_expr (( begin scalar_defs block_tail ) action) (( begin block_tail ) action) ) (lambda_vars (( sep ) action) (( "," symbol lambda_vars ) action) ) (lambda_expr (( lambda symbol lambda_vars cmnd ) action) (( lambda "(" ")" sep cmnd ) action) (( lambda "(" symbol sym_list sep cmnd ) action) ) (expr (( rx0 ) action) (( lx0 ) action) ) (rx0 (( lx0 where symbol "=" rx1 ) action) (( rx1 ) action) ) (lx0 (( lx0 where symbol "=" lx1 ) action) (( lx1 ) action) ) (rx1 (( lx2 setq rx1 ) action) (( rx2 ) action) ) (lx1 (( lx2 setq lx1 ) action) (( lx2 ) action) ) (rx2tail (( rx3 ) action) (( lx3 or rx2tail ) action) ) (rx2 (( lx3 or rx2tail ) action) (( rx3 ) action) ) (lx2tail (( lx3 ) action) (( lx3 or lx2tail ) action) ) (lx2 (( lx3 or lx2tail ) action) (( lx3 ) action) ) (rx3tail (( rx4 ) action) (( lx4 and rx3tail ) action) ) (rx3 (( lx4 and rx3tail ) action) (( rx4 ) action) ) (lx3tail (( lx4 ) action) (( lx4 and lx3tail ) action) ) (lx3 (( lx4 and lx3tail ) action) (( lx4 ) action) ) (rx4 (( not rx4 ) action) (( rx5 ) action) ) (lx4 (( not lx4 ) action) (( lx5 ) action) ) (rx5 (( lx6 member ry6 ) action) (( lx6 memq ry6 ) action) (( lx6 "=" ry6 ) action) (( lx6 neq ry6 ) action) (( lx6 eq ry6 ) action) (( lx6 geq ry6 ) action) (( lx6 ">" ry6 ) action) (( lx6 leq ry6 ) action) (( lx6 "<" ry6 ) action) (( lx6 freeof ry6 ) action) (( rx6 ) action) ) (lx5 (( lx6 member ly6 ) action) (( lx6 memq ly6 ) action) (( lx6 "=" ly6 ) action) (( lx6 neq ly6 ) action) (( lx6 eq ly6 ) action) (( lx6 geq ly6 ) action) (( lx6 ">" ly6 ) action) (( lx6 leq ly6 ) action) (( lx6 "<" ly6 ) action) (( lx6 freeof ly6 ) action) (( lx6 ) action) ) (ry6 (( not ry6 ) action) (( rx6 ) action) ) (ly6 (( not ly6 ) action) (( lx6 ) action) ) (rx6tail (( ry6a ) action) (( ly6a "+" rx6tail ) action) ) (rx6 (( lx6a "+" rx6tail ) action) (( rx6a ) action) ) (lx6tail (( ly6a ) action) (( ly6a "+" lx6tail ) action) ) (lx6 (( lx6a "+" lx6tail ) action) (( lx6a ) action) ) (ry6a (( not ry6a ) action) (( rx6a ) action) ) (rx6a (( lx6a "-" ry7 ) action) (( rx7 ) action) ) (ly6a (( not ly6a ) action) (( lx6a ) action) ) (lx6a (( lx6a "-" ly7 ) action) (( lx7 ) action) ) (ry7 (( not ry7 ) action) (( rx7 ) action) ) (rx7 (( "+" ry7 ) action) (( "-" ry7 ) action) (( rx8 ) action) ) (ly7 (( not ly7 ) action) (( lx7 ) action) ) (lx7 (( "+" ly7 ) action) (( "-" ly7 ) action) (( lx8 ) action) ) (rx8tail (( ry9 ) action) (( ly9 "*" rx8tail ) action) ) (rx8 (( lx9 "*" rx8tail ) action) (( rx9 ) action) ) (lx8tail (( ly9 ) action) (( ly9 "*" lx8tail ) action) ) (lx8 (( lx9 "*" lx8tail ) action) (( lx9 ) action) ) (ry9 (( not ry9 ) action) (( "+" ry9 ) action) (( "-" ry9 ) action) (( rx9 ) action) ) (rx9 (( lx9 "/" ry10 ) action) (( rx10 ) action) ) (ly9 (( not ly9 ) action) (( "+" ly9 ) action) (( "-" ly9 ) action) (( lx9 ) action) ) (lx9 (( lx9 "/" ly10 ) action) (( lx10 ) action) ) (ly10 (( not ly10 ) action) (( "+" ly10 ) action) (( "-" ly10 ) action) (( lx10 ) action) ) (lx10 (( lx11 "^" ly10 ) action) (( lx11 ) action) ) (ry10 (( not ry10 ) action) (( "+" ry10 ) action) (( "-" ry10 ) action) (( rx10 ) action) ) (rx10 (( lx11 "^" ry10 ) action) (( rx11 ) action) ) (ry11 (( not ry11 ) action) (( "+" ry11 ) action) (( "-" ry11 ) action) (( rx11 ) action) ) (rx11 (( x12 "." ry11 ) action) (( if_stmt ) action) (( for_stmt ) action) (( while_stmt ) action) (( repeat_stmt ) action) (( return_stmt ) action) (( goto_stmt ) action) (( lambda_expr ) action) (( proc_type ) action) (( proc_def ) action) (( endstat ) action) ) (ly11 (( not ly11 ) action) (( "+" ly11 ) action) (( "-" ly11 ) action) (( lx11 ) action) ) (lx11 (( x12 "." ly11 ) action) (( x12 ) action) ) (arg_list (( expr ")" ) action) (( expr "," arg_list ) action) ) (x12 (( x13 "[" expr "]" ) action) (( x13 "(" ")" ) action) (( x13 "(" expr "," arg_list ) action) (( x13 x12 ) action) (( x13 ) action) ) (x13 (( symbol ) action) (( number ) action) (( string ) action) (( quoted ) action) (( backquoted ) action) (( group_expr ) action) (( block_expr ) action) (( "(" expr ")" ) action) ) )$ % lalr_construct_parser rlisp_grammar; end;