Handout 4: A predictive (recursive descent) parser example


This is a complete development of a parser for a slight simplification
of Grammar 3.3 from the text.  The simplification involves dropping
the minus and divides operations (- and /), so that there are only two
infix operators, + and *. The $ represents an EOF token signalling the
end of the input token stream.

----------------------------------------------------------------------
Grammar 1: Basic (ambiguous) grammar
----------------------------------------------------------------------
Nonterminals: {S,E}  (start, expressions)
Terminals: { id, num, *, +, (, ), $ }
Productions:

   S → E $
   E → id
   E → num
   E → E * E
   E → E + E
   E → ( E )

----------------------------------------------------------------------

The first step is to elaborate the grammar to express the fact that *
has greater precedence than +. This produces an unambiguous grammar. 

----------------------------------------------------------------------
Grammar 2: Unambiguous grammar
----------------------------------------------------------------------
Nonterminals: {E,T,F}  (expressions, terms, factors)
Terminals: { id, num, *, +, (, ), $ }
Productions:

   S → E $

   E → E + T     T → T * F      F → id
   E → T         T → F          F → num
                                F → ( E )

----------------------------------------------------------------------

The next step is to eliminate the left-recursive productions
like "E → E + T", because these prevent a single token lookahead
from determining a unique production.

----------------------------------------------------------------------
Grammar 2: Unambiguous grammar
----------------------------------------------------------------------
Nonterminals: {E,T,F}  (expressions, terms, factors)
Terminals: { id, num, *, +, (, ) }
Productions:

   S → E $

   E → T E'       T → F T'        F → id
   E' → + T E'    T' → * F T'     F → num
   E' →           T' →            F → ( E )

----------------------------------------------------------------------

Next we calculate the nullable, FIRST, and FOLLOWS properties of
the nonterminals S, E, E', T, T', and F. (Nonterminal X is 
nullable if there is a derivation from X to the empty token string).
We start by calculating the nullable property using an iterative
process.

     nullable      FIRST        FOLLOW
--------------------------------------------
S      ?
E      ?
E'     t
T      ?
T'     t
F      f

Here we see immediately that E' and T' are nullable because
they have productions rewriting them to empty. We see immediately
that F is not nullable because the rhs's of each of the F productions
start with a terminal.

     nullable      FIRST        FOLLOW
--------------------------------------------
S      ?
E      ?
E'     t
T      f
T'     t
F      f

T is not nullable because is sole production has a rhs starting
with F, and we now know that F is not nullable.

     nullable      FIRST        FOLLOW
--------------------------------------------
S      ?
E      f
E'     t
T      f
T'     t
F      f

E is not nullable because is sole production has a rhs starting
with T, and we now know that T is not nullable.

     nullable      FIRST        FOLLOW
--------------------------------------------
S      f
E      f
E'     t
T      f
T'     t
F      f

S is not nullable because is sole production has a rhs starting
with E, and we now know that E is not nullable.


Now we calculate the FIRST and FOLLOW sets for each nonterminal
using a similar iterative process. We start with entries that
are immediately evident from inspection of the productions:

     nullable      FIRST        FOLLOW
--------------------------------------------
S      f
E      f                        $,)
E'     t          +
T      f
T'     t          *
F      f          id,num,(


     nullable      FIRST        FOLLOW
--------------------------------------------
S      f
E      f                        $,)
E'     t          +             $,)
T      f          id,num,(      +
T'     t          *
F      f          id,num,(      *

FIRST(T) = {id,num,(} from (T → F T') and FIRST(F) = {id,num,(}.
FOLLOW(T) = {+} from (E → T E') and FIRST(E') = {+}.
FOLLOW(F) = {*} from (T → F T') and FIRST(T') = {*}.
FOLLOW(E') = {$,)} from (E → T E') and FOLLOW(E) = {$,)}.

     nullable      FIRST        FOLLOW
--------------------------------------------
S      f
E      f          id,num,(      $,)
E'     t          +             $,)
T      f          id,num,(      +,$,)
T'     t          *             +
F      f          id,num,(      *,+

FIRST(E) = {id,num,(} from (E → T E'); FIRST(T) = {id,num,(}.
FOLLOW(T') = {+} from (T → F T'); FOLLOW(T) = {+}.
FOLLOW(F) = {*,+} from (T → F T'); FOLLOW(T) = {+}; nullable(T').
FOLLOW(T) = {+,$,)} from (E → T E'); FOLLOW(E) = {$,)}; nullable(E').

     nullable      FIRST        FOLLOW
--------------------------------------------
S      f          id,num,(
E      f          id,num,(      $,)
E'     t          +             $,)
T      f          id,num,(      +,$,)
T'     t          *             +,$,)
F      f          id,num,(      *,+,$,)

FIRST(S) = {id,num,(} from (S → E $); FIRST(E) = {id,num,(}.
FOLLOW(T') = {+,$,)} from (T → F T'); FOLLOW(T) = {+,$,)}.
FOLLOW(F) = {*,+,$,)} from (T → F T'); FOLLOW(T) = {+,$,)}; nullable(T').

And here the iteration stops, because no new tokens can be added
to any of the FIRST or FOLLOW sets.

Now we build a table of productions indexed by nonterminals (rows)
and terminals (columns)

       +         *         id        num        (         )         $
----------------------------------------------------------------------
S                        S → E$    S → E$    S → E$    
E                        E → TE'   E → TE'   E → TE'
E'  E' → +TE'                                           E' →      E' → 
T                        T → FT'   T → FT'   T → FT'
T'  T' →     T' → *FT'                                  T' →      T' →
F                        F → id    F → num   F → (E)

Here a production (X → s) is entered into the table at (X,x) if

   (1) x ∈ FIRST(X)
or (2) s nullible and x ∈ FOLLOW(X).


Since this table has at most one production for each position, we
can translate it into a predictive (recursive descent) parser:

----------------------------------------------------------------------
Program: predictive parser
----------------------------------------------------------------------

datatype nonTerm = S | E | E' | T | T' | F

datatype terminal
  = ID of string | NUM of int | PLUS | TIMES | LPAR | RPAR | EOF

(* datatype of values built by parser *)
datatype pexp
  = Eexp of pexp * pexp     (* E -> T E' *)
  | E'exp of pexp * pexp    (* E' -> + T E' *)
  | Texp of pexp * pexp     (* T -> F T' *)
  | T'exp of pexp * pexp    (* T' -> * F T' *)
  | NumExp of int           (* num *)
  | IdExp of string         (* id *)
  | EmptyExp                (* E' ->  or T' -> *)

(* desired expression type *)
datatype expr
  = Num of int
  | Var of string
  | Plus of expr * expr
  | Times of expr * expr

exception ParseError
fun parseError msg = (print (msg^"\n"); raise ParseError)

(* token input *)
val input: terminal list ref = ref nil

fun current () =
    case !input
      of t::_ => t
       | nil => EOF

fun advance () =
    case !input
      of _:: ts => input := ts
       | nil => ()

(* parsing *)

fun check EOF =
    (case !input
       of nil => ()
        | EOF :: _ => ()
        | _ => parseError "check")
  | check term =
    (case !input
       of t::ts => if t = term then input := ts else parseError "check"
        | nil => parseError "check")

fun parse(S) =
    (case current()
       of (ID _ | NUM _ | LPAR) => parse(E) before check(EOF)
	| _ => parseError("bad S"))
 
  | parse(E) =
    (case current()
       of (ID _ | NUM _ | LPAR) => Eexp(parse(T),parse(E'))
	| _ => parseError("bad E"))

  | parse(E') =
    (case current()
       of PLUS => (advance(); E'exp(parse(T), parse(E')))
	| RPAR => EmptyExp
	| EOF => EmptyExp
	| _ => parseError("bad E'"))

  | parse(T) =
    (case current()
       of (ID _ | NUM _ | LPAR) => Texp(parse(F),parse(T'))
	| _ => parseError("bad E"))

  | parse(T') =
    (case current()
       of PLUS => EmptyExp
	| TIMES => (advance(); T'exp(parse(F), parse(T')))
	| RPAR => EmptyExp
	| EOF => EmptyExp
	| _ => parseError("bad T'"))

  | parse(F) =
    (case current()
       of ID s => (advance(); IdExp(s))
        | NUM n => (advance(); NumExp(n))
        | LPAR => (advance(); parse(E) before check(RPAR))
	| _ => parseError("bad F"))


(* converting pexp produced by parser into an expr *)

(* mkExpr : pexp -> expr *)
fun mkExpr (Eexp(t,e')) =
    (case e'
       of EmptyExp => mkExpr t
        | E'exp(t1,e1') => mkSum(Plus(mkExpr(t), mkExpr(t1)), e1'))
  | mkExpr (Texp(f,t')) =
    (case t'
       of EmptyExp => mkExpr f
        | T'exp(f1,t1') => mkProd(Times(mkExpr(f), mkExpr(f1)), t1'))
  | mkExpr (IdExp(s)) = Var s
  | mkExpr (NumExp(n)) = Num n
  | mkExpr (EmptyExp) = parseError "empty"

and mkSum (e, EmptyExp) = e
  | mkSum (e, E'exp(t,e')) = mkSum(Plus(e,mkExpr t), e')

and mkProd (e, EmptyExp) = e
  | mkProd (e, T'exp(f,t')) = mkProd(Times(e,mkExpr f), t')


(* run the parser on a list of terminals *)
fun run (toks: terminal list) =
    (input := toks;
     let val e = parse(S)
      in mkExpr e
     end)