1 // (C) Copyright Vesa Karvonen 2004.
3 // Distributed under the Boost Software License, Version 1.0.
4 // (See accompanying file LICENSE.)
6 #include "brd_parser.h"
15 // ## An interpreter for the lambda calculus
17 // We will now implement a simple interpreter for the lambda
18 // calculus using the variant record generator ("datatype.h") and
19 // the backtracking recursive descent parser generator
20 // ("brd_parser.h"). We will discuss the generators here only
21 // briefly. The reader should refer to their documentation for
22 // detailed description of their design and usage. We will also
23 // assume that the reader is familiar with lambda calculus and do
24 // not introduce it here.
26 // ### The representation of programs
28 // Let's start by defining the most important data types used in the
31 // We will represent identifiers using interned constant strings, as
32 // defined by the `str'-module ("str.h" and "str.c"). The main
33 // advantage of the representation is that it makes it very easy and
34 // efficient to compare identifiers for equality.
36 typedef str_type id_type
;
38 // Like many language processors do, our intepreter uses an abstract
39 // syntax tree representation of programs. A natural way to
40 // represent abstract syntax trees is to use variant records. In our
41 // little language, a program consist of a list of definitions,
42 // which is a rather minimal syntactic extension to pure lambda
43 // calculus intended to simplify the writing of longer programs, and
44 // a single lambda calculus expression to evaluate. A definition
45 // simply associates an identifier with the value of an expression.
46 // Finally, a lambda calculus expression is either a lambda
47 // abstraction, a function application or a variable reference.
49 // The previous informal definition is captured precisely by the
50 // following data type definition, which defines a set of three
53 DATATYPE_define((prg_type
,
69 // As indicated, the above defines three variant record types at
70 // once. The reason for defining them all at once is that the type
71 // of a program, `prg_type', refers to the type of definitions,
72 // `def_type', and the type of definitions refers to the type of
73 // expressions, `exp_type'. In this case, we could have also defined
74 // the three types separately, in reverse order, because the
75 // dependency relation between the types is acyclic, but it is
76 // perhaps more natural to define the type of programs first. If a
77 // set of variant records has a cyclic dependency relation, then
78 // there is no choice, they have to be defined all at once.
80 // In general, the `DATATYPE_define' macro takes a sequence of
81 // variant record type definitions. Each type definition consists of
82 // a pair of the type name and a sequence of constructors
83 // definitions. Each constructor definition is a pair consisting of
84 // the constructor name and a sequence of field types. The
85 // `DATATYPE_define' macro then produces definitions of all the
86 // variant record types along with constructor functions for all the
87 // variant constructors.
89 // ### Unparsing programs
91 // In the previous section we defined the representation of the
92 // abstract syntax of our little language. In this section we will
93 // consider the concrete syntax of our language by defining a
94 // function that translates the abstract syntax tree of an
95 // expression to the concrete syntax.
97 // First, let's take a look at a BNF description of the concrete
98 // syntax we are going to generate. The description is embedded in
99 // the constant string `lambda_syntax' below.
101 static const str_type lambda_syntax
102 = (" <prg> ::= <def> <prg> Sequence of definitions\n"
103 " | <exp> Expression to reduce\n"
105 " <def> ::= def id = <exp> Constant definition\n"
107 " <exp> ::= \\ id . <exp> Lambda abstraction\n"
108 " | ( <exp> <exp> ) Function application\n"
109 " | id Variable reference\n"
111 " id = [a-zA-Z_][0-9a-zA-Z_]* Identifier\n");
113 // As you can see, each definition begins with the token `def' and
114 // the token `=' separates the identifier and the expression of a
115 // definition. Similarly a lambda abstraction begins with the token
116 // `\' and a `.' separates the identifier and the expression of a
117 // lambda abstraction. Function application is a parenthesized pair
118 // of expressions without a puncturator between the expressions.
120 // The description of the syntax of identifiers above isn't in BNF,
121 // but is rather a regular expression. An identifier must start
122 // with a letter or an underscore and can optionally continue with
123 // digits, letters or underscores.
125 // While not an explicit part of the above syntax description,
126 // whitespace separates adjacent tokens that could otherwise be
127 // mistaken to be a single token, but whitespace is otherwise
130 // The constant `simple_example' contains a simple example of the
133 static const str_type simple_example
134 = (" def true = \\t.\\e.t\n"
135 " def false = \\t.\\e.e\n"
136 " ((true false) true)\n");
138 // The reader should recognize the definitions of the Church
141 // For our purpose, which is to be able to print the value of a
142 // program. it will be sufficient to be able to translate the
143 // abstract syntax of an expression to the above concrete syntax.
144 // For this purpose we will define the function `exp_unparse'.
146 static str_type
exp_unparse(exp_type exp
) {
149 (Exp_Lambda
,(var
)(body
), {
150 return str_cat("\\", var
, ".", exp_unparse(body
),
152 (Exp_Apply
,(lhs
)(rhs
), {
153 return str_cat("(", exp_unparse(lhs
),
154 " ", exp_unparse(rhs
),
161 // The most interesting thing in `exp_unparse' is the use of the
162 // `DATATYPE_switch' macro. Everything else should be quite obvious.
164 // The `DATATYPE_switch' macro takes three parameters. The first
165 // parameter is the expression whose value is to be examined. The
166 // second is the type of the value. The third parameter is a
167 // sequence of cases. Each case, represented by a variadic tuple,
168 // names a constructor and a sequence of variables to be bound to
169 // the fields of the constructor followed by a block of code to be
170 // executed if the value matches the constructor. Strictly speaking,
171 // it isn't necessary to make the code into a block, like is done
172 // above, by surrouding it with braces, but it makes the code stand
173 // out making it easier to read.
175 // The `DATATYPE_switch' macro generates code that evaluates the
176 // given expression and examines its value to decide which case to
177 // execute, depending on the constructor. Then the variables
178 // specified along with the constructor pattern are given the values
179 // of the fields of the variant. The combination of case analysis
180 // and the binding of variables to fields makes `DATATYPE_switch'
181 // rather convenient to use compared to writing code with similar
182 // semantics manually.
184 // You might also make a note of the use of recursion. Recursion is
185 // a natural way to implement algorithms on inductive data types.
186 // While C doesn't require a compiler to implement tail recursion
187 // optimization, a good compiler implements it anyway and the
188 // cost of using recursion is not prohibitive.
190 // ### Evaluating programs
192 // We will now turn to the evaluation of programs. The simple
193 // evaluator we will implement here is based on beta reduction, that
194 // is, direct substitution of values for variables.
196 // We will start by defining the `exp_subst' function, which
197 // substitutes a given expression for a given variable in another
200 static exp_type
exp_subst(exp_type exp
, id_type id
, exp_type in
) {
203 (Exp_Lambda
,(var
)(body
), {
206 : Exp_Lambda(var
, exp_subst(exp
, id
, body
)); })
207 (Exp_Apply
,(lhs
)(rhs
), {
208 return Exp_Apply(exp_subst(exp
, id
, lhs
),
209 exp_subst(exp
, id
, rhs
)); })
216 // Then we will define the function `exp_reduce', which reduces, or
217 // evaluates, a given expression. Our `exp_reduce' has the strict
218 // semantics, meaning that actual parameters are evaluated before
219 // substitution. We leave it as exercise to change the interpreter
220 // to use lazy evaluation.
222 static exp_type
exp_reduce(exp_type exp
) {
227 (Exp_Apply
,(lhs
)(rhs
), {
229 (exp_reduce(lhs
), exp_type
,
230 (Exp_Lambda
,(var
)(body
), {
231 return exp_reduce(exp_subst(exp_reduce(rhs
),
235 ERROR_exit("'%s' doesn't reduce to a Lambda.",
236 exp_unparse(lhs
)); })
238 ERROR_exit("'%s' doesn't reduce to a Lambda.",
239 exp_unparse(lhs
)); })); })
241 ERROR_exit("Unbound variable '%s'.", var
); }));
244 // As you can see above, we handle errors in a very simple way. Upon
245 // encoutering an error, we simply exit with a rather uninformative
248 // Another thing worth pointing out above is that we didn't specify
249 // any variables to be bound for the `Exp_Apply' and `Exp_Var'
250 // constructors in the inner `DATATYPE_switch'. If we had specified
251 // variables to be bound, we would have gotten warnings for not
252 // referring to them.
254 // The astute reader familiar with lambda calculus and beta
255 // reduction should now be thinking that we forgot alpha conversion,
256 // that is, renaming of bound variables to avoid variable capture.
257 // We could have handled alpha conversion in `exp_reduce', like is
258 // often done in mathematical definitions of beta reduction, but
259 // doing so would be inefficient, leading to repeated alpha
260 // conversion. Instead, we will implement full renaming of all bound
261 // variables prior to beta reduction, which is usually more
264 // First we'll define the function `id_fresh' that generates a fresh
265 // identifier. We make a point of making the form of the generated
266 // identifier such that it doesn't correspond to the syntax of the
269 static id_type
id_fresh(id_type base
) {
270 static unsigned int counter
= 0;
273 ERROR_exit("Internal counter overflow.");
275 return str_cat(base
, "{", uint_to_str(counter
), "}", str_end
);
278 // Then we'll define the function `exp_fresh' which renames all
279 // bound variables in an expression producing a new expression.
281 static exp_type
exp_fresh(exp_type exp
) {
284 (Exp_Lambda
,(var
)(body
), {
285 id_type new_var
= id_fresh(var
);
286 return Exp_Lambda(new_var
,
287 exp_fresh(exp_subst(Exp_Var(new_var
), var
, body
))); })
288 (Exp_Apply
,(lhs
)(rhs
), {
289 return Exp_Apply(exp_fresh(lhs
), exp_fresh(rhs
)); })
294 // The above `exp_fresh' function is still rather inefficient, but
295 // it is still usually much more efficient than performing alpha
296 // conversion repeatedly.
298 // There is one more thing left to do. Definitions are not a part of
299 // the pure lambda calculus and aren't handled by our previous
300 // functions. For the sake of simplicity, and to demonstrate that
301 // definitions don't make the language stronger, we will simply
302 // convert a program into an expression by turning each definition
303 // into an expression that applies the body of the definition to a
304 // lambda whose variable is the name of the definition and whose
305 // body is the rest of the program. This shows that definitions are
306 // just syntactic sugar.
308 static exp_type
prg_to_exp(prg_type prg
) {
311 (Prg_Def
,(first
)(rest
), {
315 return Exp_Apply(Exp_Lambda(var
, prg_to_exp(rest
)),
321 // We could now build programs by calling the variant constructors
322 // by hand and then calling the evaluation functions. However, we
323 // will not do it, because it is much more convenient to define a
324 // parser and then use the significantly more concise concrete
329 // We will use the simple backtracking recursive descent parser
330 // generator ("brd_parser.h") to implement the parser, but first we
331 // need a function for lexing an identifier token.
333 static _Bool
id_parse(str_type
*pstr
, id_type
*id
) {
337 str_type str
= *pstr
;
339 if (!isalpha(*str
) && '_' != *str
)
342 while (isalnum(*str
) || '_' == *str
)
345 *id
= str_substr(*pstr
, str
);
351 // With the `id_parse' and a couple of functions provided by the
352 // `str'-module ("str.h" and "str.c"), namely `str_skip_spaces' and
353 // `str_match_prefix', we are ready to define the parser.
357 str_skip_spaces
, str_match_prefix
,
358 (id
, id_type
, id_parse
),
360 ((def
, first
)(prg
, rest
), *prg
= Prg_Def(first
, rest
); )
361 ((exp
, last
), *prg
= Prg_Exp(last
); ))
363 (("def")(id
, var
)("=")(exp
, body
), *def
= Def(var
, body
); ))
365 (("\\")(id
, var
)(".")(exp
, body
), *exp
= Exp_Lambda(var
, body
); )
366 (("(")(exp
, lhs
)(exp
, rhs
)(")"), *exp
= Exp_Apply(lhs
, rhs
); )
367 ((id
, var
), *exp
= Exp_Var(var
); )))
369 // The `BRD_PARSER' macro implements the simple parser generator.
370 // The first two parameters of the macro specify the linkage and
371 // name of the main entry point to the generated parser. The next
372 // two parameters specify what to do between tokens and how to match
373 // punctuation tokens. The next parameter is a sequence of token
374 // specifications each being a triple of the name, type and lexing
375 // function. The last parameter is a sequence of productions,
376 // defining the grammar, with semantic actions.
378 // The `BRD_PARSER' macro then generates a simple backtracking
379 // recursive descent parser according to the grammar that executes
380 // the semantic actions while parsing. The semantic actions should
381 // not have side effects, because, for simplicity, the generated
382 // parser doesn't wait until it has a complete parse, but rather
383 // calls the actions immediately upon recognizing a complete
386 // The above parser doesn't recognize `def' as a keyword, but
387 // instead allows an identifier named `def'. This could lead to
388 // syntax errors being reported late. We leave it as an exercise to
389 // treat `def' properly as a keyword that is not allowed as an
392 // ### Putting it all together
394 // We now have all the ingredients to write a function that
395 // evaluates a given program. The below `eval' function takes a
396 // string, parses the string to yield a program, and then executes
397 // the program using the functions we defined in previous sections.
399 static str_type
eval(str_type code
) {
402 if (!prg_parse(&str
, &prg
))
403 ERROR_exit("Couldn't parse '%s'.", str
);
404 str_skip_spaces(&str
);
406 ERROR_exit("Garbage '%s' following program.", str
);
408 return exp_unparse(exp_reduce(exp_fresh(prg_to_exp(prg
))));
411 // The only thing left to do in `main' is to check the command line
412 // arguments and display usage instructions when appropriate. As a
413 // final twist, we use the interpreter to evaluate the simple
414 // example that is part of the instructions.
416 int main(int argc
, char* argv
[]) {
420 "Usage: lambda '<prg>'\n"
422 "The entire program must be a single argument.\n"
428 "For example, the program\n"
432 "would evaluate to\n"
436 "The numbers inside braces are produced by alpha conversion.\n",
439 eval(simple_example
));
443 printf("%s\n", eval(argv
[1]));
450 // 1. Extend the parser to allow comments and to treat `def' as a
453 // 2. Change the interpreter to use lazy evaluation.
455 // 3. Design and implement better error handling and error messages.