Fix typo
[rofl0r-order-pp.git] / example / lambda / lambda.c
blob1b76e6bf63dfee6da6ab8f527d667e67ff7e53b5
1 // (C) Copyright Vesa Karvonen 2004.
2 //
3 // Distributed under the Boost Software License, Version 1.0.
4 // (See accompanying file LICENSE.)
6 #include "brd_parser.h"
7 #include "datatype.h"
8 #include "error.h"
9 #include "str.h"
10 #include <assert.h>
11 #include <ctype.h>
12 #include <stdio.h>
13 #include <stdlib.h>
15 // ## An interpreter for the lambda calculus
17 // We will now implement a simple interpreter for the lambda
18 // calculus using the variant record generator ("datatype.h") and
19 // the backtracking recursive descent parser generator
20 // ("brd_parser.h"). We will discuss the generators here only
21 // briefly. The reader should refer to their documentation for
22 // detailed description of their design and usage. We will also
23 // assume that the reader is familiar with lambda calculus and do
24 // not introduce it here.
26 // ### The representation of programs
28 // Let's start by defining the most important data types used in the
29 // interpreter.
31 // We will represent identifiers using interned constant strings, as
32 // defined by the `str'-module ("str.h" and "str.c"). The main
33 // advantage of the representation is that it makes it very easy and
34 // efficient to compare identifiers for equality.
36 typedef str_type id_type;
38 // Like many language processors do, our intepreter uses an abstract
39 // syntax tree representation of programs. A natural way to
40 // represent abstract syntax trees is to use variant records. In our
41 // little language, a program consist of a list of definitions,
42 // which is a rather minimal syntactic extension to pure lambda
43 // calculus intended to simplify the writing of longer programs, and
44 // a single lambda calculus expression to evaluate. A definition
45 // simply associates an identifier with the value of an expression.
46 // Finally, a lambda calculus expression is either a lambda
47 // abstraction, a function application or a variable reference.
49 // The previous informal definition is captured precisely by the
50 // following data type definition, which defines a set of three
51 // variant records.
53 DATATYPE_define((prg_type,
54 (Prg_Def,
55 (def_type)(prg_type))
56 (Prg_Exp,
57 (exp_type)))
58 (def_type,
59 (Def,
60 (id_type)(exp_type)))
61 (exp_type,
62 (Exp_Lambda,
63 (id_type)(exp_type))
64 (Exp_Apply,
65 (exp_type)(exp_type))
66 (Exp_Var,
67 (id_type))))
69 // As indicated, the above defines three variant record types at
70 // once. The reason for defining them all at once is that the type
71 // of a program, `prg_type', refers to the type of definitions,
72 // `def_type', and the type of definitions refers to the type of
73 // expressions, `exp_type'. In this case, we could have also defined
74 // the three types separately, in reverse order, because the
75 // dependency relation between the types is acyclic, but it is
76 // perhaps more natural to define the type of programs first. If a
77 // set of variant records has a cyclic dependency relation, then
78 // there is no choice, they have to be defined all at once.
80 // In general, the `DATATYPE_define' macro takes a sequence of
81 // variant record type definitions. Each type definition consists of
82 // a pair of the type name and a sequence of constructors
83 // definitions. Each constructor definition is a pair consisting of
84 // the constructor name and a sequence of field types. The
85 // `DATATYPE_define' macro then produces definitions of all the
86 // variant record types along with constructor functions for all the
87 // variant constructors.
89 // ### Unparsing programs
91 // In the previous section we defined the representation of the
92 // abstract syntax of our little language. In this section we will
93 // consider the concrete syntax of our language by defining a
94 // function that translates the abstract syntax tree of an
95 // expression to the concrete syntax.
97 // First, let's take a look at a BNF description of the concrete
98 // syntax we are going to generate. The description is embedded in
99 // the constant string `lambda_syntax' below.
101 static const str_type lambda_syntax
102 = (" <prg> ::= <def> <prg> Sequence of definitions\n"
103 " | <exp> Expression to reduce\n"
104 "\n"
105 " <def> ::= def id = <exp> Constant definition\n"
106 "\n"
107 " <exp> ::= \\ id . <exp> Lambda abstraction\n"
108 " | ( <exp> <exp> ) Function application\n"
109 " | id Variable reference\n"
110 "\n"
111 " id = [a-zA-Z_][0-9a-zA-Z_]* Identifier\n");
113 // As you can see, each definition begins with the token `def' and
114 // the token `=' separates the identifier and the expression of a
115 // definition. Similarly a lambda abstraction begins with the token
116 // `\' and a `.' separates the identifier and the expression of a
117 // lambda abstraction. Function application is a parenthesized pair
118 // of expressions without a puncturator between the expressions.
120 // The description of the syntax of identifiers above isn't in BNF,
121 // but is rather a regular expression. An identifier must start
122 // with a letter or an underscore and can optionally continue with
123 // digits, letters or underscores.
125 // While not an explicit part of the above syntax description,
126 // whitespace separates adjacent tokens that could otherwise be
127 // mistaken to be a single token, but whitespace is otherwise
128 // ignored.
130 // The constant `simple_example' contains a simple example of the
131 // syntax.
133 static const str_type simple_example
134 = (" def true = \\t.\\e.t\n"
135 " def false = \\t.\\e.e\n"
136 " ((true false) true)\n");
138 // The reader should recognize the definitions of the Church
139 // booleans above.
141 // For our purpose, which is to be able to print the value of a
142 // program. it will be sufficient to be able to translate the
143 // abstract syntax of an expression to the above concrete syntax.
144 // For this purpose we will define the function `exp_unparse'.
146 static str_type exp_unparse(exp_type exp) {
147 DATATYPE_switch
148 (exp, exp_type,
149 (Exp_Lambda,(var)(body), {
150 return str_cat("\\", var, ".", exp_unparse(body),
151 str_end); })
152 (Exp_Apply,(lhs)(rhs), {
153 return str_cat("(", exp_unparse(lhs),
154 " ", exp_unparse(rhs),
155 ")",
156 str_end); })
157 (Exp_Var,(var), {
158 return var; }));
161 // The most interesting thing in `exp_unparse' is the use of the
162 // `DATATYPE_switch' macro. Everything else should be quite obvious.
164 // The `DATATYPE_switch' macro takes three parameters. The first
165 // parameter is the expression whose value is to be examined. The
166 // second is the type of the value. The third parameter is a
167 // sequence of cases. Each case, represented by a variadic tuple,
168 // names a constructor and a sequence of variables to be bound to
169 // the fields of the constructor followed by a block of code to be
170 // executed if the value matches the constructor. Strictly speaking,
171 // it isn't necessary to make the code into a block, like is done
172 // above, by surrouding it with braces, but it makes the code stand
173 // out making it easier to read.
175 // The `DATATYPE_switch' macro generates code that evaluates the
176 // given expression and examines its value to decide which case to
177 // execute, depending on the constructor. Then the variables
178 // specified along with the constructor pattern are given the values
179 // of the fields of the variant. The combination of case analysis
180 // and the binding of variables to fields makes `DATATYPE_switch'
181 // rather convenient to use compared to writing code with similar
182 // semantics manually.
184 // You might also make a note of the use of recursion. Recursion is
185 // a natural way to implement algorithms on inductive data types.
186 // While C doesn't require a compiler to implement tail recursion
187 // optimization, a good compiler implements it anyway and the
188 // cost of using recursion is not prohibitive.
190 // ### Evaluating programs
192 // We will now turn to the evaluation of programs. The simple
193 // evaluator we will implement here is based on beta reduction, that
194 // is, direct substitution of values for variables.
196 // We will start by defining the `exp_subst' function, which
197 // substitutes a given expression for a given variable in another
198 // expression.
200 static exp_type exp_subst(exp_type exp, id_type id, exp_type in) {
201 DATATYPE_switch
202 (in, exp_type,
203 (Exp_Lambda,(var)(body), {
204 return var == id
205 ? in
206 : Exp_Lambda(var, exp_subst(exp, id, body)); })
207 (Exp_Apply,(lhs)(rhs), {
208 return Exp_Apply(exp_subst(exp, id, lhs),
209 exp_subst(exp, id, rhs)); })
210 (Exp_Var,(var), {
211 return var == id
212 ? exp
213 : in; }));
216 // Then we will define the function `exp_reduce', which reduces, or
217 // evaluates, a given expression. Our `exp_reduce' has the strict
218 // semantics, meaning that actual parameters are evaluated before
219 // substitution. We leave it as exercise to change the interpreter
220 // to use lazy evaluation.
222 static exp_type exp_reduce(exp_type exp) {
223 DATATYPE_switch
224 (exp, exp_type,
225 (Exp_Lambda,, {
226 return exp; })
227 (Exp_Apply,(lhs)(rhs), {
228 DATATYPE_switch
229 (exp_reduce(lhs), exp_type,
230 (Exp_Lambda,(var)(body), {
231 return exp_reduce(exp_subst(exp_reduce(rhs),
232 var,
233 body)); })
234 (Exp_Apply,, {
235 ERROR_exit("'%s' doesn't reduce to a Lambda.",
236 exp_unparse(lhs)); })
237 (Exp_Var,, {
238 ERROR_exit("'%s' doesn't reduce to a Lambda.",
239 exp_unparse(lhs)); })); })
240 (Exp_Var,(var), {
241 ERROR_exit("Unbound variable '%s'.", var); }));
244 // As you can see above, we handle errors in a very simple way. Upon
245 // encoutering an error, we simply exit with a rather uninformative
246 // message.
248 // Another thing worth pointing out above is that we didn't specify
249 // any variables to be bound for the `Exp_Apply' and `Exp_Var'
250 // constructors in the inner `DATATYPE_switch'. If we had specified
251 // variables to be bound, we would have gotten warnings for not
252 // referring to them.
254 // The astute reader familiar with lambda calculus and beta
255 // reduction should now be thinking that we forgot alpha conversion,
256 // that is, renaming of bound variables to avoid variable capture.
257 // We could have handled alpha conversion in `exp_reduce', like is
258 // often done in mathematical definitions of beta reduction, but
259 // doing so would be inefficient, leading to repeated alpha
260 // conversion. Instead, we will implement full renaming of all bound
261 // variables prior to beta reduction, which is usually more
262 // efficient.
264 // First we'll define the function `id_fresh' that generates a fresh
265 // identifier. We make a point of making the form of the generated
266 // identifier such that it doesn't correspond to the syntax of the
267 // language.
269 static id_type id_fresh(id_type base) {
270 static unsigned int counter = 0;
272 if (!++counter)
273 ERROR_exit("Internal counter overflow.");
275 return str_cat(base, "{", uint_to_str(counter), "}", str_end);
278 // Then we'll define the function `exp_fresh' which renames all
279 // bound variables in an expression producing a new expression.
281 static exp_type exp_fresh(exp_type exp) {
282 DATATYPE_switch
283 (exp, exp_type,
284 (Exp_Lambda,(var)(body), {
285 id_type new_var = id_fresh(var);
286 return Exp_Lambda(new_var,
287 exp_fresh(exp_subst(Exp_Var(new_var), var, body))); })
288 (Exp_Apply,(lhs)(rhs), {
289 return Exp_Apply(exp_fresh(lhs), exp_fresh(rhs)); })
290 (Exp_Var,, {
291 return exp; }));
294 // The above `exp_fresh' function is still rather inefficient, but
295 // it is still usually much more efficient than performing alpha
296 // conversion repeatedly.
298 // There is one more thing left to do. Definitions are not a part of
299 // the pure lambda calculus and aren't handled by our previous
300 // functions. For the sake of simplicity, and to demonstrate that
301 // definitions don't make the language stronger, we will simply
302 // convert a program into an expression by turning each definition
303 // into an expression that applies the body of the definition to a
304 // lambda whose variable is the name of the definition and whose
305 // body is the rest of the program. This shows that definitions are
306 // just syntactic sugar.
308 static exp_type prg_to_exp(prg_type prg) {
309 DATATYPE_switch
310 (prg, prg_type,
311 (Prg_Def,(first)(rest), {
312 DATATYPE_switch
313 (first, def_type,
314 (Def,(var)(body), {
315 return Exp_Apply(Exp_Lambda(var, prg_to_exp(rest)),
316 body); })); })
317 (Prg_Exp,(last), {
318 return last; }));
321 // We could now build programs by calling the variant constructors
322 // by hand and then calling the evaluation functions. However, we
323 // will not do it, because it is much more convenient to define a
324 // parser and then use the significantly more concise concrete
325 // syntax.
327 // ### The parser
329 // We will use the simple backtracking recursive descent parser
330 // generator ("brd_parser.h") to implement the parser, but first we
331 // need a function for lexing an identifier token.
333 static _Bool id_parse(str_type *pstr, id_type *id) {
334 assert(pstr);
335 assert(*pstr);
337 str_type str = *pstr;
339 if (!isalpha(*str) && '_' != *str)
340 return 0;
342 while (isalnum(*str) || '_' == *str)
343 ++str;
345 *id = str_substr(*pstr, str);
346 *pstr = str;
348 return 1;
351 // With the `id_parse' and a couple of functions provided by the
352 // `str'-module ("str.h" and "str.c"), namely `str_skip_spaces' and
353 // `str_match_prefix', we are ready to define the parser.
355 BRD_PARSER
356 (static, prg_parse,
357 str_skip_spaces, str_match_prefix,
358 (id, id_type, id_parse),
359 (prg, prg_type,
360 ((def, first)(prg, rest), *prg = Prg_Def(first, rest); )
361 ((exp, last), *prg = Prg_Exp(last); ))
362 (def, def_type,
363 (("def")(id, var)("=")(exp, body), *def = Def(var, body); ))
364 (exp, exp_type,
365 (("\\")(id, var)(".")(exp, body), *exp = Exp_Lambda(var, body); )
366 (("(")(exp, lhs)(exp, rhs)(")"), *exp = Exp_Apply(lhs, rhs); )
367 ((id, var), *exp = Exp_Var(var); )))
369 // The `BRD_PARSER' macro implements the simple parser generator.
370 // The first two parameters of the macro specify the linkage and
371 // name of the main entry point to the generated parser. The next
372 // two parameters specify what to do between tokens and how to match
373 // punctuation tokens. The next parameter is a sequence of token
374 // specifications each being a triple of the name, type and lexing
375 // function. The last parameter is a sequence of productions,
376 // defining the grammar, with semantic actions.
378 // The `BRD_PARSER' macro then generates a simple backtracking
379 // recursive descent parser according to the grammar that executes
380 // the semantic actions while parsing. The semantic actions should
381 // not have side effects, because, for simplicity, the generated
382 // parser doesn't wait until it has a complete parse, but rather
383 // calls the actions immediately upon recognizing a complete
384 // production.
386 // The above parser doesn't recognize `def' as a keyword, but
387 // instead allows an identifier named `def'. This could lead to
388 // syntax errors being reported late. We leave it as an exercise to
389 // treat `def' properly as a keyword that is not allowed as an
390 // identifier.
392 // ### Putting it all together
394 // We now have all the ingredients to write a function that
395 // evaluates a given program. The below `eval' function takes a
396 // string, parses the string to yield a program, and then executes
397 // the program using the functions we defined in previous sections.
399 static str_type eval(str_type code) {
400 str_type str = code;
401 prg_type prg = 0;
402 if (!prg_parse(&str, &prg))
403 ERROR_exit("Couldn't parse '%s'.", str);
404 str_skip_spaces(&str);
405 if ('\0' != *str)
406 ERROR_exit("Garbage '%s' following program.", str);
408 return exp_unparse(exp_reduce(exp_fresh(prg_to_exp(prg))));
411 // The only thing left to do in `main' is to check the command line
412 // arguments and display usage instructions when appropriate. As a
413 // final twist, we use the interpreter to evaluate the simple
414 // example that is part of the instructions.
416 int main(int argc, char* argv[]) {
417 if (2 != argc) {
418 fprintf
419 (stderr,
420 "Usage: lambda '<prg>'\n"
421 "\n"
422 "The entire program must be a single argument.\n"
423 "\n"
424 "Syntax:\n"
425 "\n"
426 "%s"
427 "\n"
428 "For example, the program\n"
429 "\n"
430 "%s"
431 "\n"
432 "would evaluate to\n"
433 "\n"
434 " %s\n"
435 "\n"
436 "The numbers inside braces are produced by alpha conversion.\n",
437 lambda_syntax,
438 simple_example,
439 eval(simple_example));
440 return EXIT_FAILURE;
443 printf("%s\n", eval(argv[1]));
445 return EXIT_SUCCESS;
448 // ### Exercises
450 // 1. Extend the parser to allow comments and to treat `def' as a
451 // keyword.
453 // 2. Change the interpreter to use lazy evaluation.
455 // 3. Design and implement better error handling and error messages.