example/lambda/lambda.c

   1 // (C) Copyright Vesa Karvonen 2004.
   2 //
   3 // Distributed under the Boost Software License, Version 1.0.
   4 // (See accompanying file LICENSE.)
   5
   6 #include "brd_parser.h"
   7 #include "datatype.h"
   8 #include "error.h"
   9 #include "str.h"
  10 #include <assert.h>
  11 #include <ctype.h>
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14
  15 // ## An interpreter for the lambda calculus
  16 //
  17 // We will now implement a simple interpreter for the lambda
  18 // calculus using the variant record generator ("datatype.h") and
  19 // the backtracking recursive descent parser generator
  20 // ("brd_parser.h"). We will discuss the generators here only
  21 // briefly. The reader should refer to their documentation for
  22 // detailed description of their design and usage. We will also
  23 // assume that the reader is familiar with lambda calculus and do
  24 // not introduce it here.
  25 //
  26 // ### The representation of programs
  27 //
  28 // Let's start by defining the most important data types used in the
  29 // interpreter.
  30 //
  31 // We will represent identifiers using interned constant strings, as
  32 // defined by the `str'-module ("str.h" and "str.c"). The main
  33 // advantage of the representation is that it makes it very easy and
  34 // efficient to compare identifiers for equality.
  35
  36 typedef str_type id_type;
  37
  38 // Like many language processors do, our intepreter uses an abstract
  39 // syntax tree representation of programs. A natural way to
  40 // represent abstract syntax trees is to use variant records. In our
  41 // little language, a program consist of a list of definitions,
  42 // which is a rather minimal syntactic extension to pure lambda
  43 // calculus intended to simplify the writing of longer programs, and
  44 // a single lambda calculus expression to evaluate. A definition
  45 // simply associates an identifier with the value of an expression.
  46 // Finally, a lambda calculus expression is either a lambda
  47 // abstraction, a function application or a variable reference.
  48 //
  49 // The previous informal definition is captured precisely by the
  50 // following data type definition, which defines a set of three
  51 // variant records.
  52
  53 DATATYPE_define((prg_type,
  54                  (Prg_Def,
  55                   (def_type)(prg_type))
  56                  (Prg_Exp,
  57                   (exp_type)))
  58                 (def_type,
  59                  (Def,
  60                   (id_type)(exp_type)))
  61                 (exp_type,
  62                  (Exp_Lambda,
  63                   (id_type)(exp_type))
  64                  (Exp_Apply,
  65                   (exp_type)(exp_type))
  66                  (Exp_Var,
  67                   (id_type))))
  68
  69 // As indicated, the above defines three variant record types at
  70 // once. The reason for defining them all at once is that the type
  71 // of a program, `prg_type', refers to the type of definitions,
  72 // `def_type', and the type of definitions refers to the type of
  73 // expressions, `exp_type'. In this case, we could have also defined
  74 // the three types separately, in reverse order, because the
  75 // dependency relation between the types is acyclic, but it is
  76 // perhaps more natural to define the type of programs first. If a
  77 // set of variant records has a cyclic dependency relation, then
  78 // there is no choice, they have to be defined all at once.
  79 //
  80 // In general, the `DATATYPE_define' macro takes a sequence of
  81 // variant record type definitions. Each type definition consists of
  82 // a pair of the type name and a sequence of constructors
  83 // definitions. Each constructor definition is a pair consisting of
  84 // the constructor name and a sequence of field types. The
  85 // `DATATYPE_define' macro then produces definitions of all the
  86 // variant record types along with constructor functions for all the
  87 // variant constructors.
  88 //
  89 // ### Unparsing programs
  90 //
  91 // In the previous section we defined the representation of the
  92 // abstract syntax of our little language. In this section we will
  93 // consider the concrete syntax of our language by defining a
  94 // function that translates the abstract syntax tree of an
  95 // expression to the concrete syntax.
  96 //
  97 // First, let's take a look at a BNF description of the concrete
  98 // syntax we are going to generate. The description is embedded in
  99 // the constant string `lambda_syntax' below.
 100
 101 static const str_type lambda_syntax
 102 = ("  <prg> ::= <def> <prg>         Sequence of definitions\n"
 103    "         |  <exp>               Expression to reduce\n"
 104    "\n"
 105    "  <def> ::= def id = <exp>      Constant definition\n"
 106    "\n"
 107    "  <exp> ::= \\ id . <exp>        Lambda abstraction\n"
 108    "         |  ( <exp> <exp> )     Function application\n"
 109    "         |  id                  Variable reference\n"
 110    "\n"
 111    "  id = [a-zA-Z_][0-9a-zA-Z_]*   Identifier\n");
 112
 113 // As you can see, each definition begins with the token `def' and
 114 // the token `=' separates the identifier and the expression of a
 115 // definition. Similarly a lambda abstraction begins with the token
 116 // `\' and a `.' separates the identifier and the expression of a
 117 // lambda abstraction. Function application is a parenthesized pair
 118 // of expressions without a puncturator between the expressions.
 119 //
 120 // The description of the syntax of identifiers above isn't in BNF,
 121 // but is rather a regular expression. An identifier must start
 122 // with a letter or an underscore and can optionally continue with
 123 // digits, letters or underscores.
 124 //
 125 // While not an explicit part of the above syntax description,
 126 // whitespace separates adjacent tokens that could otherwise be
 127 // mistaken to be a single token, but whitespace is otherwise
 128 // ignored.
 129 //
 130 // The constant `simple_example' contains a simple example of the
 131 // syntax.
 132
 133 static const str_type simple_example
 134 = ("  def true = \\t.\\e.t\n"
 135    "  def false = \\t.\\e.e\n"
 136    "  ((true false) true)\n");
 137
 138 // The reader should recognize the definitions of the Church
 139 // booleans above.
 140 //
 141 // For our purpose, which is to be able to print the value of a
 142 // program. it will be sufficient to be able to translate the
 143 // abstract syntax of an expression to the above concrete syntax.
 144 // For this purpose we will define the function `exp_unparse'.
 145
 146 static str_type exp_unparse(exp_type exp) {
 147   DATATYPE_switch
 148     (exp, exp_type,
 149      (Exp_Lambda,(var)(body), {
 150        return str_cat("\\", var, ".", exp_unparse(body),
 151                       str_end); })
 152      (Exp_Apply,(lhs)(rhs), {
 153        return str_cat("(", exp_unparse(lhs),
 154                       " ", exp_unparse(rhs),
 155                       ")",
 156                       str_end); })
 157      (Exp_Var,(var), {
 158        return var; }));
 159 }
 160
 161 // The most interesting thing in `exp_unparse' is the use of the
 162 // `DATATYPE_switch' macro. Everything else should be quite obvious.
 163 //
 164 // The `DATATYPE_switch' macro takes three parameters. The first
 165 // parameter is the expression whose value is to be examined. The
 166 // second is the type of the value. The third parameter is a
 167 // sequence of cases. Each case, represented by a variadic tuple,
 168 // names a constructor and a sequence of variables to be bound to
 169 // the fields of the constructor followed by a block of code to be
 170 // executed if the value matches the constructor. Strictly speaking,
 171 // it isn't necessary to make the code into a block, like is done
 172 // above, by surrouding it with braces, but it makes the code stand
 173 // out making it easier to read.
 174 //
 175 // The `DATATYPE_switch' macro generates code that evaluates the
 176 // given expression and examines its value to decide which case to
 177 // execute, depending on the constructor. Then the variables
 178 // specified along with the constructor pattern are given the values
 179 // of the fields of the variant. The combination of case analysis
 180 // and the binding of variables to fields makes `DATATYPE_switch'
 181 // rather convenient to use compared to writing code with similar
 182 // semantics manually.
 183 //
 184 // You might also make a note of the use of recursion. Recursion is
 185 // a natural way to implement algorithms on inductive data types.
 186 // While C doesn't require a compiler to implement tail recursion
 187 // optimization, a good compiler implements it anyway and the
 188 // cost of using recursion is not prohibitive.
 189 //
 190 // ### Evaluating programs
 191 //
 192 // We will now turn to the evaluation of programs. The simple
 193 // evaluator we will implement here is based on beta reduction, that
 194 // is, direct substitution of values for variables.
 195 //
 196 // We will start by defining the `exp_subst' function, which
 197 // substitutes a given expression for a given variable in another
 198 // expression.
 199
 200 static exp_type exp_subst(exp_type exp, id_type id, exp_type in) {
 201   DATATYPE_switch
 202     (in, exp_type,
 203      (Exp_Lambda,(var)(body), {
 204        return var == id
 205          ? in
 206          : Exp_Lambda(var, exp_subst(exp, id, body)); })
 207      (Exp_Apply,(lhs)(rhs), {
 208        return Exp_Apply(exp_subst(exp, id, lhs),
 209                         exp_subst(exp, id, rhs)); })
 210      (Exp_Var,(var), {
 211        return var == id
 212          ? exp
 213          : in; }));
 214 }
 215
 216 // Then we will define the function `exp_reduce', which reduces, or
 217 // evaluates, a given expression. Our `exp_reduce' has the strict
 218 // semantics, meaning that actual parameters are evaluated before
 219 // substitution. We leave it as exercise to change the interpreter
 220 // to use lazy evaluation.
 221
 222 static exp_type exp_reduce(exp_type exp) {
 223   DATATYPE_switch
 224     (exp, exp_type,
 225      (Exp_Lambda,, {
 226        return exp; })
 227      (Exp_Apply,(lhs)(rhs), {
 228        DATATYPE_switch
 229          (exp_reduce(lhs), exp_type,
 230           (Exp_Lambda,(var)(body), {
 231             return exp_reduce(exp_subst(exp_reduce(rhs),
 232                                         var,
 233                                         body)); })
 234           (Exp_Apply,, {
 235             ERROR_exit("'%s' doesn't reduce to a Lambda.",
 236                        exp_unparse(lhs)); })
 237           (Exp_Var,, {
 238             ERROR_exit("'%s' doesn't reduce to a Lambda.",
 239                        exp_unparse(lhs)); })); })
 240      (Exp_Var,(var), {
 241        ERROR_exit("Unbound variable '%s'.", var); }));
 242 }
 243
 244 // As you can see above, we handle errors in a very simple way. Upon
 245 // encoutering an error, we simply exit with a rather uninformative
 246 // message.
 247 //
 248 // Another thing worth pointing out above is that we didn't specify
 249 // any variables to be bound for the `Exp_Apply' and `Exp_Var'
 250 // constructors in the inner `DATATYPE_switch'. If we had specified
 251 // variables to be bound, we would have gotten warnings for not
 252 // referring to them.
 253 //
 254 // The astute reader familiar with lambda calculus and beta
 255 // reduction should now be thinking that we forgot alpha conversion,
 256 // that is, renaming of bound variables to avoid variable capture.
 257 // We could have handled alpha conversion in `exp_reduce', like is
 258 // often done in mathematical definitions of beta reduction, but
 259 // doing so would be inefficient, leading to repeated alpha
 260 // conversion. Instead, we will implement full renaming of all bound
 261 // variables prior to beta reduction, which is usually more
 262 // efficient.
 263 //
 264 // First we'll define the function `id_fresh' that generates a fresh
 265 // identifier. We make a point of making the form of the generated
 266 // identifier such that it doesn't correspond to the syntax of the
 267 // language.
 268
 269 static id_type id_fresh(id_type base) {
 270   static unsigned int counter = 0;
 271
 272   if (!++counter)
 273     ERROR_exit("Internal counter overflow.");
 274
 275   return str_cat(base, "{", uint_to_str(counter), "}", str_end);
 276 }
 277
 278 // Then we'll define the function `exp_fresh' which renames all
 279 // bound variables in an expression producing a new expression.
 280
 281 static exp_type exp_fresh(exp_type exp) {
 282   DATATYPE_switch
 283     (exp, exp_type,
 284      (Exp_Lambda,(var)(body), {
 285        id_type new_var = id_fresh(var);
 286        return Exp_Lambda(new_var,
 287                          exp_fresh(exp_subst(Exp_Var(new_var), var, body))); })
 288      (Exp_Apply,(lhs)(rhs), {
 289        return Exp_Apply(exp_fresh(lhs), exp_fresh(rhs)); })
 290      (Exp_Var,, {
 291        return exp; }));
 292 }
 293
 294 // The above `exp_fresh' function is still rather inefficient, but
 295 // it is still usually much more efficient than performing alpha
 296 // conversion repeatedly.
 297 //
 298 // There is one more thing left to do. Definitions are not a part of
 299 // the pure lambda calculus and aren't handled by our previous
 300 // functions. For the sake of simplicity, and to demonstrate that
 301 // definitions don't make the language stronger, we will simply
 302 // convert a program into an expression by turning each definition
 303 // into an expression that applies the body of the definition to a
 304 // lambda whose variable is the name of the definition and whose
 305 // body is the rest of the program. This shows that definitions are
 306 // just syntactic sugar.
 307
 308 static exp_type prg_to_exp(prg_type prg) {
 309   DATATYPE_switch
 310     (prg, prg_type,
 311      (Prg_Def,(first)(rest), {
 312        DATATYPE_switch
 313          (first, def_type,
 314           (Def,(var)(body), {
 315             return Exp_Apply(Exp_Lambda(var, prg_to_exp(rest)),
 316                              body); })); })
 317      (Prg_Exp,(last), {
 318        return last; }));
 319 }
 320
 321 // We could now build programs by calling the variant constructors
 322 // by hand and then calling the evaluation functions. However, we
 323 // will not do it, because it is much more convenient to define a
 324 // parser and then use the significantly more concise concrete
 325 // syntax.
 326 //
 327 // ### The parser
 328 //
 329 // We will use the simple backtracking recursive descent parser
 330 // generator ("brd_parser.h") to implement the parser, but first we
 331 // need a function for lexing an identifier token.
 332
 333 static _Bool id_parse(str_type *pstr, id_type *id) {
 334   assert(pstr);
 335   assert(*pstr);
 336
 337   str_type str = *pstr;
 338
 339   if (!isalpha(*str) && '_' != *str)
 340     return 0;
 341
 342   while (isalnum(*str) || '_' == *str)
 343     ++str;
 344
 345   *id = str_substr(*pstr, str);
 346   *pstr = str;
 347
 348   return 1;
 349 }
 350
 351 // With the `id_parse' and a couple of functions provided by the
 352 // `str'-module ("str.h" and "str.c"), namely `str_skip_spaces' and
 353 // `str_match_prefix', we are ready to define the parser.
 354
 355 BRD_PARSER
 356 (static, prg_parse,
 357  str_skip_spaces, str_match_prefix,
 358  (id, id_type, id_parse),
 359  (prg, prg_type,
 360   ((def, first)(prg, rest),          *prg = Prg_Def(first, rest); )
 361   ((exp, last),                      *prg = Prg_Exp(last); ))
 362  (def, def_type,
 363   (("def")(id, var)("=")(exp, body), *def = Def(var, body); ))
 364  (exp, exp_type,
 365   (("\\")(id, var)(".")(exp, body),  *exp = Exp_Lambda(var, body); )
 366   (("(")(exp, lhs)(exp, rhs)(")"),   *exp = Exp_Apply(lhs, rhs); )
 367   ((id, var),                        *exp = Exp_Var(var); )))
 368
 369 // The `BRD_PARSER' macro implements the simple parser generator.
 370 // The first two parameters of the macro specify the linkage and
 371 // name of the main entry point to the generated parser. The next
 372 // two parameters specify what to do between tokens and how to match
 373 // punctuation tokens. The next parameter is a sequence of token
 374 // specifications each being a triple of the name, type and lexing
 375 // function. The last parameter is a sequence of productions,
 376 // defining the grammar, with semantic actions.
 377 //
 378 // The `BRD_PARSER' macro then generates a simple backtracking
 379 // recursive descent parser according to the grammar that executes
 380 // the semantic actions while parsing. The semantic actions should
 381 // not have side effects, because, for simplicity, the generated
 382 // parser doesn't wait until it has a complete parse, but rather
 383 // calls the actions immediately upon recognizing a complete
 384 // production.
 385 //
 386 // The above parser doesn't recognize `def' as a keyword, but
 387 // instead allows an identifier named `def'. This could lead to
 388 // syntax errors being reported late. We leave it as an exercise to
 389 // treat `def' properly as a keyword that is not allowed as an
 390 // identifier.
 391 //
 392 // ### Putting it all together
 393 //
 394 // We now have all the ingredients to write a function that
 395 // evaluates a given program. The below `eval' function takes a
 396 // string, parses the string to yield a program, and then executes
 397 // the program using the functions we defined in previous sections.
 398
 399 static str_type eval(str_type code) {
 400   str_type str = code;
 401   prg_type prg = 0;
 402   if (!prg_parse(&str, &prg))
 403     ERROR_exit("Couldn't parse '%s'.", str);
 404   str_skip_spaces(&str);
 405   if ('\0' != *str)
 406     ERROR_exit("Garbage '%s' following program.", str);
 407
 408   return exp_unparse(exp_reduce(exp_fresh(prg_to_exp(prg))));
 409 }
 410
 411 // The only thing left to do in `main' is to check the command line
 412 // arguments and display usage instructions when appropriate. As a
 413 // final twist, we use the interpreter to evaluate the simple
 414 // example that is part of the instructions.
 415
 416 int main(int argc, char* argv[]) {
 417   if (2 != argc) {
 418     fprintf
 419       (stderr,
 420        "Usage: lambda '<prg>'\n"
 421        "\n"
 422        "The entire program must be a single argument.\n"
 423        "\n"
 424        "Syntax:\n"
 425        "\n"
 426        "%s"
 427        "\n"
 428        "For example, the program\n"
 429        "\n"
 430        "%s"
 431        "\n"
 432        "would evaluate to\n"
 433        "\n"
 434        "  %s\n"
 435        "\n"
 436        "The numbers inside braces are produced by alpha conversion.\n",
 437        lambda_syntax,
 438        simple_example,
 439        eval(simple_example));
 440     return EXIT_FAILURE;
 441   }
 442
 443   printf("%s\n", eval(argv[1]));
 444
 445   return EXIT_SUCCESS;
 446 }
 447
 448 // ### Exercises
 449 //
 450 // 1. Extend the parser to allow comments and to treat `def' as a
 451 //    keyword.
 452 //
 453 // 2. Change the interpreter to use lazy evaluation.
 454 //
 455 // 3. Design and implement better error handling and error messages.