docs/refman.tex

   1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2 %
   3 %  Here are some abbreviations used within this document
   4 %
   5 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   6 \newfont{\bsf}{cmssbx10}
   7 \newfont{\largebsf}{cmssbx12}
   8 %\newfont{\nontermfont}{cmssbx10}
   9 \newcommand{\nontermfont}{\it}
  10 \newcommand{\codefont}{\tt}
  11 \newcommand{\symfont}{\bf}
  12
  13 \newcommand{\Prop}{{\bsf Prop}}               % typeset in sans serif
  14 \newcommand{\LargeProp}{{\largebsf Prop}}
  15 \newcommand{\plusplus}{{\tt++}}             % ++
  16 \newcommand{\Cpp}{{C{\tt++}}}               % C++ logo
  17 \newcommand{\Gpp}{{G{\tt++}}}               % G++ logo
  18 \newcommand{\email}{{\tt leunga@cs.nyu.edu}}  % my email address
  19 \newcommand{\web}{{\tt http://valis.cs.nyu.edu:8888/\~{}leunga}} % my web page
  20 \newcommand{\propweb}{\web{\tt/prop.html}}
  21 \newcommand{\Version}{2.3.4}                  % Prop version
  22 \newcommand{\comment}[1]{}
  23
  24 \newcommand{\sectionfont}{}
  25 %\newcommand{\Section}[1]{\section{\sectionfont #1}}
  26 %\newcommand{\Subsection}[1]{\subsection{\sectionfont #1}}
  27 %\newcommand{\Subsubsection}[1]{\subsubsection{\sectionfont #1}}
  28
  29 \newcommand{\Section}[1]{\pagebreak\section{\sectionfont #1}%
  30    \markright{\usebox{\LINE}\large \thesection. \sc #1}
  31    }
  32 \newcommand{\Subsection}[1]{\subsection{\sectionfont #1}}
  33 \newcommand{\Subsubsection}[1]{\subsubsection{\sectionfont #1}}
  34
  35 \newcommand{\N}[1]{\mbox{{\nontermfont #1}}}
  36 \newcommand{\T}[1]{{\codefont #1}}
  37 \newcommand{\MORE}[2]{#1\T{#2} \mbox{$\ldots$} \T{#2} #1}
  38 \newcommand{\ALT}[2]{#1\ \T{#2} \mbox{$\ldots$} \T{#2} #1}
  39 \newcommand{\LIST}[1]{\MORE{#1}{,}}
  40 \newcommand{\SEQ}[1]{\MORE{#1}{;}}
  41 \newcommand{\OPT}[1]{{\it [ #1 ]}}
  42 \newcommand{\C}[1]{\mbox{\rm #1}}
  43 \newcommand{\IS}{{\symfont ::=}}
  44 \newcommand{\OR}{\ \mbox{$\symfont \mid$}\ }
  45 \newcommand{\Id}{\N{Id}\ }
  46 \newcommand{\Pat}{\N{Pat}\ }
  47 \newcommand{\Exp}{\N{Exp}\ }
  48 \newcommand{\TypeExp}{\N{Type\_Exp}\ }
  49 \newenvironment{syntax}{\begin{quotation}\noindent\begin{tabular}{lcll}} %
  50    {\end{tabular}\end{quotation}}
  51
  52 \newcommand{\CLASSDEF}[1]{\T{#1}\index{Classes!{\codefont #1}}}
  53 \newcommand{\INDEX}[1]{\index{#1}}
  54 \newcommand{\NDEF}[1]{\N{#1}\index{Syntax!Non--terminals!{\nontermfont #1}}}
  55 \newcommand{\OPTIONDEF}[1]{{\tt #1}\index{Command line options!{\tt #1}}}
  56 \newcommand{\TDEF}[1]{\T{#1}\index{Syntax!Keywords!{\codefont #1}}}
  57 \newcommand{\INCOMPLETE}{{\em This section is incomplete.}}
  58 \newenvironment{Error}{\begin{center}\begin{tabular}{p{3in}p{3in}}
  59   \bf Error & \bf Explanation}{\\\hline\end{tabular}\end{center}}
  60 \newcommand{\errormsg}[1]{\\\hline {\tt #1}&}
  61 \newenvironment{Tips}{\noindent {\bf Tips:} \quad}{}
  62
  63 \newcommand{\support}{
  64    {\small\noindent This work is supported in part by
  65     an award from Hewlett-Packard Corporation, from IBM corporation, and
  66     an NYU Research Challenge Grant.
  67    }}
  68 \newcommand{\warning}{
  69    {\small\noindent The author will neither assume responsibility for any
  70    damages caused by the use of this product, nor accept warranty or update
  71    claims.  This product is distributed in the hope that it will be useful,
  72    but WITHOUT ANY WARRANTY; without even the implied warranty of
  73    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  74
  75    \small\noindent This product is in the public domain,
  76    and may be freely distributed.
  77
  78    \small\noindent {\sf Prop} is a research prototype.
  79     The information contained in this document is subject
  80     to change without notice.
  81    }}
  82
  83    \makeindex
  84
  85 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  86 %
  87 %   Beginning of the document
  88 %
  89 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  90 \documentclass{article}
  91 \usepackage{html}
  92
  93    \pagestyle{myheadings}
  94
  95    \title{{\Huge \bf Prop \\
  96            Language Reference Manual} \\
  97           {\large \bf (for version \Version)}
  98          }
  99    \author{{\Large Allen Leung} \\
 100            E-mail: \email \\
 101            \htmladdnormallink{\web}{http://valis.cs.nyu.edu:8888/~leunga}
 102            \\
 103            Courant Institute of Mathematical Sciences \\
 104            251 Mercer Street \\
 105            New York, NY 10012 \\
 106           }
 107
 108    \setlength{\textwidth}{6.6in}
 109    \setlength{\evensidemargin}{0in}
 110    \setlength{\oddsidemargin}{0in}
 111    \setlength{\textheight}{8.75in}
 112    \setlength{\topmargin}{-0.5in}
 113    \setlength{\parskip}{1mm}
 114    \setlength{\parindent}{3ex}
 115    \setlength{\itemsep}{0.5ex}
 116
 117    \newsavebox{\LINE}
 118    \savebox{\LINE}[0.0in][l]{\rule[-1ex]{\textwidth}{.01in}}
 119 \begin{document}
 120
 121    \maketitle
 122    \vspace{\fill}
 123    %\support
 124
 125    \begin{abstract}
 126     This reference manual provides an introduction to
 127     \Prop, release \Version.  \Prop{} is a multiparadigm extension of \Cpp,
 128     and is designed for building high performance compiler
 129     and language transformation systems, using pattern matching and rewriting.
 130     This guide describes the syntax and semantics of
 131     the \Prop{} language and describes how to develop programs
 132     using the \Prop{} to \Cpp{} translator.
 133
 134    \end{abstract}
 135    \warning
 136    \titlepage
 137    \tableofcontents
 138    \bibliographystyle{alpha}
 139
 140 \Section{Introduction} \label{sec:intro}
 141
 142    This is the language reference manual for \Prop, a compiler generator.
 143 \Prop{} can be used to generate lexers, parsers, analysis and
 144 transformation tools using pattern matching and rewriting.
 145 The \Prop{} language is a {\em multi-paradigm}
 146 extension of \Cpp\cite{C++} and
 147 \Cpp{} programs generated from \Prop{} specifications can be compiled
 148 into efficient code.
 149
 150    Version \Version{} of \Prop{} includes the following major features:
 151 \begin{enumerate}
 152    \item Algebraic datatypes and pattern matching, and automatic
 153 mapping of algebraic datatypes into \Cpp{} class hierarchies.
 154    \item A LALR(1) parser generator in the tradition of {\it yacc} and
 155 {\it bison}.  Each grammar is encapsulated into a parser class and
 156 it is possible to have multiple instances of a parser.
 157    \item Regular expression string matching and lexer generation.
 158    \item Transformation of algebraic datatypes using conditional
 159    rewriting.
 160    \item In addition, using the view mechanism,
 161 it is possible to treat externally defined C or \Cpp{}
 162 data structures as if they were \Prop{} defined datatypes.
 163 The full range of pattern matching and rewriting capability can
 164 be used on these externally defined data..
 165 \end{enumerate}
 166
 167    In addition, we intend to implement/fix-up the following features in the
 168 near future:
 169 \begin{enumerate}
 170    \item Persistence.
 171    \item Pretty printing/reading.
 172    \item Visualization of data structures using {\em vcg}\cite{vcg,vcg-manual}.
 173    \item Automatic mapping of graph structure specifications into
 174 \Cpp{} classes.
 175    \item Graph rewriting system generation.
 176 \end{enumerate}
 177
 178 \Subsection{Availability}
 179
 180    The latest release of \Prop{} \Version{} is available from
 181 \begin{quotation}
 182 \htmladdnormallink{\propweb}{http://valis.cs.nyu.edu:8888/~leunga/prop.html}
 183 \end{quotation}
 184 New updates of the source code and documentation will be available
 185 from this site.
 186
 187 \Subsection{Organization of this Reference Manual}
 188    This reference manual is organized as follows.  Section~\ref{sec:general}
 189 overviews the general features of \Prop.
 190 Section~\ref{sec:parser}
 191 describes the lexer and parser specification formalisms.
 192 Section~\ref{sec:datatype} describes the algebraic datatypes and
 193 pattern matching features.  Section~\ref{sec:tree-rewriting} describes
 194 the tree rewriting mechanism.  Finally, section~\ref{sec:usage} describes
 195 the process of running the \Prop{} translator.
 196
 197    We'll use the following extended BNF notation to specify the syntax
 198 of the language.  Terminals are typeset in {\tt typewriter} font,
 199 while non-terminals are typeset in {\nontermfont itallics} font.
 200 Given a syntactic class \N{S}, we use \LIST{\N{S}} to denote one
 201 or more occurrences of \N{S} separated by commas.  Similarly,
 202 \SEQ{\N{S}} denotes one or more occurrences of \N{S} separated by
 203 semi-colons.  Slanted brackets are used to denote an optional occurrence
 204 of a syntactic class.   We'll combined the two forms when we
 205 want to denote zero or more occurrences of some term.
 206 For example, \OPT{\LIST{\N{S}}} denotes
 207 zero or more occurrences of \N{S}.
 208
 209 \Section{General Concepts} \label{sec:general}
 210
 211 \Subsection{Syntactic Conventions}
 212
 213 The syntax of \Prop{} is an extension of \Cpp.  Most new constructs
 214 of \Prop{} starts with new keywords, listed below.
 215 \begin{center}
 216 \tt
 217 \begin{tabular}{lllllll}
 218 applicative & arb & as & bagof & begin & bitfield & card \\
 219 category & classof & collectable & constraint & dataflow & datatype & declare \\
 220 dequeof & dom & domain & edges: & elsif & end & equiv: \\
 221 exists & expect: & feature & finalizable & forall & fun & function \\
 222 functor & graphrewrite & graphtype & implies: & inference & inherited & inline \\
 223 instantiate & is & law & left: & less & lexeme & listof \\
 224 loop & mapof & match & matchall & matchscan & module & multimapof \\
 225 mutable & nodes: & of & persistent & prec: & printable & priqueueof \\
 226 procedure & queueof & ran & refine & relation & rewrite & right: \\
 227 setof & sharing & signature & space: & syntax & synthesized & then \\
 228 time: & topdown & traced & treeparser & tupleof & unifiable & unique \\
 229 view & where & with & xor: & bottomup: & topdown: & before: \\
 230 preorder: & postorder: & cutrewrite & failrewrite & index: \\
 231 \end{tabular}
 232 \end{center}
 233
 234    In addition, the following new multi-character symbols are added:
 235 \begin{verbatim}
 236    <->  <=>  :=  :-  .(   .[
 237    #[   #(   #{  [|  |]   (|  |)  {|  |}
 238 \end{verbatim}
 239
 240 \Subsection{Basic syntactic classes}
 241
 242
 243 We'll use \NDEF{Integer}, \NDEF{Char}, \NDEF{Real}, \NDEF{String}
 244 and \NDEF{Id} to denote the syntactic classes of
 245 integers, characters, real numbers, strings and identifiers, respectively.
 246 As in \Cpp, characters are quoted with the single quotes \verb|'|, while
 247 strings are quoted with double quotes \verb|"|.  An identifier is
 248 any sequence of alphanumeric of characters that begins with a letter.
 249 In addition, we'll use the syntactic class
 250 \NDEF{Stmt} to denote any valid combination of \Cpp{} and \Prop{}
 251 statements.
 252
 253 \Subsection{Literals}
 254
 255   In addition to the usual \Cpp{} literals, \Prop{} introduces two new type
 256 of literals: {\em quark literals} and {\em regular expression literals}.
 257 Quark literals are of type \T{Quark} and are written as strings
 258 prefixed by the \verb|#| symbol.
 259 \begin{syntax}
 260   \NDEF{Quark} & \IS & \T{\#}\N{String} \\
 261 \end{syntax}
 262
 263 Quarks act like atoms in Lisp, in which name equality implies
 264 pointer equality.  In contrast, in \Cpp{} two string literals that are
 265 identical in value do not necessarily reside in the same address.
 266 Given strings \verb|s1| and \verb|s2|, \verb|strcmp(s1,s2) == 0| does
 267 not imply \verb|s1 == s2|.  Quarks are defined in the library include file
 268 \verb|<AD/strings/quark.h>|.
 269
 270 Regular expression literals are similar to string literals except
 271 that they are quoted by slashes \verb|/|.   We'll discuss
 272 the regular expression in section~\ref{sec:regular-expressions}.
 273
 274 \Subsection{The \Prop{} Language}
 275
 276 The basic \Prop{} language is the same as \Cpp{} with the following
 277 syntactic extensions:
 278 \begin{quotation}
 279 \begin{tabular}{lll}
 280          & \bf Keywords              & \bf Functions \\
 281   \bsf 1 & \T{datatype} \T{refine} \T{instantiate}
 282                                      & Algebraic datatype definitions \\
 283   \bsf 2 & \T{match} \T{matchall}    & Pattern matching \\
 284   \bsf 3 & \T{rewrite}               & Rewriting \\
 285   \bsf 4 & \T{lexeme}                & Lexical category definition \\
 286   \bsf 5 & \T{matchscan}             & Lexical scanning \\
 287   \bsf 6 & \T{syntax}                & Parser specification \\
 288   \bsf 7 & \T{graphtype}             & Graph data structure specification \\
 289   \bsf 8 & \T{graphrewrite}          & Graph rewriting \\
 290 \end{tabular}
 291 \end{quotation}
 292
 293 Like \Cpp, a \Prop{} is typically divided into the {\em specification},
 294 which defines the data structure and its interface, and the
 295 {\em implementation} parts.   \Prop{} specifications should
 296 be placed in a file with a \verb|.ph| suffix, while an implementation
 297 should be placed in a file with a \verb|.pC|\footnote{The suffixes {\tt .pcc},
 298 {\tt .pCpp} etc. can also be used.}.  The translator will convert each
 299 \verb|.p|$xxx$ into a file with the suffix \verb|.|$xxx$.  The translated
 300 output can then be fed to the \Cpp{} compiler for compilation.
 301 \vspace{\fill}
 302
 303 \Section{Lexer and Parser Generation} \label{sec:parser}
 304
 305    In this section we'll describe the lexer and parser generation
 306 mechanism of \Prop.  Note that the use of these features is not
 307 mandatory: the user can substitute any other lexer/parser generation tools
 308 or use hand written lexers and parsers.
 309 On the other hand, higher level of integration is possible when using
 310 \Prop's lexer and parser mechanisms.
 311
 312 \Subsection{Lexer Specification}
 313    Lexical analyzers are specified using the \verb|matchscan|
 314 statement.  This construct is responsible for generating the actual string
 315 matching DFA.  The actual buffering mechanisms are provided by the
 316 classes \verb|LexerBuffer|, \verb|IOLexerBuffer| and \verb|IOLexerStack|.
 317 These classes are part of the support library distributed with \Prop.
 318
 319    We assume that the user is familiar with lexer generator tools like
 320 {\em lex}\cite{lex} or {\em flex}\cite{flex}.
 321
 322 \Subsubsection{Regular expressions} \label{sec:regular-expressions}
 323    Figure~\ref{fig:regexp} describes the set of current
 324 supported regular expressions in \Prop.  The syntax is similar
 325 to what is found in {\em lex}, or {\em egrep}.
 326
 327 \begin{figure}[htbp]
 328 \begin{center}
 329   \begin{tabular}{|l|l|} \hline
 330     $c$                & matches character $c$ if it is not a meta character \\
 331     $e_1e_2$           & matches $e_1$ then $e_2$ \\
 332     \verb|.|           & matches any character except \verb|\n| \\
 333     $\verb|\|c$        & matches escape sequence $c$ \\
 334     \verb|^|$e$        & matches $e$ at the start of the line \\
 335     $e_1\verb.|.e_2$   & matches $e_1$ or $e_2$ \\
 336     $e\verb|*|$        & matches zero or more $e$ \\
 337     $e\verb|+|$        & matches one or more $e$ \\
 338     $e\verb|?|$        & matches zero or one $e$ \\
 339     \verb|(|$e$\verb|)| & grouping \\
 340     \verb|<<|$C_1, C_2 \ldots C_n$\verb|>>|$e$ & matches $e$ only
 341          if we are in a context from one of $C_1, C_2, \ldots C_n$ \\
 342     \verb|{|{\em lexeme}\verb|}| & matches lexeme \\
 343   \hline
 344   \end{tabular}
 345 \end{center}
 346 \caption{\label{fig:regexp} Regular expressions.}
 347 \end{figure}
 348
 349 The symbols \verb%\ [ ] ( ) { } << >> * + . - ? |% are meta characters and are
 350 interpreted non-literally.  The escape character \verb|\|
 351 can be prepended to the meta characters if they occur as literals in context.
 352
 353 Precedence-wise, meta characters \verb|*|, \verb|+| and \verb|?| bind tighter
 354 than juxtaposition.  Thus the regular expression
 355 \verb|ab*| means \verb|a(b*)|.  Parenthesis can be used to override
 356 the default precedence.
 357
 358 Character classes are of the form as found in {\em lex}:
 359 (i) $c_1$\verb|-|$c_2$ denotes the range of characters from
 360 $c_1$ to $c_2$; (ii) single (non-meta) characters denote themselves;
 361 (iii) the meta character \verb|^| can be used to negate the set.
 362 For example, the regular expression \verb|[a-zA-Z][a-zA-Z0-9]*|
 363 specifies an alphanumeric identifier that must starts with a letter.
 364 Similarly, the regular expression \verb|[^ \t\n]| matches any character
 365 except a space, a tab or a newline.
 366
 367 {\em Lexemes} are simply abbreviated names given to a regular expression
 368 pattern.  They act like macros in {\em lex}.
 369
 370 While a lexer is scanning, it may be in one of many {\em contexts}.
 371 Contexts can be used to group a set of related lexical rules; such rules
 372 are only applicable when the contexts are active.  This makes the
 373 lexer behave like a set of DFAs, with the ability to switch between
 374 DFAs under programmer control.
 375
 376
 377 \Subsubsection{Lexeme}
 378
 379 Lexemes are defined using the \T{lexeme} declaration in \Prop.
 380 Its syntax is
 381 \begin{syntax}
 382   \NDEF{Lexeme\_Decl} & \IS & \TDEF{lexeme} \ALT{\N{Lexeme\_Eq}}{|} \T{;} \\
 383   \NDEF{Lexeme\_Eq}   & \IS & \Id \T{=} \N{Regexp} \\
 384 \end{syntax}
 385
 386 For example, the following lexeme definition is used in the \Prop{}
 387 translator to define the lexical structure of common lexical items.
 388
 389 \begin{verbatim}
 390 lexeme digits      = /[0-9]+/
 391      | sign        = /[\+\-]/
 392      | integer     = /{digits}/
 393      | exponent    = /[eE]{sign}?{digits}/
 394      | mantissa    = /({digits}\.{digits}?|\.{digits})/
 395      | real        = /{mantissa}{exponent}?/
 396      | string      = /"([^\\"\n]|\\.)*"/
 397      | character   = /'([^\\'\n]|\\.[0-9a-f]*)'/
 398      | regexp      = /\/([^\/\n*]|\\.)([^\/\n]|\\.)*\//
 399      ;
 400 \end{verbatim}
 401
 402 Note that regular expression literals are written between
 403 slashes: \verb|/|$re$\verb|/|.
 404
 405 \Subsubsection{Lexeme class}
 406
 407 Often we wish to group a set of lexemes together into {\em lexeme classes} if
 408 they logically behave in some uniform manner; for example, if they
 409 act uniformly in a lexical rule.  By grouping related lexemes together
 410 into a class we can refer to them succinctly by their class name.
 411
 412 The syntax of a lexeme class declaration is
 413
 414 \begin{syntax}
 415   \NDEF{Lexeme\_Class\_Decl} & \IS & \TDEF{lexeme class}
 416        \ALT{\N{Lexeme\_Class\_Eq}}{and} \T{;} \\
 417   \NDEF{Lexeme\_Class\_Eq} & \IS & \Id \T{=} \ALT{\N{Lexeme\_Spec}}{|} \\
 418   \NDEF{Lexeme\_Spec} & \IS & \N{String} \\
 419                    & \OR & \Id \N{Regexp} \\
 420 \end{syntax}
 421
 422 The following example,
 423 the lexeme classes \verb|MainKeywords|, \verb|Symbols|, and \verb|Literals|
 424 are defined.
 425 \begin{verbatim}
 426 lexeme class MainKeywords =
 427     "rewrite" | "inference" | "match" | "matchall" | "matchscan"
 428   | "refine"  | "classof" | "type" | "datatype" | "instantiate"
 429   | "lexeme"  | "bitfield" | "begin" | "syntax"
 430   | "dataflow" | "module" | "signature" | "constraint" | "declare"
 431   | "procedure" | "fun" | "function" | "domain"
 432   | "graphtype" | "graphrewrite"
 433
 434 and  Symbols =
 435     ".." | "..." | "<->" | "::" | "&&" | "||" | "++" | "--" | "->"
 436   | "<<" | ">>" | ">=" | "<=" | "+=" | "-=" | "*=" | "/=" | "%=" | "=="
 437   | "!=" | "<<=" | ">>=" | "&=" | "|=" | "^=" | "=>" | "<-" | "<=>"
 438   | ":=" | ":-" | LONG_BAR /\-\-\-\-\-+/
 439
 440 and Literals =
 441     INT_TOK     /{integer}/
 442   | REAL_TOK    /{real}/
 443   | CHAR_TOK    /{character}/
 444   | STRING_TOK  /{string}/
 445   ;
 446 \end{verbatim}
 447
 448 \Subsubsection{Tokens}
 449
 450 Lexical tokens are defined using the \verb|datatype| declaration.
 451 The syntax is as follows.
 452 \begin{syntax}
 453   \NDEF{Tokens\_Decl} & \IS & \TDEF{datatype} \Id \T{::} \TDEF{lexeme} \T{=} \\
 454       & & \quad \ALT{\N{Token\_Spec}}{|} \T{;} \\
 455   \NDEF{Token\_Spec} & \IS & \TDEF{lexeme class} \Id &
 456        \C{Include a lexeme class} \\
 457                   & \OR & \Id \OPT{\N{Regexp}}  &
 458        \C{Single token spec} \\
 459 \end{syntax}
 460
 461 A token datatype is defined by including one of more lexeme classes,
 462 and by defining stand-alone tokens.  If a lexeme class is included,
 463 all the tokens defined within the lexeme class are included.  A \Cpp{}
 464 \T{enum} type of the same name as the token datatype is generated.
 465 If a token is given an identifier name, then the same name is used
 466 as the \T{enum} literal.  On the other hand, if a string is used
 467 to denote a token, then it can be referred to by prefixing the
 468 string with a dot \verb|.|  For example, the token \verb|"=>"| can
 469 be referenced as \verb|."=>"| within a program.
 470
 471 As an example, the following token datatype definition is used within
 472 the \Prop{} translator.  Here, the keywords are first partitioned into
 473 6 different lexeme classes.  In additional, the tokens \T{ID\_TOK},
 474 \T{REGEXP\_TOK}, etc. are defined.
 475
 476 \begin{verbatim}
 477 datatype PropToken :: lexeme =
 478     lexeme class MainKeywords
 479   | lexeme class Keywords
 480   | lexeme class SepKeywords
 481   | lexeme class Symbols
 482   | lexeme class Special
 483   | lexeme class Literals
 484   | ID_TOK       /{patvar}/
 485   | REGEXP_TOK   /{regexp}/
 486   | QUARK_TOK    /#{string}/
 487   | BIGINT_TOK   /#{sign}{integer}/
 488   | PUNCTUATIONS /[\<\>\,\.\;\&\|\^\!\~\+\-\*\/\%\?\=\:\\]/
 489   ;
 490 \end{verbatim}
 491
 492 \Subsubsection{The \T{matchscan} statement}\label{sec:matchscan}
 493
 494 The \T{matchscan} statement is used to perform tokenization.  The user can
 495 specify a set of string pattern matching rules within a \T{matchscan}
 496 construct. Given a object of class \verb|LexerBuffer|, the \T{matchscan}
 497 statement looks for the rule that matches the longest prefix from
 498 the input stream and executes the action associated with the rule.
 499 Ties are broken by the lexical ordering of the rules.
 500
 501 The general syntax is as follows:
 502 \begin{syntax}
 503 \NDEF{Matchscan}
 504       & \IS & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}} \T{(} \N{Exp} \T{)}
 505              & variant 1 \\
 506       & &  \T{\{} \ALT{\T{case} \N{Matchscan\_Rule}}{} \T{\}} \\
 507       & \OR & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}}
 508             \T{(} \N{Exp} \T{)} & variant 2 \\
 509       & &   \T{\{} \ALT{\N{Matchscan\_Rule}}{|} \T{\}} \\
 510       & \OR & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}} \T{(} \N{Exp} \T{)}
 511             \T{of} & variant 3 \\
 512       & & \quad \ALT{\N{Matchscan\_Rule}}{|} \\
 513       & & \T{end} \T{matchscan} \T{;}\\
 514 \NDEF{Matchscan\_Mode} & \IS &
 515            \TDEF{matchscan} \OPT{\T{while}} & case sensitive\\
 516    & \OR & \TDEF{matchscan*} \OPT{\T{while}}& case insensitive\\
 517 \NDEF{Context\_Spec} & \IS & \T{[} \LIST{\N{Id}} \T{]} \\
 518 \NDEF{Matchscan\_Rule}
 519       & \IS & \OPT{\T{<<} \ALT{\N{Context}}{,} \T{>>}} \\
 520       &     & \quad \T{lexeme} \T{class} \N{Id} \T{:} \N{Matchscan\_Action} \\
 521       & \OR & \OPT{\T{<<} \ALT{\N{Context}}{,} \T{>>}} \\
 522       &    &  \quad \N{Regexp} \T{:} \N{Matchscan\_Action} \\
 523 \NDEF{Matchscan\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
 524           & \OR & \N{Code} & for variant 1 only \\
 525 \end{syntax}
 526
 527 The two different modes of operation are \T{matchscan} and \T{matchscan*},
 528 which respectively match strings case sensitively and insensitively.
 529 The modifier \T{where} may optionally specify that the matching process
 530 should repeat until no rules apply, or the end of stream condition
 531 is reached.
 532
 533 By default, if no rules apply and if the input stream is non-empty,
 534 then an error has occurred.  The \T{matchscan} statement will
 535 invoke the method \verb|error()| of the \T{LexerBuffer} object by default.
 536
 537 For example, the following is part of the \Prop{} lexer specification.
 538
 539 \begin{verbatim}
 540    datatype LexicalContext = NONE | C | PROP | COMMENT | ...;
 541
 542 int PropParser::get_token()
 543 {
 544    matchscan[LexicalContext] while (lexbuf)
 545    {
 546       ...
 547    |  <<C>> /[ \t\\\014]/:            { emit(); }
 548    |  <<C>> /(\/\/.*)?\n/:            { emit(); line++; }
 549    |  <<C>> /^#.*/:                   { emit(); }
 550    |  <<PROP>> lexeme class MainKeywords: { return ?lexeme; }
 551    |  <<PROP>> lexeme class SepKeywords: { return ?lexeme; }
 552    |  <<PROP>> QUARK_TOK:             { return QUARK_TOK; }
 553    |  <<PROP>> BIGINT_TOK:            { return BIGINT_TOK; }
 554    |  <<PROP>> REGEXP_TOK:            { return REGEXP_TOK; }
 555    |  <<PROP>> PUNCTUATIONS:          { return lexbuf[0]; }
 556    |  <<PROP>> /[ \t\014]/:           { /* skip */ }
 557    |  <<PROP>> /(\/\/.*)?\n/:         { line++; }
 558    |  /\/\*/:                         { emit(); set_context(COMMENT); }
 559    |  <<COMMENT>> /\*\//:             { emit(); set_context(PROP); }
 560    |  <<COMMENT>> /\n/:               { emit(); line++; }
 561    |  <<COMMENT>> /./:                { emit(); }
 562    |  /./: { error("%Lillegal character %c\n", lexbuf[0]); }
 563    }
 564 }
 565 \end{verbatim}
 566
 567 Here, the lexer is partitioned in multiple lexical contexts: context
 568 \verb|C| deals with \Cpp{} code while context \verb|PROP| deals with
 569 \Prop{} extensions.  The special context \verb|COMMENT| is used
 570 to parse \verb|/* */| delimited comments.  Contexts are changed
 571 using the \verb|set_context| method defined in class \verb|LexerBuffer|.
 572
 573 The special variable \verb|?lexeme| can be used within a rule
 574 that matches a lexeme class.  For example, within the rule
 575 \begin{verbatim}
 576    |  <<PROP>> lexeme class MainKeywords:    { return ?lexeme; }
 577 \end{verbatim}
 578 the variable \verb|?lexeme| is bound to
 579 the token \verb|."rewrite"| if the string ``rewrite'' is matched;
 580 it is bound to the token \verb|."inference"| if the string
 581 ``inference'' is matched and so on.
 582
 583 \Subsubsection{Class \T{LexerBuffer}}
 584 We'll next describe the class \T{LexerBuffer} and its subclasses.
 585
 586 Class \CLASSDEF{LexerBuffer}
 587 is the base class in the lexical buffer hierarchy.
 588 It is defined in the library include file \verb|<AD/automata/lexerbuf.h>|.
 589 This class is responsible for implementing a string buffer for use
 590 during lexical analysis.
 591
 592 As it stands, it can be used directly if the lexer input is directly
 593 from a string.  Memory management of the buffer is assumed to be handled
 594 by the user.
 595
 596 The class \T{LexerBuffer} has three constructors.  The default
 597 constructor initializes the string buffer to NULL.  The two
 598 other constructors initialize the string buffer to a string given
 599 by the user.  In the case when the length is not supplied, the buffer
 600 is assumed to be \verb|'\0'|-terminated.  The two \verb|set_buffer| methods
 601 can be used to set the current string buffer.  Notice that
 602 all lexical analysis operations are done in place.  The user should
 603 not alter the string buffer directly, but should use the interface
 604 provided by this class instead.
 605
 606 \begin{verbatim}
 607 class LexerBuffer {
 608 public:
 609    LexerBuffer();
 610    LexerBuffer(char *);
 611    LexerBuffer(char *, size_t);
 612    virtual ~LexerBuffer();
 613    virtual void set_buffer (char *, size_t);
 614            void set_buffer (char *);
 615 };
 616 \end{verbatim}
 617
 618 The following methods are used access the string buffer.
 619 Method \T{capacity} returns the size of the buffer.  Method
 620 \T{length} returns the length of the current matched token.
 621 Methods \T{text} can be used to obtain a point to location
 622 of the current matched token.  The string returned is guaranteed
 623 to be \verb|'\0'|-terminated.  Methods \T{operator []} return
 624 the $i$th character of the token.  Finally, method \T{lookahead}
 625 returns the character code of the next character to be matched.
 626
 627 \begin{verbatim}
 628    int capacity () const;
 629    int length   () const;
 630    const char * text () const;
 631          char * text ();
 632    char  operator [] (int i) const;
 633    char& operator [] (int i);
 634    int lookahead () const;
 635    void push_back (int n)
 636 \end{verbatim}
 637
 638 In addition to the string buffer, the class \T{LexerBuffer} keeps
 639 track of two additional types of information: the current context
 640 of the DFA, and whether the next token starts at the
 641 beginning of the line, or in our terminology, whether it is
 642 {\em anchored}.  These are manipulated with the following methods:
 643
 644 \begin{verbatim}
 645    int context    () const;
 646    void set_context (int c = 0);
 647    Bool is_anchored() const;
 648    void set_anchored(Bool a = true);
 649 \end{verbatim}
 650
 651 Finally, the following methods should be redefined by subclasses
 652 to alter the behavior of this class.  By default, the class \T{LexerBuffer}
 653 calls \T{fill\_buffer()} when it reaches the end of the string; subclasses
 654 can use this method to refill the buffer and return the number of
 655 characters read.  Currently, \T{fill\_buffer} is defined to do nothing
 656 and return 0.  When it reaches the end of the file
 657 (i.e. when \T{fill\_buffer()} fails to refill the buffer and the
 658 scanning process finishes), method \T{end\_of\_file} is called.
 659 Currently, this is a no-op.  Finally, the error handling routine
 660 \T{error()} is called with the position of the beginning and the end
 661 of the buffer in which the error occurs.  By default, this routine
 662 prints out the buffer.
 663
 664 \begin{verbatim}
 665 protected:
 666    virtual size_t fill_buffer();
 667    virtual void   end_of_file();
 668    virtual void   error(const char * start, const char * stop);
 669 \end{verbatim}
 670
 671 \Subsubsection{Class \T{IOLexerBuffer}}
 672
 673   Class \CLASSDEF{IOLexerBuffer}
 674 is a derived class of \T{LexerBuffer}.  It
 675 automatically manages an internal buffer and receives input from
 676 an \T{istream}.  It is defined in the file \verb|<AD/automata/iolexerbuf.h>|
 677
 678 This class provides the following additional features:
 679 Constructor \verb|IOLexerBuffer(istream&)| ties the input to a
 680 stream.  If the default constructor is used, then it automatically
 681 ties the input to the stream \verb|cin|.  The method \verb|set_stream|
 682 can be used to set the stream to some other input.
 683
 684 \begin{verbatim}
 685 class IOLexerBuffer : public LexerBuffer {
 686    size_t    buffer_size;  // size of the buffer
 687    istream * input;        // input stream
 688 public:
 689    IOLexerBuffer();
 690    IOLexerBuffer(istream&);
 691    virtual ~IOLexerBuffer();
 692    virtual void set_stream (istream&);
 693 };
 694 \end{verbatim}
 695
 696 By default, class \verb|IOLexerBuffer| reads new data from the input
 697 stream using a line buffering discipline.  This mechanism is suitable
 698 for interactive, but for other applications it may be more efficient
 699 to use block buffered input.  The protected method
 700 \verb|read_buffer| controls this reading behavior; it
 701 is called whenever the data in the string buffer has been
 702 consumed and new input is needed from the stream.  It is
 703 passed the position of the buffer and its remaining capacity,
 704 and it returns the number of new characters that are read.  Subclasses
 705 can redefine this method.
 706
 707 \begin{verbatim}
 708 protected:
 709    virtual size_t read_buffer(char *, size_t);
 710 \end{verbatim}
 711
 712 \Subsubsection{Class \T{IOLexerStack}}
 713
 714 Class \CLASSDEF{IOLexerStack}
 715 is a derived class of \T{LexerStack}.  It provides
 716 a mechanism of reading from a stack of \verb|istream|'s.  Streams
 717 can be pushed and popped from the stack.  The next token is
 718 obtained from the stream from the top of the stack.  The class allows
 719 allows easy implementation of
 720 constructs such as the \verb|#include| file mechanism of the C preprocessor.
 721
 722 The interface of this class is listed below:
 723 \begin{verbatim}
 724 class IOLexerStack : public IOLexerBuffer {
 725 public:
 726    IOLexerStack();
 727    IOLexerStack(istream&);
 728    virtual ~IOLexerStack();
 729
 730    virtual void     push_stream (istream&);
 731    virtual istream& pop_stream  ();
 732 };
 733 \end{verbatim}
 734
 735 \Subsection{Parser Specification}
 736
 737 Parsers are specified as a two phase process:
 738 \begin{enumerate}
 739    \item First a {\em syntax class} is defined.  A syntax class
 740 declaration is like a normal \Cpp{} class declaration and has a similar
 741 syntax, except that \Prop{} will also generate the interface of the parser.
 742 All parsers that \Prop{} generates are encapsulated within a class.  This
 743 makes it easy for the programmer to add additional data for parsing
 744 use, and to have multiple instances of a parser.
 745    \item Secondly, the grammar of the language is defined in a
 746 {\em syntax} declaration as a set of productions,
 747 in a syntax similar to that of {\em yacc}.
 748 \end{enumerate}
 749
 750 We'll describe these two phases.
 751
 752 \Subsubsection{Syntax class}
 753
 754 A syntax class definition specifies an object class that encapsulates
 755 a parser.  Its syntax is the same as the usual \Cpp{} class definition,
 756 except that the prefix \T{syntax} is used:
 757
 758 \begin{syntax}
 759 \NDEF{Syntax\_Class\_Decl} & \IS & \TDEF{syntax class} \Id
 760    \OPT{\T{:} \N{Inherit\_List}} \T{\{} \N{Class\_Body} \T{\}} \T{;} \\
 761 \end{syntax}
 762
 763 This specification automatically generate a class with the following interface
 764 (assuming that the parser class in question is called \T{ParserClass})
 765 \begin{verbatim}
 766 public:
 767    ParserClass();  // constructor
 768 public:
 769    virtual void parse();
 770 \end{verbatim}
 771
 772 The constructor and the method \verb|parse()| will be automatically
 773 generated later.
 774
 775 A parser class expects the following method to be defined:
 776 \begin{verbatim}
 777    int get_token();
 778 \end{verbatim}
 779 \noindent The function of \verb|get_token| is to return the
 780 next token from the lexer stream whenever it is called.
 781 It should return \verb|EOF| if the stream is empty.
 782
 783 \Subsubsection{Syntax declaration}
 784
 785 The grammar of a language is specified in the \T{syntax} declaration.
 786 Its syntax is listed as follows:
 787
 788 \begin{syntax}
 789 \NDEF{Syntax\_Decl} & \IS & \TDEF{syntax} \Id \T{\{} \\
 790     & & \quad \OPT{\MORE{\N{Precedence\_Decl}}{}} \\
 791     & & \quad \OPT{\N{Expect\_Decl}} \\
 792     & & \quad \OPT{\MORE{\N{Production\_Rule}}{}} \\
 793     & & \T{\}} \\
 794 \end{syntax}
 795
 796 The name of a \T{syntax} declaration should match that
 797 of a \T{syntax} \T{class}.  A \T{syntax} declaration is divided
 798 into three parts: (i) the operator precedence section, (ii) the
 799 \T{expect:} $n$ section, and (iii) the grammar section.
 800
 801 \Subsubsection{Precedence}
 802
 803 Operator precedences are specified using precedence declarations.
 804 The syntax is:
 805
 806 \begin{syntax}
 807 \NDEF{Precedence\_Decl}
 808     & \IS & \TDEF{left:} \N{Integer} \ALT{\N{Operator}}{} \T{;}
 809           & left associative operators \\
 810     & \OR & \TDEF{right:} \N{Integer} \ALT{\N{Operator}}{} \T{;}
 811     & right associative operators \\
 812 \NDEF{Operator} & \IS & \N{Char}   & single character operator \\
 813              & \OR & \N{String} & multi-character operator \\
 814              & \OR & \N{Cons}   & constructor \\
 815 \end{syntax}
 816
 817 The integer associated with a precedence declaration is the precedence
 818 level of an operator.  The higher the level of an operator, the {\em lower}
 819 its precedence.
 820
 821 For example, the following set of precedences are used in the \Prop{}
 822 language parser:
 823 \begin{verbatim}
 824 left: 23 "as";
 825 left: 22 "::";
 826 left: 21 "||";
 827 left: 20 "equiv:";
 828 left: 19 "xor:";
 829 left: 18 "implies:";
 830 left: 17 "&&" "and";
 831 right: 16 "|=" "&=" "^=" "<<=" ">>=";
 832 right: 15 '=' ":=" "+=" "-=" "*=" "/=" "%=";
 833 left: 14 '|';
 834 left: 13 ':';
 835 left: 12 ';';
 836 left: 11 '^';
 837 left: 10 '&';
 838 left: 9 "==" "!=";
 839 left: 8 '<' '>' ">=" "<=";
 840 left: 7 "<<" ">>";
 841 left: 6 '+' '-' "with" "less";
 842 left: 5 '*' '/' '%';
 843 left: 4 "++" "--";
 844 left: 3 '!' '~' "arb" "card" "dom" "ran";
 845 left: 2 '[' ']' '{' '}' '.' "->" ;
 846 \end{verbatim}
 847
 848 \Subsubsection{{\tt expect:} $n$}
 849
 850 The declaration \T{expect} $n$ specifies that there should be $n$
 851 shift/reduce errors in the grammar.  If the actual grammar deviates
 852 from this number, a warning will be generated.  The alternative
 853 form \T{expect:} \T{\_} suppresses all warnings.  The syntax is:
 854
 855 \begin{syntax}
 856 \NDEF{Expect\_Decl}    & \IS & \TDEF{expect:} \N{Integer} \T{;}
 857     & expects $n$ shift/reduce errors \\
 858                      & \OR & \T{expect:} \T{\_} \T{;}
 859     & expects many shift/reduce errors \\
 860 \end{syntax}
 861
 862 This declaration is optional.  If omitted, warnings will be printed
 863 if parser conflicts are found during parser generation.
 864
 865 \Subsubsection{Production rules}
 866
 867
 868 The syntax of the production rules is as follows:
 869
 870 \begin{syntax}
 871 \NDEF{Production\_Rule} & \IS &
 872   \Id \OPT{\TypeExp} \T{:} \\
 873   & & \quad \OPT{\ALT{\N{One\_Alt}}{|}} \T{;} \\
 874 \NDEF{One\_Alt}  & \IS & \OPT{\ALT{\N{Symbol}}{}} & one alternative \\
 875 \NDEF{Symbol}
 876     & \IS & \Id        & non-terminal \\
 877     & \OR & \N{Cons}   & terminal (a datatype constructor) \\
 878     & \OR & \N{String} & terminal (a datatype constructor) \\
 879     & \OR & \N{Char}   & terminal (from the ASCII character set) \\
 880     & \OR & \T{?}      & the error terminal \\
 881     & \OR & \T{\$}     & the end of file terminal \\
 882     & \OR & \T{\{} \N{Code} \T{\}} & embedded actions \\
 883 \end{syntax}
 884
 885 Each rule specifies a set of productions of the form
 886 \[ \begin{array}{lcl}
 887     lhs &  :  & A_{11} A_{12} \ldots A_{1n_1} \\
 888         & \OR & A_{21} A_{22} \ldots A_{2n_2} \\
 889         & \vdots & \\
 890         & \OR & A_{m1} A_{m2} \ldots A_{mn_m} \\
 891    \end{array}
 892 \]
 893 where $lhs$ is the non-terminal and $A_{ij}$'s are a mixture of
 894 terminals, non-terminals and embedded actions.  If synthesized
 895 attributes are used, then the type of the s-attribute should be
 896 annotated next to the lhs non-terminal.
 897
 898 Terminals can be specified in a few ways: (i) a character literal
 899 is a predefined terminal of the same name, (ii) a string literal
 900 is matched with a \T{datatype} or a \T{lexeme} \T{class} constructor
 901 of the same name, (iii) an identifier is a terminal if there is a
 902 \T{datatype} constructor of the same name.  Any other identifiers
 903 are assumed to be non-terminals.
 904
 905 For example, the following are two sets of production rules used in the
 906 \Prop{} translator itself:
 907 \begin{verbatim}
 908 ty Ty:  simple_ty               { $$ = $1; }
 909 |       ty '=' exp              { $$ = DEFVALty($1,$3); }
 910 |       ty '*'                  { $$ = mkptrty($1); }
 911 |       ty '&'                  { $$ = mkrefty($1); }
 912 |       ty '[' ']'              { $$ = mkptrty($1); }
 913 |       ty '[' exp ']'          { $$ = mkarrayty($1,$3); }
 914 ;
 915 exp Exp:
 916         app_exp                 { $$ = $1; }
 917 |       exp '+' exp             { $$ = BINOPexp("+",$1,$3); }
 918 |       exp '-' exp             { $$ = BINOPexp("-",$1,$3); }
 919 |       exp '*' exp             { $$ = BINOPexp("*",$1,$3); }
 920 |       exp '/' exp             { $$ = BINOPexp("/",$1,$3); }
 921 |       exp '%' exp             { $$ = BINOPexp("%",$1,$3); }
 922 |       exp '^' exp             { $$ = BINOPexp("^",$1,$3); }
 923 |       exp "+=" exp            { $$ = BINOPexp("+=",$1,$3); }
 924 |       exp "-=" exp            { $$ = BINOPexp("-=",$1,$3); }
 925 |       exp "*=" exp            { $$ = BINOPexp("*=",$1,$3); }
 926 |       exp "/=" exp            { $$ = BINOPexp("/=",$1,$3); }
 927 |       exp "%=" exp            { $$ = BINOPexp("%=",$1,$3); }
 928 \end{verbatim}
 929 \noindent {\em etc $\ldots$}
 930
 931 Here, \T{ty} is the non-terminal of this production.  The type of
 932 the synthesized attribute of this rule is \T{Ty}, and it is
 933 written next to the non-terminal.  Like {\em yacc}, the synthesized
 934 attribute of a rule can be referenced using the \verb.$. variables:
 935 variable \TDEF{\$\$} denote the synthesized attribute of the current rule,
 936 while \TDEF{\$$n$} where $n$ is 1, 2, 3, \ldots denote the synthesized
 937 attribute of the $n$th rhs symbol of the current rule.
 938
 939 \Subsubsection{Parser report}
 940
 941 If the command line option \OPTIONDEF{-r} is used, then
 942 in addition to generating the code for the grammar, the translator
 943 will also generate a comprehensive report of the grammar and
 944 the resulting LALR(1) automaton.  The amount of details in this
 945 report can be varied using the the verbose option
 946 \OPTIONDEF{-v}:
 947
 948 \begin{description}
 949    \item[{\tt -v1}]  Print full information for all LR(0) states.
 950    \item[{\tt -v2}]  Also print out the lookahead set of a reduction rule.
 951 This may be useful for tracking down shift/reduce and reduce/reduce conflicts.
 952    \item[{\tt -v3}]  Do both \verb|-v1| and \verb|-v2|.
 953 \end{description}
 954
 955 \Subsubsection{Interfacing with the generated lexer}
 956
 957 The easiest way of interfacing with a lexer is to embed
 958 an object of class \T{LexerBuffer}, \T{IOLexerBuffer}, or \T{IOLexerStack}
 959 within a syntax class\footnote{Alternatively, inheritance may be used.}.
 960 We can then define the tokenization method
 961 \verb|get_token| using the \T{matchscan} statement described
 962 in section~\ref{sec:matchscan}.
 963
 964 \Subsection{Debugging Parsers}
 965
 966 Currently, only the most rudimentary debugging facilities have been
 967 implemented for parser.
 968 The user can define the macro \verb|DEBUG_|$C$, where $C$ is the
 969 name of a syntax class before the \T{syntax} definition.  This will activate
 970 the parser debugging code.  During parsing, the reductions that are taken
 971 will be printed to \verb|cerr|.
 972
 973 \Section{Algebraic Datatypes and Pattern Matching} \label{sec:datatype}
 974
 975     \Prop{} implements algebraic datatypes and pattern matching in the
 976 style of Standard ML\cite{SML}.  Tree, DAG and even graph structures can be
 977 specified as a set of datatype equations in algebraic datatype specifications.
 978 \Prop{} then proceeds to translate these into concrete \Cpp{} classes.
 979 This makes it very easy to implement complex data structures.
 980
 981    Algebraic datatypes can be manipulated and transformed
 982 using \Prop{} pattern matching constructs, which decompose a datatype
 983 value into its constituents.  Complex patterns involving multiple
 984 objects can be specified.  These are translated into efficient
 985 testing and selection code in \Cpp.
 986
 987 In the following we shall give a brief overview of the pattern matching
 988 features of \Prop.  For most users of modern declarative languages many
 989 of these features are already familiar constructs.
 990
 991 \Subsection{A quick tour of pattern matching}
 992
 993    Algebraic datatypes are specified using \verb|datatype| definitions,
 994 which define the inductive structure of one of more types using
 995 a tree-grammar like syntax.
 996 %In addition, pretty printers,
 997 %lexical scanners, parsers, persistence I/O methods and
 998 %garbage collection inferences can also be specified with
 999 %additional options in the same construct.
1000 When a datatype is declared,
1001 the following operations are implicitly defined by the datatype compiler:
1002 (1) the constructors for all the variants of a type;
1003 (2) the identity test operator \verb|==|, and the assignment
1004     operator \verb|=| for this type; and
1005 (3) the member functions needed to decompose a datatype value during
1006     pattern matching.
1007
1008    We'll select the internals of a compiler for a simplified imperative
1009 language as the running example in this section.
1010 Suppose that in this language an expression is composed of
1011 identifiers, integer constants and the four arithmetic operators.
1012 Then the structure of the abstract syntax tree can be specified as follows:
1013
1014 \begin{verbatim}
1015    datatype Exp = INT (int)
1016                 | ID  (const char *)
1017                 | ADD (Exp, Exp)
1018                 | SUB (Exp, Exp)
1019                 | MUL (Exp, Exp)
1020                 | DIV (Exp, Exp)
1021                 ;
1022 \end{verbatim}
1023
1024 Abstract syntax of an expression such as {\em a * b - 17}
1025 can be constructed directly in a prefix syntax, directly mirroring
1026 that of the definition.  The \Prop{} datatype
1027 compiler will automatically generate a \Cpp{} class hierarchy to represent
1028 the variants of type \verb|Exp|.  Datatype constructor
1029 functions(not to be mistaken with \Cpp's class constructors) will
1030 also be automatically generated using the same names as the variants.
1031
1032 \begin{verbatim}
1033    Exp formula = ADD(MUL(ID("a"),ID("b")),INT(17));
1034 \end{verbatim}
1035
1036 Datatype values can be decomposed using the \verb|match| statement, which
1037 can be seen as a generalization of \C's \verb|switch| construct.  Pattern
1038 matching is a combination of conditional branching and value
1039 binding.  For example, a typical evaluation function for the type \verb|Exp|
1040 can be written as in the following example.  Notice that each arm of
1041 a \verb|case| is in fact a pattern(with optional variables)
1042 mirroring the syntax of a datatype.  The pattern variables(written
1043 with the prefix \verb|?| in the sequel) of a matching
1044 arm is {\em bound} to the value of the matching value, which can be
1045 subsequently referenced and modified in the action of an arm.
1046
1047 \begin{verbatim}
1048 int eval (Exp e, const map<const char *, int>& env)
1049 {  match (e)
1050    {  case INT ?i:        return ?i;
1051       case ID  ?id:       return env[?id];
1052       case ADD (?e1,?e2): return eval(?e1,env) + eval(?e2,env);
1053       case SUB (?e1,?e2): return eval(?e1,env) - eval(?e2,env);
1054       case MUL (?e1,?e2): return eval(?e1,env) * eval(?e2,env);
1055       case DIV (?e1,?e2): return eval(?e1,env) / eval(?e2,env);
1056    }
1057 }
1058 \end{verbatim}
1059
1060 \Subsubsection{Pattern matching versus object-oriented style}
1061
1062   Although a comparable evaluation function can be written
1063 in object oriented style using late binding, as in below, in general pattern
1064 matching is much more powerful than late binding in \Cpp, which only
1065 allows dispatching based on the type of one receiver.
1066
1067 \begin{verbatim}
1068   // Class definitions
1069   class Exp {
1070   public:
1071      virtual int eval(const map<const char *, int>& env) const = 0;
1072   };
1073   class INT : Exp {
1074      int i;
1075   public:
1076      int eval(const map<const char *, int>& env);
1077   };
1078   class ID : Exp {
1079      const char * id
1080   public:
1081      int eval(const map<const char *, int>& env);
1082   };
1083   ...
1084
1085   // Member functions
1086   int INT::eval(const map<const char *, int>& env) const { return i; }
1087   int ID ::eval(const map<const char *, int>& env) const { return id; }
1088   int ADD::eval(const map<const char *, int>& env) const
1089      { return e1->eval(env) + e2->eval(env); }
1090   int SUB::eval(const map<const char *, int>& env) const
1091      { return e1->eval(env) - e2->eval(env); }
1092   int MUL::eval(const map<const char *, int>& env) const
1093      { return e1->eval(env) * e2->eval(env); }
1094   int DIV::eval(const map<const char *, int>& env) const
1095      { return e1->eval(env) / e2->eval(env); }
1096 \end{verbatim}
1097
1098 For example, in the following function we use nested patterns,
1099 non-linear patterns (i.e. patterns with multiple occurrences of
1100 a pattern variable), and guards to perform algebraic simplification
1101 of an expression.  Although the patterns are relatively simple in
1102 this example, in general arbitrarily complex patterns may be used.
1103
1104 \begin{verbatim}
1105 Exp simplify (Exp redex)
1106 {  // recursive traversal code omitted ...
1107
1108    // match while repeats the matching process
1109    // until no more matches are found.
1110    match while (redex)
1111    {  ADD(INT 0,  ?x):     { redex = ?x; }
1112    |  ADD(INT ?x, INT ?y): { redex = INT(?x+?y); }
1113    |  ADD(?x,     INT 0)   { redex = ?x; }
1114    |  SUB(?x,     INT 0):  { redex = ?x; }
1115    |  SUB(?x,     ?x):     { redex = INT(0); }
1116    |  SUB(INT ?x, INT ?y): { redex = INT(?x-?y); }
1117    |  MUL(INT 0,  ?x):     { redex = INT(0); }
1118    |  MUL(?x,     INT 0):  { redex = INT(0); }
1119    |  DIV(?x,     ?x):     { redex = INT(1); }
1120         // don't divide by zero.
1121    |  DIV(INT ?x, INT ?y) | ?y != 0: { redex = INT(?x/?y); }
1122    |  ...
1123    }
1124    return redex;
1125 }
1126 \end{verbatim}
1127
1128 Pattern matching in \Prop{} is not restricted to one datatype at
1129 a time.  In the following example, we use matching on multiple values to
1130 define equality on expressions inductively.  For variety, we'll
1131 use the \verb|fun| variant of \verb|match|, which defines a function
1132 in rule form.  Notice that the last case
1133 of the match set uses {\em wild cards} \verb|_| to catch all the other
1134 non-equal combinations.  Since \Cpp{} does not provide multiple dispatching,
1135 implementing binary (or $n$-ary) matching operations on variant datatypes are
1136 in general cumbersome and verbose in object-oriented style. In contrast,
1137 using an applicative pattern matching style many manipulations and
1138 transformations on variant datatypes with tree-like or graph-like structure
1139 can be expressed succinctly.
1140
1141 \begin{verbatim}
1142 fun equal INT ?i,     INT ?j: bool: { return ?i == ?j; }
1143   | equal ID  ?a,     ID  ?b:       { return strcmp(a,b) == 0; }
1144   | equal ADD(?a,?b), ADD(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1145   | equal SUB(?a,?b), SUB(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1146   | equal MUL(?a,?b), MUL(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1147   | equal DIV(?a,?b), DIV(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1148   | equal _,          _:            { return false; }
1149   ;
1150 \end{verbatim}
1151
1152 \Subsubsection{More examples} \label{sec:Wff}
1153
1154   As another example, we can specify the term structure of
1155 {\em well-formed formulas} in proposition calculus as follows.  Notice
1156 that the constructors \verb|F| and \verb|T| are nullary.
1157
1158 \begin{verbatim}
1159    datatype Wff = F
1160                 | T
1161                 | Var     (const char *)
1162                 | And     (Wff, Wff)
1163                 | Or      (Wff, Wff)
1164                 | Not     (Wff)
1165                 | Implies (Wff, Wff)
1166                 ;
1167 \end{verbatim}
1168
1169 Datatypes that are parametrically polymorphic, such as lists and trees, can
1170 be defined by parameterizing them with respect to one of more types.
1171 For example, both lists and tree below are parametric on one type argument
1172 \verb|T|.
1173
1174 \begin{verbatim}
1175    datatype List<T> = nil
1176                     | cons(T, List<T>);
1177    datatype Tree<T> = empty
1178                     | leaf(T)
1179                     | node(Tree<T>, T, Tree<T>);
1180
1181    List<int> primes = cons(2,cons(3,cons(5,cons(7,nil))));
1182    List<int> more_primes = cons(11,cons(13,primes));
1183    Tree<char *> names = node(leaf("Church"),"Godel",empty);
1184 \end{verbatim}
1185
1186 As a programming convenience, \Prop{} has a set of built-in list-like
1187 syntactic forms.  Unlike languages such as ML, however,
1188 these forms are not predefined to be any specific list types.  Instead, it is
1189 possible for the user to use these forms on any datatypes with
1190 a natural binary {\em cons} operator and a nullary {\em nil} constructor.
1191 For instance, the previous list datatype can be redefined as follows:
1192
1193 \begin{verbatim}
1194    datatype List<T> = #[] | #[ T ... List<T> ];
1195    List<int> primes = #[ 2, 3, 5, 7 ];
1196    List<int> more_primes = #[ 11, 13 ... primes ];
1197    List<char *> names = #[ "Church", "Godel", "Turing", "Curry" ];
1198
1199    template <class T>
1200       List<T> append (List<T> a, List<T> b)
1201       {  match (a)
1202          {  case #[]:          return b;
1203             case #[hd ... tl]: return #[hd ... append(tl,b)];
1204          }
1205       }
1206 \end{verbatim}
1207 The empty list is written as \verb|#[]|, while {\em cons(a,b)}
1208 is written as \verb|#[ a ... b ]|.   An expression of the special form
1209 \verb|#[a, b, c]|, for instance, is simple syntactic sugar for
1210 repeated application of the {\em cons} operator, i.e.
1211
1212 \begin{verbatim}
1213     #[a, b, c] == #[a ... #[ b ... #[ c ... #[] ] ] ].
1214     #[a, b, c ... d ] == #[a ... #[ b ... #[ c ... d ] ] ].
1215 \end{verbatim}
1216
1217 List-like special forms are not limited to datatypes with only two variants.
1218 For example, we can define a datatype similar in structure to S-expressions
1219 in {\em Lisp} or {\em scheme}.  Here's how such as datatype may be
1220 defined\footnote{for simplicity, we'll use a string representation for atoms
1221 instead of a more efficient method.}:
1222
1223 \begin{verbatim}
1224    datatype Sexpr = INT    (int)
1225                   | REAL   (double)
1226                   | STRING (char *)
1227                   | ATOM   (const char *)
1228                   | #()
1229                   | #( Sexpr ... Sexpr )
1230    where type Atom = Sexpr  // synonym for Sexpr
1231    ;
1232 \end{verbatim}
1233
1234    With this datatype specification in place, we can construct values
1235 of type \verb|Sexpr| in a syntax close to that of {\em Lisp}.  For example,
1236 we can define lambda expressions corresponding to the combinators $I$, $K$
1237 and $S$ as follows:
1238
1239 \begin{verbatim}
1240    Atom LAMBDA = ATOM("LAMBDA");
1241    Atom f      = ATOM("f");
1242    Atom x      = ATOM("x");
1243    Atom y      = ATOM("y");
1244    Atom NIL    = #();
1245
1246    Sexpr I = #(LAMBDA, #(x), x);
1247    Sepxr K = #(LAMBDA, #(x), #(LAMBDA, #(y), x));
1248    Sepxr S = #(LAMBDA, #(f),
1249                 #(LAMBDA, #(x),
1250                    #(LAMBDA, #(y), #(#(f,x), #(g,x)))));
1251 \end{verbatim}
1252
1253 Similar to list-like forms, vector-like forms are also available.
1254 This addresses one of the flaws of the \Cpp{} language, which lacks
1255 first class array constructors.
1256 Vectors are simply homogeneous arrays whose sizes
1257 are fixed and are determined at creation time\footnote{For efficiency,
1258 bounds checking is {\em not} performed.}.   Random access within
1259 vectors can be done in constant time.  Unlike lists, however, the
1260 prepending operation is not supported.   Vectors literals are delimited
1261 with the composite brackets \verb!(| ... |)!, \verb![| ... |]!, or
1262 \verb!{| ... |}!.  In the following example the datatype \verb|Exp|
1263 uses vectors to represent the cooefficients of the polynomials:
1264
1265 \begin{verbatim}
1266    datatype Vector<T> = (| T |);
1267    datatype Exp = Polynomial (Var, Vector<int>)
1268                 | Functor (Exp, Vector<Exp>)
1269                 | Atom (Var)
1270                 | ...
1271    where type Var = const char *;
1272    Exp formula = Polynomial("X", (| 1, 2, 3 |));
1273 \end{verbatim}
1274
1275 Commonly used patterns can be given synonyms so that they can be readily
1276 reused without undue repetition.  This can be accomplished by defining pseudo
1277 datatype constructors to stand for common patterns using
1278 {\em datatype law} definitions.  For example, the following set of
1279 laws define some commonly used special forms for a {\em Lisp}-like language
1280 using the previously defined \verb|Sexpr| datatype.
1281
1282 \begin{verbatim}
1283    datatype law inline Lambda(x,e)  = #(ATOM "LAMBDA", #(x), e)
1284               | inline Quote(x)     = #(ATOM "QUOTE", x)
1285               | inline If(a,b,c)    = #(ATOM "IF", a, b, c)
1286               | inline Nil          = #()
1287               | inline ProgN(exprs) = #(ATOM "PROGN" ... exprs)
1288               | SpecialForm  = #(ATOM ("LAMBDA" || "IF" ||
1289                                        "PROGN" || "QUOTE") ... _)
1290               | Call(f,args) = ! SpecialForm && #(f ... args)
1291               ;
1292 \end{verbatim}
1293
1294 Note that the pattern \verb|SpecialForm| is meant to match all
1295 special forms in our toy language: i.e. {\em lambdas}, {\em ifs}, {\em progn}'s
1296 and {\em quotes}.  The pattern disjunction connective \verb.||. is used
1297 to link these forms together.  Since we'd like the \verb|Call| pattern
1298 to match only if the S-expression is not a special form, we use
1299 the pattern negation and conjunction operators, \verb|!| and \verb|&&|
1300 are used to screen out special forms.
1301
1302
1303 With these definitions in place,
1304 an interpreter for our language can be written thus:
1305
1306 \begin{verbatim}
1307    Sexpr eval (Sexpr e)
1308    {  match (e)
1309       {  Call(?f,?args):     { /* perform function call */ }
1310       |  Lambda(?x,?e):      { /* construct closure */ }
1311       |  If(?e,?then,?else): { /* branching */ }
1312       |  Quote(?x):          { return ?x; }
1313       |  ...:                { /* others */ }
1314       }
1315    }
1316 \end{verbatim}
1317
1318
1319 As an interesting note, the special form pattern can also be rewritten
1320 using regular expression string matching, as in the following:
1321
1322 \begin{verbatim}
1323  datatype law SpecialForm = #(ATOM /LAMBDA|IF|PROGN|QUOTE/ ... _)
1324 \end{verbatim}
1325
1326 In addition, since the law constructors \verb|Lambda|, \verb|Quote|,
1327 \verb|If|, \verb|Nil| and \verb|ProgN| are defined with the \verb|inline|
1328 keyword, these constructors can be used to within expressions as abbreviates
1329 for their rhs definitions.  For example, writing
1330 \verb|Lambda(x,x)| is the same writing \verb|#(ATOM("Lambda"),#(x),x)|.
1331
1332 \Subsubsection{Variants of match}
1333
1334 Aside from the usual plain pattern matching constructs, a few variants of the
1335 \verb|match| construct are offered.  We'll briefly enumerate a few of these:
1336 \begin{itemize}
1337   \item The \verb|matchall| construct is a variant of \verb|match| that
1338    executes all matching rules(instead of just the first one) in sequence.
1339   \item Each rule of a \verb|match| statement can have associated cost
1340     expressions.  Instead of selecting the first matching rule to
1341     execute as in the default, all matching rules are considered
1342     and the rule with the least cost is executed.  Ties are broken by
1343     choosing the rule that comes first lexically.  For example:
1344
1345    \begin{verbatim}
1346    match (ir)
1347    {  ADD(LOAD(?r0),?r1) \ e1: { ... }
1348    |  ADD(?r0,LOAD(?r0)) \ e2: { ... }
1349    |  ADD(?r0, ?r1)      \ e3: { ... }
1350    |  ...
1351    }
1352    \end{verbatim}
1353   \item A \verb|match| construct can be modified with the \verb|while|
1354      modifier, as in the following example.  A match modified thus is
1355      repeatedly matched until none of the patterns are applicable.
1356      For example, the following routine uses a \verb|match while| statement
1357      to traverse to the leftmost leaf.
1358
1359 \begin{verbatim}
1360    template <class T>
1361    Tree<T> left_most_leaf(Tree<T> tree)
1362    {  match while (tree)
1363       {  case node(t,_,_):  tree = t;
1364       }
1365       return tree;
1366    }
1367 \end{verbatim}
1368   \item Finally, the \verb|matchscan| variant of match can be used to
1369      perform string matching on a stream.  For example, a simple lexical
1370      scanner can be written as follows.
1371
1372      \begin{verbatim}
1373    int lexer (LexerBuffer& in)
1374    {  matchscan while (in)
1375       {  /if/:                     { return IF; }
1376       |  /else/:                   { return ELSE; }
1377       |  /while/:                  { return WHILE; }
1378       |  /for/:                    { return FOR; }
1379       |  /break/:                  { return BREAK; }
1380       |  /[0-9]+/:                 { return INTEGER; }
1381       |  /[a-zA-Z_][a-zA-Z_0-9]*/: { return IDENTIFIER; }
1382       |  /[ \t]+/:                 { /* skip spaces */ }
1383       |  ... // other rules
1384       }
1385    }
1386      \end{verbatim}
1387 \end{itemize}
1388
1389 \Subsection{Algebraic Datatypes} \label{sec:algebraic-datatypes}
1390    Algebraic datatypes are defined using the \T{datatype}
1391 construct.  Its syntax is as follows.
1392
1393 \begin{syntax}
1394 \NDEF{Datatype\_Decl}
1395    & \IS & \TDEF{datatype} \\
1396    &     & \quad \OPT{\ALT{\N{Datatype\_Spec}}{and}} \\
1397    &     & \quad \OPT{\TDEF{where type} \ALT{\N{Type\_Spec}}{and}} \\
1398    &     & \quad \OPT{\TDEF{law} \ALT{\N{Law\_Spec}}{and}} \\
1399    &     & \T{;} \\
1400 \end{syntax}
1401
1402 Each \T{datatype} declaration specifies a set of
1403 (potentially mutually recursive) types, a set of type abbreviations,
1404 and a set of pattern matching abbreviations called {\em laws}.
1405
1406 \begin{syntax}
1407 \NDEF{Datatype\_Spec}
1408    & \IS & \Id \OPT{\T{<} \LIST{\N{Id}} \T{>}} \\
1409    &     & \quad \OPT{\T{:} \N{Inherit\_List}} \\
1410    &     & \quad \OPT{\T{::} \N{Datatype\_Qualifiers}} \\
1411    &     & \quad \OPT{\T{=} \N{Cons\_Specs}} \\
1412 \NDEF{Cons\_Specs}
1413    & \IS & \ALT{\N{Cons\_Spec}}{|} \\
1414    &     & \quad \OPT{\T{public:} \N{Datatype\_Body}} \\
1415 \NDEF{Cons\_Spec}
1416    & \IS & \N{Simple\_Cons\_Spec}  \\
1417    &     &  \quad \OPT{\T{with} \T{\{} \N{Class\_Decl} \T{\}}} \\
1418 \NDEF{Simple\_Cons\_Spec}
1419    & \IS & \Id \OPT{\N{String}} & unit constructor \\
1420    & \OR & \N{String}              & unit constructor \\
1421    & \OR & \Id \OPT{\T{of}} \TypeExp  & constructor with arguments \\
1422    & \OR & \T{\#[} \T{]}                 & unit list constructor \\
1423    & \OR & \T{\#\{} \T{\}}               & unit list constructor \\
1424    & \OR & \T{\#(} \T{)}                 & unit list constructor \\
1425    & \OR & \T{\#[} \TypeExp \T{...} \TypeExp \T{]}
1426       & list constructor \\
1427    & \OR & \T{\#\{} \TypeExp \T{...} \TypeExp \T{\}}
1428       & list constructor \\
1429    & \OR & \T{\#(} \TypeExp \T{...} \TypeExp \T{)}
1430       & list constructor \\
1431    & \OR & \T{[|} \TypeExp \T{|]} & vector constructor \\
1432    & \OR & \T{(|} \TypeExp \T{|)} & vector constructor \\
1433    & \OR & \T{\{|} \TypeExp \T{|\}} & vector constructor \\
1434 \end{syntax}
1435
1436
1437 Datatypes can be annotated with {\em qualifiers}.  They tell the \Prop{}
1438 translator to generate additional code for each datatype to provide
1439 additional functionality.
1440
1441 \begin{syntax}
1442 \NDEF{Datatype\_Qualifiers}
1443    & \IS & \MORE{\N{Datatype\_Qualifier}}{} \\
1444 \NDEF{Datatype\_Qualifier}
1445    & \IS & \TDEF{collectable} & garbage collectable \\
1446    & \OR & \TDEF{rewrite}     & optimized for tree rewriting \\
1447    & \OR & \TDEF{persistent}  & generate persistence interface \\
1448    & \OR & \TDEF{lexeme}      & used for parser/lexer tokens  \\
1449    & \OR & \TDEF{inline}      & use inlined implementation \\
1450    & \OR & \TDEF{extern}      & use non-inlined implementation \\
1451 \end{syntax}
1452
1453 Type specifications assign abbreviations to commonly used type expressions.
1454 They act like \T{typedef}'s in \Cpp, except that \Prop{} can make use
1455 of the type information provided by these specifications.
1456
1457 \begin{syntax}
1458 \NDEF{Type\_Spec}
1459    & \IS & \Id \T{=} \TypeExp \\
1460 \end{syntax}
1461
1462 Law specifications are used to define abbreviations to datatype
1463 patterns\footnote{Patterns are discussed in
1464 section~\ref{sec:pattern-matching}.}.
1465 They behave like macros in some sense, except that unlike macros, they are
1466 properly type checked by \Prop.  In addition, if the keyword \T{inline} is
1467 used, then \Prop{} will treat the lhs as an expression and generate
1468 a function of the same name that can be used to construct the datatype term.
1469
1470 \begin{syntax}
1471 \NDEF{Law\_Spec}
1472    & \IS & \OPT{\T{inline}} \Id \OPT{\N{Law\_Arg}} \T{=} \Pat \\
1473 \NDEF{Law\_Arg}
1474    & \IS & \Id \\
1475    & \OR & \T{(} \LIST{\N{Id}} \T{)} \\
1476 \end{syntax}
1477
1478 \Prop{} recognizes the following set of type expressions.
1479 \begin{syntax}
1480 \NDEF{Type\_Exp}
1481    & \IS & \N{Type\_Id}                & type name \\
1482    & \OR & \TypeExp \T{*}              & pointer type \\
1483    & \OR & \TypeExp \T{\&}             & reference type \\
1484    & \OR & \N{Type\_Qualifier} \N{Type\_Id}      & qualified type \\
1485    & \OR & \N{Datatype\_Qualifier} \TypeExp       & annotation  \\
1486    & \OR & \T{(} \TypeExp \T{)}          & grouping \\
1487    & \OR & \T{(} \TypeExp \T{,}
1488       \LIST{\N{TypeExp}} \T{)} & tuple type \\
1489    & \OR & \T{(} \TypeExp \T{,}
1490       \LIST{\N{TypeExp}} \T{)} & tuple class \\
1491    & \OR & \T{\{} \LIST{\N{Lab\_Type\_Exp}} \T{\}} & record type \\
1492    & \OR & \TypeExp \T{=} \N{Exp} & default value \\
1493 \NDEF{Lab\_Type\_Exp} & \IS &
1494    \Id \T{:} \TypeExp \\
1495 \NDEF{Type\_Qualifier}
1496    & \IS & \T{class}       & a class type \\
1497    & \OR & \T{const}       & constant type \\
1498    & \OR & \T{rewrite}     & rewritable \\
1499    & \OR & \T{collectable} & garbage collected \\
1500    & \OR & \T{persistent}  & persistent object  \\
1501 \end{syntax}
1502
1503 In general, four different types of datatype constructors can be defined:
1504 \begin{description}
1505  \item[nullary constructors]  These are constructors without an argument.
1506  \Prop{} maps these into small integers and no heap space will be consumed.
1507  \item[tuple argument constructors] These are constructors with one
1508  or more unamed arguments.
1509  \item[record argument constructors] These are constructors with one
1510  or more named arguments written within braces.  For example, in the
1511  following datatype definition a record argument constructor called
1512  \verb|Add| is defined:
1513  \begin{verbatim}
1514   datatype Exp = Add { left : Exp, right : Exp }
1515                | ...
1516  \end{verbatim}
1517   To construct an \verb|Add| expression, we can use any one of the following
1518   forms:
1519   \begin{verbatim}
1520      Exp e1 = Add(x,y);
1521      Exp e2 = Add'{ left = x, right = y };
1522      Exp e3 = Add'{ right = y, left = x };
1523   \end{verbatim}
1524   Note that we are allowed to rearrange the labels in any order if
1525   the record expression form is used.
1526  \item[empty constructors] These are constructors written with
1527  an empty argument \verb|()|.  For example, in the following definition
1528  an empty constructor called \verb|NONE| is defined:
1529  \begin{verbatim}
1530     datatype Exp : public AST
1531         = NONE ()
1532         | NONE2
1533         | ...
1534  \end{verbatim}
1535  Here, the constructor \verb|NONE| will inherit from the class \verb|AST|,
1536  while the nullay constructor \verb|NONE2| will not.
1537 \end{description}
1538
1539 \Subsubsection{Instantiating a datatype}
1540
1541 Each declared datatype should be {\em instantiated} in some implementation
1542 file using the \TDEF{instantiate datatype} statement.  The translator
1543 will generate additional code to implement the datatype.  The general
1544 syntax is:
1545 \begin{syntax}
1546  \NDEF{Instantiate\_Decl}
1547     & \IS &
1548        \TDEF{instantiate datatype} \LIST{\N{Type\_Spec}} \\
1549     & \OR &
1550        \TDEF{instantiate} \T{extern} \T{datatype} \LIST{\N{Type\_Spec}} \\
1551  \NDEF{Type\_Spec} & \IS & \Id \OPT{ \T{<} \LIST{\N{TypeExp}} \T{>}} \\
1552 \end{syntax}
1553
1554 \begin{Tips}
1555    When a datatype is defined with the \TDEF{inline} qualifier,
1556 \Prop{} will try to generate inline methods whenever possible.
1557 On the other hand, if the \TDEF{extern} qualifier is used, \Prop{}
1558 will try to generate an non-inline implementation, and only one
1559 copy are generated at where the \T{instantiate datatype} statement occurs.
1560
1561 Since most \Prop{} datatypes are light-weight objects, \T{inline} is taken
1562 as the default.  The user can override this default using
1563 \OPTIONDEF{-save\_space} option of the translator
1564 (see page~\pageref{option:save-space}).
1565 \end{Tips}
1566
1567 \begin{Tips}
1568    When the option \OPTIONDEF{-fno-implicit-templates} is used
1569 under \Gpp, the user should \verb|#define| the symbol
1570 \INDEX{{\tt PROP\_EXPLICIT\_TEMPLATE\_INSTANTIATION}} before
1571 an \T{instantiate datatype} statement.
1572 This will enable the code for explicit template instantiation.
1573 \end{Tips}
1574
1575 \Subsection{Pattern Matching} \label{sec:pattern-matching}
1576    The basic pattern matching construct is the \T{match} statement.
1577 A match statement can be seen as a generalization of \Cpp's \T{switch}
1578 statement, in which the expression that follows a \T{case} can be
1579 a general complex pattern.
1580
1581    Its basic syntax is as follows:
1582 \begin{syntax}
1583 \NDEF{Match} & \IS &
1584    \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and}  \\
1585         & & \T{\{} \MORE{\T{case} \N{Match\_Rule}}{} \T{\}} & variant 1 \\
1586           & \OR &
1587    \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and} \\
1588         & & \T{\{} \ALT{\N{Match\_Rule}}{|} \T{\}} & variant 2 \\
1589           & \OR &
1590    \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and} \T{of} & variant 3 \\
1591         & & \quad \ALT{\N{Match\_Rule}}{|} \\
1592         & & \T{end} \T{match} \T{;} \\
1593 \NDEF{Match\_Mode} & \IS & \TDEF{match} \OPT{\T{while}} \\
1594                 & \OR & \TDEF{matchall} \OPT{\T{while}} \\
1595 \NDEF{Match\_Rule} & \IS & \Pat \OPT{\N{Guard}} \OPT{\N{Cost}}
1596         \T{:} \N{Match\_Action} \\
1597 \NDEF{Guard}       & \IS & \T{if} \N{Exp} \\
1598                 & \OR & \T{where} \N{Exp} \\
1599                 & \OR & \T{|} \N{Exp} \\
1600 \NDEF{Cost} & \IS & \verb|\| \Exp & cost expression \\
1601 \NDEF{Match\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
1602                   & \OR & \N{Code} & for variant 1 only \\
1603 \end{syntax}
1604
1605 The 3 syntactic variants of the match statement are equivalent in meaning.
1606 A \T{match} statement takes a list of match objects and a
1607 set of pattern matching rules.  The first rule that matches the pattern
1608 completely will be executed.  Notice that guards can be placed on each rule,
1609 and a rule is only applicable if the guards evaluate to true.
1610
1611 \paragraph{Cost minimization}
1612 In addition, if cost expressions are specified, then the matching process
1613 will proceed as follows:
1614 \begin{enumerate}
1615    \item Locate all applicable rules.  If no such rule exists then exit.
1616    \item Compute the cost expressions of all applicable rules.
1617    \item Choose the action that has the lowest cost to execute.  If
1618 a tie occurs, resolve by the textual order of the rules.
1619 \end{enumerate}
1620
1621 \paragraph{Match all rules}
1622 The \T{matchall} statement is a variant of \T{match}.  In the matchall
1623 mode, the actions of {\em all} applicable rules are executed in the
1624 textual order.  Caution: since the matching variable binding
1625 process occurs {\em before} the rules are executed, the rhs actions
1626 should not alter the matched value or the pattern variables may
1627 be bound to the wrong value.
1628
1629 \paragraph{Repetition}
1630 In addition, both \T{match} and \T{matchall} can be annotated with the
1631 \TDEF{while} modifier.  It this case the semantics of the matching
1632 construct is to repeat until no more rule is applicable.  For example,
1633 the following code fragment returns the last element of a list
1634 \begin{verbatim}
1635 datatype List = #[] | #[ int ... List ];
1636
1637 int last (List l)
1638 {  match while (l)
1639    { case #[]:         assert(0); // an error
1640      case #[i]:        return i;
1641      case #[h ... t]:  l = t;
1642    }
1643 }
1644 \end{verbatim}
1645
1646 \paragraph{Pattern syntax}
1647 The basic form of a pattern is a literal, which
1648 that matches only a single integer, real number, string, etc.  Complex
1649 patterns can be constructed inductively
1650 using datatype constructors, tuples, lists, and
1651 {\em logical pattern connectives}.
1652
1653 The syntax of the patterns is as follows:
1654 \begin{syntax}
1655 \NDEF{Pat} &
1656    \IS & \N{Integer} & matches integer literal \\
1657  & \OR & \N{Real}    & matches real literal \\
1658  & \OR & \N{Char}    & matches character literal \\
1659  & \OR & \N{Quark}   & matches quark literal \\
1660  & \OR & \N{String}  & matches string literal \\
1661  & \OR & \N{Regexp}  & matches any string that matches {\em regexp}\\
1662  & \OR & \N{Pat\_Var}  & a pattern variable; matches anything \\
1663  & \OR & \T{\_}      & wild card; matches anything \\
1664  & \OR & \N{Cons}     & matches a datatype constructor  \\
1665  & \OR & \N{Cons} \N{PatArg}& a constructor application pattern \\
1666  & \OR & \Id \T{as} \Pat & binds a variable to the sub-pattern \\
1667  & \OR & \Pat \T{:} \TypeExp & typed pattern \\
1668  & \OR & \Pat \T{||} \Pat  & matches either pattern \\
1669  & \OR & \Pat \T{\&\&} \Pat  & matches both patterns \\
1670  & \OR & \T{!} \Pat             & matches anything but the pattern \\
1671  & \OR & \T{(} \Pat \T{)}       & Grouping \\
1672  & \OR & \T{\#[} \LIST{\N{Pat}} \T{]} & list pattern; exact length match \\
1673  & \OR & \T{\#[} \LIST{\N{Pat}} \T{...} \T{]} & list pattern;
1674      non-exact length match \\
1675  & \OR & \T{[|} \LIST{\N{Pat}} \T{|]} & vector pattern; exact length match \\
1676  & \OR & \T{[|} \LIST{\N{Pat}} \T{...} \T{|]}
1677     & vector pattern; matches to the left \\
1678  & \OR & \T{[|} \T{...} \LIST{\N{Pat}} \T{|]}
1679     & vector pattern; matches to the right \\
1680 \NDEF{PatArg} & \IS & \Pat \\
1681            & \OR & \T{(} \LIST{\N{Pat}} \T{)} \\
1682            & \OR & \T{\{} \LIST{\N{Lab\_Pat}} \OPT{\T{...}} \T{\}} \\
1683 \NDEF{Lab\_Pat} & \IS & \Id \T{=} \Pat \\
1684 \NDEF{Pat\_Var} & \IS & \Id       \\
1685             & \OR & \T{?}\Id  \\
1686 \end{syntax}
1687
1688 \Subsection{Refining a datatype}
1689
1690    A datatype definition can be refined in a few ways:
1691
1692 \begin{enumerate}
1693   \item{\bf inheritance} A datatype can be declared to be inherited from
1694 one or more classes $C_1, C_2, \ldots, C_n$.
1695 The effect is that all variants of a datatype will be inherited from the
1696 classes $C_1, C_2, \ldots C_2$.
1697
1698   \item{\bf member functions}  Member functions and addition attributes
1699 can be attached to the datatype variants using the keyword \TDEF{with}.
1700 \end{enumerate}
1701
1702 For example, the following datatype \T{Exp} is inherited from some user
1703 defined classes \verb|AST| and \verb|Object|.   Furthermore, a virtual
1704 member function called print is defined.
1705
1706 \begin{verbatim}
1707 datatype Exp : public AST, virtual public Object
1708    = INT int
1709          with { ostream& print(ostream&); }
1710    | PLUS  Exp, Exp
1711          with { ostream& print(ostream&); }
1712    | MINUS Exp, Exp
1713          with { ostream& print(ostream&); }
1714    | MULT  Exp, Exp
1715          with { ostream& print(ostream&); }
1716    | DIV   Exp, Exp
1717          with { ostream& print(ostream&); }
1718    | VAR { name : const char *, typ : Type }
1719          with { ostream& print(ostream&); }
1720    public:
1721    {  virtual ostream& print (ostream&) = 0;
1722       void * operator new (size_t);
1723    }
1724 ;
1725 \end{verbatim}
1726
1727 The allocator and printing methods can be defined as follows:
1728 \begin{verbatim}
1729 void * classof Exp::operator new(size_t) { ... }
1730 ostream& classof INT::print(ostream& s)   { return s << INT; }
1731 ostream& classof PLUS::print(ostream& s)
1732    { return s << '(' << this->#1 << '+' << this->#2 <<')'; }
1733 ostream& classof MINUS::print(ostream& s)
1734    { return s << '(' << this->#1 << '-' << this->#2 <<')'; }
1735 ostream& classof MULT::print(ostream& s)
1736    { return s << '(' << this->#1 << '*' << this->#2 <<')'; }
1737 ostream& classof DIV::print(ostream& s)
1738    { return s << '(' << this->#1 << '/' << this->#2 <<')'; }
1739 ostream& classof VAR::print(ostream& s)   { return s << name; }
1740 \end{verbatim}
1741
1742 The special type form \TDEF{classof} {\em con} is used to reference
1743 the type of a constructor.  Arguments of a constructor uses the following
1744 naming convention: (i) if the constructor $C$ takes only one argument,
1745 then the name of the argument is $C$; (ii) if $C$ takes two or more
1746 unnamed arguments, then these are named \verb|#1|, \verb|#2|, etc;
1747 (iii) finally, labeled arguments are given the same name as the label.
1748
1749 For readability reasons, it is often useful to separate a datatype definition
1750 from the member functions and inheritance definitions.   The \TDEF{refine}
1751 declaration can be used in this situation.  For example, the above
1752 example can be restated more succinctly as follows:
1753
1754 \begin{verbatim}
1755 //
1756 // Datatype definition section
1757 //
1758 datatype Exp = INT int
1759              | PLUS  Exp, Exp
1760              | MINUS Exp, Exp
1761              | MULT  Exp, Exp
1762              | DIV   Exp, Exp
1763              | VAR { name : const char *, typ : Type }
1764 ;
1765
1766 //
1767 // Refinement section
1768 //
1769 refine Exp : public AST, virtual public Object
1770        { virtual ostream& print (ostream&) = 0;
1771          void * operator new (size_t);
1772        }
1773 and    INT, PLUS, MINUS, MULT, DIV, VAR
1774        { ostream& print(ostream&);
1775        }
1776 ;
1777 \end{verbatim}
1778
1779 The general syntax of the \TDEF{refine} declaration is as follows:
1780 \begin{syntax}
1781 \NDEF{Refine\_Decl}
1782    & \IS & \T{refine} \ALT{\N{Refine\_Spec}}{\T{and}} \T{;} \\
1783 \NDEF{Refine\_Spec}
1784    & \IS & \LIST{\N{Id}} \\
1785    &     & \quad \OPT{\T{:} \N{Inherit\_List}} \\
1786    &     & \quad \OPT{\T{::} \N{Datatype\_Qualifiers}} \\
1787    &     & \quad \OPT{\T{\{} \N{Datatype\_Body} \T{\}}} \\
1788 \end{syntax}
1789 Here, \N{Id} refers to either a constructor name or a datatype name.
1790
1791 \Subsection{Memory management}
1792
1793    By default, a datatype constructor calls the \verb|operator new| to
1794 allocated memory.  There are a few ways to change this behavior: (i)
1795 redefine the \verb|operator new| within the datatype using refinement; or
1796 (ii) inherit from a class that redefines this method.
1797
1798    If a {\em placement} \verb|operator new| is defined in datatype,
1799 the placement constructor syntax can be used to invoke this
1800 operator:
1801 \begin{syntax}
1802    \NDEF{Placement\_Cons}
1803    & \IS & \N{Cons} \T{'(} \LIST{\N{Exp}} \T{)} \T{(}
1804    \OPT{\LIST{\N{Exp}}} \T{)} \\
1805    & \OR & \N{Cons} \T{'(} \LIST{\N{Exp}} \T{)} \T{\{}
1806    \OPT{\LIST{{\N{Id} = \N{Exp}}}}
1807      \T{\}} \\
1808 \end{syntax}
1809
1810 For example, in the following the extra parameter \verb|mem| will
1811 be passed to placement \verb|new| operator.
1812 \begin{verbatim}
1813   Exp e1 = INT'(mem)(1);
1814   Exp e2 = VAR'(mem){ name = "foo", typ = t };
1815 \end{verbatim}
1816
1817 \Subsubsection{Garbage collection}
1818
1819   The support library contains an implementation\footnote{The implementation
1820 has only been tested on Linux, SunOS and Solaris.  Please contact the
1821 author for details
1822 if you'd like to port the collector to your particular platform.} of a
1823 conservative garbage collector using two algorithms: one using
1824 a mostly copying scheme\cite{Mostly-copying,Gen-mostly-copying}
1825 and one using mark-sweep.
1826
1827   A datatype can be defined to be garbage collectable with the
1828 qualifier \TDEF{collectable}.  In the following example, the datatype
1829 \T{Exp} is will be allocated from the a garbage collected heap instead
1830 of the default heap.  Since it references another
1831 user defined garbage collectable object
1832 \verb|SymbolTable|, the pointer to that symbol table is also marked
1833 as \T{collectable}.
1834 \begin{verbatim}
1835    // SymbolTable is collectable
1836    class SymbolTable: public GCObject
1837    {
1838    public:
1839       class Id : public GCObject { ... };
1840       ...
1841    };
1842
1843    datatype Exp :: collectable
1844       = INT int
1845       | VAR (collectable SymbolTable::Id, collectable SymbolTable *)
1846       | ADD Exp, Exp
1847       | SUB Exp, Exp
1848       | MUL Exp, Exp
1849       | DIV Exp, Exp
1850       ;
1851 \end{verbatim}
1852
1853 The corresponding \TDEF{instantiate datatype} statement for \T{Exp}
1854 will generate the appropriate tracing methods for garbage collection.
1855
1856 Please see appendix~\ref{appendix:gc} for more details concerning
1857 garbage collection.
1858
1859 \Subsubsection{Persistence}
1860
1861   Datatypes can be interfaced with the persistent object library
1862 supplied with the translator using the \TDEF{persistent} qualifier.
1863 A persistent object defines the protocol for serializing its representation
1864 into an architecture independent format on a persistence stream.
1865
1866 For example, the following datatype declaration defines a persistent
1867 type \verb|Exp|.  Note that a pointer to the symbol table
1868 is annotated with the \T{persisent} qualifier to signify that the object
1869 associated with the pointer should be traversed during the serialization
1870 phase.
1871 \begin{verbatim}
1872    // SymbolTable is persistent
1873    class SymbolTable: public PObject
1874    {
1875    public:
1876       class Id : public PObject { ... };
1877       ...
1878    };
1879
1880    datatype Exp :: persistent
1881       = INT int
1882       | VAR (persistent SymbolTable::Id, persistent SymbolTable *)
1883       | ADD Exp, Exp
1884       | SUB Exp, Exp
1885       | MUL Exp, Exp
1886       | DIV Exp, Exp
1887       ;
1888 \end{verbatim}
1889
1890 A {persistent object id} {\em must} be provided with each distinct
1891 datatype.  Object ids are used to identify the classes
1892 of persistent objects.  Thus they must be unique within an application.
1893 Each persistent type tag is simply a string, and can be defined using
1894 the \TDEF{refine persistent} statement.
1895
1896 For example, the following statement declares the persistent object
1897 id for type \verb|Exp| above to be \verb|"Simple expressions"|.
1898 \begin{verbatim}
1899    refine persistent Exp => "Simple expressions";
1900 \end{verbatim}
1901
1902 The corresponding \TDEF{instantiate datatype} statement will generate
1903 the appropriate methods to communicate with a persistent stream.
1904 \begin{verbatim}
1905   instantiate datatype Exp;
1906 \end{verbatim}
1907
1908 To write an object $e$ to the datafile \verb|"foo"|, we can say:
1909 \begin{verbatim}
1910    #include <AD/persist/postream.h>
1911    ofstream out("foo");
1912    Postream pout(out);
1913    pout << e;
1914    out.close();
1915 \end{verbatim}
1916
1917 To read the object back from the file, we can say:
1918 \begin{verbatim}
1919    #include <AD/persist/pistream.h>
1920    EXP e;
1921    ifstream in("foo");
1922    Pistream pin(in);
1923    e = (Exp)read_object(pin);
1924    in.close();
1925 \end{verbatim}
1926
1927 For more details concerning persistent streams and persist objects,
1928 please see directory \verb|<AD/persist>| for details.
1929
1930 \Section{Inference} \label{sec:inference}
1931
1932    Semantic processing
1933 in compilers and other language processors, such as data flow analysis,
1934 can frequently be specified as in a rule-based,
1935 logical deductive style.  In \Prop, deductive inference using forward
1936 chaining\footnote{
1937 The current
1938 implementation of {\sf Prop} translates inference into very naive code.
1939 This feature will be phased out and replaced by
1940 the more general and much faster graph rewriting mechanism.
1941 }
1942 is provided as a built-in mechanism, orthogonal
1943 to pattern matching and rewriting, for manipulating user-defined
1944 algebraic datatypes.
1945
1946 Similar to syntax classes, {\bf inference classes} may be used
1947 for data encapsulation.  An inference class is a combination of a \Cpp{}
1948 class, a database of inference relations, and a collection of inference rules
1949 of the form {\em lhs \verb|->| rhs}.  The lhs of an inference rule is a set
1950 of patterns in conjunctive form.  During the inference process, a rule is
1951 fired when its lhs condition is satisfied.  A fired rule then executes
1952 the corresponding rhs action, which may assert or retract additional relations
1953 from the database.   Using multiple inheritance, it is possible to combine
1954 a rewriting class with an inference class such that the rewriting process
1955 generates new relations to drive the inference process, or vice versa.
1956
1957 \Subsection{An Example}
1958 Datatype relations are not a distinct kind of data structure but are in fact
1959 simply algebraic datatypes declared to have a \TDEF{relation}
1960 attribute.  For example, in
1961 the following definition three relation types \verb|Person|, \verb|Parent|
1962 and \verb|Gen| are defined.
1963
1964 \begin{verbatim}
1965    datatype Person :: relation = person (const char *)
1966        and  Parent :: relation = parent (const char *, const char *)
1967        and  Gen    :: relation = same_generation (const char *, const char *);
1968
1969    instantiate datatype Person, Parent, Gen;
1970 \end{verbatim}
1971
1972 Using these relations we can define an inference class that computes whether
1973 two persons are in the same generation.   Nine axioms are defined (i.e.
1974 those whose lhs are empty) in the following.  The two inference rules
1975 state that (1) a person is in the same generation as him/herself,
1976 and (2) two persons are in the same generation if
1977 their parents are in the same generation.
1978
1979 \begin{verbatim}
1980    inference class SameGeneration {};
1981
1982    inference SameGeneration
1983    {  -> person("p1") and person("p2") and
1984          person("p3") and person("p4") and
1985          person("p5");
1986
1987       -> parent("p1", "p2") and
1988          parent("p1", "p3") and
1989          parent("p2", "p4") and
1990          parent("p3", "P5");
1991
1992       person ?p -> same_generation (?p, ?p);
1993
1994       parent (?x, ?y) and parent (?z, ?w) and same_generation (?x, ?z)
1995       -> same_generation(?y, ?w);
1996    };
1997 \end{verbatim}
1998
1999 In general, datatypes qualified as \verb|relation|s will inherit
2000 from the base class \verb|Fact|, while a rewrite class definition
2001 implicitly defines two member functions used to assert and retract facts
2002 in the internal database.  For example, in the above example, the following
2003 protocol will be automatically generated by the inference compiler.
2004
2005 \begin{verbatim}
2006    class SameGeneration : ...
2007    {
2008    public:
2009        virtual Rete&    infer       ();       // start the inference process
2010        virtual ReteNet& operator += (Fact *); // assert fact
2011        virtual ReteNet& operator -= (Fact *); // retract fact
2012    };
2013 \end{verbatim}
2014
2015
2016
2017 Using these methods, an application can insert or remove relations
2018 from an inference class.  This will in turn trigger any attached inference
2019 rules of the class.
2020
2021 \Subsubsection{Another example}
2022
2023 Consider the following example, which is used to compute Pythagorean
2024 triangles.  Only one axiom and two rules are used.  The axiom and the
2025 first rule are used to assert the relations \verb|num(1)| to \verb|num(n)|
2026 into the database, where \verb|n| is limited by the term \verb|limit(n)|.
2027 The second inference rule is responsible for printing out only
2028 the appropriate combinations of numbers.
2029
2030 \begin{verbatim}
2031    datatype Number :: relation = num int | limit int;
2032
2033    inference class Triangle {};
2034
2035    inference Triangle
2036    {  ->  num 1;
2037
2038           num m
2039       and limit n | n > m
2040       ->  num (m+1);
2041
2042           num a
2043       and num b
2044       and num c | a < b && b < c && a*a + b*b == c*c
2045       ->  { cout << a << " * " << a << " + "
2046                  << b << " * " << b << " = "
2047                  << c << " * " << c << "\n";
2048           };
2049    };
2050 \end{verbatim}
2051
2052 Now, to print all the triangle identities lying in range of $1$ to $100$,
2053 we only have to create an instance of the inference class, insert the
2054 limit, and start the inference process, as in below:
2055
2056 \begin{verbatim}
2057    Triangle triangle;
2058    triangle += limit(100);
2059    triangle.infer();
2060 \end{verbatim}
2061
2062 \Subsection{Inference Class}
2063 \INCOMPLETE
2064
2065 \Subsection{Inference Rules}
2066 \INCOMPLETE
2067
2068 \Section{Tree Rewriting} \label{sec:tree-rewriting}
2069
2070 \Prop's tree rewriting mechanism let us transform a tree in
2071 algebraic datatype form into another tree according to a set of
2072 {\em rewriting rules}.  Unlike plain pattern matching described
2073 in section~\ref{sec:pattern-matching}, which only apply to the
2074 root of a tree, rewriting rules are applicable to all parts of
2075 the tree.  This allows the user to concentrate on developing the
2076 set of transformations; the actual traversal of the tree object
2077 is handled implicitly by \Prop.  When used properly, this
2078 mechanism has an advantage over plain pattern matching since
2079 rewriting rules remain valid even when a new variant to a datatype
2080 is introduced.
2081
2082 In \Prop, a rewriting system is developed in a similar manner as a parser.
2083 The first phase requires the user to defines a {\em rewrite class}
2084 to encapsulate the rewriting rules.
2085
2086 Frequently, the rewriting mechanism is used to collect information about
2087 a tree structure; furthermore, more transformation rules are {\em conditional}:
2088 i.e. we want them to be applicable only if certain conditions are satisfied.
2089 We can accomplish both of these tasks by defining data members and methods
2090 in the rewriting class.  These are accessible during the rewriting
2091 process.  Information collected during rewriting can be stored within
2092 the data members.
2093
2094 In the rewriting formalism, equational
2095 rules of the form {\em lhs} \verb|->| {\em rhs} are specified by the user.
2096 During processing, each instance of the lhs in a complex tree is replaced
2097 by an instance of the rhs, until no such replacement is possible.
2098 Equational rules can often be used to specify semantics based simplification
2099 (e.g. constant folding and simplification based on simple algebraic
2100 identities) or transformation(e.g. code selection in a compiler
2101 backend\cite{codegen-with-trees}).
2102
2103 Unlike plain pattern matching, however, the structural traversal process
2104 in rewriting is implicitly inferred from the type structure of an
2105 algebraic datatype, as specified in its definition.
2106
2107 There are two main forms of rewriting modes available:
2108 \begin{itemize}
2109    \item The first is {\bf normalization} mode: a given tree is reduced
2110 using the matching rules until no more redexes are available.  There
2111 are two modes of operations available:
2112        \begin{itemize}
2113           \item in {\bf replacement} mode, the redex of a tree will be
2114 physically overwritten.
2115           \item in {\bf applicative} mode, on the other hand, a new
2116 tree corresponding to the replacement value will be constructed.
2117        \end{itemize}
2118      Replacement mode is used as the default since it is usually
2119 the more efficient of the two.
2120    \item The second form is {\bf reduction} and {\bf transformation}.
2121 In this mode a tree parse of the input term is computed.  If cost functions
2122 are attached to the rules, then they will also be used to determine a
2123 minimal cost reduction sequence.  During this process attached actions of
2124 a rule may be invoked to synthesize new data.
2125 \end{itemize}
2126
2127    Each independent set of rewriting rules in \Prop{} is encapsulated
2128 in its own {\bf rewrite class}.  A rewrite class is basically a normal \Cpp{}
2129 class with a set of rewriting rules attached.   During rewriting, the
2130 data members and the member functions are visible according to the normal
2131 \Cpp{} scoping rules.  This makes it is easy to encapsulate additional data
2132 computed as a side effect during the rewriting process.
2133
2134 \Subsection{A rewriting example}
2135
2136    Consider an abbreviated simplifier for the well-formed formula datatype
2137 \verb|Wff| defined in the section~\ref{sec:Wff}.  The rewrite class for this
2138 can be defined as follows.  Since there is no encapsulated data in this
2139 example, only the default constructor for the class needs to be defined.
2140 A rewrite class definition requires the traversal list to be
2141 specified.  This is simply a list of datatypes
2142 involved in the rewriting traversal process.
2143 In this instance only \verb|Wff| is needed.
2144
2145 \begin{verbatim}
2146    rewrite class Simplify (Wff)
2147    {
2148    public:
2149       Simplify() {}
2150    };
2151 \end{verbatim}
2152
2153    The rewrite rules for the simplifier can then be specified succinctly as
2154 follows.  Like the \verb|match| statement, in general the rhs
2155 of a rewrite rule can be any statement.  A special statement
2156 \verb|rewrite(e)| can be used to rewrite the current redex into another form.
2157 If the rhs is of the form \verb|rewrite(e)|, then it can be abbreviated
2158 to \verb|e|, as in below:
2159 \begin{verbatim}
2160    rewrite Simplify
2161    {  And(F,  _):      F
2162    |  And(_,  F):      F
2163    |  And(T,  ?X):     ?X
2164    |  And(?X, T):      ?X
2165    |  Or (T,  _):      T
2166    |  Or (_,  T):      T
2167    |  Or (F,  ?X):     ?X
2168    |  Or (?X, F):      ?X
2169    |  Not(Not(?X)):    ?X
2170    |  Not(And(?X,?Y)): Or(Not(?X), Not(?Y))
2171    |  Not(Or(?X,?Y)):  And(Not(?X), Not(?Y))
2172    |  Implies(?X,?Y):  Or(Not(?X), ?Y)
2173    |  And (?X, ?X):     ?X
2174    |  Or (?X, ?X):      ?X
2175    // etc ...
2176    };
2177 \end{verbatim}
2178
2179 The rewrite class definition creates a new class of the same name.  This new
2180 class defines an implicit \verb|operator ()| with the protocol below.  This
2181 member function can be invoked to perform the rewriting in a functional
2182 syntax.
2183
2184 \begin{verbatim}
2185    class Simplify : ... {
2186    {  ...
2187    public:
2188       void operator () (Wff);
2189       // Wff operator () (Wff); // if rewrite class is applicative
2190    };
2191
2192    Wff wff = ...;
2193    Simplify simplify; // create a new instance of the rewrite class
2194    simplify(wff);    // rewrite the term wff
2195 \end{verbatim}
2196
2197
2198 \Subsubsection{Conditional rewriting and actions}
2199
2200    Rewriting rules may be guarded with predicates to limit their
2201 applicability.  In addition, the {\em rhs} of a rewrite rule is not limited
2202 to only a replacement expression: in general, any arbitrarily complex sequence
2203 of code may be used.  For example, in the following set of rewriting
2204 rules we use guards to prevent undesirable replacements to be made during
2205 expression constant folding:
2206
2207 \begin{verbatim}
2208    rewrite ConstantFolding
2209    {  ADD (INT a, INT b):  INT(a+b)
2210    |  SUB (INT a, INT b):  INT(a-b)
2211    |  MUL (INT a, INT b):
2212       {  int c = a * b;                      // silent overflow
2213          if (a == 0 || b == 0 || c / b == a) // no overflow?
2214          {  rewrite(INT(c)); }
2215          else
2216          {  cerr << "Overflow in multiply\n"; }
2217       }
2218    |  DIV (INT a, INT b) | b == 0:  { cerr << "Division by zero\n"; }
2219    |  DIV (INT a, INT b):  INT(a/b)
2220    |  // etc...
2221    };
2222 \end{verbatim}
2223
2224 \Subsection{Rewrite class} \label{sec:rewrite-class}
2225
2226 The syntax of a rewrite class declaration is as follows:
2227
2228 \begin{syntax}
2229 \NDEF{Rewrite\_Class\_Decl} & \IS & \TDEF{rewrite class} \Id
2230             \T{(} \LIST{\N{TypeExp}} \T{)} \\
2231    & & \quad \OPT{\T{:} \N{Inherit\_List}}
2232         \OPT{\T{::} \MORE{\N{Rewrite\_Mode}}{}} \\
2233    & & \quad \T{\{} \N{Class\_Body} \T{\}} \T{;} \\
2234 \NDEF{Rewrite\_Mode} & \IS & \TDEF{treeparser} \\
2235                   & \OR & \TDEF{applicative} \\
2236                   & \OR & \TDEF{topdown} \\
2237 \end{syntax}
2238
2239 This is simply the usual \Cpp{} class declaration syntax
2240 extended to the following information:
2241 \begin{description}
2242    \item[a traversal list] enclosed in parenthesis.  The traversal
2243 list of a rewrite class defines the set of types that rewriting
2244 should traverse.  If a datatype object contains an argument of a type
2245 not listed in the traversal list, its value will not be altered.
2246    \item[rewrite mode] this defines the rewriting mode of the
2247 rewrite class.
2248 \end{description}
2249
2250 \Subsection{Rewriting rules}
2251
2252 Rewriting rules are specified in a rewrite declaration.
2253 The syntax of a rewriting specification is as follows:
2254
2255 \begin{syntax}
2256 \NDEF{Rewrite\_Decl}
2257    & \IS & \TDEF{rewrite} \Id & variant 1 \\
2258    &    & \T{\{} \OPT{\N{Index\_Decl}}
2259         \ALT{\T{case} \N{Rewrite\_Rules}}{} \T{\}} \\
2260    & \OR & \T{rewrite} \Id & variant 2 \\
2261    &    & \T{\{} \OPT{\N{Index\_Decl}} \ALT{\N{Rewrite\_Rules}}{|} \T{\}} \\
2262 \NDEF{Rewrite\_Rule}
2263   & \IS & \OPT{\N{Rewrite\_Modifier}} \OPT{\N{Id} \T{->}} \\
2264   & & \quad \Pat \OPT{\N{Guard}} \OPT{\N{Cost}} \T{:} \N{Rewrite\_Action} \\
2265 \NDEF{Rewrite\_Modifier}
2266   & \IS & \T{bottomup:} & bottom up mode \\
2267   & \OR & \T{topdown:} & top down mode \\
2268   & \OR & \T{before:} & before actions \\
2269   & \OR & \T{preorder:} & preorder actions \\
2270   & \OR & \T{postorder:} & postorder actions \\
2271 \NDEF{Rewrite\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
2272                   & \OR & \Exp \\
2273 \end{syntax}
2274
2275 The name of a rewrite declaration should match the name of
2276 a rewrite class.
2277 The two syntactic forms of \T{rewrite} have equivalent semantics.
2278
2279 The special statement \T{rewrite}($e$) may be used
2280 inside the rhs action to rewrite the current redex into $e$.
2281 Note that \T{rewrite}($e$) is a branching statement; statements
2282 after \T{rewrite} are not reachable.
2283
2284 \Subsection{Rewriting modifiers}
2285 In version 2.3.3 onward, rewrite rules can be modified by the
2286 keywords: \T{bottomup:}, \T{topdown:}, \T{before:}, \T{preorder:}
2287 and \T{postorder:}.   These modifiers
2288 alter the order in which a set of rewriting rules is applied.
2289
2290 These modifiers act like delimiters, similar
2291 to the way scoping keywords like \T{public:}, \T{protected:} and
2292 \T{private:} are used to delimit declarations in C++.
2293 For instance, a set of rules prefixed by the modifier \T{topdown:}
2294 will utilize the topdown strategy for reduction.  If no modifier are
2295 specified, then bottom-up will be the default.
2296
2297 Note that all five modes of rules can be used together in a rewriting system
2298 under tree rewriting mode (see \ref{sec:rewriting-modes})
2299 and their executions are intermixed together.
2300
2301 The semantics of these modifiers are as follows:
2302 \begin{description}
2303 \item[\TDEF{bottomup:}] This specifies the modified rewriting rules should
2304 be applied in the default bottom-up mode.  In this mode, the
2305 innermost/leftmost redex is chosen as the next redex.  So reduction
2306 occurs in a bottomup manner.  This means that if a term $t$ is a redex
2307 and if the term $t$ contains a subterm $t'$ which is also a redex,
2308 $t'$ will be reduced before $t$.
2309 \item[\TDEF{topdown:}]  This specifies the modified rewriting rules should be
2310 applied in a topdown mode.  In this mode, the rewriter
2311 will first try to locate the outermost/leftmost redex.  Reduction
2312 will occur in a (mostly) topdown manner.
2313 \item[\TDEF{before:}]  This specifies that the modified rewriting rules
2314 should be tried before the topdown phase.
2315 In addition, unlike topdown mode, if state-caching if used (see
2316 \ref{sec:state-caching}), the rules are only tried once.
2317 Otherwise, these act just like the preorder modifier, described below.
2318 \item[\TDEF{preorder:}]  This specifies that
2319 the modified rewriting rules should be
2320 tried only at the preorder traversal of a term.
2321 \item[\TDEF{postorder:}]  This specifies the rewriting rules should be
2322 tried only at the postorder traversal of a term.
2323 \end{description}
2324
2325 \Subsubsection{Rewriting modifier example} \label{sec:rewriting-modifier}
2326
2327 We'll use the following example, extracted from a query optimizer for
2328 a subset of the relational calculus, to demonstrate the use
2329 of the rewriting modifiers.
2330
2331 The query optimizer transforms the following abstract syntax, defined
2332 as a set of \Prop's datatypes.
2333 \begin{verbatim}
2334 datatype List<T> = #[] | #[ T ... List<T> ];
2335 datatype Literal = INT int
2336                  | STRING const char *
2337                  | BOOL Bool
2338
2339 and      Exp     = OP Id, Exps
2340                  | APP Id, Exps
2341                  | LIT Literal
2342                  | ID Id
2343                  | TUPLE Exps                  // [ E1, E2, ... En ]
2344                  | FORALL Id, Exp, Exp         // forall x in E1 . E2
2345                  | EXISTS Id, Exp, Exp         // exists x in E1 . E2
2346                  | GUARD (Exp, Exp)            //
2347                  | GENERATOR (Ids, Exps, Exp)  // [x_1,...,x_n] in X_1*...*X_n
2348                  | LET  (Id, Exp, Exp)
2349
2350 law inline Int  i    = LIT(INT i)
2351 |   inline Boolean b = LIT(BOOL b)
2352 |          True      = Boolean true
2353 |          False     = Boolean false
2354 |   inline And a,b   = OP("and",#[a,b])
2355 |   inline Or  a,b   = OP("or", #[a,b])
2356 |   inline Not a     = OP("not",#[a])
2357 |   inline Eq a,b    = OP("=",  #[a,b])
2358 |   inline Ne a,b    = OP("/=", #[a,b])
2359 |   inline Gt a,b    = OP(">",  #[a,b])
2360 |   inline Ge a,b    = OP(">=", #[a,b])
2361 |   inline Lt a,b    = OP("<",  #[a,b])
2362 |   inline Le a,b    = OP("<=", #[a,b])
2363 |   inline In a,b    = OP("in", #[a,b])
2364 |   inline Union a,b = OP("union", #[a,b])
2365 |   inline Count a   = OP("#", #[a])
2366
2367 where type Id         = const char *
2368 and        Ids        = List<Id>
2369 and        Exps       = List<Exp>
2370 ;
2371 \end{verbatim}
2372
2373 Note that existential and
2374 universal quantifiers are represented by the constructors \T{FORALL}
2375 and \T{EXISTS} respectively.  For example, $\exists x \in A.p$ is
2376 presented as the term $\T{EXISTS}(x,A,p)$.  Here $x$ is a {\em binding}
2377 occurrence for $x$; note that the variable $x$ may occur within
2378 the predicate $p$.  In addition, a list comprehension
2379 expression such as
2380 \[ \{ e : [x_1,\ldots,x_n] \in X_1
2381 \times \ldots \times X_n\ |\ p \} \]
2382 is represented as the compound term
2383 \[
2384   \T{GENERATOR}(\#[x_1,\ldots,x_n],\#[X_1,\ldots,X_n],\T{GUARD}(p,e))
2385 \]
2386
2387 Suppose we'll like to rename all variables in a query $Q$
2388 so that no two binding occurrences are given the same name.
2389 This can be easily accomplished using a combination
2390 of \T{preorder:} and \T{postorder:} rules in the following manner:
2391 \begin{enumerate}
2392 \item[(i)] Use preorder rules to check for new variable bindings,
2393 for example in $\T{EXISTS}(x,A,p)$.
2394 For each new binding occurrence for $x$, create
2395 a new name for $x$.
2396 These will be performed before the subterms $A$ and $p$ are traversed.
2397 \item[(ii)] Whenever a variable $x$ is found during the sub-traversal
2398 of $A$ and $p$, rename the variable by looking up its new name.
2399 \item[(iii)] Finally, use postorder rules to remove the current
2400 variable substitution whenever we're exiting a binding occurrence.
2401 \end{enumerate}
2402
2403 For example, the following set of rewriting rules accomplish this renaming
2404 task for our query language using exactly this method.
2405 \begin{verbatim}
2406 rewrite RemoveDuplicateNames
2407 {
2408    // We use before and after rules to remove duplicates from variable names.
2409    // As each new binding occurrence is found, we enter a new binding
2410    // into the current environment.  Identifiers found within
2411    // the binding occurrence are then renamed.
2412
2413 preorder: // insert new bindings
2414    EXISTS(x,_,_):     { new_binding(x); }
2415 |  FORALL(x,_,_):     { new_binding(x); }
2416 |  GENERATOR(xs,_,_): { new_binding(xs); }
2417 |  LET(x,_,_):        { new_binding(x); }
2418
2419          // rename variables
2420 |  ID x:             { rename(x); }
2421
2422 postorder: // removes the binding
2423 |  EXISTS(x,A,_):      { old_binding(x); }
2424 |  FORALL(x,A,_):      { old_binding(x); }
2425 |  GENERATOR(xs,As,_): { old_binding(xs); }
2426 |  LET(x,_,_):         { old_binding(x); }
2427 };
2428 \end{verbatim}
2429
2430 Note that we have accomplished a depth first traversal using rewriting
2431 without writing any traversal code!  As a side benefit,
2432 since the traversal is automatically determined by
2433 the structure of the datatypes, we do not have to rewrite this
2434 renaming code even if the abstract syntax of the query language is
2435 extended, as long as no additional binding operators are added.
2436
2437 \Subsection{The \T{rewrite} statement}
2438
2439    While the rewrite class construct provides a very general abstraction
2440 for rewriting, in general its full power is unneeded.  It is often
2441 convenient to be able to perform rewriting on a term without having
2442 to make a new name for a class just for the occasion, especially if member
2443 functions and member data are unneeded.  To accommodate these situations,
2444 the \verb|rewrite| statement is provided
2445 to perform a set rewriting transformations on a term without having
2446 to define a temporary rewrite class.   It is simply syntactic sugar
2447 for the more general rewrite class and rewrite
2448 rules specifications.
2449 For example, a simplify routine for type \verb|Exp| defined above can be
2450 specified as follows:
2451
2452 \begin{verbatim}
2453    Exp simplify (Exp e)
2454    {  // transformations on e before
2455       rewrite (e) type (Exp)
2456       {  ADD (INT a, INT b):  INT(a+b)
2457       |  SUB (INT a, INT b):  INT(a-b)
2458       |  MUL (INT a, INT b):  INT(a*b)
2459       |  ...
2460       }
2461       // transformations on e after
2462       return e;
2463    }
2464 \end{verbatim}
2465
2466 The \verb|rewrite| normally performs the replacement in-place.
2467 An applicative version of the same can be written as follows\footnote{The
2468 variable {\tt e} is assigned the new copy.}:
2469
2470 \begin{verbatim}
2471    Exp simplify (Exp e)
2472    {  rewrite (e) => e type (Exp)
2473       {  ADD (INT a, INT b):  INT(a+b)
2474       |  SUB (INT a, INT b):  INT(a+b)
2475       |  MUL (INT a, INT b):  INT(a*b)
2476       |  ...
2477       }
2478       return e;
2479    }
2480 \end{verbatim}
2481
2482 The syntax of the rewrite statement is as follows.  The
2483 traversal list of the set of rewrite rule is listed next to
2484 the keyword \T{type}.
2485
2486 \begin{syntax}
2487 \NDEF{Rewrite\_Stmt}
2488    & \IS & \TDEF{rewrite} \T{(} \Exp \T{)} \OPT{\T{=>} \N{Exp}} \\
2489    &     & \quad \T{type} \T{(} \LIST{\N{TypeExp}} \T{)} \\
2490    &     & \T{\{} \OPT{\N{Index\_Decl}} \N{Rewrite\_Rules} \T{\}} \\
2491    & \OR & \TDEF{rewrite} \T{(} \N{Exp} \T{)} \OPT{\T{=>} \N{Exp}} \\
2492    &    & \quad \TDEF{type} \T{(} \LIST{\N{TypeExp}} \T{)} \T{of} \\
2493    &    & \quad \OPT{\N{Index\_Decl}} \N{Rewrite\_Rules} \\
2494    &    & \T{end} \T{rewrite} \T{;} \\
2495 \end{syntax}
2496
2497 \Subsubsection{Rewriting modes} \label{sec:rewriting-modes}
2498    There are two basic modes of operations in a rewrite class: (i)
2499 {\em tree rewriting} and (ii) {\em tree parsing}.
2500 Tree rewriting repeatedly looks for redexes in a tree and transforms it into
2501 another tree.   There are two sub-modes of operations: (a) {\em applicative}
2502 and (b) {\em in-place}.  Applicative mode is specified using the
2503 \T{applicative} mode specifier when declaring a rewrite class.
2504 In this mode, a new tree is built from a
2505 bottom-up manner, and the subject tree is left unaltered.  The default
2506 mode is ``in-place.''  In this mode, the subject tree is overwritten
2507 with the transformed expression.
2508
2509    On the other hand, in tree parsing mode, the left hand side
2510 of a rewrite rules specification is treated as a tree grammar.
2511 The tree parser will look for a derivation of given tree object from
2512 the start non-terminal.
2513 A guarded rule will only be used if the guard evaluates to true.
2514
2515    If rules are annotated with cost expressions, then the
2516 tree parser will try to locate a derivation chain with a minimal total
2517 cost.  After a derivation tree is found, the tree can be repeated
2518 traversed.  The rhs actions will be invoked during this traversal process.
2519
2520 \Subsubsection{State caching optimization} \label{sec:state-caching}
2521
2522    The pattern matching process proceeds in a bottom up manner:
2523 for each subtree of the form $l(t_1,t_2,\ldots,t_n)$, $n \ge 0$,
2524 a state number is computed from the current tree label $l$ and the
2525 state numbers of its subtrees $t_1, t_2, \ldots, t_n$.  The set of matched
2526 rules can be directly computed from the current state number.
2527 This means that in the absence of redex replacements, pattern matching
2528 takes time proportional only $O(|t| + r)$, where $|t|$ is the size
2529 of the input tree $t$ and $r$ is time it takes to execute the rhs actions.
2530 That is, the runtime is insensitive to the number of rewrite rules or the
2531 complexity of the patterns.
2532
2533    However, if the input is a DAG rather than a tree\footnote{Rewriting
2534 of a generic graph structure is not recommended with \T{rewrite}.},
2535 or if replacements are performed, then the above analysis may not hold.
2536 Replacements during rewriting often require state information of the
2537 replacement term to be recomputed to further the matching process.
2538 Since computation of state can involve a complete traversal of a term,
2539 replacement can become expensive if the replacement term is large.
2540 For instance, consider the following replacement rule, which replaces
2541 all expressions of the form {\em 2*x} into {\em x+x}:
2542
2543 \begin{verbatim}
2544    rewrite class StrengthReduction
2545    {
2546       MUL (INT 2, ?x):  ADD(?x, ?x)
2547       ...
2548    }
2549 \end{verbatim}
2550
2551 \noindent
2552 Since the subterm \verb|?x| could be arbitrarily large, recomputing the
2553 state encoding for \verb|ADD(?x,?x)| naively takes time in proportion to the
2554 size of \verb|?x|.  However, since the subtree \verb|?x| occurs as part
2555 of the pattern, its state has already been computed, and it would be
2556 wasteful to recompute its state from scratch.
2557
2558 A similar observation can be made to the case when the input is a DAG,
2559 where many subcomponents are shared between nodes.  In order to speed
2560 up the rewriting process, the state of each shared copy of the nodes
2561 should be computed only once.
2562
2563 In order to speedup the state computation process in these situations,
2564 {\em \INDEX{state caching}} can be enabled by
2565 specifying a {\em rewriting index}.
2566 This can be accomplished in two ways: (i) use the qualifier
2567 \TDEF{rewrite} when defining a datatype.  This specifies the
2568 {\em primary index}.  Or alternatively, (ii) use index declarations
2569 to specify one or more {\em secondary indices}.
2570
2571 We'll first describe the first method.
2572 A \TDEF{rewrite} qualifier can be used
2573 in the definition of a datatype.  For instance:
2574
2575 \begin{verbatim}
2576    datatype Exp :: rewrite
2577       = INT (int)
2578       | ID  (const char *)
2579       | ADD (Exp, Exp)
2580       | SUB (Exp, Exp)
2581       | MUL (Exp, Exp)
2582       | DIV (Exp, Exp)
2583       ;
2584 \end{verbatim}
2585
2586 The \T{rewrite} qualifier tells the translator to generate an extra
2587 state field in the implementation of \T{Exp}.  This state field is implicitly
2588 updated during the rewriting process.  When the rewriter encounters a
2589 node with a cached state it can quickly short-circuit all unnecessary
2590 state computation.
2591
2592 {\bf A word of caution:} since each rewriting system computes
2593 its own encoding, rewriting systems should not be mixed if they shard the
2594 same index, i.e. actions of a
2595 rewriting system should not invoke another rewriting
2596 system on a term that is participating in the current rewriting system,
2597 if both systems use the same index.  This limitation can be overcome
2598 by using secondary indices, which we'll discuss next.
2599
2600 \Subsubsection{Specifying secondary indices}
2601
2602 From release 2.3.0 onward, it is possible to specify secondary indices
2603 within a rewriting system.  This makes it possible to invoke other
2604 rewriting systems within the execution of another, while retaining the
2605 state caching optimizations throughout.
2606
2607 There are two forms of secondary index: {\em internal index} and
2608 {\em external index}.  Internal indices are stored directly within
2609 a rewrite term and require the user to pre-allocate space for each term.
2610 In contrast, external indices are stored in a separate data structure
2611 such as a hash table.
2612
2613 The syntax of secondary index declarations is as follows:
2614 \begin{syntax}
2615 \NDEF{Index\_Decl} & \IS & \TDEF{index:} \LIST{\N{Index\_Spec}} \T{;} \\
2616 \NDEF{Index\_Spec} & \IS &
2617    \TypeExp \T{=} \T{none} & \C{Disable indexing} \\
2618    & \OR & \TypeExp \T{=} \N{Id} & \C{Internal index} \\
2619    & \OR & \TypeExp \T{=} \TDEF{extern} \N{Id} & \C{External index} \\
2620 \end{syntax}
2621
2622
2623 \Subsubsection{Using an internal index}
2624
2625 In order to make use of an index, certain member functions have to defined
2626 by the user.  Given an internal index named {\em foo}, the rewriter
2627 invokes two functions named
2628 \begin{quotation}
2629 \noindent
2630    {\tt int get\_{\em foo}\_state() const; } \\
2631    {\tt void set\_{\em foo}\_state(int); }
2632 \end{quotation}
2633 \noindent within the generated code.  The user should provide the
2634 implementations for these functions within the datatypes.
2635
2636 For example, reusing the well-formed formulas example (see
2637 section~\ref{sec:Wff}),
2638 an internal index on the datatype \T{Wff} can be implemented as follows:
2639 \begin{verbatim}
2640 #include <AD/rewrite/burs.h>
2641 class WffIndex {
2642    int state;
2643 public:
2644    WffIndex() : state(BURS::undefined_state) {}
2645    int get_wff_state() const { return state; }
2646    void set_wff_state(int s) { state = s; }
2647 };
2648
2649 datatype Wff : public WffIndex
2650    = F
2651    | T
2652    | Var     (const char *)
2653    | And     (Wff, Wff)
2654    | Or      (Wff, Wff)
2655    | Not     (Wff)
2656    | Implies (Wff, Wff)
2657    ;
2658
2659 ...
2660
2661 rewrite Simplify {
2662    index: Wff = wff;
2663    ...
2664 };
2665 \end{verbatim}
2666
2667 Here, the class \T{WffIndex} provides the internal index interface
2668 functions \verb|get_wff_state| and \verb|set_wff_state| expected by
2669 the rewriter.  Note that each term must be initialized with the
2670 state \verb|BURS::undefined_state|.  This constant is defined within
2671 the library include file \verb|<AD/rewrite/burs.h>|.
2672
2673 To enable the index, in the rewriter
2674 we specify that datatype \T{Wff} should make use of the index named \T{wff},
2675 using the \T{index:} declaration.
2676
2677 \Subsubsection{Using an external index}
2678
2679 External indices are specified in a similar manner.  Given a datatype
2680 {\em Foo} and an external index named {\em bar}, the rewriter invokes
2681 calls to the following functions:
2682 \begin{quotation}
2683 \noindent
2684    {\tt int get\_{\em bar}\_state({\em Foo}) const; } \\
2685    {\tt void set\_{\em bar}\_state({\em Foo}, int); }
2686 \end{quotation}
2687 Typically, these are implemented as member functions in the rewrite class.
2688
2689 The rewriter uses the function {\tt get\_{\em bar}\_state} to lookup
2690 previously cached information set by the function {\tt set\_{\em bar}\_state}.
2691 Typically, the implementation of the two functions can be implemented
2692 as hash tables, using the addresses of the terms as hash functions and
2693 pointer equality comparisons.   Note that caching can be {\em lossy}; i.e.
2694 it is perfectly alright for the cache to eliminate cached information to
2695 keep its size under control.  If no cache information is found,
2696 the function {\tt get\_{\em bar}\_state} should return
2697 \verb|BURS::undefined_state|.
2698
2699 \paragraph{Class \CLASSDEF{RewriteCache}}
2700    To make it easy for the users to implement their own external indices,
2701 two sample external index implementation are provided.  The first
2702 is the class \verb|RewriteCache| defined in the library file
2703 \verb|<AD/rewrite/cache.h>|.
2704
2705    The class \verb|RewriteCache| implements a simple fixed capacity
2706    cache with the following member functions.
2707 \begin{verbatim}
2708    RewriteCache();
2709    RewriteCache(int capacity);
2710   ~RewriteCache();
2711
2712    void clear();                   // after rewriting
2713    void initialize();              // before rewriting
2714    void initialize(int capacity);  // before rewriting/also set capacity
2715
2716    int capacity() const;
2717
2718    void set_state(const void * t, int s);
2719    int get_state(const void * t) const;
2720
2721    void invalidate(const void * t);
2722 \end{verbatim}
2723
2724 The user should call the function \verb|initialize| before rewriting
2725 in order to setup the cache and/or clear out previously cached result.
2726 The functions \verb|set_state| and \verb|get_state| can be used to
2727 implement the necessary state caching functions expected by the rewriter.
2728
2729 Returning to our \T{Wff} example, we have the following possible implementation
2730 of the external index:
2731 \begin{verbatim}
2732    rewrite class Simplify (Wff)
2733    {  RewriteCache cache;
2734    public:
2735       int get_wff_state(Wff t) const { return cache.get_state(t); }
2736       void set_wff_state(Wff t, int s) { cache.set_state(t,s); }
2737    };
2738
2739    rewrite Simplify
2740    {  index: Wff = extern wff;
2741       ...
2742    };
2743 \end{verbatim}
2744
2745 \begin{Tips}
2746 If manual memory management is used,
2747 a term should not be deallocated from memory if a cache
2748 contains a handle to it.  The method \verb|invalidate| can be used to
2749 make sure the entry associated with a term is removed from the cache.
2750 \end{Tips}
2751
2752 \paragraph{Class \CLASSDEF{GCRewriteCache}}
2753
2754 {\em This class has not be completely tested.}
2755
2756 Since a garbage collectable object is not reclaimed by the collector
2757 whenever there exists a reference to it, using the class
2758 \verb|RewriteCache| may create problems since objects are not reclaimed
2759 if it is referenced by the cache.  The template class \verb|GCRewriteCache|
2760 solves this problem by using {\em weak pointers} in its internal
2761 implementation.
2762
2763 The template class \verb|GCRewriteCache<T>| expects \verb|T| to be
2764 a class derived from \verb|GCObject|.
2765 This class implements the following functions:
2766 \begin{verbatim}
2767    GCRewriteCache();
2768    GCRewriteCache(int capacity);
2769   ~GCRewriteCache();
2770
2771    void clear();                   // after rewriting
2772    void initialize();              // before rewriting
2773    void initialize(int capacity);  // before rewriting/also set capacity
2774
2775    int capacity() const;
2776
2777    void set_state(T * t, int s);
2778    int get_state(T * t) const;
2779
2780    void invalidate(T * t);
2781 \end{verbatim}
2782
2783 The usage of these functions are identical to that of class
2784 \verb|RewriteCache|.
2785
2786 \Subsection{Short circuiting rewrite paths with \TDEF{cutrewrite}}
2787
2788    From version 2.3.0 onward, the \T{cutrewrite}($e$)
2789 statement may be used wherever a \T{rewrite}($e)$ statement is expected.
2790 The \T{cutrewrite} statement can be used to selectively ignore certain
2791 subterms during the rewriting process.
2792
2793 The semantics of \T{cutrewrite}($e$) is to replace the current redex
2794 with $e$ but do not bother with looking for other redexes in the replacement
2795 term.  This means that the replacement term will not be re-traversed
2796 immediately.  Furthermore, the replaced term will not match any other
2797 left hand side sub-patterns except pattern variables and wildcards.
2798
2799    This feature is very useful for preventing the rewriter from
2800 looking inside certain terms.
2801
2802    To demonstrate, consider the simple following example:
2803 \begin{verbatim}
2804    datatype Exp = NUM of int
2805                 | ADD of Exp, Exp
2806                 | SUB of Exp, Exp
2807                 | MUL of Exp, Exp
2808                 | DIV of Exp, Exp
2809                 | FINAL of Exp
2810                 ;
2811    Exp e = ADD(NUM(1),MUL(NUM(2),NUM(3)));
2812 \end{verbatim}
2813
2814 Suppose we want to find all numbers within
2815 the expression \T{e} and increment their values by 1.
2816 The straight forward way of writing:
2817 \begin{verbatim}
2818    rewrite (e) type (Exp) {
2819       NUM x: NUM(x+1)
2820    }
2821 \end{verbatim}
2822 \noindent will not work, since the replacement term is a new redex,
2823 and the rewriting system will not terminate.
2824
2825 We can reimplement the rewriting system as a two stage process.
2826 First, we mark all terms that we do not want to be changed; whenever we
2827 find a marked term, we execute a \T{cutrewrite} to prevent the term
2828 from being changed.  Then we unmark the terms.
2829 \begin{verbatim}
2830    rewrite (e) type (Exp) {
2831       NUM x:   FINAL(NUM(x+1));
2832    preorder:
2833       FINAL _: cutrewrite;
2834    }
2835    rewrite (e) type (Exp) {
2836       FINAL x: x
2837    }
2838 \end{verbatim}
2839
2840 In the first rewrite statement, replacement terms that we
2841 want to stay fixed are encapsulated
2842 within the auxiliary \T{FINAL} constructor.
2843 Recall that preorder rules are tried before the subterms of a redex are
2844 reduced.  Thus we can make sure all that
2845 all terms that are arguments to a \T{FINAL} term is left unaltered
2846 by adding a \T{preorder} rule that executes a \T{cutrewrite}\footnote{
2847 If an expression argument is not given to a \T{cutrewrite} statement, then
2848 the redex is left unchanged.}
2849 Finally, after the first rewrite statement finishes, we use
2850 the second rewrite statement to remove all \T{FINAL} terms generated
2851 in the first.
2852
2853 The semantics of \T{cutrewrite} needs further explanation.  Consider
2854 the following rewrite system:
2855 \begin{verbatim}
2856    rewrite (e) type (Exp) {
2857       NUM x:          cutrewrite(NUM(x+1))
2858    |  MUL(X, NUM 1):  X
2859    |  MUL(NUM 1,X):   X
2860    }
2861 \end{verbatim}
2862 Given the term \verb|MUL(NUM(0),NUM(2))|, the subterms \verb|NUM(0)|
2863 and \verb|NUM(1)|
2864 will be rewritten by the first rule
2865 into \verb|NUM(1)| and \verb|NUM(3)| respectively.
2866 Furthermore, the replacement term \verb|NUM(1)| will {\em not} match
2867 the left subpattern in \verb|MUL(NUM 1,X)|.  Thus the result of the rewrite
2868 will be \verb|MUL(NUM(1),NUM(3))|.
2869
2870 \begin{Tips}
2871 Since the semantics of a rewriting system with \T{cutrewrite} can
2872 be tricky to analyze, its use should be avoided as much as possible.
2873 \end{Tips}
2874
2875 \Subsection{Conditional failure with \TDEF{failrewrite}}
2876
2877    From version 2.3.0 onward, the \T{failrewrite}
2878 statement may be used wherever a \T{rewrite} statement is expected.
2879 The \T{failrewrite} statement is an alternate way of writing
2880 conditional rewriting rules, and it is often useful when the condition
2881 to be checked is too complex to be written as a guard.
2882
2883 When a \T{failrewrite} statement is executed within the rhs action of a
2884 rewrite rule, the current pattern is rejected and the rewriter
2885 will proceed to try other matches.
2886
2887 For instance, suppose the current redex is
2888 \verb|DIV(INT 0,INT 0)| and the following rules are specified within
2889 a rewriting system:
2890 \begin{verbatim}
2891    DIV(INT X,INT Y): { if (Y == 0) failrewrite; else rewrite(NUM(X/Y); }
2892 |  DIV(INT 0,Y):     NUM(0)
2893 \end{verbatim}
2894
2895 The execution of these rules will proceed as follows:  both patterns
2896 match the redex, but the first rule will be tried first.  Since
2897 \verb|Y| is zero, the \T{failrewrite} statement is executed.  The rewriter
2898 will proceed to try the
2899 second rule and the redex will be replaced with \verb|NUM(0)|.
2900 Note that if we omit the \T{failrewrite} statement
2901 in the first rule, as in:
2902 \begin{verbatim}
2903    DIV(INT X,INT Y): { if (Y != 0) rewrite(NUM(X/Y); }
2904 |  DIV(INT 0,Y):     NUM(0)
2905 \end{verbatim}
2906 the rewriter will commit on the first rule
2907 and no action will be performed on the redex.
2908
2909 Like \T{rewrite} and \T{cutrwerite}, \T{failrewrite}
2910 acts as a branching statement and all statements
2911 immediatedly following \T{failrewrite} are dead code.
2912
2913 \Subsection{Confluence and termination}
2914
2915    Currently no automatic checking is available to make sure that
2916 a set of rewriting rules is confluent and will terminate.  This is currently
2917 the user's responsibility.  Some verification features will be developed
2918 in the future.
2919
2920 \Subsection{Debugging Tree Rewriting}
2921
2922    By default, \Prop{} generates a macro \verb|DEBUG_|$C$ for each
2923 rewrite class named $C$.  The user can redefine this macro in order to
2924 trace the replacements that are performed during the execution of a
2925 set of rewriting rules.  The default definition of \verb|DEBUG_|$C$
2926 is as follows:
2927 \[
2928 \verb|#define DEBUG_|C({\it R,redex,filename,linenumber,ruletext})\ {\it R}
2929 \]
2930 i.e. it simply returns the value of {\it R}.
2931
2932 The arguments to the macro are as follows:
2933 \begin{itemize}
2934 \item {\it R} is the new replacement term.
2935 \item {\it redex} is the current redex, before the replacement has been
2936 performed.
2937 \item {\it filename} is the file name of the current rewriting rule.
2938 \item {\it linenumber}  is the location of the current rewriting rule.
2939 \item {\it ruletext} is a string that describes the current rule.
2940 \end{itemize}
2941
2942 In order to activate tracing, the user can redefine the macro
2943 \verb|DEBUG_FOO| (for rewrite class \verb|FOO|) as follows:
2944 \begin{verbatim}
2945 #define DEBUG_FOO(a,b,c,d,e) print_rule(a,b,c,d,e)
2946
2947 template <class T>
2948   T print_rule(T replacement, T redex,
2949                const char * file_name,
2950                int line, const char * rule_text)
2951   { cerr << file_name << ':' << line << ": " << rule_text
2952          << " redex = " << redex << " replacement = " << replacement
2953          << endl;
2954     return replacement;
2955   }
2956 \end{verbatim}
2957 Note that these definitions must appear
2958 before the \verb|rewrite FOO| declaration.
2959
2960 \Subsection{Optimizing Tree Rewriting} \label{sec:optimize-rewriting}
2961
2962    An experimental rewriting optimizer based on partial evaluation
2963 techniques has been implemented in release 2.3.4 of \Prop.  The optimizer
2964 can be enabled with the option \OPTIONDEF{-Orewriting}.
2965
2966 Informally speaking, the rewriting optimizer will try to locate
2967 ``longer'' reduction sequence and add these to your rewriting rules.
2968 The result is that during execution these bigger reduction steps are
2969 taken whenever possible.
2970
2971    For example, consider the \verb|Wff| datatype defined
2972 in section~\ref{sec:rewriting-modifier}.  Suppose we have the following
2973 two rewriting rules in our rule set:
2974 \begin{verbatim}
2975    Not(Not X): X
2976 |  FORALL(X,A,P): Not(EXISTS(X,A,Not(P)))
2977 \end{verbatim}
2978
2979 Given an input term of the form \verb|FORALL(X,A,Not(P))|, the term will
2980 first be reduced to \verb|Not(EXISTS(X,A,Not(Not(P))))| via the second rule,
2981 then to \verb|Not(EXISTS(X,A,P))| via the first rule.  The optimizer
2982 can discover that this two step reduction can be compressed into one
2983 step if the rewriting system is augmented with the following rule:
2984
2985 \begin{verbatim}
2986    FORALL(X,A,Not P): Not(EXISTS(X,A,P))
2987 \end{verbatim}
2988
2989 Note that this augmented rule is a {\em specialization} of the two rules
2990 given by the user.  In general, the rewriting optimizer will repeatedly
2991 look for useful specializations and add them to the user rule set\footnote{
2992 To ensure termination of the optimization process,
2993 augmented rules will not be further specialized.}.
2994 These specialized rules will perform multiple reductions in one single step.
2995
2996 To make use of the optimizer, the rewriting system must have the following
2997 properties:
2998 \begin{enumerate}
2999    \item  It must be strongly normalizing, i.e. it will terminate under all
3000    inputs.
3001    \item  It must be confluent, or else confluence is not necessary.
3002    Notice that the optimizer can and will alternate the reduction order
3003    of a rule set.  Thus if a rewriting system is non-confluent, an
3004    optimized rewriting system may give a different result.
3005 \end{enumerate}
3006
3007 In addition, the following things should hold to maximize the effectiveness
3008 of the optimizer.
3009 \begin{enumerate}
3010    \item  Indices (i.e. state caching) should be enabled.   Otherwise
3011    the optimizer will not be able to optimize a rule.
3012    \item  Specializable rules must be simple, i.e. the right hand side
3013    must be written as an expression consist of only \Prop{} datatypes.
3014    If the right hand side is a complex statement or if it involves
3015    external function calls, the optimizer will fail to analyze it properly.
3016    \item  Only bottom-up rules may be present.
3017    \item  Avoid conditional rules whenever possible.
3018 \end{enumerate}
3019
3020 Finally, it should be warned that an optimized rewriting system
3021 may generate a lot more code than the un-optimized one.  The user should
3022 view the generated report and check that excessive specializations have not
3023 been performed.
3024
3025 \Section{User defined datatypes: Views} \label{sec:views}
3026
3027    For many applications it is necessary to interface \Prop{} to
3028 data structures predefined in libraries, or generated by other tools.
3029 In order to make it possible to work with these external data structures,
3030 \Prop{}, from version 2.3.3. onward, provides a {\em view} mechanism to
3031 interface with these data structures.  Using views, an externally defined
3032 type can be used within \Prop's pattern matching and rewriting
3033 mechanisms transparently, just as if it were defined by \Prop.
3034 The \Prop{} translator will call the
3035 appropriate interface functions specified by the user to manipulate
3036 these external datatypes.
3037
3038 \Subsection{A first example} \label{sec:tagged-union}.
3039 To illustrate the use of views, we'll begin with a simple example.
3040
3041 Suppose in our application we use the following tagged C union to
3042 represent our expression datatype.  An expression is represented
3043 simply as a pointer to this structure.  An null expression is
3044 represented simply as the NULL pointer.
3045
3046 \begin{verbatim}
3047 enum exp_tag { Int = 1, Ident, Add, Sub, Mul, Div };
3048 struct exp {
3049    enum exp_tag tag;
3050    union {
3051       int          number;
3052       const char * ident;
3053       struct { struct exp * l, * r } children;
3054    } u;
3055 };
3056 typedef struct exp * Exp;
3057 /* User defined constructor functions */
3058 Exp NONE = NULL;
3059 extern Exp INT(int);
3060 extern Exp IDENT(const char *);
3061 extern Exp ADD(Exp, Exp);
3062 extern Exp SUB(Exp, Exp);
3063 extern Exp MUL(Exp, Exp);
3064 extern Exp DIV(Exp, Exp);
3065 \end{verbatim}
3066
3067 Suppose this data structure is already used extensively in other
3068 modules, and we'd like to \Prop{} pattern matching mechanisms while keeping
3069 the old code intact.  We can accomplish this by declaring a
3070 view to this data structure, using a \TDEF{datatype view}
3071 definition:
3072 \begin{verbatim}
3073 dataype Exp :: view =
3074    match (this ? this->tag : 0)
3075      view 0     => NONE
3076    | view Int   => INT(int = this->u.number)
3077    | view Ident => ID (const char * = this->u.ident)
3078    | view Add   => ADD(Exp = this->u.children.l, Exp = this->u.children.r)
3079    | view Sub   => SUB(Exp = this->u.children.l, Exp = this->u.children.r)
3080    | view Mul   => MUL(Exp = this->u.children.l, Exp = this->u.children.r)
3081    | view Div   => DIV(Exp = this->u.children.l, Exp = this->u.children.r)
3082    ;
3083 \end{verbatim}
3084
3085 In the above, a view named \verb|Exp| is defined.   Let's describe
3086 what the definition means: in general,
3087 a view definition has three main parts, the {\em view selector},
3088 a set of {\em view transformation rules}, and a set {\em of view accessor
3089 expressions}.  These parts perform the following tasks:
3090
3091 \begin{description}
3092 \item[View selector]  The {\em view selector} is specified in the
3093 the \TDEF{match} part of the definition.   In this example,
3094 the expression associated with \T{match} is
3095 \verb|this ? 0 : this->tag|.  This returns a variant discrimination integer
3096 from \verb|0| to \verb|Div|.
3097 Note that the pseudo variable \TDEF{this}
3098 refers to an object of type \verb|Exp|.
3099
3100 \item[View transformation rules]
3101 Each of the clauses
3102 \[ \TDEF{view} {\it exp} \verb|=>| {\it term} \]
3103 specifies
3104 a {\em view transformation rule}.   These rules are used to specify
3105 how to transform a computed variant tag into a pattern.    In general,
3106 expression {\it exp} must be a constant expression usable in a
3107 \verb|case| statement.
3108
3109 \item[View accessors]
3110 In each of the constructor arguments, an {\em view accessor} expression
3111 can be defined to access the logical components of the constructor.
3112 In the rule:
3113 \begin{verbatim}
3114    | view Add   => ADD(Exp = this->u.children.l, Exp = this->u.children.r)
3115 \end{verbatim}
3116
3117 The expressions \verb|this->u.children.l| and \verb|this->u.children.r|
3118 tell \Prop{} how to access the left and right child of the an addition
3119 expression node.
3120
3121 Note that since the variant \verb|NONE| has no arguments, it is represented
3122 as a nullary constructor, and no accessors expressions are defined.
3123 \end{description}
3124
3125 A pretty printing function for \verb|struct exp| can be implemented
3126 in \Prop{} using pattern matching as follows:
3127 \begin{verbatim}
3128 ostream& operator << (ostream& s, Exp e)
3129 {  match (e)
3130    {  NONE:     { s << "none"; }
3131    |  INT i:    { s << i; }
3132    |  ID id:    { s << id; }
3133    |  ADD(a,b): { s << '(' << a << " + " << b << '); }
3134    |  SUB(a,b): { s << '(' << a << " - " << b << '); }
3135    |  MUL(a,b): { s << '(' << a << " * " << b << '); }
3136    |  DIV(a,b): { s << '(' << a << " / " << b << '); }
3137    }
3138    return s;
3139 }
3140 \end{verbatim}
3141
3142 Note that this is exactly the same code we would've written if
3143 \verb|Exp| is defined as a datatype rather than a view.  This means
3144 that the user can reimplement the expression data structure
3145 as a \Prop's datatype at a latter time, without altering the \Prop{}
3146 source code.  In effect, the {\em logical} view of a datatype, and its
3147 {\em physical} implementation can separately.
3148 With this {\em representation transparency} capability,
3149 programs written in \Prop's pattern matching formalisms can evolve
3150 in an incrementally and pain-free manner.
3151
3152 The generated code for the above function will resemble the following
3153 output.  Note that all the view selectors and view accessors are
3154 inlined into the code, and in general, views incur no overhead to
3155 their use.
3156 \begin{verbatim}
3157 ostream& operator << (ostream& s, Exp e)
3158 {  switch (e ? e->tag : 0)
3159    {  case 0:      { s << "none"; }
3160       case Int:    { s << e->u.number; }
3161       case Ident:  { s << e->u.ident; }
3162       case Add:    { s << '(' << e->u.children.l
3163                        << " + " << e->children.r << '); }
3164       case Sub:    { s << '(' << e->u.children.l
3165                        << " - " << e->children.r << '); }
3166       case Mul:    { s << '(' << e->u.children.l
3167                        << " * " << e->children.r << '); }
3168       case Div:    { s << '(' << e->u.children.l
3169                        << " / " << e->children.r << '); }
3170    }
3171    return s;
3172 }
3173 \end{verbatim}
3174
3175 \Subsection{Another view example}
3176
3177 Suppose we use the following alternative implementation, a
3178 \verb|Expression| class hierarchy to represent
3179 an expression in our AST.
3180 Note that \verb|Exp| is an abstract base class inherited
3181 by the classes \verb|Number|, \verb|Operator|, \verb|Identifer|, etc.
3182
3183 \begin{verbatim}
3184 class Expression
3185 {
3186 public:
3187
3188    // This class uses a ``wide'' interface
3189    virtual int  number() const         = 0;  // returns a number
3190    virtual const char * ident() const  = 0;  // returns an identifier
3191
3192    // returns the ith subchild
3193    virtual const Expression * child(int) const = 0;
3194    // mutable version of above
3195    virtual Expression *& child(int) = 0;
3196 };
3197
3198 typedef Expression * Exp;
3199
3200 class NullExp  : public Expression { ... };
3201 class Number   : public Expression { ... };
3202 class Operator : public Expression { ... };
3203 class Identifier : public Expression { ... };
3204 \end{verbatim}
3205
3206 In order to provide a suitable interface to \Prop's view mechanism,
3207 we introduce an additional member function to return a variant tag:
3208
3209 \begin{verbatim}
3210 class Expression
3211 {
3212 public:
3213    enum Variant { None, Int, Ident, Add, Sub, Mul, Div };
3214    virtual Variant get_variant() const = 0;
3215 }
3216 \end{verbatim}
3217
3218 In addition, we assume the following constructor functions have been
3219 implemented:
3220
3221 \begin{verbatim}
3222 extern Exp INT(int);
3223 extern Exp IDENT(const char *);
3224 extern Exp ADD(Exp, Exp);
3225 extern Exp SUB(Exp, Exp);
3226 extern Exp MUL(Exp, Exp);
3227 extern Exp DIV(Exp, Exp);
3228 \end{verbatim}
3229
3230 Now, with these definitions in place, we can define a datatype view of the
3231 above \verb|Expression| hierarchy using the following definition:
3232
3233 \begin{verbatim}
3234 datatype Exp :: view
3235    = match (this->get_variant())
3236      view Expression::None  => NONE
3237    | view Expression::Int   => INT (int = this->number())
3238    | view Expression::Ident => ID  (const char * = this->ident())
3239    | view Expression::Add   => ADD (Exp = this->child(0),
3240                                     Exp = this->child(1))
3241    | view Expression::Sub   => SUB (Exp = this->child(0),
3242                                     Exp = this->child(1))
3243    | view Expression::Mul   => MUL (Exp = this->child(0),
3244                                     Exp = this->child(1))
3245    | view Expression::Div   => DIV (Exp = this->child(0),
3246                                     Exp = this->child(1))
3247    ;
3248 \end{verbatim}
3249
3250 An evaluation function can be written
3251 using \Prop's pattern matching construct:
3252
3253 \begin{verbatim}
3254 int eval(Exp e, const Env& env)
3255 {  match (e)
3256    {  NONE:          { return 0; }
3257    |  INT i:         { return i; }
3258    |  ID id:         { return env(id); }
3259    |  ADD (x,y):     { return eval(x,env) + eval(y,env); }
3260    |  SUB (x,y):     { return eval(x,env) - eval(y,env); }
3261    |  MUL (x,y):     { return eval(x,env) * eval(y,env); }
3262    |  DIV (x,INT 0): { cerr << "Division by zero"; return 0; }
3263    |  DIV (x,y):     { return eval(x,env) / eval(y,env); }
3264    }
3265 }
3266 \end{verbatim}
3267 Note that this code works equally well with the tagged union representation
3268 defined in section~\ref{sec:tagged-union}.
3269
3270 We can also use rewriting to transform an expression tree as follows
3271
3272 \begin{verbatim}
3273 void simplify(Exp& e)
3274 {
3275    rewrite (e) type (Exp)
3276    {  ADD(INT i, INT j): INT(i+j)
3277    |  SUB(INT i, INT j): INT(i-j)
3278    |  MUL(INT i, INT j): INT(i*j)
3279    |  DIV(INT i, INT j): INT(i/j)
3280    }
3281 }
3282 \end{verbatim}
3283
3284 \Subsection{Syntax of view definitions}
3285
3286 The general syntax of a view definition resembles that of a
3287 \T{datatype} definition:
3288
3289 \begin{syntax}
3290 \NDEF{Datatype\_Spec}
3291    & \IS & \N{Datatype\_View\_Spec} \\
3292 \NDEF{Datatype\_View\_Spec}
3293    & \IS & \Id \T{::} \T{view} \T{=} \\
3294    &     & \quad \T{match} \T{(} \N{Exp} \T{)} \\
3295    &     & \quad \N{View\_Cons\_Specs} \\
3296 \NDEF{View\_Cons\_Specs}
3297    & \IS & \ALT{\TDEF{view} \N{Exp} \T{=>} \N{Cons\_Spec}}{|} \\
3298 \end{syntax}
3299
3300 Default arguments in a \N{Cons\_Spec} are interpreted as view accessors.
3301
3302 There are a few general rules to observe when defining a view:
3303 \begin{enumerate}
3304  \item A \T{datatype view} definition only defines the interface
3305  to an external data structure and no code is generated.  In particular,
3306  the defined view name is not an actual \Cpp{} type.
3307  Thus the user is responsible for defining all appropriate type definitions.
3308  \item All other qualifiers other than \T{view} are ignored.   Thus
3309  automatic generation of pretty printers, garbage collection interface
3310  routines, etc. are disabled for views.
3311  \item In general, the view accessors must be usable as lvalues
3312  if the user wants to modify a view datatype.  This includes the
3313  situation when in-place rewriting is used.
3314  \item Finally, the user are responsible for generating the
3315  constructor functions.  In the two examples given above, the user
3316  should implement the functions \T{INT}, \T{ID}, etc.
3317 \end{enumerate}
3318
3319 \Section{Graph Types and Graph Rewriting} \label{sec:graph}
3320
3321 A typical compiler uses many types of graphs to represent internal program
3322 information.  For instance, control flow graphs,
3323 data dependence graphs, call graphs, def/use chains are
3324 some of the most common examples.  In order to automate the process
3325 of implementing and manipulating these graph structures, \Prop{} provides
3326 a generic formalism-- {\em graph types}--
3327 for specifying multisorted, attributed, hierarchical directed graphs in
3328 very high level manner.  The data structure mapping process of translating
3329 high level graph specifications into concrete \Cpp{} classes is completely
3330 automated.  Thus the user can concentrate on specifying the semantic
3331 relationships between the elements of the domain, instead of the
3332 implementation details.
3333
3334 The graph structure mapping process uses real-time set machine
3335 simulation\cite{real-time-SETL} and subtype analysis\cite{subtype-SETL}
3336 to convert associative accesses involving sets, maps, and multimaps
3337 into hash free, worst-case constant time pointer manipulations.
3338 This optimization is performed transparently by the translator.
3339
3340 Manipulation of graphs structures are done in three ways:
3341 \begin{description}
3342 \item[object-oriented]
3343  Using a standard interface generated from \Prop{} the user can
3344 manipulate a graph in the usual procedural/object-oriented manner.
3345 \item[set formalism]  Using an embedded SETL\cite{SETL}-like sublanguage,
3346 the user can manipulate the graphs using high level set operations
3347 such as set comprehension.
3348 \item[graph rewriting]  At the highest level, analysis and transformation
3349 can be carried out using the graph rewriting formalism.
3350 \end{description}
3351
3352 In this section we'll describe each of these topics in detail.
3353
3354 \Subsection{Graph Types} \label{sec:graph-types}
3355
3356 A \INDEX{graph type} declaration is used to specify a
3357 graph structure
3358 with multiple sorts of labeled nodes and labeled directed edges.
3359 Attributes can be attached to both nodes and edges.
3360 The syntax of \T{graphtype} declarations is as follows:
3361 \begin{syntax}
3362 \NDEF{Graph\_Type}
3363       & \IS & \TDEF{graphtype} \Id \\
3364       & & \quad \OPT{\T{:} \N{Inherit\_List}} \\
3365       & & \quad \OPT{\T{::} \MORE{\N{Graph\_Mode}}{}} \\
3366       & & \T{declare} \\
3367       & & \quad \TDEF{node:} \ALT{\N{Node\_Def}}{|} \\
3368       & & \quad \TDEF{edge:} \ALT{\N{Edge\_Def}}{|} \\
3369       & & \T{begin} \\
3370       & & \quad \N{Code} \\
3371       & & \T{end} \T{graphtype} \T{;} \\
3372 \NDEF{Graph\_Mode} & \IS & \\
3373 \NDEF{Node\_Def} & \IS & \Id \OPT{\OPT{\T{of}} \N{TypeExp}} \\
3374 \NDEF{Edge\_Def}
3375     & \IS & \Id \OPT{\T{of}} \TypeExp \T{->} \TypeExp \\
3376     & \OR & \Id \OPT{\T{of}} \TypeExp \T{<->} \TypeExp \\
3377     & \OR & \Id \OPT{\T{of}} \TypeExp \T{<=>} \TypeExp \\
3378     & \OR & \Id \OPT{\T{of}} \TypeExp \T{<=>*} \TypeExp \\
3379 \end{syntax}
3380
3381 \Subsection{The Graph Interface} \label{sec:graph-interface}
3382 \INCOMPLETE
3383
3384 \Subsection{Set Formalisms} \label{sec:set-formalisms}
3385 \INCOMPLETE
3386
3387 \Subsection{Graph Rewriting} \label{sec:graph-rewriting}
3388 \INCOMPLETE
3389
3390 \Section{Running the Translator} \label{sec:usage}
3391
3392 The \Prop{} translator is a program called \verb|prop|.  The translator
3393 uses the following command line syntax:
3394 \begin{syntax}
3395   \NDEF{Running\_Prop} & ::= &
3396    \T{prop} \OPT{{\em prop\_options}} \ALT{{\em file}}{} \\
3397 \end{syntax}
3398
3399 Here, \N{File} is a file with suffix \verb|.p|$*$.  By default, the output
3400 file will have the same name with the \verb|p| extension removed.
3401 For example, a file named ``foo.pC'' will be translated into
3402 the file ``foo.C''  Similarly, a file named ``bar.ph'' will be translated
3403 into ``bar.h''
3404
3405 \Subsection{Options}
3406 The available options to the translator are as follows:
3407 \begin{description}
3408 \item[\OPTIONDEF{-G -GNU}]      Use GNU style error messages.
3409 \item[\OPTIONDEF{-I{\em path}}] Use the search path {\em path}.
3410 \Prop{} will search {\em path} to look for \Prop{} files with
3411 the suffix \verb|.ph| inside a \verb|#include| directive.  Multiple
3412 paths can be specified by using multiple \verb|-I| options.
3413 \item[\OPTIONDEF{-l -no\_line\_directives}]   By default, \Prop{} will try
3414 to generate \verb|#line| directives to correlate the source and
3415 the translated output.  This option suppresses this feature.
3416 \item[\OPTIONDEF{-M -make\_depends}]  Similarly to the \verb|-M| option
3417 in \verb|cc| and \verb|cpp|.  This option generates a suitable dependency
3418 list for Makefiles.
3419 \item[\OPTIONDEF{-memory\_usage}]  Print memory use during translation.
3420 \item[\OPTIONDEF{-n -no\_codegen}]
3421 Don't output any code; perform semantic checking
3422 only.
3423 \item[\OPTIONDEF{-N -non\_linear}]
3424 Use non-linear patterns during pattern matching.
3425 A non-linear pattern is one in which a variable occurs more than
3426 once.   By default, \Prop{} consider this an error.
3427 \item[\OPTIONDEF{-o{\em outfile}}]  Send output to {\em outfile} instead of
3428 the default.
3429 \item[\OPTIONDEF{-Ofast\_string\_match}]
3430 Generate (potentially) faster string matching
3431 code by expanding string literal tests into character tests.  By default,
3432 \Prop{} generates string tests using \verb|strcmp| and binary search.
3433 \item[\OPTIONDEF{-Oadaptive\_matching}]  Use the adaptive pattern matching
3434 algorithm\footnote{The implementation may not be effective.}.
3435 \item[\OPTIONDEF{-Oinline\_casts}]
3436 Generate inline type casts instead of function
3437 calls that do the same.  This may be faster for compilation.
3438 \item[\OPTIONDEF{-Orewriting}]  Enable the rewriting optimizer.
3439 See also section~\ref{sec:optimize-rewriting} for details.
3440 \item[\OPTIONDEF{-Otagged\_pointer}]
3441 Use a tagged pointer representation instead
3442 of representing variant tags explicitly in a structure field.  For
3443 example, if there are two variants called \verb|Foo| and
3444 \verb|Bar|  in a datatype.  Then a pointer to \verb|Foo| can be tagged
3445 with a low order bit 0 while a pointer to \verb|Bar| can be tagged with
3446 a low order bit 1.  Accessing the tag thus becomes just a simple bit
3447 test.  This should save space and may be faster\footnote{However, this feature
3448 has not been fully tested.}
3449 \item[\OPTIONDEF{-r -report}]
3450 Generate a report whenever necessary.  Parser, string
3451 matching, and rewriting constructs produce reports.
3452 \item[\OPTIONDEF{-s -strict}]  Use strict semantic checking mode.
3453 All warnings are considered to be errors.
3454 \item[\OPTIONDEF{-S -save\_space}]\label{option:save-space}
3455 Use space saving mode.
3456 Try not to use inline functions if code space can be saved.
3457 Datatype constructors will not be inlined
3458 in this mode.  Instead, datatype constructors will be generated at
3459 where the corresponding \TDEF{instantiate datatype} declaration occurs.
3460 \item[\OPTIONDEF{-t -stdout}]   Send translated program to the standard output.
3461 \item[\OPTIONDEF{-use\_global\_pool}]  Use global memory pool for allocation.
3462 \item[\OPTIONDEF{-v{\em num}}]  Use verbose mode in report generation.
3463 This will provide more detailed information.
3464 \end{description}
3465
3466 \Subsection{Error Messages}
3467
3468 The following is a canonical list of all error messages generated by
3469 the \Prop{} translator.  Each message is prefixed by the
3470 file name and line number in which the error occurs.    Most of these
3471 errors messages are self explanatory; more detailed explanations are provided
3472 below.
3473
3474 \begin{Error}
3475 \errormsg{unknown option {\em option}}  The translator does not recognize
3476 the command line {\em option}.   Type {\em prop} to see a list of options.
3477 \errormsg{Error in command: {\em command}}  The translator fails when trying
3478 to execution {\em command}.  When multiple files are specified in a command
3479 line, {\em prop} invokes itself on each of the file.
3480 \errormsg{{\em pat} with type {\em type} is not unifiable}  Pattern has not
3481 been declared to be a unifiable type.
3482 \errormsg{Sorry: pattern not supported in rewriting: {\em pat}}  Pattern
3483 {\em pat} is not supported.  Currently, logical pattern connectives
3484 are not supported in rewriting.  You'll have to rewrite them into
3485 non-logical form.
3486 \errormsg{Unknown complexity operator: {\em op}}
3487 \errormsg{accessor is undefined for view pattern: {\em pat}} An accessor
3488 function has not been declared for a constructor.  When using datatype views
3489 \errormsg{arity mismatch (expecting $n$) in pattern: {\em pat}}  Pattern
3490 {\em pat} is expected to have arity $n$.   The arity is the number of
3491 expressions that you're matching concurrently.  If you are matching $n$
3492 objects then there must be $n$ patterns for each rule.
3493 \errormsg{arity mismatch in logical pattern: {\em pat}}  Logical patterns
3494 do not match the arity of a pattern.
3495 \errormsg{bad constructor type {\em type}}
3496 \errormsg{bad view constructor pattern: {\em pat}}
3497 \errormsg{can't find include file {\em file}}  Additional search paths
3498 can be specified with the \verb|-I| option
3499 \errormsg{component \#$i$ not found in constructor {\em con}}
3500 Datatype constructor {\em con} does not have a component named \#$i$.
3501 This can happen when you write a record constructor expression and misspelt
3502 one of the labels.
3503 \errormsg{component $l$ not found in constructor {\em con}}  Datatype
3504 constructor {\em con} does not have a labeled component $l$.
3505 \errormsg{constructor {\em con} already has print formats}
3506 \errormsg{constructor {\em con} is not a class}
3507 \errormsg{constructor {\em con} is not a lexeme}  A constructor is used
3508 as a terminal in the parser but it has not been declared to be a lexeme.
3509 \errormsg{constructor {\em con} doesn't take labeled arguments}  You're trying
3510 to use record constructor syntax on a constructor take does not take
3511 record arguments.
3512 \errormsg{constructor {\em con} is undefined}
3513 \errormsg{cyclic type definition in type {\em id} = {\em type}}
3514 Type abbreviations cannot be cyclic.  For recursive types, use \verb|datatype|
3515 definitions.
3516 \errormsg{datatype {\em T} is not a lexeme type}  {\em T} is used as a
3517 terminal within syntax class when it has not been declared as a lexeme type.
3518 All non-terminals must be a defined using the \verb|lexeme| qualifier.
3519 \errormsg{determinism {\em det} not recognized}
3520 \end{Error}
3521
3522 \begin{Error}
3523 \errormsg{duplicated definition of pattern constructor '{\em con}'}
3524   The constructor {\em con} has already been in another datatype.  {\em Prop}
3525   does not allow overloading of constructor names.
3526 \errormsg{duplicated label '$l$' in expression: {\em exp}}
3527   A record type has one of its labels duplicated.
3528 \errormsg{duplicated label '$l$' in type {\em type}}
3529 \errormsg{duplicated pattern variable '{\em id}'.  Use option -N}
3530    By default,
3531    pattern variables may not be duplicated within a pattern.  Non-linear
3532    patterns are allowed only when the option \verb|-N| is invoked.
3533 \errormsg{edge $e$ is not defined in graphtype {\em id} }
3534 \errormsg{empty type list in rewrite (...) ... }
3535 \errormsg{expecting $c_1$ ... $c_2$ (from line $l$) but found $c_3$ ... $c_4$
3536    instead}  Quotation symbols are not properly balanced.
3537 \errormsg{expecting non-terminal {\em nt} to have type $t_1$ but found $t_2$}
3538    Non-terminal {\em nt} has already been defined to have a
3539    synthesized attribute type of $t_1$ but $t_2$ is found.   This can happen
3540    if you have rules with the same lhs non-terminal previously defined.
3541 \errormsg{expecting type $t_1$ but found $t_2$ in pattern variable '{\em v}'}
3542 {\em Prop} performs type inference on all the lhs patterns to determine
3543 the types of all variables.  Type errors can occur if the patterns are
3544 miswritten.
3545 \errormsg{expecting type $t_1$ but found $t_2$}
3546 \errormsg{flexible vector pattern currently not supported in
3547 rewriting: {\em pat}}
3548 \errormsg{format \#$i$ used on constructor {\em con}}
3549 \errormsg{format \#$l$ used on non-record constructor {\em con}}
3550 \errormsg{function name mismatch: expecting $f$ ...}
3551 \errormsg{illegal character $c$}
3552 \errormsg{illegal context type: {\em type}}  A context type in
3553 a \verb|matchscan| statement must be previously defined to be datatype.
3554 The unit constructors of the datatype can be used as context values.
3555 \errormsg{illegal context(s) in pattern {\em pat}}
3556 \errormsg{illegal format '\_' on constructor {\em con}}
3557 \errormsg{illegal incomplete record pattern: {\em pat}}
3558 \errormsg{illegal print format '$c$' in constructor {\em con}}
3559 \errormsg{illegal record label '$l$' in expression: {\em exp}}
3560 \errormsg{illegal width in bitfield '{\em id} ($n$)'}
3561 \errormsg{inherited attribute '{\em type}' can only be used in treeparser
3562 mode in rewrite class {\em id}}
3563 \end{Error}
3564
3565 \begin{Error}
3566 \errormsg{law {\em id}(...) = {\em pat} is not invertible}   Pattern {\em pat}
3567 cannot be treated as an expression.  It may involve logical patterns and
3568 wild cards.
3569 \errormsg{law '{\em id}': bound variable '$v$' is absent in body {\em exp}}
3570 A parameter variable $v$ must be present within the body of a law.
3571
3572 \errormsg{law '{\em id}': type {\em type} cannot be used in parameter {\em id}}
3573
3574 \errormsg{lexeme {\em id} is undefined}
3575
3576 \errormsg{lexeme class {\em id} is undefined}
3577
3578 \errormsg{lexeme pattern is undefined for constructor {\em con}}
3579
3580 \errormsg{lexeme \{{\em id}\} already defined as {\em regexp}}
3581
3582 \errormsg{lexeme \{{\em id}\} is undefined in {\em regexp}}
3583
3584 \errormsg{missing '\}' in regular expression {\em regexp}}
3585
3586 \errormsg{missing label '$l$' in expression: {\em con} {\em exp}}
3587
3588 \errormsg{missing non-terminal in tree grammar rule: {\em nt}}
3589 \errormsg{missing type {\em type} in the traversal list of rewrite class {\em id}}
3590 Within the rewriting rules, you have used a pattern that involve a constructor
3591 of type {\em  type} directly but no such types are defined in the
3592 rewrite class definition.  You should add the type to the traversal list.
3593
3594 \errormsg{missing type info in expression {\em exp} : {\em type}}
3595    Sometimes {\em prop} fails to infer the type of an expression within
3596 a {\em match} statement.  It such
3597 cases it is necessary to help out the translator by adding additional
3598 type information in the patterns.  For example, rewrite some pattern $p$
3599 as $(p : \tau)$ where $\tau$ is the type of $p$.
3600
3601 \errormsg{missing type info in function {\em f} {\em type}}  Similar to above
3602 \errormsg{missing type info in rule: {\em f} {\em pat} : {\em type}}
3603    Similar to above.
3604 \errormsg{missing view selector for pattern: {\em pat}}
3605 \errormsg{multiple mixed polarity pattern variable '{\em v}'}
3606 \errormsg{multiple unit constructors in polymorphic type {\em id} {\em arg}
3607 is not supported}
3608 \errormsg{negative cost {\em cost} is illegal}
3609 \errormsg{node {\em id} is not defined in graphtype {\em id}}
3610 \errormsg{non lexeme type {\em type} in pattern {\em pat}}
3611 \errormsg{non-relation type {\em type} in pattern: {\em pat}}
3612 \errormsg{pattern is undefined for lexeme {\em l}}
3613 \errormsg{pattern scope stack overflows}
3614 \errormsg{pattern scope stack underflows}
3615 \end{Error}
3616
3617 \begin{Error}
3618 \errormsg{pattern variable '$v$' has no binding at this point}
3619 \errormsg{persist object id is undefined for {\em con}}
3620 \errormsg{persistence pid {\em name} already allocated for type {\em type}}
3621 \errormsg{persistence redefined for type {\em type}}
3622 \errormsg{precedence symbol must be terminal: {\em term}}
3623 \errormsg{predicate {\em p}: expecting type $t_1$ but found $t_2$}
3624 \errormsg{redefinition of bitfield '{\em field}'.}
3625 \errormsg{redefinition of constructor '{\em con}'}
3626 \errormsg{redefinition of datatype {\em id}}
3627 \errormsg{redefinition of lexeme class {\em id}}
3628 \errormsg{replacement not in rewrite class: rewrite {\em exp}}
3629 \errormsg{rewrite class {\em id} has already been defined}
3630 \errormsg{rewrite class {\em id} is undefined}
3631 \errormsg{rule $r$ has incomplete type {\em type}}
3632 \errormsg{rule $r$ is of a non datatype: {\em type}}
3633 \errormsg{syntax class {\em id} has already been defined}
3634 \errormsg{syntax class {\em id} is undefined}
3635 \errormsg{synthesized attribute '{\em type}'
3636 can only be used in treeparser mode in rewrite class {\em id}}
3637 \errormsg{the class representation of constructor {\em id} has been elided}
3638 \errormsg{this rule is never selected: $r$}
3639 \errormsg{this rule is never used: $r$}
3640 \errormsg{too few arguments {\em arg} in instantiation of type scheme {\em type}}
3641 \errormsg{too many arguments {\em arg} in instantiation of type scheme {\em type}}
3642 \errormsg{type {\em type} is not a datatype}
3643 \errormsg{type {\em type} is undefined}
3644 \end{Error}
3645
3646 \begin{Error}
3647 \errormsg{type {\em id} = {\em type} is not a datatype}
3648 \errormsg{type {\em id} has already been defined as {\em type}}
3649 \errormsg{type {\em id} has unknown class: {\em C}}
3650 \errormsg{type '{\em type}' is not rewritable in treeparser mode rewrite class
3651 {\em id}}
3652     All datatypes used within the treeparser mode of rewriting must be defined
3653     with the \verb|rewrite| qualifier.
3654 \errormsg{type error in pattern {\em pat}: {\em type}}
3655 \errormsg{type mismatch between nonterminal {\em nt}(type $t_1$)
3656 and rule $r$(type $t_2$)}
3657 \errormsg{type mismatch in pattern: {\em pat}}
3658 \errormsg{type mismatch in rule $r$}
3659 \errormsg{type mismatch in rule $r$ {\em pat}}
3660 \errormsg{type or constructor {\em con} is undefined}
3661 \errormsg{unable to apply pattern scheme {\em pat}}
3662 \errormsg{unbalanced $c_1$ ... $c_2$ at end of file}
3663 \errormsg{undefined non-terminal {\em nt}}
3664 \errormsg{unification fails occurs check with $t_1$ and $t_2$}
3665 \errormsg{unmatched ending quote $c$}
3666 \errormsg{unrecognized quoted expression `{\em exp}`}
3667 \errormsg{unrecognized quoted pattern `{\em pat}`}
3668 \end{Error}
3669
3670 \pagebreak
3671 \appendix
3672
3673 \Section{Garbage Collection in the {\sf Prop} Library} \label{appendix:gc}
3674    In this appendix
3675 we describe the design and implementation of a garbage collector
3676 based on the Customisable Memory Management framework(CMM)\cite{CMM} in our
3677 \Prop{} \Cpp{} class library.  Like the previous approach, we have
3678 implemented a mostly copying conservative collector based on the work
3679 of Bartlett\cite{Mostly-copying}.
3680 Similar to CMM, our architecture provides a protocol to allow
3681 multiple garbage collectors using different algorithms to coexist in the
3682 same memory space.  A few improvements are made to
3683 improve the performance, the flexibility and the functionality of
3684 our collector: to reduce retention due to false roots identification,
3685 blacklisting\cite{Boehm} is used to identify troublesome heap
3686 addresses; the architecture of the system has been generalized so that it is
3687 now possible to have multiple instantiations of Bartlett-style heaps;
3688 finally, object finalization and weak pointer support
3689 are added.  Our collector has been tested on gcc 2.5.8 under Linux,
3690 and SunOS 4.1.x\footnote{An alpha version also works under Solaris. Thanks
3691 to Paul Dietz for the patch.}
3692
3693 \Subsection{Introduction}
3694    The \Prop{} project involves the design and implementation of
3695 an environment and an extension language based on \Cpp{}
3696 for compiler construction and program transformation.
3697 An outgrowth of this project is the \Prop{}
3698 \Cpp{} class library, which contains an extensive set of support algorithms
3699 and data structures.  Since a typical compiler manipulates many complex tree,
3700 dag and graph objects with extended lifetime, manual memory management using
3701 \Cpp's constructors and destructors, reference counting smart pointers, or
3702 some other techniques is frequently complex and error prone.
3703 Furthermore, with the advent of new algorithms for garbage collection,
3704 manual memory management techniques do not necessarily provide better
3705 performance or space utilization.  To make it possible to make use of garbage
3706 collection as a viable alternative for memory management in
3707 \Cpp\cite{Safe-C++-GC}, we have implemented a garbage collection class
3708 hierarchy as part of the \Prop{} library.  The class library can be used
3709 directly by the users who'd like to program in \Cpp; it can also be
3710 used as part of the runtime system of a highly level language implemented
3711 in \Cpp, as we have done for \Prop.
3712
3713    We have several good reasons to prefer garbage collection over manual
3714 memory management.  The \Prop{} language contains many declarative constructs
3715 and extensions such as algebraic datatypes, pattern matching, rewriting, and
3716 logical inference.  When a user programs in \Prop{} using these constructs,
3717 an applicative style of programming is the most natural paradigm.
3718
3719 \Subsection{The Framework}
3720
3721    We base our design on the work on the Customisable Memory Management(CMM)
3722 system\cite{CMM}.  In this framework, multiple independent heaps(including
3723 the usually non-collectable heap) can coexist with each other.
3724 Bartlett's mostly copying garbage collector is used as the primary collector.
3725 CMM extends the work of Bartlett\cite{Mostly-copying}
3726 by allowing cross heap pointers and unrestricted interior pointers.
3727
3728   However, all collectable objects are required to derive from a base class
3729 and reimplement a tracing method, which identifies the internal pointers of
3730 an object.  This burden is usually quite small and in fact can be
3731 automated from the type definition\footnote{In \Prop, a special keyword
3732 \T{collectable} is used to identify garbage collected classes and types.
3733 The \Prop{} translator uses this type annotation to derive the appropriate
3734 tracing methods.}
3735
3736   One of the major advantages of the CMM framework is that multiple
3737 coorperating collectors can coexist at the same time, which makes it
3738 possible to customize the behavior of each collector with respect to
3739 the lifetime behavior of the objects.  In \cite{CMM}, examples are
3740 given in which the lifetime of certain class of objects exhibit
3741 first-in/first-out behavior.  In this case a special collector can be
3742 written to take full advantage of this property.
3743
3744 \subsection{Our Framework}
3745    Our framework retains all the benefits and flexibilities of CMM,
3746 while extending it in several minor but useful ways:
3747 \begin{itemize}
3748    \item Like CMM, interior pointers(i.e. pointers to the middle of an object)
3749 and crossheap pointers (i.e. complex data structures linking nodes locating
3750 in multiple logical heaps.) are supported.  Pointers to collectable
3751 objects are non-intrusive: i.e. they are normal \Cpp{} pointers
3752 and not encapsulated in templates.
3753    \item Also like CMM, we allow multiple garbage collectors using different
3754 algorithms to coexist.  Unlike CMM, however, we allow multiple
3755 Bartlett-style collectors to be instantiated.  Most of the services
3756 involving low level page management and object marking have been relegated
3757 to a separate heap manager.
3758    \item We provide support for finalization and weakpointers.
3759    \item We have implemented blacklisting\cite{Boehm} to reduce the chance
3760 of false roots identification.
3761 \end{itemize}
3762
3763 \subsection{The Implementation}
3764
3765    Our implementation has been completely written from scratch since
3766 we do not desire to utilize copyrighted code and, more importantly,
3767 we have to make a few important architectural changes:
3768  All low level memory management services, such as management of
3769 the page table and the object bitmap, is now relegated to the class
3770 \CLASSDEF{GCHeapManager}.  Collectors now act as clients as of the heap manager
3771 and the Bartlett-style collector no longer has any special status.
3772
3773 \subsection{Architecture}
3774
3775    The architecture of the memory management system is partitioned into
3776 a few classes, each responsible for providing distinct services:
3777
3778 \begin{itemize}
3779    \item \CLASSDEF{GCHeapManager} --- The heap manager.  The heap manager
3780     manages the heap table, performs page level
3781     allocation and deallocation, and provides miscellaneous services
3782     like blacklisting.  It also manages the object bitmaps.
3783    \item \CLASSDEF{GC} --- The base class for all garbage collectors.
3784     This base class describes the protocol used by all the collector
3785     classes\footnote{This base class is also inherited from class
3786     \CLASSDEF{Mem}, so that it adheres to the \Prop{} memory management
3787     protocol.}
3788    \item \CLASSDEF{CGC} --- The base class for conservative collectors.
3789      This class is inherited from class \CLASSDEF{GC} and implements some
3790      methods for locating the stack, heap, and static data areas.
3791    \item \CLASSDEF{BGC} --- Bartlett-style mostly copying collector.
3792       This class is inherited from class \CLASSDEF{CGC} and implements
3793       the Bartlett mostly copying algorithm.
3794    \item \CLASSDEF{MarkSweepGC} --- Mark/sweep style conservative collector.
3795       This class is inherited from class \CLASSDEF{CGC} and implements
3796       a mark/sweep collection algorithm.
3797    \item \CLASSDEF{WeakPointerManager} --- The weakpointer manager.
3798       This class manages the weak pointer table and provides a few
3799       weak pointer scavenging services for the collectors.
3800 \end{itemize}
3801
3802 \Subsection{The Programmatic Interface}
3803
3804    The programmatic interface to the garbage collector involves two
3805 base classes, \CLASSDEF{GC} and \CLASSDEF{GCObject}.  The base class
3806 \CLASSDEF{GC} provides an uniform interface to all types of collectors
3807 and memory
3808 managers while class {\sf GCObject} provides the interface to all
3809 collectable classes.  Table~\ref{GC-Classes} contains a listing
3810 of the classes in our hierarchy.
3811
3812 \begin{table}
3813    \begin{center}
3814       \begin{tabular}{|l|l|} \hline
3815         Class           & Purpose \\ \hline \hline
3816         \sf GCObject    & Collectable object base class \\
3817         \sf GC          & Garbage collector base class \\
3818         \sf CGC         & Conservative garbage collector base class \\
3819         \sf BGC         & Bartlett style mostly copying collector \\
3820         \sf MarkSweepGC & A mark-sweep collector \\
3821         \sf UserHeap    & A heap for pointerless object \\
3822         \sf GCVerifier  & A heap walker that verifies the integrity of a
3823                           structure \\
3824       \hline
3825       \end{tabular}
3826    \end{center}
3827    \caption{\label{GC-Classes} Garbage Collection Classes.}
3828 \end{table}
3829
3830 Class \CLASSDEF{GCObject} is extremely simple: it redefines the operators
3831 \verb|new| and \verb|delete| to allocate memory from the default collector,
3832 and declares a virtual method ``\verb|trace|'' to be defined by subclasses
3833 (more on this later.)
3834
3835 Memory for a \CLASSDEF{GCObject} is allocated and freed using the usually
3836 \verb|new| and \verb|delete| operators.  Of course, freeing memory explicitly
3837 using \verb|delete| is optional for many subclasses of \CLASSDEF{GC}.
3838
3839 \begin{verbatim}
3840    class GCObject {
3841    public:
3842       inline void * operator new(size_t n, GC& gc = *GC::default_gc)
3843          { return gc.m_alloc(n); }
3844       inline void * operator new(size_t n, size_t N, GC& gc = *GC::default_gc)
3845          { return gc.m_alloc(n > N ? n : N); }
3846       inline void   operator delete(void *);
3847       virtual void trace(GC *) = 0;
3848    };
3849 \end{verbatim}
3850
3851 \subsection{Memory Allocation}
3852
3853 The base class \CLASSDEF{GC} is slightly more complex, as it has to provide
3854 a few different functionalities.  The first service that class \CLASSDEF{GC}
3855 must
3856 provide is of course memory allocation and deallocation.  As a time saving
3857 device we can specify what the default collector is using the methods
3858 \verb|get_default_gc| and \verb|set_default_gc|.  Method \verb|GCObject::new|
3859 will use this collector by default, unless placement syntax is used.
3860 Method \verb|GCObject::delete|, on the other hand, can correctly infer the
3861 proper collector to use by the address of the object.
3862
3863 The low level methods to allocate and deallocate memory are \verb|m_alloc|
3864 and \verb|free| respectively.  The programmers usually do not have to
3865 use these methods directly.
3866
3867 The method to invoke a garbage collection of a specific level is
3868 \verb|collect(int level)|.  This forces an explicit collection.
3869 Method \verb|grow_heap(size_t)| can also be used to explicitly increase
3870 the heap size of a collector.  Depending of the actual behavior
3871 of the subclasses, these methods may have different effects.
3872
3873 \begin{verbatim}
3874    class GC {
3875    protected:
3876       static GC * default_gc;
3877    public:
3878       static GC&  get_default_gc()       { return *default_gc; }
3879       static void set_default_gc(GC& gc) { default_gc = &gc; }
3880       virtual void * m_alloc   (size_t) = 0;
3881       virtual void   free      (void *);
3882       virtual void   collect   (int level = 0) = 0;
3883       virtual void   grow_heap (size_t) = 0;
3884       static  void garbage_collect() { default_gc->collect(); }
3885       virtual void set_gc_ratio(int);
3886       virtual void set_initial_heap_size (size_t);
3887       virtual void set_min_heap_growth   (size_t);
3888    };
3889 \end{verbatim}
3890
3891 \subsection{The GC Protocol}
3892 The collector and collectable objects must cooperate with
3893 each other by abiding to a simple protocol:
3894 \begin{itemize}
3895    \item All objects that are to be garbage collected must be derived
3896 from \CLASSDEF{GCObject}.  The application programmer must also supply a
3897 ``{\bf tracing}'' method.  The purpose of this method is to identify
3898 all internal pointers to other \CLASSDEF{GCObject}'s.   This method is not
3899 used by the application programmer directly but only used internally
3900 by the garbage collectors.
3901    \item The tracing method of each collectable must in turn call
3902     the tracing method in the class \CLASSDEF{GC} with each pointer to
3903     a collectable object that the object owns:
3904 \begin{verbatim}
3905    class GC {
3906    public:
3907       virtual GCObject * trace (GCObject *) = 0;
3908       inline  void trace (GCObject& obj);
3909    };
3910 \end{verbatim}
3911
3912      Briefly, the rules are as follows:
3913    \begin{enumerate}
3914       \item The tracing method of a collectable \T{Foo} has the following
3915          general form:
3916           \begin{verbatim}
3917              void Foo::trace(GC * gc)
3918              {
3919                 ...
3920              }
3921           \end{verbatim}
3922       \item If class \T{Foo} has a member that is a pointer \verb|p|
3923             to a collectable object of type \T{Bar}, then add:
3924            \begin{verbatim}
3925                p = (Bar)gc->trace(p);
3926            \end{verbatim}
3927             to the body of \verb|Foo::trace|.
3928      \item  If class \T{Foo} has a member object
3929             \verb|x| that is a subclass of \T{GCObject}, also add:
3930            \begin{verbatim}
3931                gc->trace(x);
3932            \end{verbatim}
3933             to the body of \verb|Foo::trace|.
3934      \item  If class \T{Foo} is derived from a class \verb|Bar| that
3935             is a subclass of \T{GCObject}, add:
3936            \begin{verbatim}
3937                Bar::trace(gc);
3938            \end{verbatim}
3939             to the body of \verb|Foo::trace|.
3940    \end{enumerate}
3941     Notice that these methods can be arranged in any order.
3942 \end{itemize}
3943
3944 This protocol can be used by both copying and non-copying collectors.
3945 In addition, the class \CLASSDEF{GCVerifier} also uses this protocol to
3946 walk the heap in order to verify the integrity of a garbage collected
3947 data structure.
3948
3949 \subsection{Messages and Statistics}
3950     All garbage collectors use the following protocols for status
3951 reporting and statistics gathering.
3952
3953 \begin{verbatim}
3954    class GC {
3955    public:
3956       enum GCNotify {
3957          gc_no_notify,
3958          gc_notify_minor_collection,
3959          gc_notify_major_collection,
3960          gc_notify_weak_pointer_collection,
3961          gc_print_collection_time,
3962          gc_print_debugging_info
3963       }
3964       virtual int      verbosity() const;
3965       virtual void     set_verbosity(int);
3966       virtual ostream& get_console() const;
3967       virtual void     set_console(ostream&);
3968    };
3969 \end{verbatim}
3970
3971 \begin{verbatim}
3972    class GC {
3973    public:
3974       struct Statistics {
3975          const char *   algorithm;
3976          const char *   version;
3977          size_t         bytes_used;
3978          size_t         bytes_managed;
3979          size_t         bytes_free;
3980          struct timeval gc_user_time;
3981          struct timeval gc_system_time;
3982          struct timeval total_gc_user_time;
3983          struct timeval total_gc_system_time;
3984       }
3985       virtual Statistics statistics();
3986    };
3987 \end{verbatim}
3988
3989     Each collector has a current verbosity level, which can be set
3990 and reset using the methods \verb|set_verbosity(int)| and
3991 \verb|verbosity() const|.  The verbosity level is actually a bit
3992 set containing flags of the type \verb|GC::GCNotify|.  A combination
3993 of these options can be used.
3994 \begin{itemize}
3995   \item \verb|gc_no_notify| --- no messages at all.
3996   \item \verb|gc_notify_minor_collection| --- all minor collections
3997     will be notified by printing a message on the current console.
3998   \item \verb|gc_notify_major_collection| --- similarly for all major
3999     collections.
4000   \item \verb|gc_notify_weak_pointer_collection| --- notify the user
4001   when weak pointers are being collected.
4002   \item \verb|gc_print_collection_time| --- this option will let the
4003     collector to print the time spent during collection (and finalization).
4004   \item \verb|gc_print_debugging_info| --- this option will print additional
4005     information such as stack and heap addresses during garbage collection.
4006 \end{itemize}
4007
4008    The current console of a collector is defaulted to the \Cpp{} stream
4009 \verb|cerr|.  It can be set and inspected with the methods
4010 \verb|set_console(ostream&)| and \verb|ostream& get_console()| respectively.
4011 For example, the user can redirect all garbage collection messages
4012 to a log file \verb|"gc.log"| by executing the following during initialization:
4013
4014 \begin{verbatim}
4015    ofstream gc_log("gc.log");
4016    GC::get_default_gc().set_console(gc_log);
4017 \end{verbatim}
4018
4019 \subsection{The Bartlett style mostly copying collector}
4020
4021    Currently a Bartlett-style mostly copying collector has been implemented.
4022 The current version is non-generational but we expect that a generational
4023 version will be available in the near future.
4024
4025    The Bartlett-style collector is implemented as the concrete class
4026 \CLASSDEF{BGC}.  Aside from all the services provided by class \CLASSDEF{GC},
4027 \CLASSDEF{BGC} also provides the following method:
4028
4029 \begin{verbatim}
4030    class BGC : public CGC {
4031    public:
4032       virtual void set_gc_ratio(int);
4033       virtual void set_initial_heap_size (size_t);
4034       virtual void set_min_heap_growth   (size_t);
4035    }
4036 \end{verbatim}
4037
4038   The gc ratio refers to the ratio of used heap space versus
4039 total heap space.  Class \CLASSDEF{BGC} will invoke the garbage collection
4040 if this is exceeded.  The default gc ratio for \T{BGC} is 75\%.
4041
4042   The initial size of the heap and the minimal number of bytes to
4043 request from the system during a heap expansion can be adjusted using
4044 the methods \verb|set_initial_heap_size| and \verb|set_min_heap_growth|
4045 respectively.  These methods are declared in the \T{GC}
4046 protocol.   By default, the initial heap size for this collector
4047 is 128K and each heap expansion will increase the heap by 512K.
4048
4049 If an application uses a lot garbage collected storage it is a good
4050 idea to set the heap size to larger capacity during initialization.
4051 Otherwise, the collector will have to perform more garbage collection
4052 and heap expansions to reach a stable state.
4053
4054 \subsection{The Mark Sweep collector}
4055
4056    An alternative to the copying collector is the the mark sweep collector.
4057 Like the previous collector, this collector also uses conservative scanning
4058 to locate roots.  Unlikely the Boehm collector, type accurate marking
4059 is used through the user supplied tracing method.
4060
4061    Since the default collector is of the \T{BGC} variety, the user must
4062 create an instance of \T{MarkSweepGC} if the mark sweep collector
4063 is to be used.
4064
4065 \begin{verbatim}
4066    class MarkSweepGC : public CGC {
4067    public:
4068       virtual void set_gc_ratio(int);
4069       virtual void set_initial_heap_size (size_t);
4070       virtual void set_min_heap_growth   (size_t);
4071    }
4072 \end{verbatim}
4073
4074    The gc ratio in class \T{MarkSweepGC} determines whether heap
4075 expansion is to be performed after a collection.  The default gc ratio
4076 is 50\%, thus if after garbage collection the ratio of used versus total
4077 heap space exceeds one half, the heap will be expanded.
4078
4079    For most purposes the two collectors are interchangeable.  Since
4080 all types of garbage collectors under our framework use the same protocol,
4081 applications can be written without fixing a specific garbage collection
4082 algorithm before hand.
4083
4084 \subsection{Finalization}
4085
4086    One common \Cpp{} idiom uses constructor and destructor for
4087 resource allocation: resources (including memory but may include others,
4088 such as file handles, graphical widgets, etc) that are acquired
4089 in a class constructor are released(finalized) in the class'es destructor.
4090
4091    There are, however, two opposing views in how finalization should be handled
4092 in a garbage collector.  The first assumes that garbage collection
4093 simulates an infinite amount of memory and so automatic finalization is
4094 inapproriate.   The second takes a more pragmatic view, and assumes that
4095 automatic finalization should be provided since otherwise explicit resource
4096 tracking must be provided by the user for collectable datatypes, making garbage
4097 collection much less useful than it can be.
4098
4099    We will refrain from participating in any religious and philosophical
4100 wars on which dogma is the ``One True Way''.
4101 nstead, both types of collectors are provided.
4102
4103    By default, all garbage collection classes do
4104 not perform finalization on garbage
4105 collected objects.  If object finalization is desired then the user
4106 can do one of two things:
4107 \begin{itemize}
4108    \item Enable the finalization of the default heap by putting
4109 \begin{verbatim}
4110    GC::get_default_gc().set_finalization(true);
4111 \end{verbatim}
4112    in the initialization code, or
4113    \item Instantiate a different instance of the garbage collector
4114     and allocate objects needing finalization from this collector.
4115     This may be a better method since not all objects need finalization
4116     and those that do not can still be allocated from the default collector.
4117     This way we only have to pay for the performance penalty (if any)
4118     proportional to the usage of finalization.
4119 \end{itemize}
4120
4121    For example, if a collectable class named \T{Foo} should be properly
4122 finalized we can declare it in the following manner:
4123
4124 \begin{verbatim}
4125    extern BGC bgc_f;  // somewhere an instance is defined
4126
4127    class Foo : public GCObject {
4128       Resource r;
4129    public:
4130       Foo() { r.allocate(); /* allocate resource */ }
4131      ~Foo() { r.release(); /* release all resource */ }
4132
4133       // Make object allocate from bgc_f instead
4134       // of the default non-collectable gc.
4135       void * operator new (size_t n) { return bgc_f.m_alloc(n); }
4136    };
4137 \end{verbatim}
4138
4139    When an instance of class \verb|Foo| is identified as garbage, its
4140 destructor will be called.  [Aside: {\it currently issue such as
4141 finalization synchronization is not handled directly by the collector.
4142 So if there is synchronization constraints during finalization it must
4143 be handled by the object destructor.  Future versions of \Prop{} will
4144 provide subclasses with more sophisticated finalization mechanisms.} ]
4145
4146    Notice that the default \verb|delete| operator of all garbage collectable
4147 classes will call the object destructor explicitly.  It is a good idea to use
4148 explicit \verb|delete| if finalization is a rare need since this service
4149 involves an explicit scan of the garbage memory, which may degrade performace
4150 with excessive swapping: without finalization the garbage pages will not
4151 have to be referenced with a copying collector.
4152
4153    It should also be noted that since the collector is conservative, there
4154 is no guarantee that garbage will be identified as such:
4155 there is no guarantee that all garbage resources will be released.
4156 In general, however, the efficacy of the collector is quite good and
4157 so non-critical or plentiful resources can be safely finalized with
4158 this collector.
4159
4160 \Subsubsection{Weak Pointers}
4161
4162    It is frequently very useful to be able to keep track of garbage
4163 collectable objects through the use of ``{\bf weak pointers}.''
4164 Collectors ignore the presence of weak pointers to garbage collected
4165 objects; objects with only referencing weak pointers will still be
4166 collected.  Weak pointers to objects that become garbage will
4167 be automatically reset to null.
4168
4169    Weak pointers are provided through a smart pointer template
4170 \CLASSDEF{WeakPointer}, whose definition is shown below:
4171
4172 \begin{verbatim}
4173 template <class T> class WeakPointer {
4174    public:
4175       inline WeakPointer();
4176       inline WeakPointer(const WeakPointer<T>& wp);
4177       inline WeakPointer(T * ptr);
4178       inline WeakPointer<T>& operator = (const WeakPointer<T>& wp);
4179       inline WeakPointer<T>& operator = (T * ptr);
4180       inline bool is_null() const;
4181       inline operator const T * () const;
4182       inline operator       T * ();
4183       inline const T * operator -> () const;
4184       inline       T * operator -> ();
4185       inline const T&  operator *  () const;
4186       inline       T&  operator *  ();
4187    };
4188 \end{verbatim}
4189
4190 A weakpointer to a garbage collectable
4191 class \verb|A| can be defined as \verb|WeakPointer<A>|.
4192 If a \Prop{} datatype \verb|T| has been defined, a weakpointer to type
4193 \verb|T| can defined using the
4194 \TDEF{classof} keyword.  For example:
4195
4196 \begin{verbatim}
4197    datatype Wff :: collectable = True | False | And(...)  | ...;
4198
4199    WeakPointer<classof Wff> a = And(True,False);
4200 \end{verbatim}
4201
4202 \Subsubsection{The Heap Walker}
4203
4204    There is a simple class called \CLASSDEF{GCVerify} that can be used
4205 to verify the integrity of a complex allocated data structure.
4206 This is frequently useful during the development of a new collector,
4207 or a complex class derive class of \T{GCObject}.
4208
4209    The interface of this class is shown below.
4210
4211 \begin{verbatim}
4212    class GCVerifier : protected GC {
4213    public:
4214       virtual bool is_valid_pointer          (GCObject *);
4215       virtual bool is_valid_interior_pointer (GCObject *);
4216       virtual bool is_valid_structure        (GCObject *);
4217       virtual bool is_valid_pointer          (GCObject *, GC *);
4218       virtual bool is_valid_interior_pointer (GCObject *, GC *);
4219       virtual bool is_valid_structure        (GCObject *, GC *);
4220       size_t number_of_nodes() const;
4221    };
4222 \end{verbatim}
4223
4224    Class \CLASSDEF{GCVerify} is derived from class \CLASSDEF{GC} so that the
4225 same object tracing protocol can be used.  Briefly, three different methods
4226 are provided for verification:
4227 \begin{itemize}
4228    \item Call to \verb|is_valid_pointer(p,gc)| verifies that
4229     \verb|p| is a valid pointer to an object within the collector \verb|gc|.
4230    \item Call to \verb|is_valid_pointer(p,gc)| verifies that
4231     \verb|p| is a valid interior pointer to an object
4232      within the collector \verb|gc|.
4233    \item Call to \verb|is_valid_structure(p,gc)| verifies the entire
4234      structure reachable by \verb|p| is valid.  This method uses
4235      \verb|GCObject::trace| to locate the correct sub-structures.
4236 \end{itemize}
4237
4238    There are alternative versions of the same methods that assumes
4239 the default collector is used.   Furthermore
4240 \begin{itemize}
4241    \item Method \verb|number_of_nodes()| returns the node count of the
4242 last structure traced using \verb|is_valid_structure|, and
4243    \item If \verb|set_verbosity(GC::gc_print_debugging_info)| is used
4244 then error messages will be printed during structure tracing.
4245 \end{itemize}
4246
4247 \bibliography{refman}
4248
4249 \input{refman.ind}
4250 \end{document}