initial
[prop.git] / docs / refman.tex
blobab1805a3f47afb22df5b671c45aa65b5cebc2864
1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 % Here are some abbreviations used within this document
5 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
6 \newfont{\bsf}{cmssbx10}
7 \newfont{\largebsf}{cmssbx12}
8 %\newfont{\nontermfont}{cmssbx10}
9 \newcommand{\nontermfont}{\it}
10 \newcommand{\codefont}{\tt}
11 \newcommand{\symfont}{\bf}
13 \newcommand{\Prop}{{\bsf Prop}} % typeset in sans serif
14 \newcommand{\LargeProp}{{\largebsf Prop}}
15 \newcommand{\plusplus}{{\tt++}} % ++
16 \newcommand{\Cpp}{{C{\tt++}}} % C++ logo
17 \newcommand{\Gpp}{{G{\tt++}}} % G++ logo
18 \newcommand{\email}{{\tt leunga@cs.nyu.edu}} % my email address
19 \newcommand{\web}{{\tt http://valis.cs.nyu.edu:8888/\~{}leunga}} % my web page
20 \newcommand{\propweb}{\web{\tt/prop.html}}
21 \newcommand{\Version}{2.3.4} % Prop version
22 \newcommand{\comment}[1]{}
24 \newcommand{\sectionfont}{}
25 %\newcommand{\Section}[1]{\section{\sectionfont #1}}
26 %\newcommand{\Subsection}[1]{\subsection{\sectionfont #1}}
27 %\newcommand{\Subsubsection}[1]{\subsubsection{\sectionfont #1}}
29 \newcommand{\Section}[1]{\pagebreak\section{\sectionfont #1}%
30 \markright{\usebox{\LINE}\large \thesection. \sc #1}
32 \newcommand{\Subsection}[1]{\subsection{\sectionfont #1}}
33 \newcommand{\Subsubsection}[1]{\subsubsection{\sectionfont #1}}
35 \newcommand{\N}[1]{\mbox{{\nontermfont #1}}}
36 \newcommand{\T}[1]{{\codefont #1}}
37 \newcommand{\MORE}[2]{#1\T{#2} \mbox{$\ldots$} \T{#2} #1}
38 \newcommand{\ALT}[2]{#1\ \T{#2} \mbox{$\ldots$} \T{#2} #1}
39 \newcommand{\LIST}[1]{\MORE{#1}{,}}
40 \newcommand{\SEQ}[1]{\MORE{#1}{;}}
41 \newcommand{\OPT}[1]{{\it [ #1 ]}}
42 \newcommand{\C}[1]{\mbox{\rm #1}}
43 \newcommand{\IS}{{\symfont ::=}}
44 \newcommand{\OR}{\ \mbox{$\symfont \mid$}\ }
45 \newcommand{\Id}{\N{Id}\ }
46 \newcommand{\Pat}{\N{Pat}\ }
47 \newcommand{\Exp}{\N{Exp}\ }
48 \newcommand{\TypeExp}{\N{Type\_Exp}\ }
49 \newenvironment{syntax}{\begin{quotation}\noindent\begin{tabular}{lcll}} %
50 {\end{tabular}\end{quotation}}
52 \newcommand{\CLASSDEF}[1]{\T{#1}\index{Classes!{\codefont #1}}}
53 \newcommand{\INDEX}[1]{\index{#1}}
54 \newcommand{\NDEF}[1]{\N{#1}\index{Syntax!Non--terminals!{\nontermfont #1}}}
55 \newcommand{\OPTIONDEF}[1]{{\tt #1}\index{Command line options!{\tt #1}}}
56 \newcommand{\TDEF}[1]{\T{#1}\index{Syntax!Keywords!{\codefont #1}}}
57 \newcommand{\INCOMPLETE}{{\em This section is incomplete.}}
58 \newenvironment{Error}{\begin{center}\begin{tabular}{p{3in}p{3in}}
59 \bf Error & \bf Explanation}{\\\hline\end{tabular}\end{center}}
60 \newcommand{\errormsg}[1]{\\\hline {\tt #1}&}
61 \newenvironment{Tips}{\noindent {\bf Tips:} \quad}{}
63 \newcommand{\support}{
64 {\small\noindent This work is supported in part by
65 an award from Hewlett-Packard Corporation, from IBM corporation, and
66 an NYU Research Challenge Grant.
68 \newcommand{\warning}{
69 {\small\noindent The author will neither assume responsibility for any
70 damages caused by the use of this product, nor accept warranty or update
71 claims. This product is distributed in the hope that it will be useful,
72 but WITHOUT ANY WARRANTY; without even the implied warranty of
73 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
75 \small\noindent This product is in the public domain,
76 and may be freely distributed.
78 \small\noindent {\sf Prop} is a research prototype.
79 The information contained in this document is subject
80 to change without notice.
83 \makeindex
85 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
87 % Beginning of the document
89 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
90 \documentclass{article}
91 \usepackage{html}
93 \pagestyle{myheadings}
95 \title{{\Huge \bf Prop \\
96 Language Reference Manual} \\
97 {\large \bf (for version \Version)}
99 \author{{\Large Allen Leung} \\
100 E-mail: \email \\
101 \htmladdnormallink{\web}{http://valis.cs.nyu.edu:8888/~leunga}
103 Courant Institute of Mathematical Sciences \\
104 251 Mercer Street \\
105 New York, NY 10012 \\
108 \setlength{\textwidth}{6.6in}
109 \setlength{\evensidemargin}{0in}
110 \setlength{\oddsidemargin}{0in}
111 \setlength{\textheight}{8.75in}
112 \setlength{\topmargin}{-0.5in}
113 \setlength{\parskip}{1mm}
114 \setlength{\parindent}{3ex}
115 \setlength{\itemsep}{0.5ex}
117 \newsavebox{\LINE}
118 \savebox{\LINE}[0.0in][l]{\rule[-1ex]{\textwidth}{.01in}}
119 \begin{document}
121 \maketitle
122 \vspace{\fill}
123 %\support
125 \begin{abstract}
126 This reference manual provides an introduction to
127 \Prop, release \Version. \Prop{} is a multiparadigm extension of \Cpp,
128 and is designed for building high performance compiler
129 and language transformation systems, using pattern matching and rewriting.
130 This guide describes the syntax and semantics of
131 the \Prop{} language and describes how to develop programs
132 using the \Prop{} to \Cpp{} translator.
134 \end{abstract}
135 \warning
136 \titlepage
137 \tableofcontents
138 \bibliographystyle{alpha}
140 \Section{Introduction} \label{sec:intro}
142 This is the language reference manual for \Prop, a compiler generator.
143 \Prop{} can be used to generate lexers, parsers, analysis and
144 transformation tools using pattern matching and rewriting.
145 The \Prop{} language is a {\em multi-paradigm}
146 extension of \Cpp\cite{C++} and
147 \Cpp{} programs generated from \Prop{} specifications can be compiled
148 into efficient code.
150 Version \Version{} of \Prop{} includes the following major features:
151 \begin{enumerate}
152 \item Algebraic datatypes and pattern matching, and automatic
153 mapping of algebraic datatypes into \Cpp{} class hierarchies.
154 \item A LALR(1) parser generator in the tradition of {\it yacc} and
155 {\it bison}. Each grammar is encapsulated into a parser class and
156 it is possible to have multiple instances of a parser.
157 \item Regular expression string matching and lexer generation.
158 \item Transformation of algebraic datatypes using conditional
159 rewriting.
160 \item In addition, using the view mechanism,
161 it is possible to treat externally defined C or \Cpp{}
162 data structures as if they were \Prop{} defined datatypes.
163 The full range of pattern matching and rewriting capability can
164 be used on these externally defined data..
165 \end{enumerate}
167 In addition, we intend to implement/fix-up the following features in the
168 near future:
169 \begin{enumerate}
170 \item Persistence.
171 \item Pretty printing/reading.
172 \item Visualization of data structures using {\em vcg}\cite{vcg,vcg-manual}.
173 \item Automatic mapping of graph structure specifications into
174 \Cpp{} classes.
175 \item Graph rewriting system generation.
176 \end{enumerate}
178 \Subsection{Availability}
180 The latest release of \Prop{} \Version{} is available from
181 \begin{quotation}
182 \htmladdnormallink{\propweb}{http://valis.cs.nyu.edu:8888/~leunga/prop.html}
183 \end{quotation}
184 New updates of the source code and documentation will be available
185 from this site.
187 \Subsection{Organization of this Reference Manual}
188 This reference manual is organized as follows. Section~\ref{sec:general}
189 overviews the general features of \Prop.
190 Section~\ref{sec:parser}
191 describes the lexer and parser specification formalisms.
192 Section~\ref{sec:datatype} describes the algebraic datatypes and
193 pattern matching features. Section~\ref{sec:tree-rewriting} describes
194 the tree rewriting mechanism. Finally, section~\ref{sec:usage} describes
195 the process of running the \Prop{} translator.
197 We'll use the following extended BNF notation to specify the syntax
198 of the language. Terminals are typeset in {\tt typewriter} font,
199 while non-terminals are typeset in {\nontermfont itallics} font.
200 Given a syntactic class \N{S}, we use \LIST{\N{S}} to denote one
201 or more occurrences of \N{S} separated by commas. Similarly,
202 \SEQ{\N{S}} denotes one or more occurrences of \N{S} separated by
203 semi-colons. Slanted brackets are used to denote an optional occurrence
204 of a syntactic class. We'll combined the two forms when we
205 want to denote zero or more occurrences of some term.
206 For example, \OPT{\LIST{\N{S}}} denotes
207 zero or more occurrences of \N{S}.
209 \Section{General Concepts} \label{sec:general}
211 \Subsection{Syntactic Conventions}
213 The syntax of \Prop{} is an extension of \Cpp. Most new constructs
214 of \Prop{} starts with new keywords, listed below.
215 \begin{center}
217 \begin{tabular}{lllllll}
218 applicative & arb & as & bagof & begin & bitfield & card \\
219 category & classof & collectable & constraint & dataflow & datatype & declare \\
220 dequeof & dom & domain & edges: & elsif & end & equiv: \\
221 exists & expect: & feature & finalizable & forall & fun & function \\
222 functor & graphrewrite & graphtype & implies: & inference & inherited & inline \\
223 instantiate & is & law & left: & less & lexeme & listof \\
224 loop & mapof & match & matchall & matchscan & module & multimapof \\
225 mutable & nodes: & of & persistent & prec: & printable & priqueueof \\
226 procedure & queueof & ran & refine & relation & rewrite & right: \\
227 setof & sharing & signature & space: & syntax & synthesized & then \\
228 time: & topdown & traced & treeparser & tupleof & unifiable & unique \\
229 view & where & with & xor: & bottomup: & topdown: & before: \\
230 preorder: & postorder: & cutrewrite & failrewrite & index: \\
231 \end{tabular}
232 \end{center}
234 In addition, the following new multi-character symbols are added:
235 \begin{verbatim}
236 <-> <=> := :- .( .[
237 #[ #( #{ [| |] (| |) {| |}
238 \end{verbatim}
240 \Subsection{Basic syntactic classes}
243 We'll use \NDEF{Integer}, \NDEF{Char}, \NDEF{Real}, \NDEF{String}
244 and \NDEF{Id} to denote the syntactic classes of
245 integers, characters, real numbers, strings and identifiers, respectively.
246 As in \Cpp, characters are quoted with the single quotes \verb|'|, while
247 strings are quoted with double quotes \verb|"|. An identifier is
248 any sequence of alphanumeric of characters that begins with a letter.
249 In addition, we'll use the syntactic class
250 \NDEF{Stmt} to denote any valid combination of \Cpp{} and \Prop{}
251 statements.
253 \Subsection{Literals}
255 In addition to the usual \Cpp{} literals, \Prop{} introduces two new type
256 of literals: {\em quark literals} and {\em regular expression literals}.
257 Quark literals are of type \T{Quark} and are written as strings
258 prefixed by the \verb|#| symbol.
259 \begin{syntax}
260 \NDEF{Quark} & \IS & \T{\#}\N{String} \\
261 \end{syntax}
263 Quarks act like atoms in Lisp, in which name equality implies
264 pointer equality. In contrast, in \Cpp{} two string literals that are
265 identical in value do not necessarily reside in the same address.
266 Given strings \verb|s1| and \verb|s2|, \verb|strcmp(s1,s2) == 0| does
267 not imply \verb|s1 == s2|. Quarks are defined in the library include file
268 \verb|<AD/strings/quark.h>|.
270 Regular expression literals are similar to string literals except
271 that they are quoted by slashes \verb|/|. We'll discuss
272 the regular expression in section~\ref{sec:regular-expressions}.
274 \Subsection{The \Prop{} Language}
276 The basic \Prop{} language is the same as \Cpp{} with the following
277 syntactic extensions:
278 \begin{quotation}
279 \begin{tabular}{lll}
280 & \bf Keywords & \bf Functions \\
281 \bsf 1 & \T{datatype} \T{refine} \T{instantiate}
282 & Algebraic datatype definitions \\
283 \bsf 2 & \T{match} \T{matchall} & Pattern matching \\
284 \bsf 3 & \T{rewrite} & Rewriting \\
285 \bsf 4 & \T{lexeme} & Lexical category definition \\
286 \bsf 5 & \T{matchscan} & Lexical scanning \\
287 \bsf 6 & \T{syntax} & Parser specification \\
288 \bsf 7 & \T{graphtype} & Graph data structure specification \\
289 \bsf 8 & \T{graphrewrite} & Graph rewriting \\
290 \end{tabular}
291 \end{quotation}
293 Like \Cpp, a \Prop{} is typically divided into the {\em specification},
294 which defines the data structure and its interface, and the
295 {\em implementation} parts. \Prop{} specifications should
296 be placed in a file with a \verb|.ph| suffix, while an implementation
297 should be placed in a file with a \verb|.pC|\footnote{The suffixes {\tt .pcc},
298 {\tt .pCpp} etc. can also be used.}. The translator will convert each
299 \verb|.p|$xxx$ into a file with the suffix \verb|.|$xxx$. The translated
300 output can then be fed to the \Cpp{} compiler for compilation.
301 \vspace{\fill}
303 \Section{Lexer and Parser Generation} \label{sec:parser}
305 In this section we'll describe the lexer and parser generation
306 mechanism of \Prop. Note that the use of these features is not
307 mandatory: the user can substitute any other lexer/parser generation tools
308 or use hand written lexers and parsers.
309 On the other hand, higher level of integration is possible when using
310 \Prop's lexer and parser mechanisms.
312 \Subsection{Lexer Specification}
313 Lexical analyzers are specified using the \verb|matchscan|
314 statement. This construct is responsible for generating the actual string
315 matching DFA. The actual buffering mechanisms are provided by the
316 classes \verb|LexerBuffer|, \verb|IOLexerBuffer| and \verb|IOLexerStack|.
317 These classes are part of the support library distributed with \Prop.
319 We assume that the user is familiar with lexer generator tools like
320 {\em lex}\cite{lex} or {\em flex}\cite{flex}.
322 \Subsubsection{Regular expressions} \label{sec:regular-expressions}
323 Figure~\ref{fig:regexp} describes the set of current
324 supported regular expressions in \Prop. The syntax is similar
325 to what is found in {\em lex}, or {\em egrep}.
327 \begin{figure}[htbp]
328 \begin{center}
329 \begin{tabular}{|l|l|} \hline
330 $c$ & matches character $c$ if it is not a meta character \\
331 $e_1e_2$ & matches $e_1$ then $e_2$ \\
332 \verb|.| & matches any character except \verb|\n| \\
333 $\verb|\|c$ & matches escape sequence $c$ \\
334 \verb|^|$e$ & matches $e$ at the start of the line \\
335 $e_1\verb.|.e_2$ & matches $e_1$ or $e_2$ \\
336 $e\verb|*|$ & matches zero or more $e$ \\
337 $e\verb|+|$ & matches one or more $e$ \\
338 $e\verb|?|$ & matches zero or one $e$ \\
339 \verb|(|$e$\verb|)| & grouping \\
340 \verb|<<|$C_1, C_2 \ldots C_n$\verb|>>|$e$ & matches $e$ only
341 if we are in a context from one of $C_1, C_2, \ldots C_n$ \\
342 \verb|{|{\em lexeme}\verb|}| & matches lexeme \\
343 \hline
344 \end{tabular}
345 \end{center}
346 \caption{\label{fig:regexp} Regular expressions.}
347 \end{figure}
349 The symbols \verb%\ [ ] ( ) { } << >> * + . - ? |% are meta characters and are
350 interpreted non-literally. The escape character \verb|\|
351 can be prepended to the meta characters if they occur as literals in context.
353 Precedence-wise, meta characters \verb|*|, \verb|+| and \verb|?| bind tighter
354 than juxtaposition. Thus the regular expression
355 \verb|ab*| means \verb|a(b*)|. Parenthesis can be used to override
356 the default precedence.
358 Character classes are of the form as found in {\em lex}:
359 (i) $c_1$\verb|-|$c_2$ denotes the range of characters from
360 $c_1$ to $c_2$; (ii) single (non-meta) characters denote themselves;
361 (iii) the meta character \verb|^| can be used to negate the set.
362 For example, the regular expression \verb|[a-zA-Z][a-zA-Z0-9]*|
363 specifies an alphanumeric identifier that must starts with a letter.
364 Similarly, the regular expression \verb|[^ \t\n]| matches any character
365 except a space, a tab or a newline.
367 {\em Lexemes} are simply abbreviated names given to a regular expression
368 pattern. They act like macros in {\em lex}.
370 While a lexer is scanning, it may be in one of many {\em contexts}.
371 Contexts can be used to group a set of related lexical rules; such rules
372 are only applicable when the contexts are active. This makes the
373 lexer behave like a set of DFAs, with the ability to switch between
374 DFAs under programmer control.
377 \Subsubsection{Lexeme}
379 Lexemes are defined using the \T{lexeme} declaration in \Prop.
380 Its syntax is
381 \begin{syntax}
382 \NDEF{Lexeme\_Decl} & \IS & \TDEF{lexeme} \ALT{\N{Lexeme\_Eq}}{|} \T{;} \\
383 \NDEF{Lexeme\_Eq} & \IS & \Id \T{=} \N{Regexp} \\
384 \end{syntax}
386 For example, the following lexeme definition is used in the \Prop{}
387 translator to define the lexical structure of common lexical items.
389 \begin{verbatim}
390 lexeme digits = /[0-9]+/
391 | sign = /[\+\-]/
392 | integer = /{digits}/
393 | exponent = /[eE]{sign}?{digits}/
394 | mantissa = /({digits}\.{digits}?|\.{digits})/
395 | real = /{mantissa}{exponent}?/
396 | string = /"([^\\"\n]|\\.)*"/
397 | character = /'([^\\'\n]|\\.[0-9a-f]*)'/
398 | regexp = /\/([^\/\n*]|\\.)([^\/\n]|\\.)*\//
400 \end{verbatim}
402 Note that regular expression literals are written between
403 slashes: \verb|/|$re$\verb|/|.
405 \Subsubsection{Lexeme class}
407 Often we wish to group a set of lexemes together into {\em lexeme classes} if
408 they logically behave in some uniform manner; for example, if they
409 act uniformly in a lexical rule. By grouping related lexemes together
410 into a class we can refer to them succinctly by their class name.
412 The syntax of a lexeme class declaration is
414 \begin{syntax}
415 \NDEF{Lexeme\_Class\_Decl} & \IS & \TDEF{lexeme class}
416 \ALT{\N{Lexeme\_Class\_Eq}}{and} \T{;} \\
417 \NDEF{Lexeme\_Class\_Eq} & \IS & \Id \T{=} \ALT{\N{Lexeme\_Spec}}{|} \\
418 \NDEF{Lexeme\_Spec} & \IS & \N{String} \\
419 & \OR & \Id \N{Regexp} \\
420 \end{syntax}
422 The following example,
423 the lexeme classes \verb|MainKeywords|, \verb|Symbols|, and \verb|Literals|
424 are defined.
425 \begin{verbatim}
426 lexeme class MainKeywords =
427 "rewrite" | "inference" | "match" | "matchall" | "matchscan"
428 | "refine" | "classof" | "type" | "datatype" | "instantiate"
429 | "lexeme" | "bitfield" | "begin" | "syntax"
430 | "dataflow" | "module" | "signature" | "constraint" | "declare"
431 | "procedure" | "fun" | "function" | "domain"
432 | "graphtype" | "graphrewrite"
434 and Symbols =
435 ".." | "..." | "<->" | "::" | "&&" | "||" | "++" | "--" | "->"
436 | "<<" | ">>" | ">=" | "<=" | "+=" | "-=" | "*=" | "/=" | "%=" | "=="
437 | "!=" | "<<=" | ">>=" | "&=" | "|=" | "^=" | "=>" | "<-" | "<=>"
438 | ":=" | ":-" | LONG_BAR /\-\-\-\-\-+/
440 and Literals =
441 INT_TOK /{integer}/
442 | REAL_TOK /{real}/
443 | CHAR_TOK /{character}/
444 | STRING_TOK /{string}/
446 \end{verbatim}
448 \Subsubsection{Tokens}
450 Lexical tokens are defined using the \verb|datatype| declaration.
451 The syntax is as follows.
452 \begin{syntax}
453 \NDEF{Tokens\_Decl} & \IS & \TDEF{datatype} \Id \T{::} \TDEF{lexeme} \T{=} \\
454 & & \quad \ALT{\N{Token\_Spec}}{|} \T{;} \\
455 \NDEF{Token\_Spec} & \IS & \TDEF{lexeme class} \Id &
456 \C{Include a lexeme class} \\
457 & \OR & \Id \OPT{\N{Regexp}} &
458 \C{Single token spec} \\
459 \end{syntax}
461 A token datatype is defined by including one of more lexeme classes,
462 and by defining stand-alone tokens. If a lexeme class is included,
463 all the tokens defined within the lexeme class are included. A \Cpp{}
464 \T{enum} type of the same name as the token datatype is generated.
465 If a token is given an identifier name, then the same name is used
466 as the \T{enum} literal. On the other hand, if a string is used
467 to denote a token, then it can be referred to by prefixing the
468 string with a dot \verb|.| For example, the token \verb|"=>"| can
469 be referenced as \verb|."=>"| within a program.
471 As an example, the following token datatype definition is used within
472 the \Prop{} translator. Here, the keywords are first partitioned into
473 6 different lexeme classes. In additional, the tokens \T{ID\_TOK},
474 \T{REGEXP\_TOK}, etc. are defined.
476 \begin{verbatim}
477 datatype PropToken :: lexeme =
478 lexeme class MainKeywords
479 | lexeme class Keywords
480 | lexeme class SepKeywords
481 | lexeme class Symbols
482 | lexeme class Special
483 | lexeme class Literals
484 | ID_TOK /{patvar}/
485 | REGEXP_TOK /{regexp}/
486 | QUARK_TOK /#{string}/
487 | BIGINT_TOK /#{sign}{integer}/
488 | PUNCTUATIONS /[\<\>\,\.\;\&\|\^\!\~\+\-\*\/\%\?\=\:\\]/
490 \end{verbatim}
492 \Subsubsection{The \T{matchscan} statement}\label{sec:matchscan}
494 The \T{matchscan} statement is used to perform tokenization. The user can
495 specify a set of string pattern matching rules within a \T{matchscan}
496 construct. Given a object of class \verb|LexerBuffer|, the \T{matchscan}
497 statement looks for the rule that matches the longest prefix from
498 the input stream and executes the action associated with the rule.
499 Ties are broken by the lexical ordering of the rules.
501 The general syntax is as follows:
502 \begin{syntax}
503 \NDEF{Matchscan}
504 & \IS & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}} \T{(} \N{Exp} \T{)}
505 & variant 1 \\
506 & & \T{\{} \ALT{\T{case} \N{Matchscan\_Rule}}{} \T{\}} \\
507 & \OR & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}}
508 \T{(} \N{Exp} \T{)} & variant 2 \\
509 & & \T{\{} \ALT{\N{Matchscan\_Rule}}{|} \T{\}} \\
510 & \OR & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}} \T{(} \N{Exp} \T{)}
511 \T{of} & variant 3 \\
512 & & \quad \ALT{\N{Matchscan\_Rule}}{|} \\
513 & & \T{end} \T{matchscan} \T{;}\\
514 \NDEF{Matchscan\_Mode} & \IS &
515 \TDEF{matchscan} \OPT{\T{while}} & case sensitive\\
516 & \OR & \TDEF{matchscan*} \OPT{\T{while}}& case insensitive\\
517 \NDEF{Context\_Spec} & \IS & \T{[} \LIST{\N{Id}} \T{]} \\
518 \NDEF{Matchscan\_Rule}
519 & \IS & \OPT{\T{<<} \ALT{\N{Context}}{,} \T{>>}} \\
520 & & \quad \T{lexeme} \T{class} \N{Id} \T{:} \N{Matchscan\_Action} \\
521 & \OR & \OPT{\T{<<} \ALT{\N{Context}}{,} \T{>>}} \\
522 & & \quad \N{Regexp} \T{:} \N{Matchscan\_Action} \\
523 \NDEF{Matchscan\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
524 & \OR & \N{Code} & for variant 1 only \\
525 \end{syntax}
527 The two different modes of operation are \T{matchscan} and \T{matchscan*},
528 which respectively match strings case sensitively and insensitively.
529 The modifier \T{where} may optionally specify that the matching process
530 should repeat until no rules apply, or the end of stream condition
531 is reached.
533 By default, if no rules apply and if the input stream is non-empty,
534 then an error has occurred. The \T{matchscan} statement will
535 invoke the method \verb|error()| of the \T{LexerBuffer} object by default.
537 For example, the following is part of the \Prop{} lexer specification.
539 \begin{verbatim}
540 datatype LexicalContext = NONE | C | PROP | COMMENT | ...;
542 int PropParser::get_token()
544 matchscan[LexicalContext] while (lexbuf)
547 | <<C>> /[ \t\\\014]/: { emit(); }
548 | <<C>> /(\/\/.*)?\n/: { emit(); line++; }
549 | <<C>> /^#.*/: { emit(); }
550 | <<PROP>> lexeme class MainKeywords: { return ?lexeme; }
551 | <<PROP>> lexeme class SepKeywords: { return ?lexeme; }
552 | <<PROP>> QUARK_TOK: { return QUARK_TOK; }
553 | <<PROP>> BIGINT_TOK: { return BIGINT_TOK; }
554 | <<PROP>> REGEXP_TOK: { return REGEXP_TOK; }
555 | <<PROP>> PUNCTUATIONS: { return lexbuf[0]; }
556 | <<PROP>> /[ \t\014]/: { /* skip */ }
557 | <<PROP>> /(\/\/.*)?\n/: { line++; }
558 | /\/\*/: { emit(); set_context(COMMENT); }
559 | <<COMMENT>> /\*\//: { emit(); set_context(PROP); }
560 | <<COMMENT>> /\n/: { emit(); line++; }
561 | <<COMMENT>> /./: { emit(); }
562 | /./: { error("%Lillegal character %c\n", lexbuf[0]); }
565 \end{verbatim}
567 Here, the lexer is partitioned in multiple lexical contexts: context
568 \verb|C| deals with \Cpp{} code while context \verb|PROP| deals with
569 \Prop{} extensions. The special context \verb|COMMENT| is used
570 to parse \verb|/* */| delimited comments. Contexts are changed
571 using the \verb|set_context| method defined in class \verb|LexerBuffer|.
573 The special variable \verb|?lexeme| can be used within a rule
574 that matches a lexeme class. For example, within the rule
575 \begin{verbatim}
576 | <<PROP>> lexeme class MainKeywords: { return ?lexeme; }
577 \end{verbatim}
578 the variable \verb|?lexeme| is bound to
579 the token \verb|."rewrite"| if the string ``rewrite'' is matched;
580 it is bound to the token \verb|."inference"| if the string
581 ``inference'' is matched and so on.
583 \Subsubsection{Class \T{LexerBuffer}}
584 We'll next describe the class \T{LexerBuffer} and its subclasses.
586 Class \CLASSDEF{LexerBuffer}
587 is the base class in the lexical buffer hierarchy.
588 It is defined in the library include file \verb|<AD/automata/lexerbuf.h>|.
589 This class is responsible for implementing a string buffer for use
590 during lexical analysis.
592 As it stands, it can be used directly if the lexer input is directly
593 from a string. Memory management of the buffer is assumed to be handled
594 by the user.
596 The class \T{LexerBuffer} has three constructors. The default
597 constructor initializes the string buffer to NULL. The two
598 other constructors initialize the string buffer to a string given
599 by the user. In the case when the length is not supplied, the buffer
600 is assumed to be \verb|'\0'|-terminated. The two \verb|set_buffer| methods
601 can be used to set the current string buffer. Notice that
602 all lexical analysis operations are done in place. The user should
603 not alter the string buffer directly, but should use the interface
604 provided by this class instead.
606 \begin{verbatim}
607 class LexerBuffer {
608 public:
609 LexerBuffer();
610 LexerBuffer(char *);
611 LexerBuffer(char *, size_t);
612 virtual ~LexerBuffer();
613 virtual void set_buffer (char *, size_t);
614 void set_buffer (char *);
616 \end{verbatim}
618 The following methods are used access the string buffer.
619 Method \T{capacity} returns the size of the buffer. Method
620 \T{length} returns the length of the current matched token.
621 Methods \T{text} can be used to obtain a point to location
622 of the current matched token. The string returned is guaranteed
623 to be \verb|'\0'|-terminated. Methods \T{operator []} return
624 the $i$th character of the token. Finally, method \T{lookahead}
625 returns the character code of the next character to be matched.
627 \begin{verbatim}
628 int capacity () const;
629 int length () const;
630 const char * text () const;
631 char * text ();
632 char operator [] (int i) const;
633 char& operator [] (int i);
634 int lookahead () const;
635 void push_back (int n)
636 \end{verbatim}
638 In addition to the string buffer, the class \T{LexerBuffer} keeps
639 track of two additional types of information: the current context
640 of the DFA, and whether the next token starts at the
641 beginning of the line, or in our terminology, whether it is
642 {\em anchored}. These are manipulated with the following methods:
644 \begin{verbatim}
645 int context () const;
646 void set_context (int c = 0);
647 Bool is_anchored() const;
648 void set_anchored(Bool a = true);
649 \end{verbatim}
651 Finally, the following methods should be redefined by subclasses
652 to alter the behavior of this class. By default, the class \T{LexerBuffer}
653 calls \T{fill\_buffer()} when it reaches the end of the string; subclasses
654 can use this method to refill the buffer and return the number of
655 characters read. Currently, \T{fill\_buffer} is defined to do nothing
656 and return 0. When it reaches the end of the file
657 (i.e. when \T{fill\_buffer()} fails to refill the buffer and the
658 scanning process finishes), method \T{end\_of\_file} is called.
659 Currently, this is a no-op. Finally, the error handling routine
660 \T{error()} is called with the position of the beginning and the end
661 of the buffer in which the error occurs. By default, this routine
662 prints out the buffer.
664 \begin{verbatim}
665 protected:
666 virtual size_t fill_buffer();
667 virtual void end_of_file();
668 virtual void error(const char * start, const char * stop);
669 \end{verbatim}
671 \Subsubsection{Class \T{IOLexerBuffer}}
673 Class \CLASSDEF{IOLexerBuffer}
674 is a derived class of \T{LexerBuffer}. It
675 automatically manages an internal buffer and receives input from
676 an \T{istream}. It is defined in the file \verb|<AD/automata/iolexerbuf.h>|
678 This class provides the following additional features:
679 Constructor \verb|IOLexerBuffer(istream&)| ties the input to a
680 stream. If the default constructor is used, then it automatically
681 ties the input to the stream \verb|cin|. The method \verb|set_stream|
682 can be used to set the stream to some other input.
684 \begin{verbatim}
685 class IOLexerBuffer : public LexerBuffer {
686 size_t buffer_size; // size of the buffer
687 istream * input; // input stream
688 public:
689 IOLexerBuffer();
690 IOLexerBuffer(istream&);
691 virtual ~IOLexerBuffer();
692 virtual void set_stream (istream&);
694 \end{verbatim}
696 By default, class \verb|IOLexerBuffer| reads new data from the input
697 stream using a line buffering discipline. This mechanism is suitable
698 for interactive, but for other applications it may be more efficient
699 to use block buffered input. The protected method
700 \verb|read_buffer| controls this reading behavior; it
701 is called whenever the data in the string buffer has been
702 consumed and new input is needed from the stream. It is
703 passed the position of the buffer and its remaining capacity,
704 and it returns the number of new characters that are read. Subclasses
705 can redefine this method.
707 \begin{verbatim}
708 protected:
709 virtual size_t read_buffer(char *, size_t);
710 \end{verbatim}
712 \Subsubsection{Class \T{IOLexerStack}}
714 Class \CLASSDEF{IOLexerStack}
715 is a derived class of \T{LexerStack}. It provides
716 a mechanism of reading from a stack of \verb|istream|'s. Streams
717 can be pushed and popped from the stack. The next token is
718 obtained from the stream from the top of the stack. The class allows
719 allows easy implementation of
720 constructs such as the \verb|#include| file mechanism of the C preprocessor.
722 The interface of this class is listed below:
723 \begin{verbatim}
724 class IOLexerStack : public IOLexerBuffer {
725 public:
726 IOLexerStack();
727 IOLexerStack(istream&);
728 virtual ~IOLexerStack();
730 virtual void push_stream (istream&);
731 virtual istream& pop_stream ();
733 \end{verbatim}
735 \Subsection{Parser Specification}
737 Parsers are specified as a two phase process:
738 \begin{enumerate}
739 \item First a {\em syntax class} is defined. A syntax class
740 declaration is like a normal \Cpp{} class declaration and has a similar
741 syntax, except that \Prop{} will also generate the interface of the parser.
742 All parsers that \Prop{} generates are encapsulated within a class. This
743 makes it easy for the programmer to add additional data for parsing
744 use, and to have multiple instances of a parser.
745 \item Secondly, the grammar of the language is defined in a
746 {\em syntax} declaration as a set of productions,
747 in a syntax similar to that of {\em yacc}.
748 \end{enumerate}
750 We'll describe these two phases.
752 \Subsubsection{Syntax class}
754 A syntax class definition specifies an object class that encapsulates
755 a parser. Its syntax is the same as the usual \Cpp{} class definition,
756 except that the prefix \T{syntax} is used:
758 \begin{syntax}
759 \NDEF{Syntax\_Class\_Decl} & \IS & \TDEF{syntax class} \Id
760 \OPT{\T{:} \N{Inherit\_List}} \T{\{} \N{Class\_Body} \T{\}} \T{;} \\
761 \end{syntax}
763 This specification automatically generate a class with the following interface
764 (assuming that the parser class in question is called \T{ParserClass})
765 \begin{verbatim}
766 public:
767 ParserClass(); // constructor
768 public:
769 virtual void parse();
770 \end{verbatim}
772 The constructor and the method \verb|parse()| will be automatically
773 generated later.
775 A parser class expects the following method to be defined:
776 \begin{verbatim}
777 int get_token();
778 \end{verbatim}
779 \noindent The function of \verb|get_token| is to return the
780 next token from the lexer stream whenever it is called.
781 It should return \verb|EOF| if the stream is empty.
783 \Subsubsection{Syntax declaration}
785 The grammar of a language is specified in the \T{syntax} declaration.
786 Its syntax is listed as follows:
788 \begin{syntax}
789 \NDEF{Syntax\_Decl} & \IS & \TDEF{syntax} \Id \T{\{} \\
790 & & \quad \OPT{\MORE{\N{Precedence\_Decl}}{}} \\
791 & & \quad \OPT{\N{Expect\_Decl}} \\
792 & & \quad \OPT{\MORE{\N{Production\_Rule}}{}} \\
793 & & \T{\}} \\
794 \end{syntax}
796 The name of a \T{syntax} declaration should match that
797 of a \T{syntax} \T{class}. A \T{syntax} declaration is divided
798 into three parts: (i) the operator precedence section, (ii) the
799 \T{expect:} $n$ section, and (iii) the grammar section.
801 \Subsubsection{Precedence}
803 Operator precedences are specified using precedence declarations.
804 The syntax is:
806 \begin{syntax}
807 \NDEF{Precedence\_Decl}
808 & \IS & \TDEF{left:} \N{Integer} \ALT{\N{Operator}}{} \T{;}
809 & left associative operators \\
810 & \OR & \TDEF{right:} \N{Integer} \ALT{\N{Operator}}{} \T{;}
811 & right associative operators \\
812 \NDEF{Operator} & \IS & \N{Char} & single character operator \\
813 & \OR & \N{String} & multi-character operator \\
814 & \OR & \N{Cons} & constructor \\
815 \end{syntax}
817 The integer associated with a precedence declaration is the precedence
818 level of an operator. The higher the level of an operator, the {\em lower}
819 its precedence.
821 For example, the following set of precedences are used in the \Prop{}
822 language parser:
823 \begin{verbatim}
824 left: 23 "as";
825 left: 22 "::";
826 left: 21 "||";
827 left: 20 "equiv:";
828 left: 19 "xor:";
829 left: 18 "implies:";
830 left: 17 "&&" "and";
831 right: 16 "|=" "&=" "^=" "<<=" ">>=";
832 right: 15 '=' ":=" "+=" "-=" "*=" "/=" "%=";
833 left: 14 '|';
834 left: 13 ':';
835 left: 12 ';';
836 left: 11 '^';
837 left: 10 '&';
838 left: 9 "==" "!=";
839 left: 8 '<' '>' ">=" "<=";
840 left: 7 "<<" ">>";
841 left: 6 '+' '-' "with" "less";
842 left: 5 '*' '/' '%';
843 left: 4 "++" "--";
844 left: 3 '!' '~' "arb" "card" "dom" "ran";
845 left: 2 '[' ']' '{' '}' '.' "->" ;
846 \end{verbatim}
848 \Subsubsection{{\tt expect:} $n$}
850 The declaration \T{expect} $n$ specifies that there should be $n$
851 shift/reduce errors in the grammar. If the actual grammar deviates
852 from this number, a warning will be generated. The alternative
853 form \T{expect:} \T{\_} suppresses all warnings. The syntax is:
855 \begin{syntax}
856 \NDEF{Expect\_Decl} & \IS & \TDEF{expect:} \N{Integer} \T{;}
857 & expects $n$ shift/reduce errors \\
858 & \OR & \T{expect:} \T{\_} \T{;}
859 & expects many shift/reduce errors \\
860 \end{syntax}
862 This declaration is optional. If omitted, warnings will be printed
863 if parser conflicts are found during parser generation.
865 \Subsubsection{Production rules}
868 The syntax of the production rules is as follows:
870 \begin{syntax}
871 \NDEF{Production\_Rule} & \IS &
872 \Id \OPT{\TypeExp} \T{:} \\
873 & & \quad \OPT{\ALT{\N{One\_Alt}}{|}} \T{;} \\
874 \NDEF{One\_Alt} & \IS & \OPT{\ALT{\N{Symbol}}{}} & one alternative \\
875 \NDEF{Symbol}
876 & \IS & \Id & non-terminal \\
877 & \OR & \N{Cons} & terminal (a datatype constructor) \\
878 & \OR & \N{String} & terminal (a datatype constructor) \\
879 & \OR & \N{Char} & terminal (from the ASCII character set) \\
880 & \OR & \T{?} & the error terminal \\
881 & \OR & \T{\$} & the end of file terminal \\
882 & \OR & \T{\{} \N{Code} \T{\}} & embedded actions \\
883 \end{syntax}
885 Each rule specifies a set of productions of the form
886 \[ \begin{array}{lcl}
887 lhs & : & A_{11} A_{12} \ldots A_{1n_1} \\
888 & \OR & A_{21} A_{22} \ldots A_{2n_2} \\
889 & \vdots & \\
890 & \OR & A_{m1} A_{m2} \ldots A_{mn_m} \\
891 \end{array}
893 where $lhs$ is the non-terminal and $A_{ij}$'s are a mixture of
894 terminals, non-terminals and embedded actions. If synthesized
895 attributes are used, then the type of the s-attribute should be
896 annotated next to the lhs non-terminal.
898 Terminals can be specified in a few ways: (i) a character literal
899 is a predefined terminal of the same name, (ii) a string literal
900 is matched with a \T{datatype} or a \T{lexeme} \T{class} constructor
901 of the same name, (iii) an identifier is a terminal if there is a
902 \T{datatype} constructor of the same name. Any other identifiers
903 are assumed to be non-terminals.
905 For example, the following are two sets of production rules used in the
906 \Prop{} translator itself:
907 \begin{verbatim}
908 ty Ty: simple_ty { $$ = $1; }
909 | ty '=' exp { $$ = DEFVALty($1,$3); }
910 | ty '*' { $$ = mkptrty($1); }
911 | ty '&' { $$ = mkrefty($1); }
912 | ty '[' ']' { $$ = mkptrty($1); }
913 | ty '[' exp ']' { $$ = mkarrayty($1,$3); }
915 exp Exp:
916 app_exp { $$ = $1; }
917 | exp '+' exp { $$ = BINOPexp("+",$1,$3); }
918 | exp '-' exp { $$ = BINOPexp("-",$1,$3); }
919 | exp '*' exp { $$ = BINOPexp("*",$1,$3); }
920 | exp '/' exp { $$ = BINOPexp("/",$1,$3); }
921 | exp '%' exp { $$ = BINOPexp("%",$1,$3); }
922 | exp '^' exp { $$ = BINOPexp("^",$1,$3); }
923 | exp "+=" exp { $$ = BINOPexp("+=",$1,$3); }
924 | exp "-=" exp { $$ = BINOPexp("-=",$1,$3); }
925 | exp "*=" exp { $$ = BINOPexp("*=",$1,$3); }
926 | exp "/=" exp { $$ = BINOPexp("/=",$1,$3); }
927 | exp "%=" exp { $$ = BINOPexp("%=",$1,$3); }
928 \end{verbatim}
929 \noindent {\em etc $\ldots$}
931 Here, \T{ty} is the non-terminal of this production. The type of
932 the synthesized attribute of this rule is \T{Ty}, and it is
933 written next to the non-terminal. Like {\em yacc}, the synthesized
934 attribute of a rule can be referenced using the \verb.$. variables:
935 variable \TDEF{\$\$} denote the synthesized attribute of the current rule,
936 while \TDEF{\$$n$} where $n$ is 1, 2, 3, \ldots denote the synthesized
937 attribute of the $n$th rhs symbol of the current rule.
939 \Subsubsection{Parser report}
941 If the command line option \OPTIONDEF{-r} is used, then
942 in addition to generating the code for the grammar, the translator
943 will also generate a comprehensive report of the grammar and
944 the resulting LALR(1) automaton. The amount of details in this
945 report can be varied using the the verbose option
946 \OPTIONDEF{-v}:
948 \begin{description}
949 \item[{\tt -v1}] Print full information for all LR(0) states.
950 \item[{\tt -v2}] Also print out the lookahead set of a reduction rule.
951 This may be useful for tracking down shift/reduce and reduce/reduce conflicts.
952 \item[{\tt -v3}] Do both \verb|-v1| and \verb|-v2|.
953 \end{description}
955 \Subsubsection{Interfacing with the generated lexer}
957 The easiest way of interfacing with a lexer is to embed
958 an object of class \T{LexerBuffer}, \T{IOLexerBuffer}, or \T{IOLexerStack}
959 within a syntax class\footnote{Alternatively, inheritance may be used.}.
960 We can then define the tokenization method
961 \verb|get_token| using the \T{matchscan} statement described
962 in section~\ref{sec:matchscan}.
964 \Subsection{Debugging Parsers}
966 Currently, only the most rudimentary debugging facilities have been
967 implemented for parser.
968 The user can define the macro \verb|DEBUG_|$C$, where $C$ is the
969 name of a syntax class before the \T{syntax} definition. This will activate
970 the parser debugging code. During parsing, the reductions that are taken
971 will be printed to \verb|cerr|.
973 \Section{Algebraic Datatypes and Pattern Matching} \label{sec:datatype}
975 \Prop{} implements algebraic datatypes and pattern matching in the
976 style of Standard ML\cite{SML}. Tree, DAG and even graph structures can be
977 specified as a set of datatype equations in algebraic datatype specifications.
978 \Prop{} then proceeds to translate these into concrete \Cpp{} classes.
979 This makes it very easy to implement complex data structures.
981 Algebraic datatypes can be manipulated and transformed
982 using \Prop{} pattern matching constructs, which decompose a datatype
983 value into its constituents. Complex patterns involving multiple
984 objects can be specified. These are translated into efficient
985 testing and selection code in \Cpp.
987 In the following we shall give a brief overview of the pattern matching
988 features of \Prop. For most users of modern declarative languages many
989 of these features are already familiar constructs.
991 \Subsection{A quick tour of pattern matching}
993 Algebraic datatypes are specified using \verb|datatype| definitions,
994 which define the inductive structure of one of more types using
995 a tree-grammar like syntax.
996 %In addition, pretty printers,
997 %lexical scanners, parsers, persistence I/O methods and
998 %garbage collection inferences can also be specified with
999 %additional options in the same construct.
1000 When a datatype is declared,
1001 the following operations are implicitly defined by the datatype compiler:
1002 (1) the constructors for all the variants of a type;
1003 (2) the identity test operator \verb|==|, and the assignment
1004 operator \verb|=| for this type; and
1005 (3) the member functions needed to decompose a datatype value during
1006 pattern matching.
1008 We'll select the internals of a compiler for a simplified imperative
1009 language as the running example in this section.
1010 Suppose that in this language an expression is composed of
1011 identifiers, integer constants and the four arithmetic operators.
1012 Then the structure of the abstract syntax tree can be specified as follows:
1014 \begin{verbatim}
1015 datatype Exp = INT (int)
1016 | ID (const char *)
1017 | ADD (Exp, Exp)
1018 | SUB (Exp, Exp)
1019 | MUL (Exp, Exp)
1020 | DIV (Exp, Exp)
1022 \end{verbatim}
1024 Abstract syntax of an expression such as {\em a * b - 17}
1025 can be constructed directly in a prefix syntax, directly mirroring
1026 that of the definition. The \Prop{} datatype
1027 compiler will automatically generate a \Cpp{} class hierarchy to represent
1028 the variants of type \verb|Exp|. Datatype constructor
1029 functions(not to be mistaken with \Cpp's class constructors) will
1030 also be automatically generated using the same names as the variants.
1032 \begin{verbatim}
1033 Exp formula = ADD(MUL(ID("a"),ID("b")),INT(17));
1034 \end{verbatim}
1036 Datatype values can be decomposed using the \verb|match| statement, which
1037 can be seen as a generalization of \C's \verb|switch| construct. Pattern
1038 matching is a combination of conditional branching and value
1039 binding. For example, a typical evaluation function for the type \verb|Exp|
1040 can be written as in the following example. Notice that each arm of
1041 a \verb|case| is in fact a pattern(with optional variables)
1042 mirroring the syntax of a datatype. The pattern variables(written
1043 with the prefix \verb|?| in the sequel) of a matching
1044 arm is {\em bound} to the value of the matching value, which can be
1045 subsequently referenced and modified in the action of an arm.
1047 \begin{verbatim}
1048 int eval (Exp e, const map<const char *, int>& env)
1049 { match (e)
1050 { case INT ?i: return ?i;
1051 case ID ?id: return env[?id];
1052 case ADD (?e1,?e2): return eval(?e1,env) + eval(?e2,env);
1053 case SUB (?e1,?e2): return eval(?e1,env) - eval(?e2,env);
1054 case MUL (?e1,?e2): return eval(?e1,env) * eval(?e2,env);
1055 case DIV (?e1,?e2): return eval(?e1,env) / eval(?e2,env);
1058 \end{verbatim}
1060 \Subsubsection{Pattern matching versus object-oriented style}
1062 Although a comparable evaluation function can be written
1063 in object oriented style using late binding, as in below, in general pattern
1064 matching is much more powerful than late binding in \Cpp, which only
1065 allows dispatching based on the type of one receiver.
1067 \begin{verbatim}
1068 // Class definitions
1069 class Exp {
1070 public:
1071 virtual int eval(const map<const char *, int>& env) const = 0;
1073 class INT : Exp {
1074 int i;
1075 public:
1076 int eval(const map<const char *, int>& env);
1078 class ID : Exp {
1079 const char * id
1080 public:
1081 int eval(const map<const char *, int>& env);
1085 // Member functions
1086 int INT::eval(const map<const char *, int>& env) const { return i; }
1087 int ID ::eval(const map<const char *, int>& env) const { return id; }
1088 int ADD::eval(const map<const char *, int>& env) const
1089 { return e1->eval(env) + e2->eval(env); }
1090 int SUB::eval(const map<const char *, int>& env) const
1091 { return e1->eval(env) - e2->eval(env); }
1092 int MUL::eval(const map<const char *, int>& env) const
1093 { return e1->eval(env) * e2->eval(env); }
1094 int DIV::eval(const map<const char *, int>& env) const
1095 { return e1->eval(env) / e2->eval(env); }
1096 \end{verbatim}
1098 For example, in the following function we use nested patterns,
1099 non-linear patterns (i.e. patterns with multiple occurrences of
1100 a pattern variable), and guards to perform algebraic simplification
1101 of an expression. Although the patterns are relatively simple in
1102 this example, in general arbitrarily complex patterns may be used.
1104 \begin{verbatim}
1105 Exp simplify (Exp redex)
1106 { // recursive traversal code omitted ...
1108 // match while repeats the matching process
1109 // until no more matches are found.
1110 match while (redex)
1111 { ADD(INT 0, ?x): { redex = ?x; }
1112 | ADD(INT ?x, INT ?y): { redex = INT(?x+?y); }
1113 | ADD(?x, INT 0) { redex = ?x; }
1114 | SUB(?x, INT 0): { redex = ?x; }
1115 | SUB(?x, ?x): { redex = INT(0); }
1116 | SUB(INT ?x, INT ?y): { redex = INT(?x-?y); }
1117 | MUL(INT 0, ?x): { redex = INT(0); }
1118 | MUL(?x, INT 0): { redex = INT(0); }
1119 | DIV(?x, ?x): { redex = INT(1); }
1120 // don't divide by zero.
1121 | DIV(INT ?x, INT ?y) | ?y != 0: { redex = INT(?x/?y); }
1122 | ...
1124 return redex;
1126 \end{verbatim}
1128 Pattern matching in \Prop{} is not restricted to one datatype at
1129 a time. In the following example, we use matching on multiple values to
1130 define equality on expressions inductively. For variety, we'll
1131 use the \verb|fun| variant of \verb|match|, which defines a function
1132 in rule form. Notice that the last case
1133 of the match set uses {\em wild cards} \verb|_| to catch all the other
1134 non-equal combinations. Since \Cpp{} does not provide multiple dispatching,
1135 implementing binary (or $n$-ary) matching operations on variant datatypes are
1136 in general cumbersome and verbose in object-oriented style. In contrast,
1137 using an applicative pattern matching style many manipulations and
1138 transformations on variant datatypes with tree-like or graph-like structure
1139 can be expressed succinctly.
1141 \begin{verbatim}
1142 fun equal INT ?i, INT ?j: bool: { return ?i == ?j; }
1143 | equal ID ?a, ID ?b: { return strcmp(a,b) == 0; }
1144 | equal ADD(?a,?b), ADD(?c,?d): { return equal(?a,?c) && equal(?b,?d); }
1145 | equal SUB(?a,?b), SUB(?c,?d): { return equal(?a,?c) && equal(?b,?d); }
1146 | equal MUL(?a,?b), MUL(?c,?d): { return equal(?a,?c) && equal(?b,?d); }
1147 | equal DIV(?a,?b), DIV(?c,?d): { return equal(?a,?c) && equal(?b,?d); }
1148 | equal _, _: { return false; }
1150 \end{verbatim}
1152 \Subsubsection{More examples} \label{sec:Wff}
1154 As another example, we can specify the term structure of
1155 {\em well-formed formulas} in proposition calculus as follows. Notice
1156 that the constructors \verb|F| and \verb|T| are nullary.
1158 \begin{verbatim}
1159 datatype Wff = F
1161 | Var (const char *)
1162 | And (Wff, Wff)
1163 | Or (Wff, Wff)
1164 | Not (Wff)
1165 | Implies (Wff, Wff)
1167 \end{verbatim}
1169 Datatypes that are parametrically polymorphic, such as lists and trees, can
1170 be defined by parameterizing them with respect to one of more types.
1171 For example, both lists and tree below are parametric on one type argument
1172 \verb|T|.
1174 \begin{verbatim}
1175 datatype List<T> = nil
1176 | cons(T, List<T>);
1177 datatype Tree<T> = empty
1178 | leaf(T)
1179 | node(Tree<T>, T, Tree<T>);
1181 List<int> primes = cons(2,cons(3,cons(5,cons(7,nil))));
1182 List<int> more_primes = cons(11,cons(13,primes));
1183 Tree<char *> names = node(leaf("Church"),"Godel",empty);
1184 \end{verbatim}
1186 As a programming convenience, \Prop{} has a set of built-in list-like
1187 syntactic forms. Unlike languages such as ML, however,
1188 these forms are not predefined to be any specific list types. Instead, it is
1189 possible for the user to use these forms on any datatypes with
1190 a natural binary {\em cons} operator and a nullary {\em nil} constructor.
1191 For instance, the previous list datatype can be redefined as follows:
1193 \begin{verbatim}
1194 datatype List<T> = #[] | #[ T ... List<T> ];
1195 List<int> primes = #[ 2, 3, 5, 7 ];
1196 List<int> more_primes = #[ 11, 13 ... primes ];
1197 List<char *> names = #[ "Church", "Godel", "Turing", "Curry" ];
1199 template <class T>
1200 List<T> append (List<T> a, List<T> b)
1201 { match (a)
1202 { case #[]: return b;
1203 case #[hd ... tl]: return #[hd ... append(tl,b)];
1206 \end{verbatim}
1207 The empty list is written as \verb|#[]|, while {\em cons(a,b)}
1208 is written as \verb|#[ a ... b ]|. An expression of the special form
1209 \verb|#[a, b, c]|, for instance, is simple syntactic sugar for
1210 repeated application of the {\em cons} operator, i.e.
1212 \begin{verbatim}
1213 #[a, b, c] == #[a ... #[ b ... #[ c ... #[] ] ] ].
1214 #[a, b, c ... d ] == #[a ... #[ b ... #[ c ... d ] ] ].
1215 \end{verbatim}
1217 List-like special forms are not limited to datatypes with only two variants.
1218 For example, we can define a datatype similar in structure to S-expressions
1219 in {\em Lisp} or {\em scheme}. Here's how such as datatype may be
1220 defined\footnote{for simplicity, we'll use a string representation for atoms
1221 instead of a more efficient method.}:
1223 \begin{verbatim}
1224 datatype Sexpr = INT (int)
1225 | REAL (double)
1226 | STRING (char *)
1227 | ATOM (const char *)
1228 | #()
1229 | #( Sexpr ... Sexpr )
1230 where type Atom = Sexpr // synonym for Sexpr
1232 \end{verbatim}
1234 With this datatype specification in place, we can construct values
1235 of type \verb|Sexpr| in a syntax close to that of {\em Lisp}. For example,
1236 we can define lambda expressions corresponding to the combinators $I$, $K$
1237 and $S$ as follows:
1239 \begin{verbatim}
1240 Atom LAMBDA = ATOM("LAMBDA");
1241 Atom f = ATOM("f");
1242 Atom x = ATOM("x");
1243 Atom y = ATOM("y");
1244 Atom NIL = #();
1246 Sexpr I = #(LAMBDA, #(x), x);
1247 Sepxr K = #(LAMBDA, #(x), #(LAMBDA, #(y), x));
1248 Sepxr S = #(LAMBDA, #(f),
1249 #(LAMBDA, #(x),
1250 #(LAMBDA, #(y), #(#(f,x), #(g,x)))));
1251 \end{verbatim}
1253 Similar to list-like forms, vector-like forms are also available.
1254 This addresses one of the flaws of the \Cpp{} language, which lacks
1255 first class array constructors.
1256 Vectors are simply homogeneous arrays whose sizes
1257 are fixed and are determined at creation time\footnote{For efficiency,
1258 bounds checking is {\em not} performed.}. Random access within
1259 vectors can be done in constant time. Unlike lists, however, the
1260 prepending operation is not supported. Vectors literals are delimited
1261 with the composite brackets \verb!(| ... |)!, \verb![| ... |]!, or
1262 \verb!{| ... |}!. In the following example the datatype \verb|Exp|
1263 uses vectors to represent the cooefficients of the polynomials:
1265 \begin{verbatim}
1266 datatype Vector<T> = (| T |);
1267 datatype Exp = Polynomial (Var, Vector<int>)
1268 | Functor (Exp, Vector<Exp>)
1269 | Atom (Var)
1270 | ...
1271 where type Var = const char *;
1272 Exp formula = Polynomial("X", (| 1, 2, 3 |));
1273 \end{verbatim}
1275 Commonly used patterns can be given synonyms so that they can be readily
1276 reused without undue repetition. This can be accomplished by defining pseudo
1277 datatype constructors to stand for common patterns using
1278 {\em datatype law} definitions. For example, the following set of
1279 laws define some commonly used special forms for a {\em Lisp}-like language
1280 using the previously defined \verb|Sexpr| datatype.
1282 \begin{verbatim}
1283 datatype law inline Lambda(x,e) = #(ATOM "LAMBDA", #(x), e)
1284 | inline Quote(x) = #(ATOM "QUOTE", x)
1285 | inline If(a,b,c) = #(ATOM "IF", a, b, c)
1286 | inline Nil = #()
1287 | inline ProgN(exprs) = #(ATOM "PROGN" ... exprs)
1288 | SpecialForm = #(ATOM ("LAMBDA" || "IF" ||
1289 "PROGN" || "QUOTE") ... _)
1290 | Call(f,args) = ! SpecialForm && #(f ... args)
1292 \end{verbatim}
1294 Note that the pattern \verb|SpecialForm| is meant to match all
1295 special forms in our toy language: i.e. {\em lambdas}, {\em ifs}, {\em progn}'s
1296 and {\em quotes}. The pattern disjunction connective \verb.||. is used
1297 to link these forms together. Since we'd like the \verb|Call| pattern
1298 to match only if the S-expression is not a special form, we use
1299 the pattern negation and conjunction operators, \verb|!| and \verb|&&|
1300 are used to screen out special forms.
1303 With these definitions in place,
1304 an interpreter for our language can be written thus:
1306 \begin{verbatim}
1307 Sexpr eval (Sexpr e)
1308 { match (e)
1309 { Call(?f,?args): { /* perform function call */ }
1310 | Lambda(?x,?e): { /* construct closure */ }
1311 | If(?e,?then,?else): { /* branching */ }
1312 | Quote(?x): { return ?x; }
1313 | ...: { /* others */ }
1316 \end{verbatim}
1319 As an interesting note, the special form pattern can also be rewritten
1320 using regular expression string matching, as in the following:
1322 \begin{verbatim}
1323 datatype law SpecialForm = #(ATOM /LAMBDA|IF|PROGN|QUOTE/ ... _)
1324 \end{verbatim}
1326 In addition, since the law constructors \verb|Lambda|, \verb|Quote|,
1327 \verb|If|, \verb|Nil| and \verb|ProgN| are defined with the \verb|inline|
1328 keyword, these constructors can be used to within expressions as abbreviates
1329 for their rhs definitions. For example, writing
1330 \verb|Lambda(x,x)| is the same writing \verb|#(ATOM("Lambda"),#(x),x)|.
1332 \Subsubsection{Variants of match}
1334 Aside from the usual plain pattern matching constructs, a few variants of the
1335 \verb|match| construct are offered. We'll briefly enumerate a few of these:
1336 \begin{itemize}
1337 \item The \verb|matchall| construct is a variant of \verb|match| that
1338 executes all matching rules(instead of just the first one) in sequence.
1339 \item Each rule of a \verb|match| statement can have associated cost
1340 expressions. Instead of selecting the first matching rule to
1341 execute as in the default, all matching rules are considered
1342 and the rule with the least cost is executed. Ties are broken by
1343 choosing the rule that comes first lexically. For example:
1345 \begin{verbatim}
1346 match (ir)
1347 { ADD(LOAD(?r0),?r1) \ e1: { ... }
1348 | ADD(?r0,LOAD(?r0)) \ e2: { ... }
1349 | ADD(?r0, ?r1) \ e3: { ... }
1350 | ...
1352 \end{verbatim}
1353 \item A \verb|match| construct can be modified with the \verb|while|
1354 modifier, as in the following example. A match modified thus is
1355 repeatedly matched until none of the patterns are applicable.
1356 For example, the following routine uses a \verb|match while| statement
1357 to traverse to the leftmost leaf.
1359 \begin{verbatim}
1360 template <class T>
1361 Tree<T> left_most_leaf(Tree<T> tree)
1362 { match while (tree)
1363 { case node(t,_,_): tree = t;
1365 return tree;
1367 \end{verbatim}
1368 \item Finally, the \verb|matchscan| variant of match can be used to
1369 perform string matching on a stream. For example, a simple lexical
1370 scanner can be written as follows.
1372 \begin{verbatim}
1373 int lexer (LexerBuffer& in)
1374 { matchscan while (in)
1375 { /if/: { return IF; }
1376 | /else/: { return ELSE; }
1377 | /while/: { return WHILE; }
1378 | /for/: { return FOR; }
1379 | /break/: { return BREAK; }
1380 | /[0-9]+/: { return INTEGER; }
1381 | /[a-zA-Z_][a-zA-Z_0-9]*/: { return IDENTIFIER; }
1382 | /[ \t]+/: { /* skip spaces */ }
1383 | ... // other rules
1386 \end{verbatim}
1387 \end{itemize}
1389 \Subsection{Algebraic Datatypes} \label{sec:algebraic-datatypes}
1390 Algebraic datatypes are defined using the \T{datatype}
1391 construct. Its syntax is as follows.
1393 \begin{syntax}
1394 \NDEF{Datatype\_Decl}
1395 & \IS & \TDEF{datatype} \\
1396 & & \quad \OPT{\ALT{\N{Datatype\_Spec}}{and}} \\
1397 & & \quad \OPT{\TDEF{where type} \ALT{\N{Type\_Spec}}{and}} \\
1398 & & \quad \OPT{\TDEF{law} \ALT{\N{Law\_Spec}}{and}} \\
1399 & & \T{;} \\
1400 \end{syntax}
1402 Each \T{datatype} declaration specifies a set of
1403 (potentially mutually recursive) types, a set of type abbreviations,
1404 and a set of pattern matching abbreviations called {\em laws}.
1406 \begin{syntax}
1407 \NDEF{Datatype\_Spec}
1408 & \IS & \Id \OPT{\T{<} \LIST{\N{Id}} \T{>}} \\
1409 & & \quad \OPT{\T{:} \N{Inherit\_List}} \\
1410 & & \quad \OPT{\T{::} \N{Datatype\_Qualifiers}} \\
1411 & & \quad \OPT{\T{=} \N{Cons\_Specs}} \\
1412 \NDEF{Cons\_Specs}
1413 & \IS & \ALT{\N{Cons\_Spec}}{|} \\
1414 & & \quad \OPT{\T{public:} \N{Datatype\_Body}} \\
1415 \NDEF{Cons\_Spec}
1416 & \IS & \N{Simple\_Cons\_Spec} \\
1417 & & \quad \OPT{\T{with} \T{\{} \N{Class\_Decl} \T{\}}} \\
1418 \NDEF{Simple\_Cons\_Spec}
1419 & \IS & \Id \OPT{\N{String}} & unit constructor \\
1420 & \OR & \N{String} & unit constructor \\
1421 & \OR & \Id \OPT{\T{of}} \TypeExp & constructor with arguments \\
1422 & \OR & \T{\#[} \T{]} & unit list constructor \\
1423 & \OR & \T{\#\{} \T{\}} & unit list constructor \\
1424 & \OR & \T{\#(} \T{)} & unit list constructor \\
1425 & \OR & \T{\#[} \TypeExp \T{...} \TypeExp \T{]}
1426 & list constructor \\
1427 & \OR & \T{\#\{} \TypeExp \T{...} \TypeExp \T{\}}
1428 & list constructor \\
1429 & \OR & \T{\#(} \TypeExp \T{...} \TypeExp \T{)}
1430 & list constructor \\
1431 & \OR & \T{[|} \TypeExp \T{|]} & vector constructor \\
1432 & \OR & \T{(|} \TypeExp \T{|)} & vector constructor \\
1433 & \OR & \T{\{|} \TypeExp \T{|\}} & vector constructor \\
1434 \end{syntax}
1437 Datatypes can be annotated with {\em qualifiers}. They tell the \Prop{}
1438 translator to generate additional code for each datatype to provide
1439 additional functionality.
1441 \begin{syntax}
1442 \NDEF{Datatype\_Qualifiers}
1443 & \IS & \MORE{\N{Datatype\_Qualifier}}{} \\
1444 \NDEF{Datatype\_Qualifier}
1445 & \IS & \TDEF{collectable} & garbage collectable \\
1446 & \OR & \TDEF{rewrite} & optimized for tree rewriting \\
1447 & \OR & \TDEF{persistent} & generate persistence interface \\
1448 & \OR & \TDEF{lexeme} & used for parser/lexer tokens \\
1449 & \OR & \TDEF{inline} & use inlined implementation \\
1450 & \OR & \TDEF{extern} & use non-inlined implementation \\
1451 \end{syntax}
1453 Type specifications assign abbreviations to commonly used type expressions.
1454 They act like \T{typedef}'s in \Cpp, except that \Prop{} can make use
1455 of the type information provided by these specifications.
1457 \begin{syntax}
1458 \NDEF{Type\_Spec}
1459 & \IS & \Id \T{=} \TypeExp \\
1460 \end{syntax}
1462 Law specifications are used to define abbreviations to datatype
1463 patterns\footnote{Patterns are discussed in
1464 section~\ref{sec:pattern-matching}.}.
1465 They behave like macros in some sense, except that unlike macros, they are
1466 properly type checked by \Prop. In addition, if the keyword \T{inline} is
1467 used, then \Prop{} will treat the lhs as an expression and generate
1468 a function of the same name that can be used to construct the datatype term.
1470 \begin{syntax}
1471 \NDEF{Law\_Spec}
1472 & \IS & \OPT{\T{inline}} \Id \OPT{\N{Law\_Arg}} \T{=} \Pat \\
1473 \NDEF{Law\_Arg}
1474 & \IS & \Id \\
1475 & \OR & \T{(} \LIST{\N{Id}} \T{)} \\
1476 \end{syntax}
1478 \Prop{} recognizes the following set of type expressions.
1479 \begin{syntax}
1480 \NDEF{Type\_Exp}
1481 & \IS & \N{Type\_Id} & type name \\
1482 & \OR & \TypeExp \T{*} & pointer type \\
1483 & \OR & \TypeExp \T{\&} & reference type \\
1484 & \OR & \N{Type\_Qualifier} \N{Type\_Id} & qualified type \\
1485 & \OR & \N{Datatype\_Qualifier} \TypeExp & annotation \\
1486 & \OR & \T{(} \TypeExp \T{)} & grouping \\
1487 & \OR & \T{(} \TypeExp \T{,}
1488 \LIST{\N{TypeExp}} \T{)} & tuple type \\
1489 & \OR & \T{(} \TypeExp \T{,}
1490 \LIST{\N{TypeExp}} \T{)} & tuple class \\
1491 & \OR & \T{\{} \LIST{\N{Lab\_Type\_Exp}} \T{\}} & record type \\
1492 & \OR & \TypeExp \T{=} \N{Exp} & default value \\
1493 \NDEF{Lab\_Type\_Exp} & \IS &
1494 \Id \T{:} \TypeExp \\
1495 \NDEF{Type\_Qualifier}
1496 & \IS & \T{class} & a class type \\
1497 & \OR & \T{const} & constant type \\
1498 & \OR & \T{rewrite} & rewritable \\
1499 & \OR & \T{collectable} & garbage collected \\
1500 & \OR & \T{persistent} & persistent object \\
1501 \end{syntax}
1503 In general, four different types of datatype constructors can be defined:
1504 \begin{description}
1505 \item[nullary constructors] These are constructors without an argument.
1506 \Prop{} maps these into small integers and no heap space will be consumed.
1507 \item[tuple argument constructors] These are constructors with one
1508 or more unamed arguments.
1509 \item[record argument constructors] These are constructors with one
1510 or more named arguments written within braces. For example, in the
1511 following datatype definition a record argument constructor called
1512 \verb|Add| is defined:
1513 \begin{verbatim}
1514 datatype Exp = Add { left : Exp, right : Exp }
1515 | ...
1516 \end{verbatim}
1517 To construct an \verb|Add| expression, we can use any one of the following
1518 forms:
1519 \begin{verbatim}
1520 Exp e1 = Add(x,y);
1521 Exp e2 = Add'{ left = x, right = y };
1522 Exp e3 = Add'{ right = y, left = x };
1523 \end{verbatim}
1524 Note that we are allowed to rearrange the labels in any order if
1525 the record expression form is used.
1526 \item[empty constructors] These are constructors written with
1527 an empty argument \verb|()|. For example, in the following definition
1528 an empty constructor called \verb|NONE| is defined:
1529 \begin{verbatim}
1530 datatype Exp : public AST
1531 = NONE ()
1532 | NONE2
1533 | ...
1534 \end{verbatim}
1535 Here, the constructor \verb|NONE| will inherit from the class \verb|AST|,
1536 while the nullay constructor \verb|NONE2| will not.
1537 \end{description}
1539 \Subsubsection{Instantiating a datatype}
1541 Each declared datatype should be {\em instantiated} in some implementation
1542 file using the \TDEF{instantiate datatype} statement. The translator
1543 will generate additional code to implement the datatype. The general
1544 syntax is:
1545 \begin{syntax}
1546 \NDEF{Instantiate\_Decl}
1547 & \IS &
1548 \TDEF{instantiate datatype} \LIST{\N{Type\_Spec}} \\
1549 & \OR &
1550 \TDEF{instantiate} \T{extern} \T{datatype} \LIST{\N{Type\_Spec}} \\
1551 \NDEF{Type\_Spec} & \IS & \Id \OPT{ \T{<} \LIST{\N{TypeExp}} \T{>}} \\
1552 \end{syntax}
1554 \begin{Tips}
1555 When a datatype is defined with the \TDEF{inline} qualifier,
1556 \Prop{} will try to generate inline methods whenever possible.
1557 On the other hand, if the \TDEF{extern} qualifier is used, \Prop{}
1558 will try to generate an non-inline implementation, and only one
1559 copy are generated at where the \T{instantiate datatype} statement occurs.
1561 Since most \Prop{} datatypes are light-weight objects, \T{inline} is taken
1562 as the default. The user can override this default using
1563 \OPTIONDEF{-save\_space} option of the translator
1564 (see page~\pageref{option:save-space}).
1565 \end{Tips}
1567 \begin{Tips}
1568 When the option \OPTIONDEF{-fno-implicit-templates} is used
1569 under \Gpp, the user should \verb|#define| the symbol
1570 \INDEX{{\tt PROP\_EXPLICIT\_TEMPLATE\_INSTANTIATION}} before
1571 an \T{instantiate datatype} statement.
1572 This will enable the code for explicit template instantiation.
1573 \end{Tips}
1575 \Subsection{Pattern Matching} \label{sec:pattern-matching}
1576 The basic pattern matching construct is the \T{match} statement.
1577 A match statement can be seen as a generalization of \Cpp's \T{switch}
1578 statement, in which the expression that follows a \T{case} can be
1579 a general complex pattern.
1581 Its basic syntax is as follows:
1582 \begin{syntax}
1583 \NDEF{Match} & \IS &
1584 \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and} \\
1585 & & \T{\{} \MORE{\T{case} \N{Match\_Rule}}{} \T{\}} & variant 1 \\
1586 & \OR &
1587 \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and} \\
1588 & & \T{\{} \ALT{\N{Match\_Rule}}{|} \T{\}} & variant 2 \\
1589 & \OR &
1590 \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and} \T{of} & variant 3 \\
1591 & & \quad \ALT{\N{Match\_Rule}}{|} \\
1592 & & \T{end} \T{match} \T{;} \\
1593 \NDEF{Match\_Mode} & \IS & \TDEF{match} \OPT{\T{while}} \\
1594 & \OR & \TDEF{matchall} \OPT{\T{while}} \\
1595 \NDEF{Match\_Rule} & \IS & \Pat \OPT{\N{Guard}} \OPT{\N{Cost}}
1596 \T{:} \N{Match\_Action} \\
1597 \NDEF{Guard} & \IS & \T{if} \N{Exp} \\
1598 & \OR & \T{where} \N{Exp} \\
1599 & \OR & \T{|} \N{Exp} \\
1600 \NDEF{Cost} & \IS & \verb|\| \Exp & cost expression \\
1601 \NDEF{Match\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
1602 & \OR & \N{Code} & for variant 1 only \\
1603 \end{syntax}
1605 The 3 syntactic variants of the match statement are equivalent in meaning.
1606 A \T{match} statement takes a list of match objects and a
1607 set of pattern matching rules. The first rule that matches the pattern
1608 completely will be executed. Notice that guards can be placed on each rule,
1609 and a rule is only applicable if the guards evaluate to true.
1611 \paragraph{Cost minimization}
1612 In addition, if cost expressions are specified, then the matching process
1613 will proceed as follows:
1614 \begin{enumerate}
1615 \item Locate all applicable rules. If no such rule exists then exit.
1616 \item Compute the cost expressions of all applicable rules.
1617 \item Choose the action that has the lowest cost to execute. If
1618 a tie occurs, resolve by the textual order of the rules.
1619 \end{enumerate}
1621 \paragraph{Match all rules}
1622 The \T{matchall} statement is a variant of \T{match}. In the matchall
1623 mode, the actions of {\em all} applicable rules are executed in the
1624 textual order. Caution: since the matching variable binding
1625 process occurs {\em before} the rules are executed, the rhs actions
1626 should not alter the matched value or the pattern variables may
1627 be bound to the wrong value.
1629 \paragraph{Repetition}
1630 In addition, both \T{match} and \T{matchall} can be annotated with the
1631 \TDEF{while} modifier. It this case the semantics of the matching
1632 construct is to repeat until no more rule is applicable. For example,
1633 the following code fragment returns the last element of a list
1634 \begin{verbatim}
1635 datatype List = #[] | #[ int ... List ];
1637 int last (List l)
1638 { match while (l)
1639 { case #[]: assert(0); // an error
1640 case #[i]: return i;
1641 case #[h ... t]: l = t;
1644 \end{verbatim}
1646 \paragraph{Pattern syntax}
1647 The basic form of a pattern is a literal, which
1648 that matches only a single integer, real number, string, etc. Complex
1649 patterns can be constructed inductively
1650 using datatype constructors, tuples, lists, and
1651 {\em logical pattern connectives}.
1653 The syntax of the patterns is as follows:
1654 \begin{syntax}
1655 \NDEF{Pat} &
1656 \IS & \N{Integer} & matches integer literal \\
1657 & \OR & \N{Real} & matches real literal \\
1658 & \OR & \N{Char} & matches character literal \\
1659 & \OR & \N{Quark} & matches quark literal \\
1660 & \OR & \N{String} & matches string literal \\
1661 & \OR & \N{Regexp} & matches any string that matches {\em regexp}\\
1662 & \OR & \N{Pat\_Var} & a pattern variable; matches anything \\
1663 & \OR & \T{\_} & wild card; matches anything \\
1664 & \OR & \N{Cons} & matches a datatype constructor \\
1665 & \OR & \N{Cons} \N{PatArg}& a constructor application pattern \\
1666 & \OR & \Id \T{as} \Pat & binds a variable to the sub-pattern \\
1667 & \OR & \Pat \T{:} \TypeExp & typed pattern \\
1668 & \OR & \Pat \T{||} \Pat & matches either pattern \\
1669 & \OR & \Pat \T{\&\&} \Pat & matches both patterns \\
1670 & \OR & \T{!} \Pat & matches anything but the pattern \\
1671 & \OR & \T{(} \Pat \T{)} & Grouping \\
1672 & \OR & \T{\#[} \LIST{\N{Pat}} \T{]} & list pattern; exact length match \\
1673 & \OR & \T{\#[} \LIST{\N{Pat}} \T{...} \T{]} & list pattern;
1674 non-exact length match \\
1675 & \OR & \T{[|} \LIST{\N{Pat}} \T{|]} & vector pattern; exact length match \\
1676 & \OR & \T{[|} \LIST{\N{Pat}} \T{...} \T{|]}
1677 & vector pattern; matches to the left \\
1678 & \OR & \T{[|} \T{...} \LIST{\N{Pat}} \T{|]}
1679 & vector pattern; matches to the right \\
1680 \NDEF{PatArg} & \IS & \Pat \\
1681 & \OR & \T{(} \LIST{\N{Pat}} \T{)} \\
1682 & \OR & \T{\{} \LIST{\N{Lab\_Pat}} \OPT{\T{...}} \T{\}} \\
1683 \NDEF{Lab\_Pat} & \IS & \Id \T{=} \Pat \\
1684 \NDEF{Pat\_Var} & \IS & \Id \\
1685 & \OR & \T{?}\Id \\
1686 \end{syntax}
1688 \Subsection{Refining a datatype}
1690 A datatype definition can be refined in a few ways:
1692 \begin{enumerate}
1693 \item{\bf inheritance} A datatype can be declared to be inherited from
1694 one or more classes $C_1, C_2, \ldots, C_n$.
1695 The effect is that all variants of a datatype will be inherited from the
1696 classes $C_1, C_2, \ldots C_2$.
1698 \item{\bf member functions} Member functions and addition attributes
1699 can be attached to the datatype variants using the keyword \TDEF{with}.
1700 \end{enumerate}
1702 For example, the following datatype \T{Exp} is inherited from some user
1703 defined classes \verb|AST| and \verb|Object|. Furthermore, a virtual
1704 member function called print is defined.
1706 \begin{verbatim}
1707 datatype Exp : public AST, virtual public Object
1708 = INT int
1709 with { ostream& print(ostream&); }
1710 | PLUS Exp, Exp
1711 with { ostream& print(ostream&); }
1712 | MINUS Exp, Exp
1713 with { ostream& print(ostream&); }
1714 | MULT Exp, Exp
1715 with { ostream& print(ostream&); }
1716 | DIV Exp, Exp
1717 with { ostream& print(ostream&); }
1718 | VAR { name : const char *, typ : Type }
1719 with { ostream& print(ostream&); }
1720 public:
1721 { virtual ostream& print (ostream&) = 0;
1722 void * operator new (size_t);
1725 \end{verbatim}
1727 The allocator and printing methods can be defined as follows:
1728 \begin{verbatim}
1729 void * classof Exp::operator new(size_t) { ... }
1730 ostream& classof INT::print(ostream& s) { return s << INT; }
1731 ostream& classof PLUS::print(ostream& s)
1732 { return s << '(' << this->#1 << '+' << this->#2 <<')'; }
1733 ostream& classof MINUS::print(ostream& s)
1734 { return s << '(' << this->#1 << '-' << this->#2 <<')'; }
1735 ostream& classof MULT::print(ostream& s)
1736 { return s << '(' << this->#1 << '*' << this->#2 <<')'; }
1737 ostream& classof DIV::print(ostream& s)
1738 { return s << '(' << this->#1 << '/' << this->#2 <<')'; }
1739 ostream& classof VAR::print(ostream& s) { return s << name; }
1740 \end{verbatim}
1742 The special type form \TDEF{classof} {\em con} is used to reference
1743 the type of a constructor. Arguments of a constructor uses the following
1744 naming convention: (i) if the constructor $C$ takes only one argument,
1745 then the name of the argument is $C$; (ii) if $C$ takes two or more
1746 unnamed arguments, then these are named \verb|#1|, \verb|#2|, etc;
1747 (iii) finally, labeled arguments are given the same name as the label.
1749 For readability reasons, it is often useful to separate a datatype definition
1750 from the member functions and inheritance definitions. The \TDEF{refine}
1751 declaration can be used in this situation. For example, the above
1752 example can be restated more succinctly as follows:
1754 \begin{verbatim}
1756 // Datatype definition section
1758 datatype Exp = INT int
1759 | PLUS Exp, Exp
1760 | MINUS Exp, Exp
1761 | MULT Exp, Exp
1762 | DIV Exp, Exp
1763 | VAR { name : const char *, typ : Type }
1767 // Refinement section
1769 refine Exp : public AST, virtual public Object
1770 { virtual ostream& print (ostream&) = 0;
1771 void * operator new (size_t);
1773 and INT, PLUS, MINUS, MULT, DIV, VAR
1774 { ostream& print(ostream&);
1777 \end{verbatim}
1779 The general syntax of the \TDEF{refine} declaration is as follows:
1780 \begin{syntax}
1781 \NDEF{Refine\_Decl}
1782 & \IS & \T{refine} \ALT{\N{Refine\_Spec}}{\T{and}} \T{;} \\
1783 \NDEF{Refine\_Spec}
1784 & \IS & \LIST{\N{Id}} \\
1785 & & \quad \OPT{\T{:} \N{Inherit\_List}} \\
1786 & & \quad \OPT{\T{::} \N{Datatype\_Qualifiers}} \\
1787 & & \quad \OPT{\T{\{} \N{Datatype\_Body} \T{\}}} \\
1788 \end{syntax}
1789 Here, \N{Id} refers to either a constructor name or a datatype name.
1791 \Subsection{Memory management}
1793 By default, a datatype constructor calls the \verb|operator new| to
1794 allocated memory. There are a few ways to change this behavior: (i)
1795 redefine the \verb|operator new| within the datatype using refinement; or
1796 (ii) inherit from a class that redefines this method.
1798 If a {\em placement} \verb|operator new| is defined in datatype,
1799 the placement constructor syntax can be used to invoke this
1800 operator:
1801 \begin{syntax}
1802 \NDEF{Placement\_Cons}
1803 & \IS & \N{Cons} \T{'(} \LIST{\N{Exp}} \T{)} \T{(}
1804 \OPT{\LIST{\N{Exp}}} \T{)} \\
1805 & \OR & \N{Cons} \T{'(} \LIST{\N{Exp}} \T{)} \T{\{}
1806 \OPT{\LIST{{\N{Id} = \N{Exp}}}}
1807 \T{\}} \\
1808 \end{syntax}
1810 For example, in the following the extra parameter \verb|mem| will
1811 be passed to placement \verb|new| operator.
1812 \begin{verbatim}
1813 Exp e1 = INT'(mem)(1);
1814 Exp e2 = VAR'(mem){ name = "foo", typ = t };
1815 \end{verbatim}
1817 \Subsubsection{Garbage collection}
1819 The support library contains an implementation\footnote{The implementation
1820 has only been tested on Linux, SunOS and Solaris. Please contact the
1821 author for details
1822 if you'd like to port the collector to your particular platform.} of a
1823 conservative garbage collector using two algorithms: one using
1824 a mostly copying scheme\cite{Mostly-copying,Gen-mostly-copying}
1825 and one using mark-sweep.
1827 A datatype can be defined to be garbage collectable with the
1828 qualifier \TDEF{collectable}. In the following example, the datatype
1829 \T{Exp} is will be allocated from the a garbage collected heap instead
1830 of the default heap. Since it references another
1831 user defined garbage collectable object
1832 \verb|SymbolTable|, the pointer to that symbol table is also marked
1833 as \T{collectable}.
1834 \begin{verbatim}
1835 // SymbolTable is collectable
1836 class SymbolTable: public GCObject
1838 public:
1839 class Id : public GCObject { ... };
1843 datatype Exp :: collectable
1844 = INT int
1845 | VAR (collectable SymbolTable::Id, collectable SymbolTable *)
1846 | ADD Exp, Exp
1847 | SUB Exp, Exp
1848 | MUL Exp, Exp
1849 | DIV Exp, Exp
1851 \end{verbatim}
1853 The corresponding \TDEF{instantiate datatype} statement for \T{Exp}
1854 will generate the appropriate tracing methods for garbage collection.
1856 Please see appendix~\ref{appendix:gc} for more details concerning
1857 garbage collection.
1859 \Subsubsection{Persistence}
1861 Datatypes can be interfaced with the persistent object library
1862 supplied with the translator using the \TDEF{persistent} qualifier.
1863 A persistent object defines the protocol for serializing its representation
1864 into an architecture independent format on a persistence stream.
1866 For example, the following datatype declaration defines a persistent
1867 type \verb|Exp|. Note that a pointer to the symbol table
1868 is annotated with the \T{persisent} qualifier to signify that the object
1869 associated with the pointer should be traversed during the serialization
1870 phase.
1871 \begin{verbatim}
1872 // SymbolTable is persistent
1873 class SymbolTable: public PObject
1875 public:
1876 class Id : public PObject { ... };
1880 datatype Exp :: persistent
1881 = INT int
1882 | VAR (persistent SymbolTable::Id, persistent SymbolTable *)
1883 | ADD Exp, Exp
1884 | SUB Exp, Exp
1885 | MUL Exp, Exp
1886 | DIV Exp, Exp
1888 \end{verbatim}
1890 A {persistent object id} {\em must} be provided with each distinct
1891 datatype. Object ids are used to identify the classes
1892 of persistent objects. Thus they must be unique within an application.
1893 Each persistent type tag is simply a string, and can be defined using
1894 the \TDEF{refine persistent} statement.
1896 For example, the following statement declares the persistent object
1897 id for type \verb|Exp| above to be \verb|"Simple expressions"|.
1898 \begin{verbatim}
1899 refine persistent Exp => "Simple expressions";
1900 \end{verbatim}
1902 The corresponding \TDEF{instantiate datatype} statement will generate
1903 the appropriate methods to communicate with a persistent stream.
1904 \begin{verbatim}
1905 instantiate datatype Exp;
1906 \end{verbatim}
1908 To write an object $e$ to the datafile \verb|"foo"|, we can say:
1909 \begin{verbatim}
1910 #include <AD/persist/postream.h>
1911 ofstream out("foo");
1912 Postream pout(out);
1913 pout << e;
1914 out.close();
1915 \end{verbatim}
1917 To read the object back from the file, we can say:
1918 \begin{verbatim}
1919 #include <AD/persist/pistream.h>
1920 EXP e;
1921 ifstream in("foo");
1922 Pistream pin(in);
1923 e = (Exp)read_object(pin);
1924 in.close();
1925 \end{verbatim}
1927 For more details concerning persistent streams and persist objects,
1928 please see directory \verb|<AD/persist>| for details.
1930 \Section{Inference} \label{sec:inference}
1932 Semantic processing
1933 in compilers and other language processors, such as data flow analysis,
1934 can frequently be specified as in a rule-based,
1935 logical deductive style. In \Prop, deductive inference using forward
1936 chaining\footnote{
1937 The current
1938 implementation of {\sf Prop} translates inference into very naive code.
1939 This feature will be phased out and replaced by
1940 the more general and much faster graph rewriting mechanism.
1942 is provided as a built-in mechanism, orthogonal
1943 to pattern matching and rewriting, for manipulating user-defined
1944 algebraic datatypes.
1946 Similar to syntax classes, {\bf inference classes} may be used
1947 for data encapsulation. An inference class is a combination of a \Cpp{}
1948 class, a database of inference relations, and a collection of inference rules
1949 of the form {\em lhs \verb|->| rhs}. The lhs of an inference rule is a set
1950 of patterns in conjunctive form. During the inference process, a rule is
1951 fired when its lhs condition is satisfied. A fired rule then executes
1952 the corresponding rhs action, which may assert or retract additional relations
1953 from the database. Using multiple inheritance, it is possible to combine
1954 a rewriting class with an inference class such that the rewriting process
1955 generates new relations to drive the inference process, or vice versa.
1957 \Subsection{An Example}
1958 Datatype relations are not a distinct kind of data structure but are in fact
1959 simply algebraic datatypes declared to have a \TDEF{relation}
1960 attribute. For example, in
1961 the following definition three relation types \verb|Person|, \verb|Parent|
1962 and \verb|Gen| are defined.
1964 \begin{verbatim}
1965 datatype Person :: relation = person (const char *)
1966 and Parent :: relation = parent (const char *, const char *)
1967 and Gen :: relation = same_generation (const char *, const char *);
1969 instantiate datatype Person, Parent, Gen;
1970 \end{verbatim}
1972 Using these relations we can define an inference class that computes whether
1973 two persons are in the same generation. Nine axioms are defined (i.e.
1974 those whose lhs are empty) in the following. The two inference rules
1975 state that (1) a person is in the same generation as him/herself,
1976 and (2) two persons are in the same generation if
1977 their parents are in the same generation.
1979 \begin{verbatim}
1980 inference class SameGeneration {};
1982 inference SameGeneration
1983 { -> person("p1") and person("p2") and
1984 person("p3") and person("p4") and
1985 person("p5");
1987 -> parent("p1", "p2") and
1988 parent("p1", "p3") and
1989 parent("p2", "p4") and
1990 parent("p3", "P5");
1992 person ?p -> same_generation (?p, ?p);
1994 parent (?x, ?y) and parent (?z, ?w) and same_generation (?x, ?z)
1995 -> same_generation(?y, ?w);
1997 \end{verbatim}
1999 In general, datatypes qualified as \verb|relation|s will inherit
2000 from the base class \verb|Fact|, while a rewrite class definition
2001 implicitly defines two member functions used to assert and retract facts
2002 in the internal database. For example, in the above example, the following
2003 protocol will be automatically generated by the inference compiler.
2005 \begin{verbatim}
2006 class SameGeneration : ...
2008 public:
2009 virtual Rete& infer (); // start the inference process
2010 virtual ReteNet& operator += (Fact *); // assert fact
2011 virtual ReteNet& operator -= (Fact *); // retract fact
2013 \end{verbatim}
2017 Using these methods, an application can insert or remove relations
2018 from an inference class. This will in turn trigger any attached inference
2019 rules of the class.
2021 \Subsubsection{Another example}
2023 Consider the following example, which is used to compute Pythagorean
2024 triangles. Only one axiom and two rules are used. The axiom and the
2025 first rule are used to assert the relations \verb|num(1)| to \verb|num(n)|
2026 into the database, where \verb|n| is limited by the term \verb|limit(n)|.
2027 The second inference rule is responsible for printing out only
2028 the appropriate combinations of numbers.
2030 \begin{verbatim}
2031 datatype Number :: relation = num int | limit int;
2033 inference class Triangle {};
2035 inference Triangle
2036 { -> num 1;
2038 num m
2039 and limit n | n > m
2040 -> num (m+1);
2042 num a
2043 and num b
2044 and num c | a < b && b < c && a*a + b*b == c*c
2045 -> { cout << a << " * " << a << " + "
2046 << b << " * " << b << " = "
2047 << c << " * " << c << "\n";
2050 \end{verbatim}
2052 Now, to print all the triangle identities lying in range of $1$ to $100$,
2053 we only have to create an instance of the inference class, insert the
2054 limit, and start the inference process, as in below:
2056 \begin{verbatim}
2057 Triangle triangle;
2058 triangle += limit(100);
2059 triangle.infer();
2060 \end{verbatim}
2062 \Subsection{Inference Class}
2063 \INCOMPLETE
2065 \Subsection{Inference Rules}
2066 \INCOMPLETE
2068 \Section{Tree Rewriting} \label{sec:tree-rewriting}
2070 \Prop's tree rewriting mechanism let us transform a tree in
2071 algebraic datatype form into another tree according to a set of
2072 {\em rewriting rules}. Unlike plain pattern matching described
2073 in section~\ref{sec:pattern-matching}, which only apply to the
2074 root of a tree, rewriting rules are applicable to all parts of
2075 the tree. This allows the user to concentrate on developing the
2076 set of transformations; the actual traversal of the tree object
2077 is handled implicitly by \Prop. When used properly, this
2078 mechanism has an advantage over plain pattern matching since
2079 rewriting rules remain valid even when a new variant to a datatype
2080 is introduced.
2082 In \Prop, a rewriting system is developed in a similar manner as a parser.
2083 The first phase requires the user to defines a {\em rewrite class}
2084 to encapsulate the rewriting rules.
2086 Frequently, the rewriting mechanism is used to collect information about
2087 a tree structure; furthermore, more transformation rules are {\em conditional}:
2088 i.e. we want them to be applicable only if certain conditions are satisfied.
2089 We can accomplish both of these tasks by defining data members and methods
2090 in the rewriting class. These are accessible during the rewriting
2091 process. Information collected during rewriting can be stored within
2092 the data members.
2094 In the rewriting formalism, equational
2095 rules of the form {\em lhs} \verb|->| {\em rhs} are specified by the user.
2096 During processing, each instance of the lhs in a complex tree is replaced
2097 by an instance of the rhs, until no such replacement is possible.
2098 Equational rules can often be used to specify semantics based simplification
2099 (e.g. constant folding and simplification based on simple algebraic
2100 identities) or transformation(e.g. code selection in a compiler
2101 backend\cite{codegen-with-trees}).
2103 Unlike plain pattern matching, however, the structural traversal process
2104 in rewriting is implicitly inferred from the type structure of an
2105 algebraic datatype, as specified in its definition.
2107 There are two main forms of rewriting modes available:
2108 \begin{itemize}
2109 \item The first is {\bf normalization} mode: a given tree is reduced
2110 using the matching rules until no more redexes are available. There
2111 are two modes of operations available:
2112 \begin{itemize}
2113 \item in {\bf replacement} mode, the redex of a tree will be
2114 physically overwritten.
2115 \item in {\bf applicative} mode, on the other hand, a new
2116 tree corresponding to the replacement value will be constructed.
2117 \end{itemize}
2118 Replacement mode is used as the default since it is usually
2119 the more efficient of the two.
2120 \item The second form is {\bf reduction} and {\bf transformation}.
2121 In this mode a tree parse of the input term is computed. If cost functions
2122 are attached to the rules, then they will also be used to determine a
2123 minimal cost reduction sequence. During this process attached actions of
2124 a rule may be invoked to synthesize new data.
2125 \end{itemize}
2127 Each independent set of rewriting rules in \Prop{} is encapsulated
2128 in its own {\bf rewrite class}. A rewrite class is basically a normal \Cpp{}
2129 class with a set of rewriting rules attached. During rewriting, the
2130 data members and the member functions are visible according to the normal
2131 \Cpp{} scoping rules. This makes it is easy to encapsulate additional data
2132 computed as a side effect during the rewriting process.
2134 \Subsection{A rewriting example}
2136 Consider an abbreviated simplifier for the well-formed formula datatype
2137 \verb|Wff| defined in the section~\ref{sec:Wff}. The rewrite class for this
2138 can be defined as follows. Since there is no encapsulated data in this
2139 example, only the default constructor for the class needs to be defined.
2140 A rewrite class definition requires the traversal list to be
2141 specified. This is simply a list of datatypes
2142 involved in the rewriting traversal process.
2143 In this instance only \verb|Wff| is needed.
2145 \begin{verbatim}
2146 rewrite class Simplify (Wff)
2148 public:
2149 Simplify() {}
2151 \end{verbatim}
2153 The rewrite rules for the simplifier can then be specified succinctly as
2154 follows. Like the \verb|match| statement, in general the rhs
2155 of a rewrite rule can be any statement. A special statement
2156 \verb|rewrite(e)| can be used to rewrite the current redex into another form.
2157 If the rhs is of the form \verb|rewrite(e)|, then it can be abbreviated
2158 to \verb|e|, as in below:
2159 \begin{verbatim}
2160 rewrite Simplify
2161 { And(F, _): F
2162 | And(_, F): F
2163 | And(T, ?X): ?X
2164 | And(?X, T): ?X
2165 | Or (T, _): T
2166 | Or (_, T): T
2167 | Or (F, ?X): ?X
2168 | Or (?X, F): ?X
2169 | Not(Not(?X)): ?X
2170 | Not(And(?X,?Y)): Or(Not(?X), Not(?Y))
2171 | Not(Or(?X,?Y)): And(Not(?X), Not(?Y))
2172 | Implies(?X,?Y): Or(Not(?X), ?Y)
2173 | And (?X, ?X): ?X
2174 | Or (?X, ?X): ?X
2175 // etc ...
2177 \end{verbatim}
2179 The rewrite class definition creates a new class of the same name. This new
2180 class defines an implicit \verb|operator ()| with the protocol below. This
2181 member function can be invoked to perform the rewriting in a functional
2182 syntax.
2184 \begin{verbatim}
2185 class Simplify : ... {
2186 { ...
2187 public:
2188 void operator () (Wff);
2189 // Wff operator () (Wff); // if rewrite class is applicative
2192 Wff wff = ...;
2193 Simplify simplify; // create a new instance of the rewrite class
2194 simplify(wff); // rewrite the term wff
2195 \end{verbatim}
2198 \Subsubsection{Conditional rewriting and actions}
2200 Rewriting rules may be guarded with predicates to limit their
2201 applicability. In addition, the {\em rhs} of a rewrite rule is not limited
2202 to only a replacement expression: in general, any arbitrarily complex sequence
2203 of code may be used. For example, in the following set of rewriting
2204 rules we use guards to prevent undesirable replacements to be made during
2205 expression constant folding:
2207 \begin{verbatim}
2208 rewrite ConstantFolding
2209 { ADD (INT a, INT b): INT(a+b)
2210 | SUB (INT a, INT b): INT(a-b)
2211 | MUL (INT a, INT b):
2212 { int c = a * b; // silent overflow
2213 if (a == 0 || b == 0 || c / b == a) // no overflow?
2214 { rewrite(INT(c)); }
2215 else
2216 { cerr << "Overflow in multiply\n"; }
2218 | DIV (INT a, INT b) | b == 0: { cerr << "Division by zero\n"; }
2219 | DIV (INT a, INT b): INT(a/b)
2220 | // etc...
2222 \end{verbatim}
2224 \Subsection{Rewrite class} \label{sec:rewrite-class}
2226 The syntax of a rewrite class declaration is as follows:
2228 \begin{syntax}
2229 \NDEF{Rewrite\_Class\_Decl} & \IS & \TDEF{rewrite class} \Id
2230 \T{(} \LIST{\N{TypeExp}} \T{)} \\
2231 & & \quad \OPT{\T{:} \N{Inherit\_List}}
2232 \OPT{\T{::} \MORE{\N{Rewrite\_Mode}}{}} \\
2233 & & \quad \T{\{} \N{Class\_Body} \T{\}} \T{;} \\
2234 \NDEF{Rewrite\_Mode} & \IS & \TDEF{treeparser} \\
2235 & \OR & \TDEF{applicative} \\
2236 & \OR & \TDEF{topdown} \\
2237 \end{syntax}
2239 This is simply the usual \Cpp{} class declaration syntax
2240 extended to the following information:
2241 \begin{description}
2242 \item[a traversal list] enclosed in parenthesis. The traversal
2243 list of a rewrite class defines the set of types that rewriting
2244 should traverse. If a datatype object contains an argument of a type
2245 not listed in the traversal list, its value will not be altered.
2246 \item[rewrite mode] this defines the rewriting mode of the
2247 rewrite class.
2248 \end{description}
2250 \Subsection{Rewriting rules}
2252 Rewriting rules are specified in a rewrite declaration.
2253 The syntax of a rewriting specification is as follows:
2255 \begin{syntax}
2256 \NDEF{Rewrite\_Decl}
2257 & \IS & \TDEF{rewrite} \Id & variant 1 \\
2258 & & \T{\{} \OPT{\N{Index\_Decl}}
2259 \ALT{\T{case} \N{Rewrite\_Rules}}{} \T{\}} \\
2260 & \OR & \T{rewrite} \Id & variant 2 \\
2261 & & \T{\{} \OPT{\N{Index\_Decl}} \ALT{\N{Rewrite\_Rules}}{|} \T{\}} \\
2262 \NDEF{Rewrite\_Rule}
2263 & \IS & \OPT{\N{Rewrite\_Modifier}} \OPT{\N{Id} \T{->}} \\
2264 & & \quad \Pat \OPT{\N{Guard}} \OPT{\N{Cost}} \T{:} \N{Rewrite\_Action} \\
2265 \NDEF{Rewrite\_Modifier}
2266 & \IS & \T{bottomup:} & bottom up mode \\
2267 & \OR & \T{topdown:} & top down mode \\
2268 & \OR & \T{before:} & before actions \\
2269 & \OR & \T{preorder:} & preorder actions \\
2270 & \OR & \T{postorder:} & postorder actions \\
2271 \NDEF{Rewrite\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
2272 & \OR & \Exp \\
2273 \end{syntax}
2275 The name of a rewrite declaration should match the name of
2276 a rewrite class.
2277 The two syntactic forms of \T{rewrite} have equivalent semantics.
2279 The special statement \T{rewrite}($e$) may be used
2280 inside the rhs action to rewrite the current redex into $e$.
2281 Note that \T{rewrite}($e$) is a branching statement; statements
2282 after \T{rewrite} are not reachable.
2284 \Subsection{Rewriting modifiers}
2285 In version 2.3.3 onward, rewrite rules can be modified by the
2286 keywords: \T{bottomup:}, \T{topdown:}, \T{before:}, \T{preorder:}
2287 and \T{postorder:}. These modifiers
2288 alter the order in which a set of rewriting rules is applied.
2290 These modifiers act like delimiters, similar
2291 to the way scoping keywords like \T{public:}, \T{protected:} and
2292 \T{private:} are used to delimit declarations in C++.
2293 For instance, a set of rules prefixed by the modifier \T{topdown:}
2294 will utilize the topdown strategy for reduction. If no modifier are
2295 specified, then bottom-up will be the default.
2297 Note that all five modes of rules can be used together in a rewriting system
2298 under tree rewriting mode (see \ref{sec:rewriting-modes})
2299 and their executions are intermixed together.
2301 The semantics of these modifiers are as follows:
2302 \begin{description}
2303 \item[\TDEF{bottomup:}] This specifies the modified rewriting rules should
2304 be applied in the default bottom-up mode. In this mode, the
2305 innermost/leftmost redex is chosen as the next redex. So reduction
2306 occurs in a bottomup manner. This means that if a term $t$ is a redex
2307 and if the term $t$ contains a subterm $t'$ which is also a redex,
2308 $t'$ will be reduced before $t$.
2309 \item[\TDEF{topdown:}] This specifies the modified rewriting rules should be
2310 applied in a topdown mode. In this mode, the rewriter
2311 will first try to locate the outermost/leftmost redex. Reduction
2312 will occur in a (mostly) topdown manner.
2313 \item[\TDEF{before:}] This specifies that the modified rewriting rules
2314 should be tried before the topdown phase.
2315 In addition, unlike topdown mode, if state-caching if used (see
2316 \ref{sec:state-caching}), the rules are only tried once.
2317 Otherwise, these act just like the preorder modifier, described below.
2318 \item[\TDEF{preorder:}] This specifies that
2319 the modified rewriting rules should be
2320 tried only at the preorder traversal of a term.
2321 \item[\TDEF{postorder:}] This specifies the rewriting rules should be
2322 tried only at the postorder traversal of a term.
2323 \end{description}
2325 \Subsubsection{Rewriting modifier example} \label{sec:rewriting-modifier}
2327 We'll use the following example, extracted from a query optimizer for
2328 a subset of the relational calculus, to demonstrate the use
2329 of the rewriting modifiers.
2331 The query optimizer transforms the following abstract syntax, defined
2332 as a set of \Prop's datatypes.
2333 \begin{verbatim}
2334 datatype List<T> = #[] | #[ T ... List<T> ];
2335 datatype Literal = INT int
2336 | STRING const char *
2337 | BOOL Bool
2339 and Exp = OP Id, Exps
2340 | APP Id, Exps
2341 | LIT Literal
2342 | ID Id
2343 | TUPLE Exps // [ E1, E2, ... En ]
2344 | FORALL Id, Exp, Exp // forall x in E1 . E2
2345 | EXISTS Id, Exp, Exp // exists x in E1 . E2
2346 | GUARD (Exp, Exp) //
2347 | GENERATOR (Ids, Exps, Exp) // [x_1,...,x_n] in X_1*...*X_n
2348 | LET (Id, Exp, Exp)
2350 law inline Int i = LIT(INT i)
2351 | inline Boolean b = LIT(BOOL b)
2352 | True = Boolean true
2353 | False = Boolean false
2354 | inline And a,b = OP("and",#[a,b])
2355 | inline Or a,b = OP("or", #[a,b])
2356 | inline Not a = OP("not",#[a])
2357 | inline Eq a,b = OP("=", #[a,b])
2358 | inline Ne a,b = OP("/=", #[a,b])
2359 | inline Gt a,b = OP(">", #[a,b])
2360 | inline Ge a,b = OP(">=", #[a,b])
2361 | inline Lt a,b = OP("<", #[a,b])
2362 | inline Le a,b = OP("<=", #[a,b])
2363 | inline In a,b = OP("in", #[a,b])
2364 | inline Union a,b = OP("union", #[a,b])
2365 | inline Count a = OP("#", #[a])
2367 where type Id = const char *
2368 and Ids = List<Id>
2369 and Exps = List<Exp>
2371 \end{verbatim}
2373 Note that existential and
2374 universal quantifiers are represented by the constructors \T{FORALL}
2375 and \T{EXISTS} respectively. For example, $\exists x \in A.p$ is
2376 presented as the term $\T{EXISTS}(x,A,p)$. Here $x$ is a {\em binding}
2377 occurrence for $x$; note that the variable $x$ may occur within
2378 the predicate $p$. In addition, a list comprehension
2379 expression such as
2380 \[ \{ e : [x_1,\ldots,x_n] \in X_1
2381 \times \ldots \times X_n\ |\ p \} \]
2382 is represented as the compound term
2384 \T{GENERATOR}(\#[x_1,\ldots,x_n],\#[X_1,\ldots,X_n],\T{GUARD}(p,e))
2387 Suppose we'll like to rename all variables in a query $Q$
2388 so that no two binding occurrences are given the same name.
2389 This can be easily accomplished using a combination
2390 of \T{preorder:} and \T{postorder:} rules in the following manner:
2391 \begin{enumerate}
2392 \item[(i)] Use preorder rules to check for new variable bindings,
2393 for example in $\T{EXISTS}(x,A,p)$.
2394 For each new binding occurrence for $x$, create
2395 a new name for $x$.
2396 These will be performed before the subterms $A$ and $p$ are traversed.
2397 \item[(ii)] Whenever a variable $x$ is found during the sub-traversal
2398 of $A$ and $p$, rename the variable by looking up its new name.
2399 \item[(iii)] Finally, use postorder rules to remove the current
2400 variable substitution whenever we're exiting a binding occurrence.
2401 \end{enumerate}
2403 For example, the following set of rewriting rules accomplish this renaming
2404 task for our query language using exactly this method.
2405 \begin{verbatim}
2406 rewrite RemoveDuplicateNames
2408 // We use before and after rules to remove duplicates from variable names.
2409 // As each new binding occurrence is found, we enter a new binding
2410 // into the current environment. Identifiers found within
2411 // the binding occurrence are then renamed.
2413 preorder: // insert new bindings
2414 EXISTS(x,_,_): { new_binding(x); }
2415 | FORALL(x,_,_): { new_binding(x); }
2416 | GENERATOR(xs,_,_): { new_binding(xs); }
2417 | LET(x,_,_): { new_binding(x); }
2419 // rename variables
2420 | ID x: { rename(x); }
2422 postorder: // removes the binding
2423 | EXISTS(x,A,_): { old_binding(x); }
2424 | FORALL(x,A,_): { old_binding(x); }
2425 | GENERATOR(xs,As,_): { old_binding(xs); }
2426 | LET(x,_,_): { old_binding(x); }
2428 \end{verbatim}
2430 Note that we have accomplished a depth first traversal using rewriting
2431 without writing any traversal code! As a side benefit,
2432 since the traversal is automatically determined by
2433 the structure of the datatypes, we do not have to rewrite this
2434 renaming code even if the abstract syntax of the query language is
2435 extended, as long as no additional binding operators are added.
2437 \Subsection{The \T{rewrite} statement}
2439 While the rewrite class construct provides a very general abstraction
2440 for rewriting, in general its full power is unneeded. It is often
2441 convenient to be able to perform rewriting on a term without having
2442 to make a new name for a class just for the occasion, especially if member
2443 functions and member data are unneeded. To accommodate these situations,
2444 the \verb|rewrite| statement is provided
2445 to perform a set rewriting transformations on a term without having
2446 to define a temporary rewrite class. It is simply syntactic sugar
2447 for the more general rewrite class and rewrite
2448 rules specifications.
2449 For example, a simplify routine for type \verb|Exp| defined above can be
2450 specified as follows:
2452 \begin{verbatim}
2453 Exp simplify (Exp e)
2454 { // transformations on e before
2455 rewrite (e) type (Exp)
2456 { ADD (INT a, INT b): INT(a+b)
2457 | SUB (INT a, INT b): INT(a-b)
2458 | MUL (INT a, INT b): INT(a*b)
2459 | ...
2461 // transformations on e after
2462 return e;
2464 \end{verbatim}
2466 The \verb|rewrite| normally performs the replacement in-place.
2467 An applicative version of the same can be written as follows\footnote{The
2468 variable {\tt e} is assigned the new copy.}:
2470 \begin{verbatim}
2471 Exp simplify (Exp e)
2472 { rewrite (e) => e type (Exp)
2473 { ADD (INT a, INT b): INT(a+b)
2474 | SUB (INT a, INT b): INT(a+b)
2475 | MUL (INT a, INT b): INT(a*b)
2476 | ...
2478 return e;
2480 \end{verbatim}
2482 The syntax of the rewrite statement is as follows. The
2483 traversal list of the set of rewrite rule is listed next to
2484 the keyword \T{type}.
2486 \begin{syntax}
2487 \NDEF{Rewrite\_Stmt}
2488 & \IS & \TDEF{rewrite} \T{(} \Exp \T{)} \OPT{\T{=>} \N{Exp}} \\
2489 & & \quad \T{type} \T{(} \LIST{\N{TypeExp}} \T{)} \\
2490 & & \T{\{} \OPT{\N{Index\_Decl}} \N{Rewrite\_Rules} \T{\}} \\
2491 & \OR & \TDEF{rewrite} \T{(} \N{Exp} \T{)} \OPT{\T{=>} \N{Exp}} \\
2492 & & \quad \TDEF{type} \T{(} \LIST{\N{TypeExp}} \T{)} \T{of} \\
2493 & & \quad \OPT{\N{Index\_Decl}} \N{Rewrite\_Rules} \\
2494 & & \T{end} \T{rewrite} \T{;} \\
2495 \end{syntax}
2497 \Subsubsection{Rewriting modes} \label{sec:rewriting-modes}
2498 There are two basic modes of operations in a rewrite class: (i)
2499 {\em tree rewriting} and (ii) {\em tree parsing}.
2500 Tree rewriting repeatedly looks for redexes in a tree and transforms it into
2501 another tree. There are two sub-modes of operations: (a) {\em applicative}
2502 and (b) {\em in-place}. Applicative mode is specified using the
2503 \T{applicative} mode specifier when declaring a rewrite class.
2504 In this mode, a new tree is built from a
2505 bottom-up manner, and the subject tree is left unaltered. The default
2506 mode is ``in-place.'' In this mode, the subject tree is overwritten
2507 with the transformed expression.
2509 On the other hand, in tree parsing mode, the left hand side
2510 of a rewrite rules specification is treated as a tree grammar.
2511 The tree parser will look for a derivation of given tree object from
2512 the start non-terminal.
2513 A guarded rule will only be used if the guard evaluates to true.
2515 If rules are annotated with cost expressions, then the
2516 tree parser will try to locate a derivation chain with a minimal total
2517 cost. After a derivation tree is found, the tree can be repeated
2518 traversed. The rhs actions will be invoked during this traversal process.
2520 \Subsubsection{State caching optimization} \label{sec:state-caching}
2522 The pattern matching process proceeds in a bottom up manner:
2523 for each subtree of the form $l(t_1,t_2,\ldots,t_n)$, $n \ge 0$,
2524 a state number is computed from the current tree label $l$ and the
2525 state numbers of its subtrees $t_1, t_2, \ldots, t_n$. The set of matched
2526 rules can be directly computed from the current state number.
2527 This means that in the absence of redex replacements, pattern matching
2528 takes time proportional only $O(|t| + r)$, where $|t|$ is the size
2529 of the input tree $t$ and $r$ is time it takes to execute the rhs actions.
2530 That is, the runtime is insensitive to the number of rewrite rules or the
2531 complexity of the patterns.
2533 However, if the input is a DAG rather than a tree\footnote{Rewriting
2534 of a generic graph structure is not recommended with \T{rewrite}.},
2535 or if replacements are performed, then the above analysis may not hold.
2536 Replacements during rewriting often require state information of the
2537 replacement term to be recomputed to further the matching process.
2538 Since computation of state can involve a complete traversal of a term,
2539 replacement can become expensive if the replacement term is large.
2540 For instance, consider the following replacement rule, which replaces
2541 all expressions of the form {\em 2*x} into {\em x+x}:
2543 \begin{verbatim}
2544 rewrite class StrengthReduction
2546 MUL (INT 2, ?x): ADD(?x, ?x)
2549 \end{verbatim}
2551 \noindent
2552 Since the subterm \verb|?x| could be arbitrarily large, recomputing the
2553 state encoding for \verb|ADD(?x,?x)| naively takes time in proportion to the
2554 size of \verb|?x|. However, since the subtree \verb|?x| occurs as part
2555 of the pattern, its state has already been computed, and it would be
2556 wasteful to recompute its state from scratch.
2558 A similar observation can be made to the case when the input is a DAG,
2559 where many subcomponents are shared between nodes. In order to speed
2560 up the rewriting process, the state of each shared copy of the nodes
2561 should be computed only once.
2563 In order to speedup the state computation process in these situations,
2564 {\em \INDEX{state caching}} can be enabled by
2565 specifying a {\em rewriting index}.
2566 This can be accomplished in two ways: (i) use the qualifier
2567 \TDEF{rewrite} when defining a datatype. This specifies the
2568 {\em primary index}. Or alternatively, (ii) use index declarations
2569 to specify one or more {\em secondary indices}.
2571 We'll first describe the first method.
2572 A \TDEF{rewrite} qualifier can be used
2573 in the definition of a datatype. For instance:
2575 \begin{verbatim}
2576 datatype Exp :: rewrite
2577 = INT (int)
2578 | ID (const char *)
2579 | ADD (Exp, Exp)
2580 | SUB (Exp, Exp)
2581 | MUL (Exp, Exp)
2582 | DIV (Exp, Exp)
2584 \end{verbatim}
2586 The \T{rewrite} qualifier tells the translator to generate an extra
2587 state field in the implementation of \T{Exp}. This state field is implicitly
2588 updated during the rewriting process. When the rewriter encounters a
2589 node with a cached state it can quickly short-circuit all unnecessary
2590 state computation.
2592 {\bf A word of caution:} since each rewriting system computes
2593 its own encoding, rewriting systems should not be mixed if they shard the
2594 same index, i.e. actions of a
2595 rewriting system should not invoke another rewriting
2596 system on a term that is participating in the current rewriting system,
2597 if both systems use the same index. This limitation can be overcome
2598 by using secondary indices, which we'll discuss next.
2600 \Subsubsection{Specifying secondary indices}
2602 From release 2.3.0 onward, it is possible to specify secondary indices
2603 within a rewriting system. This makes it possible to invoke other
2604 rewriting systems within the execution of another, while retaining the
2605 state caching optimizations throughout.
2607 There are two forms of secondary index: {\em internal index} and
2608 {\em external index}. Internal indices are stored directly within
2609 a rewrite term and require the user to pre-allocate space for each term.
2610 In contrast, external indices are stored in a separate data structure
2611 such as a hash table.
2613 The syntax of secondary index declarations is as follows:
2614 \begin{syntax}
2615 \NDEF{Index\_Decl} & \IS & \TDEF{index:} \LIST{\N{Index\_Spec}} \T{;} \\
2616 \NDEF{Index\_Spec} & \IS &
2617 \TypeExp \T{=} \T{none} & \C{Disable indexing} \\
2618 & \OR & \TypeExp \T{=} \N{Id} & \C{Internal index} \\
2619 & \OR & \TypeExp \T{=} \TDEF{extern} \N{Id} & \C{External index} \\
2620 \end{syntax}
2623 \Subsubsection{Using an internal index}
2625 In order to make use of an index, certain member functions have to defined
2626 by the user. Given an internal index named {\em foo}, the rewriter
2627 invokes two functions named
2628 \begin{quotation}
2629 \noindent
2630 {\tt int get\_{\em foo}\_state() const; } \\
2631 {\tt void set\_{\em foo}\_state(int); }
2632 \end{quotation}
2633 \noindent within the generated code. The user should provide the
2634 implementations for these functions within the datatypes.
2636 For example, reusing the well-formed formulas example (see
2637 section~\ref{sec:Wff}),
2638 an internal index on the datatype \T{Wff} can be implemented as follows:
2639 \begin{verbatim}
2640 #include <AD/rewrite/burs.h>
2641 class WffIndex {
2642 int state;
2643 public:
2644 WffIndex() : state(BURS::undefined_state) {}
2645 int get_wff_state() const { return state; }
2646 void set_wff_state(int s) { state = s; }
2649 datatype Wff : public WffIndex
2652 | Var (const char *)
2653 | And (Wff, Wff)
2654 | Or (Wff, Wff)
2655 | Not (Wff)
2656 | Implies (Wff, Wff)
2661 rewrite Simplify {
2662 index: Wff = wff;
2665 \end{verbatim}
2667 Here, the class \T{WffIndex} provides the internal index interface
2668 functions \verb|get_wff_state| and \verb|set_wff_state| expected by
2669 the rewriter. Note that each term must be initialized with the
2670 state \verb|BURS::undefined_state|. This constant is defined within
2671 the library include file \verb|<AD/rewrite/burs.h>|.
2673 To enable the index, in the rewriter
2674 we specify that datatype \T{Wff} should make use of the index named \T{wff},
2675 using the \T{index:} declaration.
2677 \Subsubsection{Using an external index}
2679 External indices are specified in a similar manner. Given a datatype
2680 {\em Foo} and an external index named {\em bar}, the rewriter invokes
2681 calls to the following functions:
2682 \begin{quotation}
2683 \noindent
2684 {\tt int get\_{\em bar}\_state({\em Foo}) const; } \\
2685 {\tt void set\_{\em bar}\_state({\em Foo}, int); }
2686 \end{quotation}
2687 Typically, these are implemented as member functions in the rewrite class.
2689 The rewriter uses the function {\tt get\_{\em bar}\_state} to lookup
2690 previously cached information set by the function {\tt set\_{\em bar}\_state}.
2691 Typically, the implementation of the two functions can be implemented
2692 as hash tables, using the addresses of the terms as hash functions and
2693 pointer equality comparisons. Note that caching can be {\em lossy}; i.e.
2694 it is perfectly alright for the cache to eliminate cached information to
2695 keep its size under control. If no cache information is found,
2696 the function {\tt get\_{\em bar}\_state} should return
2697 \verb|BURS::undefined_state|.
2699 \paragraph{Class \CLASSDEF{RewriteCache}}
2700 To make it easy for the users to implement their own external indices,
2701 two sample external index implementation are provided. The first
2702 is the class \verb|RewriteCache| defined in the library file
2703 \verb|<AD/rewrite/cache.h>|.
2705 The class \verb|RewriteCache| implements a simple fixed capacity
2706 cache with the following member functions.
2707 \begin{verbatim}
2708 RewriteCache();
2709 RewriteCache(int capacity);
2710 ~RewriteCache();
2712 void clear(); // after rewriting
2713 void initialize(); // before rewriting
2714 void initialize(int capacity); // before rewriting/also set capacity
2716 int capacity() const;
2718 void set_state(const void * t, int s);
2719 int get_state(const void * t) const;
2721 void invalidate(const void * t);
2722 \end{verbatim}
2724 The user should call the function \verb|initialize| before rewriting
2725 in order to setup the cache and/or clear out previously cached result.
2726 The functions \verb|set_state| and \verb|get_state| can be used to
2727 implement the necessary state caching functions expected by the rewriter.
2729 Returning to our \T{Wff} example, we have the following possible implementation
2730 of the external index:
2731 \begin{verbatim}
2732 rewrite class Simplify (Wff)
2733 { RewriteCache cache;
2734 public:
2735 int get_wff_state(Wff t) const { return cache.get_state(t); }
2736 void set_wff_state(Wff t, int s) { cache.set_state(t,s); }
2739 rewrite Simplify
2740 { index: Wff = extern wff;
2743 \end{verbatim}
2745 \begin{Tips}
2746 If manual memory management is used,
2747 a term should not be deallocated from memory if a cache
2748 contains a handle to it. The method \verb|invalidate| can be used to
2749 make sure the entry associated with a term is removed from the cache.
2750 \end{Tips}
2752 \paragraph{Class \CLASSDEF{GCRewriteCache}}
2754 {\em This class has not be completely tested.}
2756 Since a garbage collectable object is not reclaimed by the collector
2757 whenever there exists a reference to it, using the class
2758 \verb|RewriteCache| may create problems since objects are not reclaimed
2759 if it is referenced by the cache. The template class \verb|GCRewriteCache|
2760 solves this problem by using {\em weak pointers} in its internal
2761 implementation.
2763 The template class \verb|GCRewriteCache<T>| expects \verb|T| to be
2764 a class derived from \verb|GCObject|.
2765 This class implements the following functions:
2766 \begin{verbatim}
2767 GCRewriteCache();
2768 GCRewriteCache(int capacity);
2769 ~GCRewriteCache();
2771 void clear(); // after rewriting
2772 void initialize(); // before rewriting
2773 void initialize(int capacity); // before rewriting/also set capacity
2775 int capacity() const;
2777 void set_state(T * t, int s);
2778 int get_state(T * t) const;
2780 void invalidate(T * t);
2781 \end{verbatim}
2783 The usage of these functions are identical to that of class
2784 \verb|RewriteCache|.
2786 \Subsection{Short circuiting rewrite paths with \TDEF{cutrewrite}}
2788 From version 2.3.0 onward, the \T{cutrewrite}($e$)
2789 statement may be used wherever a \T{rewrite}($e)$ statement is expected.
2790 The \T{cutrewrite} statement can be used to selectively ignore certain
2791 subterms during the rewriting process.
2793 The semantics of \T{cutrewrite}($e$) is to replace the current redex
2794 with $e$ but do not bother with looking for other redexes in the replacement
2795 term. This means that the replacement term will not be re-traversed
2796 immediately. Furthermore, the replaced term will not match any other
2797 left hand side sub-patterns except pattern variables and wildcards.
2799 This feature is very useful for preventing the rewriter from
2800 looking inside certain terms.
2802 To demonstrate, consider the simple following example:
2803 \begin{verbatim}
2804 datatype Exp = NUM of int
2805 | ADD of Exp, Exp
2806 | SUB of Exp, Exp
2807 | MUL of Exp, Exp
2808 | DIV of Exp, Exp
2809 | FINAL of Exp
2811 Exp e = ADD(NUM(1),MUL(NUM(2),NUM(3)));
2812 \end{verbatim}
2814 Suppose we want to find all numbers within
2815 the expression \T{e} and increment their values by 1.
2816 The straight forward way of writing:
2817 \begin{verbatim}
2818 rewrite (e) type (Exp) {
2819 NUM x: NUM(x+1)
2821 \end{verbatim}
2822 \noindent will not work, since the replacement term is a new redex,
2823 and the rewriting system will not terminate.
2825 We can reimplement the rewriting system as a two stage process.
2826 First, we mark all terms that we do not want to be changed; whenever we
2827 find a marked term, we execute a \T{cutrewrite} to prevent the term
2828 from being changed. Then we unmark the terms.
2829 \begin{verbatim}
2830 rewrite (e) type (Exp) {
2831 NUM x: FINAL(NUM(x+1));
2832 preorder:
2833 FINAL _: cutrewrite;
2835 rewrite (e) type (Exp) {
2836 FINAL x: x
2838 \end{verbatim}
2840 In the first rewrite statement, replacement terms that we
2841 want to stay fixed are encapsulated
2842 within the auxiliary \T{FINAL} constructor.
2843 Recall that preorder rules are tried before the subterms of a redex are
2844 reduced. Thus we can make sure all that
2845 all terms that are arguments to a \T{FINAL} term is left unaltered
2846 by adding a \T{preorder} rule that executes a \T{cutrewrite}\footnote{
2847 If an expression argument is not given to a \T{cutrewrite} statement, then
2848 the redex is left unchanged.}
2849 Finally, after the first rewrite statement finishes, we use
2850 the second rewrite statement to remove all \T{FINAL} terms generated
2851 in the first.
2853 The semantics of \T{cutrewrite} needs further explanation. Consider
2854 the following rewrite system:
2855 \begin{verbatim}
2856 rewrite (e) type (Exp) {
2857 NUM x: cutrewrite(NUM(x+1))
2858 | MUL(X, NUM 1): X
2859 | MUL(NUM 1,X): X
2861 \end{verbatim}
2862 Given the term \verb|MUL(NUM(0),NUM(2))|, the subterms \verb|NUM(0)|
2863 and \verb|NUM(1)|
2864 will be rewritten by the first rule
2865 into \verb|NUM(1)| and \verb|NUM(3)| respectively.
2866 Furthermore, the replacement term \verb|NUM(1)| will {\em not} match
2867 the left subpattern in \verb|MUL(NUM 1,X)|. Thus the result of the rewrite
2868 will be \verb|MUL(NUM(1),NUM(3))|.
2870 \begin{Tips}
2871 Since the semantics of a rewriting system with \T{cutrewrite} can
2872 be tricky to analyze, its use should be avoided as much as possible.
2873 \end{Tips}
2875 \Subsection{Conditional failure with \TDEF{failrewrite}}
2877 From version 2.3.0 onward, the \T{failrewrite}
2878 statement may be used wherever a \T{rewrite} statement is expected.
2879 The \T{failrewrite} statement is an alternate way of writing
2880 conditional rewriting rules, and it is often useful when the condition
2881 to be checked is too complex to be written as a guard.
2883 When a \T{failrewrite} statement is executed within the rhs action of a
2884 rewrite rule, the current pattern is rejected and the rewriter
2885 will proceed to try other matches.
2887 For instance, suppose the current redex is
2888 \verb|DIV(INT 0,INT 0)| and the following rules are specified within
2889 a rewriting system:
2890 \begin{verbatim}
2891 DIV(INT X,INT Y): { if (Y == 0) failrewrite; else rewrite(NUM(X/Y); }
2892 | DIV(INT 0,Y): NUM(0)
2893 \end{verbatim}
2895 The execution of these rules will proceed as follows: both patterns
2896 match the redex, but the first rule will be tried first. Since
2897 \verb|Y| is zero, the \T{failrewrite} statement is executed. The rewriter
2898 will proceed to try the
2899 second rule and the redex will be replaced with \verb|NUM(0)|.
2900 Note that if we omit the \T{failrewrite} statement
2901 in the first rule, as in:
2902 \begin{verbatim}
2903 DIV(INT X,INT Y): { if (Y != 0) rewrite(NUM(X/Y); }
2904 | DIV(INT 0,Y): NUM(0)
2905 \end{verbatim}
2906 the rewriter will commit on the first rule
2907 and no action will be performed on the redex.
2909 Like \T{rewrite} and \T{cutrwerite}, \T{failrewrite}
2910 acts as a branching statement and all statements
2911 immediatedly following \T{failrewrite} are dead code.
2913 \Subsection{Confluence and termination}
2915 Currently no automatic checking is available to make sure that
2916 a set of rewriting rules is confluent and will terminate. This is currently
2917 the user's responsibility. Some verification features will be developed
2918 in the future.
2920 \Subsection{Debugging Tree Rewriting}
2922 By default, \Prop{} generates a macro \verb|DEBUG_|$C$ for each
2923 rewrite class named $C$. The user can redefine this macro in order to
2924 trace the replacements that are performed during the execution of a
2925 set of rewriting rules. The default definition of \verb|DEBUG_|$C$
2926 is as follows:
2928 \verb|#define DEBUG_|C({\it R,redex,filename,linenumber,ruletext})\ {\it R}
2930 i.e. it simply returns the value of {\it R}.
2932 The arguments to the macro are as follows:
2933 \begin{itemize}
2934 \item {\it R} is the new replacement term.
2935 \item {\it redex} is the current redex, before the replacement has been
2936 performed.
2937 \item {\it filename} is the file name of the current rewriting rule.
2938 \item {\it linenumber} is the location of the current rewriting rule.
2939 \item {\it ruletext} is a string that describes the current rule.
2940 \end{itemize}
2942 In order to activate tracing, the user can redefine the macro
2943 \verb|DEBUG_FOO| (for rewrite class \verb|FOO|) as follows:
2944 \begin{verbatim}
2945 #define DEBUG_FOO(a,b,c,d,e) print_rule(a,b,c,d,e)
2947 template <class T>
2948 T print_rule(T replacement, T redex,
2949 const char * file_name,
2950 int line, const char * rule_text)
2951 { cerr << file_name << ':' << line << ": " << rule_text
2952 << " redex = " << redex << " replacement = " << replacement
2953 << endl;
2954 return replacement;
2956 \end{verbatim}
2957 Note that these definitions must appear
2958 before the \verb|rewrite FOO| declaration.
2960 \Subsection{Optimizing Tree Rewriting} \label{sec:optimize-rewriting}
2962 An experimental rewriting optimizer based on partial evaluation
2963 techniques has been implemented in release 2.3.4 of \Prop. The optimizer
2964 can be enabled with the option \OPTIONDEF{-Orewriting}.
2966 Informally speaking, the rewriting optimizer will try to locate
2967 ``longer'' reduction sequence and add these to your rewriting rules.
2968 The result is that during execution these bigger reduction steps are
2969 taken whenever possible.
2971 For example, consider the \verb|Wff| datatype defined
2972 in section~\ref{sec:rewriting-modifier}. Suppose we have the following
2973 two rewriting rules in our rule set:
2974 \begin{verbatim}
2975 Not(Not X): X
2976 | FORALL(X,A,P): Not(EXISTS(X,A,Not(P)))
2977 \end{verbatim}
2979 Given an input term of the form \verb|FORALL(X,A,Not(P))|, the term will
2980 first be reduced to \verb|Not(EXISTS(X,A,Not(Not(P))))| via the second rule,
2981 then to \verb|Not(EXISTS(X,A,P))| via the first rule. The optimizer
2982 can discover that this two step reduction can be compressed into one
2983 step if the rewriting system is augmented with the following rule:
2985 \begin{verbatim}
2986 FORALL(X,A,Not P): Not(EXISTS(X,A,P))
2987 \end{verbatim}
2989 Note that this augmented rule is a {\em specialization} of the two rules
2990 given by the user. In general, the rewriting optimizer will repeatedly
2991 look for useful specializations and add them to the user rule set\footnote{
2992 To ensure termination of the optimization process,
2993 augmented rules will not be further specialized.}.
2994 These specialized rules will perform multiple reductions in one single step.
2996 To make use of the optimizer, the rewriting system must have the following
2997 properties:
2998 \begin{enumerate}
2999 \item It must be strongly normalizing, i.e. it will terminate under all
3000 inputs.
3001 \item It must be confluent, or else confluence is not necessary.
3002 Notice that the optimizer can and will alternate the reduction order
3003 of a rule set. Thus if a rewriting system is non-confluent, an
3004 optimized rewriting system may give a different result.
3005 \end{enumerate}
3007 In addition, the following things should hold to maximize the effectiveness
3008 of the optimizer.
3009 \begin{enumerate}
3010 \item Indices (i.e. state caching) should be enabled. Otherwise
3011 the optimizer will not be able to optimize a rule.
3012 \item Specializable rules must be simple, i.e. the right hand side
3013 must be written as an expression consist of only \Prop{} datatypes.
3014 If the right hand side is a complex statement or if it involves
3015 external function calls, the optimizer will fail to analyze it properly.
3016 \item Only bottom-up rules may be present.
3017 \item Avoid conditional rules whenever possible.
3018 \end{enumerate}
3020 Finally, it should be warned that an optimized rewriting system
3021 may generate a lot more code than the un-optimized one. The user should
3022 view the generated report and check that excessive specializations have not
3023 been performed.
3025 \Section{User defined datatypes: Views} \label{sec:views}
3027 For many applications it is necessary to interface \Prop{} to
3028 data structures predefined in libraries, or generated by other tools.
3029 In order to make it possible to work with these external data structures,
3030 \Prop{}, from version 2.3.3. onward, provides a {\em view} mechanism to
3031 interface with these data structures. Using views, an externally defined
3032 type can be used within \Prop's pattern matching and rewriting
3033 mechanisms transparently, just as if it were defined by \Prop.
3034 The \Prop{} translator will call the
3035 appropriate interface functions specified by the user to manipulate
3036 these external datatypes.
3038 \Subsection{A first example} \label{sec:tagged-union}.
3039 To illustrate the use of views, we'll begin with a simple example.
3041 Suppose in our application we use the following tagged C union to
3042 represent our expression datatype. An expression is represented
3043 simply as a pointer to this structure. An null expression is
3044 represented simply as the NULL pointer.
3046 \begin{verbatim}
3047 enum exp_tag { Int = 1, Ident, Add, Sub, Mul, Div };
3048 struct exp {
3049 enum exp_tag tag;
3050 union {
3051 int number;
3052 const char * ident;
3053 struct { struct exp * l, * r } children;
3054 } u;
3056 typedef struct exp * Exp;
3057 /* User defined constructor functions */
3058 Exp NONE = NULL;
3059 extern Exp INT(int);
3060 extern Exp IDENT(const char *);
3061 extern Exp ADD(Exp, Exp);
3062 extern Exp SUB(Exp, Exp);
3063 extern Exp MUL(Exp, Exp);
3064 extern Exp DIV(Exp, Exp);
3065 \end{verbatim}
3067 Suppose this data structure is already used extensively in other
3068 modules, and we'd like to \Prop{} pattern matching mechanisms while keeping
3069 the old code intact. We can accomplish this by declaring a
3070 view to this data structure, using a \TDEF{datatype view}
3071 definition:
3072 \begin{verbatim}
3073 dataype Exp :: view =
3074 match (this ? this->tag : 0)
3075 view 0 => NONE
3076 | view Int => INT(int = this->u.number)
3077 | view Ident => ID (const char * = this->u.ident)
3078 | view Add => ADD(Exp = this->u.children.l, Exp = this->u.children.r)
3079 | view Sub => SUB(Exp = this->u.children.l, Exp = this->u.children.r)
3080 | view Mul => MUL(Exp = this->u.children.l, Exp = this->u.children.r)
3081 | view Div => DIV(Exp = this->u.children.l, Exp = this->u.children.r)
3083 \end{verbatim}
3085 In the above, a view named \verb|Exp| is defined. Let's describe
3086 what the definition means: in general,
3087 a view definition has three main parts, the {\em view selector},
3088 a set of {\em view transformation rules}, and a set {\em of view accessor
3089 expressions}. These parts perform the following tasks:
3091 \begin{description}
3092 \item[View selector] The {\em view selector} is specified in the
3093 the \TDEF{match} part of the definition. In this example,
3094 the expression associated with \T{match} is
3095 \verb|this ? 0 : this->tag|. This returns a variant discrimination integer
3096 from \verb|0| to \verb|Div|.
3097 Note that the pseudo variable \TDEF{this}
3098 refers to an object of type \verb|Exp|.
3100 \item[View transformation rules]
3101 Each of the clauses
3102 \[ \TDEF{view} {\it exp} \verb|=>| {\it term} \]
3103 specifies
3104 a {\em view transformation rule}. These rules are used to specify
3105 how to transform a computed variant tag into a pattern. In general,
3106 expression {\it exp} must be a constant expression usable in a
3107 \verb|case| statement.
3109 \item[View accessors]
3110 In each of the constructor arguments, an {\em view accessor} expression
3111 can be defined to access the logical components of the constructor.
3112 In the rule:
3113 \begin{verbatim}
3114 | view Add => ADD(Exp = this->u.children.l, Exp = this->u.children.r)
3115 \end{verbatim}
3117 The expressions \verb|this->u.children.l| and \verb|this->u.children.r|
3118 tell \Prop{} how to access the left and right child of the an addition
3119 expression node.
3121 Note that since the variant \verb|NONE| has no arguments, it is represented
3122 as a nullary constructor, and no accessors expressions are defined.
3123 \end{description}
3125 A pretty printing function for \verb|struct exp| can be implemented
3126 in \Prop{} using pattern matching as follows:
3127 \begin{verbatim}
3128 ostream& operator << (ostream& s, Exp e)
3129 { match (e)
3130 { NONE: { s << "none"; }
3131 | INT i: { s << i; }
3132 | ID id: { s << id; }
3133 | ADD(a,b): { s << '(' << a << " + " << b << '); }
3134 | SUB(a,b): { s << '(' << a << " - " << b << '); }
3135 | MUL(a,b): { s << '(' << a << " * " << b << '); }
3136 | DIV(a,b): { s << '(' << a << " / " << b << '); }
3138 return s;
3140 \end{verbatim}
3142 Note that this is exactly the same code we would've written if
3143 \verb|Exp| is defined as a datatype rather than a view. This means
3144 that the user can reimplement the expression data structure
3145 as a \Prop's datatype at a latter time, without altering the \Prop{}
3146 source code. In effect, the {\em logical} view of a datatype, and its
3147 {\em physical} implementation can separately.
3148 With this {\em representation transparency} capability,
3149 programs written in \Prop's pattern matching formalisms can evolve
3150 in an incrementally and pain-free manner.
3152 The generated code for the above function will resemble the following
3153 output. Note that all the view selectors and view accessors are
3154 inlined into the code, and in general, views incur no overhead to
3155 their use.
3156 \begin{verbatim}
3157 ostream& operator << (ostream& s, Exp e)
3158 { switch (e ? e->tag : 0)
3159 { case 0: { s << "none"; }
3160 case Int: { s << e->u.number; }
3161 case Ident: { s << e->u.ident; }
3162 case Add: { s << '(' << e->u.children.l
3163 << " + " << e->children.r << '); }
3164 case Sub: { s << '(' << e->u.children.l
3165 << " - " << e->children.r << '); }
3166 case Mul: { s << '(' << e->u.children.l
3167 << " * " << e->children.r << '); }
3168 case Div: { s << '(' << e->u.children.l
3169 << " / " << e->children.r << '); }
3171 return s;
3173 \end{verbatim}
3175 \Subsection{Another view example}
3177 Suppose we use the following alternative implementation, a
3178 \verb|Expression| class hierarchy to represent
3179 an expression in our AST.
3180 Note that \verb|Exp| is an abstract base class inherited
3181 by the classes \verb|Number|, \verb|Operator|, \verb|Identifer|, etc.
3183 \begin{verbatim}
3184 class Expression
3186 public:
3188 // This class uses a ``wide'' interface
3189 virtual int number() const = 0; // returns a number
3190 virtual const char * ident() const = 0; // returns an identifier
3192 // returns the ith subchild
3193 virtual const Expression * child(int) const = 0;
3194 // mutable version of above
3195 virtual Expression *& child(int) = 0;
3198 typedef Expression * Exp;
3200 class NullExp : public Expression { ... };
3201 class Number : public Expression { ... };
3202 class Operator : public Expression { ... };
3203 class Identifier : public Expression { ... };
3204 \end{verbatim}
3206 In order to provide a suitable interface to \Prop's view mechanism,
3207 we introduce an additional member function to return a variant tag:
3209 \begin{verbatim}
3210 class Expression
3212 public:
3213 enum Variant { None, Int, Ident, Add, Sub, Mul, Div };
3214 virtual Variant get_variant() const = 0;
3216 \end{verbatim}
3218 In addition, we assume the following constructor functions have been
3219 implemented:
3221 \begin{verbatim}
3222 extern Exp INT(int);
3223 extern Exp IDENT(const char *);
3224 extern Exp ADD(Exp, Exp);
3225 extern Exp SUB(Exp, Exp);
3226 extern Exp MUL(Exp, Exp);
3227 extern Exp DIV(Exp, Exp);
3228 \end{verbatim}
3230 Now, with these definitions in place, we can define a datatype view of the
3231 above \verb|Expression| hierarchy using the following definition:
3233 \begin{verbatim}
3234 datatype Exp :: view
3235 = match (this->get_variant())
3236 view Expression::None => NONE
3237 | view Expression::Int => INT (int = this->number())
3238 | view Expression::Ident => ID (const char * = this->ident())
3239 | view Expression::Add => ADD (Exp = this->child(0),
3240 Exp = this->child(1))
3241 | view Expression::Sub => SUB (Exp = this->child(0),
3242 Exp = this->child(1))
3243 | view Expression::Mul => MUL (Exp = this->child(0),
3244 Exp = this->child(1))
3245 | view Expression::Div => DIV (Exp = this->child(0),
3246 Exp = this->child(1))
3248 \end{verbatim}
3250 An evaluation function can be written
3251 using \Prop's pattern matching construct:
3253 \begin{verbatim}
3254 int eval(Exp e, const Env& env)
3255 { match (e)
3256 { NONE: { return 0; }
3257 | INT i: { return i; }
3258 | ID id: { return env(id); }
3259 | ADD (x,y): { return eval(x,env) + eval(y,env); }
3260 | SUB (x,y): { return eval(x,env) - eval(y,env); }
3261 | MUL (x,y): { return eval(x,env) * eval(y,env); }
3262 | DIV (x,INT 0): { cerr << "Division by zero"; return 0; }
3263 | DIV (x,y): { return eval(x,env) / eval(y,env); }
3266 \end{verbatim}
3267 Note that this code works equally well with the tagged union representation
3268 defined in section~\ref{sec:tagged-union}.
3270 We can also use rewriting to transform an expression tree as follows
3272 \begin{verbatim}
3273 void simplify(Exp& e)
3275 rewrite (e) type (Exp)
3276 { ADD(INT i, INT j): INT(i+j)
3277 | SUB(INT i, INT j): INT(i-j)
3278 | MUL(INT i, INT j): INT(i*j)
3279 | DIV(INT i, INT j): INT(i/j)
3282 \end{verbatim}
3284 \Subsection{Syntax of view definitions}
3286 The general syntax of a view definition resembles that of a
3287 \T{datatype} definition:
3289 \begin{syntax}
3290 \NDEF{Datatype\_Spec}
3291 & \IS & \N{Datatype\_View\_Spec} \\
3292 \NDEF{Datatype\_View\_Spec}
3293 & \IS & \Id \T{::} \T{view} \T{=} \\
3294 & & \quad \T{match} \T{(} \N{Exp} \T{)} \\
3295 & & \quad \N{View\_Cons\_Specs} \\
3296 \NDEF{View\_Cons\_Specs}
3297 & \IS & \ALT{\TDEF{view} \N{Exp} \T{=>} \N{Cons\_Spec}}{|} \\
3298 \end{syntax}
3300 Default arguments in a \N{Cons\_Spec} are interpreted as view accessors.
3302 There are a few general rules to observe when defining a view:
3303 \begin{enumerate}
3304 \item A \T{datatype view} definition only defines the interface
3305 to an external data structure and no code is generated. In particular,
3306 the defined view name is not an actual \Cpp{} type.
3307 Thus the user is responsible for defining all appropriate type definitions.
3308 \item All other qualifiers other than \T{view} are ignored. Thus
3309 automatic generation of pretty printers, garbage collection interface
3310 routines, etc. are disabled for views.
3311 \item In general, the view accessors must be usable as lvalues
3312 if the user wants to modify a view datatype. This includes the
3313 situation when in-place rewriting is used.
3314 \item Finally, the user are responsible for generating the
3315 constructor functions. In the two examples given above, the user
3316 should implement the functions \T{INT}, \T{ID}, etc.
3317 \end{enumerate}
3319 \Section{Graph Types and Graph Rewriting} \label{sec:graph}
3321 A typical compiler uses many types of graphs to represent internal program
3322 information. For instance, control flow graphs,
3323 data dependence graphs, call graphs, def/use chains are
3324 some of the most common examples. In order to automate the process
3325 of implementing and manipulating these graph structures, \Prop{} provides
3326 a generic formalism-- {\em graph types}--
3327 for specifying multisorted, attributed, hierarchical directed graphs in
3328 very high level manner. The data structure mapping process of translating
3329 high level graph specifications into concrete \Cpp{} classes is completely
3330 automated. Thus the user can concentrate on specifying the semantic
3331 relationships between the elements of the domain, instead of the
3332 implementation details.
3334 The graph structure mapping process uses real-time set machine
3335 simulation\cite{real-time-SETL} and subtype analysis\cite{subtype-SETL}
3336 to convert associative accesses involving sets, maps, and multimaps
3337 into hash free, worst-case constant time pointer manipulations.
3338 This optimization is performed transparently by the translator.
3340 Manipulation of graphs structures are done in three ways:
3341 \begin{description}
3342 \item[object-oriented]
3343 Using a standard interface generated from \Prop{} the user can
3344 manipulate a graph in the usual procedural/object-oriented manner.
3345 \item[set formalism] Using an embedded SETL\cite{SETL}-like sublanguage,
3346 the user can manipulate the graphs using high level set operations
3347 such as set comprehension.
3348 \item[graph rewriting] At the highest level, analysis and transformation
3349 can be carried out using the graph rewriting formalism.
3350 \end{description}
3352 In this section we'll describe each of these topics in detail.
3354 \Subsection{Graph Types} \label{sec:graph-types}
3356 A \INDEX{graph type} declaration is used to specify a
3357 graph structure
3358 with multiple sorts of labeled nodes and labeled directed edges.
3359 Attributes can be attached to both nodes and edges.
3360 The syntax of \T{graphtype} declarations is as follows:
3361 \begin{syntax}
3362 \NDEF{Graph\_Type}
3363 & \IS & \TDEF{graphtype} \Id \\
3364 & & \quad \OPT{\T{:} \N{Inherit\_List}} \\
3365 & & \quad \OPT{\T{::} \MORE{\N{Graph\_Mode}}{}} \\
3366 & & \T{declare} \\
3367 & & \quad \TDEF{node:} \ALT{\N{Node\_Def}}{|} \\
3368 & & \quad \TDEF{edge:} \ALT{\N{Edge\_Def}}{|} \\
3369 & & \T{begin} \\
3370 & & \quad \N{Code} \\
3371 & & \T{end} \T{graphtype} \T{;} \\
3372 \NDEF{Graph\_Mode} & \IS & \\
3373 \NDEF{Node\_Def} & \IS & \Id \OPT{\OPT{\T{of}} \N{TypeExp}} \\
3374 \NDEF{Edge\_Def}
3375 & \IS & \Id \OPT{\T{of}} \TypeExp \T{->} \TypeExp \\
3376 & \OR & \Id \OPT{\T{of}} \TypeExp \T{<->} \TypeExp \\
3377 & \OR & \Id \OPT{\T{of}} \TypeExp \T{<=>} \TypeExp \\
3378 & \OR & \Id \OPT{\T{of}} \TypeExp \T{<=>*} \TypeExp \\
3379 \end{syntax}
3381 \Subsection{The Graph Interface} \label{sec:graph-interface}
3382 \INCOMPLETE
3384 \Subsection{Set Formalisms} \label{sec:set-formalisms}
3385 \INCOMPLETE
3387 \Subsection{Graph Rewriting} \label{sec:graph-rewriting}
3388 \INCOMPLETE
3390 \Section{Running the Translator} \label{sec:usage}
3392 The \Prop{} translator is a program called \verb|prop|. The translator
3393 uses the following command line syntax:
3394 \begin{syntax}
3395 \NDEF{Running\_Prop} & ::= &
3396 \T{prop} \OPT{{\em prop\_options}} \ALT{{\em file}}{} \\
3397 \end{syntax}
3399 Here, \N{File} is a file with suffix \verb|.p|$*$. By default, the output
3400 file will have the same name with the \verb|p| extension removed.
3401 For example, a file named ``foo.pC'' will be translated into
3402 the file ``foo.C'' Similarly, a file named ``bar.ph'' will be translated
3403 into ``bar.h''
3405 \Subsection{Options}
3406 The available options to the translator are as follows:
3407 \begin{description}
3408 \item[\OPTIONDEF{-G -GNU}] Use GNU style error messages.
3409 \item[\OPTIONDEF{-I{\em path}}] Use the search path {\em path}.
3410 \Prop{} will search {\em path} to look for \Prop{} files with
3411 the suffix \verb|.ph| inside a \verb|#include| directive. Multiple
3412 paths can be specified by using multiple \verb|-I| options.
3413 \item[\OPTIONDEF{-l -no\_line\_directives}] By default, \Prop{} will try
3414 to generate \verb|#line| directives to correlate the source and
3415 the translated output. This option suppresses this feature.
3416 \item[\OPTIONDEF{-M -make\_depends}] Similarly to the \verb|-M| option
3417 in \verb|cc| and \verb|cpp|. This option generates a suitable dependency
3418 list for Makefiles.
3419 \item[\OPTIONDEF{-memory\_usage}] Print memory use during translation.
3420 \item[\OPTIONDEF{-n -no\_codegen}]
3421 Don't output any code; perform semantic checking
3422 only.
3423 \item[\OPTIONDEF{-N -non\_linear}]
3424 Use non-linear patterns during pattern matching.
3425 A non-linear pattern is one in which a variable occurs more than
3426 once. By default, \Prop{} consider this an error.
3427 \item[\OPTIONDEF{-o{\em outfile}}] Send output to {\em outfile} instead of
3428 the default.
3429 \item[\OPTIONDEF{-Ofast\_string\_match}]
3430 Generate (potentially) faster string matching
3431 code by expanding string literal tests into character tests. By default,
3432 \Prop{} generates string tests using \verb|strcmp| and binary search.
3433 \item[\OPTIONDEF{-Oadaptive\_matching}] Use the adaptive pattern matching
3434 algorithm\footnote{The implementation may not be effective.}.
3435 \item[\OPTIONDEF{-Oinline\_casts}]
3436 Generate inline type casts instead of function
3437 calls that do the same. This may be faster for compilation.
3438 \item[\OPTIONDEF{-Orewriting}] Enable the rewriting optimizer.
3439 See also section~\ref{sec:optimize-rewriting} for details.
3440 \item[\OPTIONDEF{-Otagged\_pointer}]
3441 Use a tagged pointer representation instead
3442 of representing variant tags explicitly in a structure field. For
3443 example, if there are two variants called \verb|Foo| and
3444 \verb|Bar| in a datatype. Then a pointer to \verb|Foo| can be tagged
3445 with a low order bit 0 while a pointer to \verb|Bar| can be tagged with
3446 a low order bit 1. Accessing the tag thus becomes just a simple bit
3447 test. This should save space and may be faster\footnote{However, this feature
3448 has not been fully tested.}
3449 \item[\OPTIONDEF{-r -report}]
3450 Generate a report whenever necessary. Parser, string
3451 matching, and rewriting constructs produce reports.
3452 \item[\OPTIONDEF{-s -strict}] Use strict semantic checking mode.
3453 All warnings are considered to be errors.
3454 \item[\OPTIONDEF{-S -save\_space}]\label{option:save-space}
3455 Use space saving mode.
3456 Try not to use inline functions if code space can be saved.
3457 Datatype constructors will not be inlined
3458 in this mode. Instead, datatype constructors will be generated at
3459 where the corresponding \TDEF{instantiate datatype} declaration occurs.
3460 \item[\OPTIONDEF{-t -stdout}] Send translated program to the standard output.
3461 \item[\OPTIONDEF{-use\_global\_pool}] Use global memory pool for allocation.
3462 \item[\OPTIONDEF{-v{\em num}}] Use verbose mode in report generation.
3463 This will provide more detailed information.
3464 \end{description}
3466 \Subsection{Error Messages}
3468 The following is a canonical list of all error messages generated by
3469 the \Prop{} translator. Each message is prefixed by the
3470 file name and line number in which the error occurs. Most of these
3471 errors messages are self explanatory; more detailed explanations are provided
3472 below.
3474 \begin{Error}
3475 \errormsg{unknown option {\em option}} The translator does not recognize
3476 the command line {\em option}. Type {\em prop} to see a list of options.
3477 \errormsg{Error in command: {\em command}} The translator fails when trying
3478 to execution {\em command}. When multiple files are specified in a command
3479 line, {\em prop} invokes itself on each of the file.
3480 \errormsg{{\em pat} with type {\em type} is not unifiable} Pattern has not
3481 been declared to be a unifiable type.
3482 \errormsg{Sorry: pattern not supported in rewriting: {\em pat}} Pattern
3483 {\em pat} is not supported. Currently, logical pattern connectives
3484 are not supported in rewriting. You'll have to rewrite them into
3485 non-logical form.
3486 \errormsg{Unknown complexity operator: {\em op}}
3487 \errormsg{accessor is undefined for view pattern: {\em pat}} An accessor
3488 function has not been declared for a constructor. When using datatype views
3489 \errormsg{arity mismatch (expecting $n$) in pattern: {\em pat}} Pattern
3490 {\em pat} is expected to have arity $n$. The arity is the number of
3491 expressions that you're matching concurrently. If you are matching $n$
3492 objects then there must be $n$ patterns for each rule.
3493 \errormsg{arity mismatch in logical pattern: {\em pat}} Logical patterns
3494 do not match the arity of a pattern.
3495 \errormsg{bad constructor type {\em type}}
3496 \errormsg{bad view constructor pattern: {\em pat}}
3497 \errormsg{can't find include file {\em file}} Additional search paths
3498 can be specified with the \verb|-I| option
3499 \errormsg{component \#$i$ not found in constructor {\em con}}
3500 Datatype constructor {\em con} does not have a component named \#$i$.
3501 This can happen when you write a record constructor expression and misspelt
3502 one of the labels.
3503 \errormsg{component $l$ not found in constructor {\em con}} Datatype
3504 constructor {\em con} does not have a labeled component $l$.
3505 \errormsg{constructor {\em con} already has print formats}
3506 \errormsg{constructor {\em con} is not a class}
3507 \errormsg{constructor {\em con} is not a lexeme} A constructor is used
3508 as a terminal in the parser but it has not been declared to be a lexeme.
3509 \errormsg{constructor {\em con} doesn't take labeled arguments} You're trying
3510 to use record constructor syntax on a constructor take does not take
3511 record arguments.
3512 \errormsg{constructor {\em con} is undefined}
3513 \errormsg{cyclic type definition in type {\em id} = {\em type}}
3514 Type abbreviations cannot be cyclic. For recursive types, use \verb|datatype|
3515 definitions.
3516 \errormsg{datatype {\em T} is not a lexeme type} {\em T} is used as a
3517 terminal within syntax class when it has not been declared as a lexeme type.
3518 All non-terminals must be a defined using the \verb|lexeme| qualifier.
3519 \errormsg{determinism {\em det} not recognized}
3520 \end{Error}
3522 \begin{Error}
3523 \errormsg{duplicated definition of pattern constructor '{\em con}'}
3524 The constructor {\em con} has already been in another datatype. {\em Prop}
3525 does not allow overloading of constructor names.
3526 \errormsg{duplicated label '$l$' in expression: {\em exp}}
3527 A record type has one of its labels duplicated.
3528 \errormsg{duplicated label '$l$' in type {\em type}}
3529 \errormsg{duplicated pattern variable '{\em id}'. Use option -N}
3530 By default,
3531 pattern variables may not be duplicated within a pattern. Non-linear
3532 patterns are allowed only when the option \verb|-N| is invoked.
3533 \errormsg{edge $e$ is not defined in graphtype {\em id} }
3534 \errormsg{empty type list in rewrite (...) ... }
3535 \errormsg{expecting $c_1$ ... $c_2$ (from line $l$) but found $c_3$ ... $c_4$
3536 instead} Quotation symbols are not properly balanced.
3537 \errormsg{expecting non-terminal {\em nt} to have type $t_1$ but found $t_2$}
3538 Non-terminal {\em nt} has already been defined to have a
3539 synthesized attribute type of $t_1$ but $t_2$ is found. This can happen
3540 if you have rules with the same lhs non-terminal previously defined.
3541 \errormsg{expecting type $t_1$ but found $t_2$ in pattern variable '{\em v}'}
3542 {\em Prop} performs type inference on all the lhs patterns to determine
3543 the types of all variables. Type errors can occur if the patterns are
3544 miswritten.
3545 \errormsg{expecting type $t_1$ but found $t_2$}
3546 \errormsg{flexible vector pattern currently not supported in
3547 rewriting: {\em pat}}
3548 \errormsg{format \#$i$ used on constructor {\em con}}
3549 \errormsg{format \#$l$ used on non-record constructor {\em con}}
3550 \errormsg{function name mismatch: expecting $f$ ...}
3551 \errormsg{illegal character $c$}
3552 \errormsg{illegal context type: {\em type}} A context type in
3553 a \verb|matchscan| statement must be previously defined to be datatype.
3554 The unit constructors of the datatype can be used as context values.
3555 \errormsg{illegal context(s) in pattern {\em pat}}
3556 \errormsg{illegal format '\_' on constructor {\em con}}
3557 \errormsg{illegal incomplete record pattern: {\em pat}}
3558 \errormsg{illegal print format '$c$' in constructor {\em con}}
3559 \errormsg{illegal record label '$l$' in expression: {\em exp}}
3560 \errormsg{illegal width in bitfield '{\em id} ($n$)'}
3561 \errormsg{inherited attribute '{\em type}' can only be used in treeparser
3562 mode in rewrite class {\em id}}
3563 \end{Error}
3565 \begin{Error}
3566 \errormsg{law {\em id}(...) = {\em pat} is not invertible} Pattern {\em pat}
3567 cannot be treated as an expression. It may involve logical patterns and
3568 wild cards.
3569 \errormsg{law '{\em id}': bound variable '$v$' is absent in body {\em exp}}
3570 A parameter variable $v$ must be present within the body of a law.
3572 \errormsg{law '{\em id}': type {\em type} cannot be used in parameter {\em id}}
3574 \errormsg{lexeme {\em id} is undefined}
3576 \errormsg{lexeme class {\em id} is undefined}
3578 \errormsg{lexeme pattern is undefined for constructor {\em con}}
3580 \errormsg{lexeme \{{\em id}\} already defined as {\em regexp}}
3582 \errormsg{lexeme \{{\em id}\} is undefined in {\em regexp}}
3584 \errormsg{missing '\}' in regular expression {\em regexp}}
3586 \errormsg{missing label '$l$' in expression: {\em con} {\em exp}}
3588 \errormsg{missing non-terminal in tree grammar rule: {\em nt}}
3589 \errormsg{missing type {\em type} in the traversal list of rewrite class {\em id}}
3590 Within the rewriting rules, you have used a pattern that involve a constructor
3591 of type {\em type} directly but no such types are defined in the
3592 rewrite class definition. You should add the type to the traversal list.
3594 \errormsg{missing type info in expression {\em exp} : {\em type}}
3595 Sometimes {\em prop} fails to infer the type of an expression within
3596 a {\em match} statement. It such
3597 cases it is necessary to help out the translator by adding additional
3598 type information in the patterns. For example, rewrite some pattern $p$
3599 as $(p : \tau)$ where $\tau$ is the type of $p$.
3601 \errormsg{missing type info in function {\em f} {\em type}} Similar to above
3602 \errormsg{missing type info in rule: {\em f} {\em pat} : {\em type}}
3603 Similar to above.
3604 \errormsg{missing view selector for pattern: {\em pat}}
3605 \errormsg{multiple mixed polarity pattern variable '{\em v}'}
3606 \errormsg{multiple unit constructors in polymorphic type {\em id} {\em arg}
3607 is not supported}
3608 \errormsg{negative cost {\em cost} is illegal}
3609 \errormsg{node {\em id} is not defined in graphtype {\em id}}
3610 \errormsg{non lexeme type {\em type} in pattern {\em pat}}
3611 \errormsg{non-relation type {\em type} in pattern: {\em pat}}
3612 \errormsg{pattern is undefined for lexeme {\em l}}
3613 \errormsg{pattern scope stack overflows}
3614 \errormsg{pattern scope stack underflows}
3615 \end{Error}
3617 \begin{Error}
3618 \errormsg{pattern variable '$v$' has no binding at this point}
3619 \errormsg{persist object id is undefined for {\em con}}
3620 \errormsg{persistence pid {\em name} already allocated for type {\em type}}
3621 \errormsg{persistence redefined for type {\em type}}
3622 \errormsg{precedence symbol must be terminal: {\em term}}
3623 \errormsg{predicate {\em p}: expecting type $t_1$ but found $t_2$}
3624 \errormsg{redefinition of bitfield '{\em field}'.}
3625 \errormsg{redefinition of constructor '{\em con}'}
3626 \errormsg{redefinition of datatype {\em id}}
3627 \errormsg{redefinition of lexeme class {\em id}}
3628 \errormsg{replacement not in rewrite class: rewrite {\em exp}}
3629 \errormsg{rewrite class {\em id} has already been defined}
3630 \errormsg{rewrite class {\em id} is undefined}
3631 \errormsg{rule $r$ has incomplete type {\em type}}
3632 \errormsg{rule $r$ is of a non datatype: {\em type}}
3633 \errormsg{syntax class {\em id} has already been defined}
3634 \errormsg{syntax class {\em id} is undefined}
3635 \errormsg{synthesized attribute '{\em type}'
3636 can only be used in treeparser mode in rewrite class {\em id}}
3637 \errormsg{the class representation of constructor {\em id} has been elided}
3638 \errormsg{this rule is never selected: $r$}
3639 \errormsg{this rule is never used: $r$}
3640 \errormsg{too few arguments {\em arg} in instantiation of type scheme {\em type}}
3641 \errormsg{too many arguments {\em arg} in instantiation of type scheme {\em type}}
3642 \errormsg{type {\em type} is not a datatype}
3643 \errormsg{type {\em type} is undefined}
3644 \end{Error}
3646 \begin{Error}
3647 \errormsg{type {\em id} = {\em type} is not a datatype}
3648 \errormsg{type {\em id} has already been defined as {\em type}}
3649 \errormsg{type {\em id} has unknown class: {\em C}}
3650 \errormsg{type '{\em type}' is not rewritable in treeparser mode rewrite class
3651 {\em id}}
3652 All datatypes used within the treeparser mode of rewriting must be defined
3653 with the \verb|rewrite| qualifier.
3654 \errormsg{type error in pattern {\em pat}: {\em type}}
3655 \errormsg{type mismatch between nonterminal {\em nt}(type $t_1$)
3656 and rule $r$(type $t_2$)}
3657 \errormsg{type mismatch in pattern: {\em pat}}
3658 \errormsg{type mismatch in rule $r$}
3659 \errormsg{type mismatch in rule $r$ {\em pat}}
3660 \errormsg{type or constructor {\em con} is undefined}
3661 \errormsg{unable to apply pattern scheme {\em pat}}
3662 \errormsg{unbalanced $c_1$ ... $c_2$ at end of file}
3663 \errormsg{undefined non-terminal {\em nt}}
3664 \errormsg{unification fails occurs check with $t_1$ and $t_2$}
3665 \errormsg{unmatched ending quote $c$}
3666 \errormsg{unrecognized quoted expression `{\em exp}`}
3667 \errormsg{unrecognized quoted pattern `{\em pat}`}
3668 \end{Error}
3670 \pagebreak
3671 \appendix
3673 \Section{Garbage Collection in the {\sf Prop} Library} \label{appendix:gc}
3674 In this appendix
3675 we describe the design and implementation of a garbage collector
3676 based on the Customisable Memory Management framework(CMM)\cite{CMM} in our
3677 \Prop{} \Cpp{} class library. Like the previous approach, we have
3678 implemented a mostly copying conservative collector based on the work
3679 of Bartlett\cite{Mostly-copying}.
3680 Similar to CMM, our architecture provides a protocol to allow
3681 multiple garbage collectors using different algorithms to coexist in the
3682 same memory space. A few improvements are made to
3683 improve the performance, the flexibility and the functionality of
3684 our collector: to reduce retention due to false roots identification,
3685 blacklisting\cite{Boehm} is used to identify troublesome heap
3686 addresses; the architecture of the system has been generalized so that it is
3687 now possible to have multiple instantiations of Bartlett-style heaps;
3688 finally, object finalization and weak pointer support
3689 are added. Our collector has been tested on gcc 2.5.8 under Linux,
3690 and SunOS 4.1.x\footnote{An alpha version also works under Solaris. Thanks
3691 to Paul Dietz for the patch.}
3693 \Subsection{Introduction}
3694 The \Prop{} project involves the design and implementation of
3695 an environment and an extension language based on \Cpp{}
3696 for compiler construction and program transformation.
3697 An outgrowth of this project is the \Prop{}
3698 \Cpp{} class library, which contains an extensive set of support algorithms
3699 and data structures. Since a typical compiler manipulates many complex tree,
3700 dag and graph objects with extended lifetime, manual memory management using
3701 \Cpp's constructors and destructors, reference counting smart pointers, or
3702 some other techniques is frequently complex and error prone.
3703 Furthermore, with the advent of new algorithms for garbage collection,
3704 manual memory management techniques do not necessarily provide better
3705 performance or space utilization. To make it possible to make use of garbage
3706 collection as a viable alternative for memory management in
3707 \Cpp\cite{Safe-C++-GC}, we have implemented a garbage collection class
3708 hierarchy as part of the \Prop{} library. The class library can be used
3709 directly by the users who'd like to program in \Cpp; it can also be
3710 used as part of the runtime system of a highly level language implemented
3711 in \Cpp, as we have done for \Prop.
3713 We have several good reasons to prefer garbage collection over manual
3714 memory management. The \Prop{} language contains many declarative constructs
3715 and extensions such as algebraic datatypes, pattern matching, rewriting, and
3716 logical inference. When a user programs in \Prop{} using these constructs,
3717 an applicative style of programming is the most natural paradigm.
3719 \Subsection{The Framework}
3721 We base our design on the work on the Customisable Memory Management(CMM)
3722 system\cite{CMM}. In this framework, multiple independent heaps(including
3723 the usually non-collectable heap) can coexist with each other.
3724 Bartlett's mostly copying garbage collector is used as the primary collector.
3725 CMM extends the work of Bartlett\cite{Mostly-copying}
3726 by allowing cross heap pointers and unrestricted interior pointers.
3728 However, all collectable objects are required to derive from a base class
3729 and reimplement a tracing method, which identifies the internal pointers of
3730 an object. This burden is usually quite small and in fact can be
3731 automated from the type definition\footnote{In \Prop, a special keyword
3732 \T{collectable} is used to identify garbage collected classes and types.
3733 The \Prop{} translator uses this type annotation to derive the appropriate
3734 tracing methods.}
3736 One of the major advantages of the CMM framework is that multiple
3737 coorperating collectors can coexist at the same time, which makes it
3738 possible to customize the behavior of each collector with respect to
3739 the lifetime behavior of the objects. In \cite{CMM}, examples are
3740 given in which the lifetime of certain class of objects exhibit
3741 first-in/first-out behavior. In this case a special collector can be
3742 written to take full advantage of this property.
3744 \subsection{Our Framework}
3745 Our framework retains all the benefits and flexibilities of CMM,
3746 while extending it in several minor but useful ways:
3747 \begin{itemize}
3748 \item Like CMM, interior pointers(i.e. pointers to the middle of an object)
3749 and crossheap pointers (i.e. complex data structures linking nodes locating
3750 in multiple logical heaps.) are supported. Pointers to collectable
3751 objects are non-intrusive: i.e. they are normal \Cpp{} pointers
3752 and not encapsulated in templates.
3753 \item Also like CMM, we allow multiple garbage collectors using different
3754 algorithms to coexist. Unlike CMM, however, we allow multiple
3755 Bartlett-style collectors to be instantiated. Most of the services
3756 involving low level page management and object marking have been relegated
3757 to a separate heap manager.
3758 \item We provide support for finalization and weakpointers.
3759 \item We have implemented blacklisting\cite{Boehm} to reduce the chance
3760 of false roots identification.
3761 \end{itemize}
3763 \subsection{The Implementation}
3765 Our implementation has been completely written from scratch since
3766 we do not desire to utilize copyrighted code and, more importantly,
3767 we have to make a few important architectural changes:
3768 All low level memory management services, such as management of
3769 the page table and the object bitmap, is now relegated to the class
3770 \CLASSDEF{GCHeapManager}. Collectors now act as clients as of the heap manager
3771 and the Bartlett-style collector no longer has any special status.
3773 \subsection{Architecture}
3775 The architecture of the memory management system is partitioned into
3776 a few classes, each responsible for providing distinct services:
3778 \begin{itemize}
3779 \item \CLASSDEF{GCHeapManager} --- The heap manager. The heap manager
3780 manages the heap table, performs page level
3781 allocation and deallocation, and provides miscellaneous services
3782 like blacklisting. It also manages the object bitmaps.
3783 \item \CLASSDEF{GC} --- The base class for all garbage collectors.
3784 This base class describes the protocol used by all the collector
3785 classes\footnote{This base class is also inherited from class
3786 \CLASSDEF{Mem}, so that it adheres to the \Prop{} memory management
3787 protocol.}
3788 \item \CLASSDEF{CGC} --- The base class for conservative collectors.
3789 This class is inherited from class \CLASSDEF{GC} and implements some
3790 methods for locating the stack, heap, and static data areas.
3791 \item \CLASSDEF{BGC} --- Bartlett-style mostly copying collector.
3792 This class is inherited from class \CLASSDEF{CGC} and implements
3793 the Bartlett mostly copying algorithm.
3794 \item \CLASSDEF{MarkSweepGC} --- Mark/sweep style conservative collector.
3795 This class is inherited from class \CLASSDEF{CGC} and implements
3796 a mark/sweep collection algorithm.
3797 \item \CLASSDEF{WeakPointerManager} --- The weakpointer manager.
3798 This class manages the weak pointer table and provides a few
3799 weak pointer scavenging services for the collectors.
3800 \end{itemize}
3802 \Subsection{The Programmatic Interface}
3804 The programmatic interface to the garbage collector involves two
3805 base classes, \CLASSDEF{GC} and \CLASSDEF{GCObject}. The base class
3806 \CLASSDEF{GC} provides an uniform interface to all types of collectors
3807 and memory
3808 managers while class {\sf GCObject} provides the interface to all
3809 collectable classes. Table~\ref{GC-Classes} contains a listing
3810 of the classes in our hierarchy.
3812 \begin{table}
3813 \begin{center}
3814 \begin{tabular}{|l|l|} \hline
3815 Class & Purpose \\ \hline \hline
3816 \sf GCObject & Collectable object base class \\
3817 \sf GC & Garbage collector base class \\
3818 \sf CGC & Conservative garbage collector base class \\
3819 \sf BGC & Bartlett style mostly copying collector \\
3820 \sf MarkSweepGC & A mark-sweep collector \\
3821 \sf UserHeap & A heap for pointerless object \\
3822 \sf GCVerifier & A heap walker that verifies the integrity of a
3823 structure \\
3824 \hline
3825 \end{tabular}
3826 \end{center}
3827 \caption{\label{GC-Classes} Garbage Collection Classes.}
3828 \end{table}
3830 Class \CLASSDEF{GCObject} is extremely simple: it redefines the operators
3831 \verb|new| and \verb|delete| to allocate memory from the default collector,
3832 and declares a virtual method ``\verb|trace|'' to be defined by subclasses
3833 (more on this later.)
3835 Memory for a \CLASSDEF{GCObject} is allocated and freed using the usually
3836 \verb|new| and \verb|delete| operators. Of course, freeing memory explicitly
3837 using \verb|delete| is optional for many subclasses of \CLASSDEF{GC}.
3839 \begin{verbatim}
3840 class GCObject {
3841 public:
3842 inline void * operator new(size_t n, GC& gc = *GC::default_gc)
3843 { return gc.m_alloc(n); }
3844 inline void * operator new(size_t n, size_t N, GC& gc = *GC::default_gc)
3845 { return gc.m_alloc(n > N ? n : N); }
3846 inline void operator delete(void *);
3847 virtual void trace(GC *) = 0;
3849 \end{verbatim}
3851 \subsection{Memory Allocation}
3853 The base class \CLASSDEF{GC} is slightly more complex, as it has to provide
3854 a few different functionalities. The first service that class \CLASSDEF{GC}
3855 must
3856 provide is of course memory allocation and deallocation. As a time saving
3857 device we can specify what the default collector is using the methods
3858 \verb|get_default_gc| and \verb|set_default_gc|. Method \verb|GCObject::new|
3859 will use this collector by default, unless placement syntax is used.
3860 Method \verb|GCObject::delete|, on the other hand, can correctly infer the
3861 proper collector to use by the address of the object.
3863 The low level methods to allocate and deallocate memory are \verb|m_alloc|
3864 and \verb|free| respectively. The programmers usually do not have to
3865 use these methods directly.
3867 The method to invoke a garbage collection of a specific level is
3868 \verb|collect(int level)|. This forces an explicit collection.
3869 Method \verb|grow_heap(size_t)| can also be used to explicitly increase
3870 the heap size of a collector. Depending of the actual behavior
3871 of the subclasses, these methods may have different effects.
3873 \begin{verbatim}
3874 class GC {
3875 protected:
3876 static GC * default_gc;
3877 public:
3878 static GC& get_default_gc() { return *default_gc; }
3879 static void set_default_gc(GC& gc) { default_gc = &gc; }
3880 virtual void * m_alloc (size_t) = 0;
3881 virtual void free (void *);
3882 virtual void collect (int level = 0) = 0;
3883 virtual void grow_heap (size_t) = 0;
3884 static void garbage_collect() { default_gc->collect(); }
3885 virtual void set_gc_ratio(int);
3886 virtual void set_initial_heap_size (size_t);
3887 virtual void set_min_heap_growth (size_t);
3889 \end{verbatim}
3891 \subsection{The GC Protocol}
3892 The collector and collectable objects must cooperate with
3893 each other by abiding to a simple protocol:
3894 \begin{itemize}
3895 \item All objects that are to be garbage collected must be derived
3896 from \CLASSDEF{GCObject}. The application programmer must also supply a
3897 ``{\bf tracing}'' method. The purpose of this method is to identify
3898 all internal pointers to other \CLASSDEF{GCObject}'s. This method is not
3899 used by the application programmer directly but only used internally
3900 by the garbage collectors.
3901 \item The tracing method of each collectable must in turn call
3902 the tracing method in the class \CLASSDEF{GC} with each pointer to
3903 a collectable object that the object owns:
3904 \begin{verbatim}
3905 class GC {
3906 public:
3907 virtual GCObject * trace (GCObject *) = 0;
3908 inline void trace (GCObject& obj);
3910 \end{verbatim}
3912 Briefly, the rules are as follows:
3913 \begin{enumerate}
3914 \item The tracing method of a collectable \T{Foo} has the following
3915 general form:
3916 \begin{verbatim}
3917 void Foo::trace(GC * gc)
3921 \end{verbatim}
3922 \item If class \T{Foo} has a member that is a pointer \verb|p|
3923 to a collectable object of type \T{Bar}, then add:
3924 \begin{verbatim}
3925 p = (Bar)gc->trace(p);
3926 \end{verbatim}
3927 to the body of \verb|Foo::trace|.
3928 \item If class \T{Foo} has a member object
3929 \verb|x| that is a subclass of \T{GCObject}, also add:
3930 \begin{verbatim}
3931 gc->trace(x);
3932 \end{verbatim}
3933 to the body of \verb|Foo::trace|.
3934 \item If class \T{Foo} is derived from a class \verb|Bar| that
3935 is a subclass of \T{GCObject}, add:
3936 \begin{verbatim}
3937 Bar::trace(gc);
3938 \end{verbatim}
3939 to the body of \verb|Foo::trace|.
3940 \end{enumerate}
3941 Notice that these methods can be arranged in any order.
3942 \end{itemize}
3944 This protocol can be used by both copying and non-copying collectors.
3945 In addition, the class \CLASSDEF{GCVerifier} also uses this protocol to
3946 walk the heap in order to verify the integrity of a garbage collected
3947 data structure.
3949 \subsection{Messages and Statistics}
3950 All garbage collectors use the following protocols for status
3951 reporting and statistics gathering.
3953 \begin{verbatim}
3954 class GC {
3955 public:
3956 enum GCNotify {
3957 gc_no_notify,
3958 gc_notify_minor_collection,
3959 gc_notify_major_collection,
3960 gc_notify_weak_pointer_collection,
3961 gc_print_collection_time,
3962 gc_print_debugging_info
3964 virtual int verbosity() const;
3965 virtual void set_verbosity(int);
3966 virtual ostream& get_console() const;
3967 virtual void set_console(ostream&);
3969 \end{verbatim}
3971 \begin{verbatim}
3972 class GC {
3973 public:
3974 struct Statistics {
3975 const char * algorithm;
3976 const char * version;
3977 size_t bytes_used;
3978 size_t bytes_managed;
3979 size_t bytes_free;
3980 struct timeval gc_user_time;
3981 struct timeval gc_system_time;
3982 struct timeval total_gc_user_time;
3983 struct timeval total_gc_system_time;
3985 virtual Statistics statistics();
3987 \end{verbatim}
3989 Each collector has a current verbosity level, which can be set
3990 and reset using the methods \verb|set_verbosity(int)| and
3991 \verb|verbosity() const|. The verbosity level is actually a bit
3992 set containing flags of the type \verb|GC::GCNotify|. A combination
3993 of these options can be used.
3994 \begin{itemize}
3995 \item \verb|gc_no_notify| --- no messages at all.
3996 \item \verb|gc_notify_minor_collection| --- all minor collections
3997 will be notified by printing a message on the current console.
3998 \item \verb|gc_notify_major_collection| --- similarly for all major
3999 collections.
4000 \item \verb|gc_notify_weak_pointer_collection| --- notify the user
4001 when weak pointers are being collected.
4002 \item \verb|gc_print_collection_time| --- this option will let the
4003 collector to print the time spent during collection (and finalization).
4004 \item \verb|gc_print_debugging_info| --- this option will print additional
4005 information such as stack and heap addresses during garbage collection.
4006 \end{itemize}
4008 The current console of a collector is defaulted to the \Cpp{} stream
4009 \verb|cerr|. It can be set and inspected with the methods
4010 \verb|set_console(ostream&)| and \verb|ostream& get_console()| respectively.
4011 For example, the user can redirect all garbage collection messages
4012 to a log file \verb|"gc.log"| by executing the following during initialization:
4014 \begin{verbatim}
4015 ofstream gc_log("gc.log");
4016 GC::get_default_gc().set_console(gc_log);
4017 \end{verbatim}
4019 \subsection{The Bartlett style mostly copying collector}
4021 Currently a Bartlett-style mostly copying collector has been implemented.
4022 The current version is non-generational but we expect that a generational
4023 version will be available in the near future.
4025 The Bartlett-style collector is implemented as the concrete class
4026 \CLASSDEF{BGC}. Aside from all the services provided by class \CLASSDEF{GC},
4027 \CLASSDEF{BGC} also provides the following method:
4029 \begin{verbatim}
4030 class BGC : public CGC {
4031 public:
4032 virtual void set_gc_ratio(int);
4033 virtual void set_initial_heap_size (size_t);
4034 virtual void set_min_heap_growth (size_t);
4036 \end{verbatim}
4038 The gc ratio refers to the ratio of used heap space versus
4039 total heap space. Class \CLASSDEF{BGC} will invoke the garbage collection
4040 if this is exceeded. The default gc ratio for \T{BGC} is 75\%.
4042 The initial size of the heap and the minimal number of bytes to
4043 request from the system during a heap expansion can be adjusted using
4044 the methods \verb|set_initial_heap_size| and \verb|set_min_heap_growth|
4045 respectively. These methods are declared in the \T{GC}
4046 protocol. By default, the initial heap size for this collector
4047 is 128K and each heap expansion will increase the heap by 512K.
4049 If an application uses a lot garbage collected storage it is a good
4050 idea to set the heap size to larger capacity during initialization.
4051 Otherwise, the collector will have to perform more garbage collection
4052 and heap expansions to reach a stable state.
4054 \subsection{The Mark Sweep collector}
4056 An alternative to the copying collector is the the mark sweep collector.
4057 Like the previous collector, this collector also uses conservative scanning
4058 to locate roots. Unlikely the Boehm collector, type accurate marking
4059 is used through the user supplied tracing method.
4061 Since the default collector is of the \T{BGC} variety, the user must
4062 create an instance of \T{MarkSweepGC} if the mark sweep collector
4063 is to be used.
4065 \begin{verbatim}
4066 class MarkSweepGC : public CGC {
4067 public:
4068 virtual void set_gc_ratio(int);
4069 virtual void set_initial_heap_size (size_t);
4070 virtual void set_min_heap_growth (size_t);
4072 \end{verbatim}
4074 The gc ratio in class \T{MarkSweepGC} determines whether heap
4075 expansion is to be performed after a collection. The default gc ratio
4076 is 50\%, thus if after garbage collection the ratio of used versus total
4077 heap space exceeds one half, the heap will be expanded.
4079 For most purposes the two collectors are interchangeable. Since
4080 all types of garbage collectors under our framework use the same protocol,
4081 applications can be written without fixing a specific garbage collection
4082 algorithm before hand.
4084 \subsection{Finalization}
4086 One common \Cpp{} idiom uses constructor and destructor for
4087 resource allocation: resources (including memory but may include others,
4088 such as file handles, graphical widgets, etc) that are acquired
4089 in a class constructor are released(finalized) in the class'es destructor.
4091 There are, however, two opposing views in how finalization should be handled
4092 in a garbage collector. The first assumes that garbage collection
4093 simulates an infinite amount of memory and so automatic finalization is
4094 inapproriate. The second takes a more pragmatic view, and assumes that
4095 automatic finalization should be provided since otherwise explicit resource
4096 tracking must be provided by the user for collectable datatypes, making garbage
4097 collection much less useful than it can be.
4099 We will refrain from participating in any religious and philosophical
4100 wars on which dogma is the ``One True Way''.
4101 nstead, both types of collectors are provided.
4103 By default, all garbage collection classes do
4104 not perform finalization on garbage
4105 collected objects. If object finalization is desired then the user
4106 can do one of two things:
4107 \begin{itemize}
4108 \item Enable the finalization of the default heap by putting
4109 \begin{verbatim}
4110 GC::get_default_gc().set_finalization(true);
4111 \end{verbatim}
4112 in the initialization code, or
4113 \item Instantiate a different instance of the garbage collector
4114 and allocate objects needing finalization from this collector.
4115 This may be a better method since not all objects need finalization
4116 and those that do not can still be allocated from the default collector.
4117 This way we only have to pay for the performance penalty (if any)
4118 proportional to the usage of finalization.
4119 \end{itemize}
4121 For example, if a collectable class named \T{Foo} should be properly
4122 finalized we can declare it in the following manner:
4124 \begin{verbatim}
4125 extern BGC bgc_f; // somewhere an instance is defined
4127 class Foo : public GCObject {
4128 Resource r;
4129 public:
4130 Foo() { r.allocate(); /* allocate resource */ }
4131 ~Foo() { r.release(); /* release all resource */ }
4133 // Make object allocate from bgc_f instead
4134 // of the default non-collectable gc.
4135 void * operator new (size_t n) { return bgc_f.m_alloc(n); }
4137 \end{verbatim}
4139 When an instance of class \verb|Foo| is identified as garbage, its
4140 destructor will be called. [Aside: {\it currently issue such as
4141 finalization synchronization is not handled directly by the collector.
4142 So if there is synchronization constraints during finalization it must
4143 be handled by the object destructor. Future versions of \Prop{} will
4144 provide subclasses with more sophisticated finalization mechanisms.} ]
4146 Notice that the default \verb|delete| operator of all garbage collectable
4147 classes will call the object destructor explicitly. It is a good idea to use
4148 explicit \verb|delete| if finalization is a rare need since this service
4149 involves an explicit scan of the garbage memory, which may degrade performace
4150 with excessive swapping: without finalization the garbage pages will not
4151 have to be referenced with a copying collector.
4153 It should also be noted that since the collector is conservative, there
4154 is no guarantee that garbage will be identified as such:
4155 there is no guarantee that all garbage resources will be released.
4156 In general, however, the efficacy of the collector is quite good and
4157 so non-critical or plentiful resources can be safely finalized with
4158 this collector.
4160 \Subsubsection{Weak Pointers}
4162 It is frequently very useful to be able to keep track of garbage
4163 collectable objects through the use of ``{\bf weak pointers}.''
4164 Collectors ignore the presence of weak pointers to garbage collected
4165 objects; objects with only referencing weak pointers will still be
4166 collected. Weak pointers to objects that become garbage will
4167 be automatically reset to null.
4169 Weak pointers are provided through a smart pointer template
4170 \CLASSDEF{WeakPointer}, whose definition is shown below:
4172 \begin{verbatim}
4173 template <class T> class WeakPointer {
4174 public:
4175 inline WeakPointer();
4176 inline WeakPointer(const WeakPointer<T>& wp);
4177 inline WeakPointer(T * ptr);
4178 inline WeakPointer<T>& operator = (const WeakPointer<T>& wp);
4179 inline WeakPointer<T>& operator = (T * ptr);
4180 inline bool is_null() const;
4181 inline operator const T * () const;
4182 inline operator T * ();
4183 inline const T * operator -> () const;
4184 inline T * operator -> ();
4185 inline const T& operator * () const;
4186 inline T& operator * ();
4188 \end{verbatim}
4190 A weakpointer to a garbage collectable
4191 class \verb|A| can be defined as \verb|WeakPointer<A>|.
4192 If a \Prop{} datatype \verb|T| has been defined, a weakpointer to type
4193 \verb|T| can defined using the
4194 \TDEF{classof} keyword. For example:
4196 \begin{verbatim}
4197 datatype Wff :: collectable = True | False | And(...) | ...;
4199 WeakPointer<classof Wff> a = And(True,False);
4200 \end{verbatim}
4202 \Subsubsection{The Heap Walker}
4204 There is a simple class called \CLASSDEF{GCVerify} that can be used
4205 to verify the integrity of a complex allocated data structure.
4206 This is frequently useful during the development of a new collector,
4207 or a complex class derive class of \T{GCObject}.
4209 The interface of this class is shown below.
4211 \begin{verbatim}
4212 class GCVerifier : protected GC {
4213 public:
4214 virtual bool is_valid_pointer (GCObject *);
4215 virtual bool is_valid_interior_pointer (GCObject *);
4216 virtual bool is_valid_structure (GCObject *);
4217 virtual bool is_valid_pointer (GCObject *, GC *);
4218 virtual bool is_valid_interior_pointer (GCObject *, GC *);
4219 virtual bool is_valid_structure (GCObject *, GC *);
4220 size_t number_of_nodes() const;
4222 \end{verbatim}
4224 Class \CLASSDEF{GCVerify} is derived from class \CLASSDEF{GC} so that the
4225 same object tracing protocol can be used. Briefly, three different methods
4226 are provided for verification:
4227 \begin{itemize}
4228 \item Call to \verb|is_valid_pointer(p,gc)| verifies that
4229 \verb|p| is a valid pointer to an object within the collector \verb|gc|.
4230 \item Call to \verb|is_valid_pointer(p,gc)| verifies that
4231 \verb|p| is a valid interior pointer to an object
4232 within the collector \verb|gc|.
4233 \item Call to \verb|is_valid_structure(p,gc)| verifies the entire
4234 structure reachable by \verb|p| is valid. This method uses
4235 \verb|GCObject::trace| to locate the correct sub-structures.
4236 \end{itemize}
4238 There are alternative versions of the same methods that assumes
4239 the default collector is used. Furthermore
4240 \begin{itemize}
4241 \item Method \verb|number_of_nodes()| returns the node count of the
4242 last structure traced using \verb|is_valid_structure|, and
4243 \item If \verb|set_verbosity(GC::gc_print_debugging_info)| is used
4244 then error messages will be printed during structure tracing.
4245 \end{itemize}
4247 \bibliography{refman}
4249 \input{refman.ind}
4250 \end{document}