boa2/boa/parser.py
1 """Lisp syntax parser
3 This module is deliberately kept simple (and inefficient). The parse
4 tree is just a cons list (code is data), so that implementing things
5 like `quote` and `define-syntax` will be straightforward.
6 """

import re

from boa.error import *
from boa.primitives import symbol, list as cons_list
from boa.evaluator import sequence_to_exp

c = re.compile
PATTERNS = [
    ('whitespace', c(r'(\s+)')),
    ('comment', c(r'(;[^\n]*)')),
    ('(', c(r'(\()')),
    (')', c(r'(\))')),
    ('number', c(r'''( [+\-]?      ## optional sign,
                      (?:          ## followed by some
                                   ## decimals
                          \d+\.\d+
                        | \d+\.
                        | \.\d+
                        | \d+
                      )
                    )
                 ''',
                 re.VERBOSE)),
    ('symbol', c(r'''([a-zA-Z\+\=\?\!\@\#\$\%\^\&\*\-\_\/\.\>\<]
                      [\w\+\=\?\!\@\#\$\%\^\&\*\-\_\/\.\>\<]*)''',
                 re.VERBOSE)),
    ('string', c(r'''
                  "
                  (([^\"] | \\")*)
                  "
                 ''',
                 re.VERBOSE)),
40 ("'", c(r'(\')')),
41 ('`', c(r'(`)')),
42 (',', c(r'(,)')),
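
# Note that pattern order matters: 'number' is tried before 'symbol',
# so "+1" lexes as a number, while a bare "+" (a sign with no digits)
# falls through to 'symbol'.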

def tokenize(s):
    tokens = []
    while s:
        m = None
        for token_type, regex in PATTERNS:
            m = regex.match(s)
            if m:
                token = m.group(1)
                tokens.append((token_type, token))
                s = s[m.end():]
                break
        if not m:
            error("TOKENIZE error from: %s..." % s[:20])
    return tokens
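
# For instance (token types paired with the matched text):
#
#   tokenize("(+ 1 2)")
#   => [('(', '('), ('symbol', '+'), ('whitespace', ' '),
#       ('number', '1'), ('whitespace', ' '), ('number', '2'),
#       (')', ')')]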

def filter_executable_tokens(tokens):
    # Return a real list (not a lazy filter object) so that parse()
    # can call len() and index into the result under Python 3 as well.
    return [t for t in tokens
            if t[0] not in ('whitespace', 'comment')]
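
# Applied to the stream above, this drops the two whitespace tokens,
# leaving [('(', '('), ('symbol', '+'), ('number', '1'),
# ('number', '2'), (')', ')')].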

def parse(text):
    tokens = filter_executable_tokens(tokenize(text))
    sexps, n = [], 0
    while n < len(tokens):
        sexp, n = parse_sexp(tokens, n)
        sexps.append(sexp)
    return sequence_to_exp(cons_list(*sexps))
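
# With several top-level forms, e.g. parse("(define x 1) x"), the cons
# list of parsed expressions goes through sequence_to_exp, which
# presumably folds them into a single expression (a `begin` form in
# the SICP evaluator this code follows).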

def parse_sexp(tokens, n):
    if tokens[n][0] == 'string':
        return tokens[n][1], n+1
    if tokens[n][0] == 'number':
        text = tokens[n][1]
        # The 'number' pattern also admits decimals, so fall back to
        # float when the token is not a plain integer.
        return (float(text) if '.' in text else int(text)), n+1
    if tokens[n][0] == 'symbol':
        return symbol(tokens[n][1]), n+1
    if tokens[n][0] == "'":
        e, n = parse_sexp(tokens, n+1)
        return cons_list(symbol("quote"), e), n
    if tokens[n][0] == '(':
        sexps, n = [], n+1
        while tokens[n][0] != ')':
            e, n = parse_sexp(tokens, n)
            sexps.append(e)
        return cons_list(*sexps), n+1
    error("PARSE error -- Invalid/unsupported token: %s" % tokens[n][0])
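
# A minimal smoke-test sketch; how a parsed cons list actually prints
# depends on boa.primitives.
if __name__ == '__main__':
    examples = ['42', '3.14', '(+ 1 2)', "'(a b c)",
                '(display "hi") ; demo']
    for src in examples:
        print('%-22r => %r' % (src, parse(src)))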