#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses the PLY library to build a tokenizer which understands both
WebIDL and Pepper tokens.

WebIDL, and the regular expressions it defines, can be found at:
  http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
  http://www.dabeaz.com/ply/
"""

import os.path
import sys

# Try to load the ply module; if it is not found, assume it lives in the
# third_party directory.
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

# IDL Lexer
class IDLLexer(object):
  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = r'"*.(){}[],;:=+-/~|&^?<>'

  # 't_ignore' contains ignored characters (spaces and tabs)
  t_ignore = ' \t'

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # MultiChar operators
    'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are checked against this dictionary to determine
  # whether the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter' : 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter' : 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  #
  # These need to be methods for lexer construction, despite not using self.
  # pylint: disable=R0201
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # Regex needs to be in the docstring
  # pylint: disable=C0301
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t

  # A line ending '\n'; used to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in IDL strings. Strings are exclusively used
  # for attributes and enums, and are not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused. We still
    # fill the array, however, to make sure the line count is correct.
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

  # Tokenizer
  #
  # The token function returns the next token provided by IDLLexer for matching
  # against the leaf patterns.
  def token(self):
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok

  def GetTokens(self):
    outlist = []
    while True:
      t = self.Lexer().token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')

  def KnownTokens(self):
    return self.tokens

  def Lexer(self):
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddToken(self, token):
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    self.index = [0]
    self._lex_errors = 0
    self.filename = None
    self.keywords = {}
    self.tokens = []
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = None
    self.last = None
    self.lines = None

# If run by itself, attempt to build the lexer
if __name__ == '__main__':
  lexer_object = IDLLexer()
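
  # A minimal usage sketch (not part of the original tool): tokenize a small,
  # hypothetical WebIDL snippet with the lexer built above and print each
  # token.  The snippet and the variable names below are illustrative
  # assumptions only.
  sample_idl = 'interface Foo { readonly attribute long bar; };'
  lexer_object.Tokenize(sample_idl, filename='<example>')
  for tok in lexer_object.GetTokens():
    # Each PLY token carries a 'type' (a keyword such as INTERFACE, the
    # generic 'identifier', or a single-character literal) and a 'value'
    # (the matched text).
    print('%s: %s' % (tok.type, tok.value))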