Reland "Non-SFI mode: Switch to newlib. (patchset #4 id:60001 of https://codereview...
[chromium-blink-merge.git] / ppapi / generators / idl_lexer.py
blob47d64a243b3e5d0f494b3e6ff779fede7533b83a
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """ Lexer for PPAPI IDL """
9 # IDL Lexer
11 # The lexer uses the PLY lex library to build a tokenizer which understands
12 # WebIDL tokens.
14 # WebIDL, and WebIDL regular expressions can be found at:
15 # http://dev.w3.org/2006/webapi/WebIDL/
16 # PLY can be found at:
17 # http://www.dabeaz.com/ply/
19 import os.path
20 import re
21 import sys
# Try to load the ply module, if not, then assume it is in the third_party
# directory, relative to ppapi
try:
  from ply import lex
except ImportError:
  # Only a missing module triggers the fallback.  Any other failure (for
  # example an interrupt during import) propagates instead of being masked
  # by a second import attempt.
  third_party = os.path.join(os.path.dirname(__file__), '..', '..',
                             'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions

# Register the 'output' option that TestSame and Main query via GetOption.
Option('output', 'Generate output.')
41 # IDL Lexer
class IDLLexer(object):
  """Tokenizer definition for PPAPI IDL, handed to PLY's lex.lex().

  The class attributes 'tokens', 'literals', 't_ignore' and every 't_<TYPE>'
  value or method are read by lex (see __init__).  The docstring of each
  't_<TYPE>' method is the regular expression for that token, so those
  docstrings must not be edited as if they were documentation.

  Instance state, all set by SetData():
    lexobj     -- the lexer object built by lex.lex()
    lines      -- the input text split on '\n'
    index      -- lexer positions recorded by AddLines(), one per newline
    lex_errors -- count of errors reported by t_ANY_error()
  """

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keywords types
      'COMMENT',
      'DESCRIBE',
      'ENUM',
      'LABEL',
      'SYMBOL',
      'INLINE',
      'INTERFACE',
      'STRUCT',
      'TYPEDEF',
      'OR',

    # Extra WebIDL keywords
      'CALLBACK',
      'DICTIONARY',
      'OPTIONAL',
      # 'readonly' is listed in 'keywords' below, so its token type has to be
      # declared here as well; every type t_KEYWORD_SYMBOL can assign must
      # appear in this list.
      'READONLY',
      'STATIC',

    # Invented for apps use
      'NAMESPACE',

    # Data types
      'FLOAT',
      'OCT',
      'INT',
      'HEX',
      'STRING',

    # Operators
      'LSHIFT',
      'RSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',

    'or' : 'OR',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending '\n', we use this to increment the line number
  # (nothing is returned, so no token is emitted for line endings).
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  # The surrounding quotes are stripped from the token value.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  # Consecutive '//' lines are matched as a single COMMENT token.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.  The lookup uses the raw
    # text, so an underscore-prefixed name never matches a keyword.
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keywords (E.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  # Error callback: writes a message for the unrecognized input to stderr and
  # counts it in self.lex_errors.  No input is skipped here.
  # NOTE(review): PLY is presumably expected to raise lex.LexError afterwards
  # (Main catches it) -- confirm against the PLY version in use.
  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      # The remaining input may hold no word at all (only whitespace that is
      # not in t_ignore), so fall back to an empty word, which find() locates
      # at offset 0.
      words = t.value.split()
      word = words[0] if words else ''
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    filename = self.lexobj.filename
    out = self.ErrorMessage(filename, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1

  def AddLines(self, count):
    """Advance the line counter by 'count' and record line start offsets."""
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded value for previous lines are unused.  We still
    # fill the array however, to make sure the line count is correct.
    self.lexobj.lineno += count
    self.index.extend([self.lexobj.lexpos] * count)

  def FileLineMsg(self, file, line, msg):
    """Return 'file(line + 1) : msg', or '<BuiltIn> : msg' without a file."""
    # NOTE(review): this adds 1 to 'line' while SourceLine subtracts 1 from
    # the same value -- confirm which numbering base callers pass in.
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    """Return the source line followed by a caret line pointing at 'pos'."""
    # expandtabs(pos) turns the leading tab into 'pos' spaces before the '^'.
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    """Return the file/line message followed by the marked source line."""
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    """Reset the lexer state and load 'data' as the text to tokenize."""
    # Start with line 1, not zero
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    # Build the PLY lexer from the 't_*' rules defined on this instance.
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
220 # FilesToTokens
222 # From a set of source file names, generate a list of tokens.
def FilesToTokens(filenames, verbose=False):
  """Tokenize each named file and return all tokens as one flat list.

  Args:
    filenames: iterable of paths to read.
    verbose: when true, write a ' Loaded <name>...' line to stdout per file.

  Returns:
    A list of the token objects produced by the lexer, in file order.
  """
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    # Close the file as soon as it has been read instead of leaving the
    # handle to the garbage collector.
    with open(filename) as source_file:
      data = source_file.read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write(' Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      # token() returns None once the input is exhausted.
      if t is None: break
      outlist.append(t)
  return outlist
def TokensFromText(text):
  """Tokenize 'text' (reported as file 'unknown') and return token values."""
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  # token() yields None at end of input, which serves as the stop sentinel.
  return [tok.value for tok in iter(lexer.lexobj.token, None)]
249 # TextToTokens
251 # From a block of text, generate a list of tokens
def TextToTokens(source):
  """Tokenize 'source' (reported as file 'AUTO') and return token values."""
  lexer = IDLLexer()
  lexer.SetData('AUTO', source)
  values = []
  tok = lexer.lexobj.token()
  # Keep pulling tokens until the lexer signals end of input with None.
  while tok is not None:
    values.append(tok.value)
    tok = lexer.lexobj.token()
  return values
265 # TestSame
267 # From a set of token values, generate a new source text by joining with a
268 # newline. The new source is then tokenized and compared against the
269 # old set.
def TestSame(values1):
  """Check that token values survive a join / re-tokenize round trip.

  Args:
    values1: list of token value strings from a previous tokenization.

  Returns:
    0 when re-tokenizing the joined values yields the same list, -1 otherwise.
    Differences are written to stdout.  When the 'output' option is set, the
    original and re-tokenized texts are written to original.txt and
    tokenized.txt in the current directory.
  """
  # Recreate the source from the tokens.  We use newline instead of whitespace
  # since the '//' and #inline regex are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  # The re-tokenized text, rebuilt the same way, for the dump and the report.
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    sys.stdout.write("Size mismatch original %d vs %d\n\n" % (count1, count2))

  # zip() stops at the shorter list, so only positions present in both lists
  # are compared.
  for i, (value1, value2) in enumerate(zip(values1, values2)):
    if value1 != value2:
      sys.stdout.write("%d >>%s<< >>%s<<\n" % (i, value1, value2))

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    with open('original.txt', 'w') as original_file:
      original_file.write(src1)
    with open('tokenized.txt', 'w') as tokenized_file:
      tokenized_file.write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  sys.stdout.write("****************\n%s\n%s***************\n\n" % (src1, src2))
  sys.stdout.write('Same: Failed\n')
  return -1
302 # TestExpect
304 # From a set of tokens pairs, verify the type field of the second matches
305 # the value of the first, so that:
306 # INT 123 FLOAT 1.1
307 # will generate a passing test, where the first token is the SYMBOL INT,
308 # and the second token is the INT 123, third token is the SYMBOL FLOAT and
309 # the fourth is the FLOAT 1.1, etc...
def TestExpect(tokens):
  """Verify that each odd-positioned token has the type named before it.

  Tokens are consumed in pairs: the value of the first token of a pair is the
  expected type name, and the second token's type must equal it.

  Args:
    tokens: list of token objects exposing 'type' and 'value' attributes.

  Returns:
    0 when every pair matches, -1 otherwise.  Each mismatch is written to
    stderr.  A trailing token with no partner is reported as a mismatch
    rather than raising IndexError.
  """
  count = len(tokens)
  errors = 0

  # Step through complete pairs only; a dangling last token is handled below.
  for index in range(0, count - 1, 2):
    expected = tokens[index].value
    token = tokens[index + 1]

    if expected != token.type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (expected, token.type, token.value))
      errors += 1

  if count % 2:
    sys.stderr.write('Mismatch: Expected %s, but no token follows it.\n' %
                     tokens[-1].value)
    errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1
def Main(args):
  """Tokenize the files named in 'args'; return 0 on success, -1 on failure.

  With the 'output' option the token values are written to stdout joined by
  ' <> '.  With the 'test' option TestSame and then TestExpect are run, and a
  non-zero result from either yields -1.  A lex.LexError is written to stderr
  and also yields -1.
  """
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]

    if GetOption('output'):
      sys.stdout.write(' <> '.join(values) + '\n')

    # 'or' short-circuits, so TestExpect only runs when TestSame passed.
    if GetOption('test') and (TestSame(values) or TestExpect(tokens)):
      return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write(str(le) + '\n')
    return -1
# Script entry point: run Main on the command-line arguments (without the
# program name) and use its return value as the process exit status.
if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))