# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and WebIDL regular expressions can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/
#
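# For example (illustrative), the IDL fragment:
#
#   interface Foo { };
#
# lexes into an INTERFACE keyword token, a SYMBOL token with the value 'Foo',
# and the single-character literal tokens '{', '}' and ';'.
#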

import os.path
import sys

#
# Try to load the ply module, if not, then assume it is in the third_party
# directory, relative to ppapi
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions

Option('output', 'Generate output.')


class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keywords types
      'COMMENT', 'DESCRIBE', 'INLINE', 'INTERFACE', 'KEYWORD_SYMBOL',
      'LINE_END', 'READONLY', 'SYMBOL', 'TYPEDEF',
    # Extra WebIDL keywords
      'CALLBACK', 'DICTIONARY', 'OPTIONAL',
    # Invented for apps use
      'NAMESPACE',
    # Data types
      'FLOAT', 'INT', 'HEX', 'STRING',
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'typedef' : 'TYPEDEF',
    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'namespace' : 'NAMESPACE',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.  These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
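
  # For example (illustrative): '1.25', '.5' and '-3e10' all match t_FLOAT,
  # '42' and '7u' match t_INT, and '0xFF' matches t_HEX.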

  # A line ending '\n', we use this to increment the line number
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t
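
  # For example (illustrative), a block of the form:
  #   #inline <anything, possibly spanning multiple lines>
  #   #endinl
  # is captured as a single INLINE token rather than being lexed further.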

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
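
  # For example (illustrative): the input 'dictionary' lexes as the keyword
  # token DICTIONARY, while '_dictionary' lexes as a SYMBOL token whose value
  # becomes 'dictionary' once the leading underscore is stripped.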

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused.  We still
    # fill the array however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    # Start with line 1, not zero
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.lexobj.input(data)

  def __init__(self):
    self.index = [0]
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)


# From a set of source file names, generate a list of tokens.
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


# From a block of text, generate a list of token values.
def TokensFromText(text):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('unknown', text)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


# From a block of text, generate a list of tokens
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


# From a set of token values, generate a new source text by joining with a
# single space.  The new source is then tokenized and compared against the
# original set of token values.
def TestSame(values1):
  # Recreate the source from the tokens.  We use newline instead of whitespace
  # since the '//' and #inline regex are line sensitive.
  text = '\n'.join(values1)
  values2 = TextToTokens(text)

  # Regenerated sources, used for the diagnostics below.
  src1 = ' '.join(values1)
  src2 = ' '.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1


# From a set of token pairs, verify the type field of the second matches
# the value of the first, so that:
#   INT 123 FLOAT 1.1
# will generate a passing test, where the first token is the SYMBOL INT,
# and the second token is the INT 123, third token is the SYMBOL FLOAT and
# the fourth is the FLOAT 1.1, etc...
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch:  Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
  return -1
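

# Example invocation (hypothetical; the flag names assume idl_option exposes
# each registered Option as a command-line switch, and the .idl file name is
# made up):
#   python idl_lexer.py --test --output some_api.idl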


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))