# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and WebIDL regular expressions can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/
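#
# PLY builds the tokenizer by reflection: every attribute named 't_<TYPE>'
# (a regex string or a function) on the lexer object becomes a lex rule.
# A minimal sketch of the same object-based pattern, with illustrative names
# that are not part of this file:
#
#   from ply import lex
#
#   class MiniLexer(object):
#     tokens = ['WORD']
#     t_WORD = r'[A-Za-z]+'
#     t_ignore = ' \t\n'
#     def t_error(self, t):
#       t.lexer.skip(1)
#
#   mini = MiniLexer()
#   mini.lexobj = lex.lex(object=mini)
#   mini.lexobj.input('hello world')
#   while True:
#     tok = mini.lexobj.token()   # returns None at end of input
#     if tok is None: break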

import os.path
import sys

#
# Try to load the ply module, if not, then assume it is in the third_party
# directory, relative to ppapi
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')


class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keywords types
      'COMMENT', 'DESCRIBE', 'INTERFACE', 'READONLY', 'SYMBOL', 'INLINE',
      'TYPEDEF',

    # Extra WebIDL keywords
      'CALLBACK', 'DICTIONARY', 'OPTIONAL',

    # Invented for apps use
      'NAMESPACE',

    # Data types
      'FLOAT', 'INT', 'HEX', 'STRING',
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',

    'namespace' : 'NAMESPACE',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.  These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'

  # A line ending '\n', we use this to increment the line number
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'SYMBOL')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
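
  # A quick illustration of the rule above:
  #
  #   IDLLexer.keywords.get('interface', 'SYMBOL')   # -> 'INTERFACE'
  #   IDLLexer.keywords.get('_interface', 'SYMBOL')  # -> 'SYMBOL'; the leading
  #                                                   # underscore is then
  #                                                   # stripped from t.value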

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused.  We still
    # fill the array however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)
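
  # Worked micro-example of the bookkeeping above (a sketch, assuming SetData()
  # seeded self.index with [0]): lexing "A\nB" consumes the '\n' at lexpos 1,
  # t_LINE_END calls AddLines(1), lineno becomes 2 and index becomes [0, 2],
  # i.e. the recorded value is the position where line 2 starts, which is what
  # t_ANY_error uses to turn an absolute lexpos into a column offset.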

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))
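
  # Illustrative sketch of the message shape the two helpers above produce:
  # a "file(line) : msg" header, then the offending source line with a caret
  # under the failing position, e.g.
  #
  #   foo.idl(12) : Unrecognized input
  #   $bogus;
  #   ^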

  def SetData(self, filename, data):
    # Start with line 1, not zero
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)


# From a set of source file names, generate a list of tokens.
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


# From a block of text, generate a list of token values.
def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


# From a block of text, generate a list of tokens
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist
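
# For example (a sketch of the helper above):
#
#   TextToTokens('interface Foo { };')
#   # -> ['interface', 'Foo', '{', '}', ';']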


# From a set of token values, generate a new source text by joining with a
# single space.  The new source is then tokenized and compared against the
# original set.
def TestSame(values1):
  # Recreate the source from the tokens.  We use newline instead of whitespace
  # since the '//' and #inline regex are line sensitive (see the note after
  # this function).
  text = '\n'.join(values1)
  values2 = TextToTokens(text)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(text)
    open('tokenized.txt', 'w').write('\n'.join(values2))

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return False

  print "****************\n%s\n%s***************\n" % (text, '\n'.join(values2))
  sys.stdout.write('Same: Failed\n')
  return True
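
# Why the newline join above matters: the '//' comment rule and the #inline
# rule both match to the end of a line, so re-joining token values with spaces
# would let a '//...' value swallow every value that follows it on the rebuilt
# line, and the round-trip comparison would spuriously fail.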


# From a set of tokens pairs, verify the type field of the second matches
# the value of the first, so that:
#   INT 123 FLOAT 1.1
# will generate a passing test, where the first token is the SYMBOL INT,
# and the second token is the INT 123, third token is the SYMBOL FLOAT and
# the fourth is the FLOAT 1.1, etc...
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return False

  sys.stdout.write('Expect: Failed\n')
  return True


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
  return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))