1 # -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: (c) 2010 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re

from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache
from jinja2._compat import next, iteritems, implements_iterator, text_type, \
     intern
# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer
_lexer_cache = LRUCache(50)
# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
# we use the unicode identifier rule if this python version is able
# to handle unicode identifiers, otherwise the standard ASCII one.
try:
    compile('föö', '<unknown>', 'eval')
except SyntaxError:
    name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
else:
    from jinja2 import _stringdefs
    name_re = re.compile(r'[%s][%s]*' % (_stringdefs.xid_start,
                                         _stringdefs.xid_continue))
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')
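# Illustrative note on the regexes above: the negative lookbehind in
# ``float_re`` keeps a digit run that directly follows a dot (e.g. the '2.5'
# in 'foo.2.5') from being tokenized as a float, and ``string_re`` accepts
# single or double quoted strings containing backslash escapes.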
# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin')
TOKEN_LINECOMMENT_END = intern('linecomment_end')
TOKEN_LINECOMMENT = intern('linecomment')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')
# bind operators to token types
operators = {
    '+':            TOKEN_ADD,
    '-':            TOKEN_SUB,
    '/':            TOKEN_DIV,
    '//':           TOKEN_FLOORDIV,
    '*':            TOKEN_MUL,
    '%':            TOKEN_MOD,
    '**':           TOKEN_POW,
    '~':            TOKEN_TILDE,
    '[':            TOKEN_LBRACKET,
    ']':            TOKEN_RBRACKET,
    '(':            TOKEN_LPAREN,
    ')':            TOKEN_RPAREN,
    '{':            TOKEN_LBRACE,
    '}':            TOKEN_RBRACE,
    '==':           TOKEN_EQ,
    '!=':           TOKEN_NE,
    '>':            TOKEN_GT,
    '>=':           TOKEN_GTEQ,
    '<':            TOKEN_LT,
    '<=':           TOKEN_LTEQ,
    '=':            TOKEN_ASSIGN,
    '.':            TOKEN_DOT,
    ':':            TOKEN_COLON,
    '|':            TOKEN_PIPE,
    ',':            TOKEN_COMMA,
    ';':            TOKEN_SEMICOLON
}
reverse_operators = dict([(v, k) for k, v in iteritems(operators)])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT,
                            TOKEN_COMMENT_END, TOKEN_WHITESPACE,
                            TOKEN_WHITESPACE, TOKEN_LINECOMMENT_BEGIN,
                            TOKEN_LINECOMMENT_END, TOKEN_LINECOMMENT])
ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA,
                             TOKEN_COMMENT, TOKEN_LINECOMMENT])
def _describe_token_type(token_type):
    if token_type in reverse_operators:
        return reverse_operators[token_type]
    return {
        TOKEN_COMMENT_BEGIN:        'begin of comment',
        TOKEN_COMMENT_END:          'end of comment',
        TOKEN_COMMENT:              'comment',
        TOKEN_LINECOMMENT:          'comment',
        TOKEN_BLOCK_BEGIN:          'begin of statement block',
        TOKEN_BLOCK_END:            'end of statement block',
        TOKEN_VARIABLE_BEGIN:       'begin of print statement',
        TOKEN_VARIABLE_END:         'end of print statement',
        TOKEN_LINESTATEMENT_BEGIN:  'begin of line statement',
        TOKEN_LINESTATEMENT_END:    'end of line statement',
        TOKEN_DATA:                 'template data / text',
        TOKEN_EOF:                  'end of template'
    }.get(token_type, token_type)
def describe_token(token):
    """Returns a description of the token."""
    if token.type == 'name':
        return token.value
    return _describe_token_type(token.type)
def describe_token_expr(expr):
    """Like `describe_token` but for token expressions."""
    if ':' in expr:
        type, value = expr.split(':', 1)
        if type == 'name':
            return value
    else:
        type = expr
    return _describe_token_type(type)
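# Illustrative note: with the token names defined above,
# ``describe_token_expr('name:endfor')`` describes the concrete value
# ('endfor'), while ``describe_token_expr('block_end')`` falls back to the
# readable form 'end of statement block'.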
def count_newlines(value):
    """Count the number of newline characters in the string.  This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))
def compile_rules(environment):
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (len(environment.comment_start_string), 'comment',
         e(environment.comment_start_string)),
        (len(environment.block_start_string), 'block',
         e(environment.block_start_string)),
        (len(environment.variable_start_string), 'variable',
         e(environment.variable_start_string))
    ]

    if environment.line_statement_prefix is not None:
        rules.append((len(environment.line_statement_prefix), 'linestatement',
                      r'^[ \t\v]*' + e(environment.line_statement_prefix)))
    if environment.line_comment_prefix is not None:
        rules.append((len(environment.line_comment_prefix), 'linecomment',
                      r'(?:^|(?<=\S))[^\S\r\n]*' +
                      e(environment.line_comment_prefix)))

    return [x[1:] for x in sorted(rules, reverse=True)]
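# Illustrative sketch, assuming the default '{%', '{{' and '{#' delimiters:
# compile_rules then returns (name, escaped start string) pairs, roughly
# [('variable', re.escape('{{')), ('comment', re.escape('{#')),
#  ('block', re.escape('{%'))], ordered so longer start strings are tried first.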
class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)
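# Illustrative note: instances such as Failure('Missing end of comment tag')
# are embedded in the lexing rules below; when the enclosing rule matches,
# the instance is called with the current line number and filename and raises
# its configured error class.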
class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression.  This can either be a
        token type or ``'token_type:token_value'``.  This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of not interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
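# Usage sketch (illustrative): a token such as Token(1, 'name', 'foo') passes
# tok.test('name') as well as the combined form tok.test('name:foo'), and
# tok.test_any('integer', 'name:foo') succeeds if any one of the given
# expressions matches.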
@implements_iterator
class TokenStreamIterator(object):
    """The iterator for tokenstreams.  Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def __next__(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        next(self.stream)
        return token
class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s.  The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead.  The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._iter = iter(generator)
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        next(self)

    def __iter__(self):
        return TokenStreamIterator(self)

    def __bool__(self):
        return bool(self._pushed) or self.current.type is not TOKEN_EOF
    __nonzero__ = __bool__  # py2

    eos = property(lambda x: not x, doc="Are we at the end of the stream?")
    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)
    def look(self):
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result
    def skip(self, n=1):
        """Go n tokens ahead."""
        for x in range(n):
            next(self)

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)
    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None
    def __next__(self):
        """Go one token ahead and return the old one"""
        rv = self.current
        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()
        return rv
    def close(self):
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, '')
        self._iter = None
        self.closed = True
    def expect(self, expr):
        """Expect a given token type and return it.  This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, describe_token(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            next(self)
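# Usage sketch (illustrative): a parser typically drives the stream with
# ``stream.expect('block_begin')``, peeks ahead via ``stream.look()`` or
# ``stream.skip_if('comma')``, and can undo a read with ``stream.push(token)``;
# iteration stops once the eof token is reached.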
def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.line_comment_prefix,
           environment.trim_blocks,
           environment.lstrip_blocks,
           environment.newline_sequence,
           environment.keep_trailing_newline)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
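# Illustrative note: since the cache key is built solely from the delimiter
# and whitespace settings, two environments configured identically receive the
# same Lexer instance from the LRU cache, which is why the Lexer keeps no
# reference to its environment.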
class Lexer(object):
    """Class that implements a lexer for a given environment.  Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape
        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]
        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)
        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''
        # strip leading spaces if lstrip_blocks is enabled
        prefix_re = {}
        if environment.lstrip_blocks:
            # use '{%+' to manually disable lstrip_blocks behavior
            no_lstrip_re = e('+')
            # detect overlap between block and variable or comment strings
            block_diff = c(r'^%s(.*)' % e(environment.block_start_string))
            # make sure we don't mistake a block for a variable or a comment
            m = block_diff.match(environment.comment_start_string)
            no_lstrip_re += m and r'|%s' % e(m.group(1)) or ''
            m = block_diff.match(environment.variable_start_string)
            no_lstrip_re += m and r'|%s' % e(m.group(1)) or ''
            # detect overlap between comment and variable strings
            comment_diff = c(r'^%s(.*)' % e(environment.comment_start_string))
            m = comment_diff.match(environment.variable_start_string)
            no_variable_re = m and r'(?!%s)' % e(m.group(1)) or ''

            lstrip_re = r'^[ \t]*'
            block_prefix_re = r'%s%s(?!%s)|%s\+?' % (
                    lstrip_re,
                    e(environment.block_start_string),
                    no_lstrip_re,
                    e(environment.block_start_string),
                    )
            comment_prefix_re = r'%s%s%s|%s\+?' % (
                    lstrip_re,
                    e(environment.comment_start_string),
                    no_variable_re,
                    e(environment.comment_start_string),
                    )
            prefix_re['block'] = block_prefix_re
            prefix_re['comment'] = comment_prefix_re
        else:
            block_prefix_re = '%s' % e(environment.block_start_string)
        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline
        # global lexing rules
        self.rules = {
            'root': [
                (c('(.*?)(?:%s)' % '|'.join(
                    [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*(?:\-%s\s*|%s))' % (
                        e(environment.block_start_string),
                        block_prefix_re,
                        e(environment.block_end_string),
                        e(environment.block_end_string)
                    )] + [
                        r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, prefix_re.get(n, r))
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                (c('.+'), TOKEN_DATA, None)
            ],
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    block_prefix_re,
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules,
            TOKEN_LINECOMMENT_BEGIN: [
                (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT,
                 TOKEN_LINECOMMENT_END), '#pop')
            ]
        }
    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)
    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + tokenize and wraps it in a token stream.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
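    # Usage sketch, assuming an ``Environment`` instance ``env`` with the
    # default '{{ ... }}' delimiters (names here are illustrative only):
    #
    #     stream = get_lexer(env).tokenize(u'Hello {{ name }}!')
    #     # stream.current is the first Token, e.g. Token(1, 'data', u'Hello ')
    #     # and is followed by 'variable_begin', 'name' and 'variable_end'.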
    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokenize` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ignored_tokens:
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception as e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as bytestring (ascii only)
                # we do that for support of semi broken APIs
                # as datetime.datetime.strftime.  On python 3 this
                # call becomes a noop thanks to 2to3
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)
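    # Illustrative note: wrap() is where the raw (lineno, token_name,
    # value_string) triples become typed Tokens, e.g. an 'integer' value '42'
    # becomes the int 42 and an 'operator' value '+' is renamed to the
    # interned token type 'add' via the operators mapping.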
    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.
        """
        source = text_type(source)
        lines = source.splitlines()
        if self.keep_trailing_newline and source:
            for newline in ('\r\n', '\r', '\n'):
                if source.endswith(newline):
                    lines.append('')
                    break
        source = '\n'.join(lines)
        pos = 0
        lineno = 1
        stack = ['root']

        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')

        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue
                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in iteritems(m.groupdict()):
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data or token not in ignore_if_empty:
                                yield lineno, token, data
                            lineno += data.count('\n')
                # strings as tokens are just yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected \'%s\'' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected \'%s\', '
                                                          'expected \'%s\'' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data
                    lineno += data.count('\n')
                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in iteritems(m.groupdict()):
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish new function and start again
                pos = pos2
                break
            else:
                # if loop terminated without break we haven't found a single
                # match; either we are at the end of the file or we have a
                # problem
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
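# Illustrative note, assuming default delimiters: for the template
# u'{{ foo }}' the tokeniter generator yields plain tuples such as
# (1, 'variable_begin', u'{{'), (1, 'whitespace', u' '), (1, 'name', u'foo')
# and (1, 'variable_end', u'}}'), which wrap() then filters and converts into
# Token instances.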