"""Module to analyze Python source code; for syntax coloring tools.

Interface:
    tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' delimit the slice of
pytext that is scanned; by default the whole string is scanned.
The returned value is a list of tuples, formatted like this:
    [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
The tuple contents are always like this:
    (tag, startindex, endindex, sublist)
tag is one of 'keyword', 'string', 'comment' or 'identifier'.
sublist is not used, hence always None.
"""
# Based on FontText.py by Mitchell S. Chapman,
# which was modified by Zachary Roadhouse,
# then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
#     Tim (the-incredib-ly y'rs) Peters and Christian Tismer
# So, who owns the copyright? ;-) How about this:
# Copyright 1996-2001:
#     Mitchell S. Chapman,
# First a little helper, since I don't like to repeat things. (Tismer speaking)
def replace(where, what, with_):
    """Return a copy of `where` with every occurrence of `what` replaced.

    where -- the string to search.
    what  -- the non-empty substring to look for.
    with_ -- the replacement text.

    The third parameter used to be called ``with``.  That name is a reserved
    word from Python 2.6 on, so it could never be passed by keyword there;
    positional callers are unaffected by the rename.

    Raises ValueError if `what` is empty, exactly as the former
    string.split()/string.join() combination did.
    """
    # str methods instead of the `string` module functions, which no longer
    # exist in Python 3; the split/join pairing keeps the old semantics.
    return with_.join(where.split(what))
# This list of keywords is taken from ref/node13.html of the
# Python 1.3 HTML documentation. ("access" is intentionally omitted.)
# "assert" and "exec" are included as well: both are reserved words in the
# print-statement dialect of Python this module tags.
keywordsList = [
    "assert", "exec",
    "del", "from", "lambda", "return",
    "and", "elif", "global", "not", "try",
    "break", "else", "if", "or", "while",
    "class", "except", "import", "pass",
    "continue", "finally", "in", "print",
    "def", "for", "is", "raise", "yield"]

# Build up a regular expression which will match anything
# interesting, including multi-line triple-quoted strings.
commentPat = r"#[^\n]*"

# Single-line string literal.  'q' is a placeholder for the quote character;
# it is substituted once with ' and once with ".  A backslash escapes
# whatever character follows it.
pat = r"q[^\\q\n]*(\\[\000-\377][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Triple-quoted string literal, which may span lines.  Inside the body a run
# of one or two quote characters is allowed as long as it is not the closing
# run of three.  Written with layout for readability; the whitespace is
# removed below before the placeholder is substituted.
pat = r"""
    qqq
    [^\\q]*
    (
        (   \\[\000-\377]
        |   q
            (   \\[\000-\377]
            |   [^\\q]
            |   q
                (   \\[\000-\377]
                |   [^\\q]
                )
            )
        )
        [^\\q]*
    )*
    qqq
"""
pat = "".join(pat.split())  # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Build up a regular expression which matches all and only
# Python keywords. This will let us skip the uninteresting
# identifier references.
# nonKeyPat identifies characters which may legally precede
# (or follow) a keyword: anything that cannot be part of a dotted name or
# open/close a string.  Note that a keyword match therefore includes one
# context character on each side (none in front at the very start of the
# text).
nonKeyPat = r"(^|[^a-zA-Z0-9_.\"'])"

keyPat = nonKeyPat + "(" + "|".join(keywordsList) + ")" + nonKeyPat

# Order matters: comments win over everything, and the triple-quoted form
# must be tried before the single-line form so '''...''' is not read as an
# empty string followed by more text.
matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # Ident w. leading whitespace.
idRE = re.compile(idKeyPat)
def fontify(pytext, searchfrom = 0, searchto = None):
    """Scan Python source text and return a list of syntax-colouring tags.

    pytext     -- string containing Python source code.
    searchfrom -- index in pytext at which scanning starts (default 0).
    searchto   -- index at which scanning stops: the first match that
                  starts at or beyond it ends the scan.  None (the
                  default) means len(pytext).

    Returns a list of (tag, startindex, endindex, None) tuples in the order
    the matches occur.  tag is 'keyword', 'string', 'comment' or
    'identifier'.  An 'identifier' tag is produced only for the name that
    directly follows a 'def' or 'class' keyword, and its span includes the
    blanks between the keyword and the name.
    """
    if searchto is None:
        searchto = len(pytext)
    # Cache a few attributes for quicker reference.
    search = matchRE.search
    idMatch = idRE.match

    tags = []
    tags_append = tags.append
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while True:
        m = search(pytext, end)
        if m is None:
            break
        start = m.start()
        if start >= searchto:
            break
        match = m.group(0)
        end = start + len(match)
        c = match[0]
        if c == "#":
            tags_append((commentTag, start, end, None))
        elif c in "'\"":
            tags_append((stringTag, start, end, None))
        else:
            # Must have matched a keyword.  The keyword pattern also
            # consumes one context character after the keyword and, except
            # at the very start of the text, one before it.  Every keyword
            # begins with a lowercase ASCII letter and the leading context
            # character never is one, so the first character tells the two
            # cases apart.  (Comparing start with searchfrom does not: it
            # mis-tags a keyword preceded by whitespace at the start of the
            # text, or at a non-zero searchfrom.)
            if not ("a" <= c <= "z"):
                match = match[1:]
                start = start + 1
            # Give the trailing context character back so the next search
            # can use it, e.g. as the character preceding another keyword.
            match = match[:-1]
            end = end - 1
            tags_append((keywordTag, start, end, None))
            # If this was a defining keyword, look ahead to the
            # following identifier.
            if match in ("def", "class"):
                # match() anchors at `end`, so only an identifier that
                # follows immediately (after optional blanks) is tagged.
                m = idMatch(pytext, end)
                if m is not None:
                    start = end
                    end = m.end()
                    tags_append((identifierTag, start, end, None))
    return tags
154 for tag
, start
, end
, sublist
in tags
:
155 print tag
, `text
[start
:end
]`