1 """Module to analyze Python source code; for syntax coloring tools.
4 tags = fontify(pytext, searchfrom, searchto)
6 The 'pytext' argument is a string containing Python source code.
7 The (optional) arguments 'searchfrom' and 'searchto' may contain a slice in pytext.
8 The returned value is a list of tuples, formatted like this:
9 [('keyword', 0, 6, None), ('keyword', 11, 17, None), ('comment', 23, 53, None), etc. ]
10 The tuple contents are always like this:
11 (tag, startindex, endindex, sublist)
12 tag is one of 'keyword', 'string', 'comment' or 'identifier'
13 sublist is not used, hence always None.
16 # Based on FontText.py by Mitchell S. Chapman,
17 # which was modified by Zachary Roadhouse,
18 # then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
# Tim (the-incredib-ly y'rs) Peters and Christian Tismer
21 # So, who owns the copyright? ;-) How about this:
22 # Copyright 1996-1997:
23 # Mitchell S. Chapman,
32 # First a little helper, since I don't like to repeat things. (Tismer speaking)
def replace(where, what, with_):
    """Return a copy of `where` with every occurrence of `what` replaced.

    where -- the string to search.
    what  -- the substring to look for; must be non-empty (an empty
             separator raises ValueError, exactly as str.split does).
    with_ -- the replacement text.  Spelled with a trailing underscore
             because `with` is a reserved word since Python 2.6 and can
             no longer be used as a parameter name.  Callers in this
             module pass it positionally, so they are unaffected.
    """
    # Split-and-join through string methods: the module-level
    # string.split / string.join functions do not exist in Python 3.
    return with_.join(where.split(what))
37 # This list of keywords is taken from ref/node13.html of the
38 # Python 1.3 HTML documentation. ("access" is intentionally omitted.)
41 "del", "from", "lambda", "return",
42 "and", "elif", "global", "not", "try",
43 "break", "else", "if", "or", "while",
44 "class", "except", "import", "pass",
45 "continue", "finally", "in", "print",
46 "def", "for", "is", "raise"]
48 # Build up a regular expression which will match anything
49 # interesting, including multi-line triple-quoted strings.
# Template for a string literal that stays on one line.  Every "q" is a
# placeholder that replace() swaps for a real quote character below.
pat = "q[^\q\n]*\(\\\\[\000-\377][^\q\n]*\)*q"
# Alternation of the two quoting styles: single-quoted first, then
# double-quoted.
quotePat = "\|".join((replace(pat, "q", "'"), replace(pat, 'q', '"')))
# Get rid of whitespace: split on runs of whitespace and glue the pieces
# back together with nothing in between.
pat = "".join(pat.split())
# Alternation of the template with "q" replaced by a single quote and by
# a double quote, in that order.
tripleQuotePat = "\|".join((replace(pat, "q", "'"), replace(pat, 'q', '"')))
77 # Build up a regular expression which matches all and only
78 # Python keywords. This will let us skip the uninteresting
79 # identifier references.
80 # nonKeyPat identifies characters which may legally precede
# nonKeyPat is a group matching either the `^` anchor or one character
# that is not a letter, digit, underscore, dot or quote character.
nonKeyPat = "\(^\|[^a-zA-Z0-9_.\"']\)"

# keyPat is a single group holding the alternation of all keywords, with
# nonKeyPat required on both sides.  The alternation is built with one
# join rather than by appending "keyword\|" in a loop and slicing off the
# trailing separator: the slice would cut into the opening "\(" if the
# keyword list were ever empty, and the loop left its variable behind as
# a module-level name.
keyPat = nonKeyPat + "\(" + "\|".join(keywordsList) + "\)" + nonKeyPat
# The master pattern is the alternation, in this order, of: keywords,
# comments, triple-quoted strings and ordinary quoted strings.
# NOTE(review): the patterns use the `\(` / `\|` group syntax and are
# compiled with the `regex` module, not `re` -- confirm that module is
# available on the target interpreter.
matchPat = "\|".join((keyPat, commentPat, tripleQuotePat, quotePat))
matchRE = regex.compile(matchPat)

# Ident w. leading whitespace.
idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"
idRE = regex.compile(idKeyPat)
96 def fontify(pytext
, searchfrom
= 0, searchto
= None):
98 searchto
= len(pytext
)
99 # Cache a few attributes for quicker reference.
100 search
= matchRE
.search
101 group
= matchRE
.group
102 idSearch
= idRE
.search
106 tags_append
= tags
.append
107 commentTag
= 'comment'
109 keywordTag
= 'keyword'
110 identifierTag
= 'identifier'
115 start
= search(pytext
, end
)
116 if start
< 0 or start
>= searchto
:
119 end
= start
+ len(match
)
122 # Must have matched a keyword.
123 if start
<> searchfrom
:
124 # there's still a redundant char before and after it, strip!
128 # this is the first keyword in the text.
129 # Only a space at the end.
132 tags_append((keywordTag
, start
, end
, None))
133 # If this was a defining keyword, look ahead to the
134 # following identifier.
135 if match
in ["def", "class"]:
136 start
= idSearch(pytext
, end
)
139 end
= start
+ len(match
)
140 tags_append((identifierTag
, start
, end
, None))
142 tags_append((commentTag
, start
, end
, None))
144 tags_append((stringTag
, start
, end
, None))
153 for tag
, start
, end
, sublist
in tags
:
154 print tag
, `text
[start
:end
]`